From dc27b0e72779243f37f601de65b6f2ffd6ad7461 Mon Sep 17 00:00:00 2001
From: Aaron Pham <29749331+aarnphm@users.noreply.github.com>
Date: Tue, 7 Nov 2023 16:42:20 -0500
Subject: [PATCH] fix: update build dependencies and format chat prompt (#569)

chore: update correct check and format prompt

Signed-off-by: Aaron Pham <29749331+aarnphm@users.noreply.github.com>
---
 openllm-python/src/openllm/_llm.py               |  9 +++++----
 openllm-python/src/openllm/bundle/_package.py    | 16 ++++++----------
 openllm-python/src/openllm/bundle/oci/Dockerfile |  4 ++--
 .../src/openllm/entrypoints/_openapi.py          | 10 ++++++----
 openllm-python/src/openllm/entrypoints/openai.py |  4 ++--
 5 files changed, 21 insertions(+), 22 deletions(-)

diff --git a/openllm-python/src/openllm/_llm.py b/openllm-python/src/openllm/_llm.py
index 2702df8b..d3fc52d1 100644
--- a/openllm-python/src/openllm/_llm.py
+++ b/openllm-python/src/openllm/_llm.py
@@ -39,6 +39,7 @@ from openllm_core.utils import DEBUG
 from openllm_core.utils import LazyLoader
 from openllm_core.utils import ReprMixin
 from openllm_core.utils import apply
+from openllm_core.utils import check_bool_env
 from openllm_core.utils import codegen
 from openllm_core.utils import converter
 from openllm_core.utils import first_not_none
@@ -205,7 +206,7 @@ class LLM(t.Generic[M, T]):
   @property
   def import_kwargs(self)->tuple[dict[str, t.Any],dict[str, t.Any]]: return {'device_map': 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None, 'torch_dtype': torch.float16 if torch.cuda.is_available() else torch.float32}, {'padding_side': 'left', 'truncation_side': 'left'}
   @property
-  def trust_remote_code(self)->bool:return first_not_none(openllm_core.utils.check_bool_env('TRUST_REMOTE_CODE',False),default=self.config['trust_remote_code'])
+  def trust_remote_code(self)->bool:return first_not_none(check_bool_env('TRUST_REMOTE_CODE',False),default=self.config['trust_remote_code'])
   @property
   def runner_name(self)->str:return f"llm-{self.config['start_name']}-runner"
   @property
@@ -227,14 +228,14 @@ class LLM(t.Generic[M, T]):
     elif self._quantise is not None:self.__llm_quantization_config__,self._model_attrs=infer_quantisation_config(self, self._quantise, **self._model_attrs)
     else:raise ValueError("Either 'quantization_config' or 'quantise' must be specified.")
     return self.__llm_quantization_config__
-  def save_pretrained(self)->bentoml.Model: return openllm.import_model(self.config['start_name'], model_id=self.model_id, model_version=self._revision, backend=self.__llm_backend__, quantize=self._quantise)
+  def save_pretrained(self)->bentoml.Model:return openllm.import_model(self.config['start_name'], model_id=self.model_id, model_version=self._revision, backend=self.__llm_backend__, quantize=self._quantise)
   @property
-  def has_adapters(self)->bool: return self._adapter_map is not None
+  def has_adapters(self)->bool:return self._adapter_map is not None
   # NOTE: The section below defines a loose contract with langchain's LLM interface.
   @property
   def llm_type(self)->str:return normalise_model_name(self._model_id)
   @property
-  def identifying_params(self)->DictStrAny: return {'configuration': self.config.model_dump_json().decode(),'model_ids': orjson.dumps(self.config['model_ids']).decode(),'model_id': self.model_id}
+  def identifying_params(self)->DictStrAny:return {'configuration': self.config.model_dump_json().decode(),'model_ids': orjson.dumps(self.config['model_ids']).decode(),'model_id': self.model_id}
   @property
   def llm_parameters(self)->tuple[tuple[tuple[t.Any,...],DictStrAny],DictStrAny]:return (self._model_decls,self._model_attrs),self._tokenizer_attrs
   # yapf: enable
diff --git a/openllm-python/src/openllm/bundle/_package.py b/openllm-python/src/openllm/bundle/_package.py
index e4d89169..5d8ae6ac 100644
--- a/openllm-python/src/openllm/bundle/_package.py
+++ b/openllm-python/src/openllm/bundle/_package.py
@@ -1,6 +1,5 @@
 # mypy: disable-error-code="misc"
 from __future__ import annotations
-import importlib.metadata
 import inspect
 import logging
 import os
@@ -82,10 +81,9 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any], llm_fs: FS, extra_d
     env['backend_value']
   if not openllm_core.utils.is_torch_available():
     raise ValueError('PyTorch is not available. Make sure to have it locally installed.')
-  packages.extend([f'torch>={importlib.metadata.version("torch")}'])
+  packages.extend(['torch==2.0.1+cu118', 'vllm==0.2.1.post1', 'xformers==0.0.22'])  # XXX: Currently locking this for correctness
   wheels: list[str] = []
-  built_wheels: list[str |
-                     None] = [build_editable(llm_fs.getsyspath('/'), t.cast(t.Literal['openllm', 'openllm_core', 'openllm_client'], p)) for p in ('openllm_core', 'openllm_client', 'openllm')]
+  built_wheels = [build_editable(llm_fs.getsyspath('/'), t.cast(t.Literal['openllm', 'openllm_core', 'openllm_client'], p)) for p in ('openllm_core', 'openllm_client', 'openllm')]
   if all(i for i in built_wheels):
     wheels.extend([llm_fs.getsyspath(f"/{i.split('/')[-1]}") for i in t.cast(t.List[str], built_wheels)])
   return PythonOptions(packages=packages,
@@ -93,11 +91,10 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any], llm_fs: FS, extra_d
                        lock_packages=False,
                        extra_index_url=['https://download.pytorch.org/whl/cu118', 'https://huggingface.github.io/autogptq-index/whl/cu118/'])

-def construct_docker_options(llm: openllm.LLM[t.Any,
-                                              t.Any], _: FS, workers_per_resource: float, quantize: LiteralString | None, adapter_map: dict[str, str] | None, dockerfile_template: str | None,
+def construct_docker_options(llm: openllm.LLM[t.Any, t.Any], _: FS, quantize: LiteralString | None, adapter_map: dict[str, str] | None, dockerfile_template: str | None,
                              serialisation: LiteralSerialisation, container_registry: LiteralContainerRegistry, container_version_strategy: LiteralContainerVersionStrategy) -> DockerOptions:
   from openllm.cli._factory import parse_config_options
-  environ = parse_config_options(llm.config, llm.config['timeout'], workers_per_resource, None, True, os.environ.copy())
+  environ = parse_config_options(llm.config, llm.config['timeout'], 1.0, None, True, os.environ.copy())
   env: openllm_core.utils.EnvVarMixin = llm.config['env']
   env_dict = {
       env.backend: env['backend_value'],
@@ -221,13 +218,12 @@ def create_bento(bento_tag: bentoml.Tag,
   build_config = BentoBuildConfig(service=f"{llm.config['service_name']}:svc",
                                   name=bento_tag.name,
                                   labels=labels,
+                                  models=[llm_spec],
                                   description=f"OpenLLM service for {llm.config['start_name']}",
                                   include=list(llm_fs.walk.files()),
                                   exclude=['/venv', '/.venv', '__pycache__/', '*.py[cod]', '*$py.class'],
                                   python=construct_python_options(llm, llm_fs, extra_dependencies, adapter_map),
-                                  models=[llm_spec],
-                                  docker=construct_docker_options(llm, llm_fs, workers_per_resource, quantize, adapter_map, dockerfile_template, _serialisation, container_registry,
-                                                                  container_version_strategy))
+                                  docker=construct_docker_options(llm, llm_fs, quantize, adapter_map, dockerfile_template, _serialisation, container_registry, container_version_strategy))

   bento = bentoml.Bento.create(build_config=build_config, version=bento_tag.version, build_ctx=llm_fs.getsyspath('/'))
   # NOTE: the model_id_path here are only used for setting this environment variable within the container built with for BentoLLM.
diff --git a/openllm-python/src/openllm/bundle/oci/Dockerfile b/openllm-python/src/openllm/bundle/oci/Dockerfile
index c3f4e4ce..946974a0 100644
--- a/openllm-python/src/openllm/bundle/oci/Dockerfile
+++ b/openllm-python/src/openllm/bundle/oci/Dockerfile
@@ -57,7 +57,7 @@ FROM pytorch-install as kernel-builder
 RUN apt-get update && apt-get install -y --no-install-recommends ninja-build && \
     rm -rf /var/lib/apt/lists/*

-RUN /opt/conda/bin/conda install -c "nvidia/label/cuda-11.8.0" cuda==11.8 && \
+RUN /opt/conda/bin/conda install -c "nvidia/label/cuda-11.8.0" cuda==11.8 && \
     /opt/conda/bin/conda clean -ya

 # base image
@@ -95,7 +95,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
     pip install --extra-index-url "https://download.pytorch.org/whl/cu118" \
     --extra-index-url "https://huggingface.github.io/autogptq-index/whl/cu118/" \
     -v --no-cache-dir \
-    "ray==2.6.0" "einops" "vllm>=0.1.4" "auto-gptq[triton]" "torch>=2.0.1+cu118" xformers "jax[cuda11_local]" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html ".[opt,mpt,fine-tune,llama,chatglm]"
+    "ray==2.6.0" "einops" "vllm==0.2.1.post1" "auto-gptq[triton]" "torch==2.0.1+cu118" xformers

 FROM base-container
diff --git a/openllm-python/src/openllm/entrypoints/_openapi.py b/openllm-python/src/openllm/entrypoints/_openapi.py
index 54150a3e..8b8b4363 100644
--- a/openllm-python/src/openllm/entrypoints/_openapi.py
+++ b/openllm-python/src/openllm/entrypoints/_openapi.py
@@ -45,10 +45,12 @@ responses:
       content:
         application/json:
           example:
-            id: davinci
-            object: model
-            created: 1686935002
-            owned_by: openai
+            object: 'list'
+            data:
+              - id: meta-llama--Llama-2-13-chat-hf
+                object: model
+                created: 1686935002
+                owned_by: 'na'
           schema:
             $ref: '#/components/schemas/ModelList'
 '''
diff --git a/openllm-python/src/openllm/entrypoints/openai.py b/openllm-python/src/openllm/entrypoints/openai.py
index 6c7d73dd..b559fbee 100644
--- a/openllm-python/src/openllm/entrypoints/openai.py
+++ b/openllm-python/src/openllm/entrypoints/openai.py
@@ -41,7 +41,6 @@ from ..protocol.openai import LogProbs
 from ..protocol.openai import ModelCard
 from ..protocol.openai import ModelList
 from ..protocol.openai import UsageInfo
-from ..protocol.openai import get_conversation_prompt

 schemas = get_generator(
     'openai',
@@ -129,7 +128,8 @@ async def create_chat_completions(req: Request, llm: openllm.LLM[M, T]) -> Respo
   model_name, request_id = request.model, gen_random_uuid('chatcmpl')
   created_time = int(time.monotonic())
-  prompt = await get_conversation_prompt(request, llm.config)
+  prompt = llm.tokenizer.apply_chat_template(request.messages, tokenize=False)
+  logger.debug('Prompt: %r', prompt)
   config = llm.config.with_openai_request(request)
   try:
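
Note (not part of the patch): a minimal sketch of how the new chat prompt formatting behaves. It assumes a Hugging Face tokenizer that ships a chat template; the model id "HuggingFaceH4/zephyr-7b-beta" and the example messages are illustrative only, not something the patch prescribes.

# Illustrative sketch: the patch replaces the removed get_conversation_prompt
# helper with the tokenizer's own chat template, so the prompt string is
# produced the same way as below.
from transformers import AutoTokenizer

# Assumption: any model whose tokenizer defines a chat template works here.
tokenizer = AutoTokenizer.from_pretrained('HuggingFaceH4/zephyr-7b-beta')

# OpenAI-style request.messages payload, as create_chat_completions receives it.
messages = [
    {'role': 'system', 'content': 'You are a helpful assistant.'},
    {'role': 'user', 'content': 'What is OpenLLM?'},
]

# tokenize=False returns the formatted prompt string instead of token ids,
# matching the call added in openai.py.
prompt = tokenizer.apply_chat_template(messages, tokenize=False)
print(prompt)

Relying on the model's own template keeps prompt formatting consistent with how each model was trained, instead of maintaining per-model conversation logic in the server.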