From 5442d9cd10021b3c206f39728a41c4771fbe3005 Mon Sep 17 00:00:00 2001
From: Aaron Pham <29749331+aarnphm@users.noreply.github.com>
Date: Wed, 22 Nov 2023 17:03:13 -0500
Subject: [PATCH] fix(trust_remote_code): handle args correctly (#727)
Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
---
README.md | 19 +++++++++++++++++++
.../src/openllm_core/config/__init__.py | 1 +
.../config/configuration_mistral.py | 19 ++++++++++++++++---
.../openllm_core/config/configuration_phi.py | 6 ++++--
openllm-python/README.md | 19 +++++++++++++++++++
openllm-python/pyproject.toml | 2 +-
openllm-python/src/openllm_cli/entrypoint.py | 4 ++--
tools/dependencies.py | 2 +-
8 files changed, 63 insertions(+), 9 deletions(-)
diff --git a/README.md b/README.md
index 73c466a0..e1a96b2f 100644
--- a/README.md
+++ b/README.md
@@ -839,6 +839,25 @@ openllm start HuggingFaceH4/zephyr-7b-alpha --backend vllm
openllm start HuggingFaceH4/zephyr-7b-alpha --backend pt
```
+- CTranslate2 (*experimental*):
+
+
+```bash
+openllm start HuggingFaceH4/zephyr-7b-alpha --backend ctranslate
+```
+
+
+> **Note:** Currently, all quantization methods from CTranslate2 are supported, including int8, int8_float16, and int8_bfloat16.
+
+
+
+> **Note:** We recommend converting the models beforehand and then providing the directory of the converted models to `openllm start`. See [CTranslate2](https://opennmt.net/CTranslate2/conversion.html) for more information.
+
+
+
+> **Important:** CTranslate2 is an experimental backend and is not yet fully supported. We recommend using vLLM for all production use cases.
+
+
diff --git a/openllm-core/src/openllm_core/config/__init__.py b/openllm-core/src/openllm_core/config/__init__.py
index f707d7dc..68dad614 100644
--- a/openllm-core/src/openllm_core/config/__init__.py
+++ b/openllm-core/src/openllm_core/config/__init__.py
@@ -15,6 +15,7 @@ from .configuration_llama import LlamaConfig as LlamaConfig
from .configuration_mistral import MistralConfig as MistralConfig
from .configuration_mpt import MPTConfig as MPTConfig
from .configuration_opt import OPTConfig as OPTConfig
+from .configuration_phi import PhiConfig as PhiConfig
from .configuration_stablelm import StableLMConfig as StableLMConfig
from .configuration_starcoder import StarCoderConfig as StarCoderConfig
from .configuration_yi import YiConfig as YiConfig
diff --git a/openllm-core/src/openllm_core/config/configuration_mistral.py b/openllm-core/src/openllm_core/config/configuration_mistral.py
index 36cdfb88..fe8c1bfd 100644
--- a/openllm-core/src/openllm_core/config/configuration_mistral.py
+++ b/openllm-core/src/openllm_core/config/configuration_mistral.py
@@ -1,6 +1,9 @@
from __future__ import annotations
-import openllm_core
+import openllm_core, typing as t
+
+if t.TYPE_CHECKING:
+ from openllm_core._schemas import MessageParam
SINST_KEY, EINST_KEY, BOS_TOKEN, EOS_TOKEN = '[INST]', '[/INST]', '', ''
@@ -20,8 +23,6 @@ class MistralConfig(openllm_core.LLMConfig):
'architecture': 'MistralForCausalLM',
'default_id': 'mistralai/Mistral-7B-Instruct-v0.1',
'serialisation': 'safetensors',
- 'backend': ('pt', 'vllm'),
- # NOTE: see https://docs.mistral.ai/usage/guardrailing/ and https://docs.mistral.ai/llm/mistral-instruct-v0.1
'model_ids': [
'HuggingFaceH4/zephyr-7b-alpha',
'HuggingFaceH4/zephyr-7b-beta',
@@ -43,6 +44,7 @@ class MistralConfig(openllm_core.LLMConfig):
best_of: int = 1
presence_penalty: float = 0.5
+ # NOTE: see https://docs.mistral.ai/usage/guardrailing/ and https://docs.mistral.ai/llm/mistral-instruct-v0.1
@property
def template(self) -> str:
return '''{start_key}{start_inst} {system_message} {instruction} {end_inst}\n'''.format(
@@ -57,3 +59,14 @@ class MistralConfig(openllm_core.LLMConfig):
@property
def system_message(self) -> str:
return '''Always assist with care, respect, and truth. Respond with utmost utility yet securely. Avoid harmful, unethical, prejudiced, or negative content. Ensure replies promote fairness and positivity.'''
+
+ @property
+ def chat_template(self) -> str:
+ return repr("{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token + ' ' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}")
+
+ @property
+ def chat_messages(self) -> list[MessageParam]:
+ from openllm_core._schemas import MessageParam
+ return [MessageParam(role='user', content='What is your favourite condiment?'),
+ MessageParam(role='assistant', content="Well, I'm quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever I'm cooking up in the kitchen!"),
+ MessageParam(role='user', content='Do you have mayonnaise recipes?')]
diff --git a/openllm-core/src/openllm_core/config/configuration_phi.py b/openllm-core/src/openllm_core/config/configuration_phi.py
index 0eb4c6f5..e326b693 100644
--- a/openllm-core/src/openllm_core/config/configuration_phi.py
+++ b/openllm-core/src/openllm_core/config/configuration_phi.py
@@ -1,8 +1,9 @@
from __future__ import annotations
-import openllm_core
-from openllm_core._schemas import MessageParam
+import openllm_core, typing as t
+if t.TYPE_CHECKING:
+ from openllm_core._schemas import MessageParam
class PhiConfig(openllm_core.LLMConfig):
"""The language model phi-1.5 is a Transformer with 1.3 billion parameters.
@@ -42,6 +43,7 @@ class PhiConfig(openllm_core.LLMConfig):
@property
def chat_messages(self) -> list[MessageParam]:
+ from openllm_core._schemas import MessageParam
return [MessageParam(role='user', content="I don't know why, I'm struggling to maintain focus while studying. Any suggestions?"),
MessageParam(role='assistant', content='Have you tried using a timer? It can help you stay on track and avoid distractions.'),
MessageParam(role='user', content="That's a good idea. I'll give it a try. What else can I do to boost my productivity?")]
diff --git a/openllm-python/README.md b/openllm-python/README.md
index 73c466a0..e1a96b2f 100644
--- a/openllm-python/README.md
+++ b/openllm-python/README.md
@@ -839,6 +839,25 @@ openllm start HuggingFaceH4/zephyr-7b-alpha --backend vllm
openllm start HuggingFaceH4/zephyr-7b-alpha --backend pt
```
+- CTranslate2 (*experimental*):
+
+
+```bash
+openllm start HuggingFaceH4/zephyr-7b-alpha --backend ctranslate
+```
+
+
+> **Note:** Currently, all quantization methods from CTranslate2 are supported, including int8, int8_float16, and int8_bfloat16.
+
+
+
+> **Note:** We recommend converting the models beforehand and then providing the directory of the converted models to `openllm start`. See [CTranslate2](https://opennmt.net/CTranslate2/conversion.html) for more information.
+
+
+
+> **Important:** CTranslate2 is an experimental backend and is not yet fully supported. We recommend using vLLM for all production use cases.
+
+
diff --git a/openllm-python/pyproject.toml b/openllm-python/pyproject.toml
index 660344e3..2761b1b8 100644
--- a/openllm-python/pyproject.toml
+++ b/openllm-python/pyproject.toml
@@ -106,7 +106,7 @@ all = ["openllm[full]"]
awq = ["autoawq"]
baichuan = ["cpm-kernels"]
chatglm = ["cpm-kernels"]
-ctranslate = ["ctranslate2"]
+ctranslate = ["ctranslate2>=3.22.0"]
falcon = ["xformers"]
fine-tune = ["peft>=0.6.0", "datasets", "trl", "huggingface-hub"]
full = [
diff --git a/openllm-python/src/openllm_cli/entrypoint.py b/openllm-python/src/openllm_cli/entrypoint.py
index 2e887cc2..8dabd075 100644
--- a/openllm-python/src/openllm_cli/entrypoint.py
+++ b/openllm-python/src/openllm_cli/entrypoint.py
@@ -442,7 +442,7 @@ def start_command(
serialisation=serialisation,
dtype=dtype,
max_model_len=max_model_len,
- trust_remote_code=check_bool_env('TRUST_REMOTE_CODE'),
+ trust_remote_code=check_bool_env('TRUST_REMOTE_CODE', False),
)
backend_warning(llm.__llm_backend__)
@@ -573,7 +573,7 @@ def start_grpc_command(
serialisation=serialisation,
dtype=dtype,
max_model_len=max_model_len,
- trust_remote_code=check_bool_env('TRUST_REMOTE_CODE'),
+ trust_remote_code=check_bool_env('TRUST_REMOTE_CODE', False),
)
backend_warning(llm.__llm_backend__)
diff --git a/tools/dependencies.py b/tools/dependencies.py
index dbf24c84..1c44f8dc 100755
--- a/tools/dependencies.py
+++ b/tools/dependencies.py
@@ -152,7 +152,7 @@ OPENAI_DEPS = ['openai[datalib]>=1', 'tiktoken']
AGENTS_DEPS = [f'transformers[agents]>={_TRANSFORMERS_CONSTRAINTS}', 'diffusers', 'soundfile']
PLAYGROUND_DEPS = ['jupyter', 'notebook', 'ipython', 'jupytext', 'nbformat']
GGML_DEPS = ['ctransformers']
-CTRANSLATE_DEPS = ['ctranslate2']
+CTRANSLATE_DEPS = ['ctranslate2>=3.22.0']
AWQ_DEPS = ['autoawq']
GPTQ_DEPS = ['auto-gptq[triton]>=0.4.2']
VLLM_DEPS = ['vllm>=0.2.2']