From 3ab78cd105d7d959d4f7ee0b7190aaf85f7c2f92 Mon Sep 17 00:00:00 2001
From: Aaron Pham <29749331+aarnphm@users.noreply.github.com>
Date: Wed, 13 Dec 2023 09:03:56 -0500
Subject: [PATCH] feat(mixtral): correct support for mixtral (#772)

feat(mixtral): support inference with pt

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
---
 README.md                                     | 69 +++++++++++++++++++
 local.sh                                      |  7 ++
 .../src/openllm_core/config/__init__.py       | 10 +--
 .../openllm_core/config/configuration_auto.py |  8 ++-
 .../config/configuration_mistral.py           |  1 +
 .../config/configuration_mixtral.py           | 58 ++++++++++++++++
 openllm-python/README.md                      | 69 +++++++++++++++++++
 openllm-python/src/openllm/__init__.pyi       |  2 +-
 tools/update-config-stubs.py                  | 16 +++--
 9 files changed, 226 insertions(+), 14 deletions(-)
 create mode 100644 openllm-core/src/openllm_core/config/configuration_mixtral.py

diff --git a/README.md b/README.md
index dcd0b5c0..8b0b9725 100644
--- a/README.md
+++ b/README.md
@@ -724,6 +724,7 @@ You can specify any of the following Mistral models via `openllm start`:
 
 - [HuggingFaceH4/zephyr-7b-alpha](https://huggingface.co/HuggingFaceH4/zephyr-7b-alpha)
 - [HuggingFaceH4/zephyr-7b-beta](https://huggingface.co/HuggingFaceH4/zephyr-7b-beta)
+- [mistralai/Mistral-7B-Instruct-v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2)
 - [mistralai/Mistral-7B-Instruct-v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1)
 - [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1)
 
@@ -765,6 +766,74 @@ openllm start HuggingFaceH4/zephyr-7b-alpha --backend pt
+Mixtral
+
+### Quickstart
+
+Run the following command to quickly spin up a Mixtral server:
+
+```bash
+openllm start mistralai/Mixtral-8x7B-Instruct-v0.1
+```
+
+In a different terminal, run the following command to interact with the server:
+
+```bash
+export OPENLLM_ENDPOINT=http://localhost:3000
+openllm query 'What are large language models?'
+```
+
+> **Note:** Any Mixtral variant can be deployed with OpenLLM. Visit the [HuggingFace Model Hub](https://huggingface.co/models?sort=trending&search=mixtral) to see more Mixtral-compatible models.
+
+### Supported models
+
+You can specify any of the following Mixtral models via `openllm start`:
+
+- [mistralai/Mixtral-8x7B-Instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1)
+- [mistralai/Mixtral-8x7B-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1)
+
+### Supported backends
+
+OpenLLM supports both vLLM and PyTorch backends for Mixtral. By default, it uses vLLM if vLLM is installed and falls back to PyTorch otherwise.
+
+> **Important:** We recommend explicitly specifying `--backend` to choose the backend you want to run the model with. If you have access to a GPU, always use `--backend vllm`.
+
+- vLLM (Recommended):
+
+To install vLLM, run `pip install "openllm[vllm]"`.
+
+```bash
+openllm start mistralai/Mixtral-8x7B-Instruct-v0.1 --backend vllm
+```
+
+> **Important:** Using vLLM requires a GPU with an architecture newer than 8.0 to get the best serving performance. For all production serving use cases, we recommend choosing vLLM.
+
+> **Note:** Currently, adapters are not yet supported with vLLM.
+
+- PyTorch:
+
+```bash
+openllm start mistralai/Mixtral-8x7B-Instruct-v0.1 --backend pt
+```
+
+
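Beyond `openllm query`, the same endpoint can be driven from Python. A minimal sketch follows, assuming this OpenLLM release exposes the `openllm.client.HTTPClient` helper; the client surface is not part of this patch, so adjust the call if your installed version differs.

```python
# Minimal sketch: query the Mixtral server started above from Python.
# Assumptions: the server from `openllm start mistralai/Mixtral-8x7B-Instruct-v0.1`
# is listening on http://localhost:3000, and this OpenLLM release ships
# `openllm.client.HTTPClient` with a `generate` method.
import openllm

client = openllm.client.HTTPClient('http://localhost:3000')
result = client.generate('What are large language models?')
print(result)  # inspect the returned generation object for the produced text
```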
+
 MPT
diff --git a/local.sh b/local.sh
index 974c22fa..e2a6134a 100755
--- a/local.sh
+++ b/local.sh
@@ -82,11 +82,18 @@ done
 
 validate_extensions
 
+# Check if .python-version exists in GIT_ROOT; otherwise symlink .python-version-default to .python-version
+if [ ! -f "$GIT_ROOT/.python-version" ]; then
+  echo "Symlinking .python-version-default to .python-version"
+  ln -s "$GIT_ROOT/.python-version-default" "$GIT_ROOT/.python-version"
+fi
+
 # Check if the EXTENSIONS array is empty
 if [ ${#EXTENSIONS[@]} -eq 0 ]; then
   echo "No extensions specified"
   EXTENSIONS_STR=""
 else
+  echo "Installing extensions: ${EXTENSIONS[*]}"
   EXTENSIONS_STR="[${EXTENSIONS[*]}]"
   EXTENSIONS_STR=${EXTENSIONS_STR// /,} # Replace spaces with commas
 fi
diff --git a/openllm-core/src/openllm_core/config/__init__.py b/openllm-core/src/openllm_core/config/__init__.py
index 2c37949e..29e43b1c 100644
--- a/openllm-core/src/openllm_core/config/__init__.py
+++ b/openllm-core/src/openllm_core/config/__init__.py
@@ -1,10 +1,5 @@
-from __future__ import annotations
-
-from .configuration_auto import (
-  CONFIG_MAPPING as CONFIG_MAPPING,
-  CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES,
-  AutoConfig as AutoConfig,
-)
+# AUTOGENERATED BY update-config-stubs.py. DO NOT EDIT
+from .configuration_auto import CONFIG_MAPPING as CONFIG_MAPPING, CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES, AutoConfig as AutoConfig
 from .configuration_baichuan import BaichuanConfig as BaichuanConfig
 from .configuration_chatglm import ChatGLMConfig as ChatGLMConfig
 from .configuration_dolly_v2 import DollyV2Config as DollyV2Config
@@ -13,6 +8,7 @@ from .configuration_flan_t5 import FlanT5Config as FlanT5Config
 from .configuration_gpt_neox import GPTNeoXConfig as GPTNeoXConfig
 from .configuration_llama import LlamaConfig as LlamaConfig
 from .configuration_mistral import MistralConfig as MistralConfig
+from .configuration_mixtral import MixtralConfig as MixtralConfig
 from .configuration_mpt import MPTConfig as MPTConfig
 from .configuration_opt import OPTConfig as OPTConfig
 from .configuration_phi import PhiConfig as PhiConfig
diff --git a/openllm-core/src/openllm_core/config/configuration_auto.py b/openllm-core/src/openllm_core/config/configuration_auto.py
index 6b9259b4..9a76df84 100644
--- a/openllm-core/src/openllm_core/config/configuration_auto.py
+++ b/openllm-core/src/openllm_core/config/configuration_auto.py
@@ -26,11 +26,11 @@ CONFIG_MAPPING_NAMES = OrderedDict(
   [
     ('flan_t5', 'FlanT5Config'),
     ('baichuan', 'BaichuanConfig'),
-    ('chatglm', 'ChatGLMConfig'),  #
+    ('chatglm', 'ChatGLMConfig'),
     ('falcon', 'FalconConfig'),
     ('gpt_neox', 'GPTNeoXConfig'),
     ('dolly_v2', 'DollyV2Config'),
-    ('stablelm', 'StableLMConfig'),  #
+    ('stablelm', 'StableLMConfig'),
     ('llama', 'LlamaConfig'),
     ('mpt', 'MPTConfig'),
     ('opt', 'OPTConfig'),
@@ -38,6 +38,7 @@ CONFIG_MAPPING_NAMES = OrderedDict(
     ('qwen', 'QwenConfig'),
     ('starcoder', 'StarCoderConfig'),
     ('mistral', 'MistralConfig'),
+    ('mixtral', 'MixtralConfig'),
     ('yi', 'YiConfig'),
   ]
 )
@@ -137,6 +138,9 @@ class AutoConfig:
   def for_model(cls, model_name: t.Literal['mistral'], **attrs: t.Any) -> openllm_core.config.MistralConfig: ...
   @t.overload
   @classmethod
+  def for_model(cls, model_name: t.Literal['mixtral'], **attrs: t.Any) -> openllm_core.config.MixtralConfig: ...
+  @t.overload
+  @classmethod
   def for_model(cls, model_name: t.Literal['mpt'], **attrs: t.Any) -> openllm_core.config.MPTConfig: ...
   @t.overload
   @classmethod
diff --git a/openllm-core/src/openllm_core/config/configuration_mistral.py b/openllm-core/src/openllm_core/config/configuration_mistral.py
index fe8c1bfd..fe8a16d0 100644
--- a/openllm-core/src/openllm_core/config/configuration_mistral.py
+++ b/openllm-core/src/openllm_core/config/configuration_mistral.py
@@ -26,6 +26,7 @@ class MistralConfig(openllm_core.LLMConfig):
     'model_ids': [
       'HuggingFaceH4/zephyr-7b-alpha',
       'HuggingFaceH4/zephyr-7b-beta',
+      'mistralai/Mistral-7B-Instruct-v0.2',
       'mistralai/Mistral-7B-Instruct-v0.1',
       'mistralai/Mistral-7B-v0.1',
     ],
diff --git a/openllm-core/src/openllm_core/config/configuration_mixtral.py b/openllm-core/src/openllm_core/config/configuration_mixtral.py
new file mode 100644
index 00000000..8be032ac
--- /dev/null
+++ b/openllm-core/src/openllm_core/config/configuration_mixtral.py
@@ -0,0 +1,58 @@
+from __future__ import annotations
+
+import openllm_core, typing as t
+
+if t.TYPE_CHECKING:
+  from openllm_core._schemas import MessageParam
+
+SINST_KEY, EINST_KEY, BOS_TOKEN, EOS_TOKEN = '[INST]', '[/INST]', '<s>', '</s>'
+
+
+class MixtralConfig(openllm_core.LLMConfig):
+  """The Mixtral-8x7B Large Language Model (LLM) is a pretrained generative Sparse Mixture of Experts. The Mixtral-8x7B outperforms Llama 2 70B on most benchmarks we tested.
+
+  Refer to [Mixtral's HuggingFace page](https://huggingface.co/docs/transformers/main/model_doc/mixtral)
+  for more information.
+  """
+
+  __config__ = {
+    'name_type': 'lowercase',
+    'url': 'https://mistral.ai',
+    'architecture': 'MixtralForCausalLM',
+    'default_id': 'mistralai/Mixtral-8x7B-Instruct-v0.1',
+    'model_ids': ['mistralai/Mixtral-8x7B-Instruct-v0.1', 'mistralai/Mixtral-8x7B-v0.1'],
+  }
+
+  class GenerationConfig:
+    max_new_tokens: int = 20
+    temperature: float = 0.7
+
+  class SamplingParams:
+    best_of: int = 1
+
+  # NOTE: see https://docs.mistral.ai/usage/guardrailing/ and https://docs.mistral.ai/llm/mistral-instruct-v0.1
+  @property
+  def template(self) -> str:
+    return '''{start_key}{start_inst} {system_message} {instruction} {end_inst}\n'''.format(
+      start_inst=SINST_KEY,
+      end_inst=EINST_KEY,
+      start_key=BOS_TOKEN,
+      system_message='{system_message}',
+      instruction='{instruction}',
+    )
+
+  # NOTE: https://docs.mistral.ai/usage/guardrailing/
+  @property
+  def system_message(self) -> str:
+    return '''Always assist with care, respect, and truth. Respond with utmost utility yet securely. Avoid harmful, unethical, prejudiced, or negative content. Ensure replies promote fairness and positivity.'''
+
+  @property
+  def chat_template(self) -> str:
+    return repr("{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token + ' ' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}")
+
+  @property
+  def chat_messages(self) -> list[MessageParam]:
+    from openllm_core._schemas import MessageParam
+    return [MessageParam(role='user', content='What is your favourite condiment?'),
+            MessageParam(role='assistant', content="Well, I'm quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever I'm cooking up in the kitchen!"),
+            MessageParam(role='user', content='Do you have mayonnaise recipes?')]
diff --git a/openllm-python/README.md b/openllm-python/README.md
index dcd0b5c0..8b0b9725 100644
--- a/openllm-python/README.md
+++ b/openllm-python/README.md
@@ -724,6 +724,7 @@ You can specify any of the following Mistral models via `openllm start`:
 
 - [HuggingFaceH4/zephyr-7b-alpha](https://huggingface.co/HuggingFaceH4/zephyr-7b-alpha)
 - [HuggingFaceH4/zephyr-7b-beta](https://huggingface.co/HuggingFaceH4/zephyr-7b-beta)
+- [mistralai/Mistral-7B-Instruct-v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2)
 - [mistralai/Mistral-7B-Instruct-v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1)
 - [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1)
 
@@ -765,6 +766,74 @@ openllm start HuggingFaceH4/zephyr-7b-alpha --backend pt
+Mixtral
+
+### Quickstart
+
+Run the following command to quickly spin up a Mixtral server:
+
+```bash
+openllm start mistralai/Mixtral-8x7B-Instruct-v0.1
+```
+
+In a different terminal, run the following command to interact with the server:
+
+```bash
+export OPENLLM_ENDPOINT=http://localhost:3000
+openllm query 'What are large language models?'
+```
+
+> **Note:** Any Mixtral variant can be deployed with OpenLLM. Visit the [HuggingFace Model Hub](https://huggingface.co/models?sort=trending&search=mixtral) to see more Mixtral-compatible models.
+
+### Supported models
+
+You can specify any of the following Mixtral models via `openllm start`:
+
+- [mistralai/Mixtral-8x7B-Instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1)
+- [mistralai/Mixtral-8x7B-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1)
+
+### Supported backends
+
+OpenLLM supports both vLLM and PyTorch backends for Mixtral. By default, it uses vLLM if vLLM is installed and falls back to PyTorch otherwise.
+
+> **Important:** We recommend explicitly specifying `--backend` to choose the backend you want to run the model with. If you have access to a GPU, always use `--backend vllm`.
+
+- vLLM (Recommended):
+
+To install vLLM, run `pip install "openllm[vllm]"`.
+
+```bash
+openllm start mistralai/Mixtral-8x7B-Instruct-v0.1 --backend vllm
+```
+
+> **Important:** Using vLLM requires a GPU with an architecture newer than 8.0 to get the best serving performance. For all production serving use cases, we recommend choosing vLLM.
+
+> **Note:** Currently, adapters are not yet supported with vLLM.
+
+- PyTorch:
+
+```bash
+openllm start mistralai/Mixtral-8x7B-Instruct-v0.1 --backend pt
+```
+
+
+
 MPT
diff --git a/openllm-python/src/openllm/__init__.pyi b/openllm-python/src/openllm/__init__.pyi
index 3b966a90..200b0764 100644
--- a/openllm-python/src/openllm/__init__.pyi
+++ b/openllm-python/src/openllm/__init__.pyi
@@ -11,7 +11,7 @@ Fine-tune, serve, deploy, and monitor any LLMs with ease.
 '''
 
 # update-config-stubs.py: import stubs start
-from openllm_core.config import CONFIG_MAPPING as CONFIG_MAPPING, CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES, AutoConfig as AutoConfig, BaichuanConfig as BaichuanConfig, ChatGLMConfig as ChatGLMConfig, DollyV2Config as DollyV2Config, FalconConfig as FalconConfig, FlanT5Config as FlanT5Config, GPTNeoXConfig as GPTNeoXConfig, LlamaConfig as LlamaConfig, MistralConfig as MistralConfig, MPTConfig as MPTConfig, OPTConfig as OPTConfig, PhiConfig as PhiConfig, QwenConfig as QwenConfig, StableLMConfig as StableLMConfig, StarCoderConfig as StarCoderConfig, YiConfig as YiConfig
+from openllm_core.config import CONFIG_MAPPING as CONFIG_MAPPING, CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES, AutoConfig as AutoConfig, BaichuanConfig as BaichuanConfig, ChatGLMConfig as ChatGLMConfig, DollyV2Config as DollyV2Config, FalconConfig as FalconConfig, FlanT5Config as FlanT5Config, GPTNeoXConfig as GPTNeoXConfig, LlamaConfig as LlamaConfig, MistralConfig as MistralConfig, MixtralConfig as MixtralConfig, MPTConfig as MPTConfig, OPTConfig as OPTConfig, PhiConfig as PhiConfig, QwenConfig as QwenConfig, StableLMConfig as StableLMConfig, StarCoderConfig as StarCoderConfig, YiConfig as YiConfig
 # update-config-stubs.py: import stubs stop
 
 from openllm_cli._sdk import (
diff --git a/tools/update-config-stubs.py b/tools/update-config-stubs.py
index e72d99af..6ca318c6 100755
--- a/tools/update-config-stubs.py
+++ b/tools/update-config-stubs.py
@@ -18,11 +18,12 @@ START_IMPORT_STUBS_COMMENT = f'# {os.path.basename(__file__)}: import stubs star
 END_IMPORT_STUBS_COMMENT = f'# {os.path.basename(__file__)}: import stubs stop\n'
 
 ROOT = Path(__file__).parent.parent
-_TARGET_FILE = ROOT / 'openllm-core' / 'src' / 'openllm_core' / '_configuration.py'
-_TARGET_AUTO_FILE = ROOT / 'openllm-core' / 'src' / 'openllm_core' / 'config' / 'configuration_auto.py'
-_TARGET_INIT_FILE = ROOT / 'openllm-python' / 'src' / 'openllm' / '__init__.pyi'
+_TARGET_FILE = ROOT/'openllm-core'/'src'/'openllm_core'/'_configuration.py'
+_TARGET_AUTO_FILE = ROOT/'openllm-core'/'src'/'openllm_core'/'config'/'configuration_auto.py'
+_TARGET_CORE_INIT_FILE = ROOT/'openllm-core'/'src'/'openllm_core'/'config'/'__init__.py'
+_TARGET_INIT_FILE = ROOT/'openllm-python'/'src'/'openllm'/'__init__.pyi'
 
-sys.path.insert(0, (ROOT / 'openllm-core' / 'src').__fspath__())
+sys.path.insert(0, (ROOT/'openllm-core'/'src').__fspath__())
 
 from openllm_core._configuration import GenerationConfig, ModelSettings, SamplingParams
 from openllm_core.config.configuration_auto import CONFIG_MAPPING_NAMES
 from openllm_core.utils import codegen
@@ -217,6 +218,13 @@ def main() -> int:
   processed = processed[:start_import_stubs_idx] + [START_IMPORT_STUBS_COMMENT, lines, END_IMPORT_STUBS_COMMENT] + processed[end_import_stubs_idx + 1 :]
   with _TARGET_INIT_FILE.open('w') as f: f.writelines(processed)
+  lines = [
+    f'# AUTOGENERATED BY {os.path.basename(__file__)}. DO NOT EDIT\n',
+    'from .configuration_auto import CONFIG_MAPPING as CONFIG_MAPPING, CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES, AutoConfig as AutoConfig\n',
+    *[f'from .configuration_{k} import {a} as {a}\n' for k, a in CONFIG_MAPPING_NAMES.items()]
+  ]
+  with _TARGET_CORE_INIT_FILE.open('w') as f: f.writelines(lines)
+
   return 0
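Taken together, the new `('mixtral', 'MixtralConfig')` mapping entry, the `for_model` overload, and the regenerated `openllm_core.config` stubs let the Mixtral config resolve through `AutoConfig` like every other architecture. A minimal sketch of the assumed usage, with `openllm-core` from this branch installed:

```python
# Minimal sketch: resolve the new Mixtral config through AutoConfig.
# Assumes openllm-core from this patch is importable; instantiating with
# defaults is an assumption, while the return type mirrors the overload
# added in configuration_auto.py.
from openllm_core.config import AutoConfig, MixtralConfig

cfg = AutoConfig.for_model('mixtral')  # keyed by the new CONFIG_MAPPING_NAMES entry
assert isinstance(cfg, MixtralConfig)
print(type(cfg).__name__)  # MixtralConfig
```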