From bf28f977bc4a8e48312220196d9d1f66db09ccc0 Mon Sep 17 00:00:00 2001 From: Aaron Pham <29749331+aarnphm@users.noreply.github.com> Date: Sun, 2 Jun 2024 10:16:08 -0400 Subject: [PATCH] feat(models): command-r (#1005) * feat(models): add support for command-r Signed-off-by: paperspace <29749331+aarnphm@users.noreply.github.com> * feat(models): support command-r and remove deadcode and extensions Signed-off-by: paperspace <29749331+aarnphm@users.noreply.github.com> * chore: update local.sh script Signed-off-by: paperspace <29749331+aarnphm@users.noreply.github.com> --------- Signed-off-by: paperspace <29749331+aarnphm@users.noreply.github.com> --- README.md | 356 +++++++----------- hatch.toml | 1 - local.sh | 82 +--- .../src/openllm_core/_typing_compat.py | 4 +- .../src/openllm_core/config/__init__.py | 1 + .../openllm_core/config/configuration_auto.py | 23 +- .../config/configuration_baichuan.py | 70 ++-- .../config/configuration_chatglm.py | 74 ++-- .../config/configuration_commandr.py | 35 ++ .../config/configuration_gpt_neox.py | 54 ++- .../openllm_core/config/configuration_mpt.py | 66 ++-- .../openllm_core/config/configuration_opt.py | 74 ++-- .../openllm_core/config/configuration_qwen.py | 66 ++-- .../config/configuration_starcoder.py | 61 ++- .../openllm_core/config/configuration_yi.py | 62 +-- .../src/openllm_core/utils/__init__.py | 1 - .../src/openllm_core/utils/__init__.pyi | 1 - .../src/openllm_core/utils/import_utils.py | 1 - .../src/openllm_core/utils/import_utils.pyi | 1 - openllm-python/README.md | 356 +++++++----------- openllm-python/pyproject.toml | 37 -- .../src/_openllm_tiny/_entrypoint.py | 2 +- openllm-python/src/_openllm_tiny/_llm.py | 44 ++- openllm-python/src/openllm/__init__.pyi | 2 +- openllm-python/src/openllm/utils.pyi | 1 - tools/dependencies.py | 59 +-- tools/update-config-stubs.py | 1 - tools/update-readme.py | 16 +- 28 files changed, 628 insertions(+), 923 deletions(-) create mode 100644 openllm-core/src/openllm_core/config/configuration_commandr.py diff --git a/README.md b/README.md index 47b81e8a..f93ca3a5 100644 --- a/README.md +++ b/README.md @@ -101,24 +101,16 @@ OpenLLM currently supports the following models. By default, OpenLLM doesn't inc ### Quickstart - - -> **Note:** Baichuan requires to install with: -> ```bash -> pip install "openllm[baichuan]" -> ``` - - Run the following command to quickly spin up a Baichuan server: ```bash -TRUST_REMOTE_CODE=True openllm start baichuan-inc/baichuan-7b +openllm start baichuan-inc/baichuan-7b --trust-remote-code ``` -In a different terminal, run the following command to interact with the server: - -```bash -export OPENLLM_ENDPOINT=http://localhost:3000 -openllm query 'What are large language models?' +You can run the following code in a different terminal to interact with the server: +```python +import openllm_client +client = openllm_client.HTTPClient('http://localhost:3000') +client.generate('What are large language models?') ``` @@ -145,24 +137,16 @@ You can specify any of the following Baichuan models via `openllm start`: ### Quickstart - - -> **Note:** ChatGLM requires to install with: -> ```bash -> pip install "openllm[chatglm]" -> ``` - - Run the following command to quickly spin up a ChatGLM server: ```bash -TRUST_REMOTE_CODE=True openllm start thudm/chatglm-6b +openllm start thudm/chatglm-6b --trust-remote-code ``` -In a different terminal, run the following command to interact with the server: - -```bash -export OPENLLM_ENDPOINT=http://localhost:3000 -openllm query 'What are large language models?' 
+You can run the following code in a different terminal to interact with the server: +```python +import openllm_client +client = openllm_client.HTTPClient('http://localhost:3000') +client.generate('What are large language models?') ``` @@ -186,29 +170,55 @@ You can specify any of the following ChatGLM models via `openllm start`:
+Cohere + + +### Quickstart + +Run the following command to quickly spin up a Cohere server: + +```bash +openllm start CohereForAI/c4ai-command-r-plus --trust-remote-code +``` +You can run the following code in a different terminal to interact with the server: +```python +import openllm_client +client = openllm_client.HTTPClient('http://localhost:3000') +client.generate('What are large language models?') +``` + + +> **Note:** Any Cohere variants can be deployed with OpenLLM. Visit the [HuggingFace Model Hub](https://huggingface.co/models?sort=trending&search=commandr) to see more Cohere-compatible models. + + + +### Supported models + +You can specify any of the following Cohere models via `openllm start`: + + +- [CohereForAI/c4ai-command-r-plus](https://huggingface.co/CohereForAI/c4ai-command-r-plus) +- [CohereForAI/c4ai-command-r-v01](https://huggingface.co/CohereForAI/c4ai-command-r-v01) + +
+ +
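The quickstarts above all talk to the server through `openllm_client.HTTPClient`. As a minimal sketch of how per-request options could be layered on top of that call, the snippet below passes two sampling overrides; the keyword names (`max_new_tokens`, `temperature`) mirror the generation defaults declared elsewhere in this patch and are an assumption about what `HTTPClient.generate` forwards to the server, not a documented guarantee.

```python
import openllm_client

client = openllm_client.HTTPClient('http://localhost:3000')

# Per-request sampling overrides. The keyword names mirror GenerationConfig
# fields from this patch and are assumptions about what HTTPClient.generate
# forwards to the running server.
response = client.generate(
  'What are large language models?',
  max_new_tokens=256,
  temperature=0.6,
)
print(response)
```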
+ Dbrx ### Quickstart - - -> **Note:** Dbrx requires to install with: -> ```bash -> pip install "openllm[dbrx]" -> ``` - - Run the following command to quickly spin up a Dbrx server: ```bash -TRUST_REMOTE_CODE=True openllm start databricks/dbrx-instruct +openllm start databricks/dbrx-instruct --trust-remote-code ``` -In a different terminal, run the following command to interact with the server: - -```bash -export OPENLLM_ENDPOINT=http://localhost:3000 -openllm query 'What are large language models?' +You can run the following code in a different terminal to interact with the server: +```python +import openllm_client +client = openllm_client.HTTPClient('http://localhost:3000') +client.generate('What are large language models?') ``` @@ -236,13 +246,13 @@ You can specify any of the following Dbrx models via `openllm start`: Run the following command to quickly spin up a DollyV2 server: ```bash -TRUST_REMOTE_CODE=True openllm start databricks/dolly-v2-3b +openllm start databricks/dolly-v2-3b --trust-remote-code ``` -In a different terminal, run the following command to interact with the server: - -```bash -export OPENLLM_ENDPOINT=http://localhost:3000 -openllm query 'What are large language models?' +You can run the following code in a different terminal to interact with the server: +```python +import openllm_client +client = openllm_client.HTTPClient('http://localhost:3000') +client.generate('What are large language models?') ``` @@ -268,24 +278,16 @@ You can specify any of the following DollyV2 models via `openllm start`: ### Quickstart - - -> **Note:** Falcon requires to install with: -> ```bash -> pip install "openllm[falcon]" -> ``` - - Run the following command to quickly spin up a Falcon server: ```bash -TRUST_REMOTE_CODE=True openllm start tiiuae/falcon-7b +openllm start tiiuae/falcon-7b --trust-remote-code ``` -In a different terminal, run the following command to interact with the server: - -```bash -export OPENLLM_ENDPOINT=http://localhost:3000 -openllm query 'What are large language models?' +You can run the following code in a different terminal to interact with the server: +```python +import openllm_client +client = openllm_client.HTTPClient('http://localhost:3000') +client.generate('What are large language models?') ``` @@ -312,24 +314,16 @@ You can specify any of the following Falcon models via `openllm start`: ### Quickstart - - -> **Note:** Gemma requires to install with: -> ```bash -> pip install "openllm[gemma]" -> ``` - - Run the following command to quickly spin up a Gemma server: ```bash -TRUST_REMOTE_CODE=True openllm start google/gemma-7b +openllm start google/gemma-7b --trust-remote-code ``` -In a different terminal, run the following command to interact with the server: - -```bash -export OPENLLM_ENDPOINT=http://localhost:3000 -openllm query 'What are large language models?' 
+You can run the following code in a different terminal to interact with the server: +```python +import openllm_client +client = openllm_client.HTTPClient('http://localhost:3000') +client.generate('What are large language models?') ``` @@ -359,13 +353,13 @@ You can specify any of the following Gemma models via `openllm start`: Run the following command to quickly spin up a GPTNeoX server: ```bash -TRUST_REMOTE_CODE=True openllm start eleutherai/gpt-neox-20b +openllm start eleutherai/gpt-neox-20b --trust-remote-code ``` -In a different terminal, run the following command to interact with the server: - -```bash -export OPENLLM_ENDPOINT=http://localhost:3000 -openllm query 'What are large language models?' +You can run the following code in a different terminal to interact with the server: +```python +import openllm_client +client = openllm_client.HTTPClient('http://localhost:3000') +client.generate('What are large language models?') ``` @@ -389,24 +383,16 @@ You can specify any of the following GPTNeoX models via `openllm start`: ### Quickstart - - -> **Note:** Llama requires to install with: -> ```bash -> pip install "openllm[llama]" -> ``` - - Run the following command to quickly spin up a Llama server: ```bash -TRUST_REMOTE_CODE=True openllm start NousResearch/llama-2-7b-hf +openllm start NousResearch/llama-2-7b-hf --trust-remote-code ``` -In a different terminal, run the following command to interact with the server: - -```bash -export OPENLLM_ENDPOINT=http://localhost:3000 -openllm query 'What are large language models?' +You can run the following code in a different terminal to interact with the server: +```python +import openllm_client +client = openllm_client.HTTPClient('http://localhost:3000') +client.generate('What are large language models?') ``` @@ -441,24 +427,16 @@ You can specify any of the following Llama models via `openllm start`: ### Quickstart - - -> **Note:** Mistral requires to install with: -> ```bash -> pip install "openllm[mistral]" -> ``` - - Run the following command to quickly spin up a Mistral server: ```bash -TRUST_REMOTE_CODE=True openllm start mistralai/Mistral-7B-Instruct-v0.1 +openllm start mistralai/Mistral-7B-Instruct-v0.1 --trust-remote-code ``` -In a different terminal, run the following command to interact with the server: - -```bash -export OPENLLM_ENDPOINT=http://localhost:3000 -openllm query 'What are large language models?' +You can run the following code in a different terminal to interact with the server: +```python +import openllm_client +client = openllm_client.HTTPClient('http://localhost:3000') +client.generate('What are large language models?') ``` @@ -486,24 +464,16 @@ You can specify any of the following Mistral models via `openllm start`: ### Quickstart - - -> **Note:** Mixtral requires to install with: -> ```bash -> pip install "openllm[mixtral]" -> ``` - - Run the following command to quickly spin up a Mixtral server: ```bash -TRUST_REMOTE_CODE=True openllm start mistralai/Mixtral-8x7B-Instruct-v0.1 +openllm start mistralai/Mixtral-8x7B-Instruct-v0.1 --trust-remote-code ``` -In a different terminal, run the following command to interact with the server: - -```bash -export OPENLLM_ENDPOINT=http://localhost:3000 -openllm query 'What are large language models?' 
+You can run the following code in a different terminal to interact with the server: +```python +import openllm_client +client = openllm_client.HTTPClient('http://localhost:3000') +client.generate('What are large language models?') ``` @@ -528,24 +498,16 @@ You can specify any of the following Mixtral models via `openllm start`: ### Quickstart - - -> **Note:** MPT requires to install with: -> ```bash -> pip install "openllm[mpt]" -> ``` - - Run the following command to quickly spin up a MPT server: ```bash -TRUST_REMOTE_CODE=True openllm start mosaicml/mpt-7b-instruct +openllm start mosaicml/mpt-7b-instruct --trust-remote-code ``` -In a different terminal, run the following command to interact with the server: - -```bash -export OPENLLM_ENDPOINT=http://localhost:3000 -openllm query 'What are large language models?' +You can run the following code in a different terminal to interact with the server: +```python +import openllm_client +client = openllm_client.HTTPClient('http://localhost:3000') +client.generate('What are large language models?') ``` @@ -575,24 +537,16 @@ You can specify any of the following MPT models via `openllm start`: ### Quickstart - - -> **Note:** OPT requires to install with: -> ```bash -> pip install "openllm[opt]" -> ``` - - Run the following command to quickly spin up a OPT server: ```bash openllm start facebook/opt-1.3b ``` -In a different terminal, run the following command to interact with the server: - -```bash -export OPENLLM_ENDPOINT=http://localhost:3000 -openllm query 'What are large language models?' +You can run the following code in a different terminal to interact with the server: +```python +import openllm_client +client = openllm_client.HTTPClient('http://localhost:3000') +client.generate('What are large language models?') ``` @@ -621,24 +575,16 @@ You can specify any of the following OPT models via `openllm start`: ### Quickstart - - -> **Note:** Phi requires to install with: -> ```bash -> pip install "openllm[phi]" -> ``` - - Run the following command to quickly spin up a Phi server: ```bash -TRUST_REMOTE_CODE=True openllm start microsoft/Phi-3-mini-4k-instruct +openllm start microsoft/Phi-3-mini-4k-instruct --trust-remote-code ``` -In a different terminal, run the following command to interact with the server: - -```bash -export OPENLLM_ENDPOINT=http://localhost:3000 -openllm query 'What are large language models?' +You can run the following code in a different terminal to interact with the server: +```python +import openllm_client +client = openllm_client.HTTPClient('http://localhost:3000') +client.generate('What are large language models?') ``` @@ -667,24 +613,16 @@ You can specify any of the following Phi models via `openllm start`: ### Quickstart - - -> **Note:** Qwen requires to install with: -> ```bash -> pip install "openllm[qwen]" -> ``` - - Run the following command to quickly spin up a Qwen server: ```bash -TRUST_REMOTE_CODE=True openllm start qwen/Qwen-7B-Chat +openllm start qwen/Qwen-7B-Chat --trust-remote-code ``` -In a different terminal, run the following command to interact with the server: - -```bash -export OPENLLM_ENDPOINT=http://localhost:3000 -openllm query 'What are large language models?' 
+You can run the following code in a different terminal to interact with the server: +```python +import openllm_client +client = openllm_client.HTTPClient('http://localhost:3000') +client.generate('What are large language models?') ``` @@ -713,24 +651,16 @@ You can specify any of the following Qwen models via `openllm start`: ### Quickstart - - -> **Note:** StableLM requires to install with: -> ```bash -> pip install "openllm[stablelm]" -> ``` - - Run the following command to quickly spin up a StableLM server: ```bash -TRUST_REMOTE_CODE=True openllm start stabilityai/stablelm-tuned-alpha-3b +openllm start stabilityai/stablelm-tuned-alpha-3b --trust-remote-code ``` -In a different terminal, run the following command to interact with the server: - -```bash -export OPENLLM_ENDPOINT=http://localhost:3000 -openllm query 'What are large language models?' +You can run the following code in a different terminal to interact with the server: +```python +import openllm_client +client = openllm_client.HTTPClient('http://localhost:3000') +client.generate('What are large language models?') ``` @@ -757,24 +687,16 @@ You can specify any of the following StableLM models via `openllm start`: ### Quickstart - - -> **Note:** StarCoder requires to install with: -> ```bash -> pip install "openllm[starcoder]" -> ``` - - Run the following command to quickly spin up a StarCoder server: ```bash -TRUST_REMOTE_CODE=True openllm start bigcode/starcoder +openllm start bigcode/starcoder --trust-remote-code ``` -In a different terminal, run the following command to interact with the server: - -```bash -export OPENLLM_ENDPOINT=http://localhost:3000 -openllm query 'What are large language models?' +You can run the following code in a different terminal to interact with the server: +```python +import openllm_client +client = openllm_client.HTTPClient('http://localhost:3000') +client.generate('What are large language models?') ``` @@ -799,24 +721,16 @@ You can specify any of the following StarCoder models via `openllm start`: ### Quickstart - - -> **Note:** Yi requires to install with: -> ```bash -> pip install "openllm[yi]" -> ``` - - Run the following command to quickly spin up a Yi server: ```bash -TRUST_REMOTE_CODE=True openllm start 01-ai/Yi-6B +openllm start 01-ai/Yi-6B --trust-remote-code ``` -In a different terminal, run the following command to interact with the server: - -```bash -export OPENLLM_ENDPOINT=http://localhost:3000 -openllm query 'What are large language models?' +You can run the following code in a different terminal to interact with the server: +```python +import openllm_client +client = openllm_client.HTTPClient('http://localhost:3000') +client.generate('What are large language models?') ``` diff --git a/hatch.toml b/hatch.toml index 53ada4dd..d97fd289 100644 --- a/hatch.toml +++ b/hatch.toml @@ -9,7 +9,6 @@ dependencies = [ "pre-commit", # NOTE: towncrier for changelog "towncrier", - # NOTE: Using under ./tools/update-optional-dependencies.py "tomlkit", # NOTE: For fancy PyPI readme "hatch-fancy-pypi-readme", diff --git a/local.sh b/local.sh index 28d85d2c..9ae21d5d 100755 --- a/local.sh +++ b/local.sh @@ -6,7 +6,7 @@ GIT_ROOT=$(git rev-parse --show-toplevel) cd "$GIT_ROOT" || exit 1 # check if uv is installed -if ! command -v uv > /dev/null 2>&1; then +if ! command -v uv >/dev/null 2>&1; then echo "Installing uv..." curl -LsSf https://astral.sh/uv/install.sh | sh fi @@ -26,90 +26,30 @@ fi . 
"$GIT_ROOT/.venv/bin/activate" print_usage() { - echo "Usage: $0 [OPTIONS]" - echo "Options:" - echo " -e, -E, --ext Specify extensions for OpenLLM. Can be used multiple times or as a comma-separated list." - echo " Example: $0 -e ext1,ext2" - echo " Example: $0 --ext ext1 --ext ext2" - echo "" - echo "This script installs various components with optional extensions." + echo "Usage: $0" } -split_csv() { - local IFS=',' - read -ra ADDR <<< "$1" - for i in "${ADDR[@]}"; do - EXTENSIONS+=("$i") - done -} - -# Function to validate extensions -validate_extensions() { - uv pip install tomlkit pre-commit mypy - local valid_extensions - valid_extensions=$(python -c " -import tomlkit - -with open('$GIT_ROOT/openllm-python/pyproject.toml', 'r') as file: - data = tomlkit.load(file) - optional_dependencies = data['project']['optional-dependencies'] - print(' '.join(optional_dependencies.keys())) - ") - - COMMENT="[${valid_extensions[*]}]" - COMMENT=${COMMENT// /,} # Replace spaces with commas - for ext in "${EXTENSIONS[@]}"; do - if ! [[ $valid_extensions =~ (^|[[:space:]])$ext($|[[:space:]]) ]]; then - echo "Invalid extension: $ext. Available extensions are: $COMMENT" - exit 1 - fi - done -} - -EXTENSIONS=() - # Parse command line arguments while [[ "$#" -gt 0 ]]; do case $1 in - --extensions|-e|-E|--ext) - if [[ -n $2 && $2 != -* ]]; then - split_csv "$2" - shift - else - print_usage - exit 1 - fi - ;; - --help|-h) - print_usage - exit 0 - ;; - *) - print_usage - exit 1 - ;; + --help | -h) + print_usage + exit 0 + ;; + *) + print_usage + exit 1 + ;; esac shift done -validate_extensions - -# Check if the EXTENSIONS array is empty -if [ ${#EXTENSIONS[@]} -eq 0 ]; then - echo "No extensions specified" - EXTENSIONS_STR="" -else - echo "Installing extensions: ${EXTENSIONS[*]}" - EXTENSIONS_STR="[${EXTENSIONS[*]}]" - EXTENSIONS_STR=${EXTENSIONS_STR// /,} # Replace spaces with commas -fi - PRERELEASE=${PRERELEASE:-false} ARGS=() [[ "${PRERELEASE}" == "true" ]] && ARGS+=("--prerelease=allow") -uv pip install "${ARGS[@]}" --editable "$GIT_ROOT/openllm-python$EXTENSIONS_STR" +uv pip install "${ARGS[@]}" --editable "$GIT_ROOT/openllm-python" uv pip install "${ARGS[@]}" --editable "$GIT_ROOT/openllm-client" uv pip install "${ARGS[@]}" --editable "$GIT_ROOT/openllm-core" diff --git a/openllm-core/src/openllm_core/_typing_compat.py b/openllm-core/src/openllm_core/_typing_compat.py index daca29c6..95c677eb 100644 --- a/openllm-core/src/openllm_core/_typing_compat.py +++ b/openllm-core/src/openllm_core/_typing_compat.py @@ -18,8 +18,8 @@ ListStr = List[str] At = TypeVar('At', bound=attr.AttrsInstance) LiteralDtype = Literal['float16', 'float32', 'bfloat16', 'int8', 'int16'] LiteralSerialisation = Literal['safetensors', 'legacy'] -LiteralQuantise = Literal['int8', 'int4', 'gptq', 'awq', 'squeezellm'] -LiteralBackend = Literal['pt', 'vllm', 'triton', 'ggml'] # TODO: ggml +LiteralQuantise = Literal['aqlm', 'fp8', 'gptq', 'awq', 'squeezellm', 'gptq_marlin', 'marlin'] +LiteralBackend = Literal['pt', 'vllm'] # TODO: ggml AdapterType = Literal[ 'lora', 'adalora', 'adaption_prompt', 'prefix_tuning', 'p_tuning', 'prompt_tuning', 'ia3', 'loha', 'lokr' ] diff --git a/openllm-core/src/openllm_core/config/__init__.py b/openllm-core/src/openllm_core/config/__init__.py index 50bd7f76..f1c9e1a2 100644 --- a/openllm-core/src/openllm_core/config/__init__.py +++ b/openllm-core/src/openllm_core/config/__init__.py @@ -3,6 +3,7 @@ from .configuration_auto import CONFIG_MAPPING as CONFIG_MAPPING, CONFIG_MAPPING_NAMES as 
CONFIG_MAPPING_NAMES, AutoConfig as AutoConfig from .configuration_baichuan import BaichuanConfig as BaichuanConfig from .configuration_chatglm import ChatGLMConfig as ChatGLMConfig +from .configuration_commandr import CohereConfig as CohereConfig from .configuration_dbrx import DbrxConfig as DbrxConfig from .configuration_dolly_v2 import DollyV2Config as DollyV2Config from .configuration_falcon import FalconConfig as FalconConfig diff --git a/openllm-core/src/openllm_core/config/configuration_auto.py b/openllm-core/src/openllm_core/config/configuration_auto.py index 86468f1c..7f28c945 100644 --- a/openllm-core/src/openllm_core/config/configuration_auto.py +++ b/openllm-core/src/openllm_core/config/configuration_auto.py @@ -2,12 +2,10 @@ from __future__ import annotations import importlib, typing as t, inflection from collections import OrderedDict -from ..exceptions import MissingDependencyError -from ..utils import ReprMixin, is_bentoml_available +from ..utils import ReprMixin if t.TYPE_CHECKING: import types - from bentoml import Model from collections import _odict_items, _odict_keys, _odict_values import openllm, openllm_core @@ -23,6 +21,7 @@ else: ModelType: t.TypeAlias = t.Literal[ 'baichuan', 'chatglm', + 'commandr', 'falcon', 'gemma', 'gpt_neox', @@ -45,6 +44,7 @@ CONFIG_MAPPING_NAMES: OrderedDict[ModelType, str] = OrderedDict( sorted([ ('baichuan', 'BaichuanConfig'), ('chatglm', 'ChatGLMConfig'), + ('commandr', 'CohereConfig'), ('falcon', 'FalconConfig'), ('gpt_neox', 'GPTNeoXConfig'), ('gemma', 'GemmaConfig'), @@ -134,6 +134,9 @@ class AutoConfig: def for_model(cls, model_name: t.Literal['chatglm'], **attrs: t.Any) -> openllm_core.config.ChatGLMConfig: ... @t.overload @classmethod + def for_model(cls, model_name: t.Literal['commandr'], **attrs: t.Any) -> openllm_core.config.CohereConfig: ... + @t.overload + @classmethod def for_model(cls, model_name: t.Literal['dbrx'], **attrs: t.Any) -> openllm_core.config.DbrxConfig: ... @t.overload @classmethod @@ -197,23 +200,11 @@ class AutoConfig: def from_llm(cls, llm: openllm.LLM, **attrs: t.Any) -> openllm_core.LLMConfig: config_cls = llm.config.__class__.__name__ if config_cls in CONFIG_TO_ALIAS_NAMES: - return cls.for_model(CONFIG_TO_ALIAS_NAMES[config_cls]).model_construct_env(**attrs) + return cls.from_id(llm.model_id, trust_remote_code=llm.trust_remote_code, **attrs) raise ValueError( f"Failed to determine config class for '{llm.model_id}'. Make sure {llm.model_id} is saved with openllm." ) - @classmethod - def from_bentomodel(cls, bentomodel: Model, **attrs: t.Any) -> openllm_core.LLMConfig: - if not is_bentoml_available(): - raise MissingDependencyError("Requires 'bentoml' to be available. Do 'pip install bentoml'") - - arch = bentomodel.info.metadata['architectures'][0] - if arch in cls._architecture_mappings: - return cls.for_model(cls._architecture_mappings[arch]).model_construct_env(**attrs) - raise ValueError( - f"Failed to determine config class for '{bentomodel.name}'. Make sure {bentomodel.name} is saved with openllm." 
- ) - @classmethod def from_id(cls, model_id: str, *, trust_remote_code: bool = False, **attrs: t.Any) -> openllm_core.LLMConfig: import transformers diff --git a/openllm-core/src/openllm_core/config/configuration_baichuan.py b/openllm-core/src/openllm_core/config/configuration_baichuan.py index c1a37e9f..8476f344 100644 --- a/openllm-core/src/openllm_core/config/configuration_baichuan.py +++ b/openllm-core/src/openllm_core/config/configuration_baichuan.py @@ -4,42 +4,42 @@ import openllm_core, pydantic from openllm_core._configuration import ModelSettings -docs = """\ -Baichuan-7B is an open-source, large-scale pre-trained language model developed by Baichuan Intelligent Technology. +class BaichuanConfig(openllm_core.LLMConfig): + """Baichuan-7B is an open-source, large-scale pre-trained language model developed by Baichuan Intelligent Technology. -Baichuan-7B is based on Transformer architecture, -which contains 7 billion parameters and trained on approximately 1.2 trillion tokens. -It supports both Chinese and English languages with a context window length of 4096. -It has achieved the best performance among models of the same size on standard Chinese -and English benchmarks (C-Eval, MMLU, etc). -Refer to [Baichuan-7B's GitHub page](https://github.com/baichuan-inc/Baichuan-7B) for more information. -""" + Baichuan-7B is based on Transformer architecture, + which contains 7 billion parameters and trained on approximately 1.2 trillion tokens. + It supports both Chinese and English languages with a context window length of 4096. + It has achieved the best performance among models of the same size on standard Chinese + and English benchmarks (C-Eval, MMLU, etc). + Refer to [Baichuan-7B's GitHub page](https://github.com/baichuan-inc/Baichuan-7B) for more information. 
+ """ -metadata_config: ModelSettings = { - 'trust_remote_code': True, - 'timeout': 3600000, - 'url': 'https://github.com/baichuan-inc/Baichuan-7B', - 'requirements': ['cpm-kernels'], - 'architecture': 'BaichuanForCausalLM', - # NOTE: See the following - # https://huggingface.co/baichuan-inc/Baichuan-13B-Chat/blob/19ef51ba5bad8935b03acd20ff04a269210983bc/modeling_baichuan.py#L555 - # https://huggingface.co/baichuan-inc/Baichuan-13B-Chat/blob/main/generation_config.json - # https://github.com/baichuan-inc/Baichuan-13B/issues/25 - 'default_id': 'baichuan-inc/baichuan-7b', - 'model_ids': [ - 'baichuan-inc/baichuan2-7b-base', - 'baichuan-inc/baichuan2-7b-chat', - 'baichuan-inc/baichuan2-13b-base', - 'baichuan-inc/baichuan2-13b-chat', - ], -} + model_config = pydantic.ConfigDict(extra='forbid', protected_namespaces=()) -generation_config = openllm_core.GenerationConfig.model_construct(max_new_tokens=2048, top_p=0.7, temperature=0.95) + metadata_config: ModelSettings = pydantic.Field( + default={ + 'trust_remote_code': True, + 'timeout': 3600000, + 'url': 'https://github.com/baichuan-inc/Baichuan-7B', + 'requirements': ['cpm-kernels'], + 'architecture': 'BaichuanForCausalLM', + # NOTE: See the following + # https://huggingface.co/baichuan-inc/Baichuan-13B-Chat/blob/19ef51ba5bad8935b03acd20ff04a269210983bc/modeling_baichuan.py#L555 + # https://huggingface.co/baichuan-inc/Baichuan-13B-Chat/blob/main/generation_config.json + # https://github.com/baichuan-inc/Baichuan-13B/issues/25 + 'default_id': 'baichuan-inc/baichuan-7b', + 'model_ids': [ + 'baichuan-inc/baichuan2-7b-base', + 'baichuan-inc/baichuan2-7b-chat', + 'baichuan-inc/baichuan2-13b-base', + 'baichuan-inc/baichuan2-13b-chat', + ], + }, + repr=False, + exclude=True, + ) -BaichuanConfig = pydantic.create_model( - 'BaichuanConfig', - __doc__=docs, - __base__=(openllm_core.LLMConfig,), - metadata_config=(ModelSettings, pydantic.Field(default=metadata_config, repr=False, exclude=True)), - generation_config=(openllm_core.GenerationConfig, pydantic.Field(default=generation_config)), -) + generation_config: openllm_core.GenerationConfig = pydantic.Field( + default=openllm_core.GenerationConfig.model_construct(max_new_tokens=2048, top_p=0.7, temperature=0.95) + ) diff --git a/openllm-core/src/openllm_core/config/configuration_chatglm.py b/openllm-core/src/openllm_core/config/configuration_chatglm.py index 81fa29d0..0feb7b70 100644 --- a/openllm-core/src/openllm_core/config/configuration_chatglm.py +++ b/openllm-core/src/openllm_core/config/configuration_chatglm.py @@ -4,46 +4,46 @@ import openllm_core, pydantic from openllm_core._configuration import ModelSettings -docs = """\ -ChatGLM is an open bilingual language model based on [General Language Model (GLM)](https://github.com/THUDM/GLM) framework. +class ChatGLMConfig(openllm_core.LLMConfig): + """ChatGLM is an open bilingual language model based on [General Language Model (GLM)](https://github.com/THUDM/GLM) framework. -With the quantization technique, users can deploy locally on consumer-grade graphics cards -(only 6GB of GPU memory is required at the INT4 quantization level). + With the quantization technique, users can deploy locally on consumer-grade graphics cards + (only 6GB of GPU memory is required at the INT4 quantization level). -ChatGLM-6B uses technology similar to ChatGPT, optimized for Chinese QA and dialogue. 
-The model is trained for about 1T tokens of Chinese and English corpus, supplemented by supervised fine-tuning,
-feedback bootstrap, and reinforcement learning wit human feedback.
-With only about 6.2 billion parameters, the model is able to generate answers that are in line
-with human preference.
+  ChatGLM-6B uses technology similar to ChatGPT, optimized for Chinese QA and dialogue.
+  The model is trained for about 1T tokens of Chinese and English corpus, supplemented by supervised fine-tuning,
+  feedback bootstrap, and reinforcement learning with human feedback.
+  With only about 6.2 billion parameters, the model is able to generate answers that are in line
+  with human preference.
-Refer to [ChatGLM's GitHub page](https://github.com/THUDM/ChatGLM-6B) for more information.
-"""
+  Refer to [ChatGLM's GitHub page](https://github.com/THUDM/ChatGLM-6B) for more information.
+  """
-metadata_config: ModelSettings = {
-  'trust_remote_code': True,
-  'timeout': 3600000,
-  'url': 'https://github.com/THUDM/ChatGLM-6B',
-  'requirements': ['cpm-kernels'],
-  'architecture': 'ChatGLMModel',
-  'default_id': 'thudm/chatglm-6b',
-  'model_ids': [
-    'thudm/chatglm-6b',
-    'thudm/chatglm-6b-int8',
-    'thudm/chatglm-6b-int4',
-    'thudm/chatglm2-6b',
-    'thudm/chatglm2-6b-int4',
-    'thudm/chatglm3-6b',
-  ],
-}
+  model_config = pydantic.ConfigDict(extra='forbid', protected_namespaces=())
-generation_config = openllm_core.GenerationConfig.model_construct(
-  max_new_tokens=2048, num_beams=1, top_p=0.7, temperature=0.95
-)
+  metadata_config: ModelSettings = pydantic.Field(
+    default={
+      'trust_remote_code': True,
+      'timeout': 3600000,
+      'url': 'https://github.com/THUDM/ChatGLM-6B',
+      'requirements': ['cpm-kernels'],
+      'architecture': 'ChatGLMModel',
+      'default_id': 'thudm/chatglm-6b',
+      'model_ids': [
+        'thudm/chatglm-6b',
+        'thudm/chatglm-6b-int8',
+        'thudm/chatglm-6b-int4',
+        'thudm/chatglm2-6b',
+        'thudm/chatglm2-6b-int4',
+        'thudm/chatglm3-6b',
+      ],
+    },
+    repr=False,
+    exclude=True,
+  )
-ChatGLMConfig = pydantic.create_model(
-  'ChatGLMConfig',
-  __doc__=docs,
-  __base__=(openllm_core.LLMConfig,),
-  metadata_config=(ModelSettings, pydantic.Field(default=metadata_config, repr=False, exclude=True)),
-  generation_config=(openllm_core.GenerationConfig, pydantic.Field(default=generation_config)),
-)
+  generation_config: openllm_core.GenerationConfig = pydantic.Field(
+    default=openllm_core.GenerationConfig.model_construct(
+      max_new_tokens=2048, num_beams=1, top_p=0.7, temperature=0.95
+    )
+  )
diff --git a/openllm-core/src/openllm_core/config/configuration_commandr.py b/openllm-core/src/openllm_core/config/configuration_commandr.py
new file mode 100644
index 00000000..9244d120
--- /dev/null
+++ b/openllm-core/src/openllm_core/config/configuration_commandr.py
@@ -0,0 +1,35 @@
+from __future__ import annotations
+
+import openllm_core, pydantic
+from openllm_core._configuration import ModelSettings
+
+
+class CohereConfig(openllm_core.LLMConfig):
+  """C4AI Command R+ is an open weights research release of a 104 billion parameter model with highly
+  advanced capabilities, including Retrieval Augmented Generation (RAG) and tool use to
+  automate sophisticated tasks.
+
+  Refer to [CohereForAI's org card](https://huggingface.co/CohereForAI) for more information on Command-R
+  and Command R+.
+ """ + + model_config = pydantic.ConfigDict(extra='forbid', protected_namespaces=()) + + metadata_config: ModelSettings = pydantic.Field( + default={ + 'name_type': 'lowercase', + 'url': 'https://huggingface.co/CohereForAI', + 'architecture': 'CohereForCausalLM', + 'default_id': 'CohereForAI/c4ai-command-r-plus', + 'serialisation': 'safetensors', + 'model_ids': ['CohereForAI/c4ai-command-r-plus', 'CohereForAI/c4ai-command-r-v01'], + }, + repr=False, + exclude=True, + ) + + generation_config: openllm_core.GenerationConfig = pydantic.Field( + default=openllm_core.GenerationConfig.model_construct( + max_new_tokens=128, temperature=0.6, top_p=0.9, top_k=12, best_of=1, presence_penalty=0.5 + ) + ) diff --git a/openllm-core/src/openllm_core/config/configuration_gpt_neox.py b/openllm-core/src/openllm_core/config/configuration_gpt_neox.py index 0bd1e872..b4446183 100644 --- a/openllm-core/src/openllm_core/config/configuration_gpt_neox.py +++ b/openllm-core/src/openllm_core/config/configuration_gpt_neox.py @@ -4,38 +4,36 @@ import openllm_core, pydantic from openllm_core._configuration import ModelSettings -docs = """\ -GPTNeoX is an autoregressive language model trained on the Pile, whose weights will be made freely and openly available to the public through a permissive license. +class GPTNeoXConfig(openllm_core.LLMConfig): + """GPTNeoX is an autoregressive language model trained on the Pile, whose weights will be made freely and openly available to the public through a permissive license. -It is, to the best of our knowledge, the largest dense autoregressive model -that has publicly available weights at the time of submission. The training and evaluation code, as well as the model weights, -can be found at https://github.com/EleutherAI/gpt-neox. + It is, to the best of our knowledge, the largest dense autoregressive model + that has publicly available weights at the time of submission. The training and evaluation code, as well as the model weights, + can be found at https://github.com/EleutherAI/gpt-neox. -GPTNeoX has been used to fine-tune on various models, such as Dolly, StableLM, and Pythia. + GPTNeoX has been used to fine-tune on various models, such as Dolly, StableLM, and Pythia. -Note that OpenLLM provides first-class support for all of the aforementioned model. Users can -also use `openllm start gpt-neox` to run all of the GPTNeoX variant's model + Note that OpenLLM provides first-class support for all of the aforementioned model. Users can + also use `openllm start gpt-neox` to run all of the GPTNeoX variant's model -Refer to [GPTNeoX's model card](https://huggingface.co/docs/transformers/model_doc/gpt_neox) -for more information. -""" + Refer to [GPTNeoX's model card](https://huggingface.co/docs/transformers/model_doc/gpt_neox) + for more information. 
+ """ -metadata_config: ModelSettings = { - 'architecture': 'GPTNeoXForCausalLM', - # NOTE: See https://huggingface.co/togethercomputer/GPT-NeoXT-Chat-Base-20B - 'url': 'https://github.com/EleutherAI/gpt-neox', - 'default_id': 'eleutherai/gpt-neox-20b', - 'model_ids': ['eleutherai/gpt-neox-20b'], -} + model_config = pydantic.ConfigDict(extra='forbid', protected_namespaces=()) -generation_config: openllm_core.GenerationConfig = openllm_core.GenerationConfig.model_construct( - temperature=0.9, max_new_tokens=100 -) + metadata_config: ModelSettings = pydantic.Field( + default={ + 'architecture': 'GPTNeoXForCausalLM', + # NOTE: See https://huggingface.co/togethercomputer/GPT-NeoXT-Chat-Base-20B + 'url': 'https://github.com/EleutherAI/gpt-neox', + 'default_id': 'eleutherai/gpt-neox-20b', + 'model_ids': ['eleutherai/gpt-neox-20b'], + }, + repr=False, + exclude=True, + ) -GPTNeoXConfig = pydantic.create_model( - 'GPTNeoXConfig', - __doc__=docs, - __base__=(openllm_core.LLMConfig,), - metadata_config=(ModelSettings, pydantic.Field(default=metadata_config, repr=False, exclude=True)), - generation_config=(openllm_core.GenerationConfig, pydantic.Field(default=generation_config)), -) + generation_config: openllm_core.GenerationConfig = pydantic.Field( + default=openllm_core.GenerationConfig.model_construct(temperature=0.9, max_new_tokens=100) + ) diff --git a/openllm-core/src/openllm_core/config/configuration_mpt.py b/openllm-core/src/openllm_core/config/configuration_mpt.py index fc88f9da..f74d3d81 100644 --- a/openllm-core/src/openllm_core/config/configuration_mpt.py +++ b/openllm-core/src/openllm_core/config/configuration_mpt.py @@ -4,43 +4,39 @@ import openllm_core, pydantic from openllm_core._configuration import ModelSettings -docs = """\ -MPT is a decoder-style transformer pretrained from scratch on English text and code. +class MPTConfig(openllm_core.LLMConfig): + """MPT is a decoder-style transformer pretrained from scratch on English text and code. -This model was trained by [MosaicML](https://www.mosaicml.com/). + This model was trained by [MosaicML](https://www.mosaicml.com/). -``openllm.MPT`` encapsulate a family of MPT variants that is publicly available -on HuggingFace. Refers [HuggingFace's MosaicML page](https://huggingface.co/mosaicml) -for more details on specific models. -""" + Refers [HuggingFace's MosaicML page](https://huggingface.co/mosaicml) for more details on specific models. 
+ """ -metadata_config: ModelSettings = { - 'trust_remote_code': True, - 'url': 'https://huggingface.co/mosaicml', - 'timeout': int(36e6), - 'requirements': ['triton'], - 'architecture': 'MPTForCausalLM', - # NOTE: See https://huggingface.co/TheBloke/mpt-30B-chat-GGML/discussions/4 - 'default_id': 'mosaicml/mpt-7b-instruct', - 'model_ids': [ - 'mosaicml/mpt-7b', - 'mosaicml/mpt-7b-instruct', - 'mosaicml/mpt-7b-chat', - 'mosaicml/mpt-7b-storywriter', - 'mosaicml/mpt-30b', - 'mosaicml/mpt-30b-instruct', - 'mosaicml/mpt-30b-chat', - ], -} + model_config = pydantic.ConfigDict(extra='forbid', protected_namespaces=()) -generation_config: openllm_core.GenerationConfig = openllm_core.GenerationConfig.model_construct( - max_new_tokens=128, temperature=0, top_p=0.8 -) + metadata_config: ModelSettings = pydantic.Field( + default={ + 'trust_remote_code': True, + 'url': 'https://huggingface.co/mosaicml', + 'timeout': int(36e6), + 'requirements': ['triton'], + 'architecture': 'MPTForCausalLM', + # NOTE: See https://huggingface.co/TheBloke/mpt-30B-chat-GGML/discussions/4 + 'default_id': 'mosaicml/mpt-7b-instruct', + 'model_ids': [ + 'mosaicml/mpt-7b', + 'mosaicml/mpt-7b-instruct', + 'mosaicml/mpt-7b-chat', + 'mosaicml/mpt-7b-storywriter', + 'mosaicml/mpt-30b', + 'mosaicml/mpt-30b-instruct', + 'mosaicml/mpt-30b-chat', + ], + }, + repr=False, + exclude=True, + ) -MPTConfig = pydantic.create_model( - 'MPTConfig', - __doc__=docs, - __base__=(openllm_core.LLMConfig,), - metadata_config=(ModelSettings, pydantic.Field(default=metadata_config, repr=False, exclude=True)), - generation_config=(openllm_core.GenerationConfig, pydantic.Field(default=generation_config)), -) + generation_config: openllm_core.GenerationConfig = pydantic.Field( + default=openllm_core.GenerationConfig.model_construct(max_new_tokens=128, temperature=0, top_p=0.8) + ) diff --git a/openllm-core/src/openllm_core/config/configuration_opt.py b/openllm-core/src/openllm_core/config/configuration_opt.py index b41cfbf8..26931b0b 100644 --- a/openllm-core/src/openllm_core/config/configuration_opt.py +++ b/openllm-core/src/openllm_core/config/configuration_opt.py @@ -4,50 +4,40 @@ import openllm_core, pydantic from openllm_core._configuration import ModelSettings -docs = """\ -OPT was first introduced in [Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) and first released in [metaseq's repository](https://github.com/facebookresearch/metaseq) on May 3rd 2022 by Meta AI. +class OPTConfig(openllm_core.LLMConfig): + """OPT was first introduced in [Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) and first released in [metaseq's repository](https://github.com/facebookresearch/metaseq) on May 3rd 2022 by Meta AI. -OPT was predominantly pretrained with English text, but a small amount of non-English data is still present -within the training corpus via CommonCrawl. The model was pretrained using a causal language modeling (CLM) -objective. OPT belongs to the same family of decoder-only models like GPT-3. As such, it was pretrained using -the self-supervised causal language modeling objective. + OPT was predominantly pretrained with English text, but a small amount of non-English data is still present + within the training corpus via CommonCrawl. The model was pretrained using a causal language modeling (CLM) + objective. OPT belongs to the same family of decoder-only models like GPT-3. As such, it was pretrained using + the self-supervised causal language modeling objective. 
-Refer to [OPT's HuggingFace page](https://huggingface.co/docs/transformers/model_doc/opt) for more information. -""" + Refer to [OPT's HuggingFace page](https://huggingface.co/docs/transformers/model_doc/opt) for more information. + """ -metadata_config: ModelSettings = { - 'trust_remote_code': False, - 'url': 'https://huggingface.co/docs/transformers/model_doc/opt', - 'default_id': 'facebook/opt-1.3b', - 'architecture': 'OPTForCausalLM', - 'model_ids': [ - 'facebook/opt-125m', - 'facebook/opt-350m', - 'facebook/opt-1.3b', - 'facebook/opt-2.7b', - 'facebook/opt-6.7b', - 'facebook/opt-66b', - ], - 'fine_tune_strategies': ( - { - 'adapter_type': 'lora', - 'r': 16, - 'lora_alpha': 32, - 'target_modules': ['q_proj', 'v_proj'], - 'lora_dropout': 0.05, - 'bias': 'none', + model_config = pydantic.ConfigDict(extra='forbid', protected_namespaces=()) + + metadata_config: ModelSettings = pydantic.Field( + default={ + 'trust_remote_code': False, + 'url': 'https://huggingface.co/docs/transformers/model_doc/opt', + 'default_id': 'facebook/opt-1.3b', + 'architecture': 'OPTForCausalLM', + 'model_ids': [ + 'facebook/opt-125m', + 'facebook/opt-350m', + 'facebook/opt-1.3b', + 'facebook/opt-2.7b', + 'facebook/opt-6.7b', + 'facebook/opt-66b', + ], }, - ), -} + repr=False, + exclude=True, + ) -generation_config: openllm_core.GenerationConfig = openllm_core.GenerationConfig.model_construct( - top_k=15, temperature=0.75, max_new_tokens=256, num_return_sequences=1 -) - -OPTConfig = pydantic.create_model( - 'OPTConfig', - __doc__=docs, - __base__=(openllm_core.LLMConfig,), - metadata_config=(ModelSettings, pydantic.Field(default=metadata_config, repr=False, exclude=True)), - generation_config=(openllm_core.GenerationConfig, pydantic.Field(default=generation_config)), -) + generation_config: openllm_core.GenerationConfig = pydantic.Field( + default=openllm_core.GenerationConfig.model_construct( + top_k=15, temperature=0.75, max_new_tokens=256, num_return_sequences=1 + ) + ) diff --git a/openllm-core/src/openllm_core/config/configuration_qwen.py b/openllm-core/src/openllm_core/config/configuration_qwen.py index 1a630996..af2c3294 100644 --- a/openllm-core/src/openllm_core/config/configuration_qwen.py +++ b/openllm-core/src/openllm_core/config/configuration_qwen.py @@ -4,40 +4,38 @@ import openllm_core, pydantic from openllm_core._configuration import ModelSettings -docs = """\ -Qwen-7B is the 7B-parameter version of the large language model series, Qwen (abbr. Tongyi Qianwen), -proposed by Alibaba Cloud. Qwen-14B is a Transformer-based large language model, -which is pretrained on a large volume of data, including web texts, books, codes, etc. -Additionally, based on the pretrained Qwen-14B, we release Qwen-14B-Chat, a large-model-based AI assistant, -which is trained with alignment techniques. -Refer to [Qwen's GitHub page](https://github.com/QwenLM/Qwen) for more information. -""" +class QwenConfig(openllm_core.LLMConfig): + """Qwen-7B is the 7B-parameter version of the large language model series, Qwen (abbr. Tongyi Qianwen), + proposed by Alibaba Cloud. Qwen-14B is a Transformer-based large language model, + which is pretrained on a large volume of data, including web texts, books, codes, etc. + Additionally, based on the pretrained Qwen-14B, we release Qwen-14B-Chat, a large-model-based AI assistant, + which is trained with alignment techniques. + Refer to [Qwen's GitHub page](https://github.com/QwenLM/Qwen) for more information. 
+ """ -metadata_config: ModelSettings = { - 'trust_remote_code': True, - 'timeout': 3600000, - 'url': 'https://github.com/QwenLM/Qwen', - 'requirements': ['cpm-kernels', 'tiktoken'], - 'architecture': 'QWenLMHeadModel', - 'default_id': 'qwen/Qwen-7B-Chat', - 'model_ids': [ - 'qwen/Qwen-7B-Chat', - 'qwen/Qwen-7B-Chat-Int8', - 'qwen/Qwen-7B-Chat-Int4', - 'qwen/Qwen-14B-Chat', - 'qwen/Qwen-14B-Chat-Int8', - 'qwen/Qwen-14B-Chat-Int4', - ], -} + model_config = pydantic.ConfigDict(extra='forbid', protected_namespaces=()) -generation_config: openllm_core.GenerationConfig = openllm_core.GenerationConfig.model_construct( - max_new_tokens=2048, top_p=0.7, temperature=0.95 -) + metadata_config: ModelSettings = pydantic.Field( + default={ + 'trust_remote_code': True, + 'timeout': 3600000, + 'url': 'https://github.com/QwenLM/Qwen', + 'requirements': ['cpm-kernels', 'tiktoken'], + 'architecture': 'QWenLMHeadModel', + 'default_id': 'qwen/Qwen-7B-Chat', + 'model_ids': [ + 'qwen/Qwen-7B-Chat', + 'qwen/Qwen-7B-Chat-Int8', + 'qwen/Qwen-7B-Chat-Int4', + 'qwen/Qwen-14B-Chat', + 'qwen/Qwen-14B-Chat-Int8', + 'qwen/Qwen-14B-Chat-Int4', + ], + }, + repr=False, + exclude=True, + ) -QwenConfig = pydantic.create_model( - 'QwenConfig', - __doc__=docs, - __base__=(openllm_core.LLMConfig,), - metadata_config=(ModelSettings, pydantic.Field(default=metadata_config, repr=False, exclude=True)), - generation_config=(openllm_core.GenerationConfig, pydantic.Field(default=generation_config)), -) + generation_config: openllm_core.GenerationConfig = pydantic.Field( + default=openllm_core.GenerationConfig.model_construct(max_new_tokens=2048, top_p=0.7, temperature=0.95) + ) diff --git a/openllm-core/src/openllm_core/config/configuration_starcoder.py b/openllm-core/src/openllm_core/config/configuration_starcoder.py index 69086237..d73921be 100644 --- a/openllm-core/src/openllm_core/config/configuration_starcoder.py +++ b/openllm-core/src/openllm_core/config/configuration_starcoder.py @@ -4,39 +4,38 @@ import openllm_core, pydantic from openllm_core._configuration import ModelSettings -docs = """\ -The StarCoder models are 15.5B parameter models trained on 80+ programming languages from [The Stack (v1.2)](https://huggingface.co/datasets/bigcode/the-stack), with opt-out requests excluded. +class StarCoderConfig(openllm_core.LLMConfig): + """The StarCoder models are 15.5B parameter models trained on 80+ programming languages from [The Stack (v1.2)](https://huggingface.co/datasets/bigcode/the-stack), with opt-out requests excluded. -The model uses [Multi Query Attention](https://arxiv.org/abs/1911.02150), -[a context window of 8192 tokens](https://arxiv.org/abs/2205.14135), and was trained using the -[Fill-in-the-Middle](https://arxiv.org/abs/2207.14255) objective on 1 trillion tokens. + The model uses [Multi Query Attention](https://arxiv.org/abs/1911.02150), + [a context window of 8192 tokens](https://arxiv.org/abs/2205.14135), and was trained using the + [Fill-in-the-Middle](https://arxiv.org/abs/2207.14255) objective on 1 trillion tokens. -Refer to [StarCoder's model card](https://huggingface.co/bigcode/starcoder) for more information. -""" + Refer to [StarCoder's model card](https://huggingface.co/bigcode/starcoder) for more information. 
+ """ -metadata_config: ModelSettings = { - 'url': 'https://github.com/bigcode-project/starcoder', - 'architecture': 'GPTBigCodeForCausalLM', - 'requirements': ['bitsandbytes'], - 'default_id': 'bigcode/starcoder', - 'model_ids': ['bigcode/starcoder', 'bigcode/starcoderbase'], -} + model_config = pydantic.ConfigDict(extra='forbid', protected_namespaces=()) -generation_config: openllm_core.GenerationConfig = openllm_core.GenerationConfig.model_construct( - temperature=0.2, - max_new_tokens=256, - min_new_tokens=32, - top_k=50, - top_p=0.95, - pad_token_id=49152, - repetition_penalty=1.2, -) + metadata_config: ModelSettings = pydantic.Field( + default={ + 'url': 'https://github.com/bigcode-project/starcoder', + 'architecture': 'GPTBigCodeForCausalLM', + 'requirements': ['bitsandbytes'], + 'default_id': 'bigcode/starcoder', + 'model_ids': ['bigcode/starcoder', 'bigcode/starcoderbase'], + }, + repr=False, + exclude=True, + ) - -StarCoderConfig = pydantic.create_model( - 'StarCoderConfig', - __doc__=docs, - __base__=(openllm_core.LLMConfig,), - metadata_config=(ModelSettings, pydantic.Field(default=metadata_config, repr=False, exclude=True)), - generation_config=(openllm_core.GenerationConfig, pydantic.Field(default=generation_config)), -) + generation_config: openllm_core.GenerationConfig = pydantic.Field( + default=openllm_core.GenerationConfig.model_construct( + temperature=0.2, + max_new_tokens=256, + min_new_tokens=32, + top_k=50, + top_p=0.95, + pad_token_id=49152, + repetition_penalty=1.2, + ) + ) diff --git a/openllm-core/src/openllm_core/config/configuration_yi.py b/openllm-core/src/openllm_core/config/configuration_yi.py index 0fd7efca..e9af59a5 100644 --- a/openllm-core/src/openllm_core/config/configuration_yi.py +++ b/openllm-core/src/openllm_core/config/configuration_yi.py @@ -4,39 +4,39 @@ import openllm_core, pydantic from openllm_core._configuration import ModelSettings -docs = """\ -The Yi series models are large language models trained from scratch by developers at 01.AI. +class YiConfig(openllm_core.LLMConfig): + """The Yi series models are large language models trained from scratch by developers at 01.AI. -The first public release contains two bilingual(English/Chinese) base models with the parameter sizes of 6B(Yi-6B) and 34B(Yi-34B). -Both of them are trained with 4K sequence length and can be extended to 32K during inference time. The Yi-6B-200K and Yi-34B-200K are base model with 200K context length. + The first public release contains two bilingual(English/Chinese) base models with the parameter sizes of 6B(Yi-6B) and 34B(Yi-34B). + Both of them are trained with 4K sequence length and can be extended to 32K during inference time. The Yi-6B-200K and Yi-34B-200K are base model with 200K context length. -See [Yi's GitHub](https://github.com/01-ai/Yi) for more information. -""" + See [Yi's GitHub](https://github.com/01-ai/Yi) for more information. 
+ """ -metadata_config: ModelSettings = { - 'url': 'https://01.ai/', - 'architecture': 'YiForCausalLM', - 'trust_remote_code': True, - 'default_id': '01-ai/Yi-6B', - 'serialisation': 'safetensors', - 'model_ids': ['01-ai/Yi-6B', '01-ai/Yi-34B', '01-ai/Yi-6B-200K', '01-ai/Yi-34B-200K'], -} + model_config = pydantic.ConfigDict(extra='forbid', protected_namespaces=()) -generation_config: openllm_core.GenerationConfig = openllm_core.GenerationConfig.model_construct( - max_new_tokens=256, - temperature=0.7, - repetition_penalty=1.3, - no_repeat_ngram_size=5, - top_p=0.9, - top_k=40, - best_of=1, - presence_penalty=0.5, -) + metadata_config: ModelSettings = pydantic.Field( + default={ + 'url': 'https://01.ai/', + 'architecture': 'YiForCausalLM', + 'trust_remote_code': True, + 'default_id': '01-ai/Yi-6B', + 'serialisation': 'safetensors', + 'model_ids': ['01-ai/Yi-6B', '01-ai/Yi-34B', '01-ai/Yi-6B-200K', '01-ai/Yi-34B-200K'], + }, + repr=False, + exclude=True, + ) -YiConfig = pydantic.create_model( - 'YiConfig', - __doc__=docs, - __base__=(openllm_core.LLMConfig,), - metadata_config=(ModelSettings, pydantic.Field(default=metadata_config, repr=False, exclude=True)), - generation_config=(openllm_core.GenerationConfig, pydantic.Field(default=generation_config)), -) + generation_config: openllm_core.GenerationConfig = pydantic.Field( + default=openllm_core.GenerationConfig.model_construct( + max_new_tokens=256, + temperature=0.7, + repetition_penalty=1.3, + no_repeat_ngram_size=5, + top_p=0.9, + top_k=40, + best_of=1, + presence_penalty=0.5, + ) + ) diff --git a/openllm-core/src/openllm_core/utils/__init__.py b/openllm-core/src/openllm_core/utils/__init__.py index 16985dce..aa8adcef 100644 --- a/openllm-core/src/openllm_core/utils/__init__.py +++ b/openllm-core/src/openllm_core/utils/__init__.py @@ -427,7 +427,6 @@ __lazy = LazyModule( 'representation': ['ReprMixin'], 'serde': ['converter'], 'import_utils': [ - 'OPTIONAL_DEPENDENCIES', 'is_vllm_available', 'is_torch_available', 'is_bitsandbytes_available', diff --git a/openllm-core/src/openllm_core/utils/__init__.pyi b/openllm-core/src/openllm_core/utils/__init__.pyi index 6cb84925..7e20a430 100644 --- a/openllm-core/src/openllm_core/utils/__init__.pyi +++ b/openllm-core/src/openllm_core/utils/__init__.pyi @@ -6,7 +6,6 @@ from openllm_core._typing_compat import overload, AnyCallable, ParamSpec, Concat from bentoml._internal.types import PathType from openllm_core.utils.import_utils import ( ENV_VARS_TRUE_VALUES as ENV_VARS_TRUE_VALUES, - OPTIONAL_DEPENDENCIES as OPTIONAL_DEPENDENCIES, is_autoawq_available as is_autoawq_available, is_autogptq_available as is_autogptq_available, is_bentoml_available as is_bentoml_available, diff --git a/openllm-core/src/openllm_core/utils/import_utils.py b/openllm-core/src/openllm_core/utils/import_utils.py index 183b81a6..77b59856 100644 --- a/openllm-core/src/openllm_core/utils/import_utils.py +++ b/openllm-core/src/openllm_core/utils/import_utils.py @@ -4,7 +4,6 @@ import importlib, importlib.metadata, importlib.util, os, inspect, typing as t from .codegen import _make_method from ._constants import ENV_VARS_TRUE_VALUES as ENV_VARS_TRUE_VALUES -OPTIONAL_DEPENDENCIES = {'vllm', 'fine-tune', 'ggml', 'agents', 'openai', 'playground', 'gptq', 'grpc', 'awq'} ENV_VARS_TRUE_AND_AUTO_VALUES = ENV_VARS_TRUE_VALUES.union({'AUTO'}) USE_VLLM = os.getenv('USE_VLLM', 'AUTO').upper() diff --git a/openllm-core/src/openllm_core/utils/import_utils.pyi b/openllm-core/src/openllm_core/utils/import_utils.pyi index f4fcc80c..90902361 
100644 --- a/openllm-core/src/openllm_core/utils/import_utils.pyi +++ b/openllm-core/src/openllm_core/utils/import_utils.pyi @@ -16,4 +16,3 @@ def is_jupytext_available() -> bool: ... def is_notebook_available() -> bool: ... def is_autogptq_available() -> bool: ... ENV_VARS_TRUE_VALUES: t.Set[str] = ... -OPTIONAL_DEPENDENCIES: t.Set[str] = ... diff --git a/openllm-python/README.md b/openllm-python/README.md index 47b81e8a..f93ca3a5 100644 --- a/openllm-python/README.md +++ b/openllm-python/README.md @@ -101,24 +101,16 @@ OpenLLM currently supports the following models. By default, OpenLLM doesn't inc ### Quickstart - - -> **Note:** Baichuan requires to install with: -> ```bash -> pip install "openllm[baichuan]" -> ``` - - Run the following command to quickly spin up a Baichuan server: ```bash -TRUST_REMOTE_CODE=True openllm start baichuan-inc/baichuan-7b +openllm start baichuan-inc/baichuan-7b --trust-remote-code ``` -In a different terminal, run the following command to interact with the server: - -```bash -export OPENLLM_ENDPOINT=http://localhost:3000 -openllm query 'What are large language models?' +You can run the following code in a different terminal to interact with the server: +```python +import openllm_client +client = openllm_client.HTTPClient('http://localhost:3000') +client.generate('What are large language models?') ``` @@ -145,24 +137,16 @@ You can specify any of the following Baichuan models via `openllm start`: ### Quickstart - - -> **Note:** ChatGLM requires to install with: -> ```bash -> pip install "openllm[chatglm]" -> ``` - - Run the following command to quickly spin up a ChatGLM server: ```bash -TRUST_REMOTE_CODE=True openllm start thudm/chatglm-6b +openllm start thudm/chatglm-6b --trust-remote-code ``` -In a different terminal, run the following command to interact with the server: - -```bash -export OPENLLM_ENDPOINT=http://localhost:3000 -openllm query 'What are large language models?' +You can run the following code in a different terminal to interact with the server: +```python +import openllm_client +client = openllm_client.HTTPClient('http://localhost:3000') +client.generate('What are large language models?') ``` @@ -186,29 +170,55 @@ You can specify any of the following ChatGLM models via `openllm start`:
+Cohere
+
+
+### Quickstart
+
+Run the following command to quickly spin up a Cohere server:
+
+```bash
+openllm start CohereForAI/c4ai-command-r-plus --trust-remote-code
+```
+You can run the following code in a different terminal to interact with the server:
+```python
+import openllm_client
+client = openllm_client.HTTPClient('http://localhost:3000')
+client.generate('What are large language models?')
+```
+
+
+> **Note:** Any Cohere variants can be deployed with OpenLLM. Visit the [HuggingFace Model Hub](https://huggingface.co/models?sort=trending&search=commandr) to see more Cohere-compatible models.
+
+
+
+### Supported models
+
+You can specify any of the following Cohere models via `openllm start`:
+
+
+- [CohereForAI/c4ai-command-r-plus](https://huggingface.co/CohereForAI/c4ai-command-r-plus)
+- [CohereForAI/c4ai-command-r-v01](https://huggingface.co/CohereForAI/c4ai-command-r-v01)
+
+
+
+
+ Dbrx ### Quickstart - - -> **Note:** Dbrx requires to install with: -> ```bash -> pip install "openllm[dbrx]" -> ``` - - Run the following command to quickly spin up a Dbrx server: ```bash -TRUST_REMOTE_CODE=True openllm start databricks/dbrx-instruct +openllm start databricks/dbrx-instruct --trust-remote-code ``` -In a different terminal, run the following command to interact with the server: - -```bash -export OPENLLM_ENDPOINT=http://localhost:3000 -openllm query 'What are large language models?' +You can run the following code in a different terminal to interact with the server: +```python +import openllm_client +client = openllm_client.HTTPClient('http://localhost:3000') +client.generate('What are large language models?') ``` @@ -236,13 +246,13 @@ You can specify any of the following Dbrx models via `openllm start`: Run the following command to quickly spin up a DollyV2 server: ```bash -TRUST_REMOTE_CODE=True openllm start databricks/dolly-v2-3b +openllm start databricks/dolly-v2-3b --trust-remote-code ``` -In a different terminal, run the following command to interact with the server: - -```bash -export OPENLLM_ENDPOINT=http://localhost:3000 -openllm query 'What are large language models?' +You can run the following code in a different terminal to interact with the server: +```python +import openllm_client +client = openllm_client.HTTPClient('http://localhost:3000') +client.generate('What are large language models?') ``` @@ -268,24 +278,16 @@ You can specify any of the following DollyV2 models via `openllm start`: ### Quickstart - - -> **Note:** Falcon requires to install with: -> ```bash -> pip install "openllm[falcon]" -> ``` - - Run the following command to quickly spin up a Falcon server: ```bash -TRUST_REMOTE_CODE=True openllm start tiiuae/falcon-7b +openllm start tiiuae/falcon-7b --trust-remote-code ``` -In a different terminal, run the following command to interact with the server: - -```bash -export OPENLLM_ENDPOINT=http://localhost:3000 -openllm query 'What are large language models?' +You can run the following code in a different terminal to interact with the server: +```python +import openllm_client +client = openllm_client.HTTPClient('http://localhost:3000') +client.generate('What are large language models?') ``` @@ -312,24 +314,16 @@ You can specify any of the following Falcon models via `openllm start`: ### Quickstart - - -> **Note:** Gemma requires to install with: -> ```bash -> pip install "openllm[gemma]" -> ``` - - Run the following command to quickly spin up a Gemma server: ```bash -TRUST_REMOTE_CODE=True openllm start google/gemma-7b +openllm start google/gemma-7b --trust-remote-code ``` -In a different terminal, run the following command to interact with the server: - -```bash -export OPENLLM_ENDPOINT=http://localhost:3000 -openllm query 'What are large language models?' 
+You can run the following code in a different terminal to interact with the server: +```python +import openllm_client +client = openllm_client.HTTPClient('http://localhost:3000') +client.generate('What are large language models?') ``` @@ -359,13 +353,13 @@ You can specify any of the following Gemma models via `openllm start`: Run the following command to quickly spin up a GPTNeoX server: ```bash -TRUST_REMOTE_CODE=True openllm start eleutherai/gpt-neox-20b +openllm start eleutherai/gpt-neox-20b --trust-remote-code ``` -In a different terminal, run the following command to interact with the server: - -```bash -export OPENLLM_ENDPOINT=http://localhost:3000 -openllm query 'What are large language models?' +You can run the following code in a different terminal to interact with the server: +```python +import openllm_client +client = openllm_client.HTTPClient('http://localhost:3000') +client.generate('What are large language models?') ``` @@ -389,24 +383,16 @@ You can specify any of the following GPTNeoX models via `openllm start`: ### Quickstart - - -> **Note:** Llama requires to install with: -> ```bash -> pip install "openllm[llama]" -> ``` - - Run the following command to quickly spin up a Llama server: ```bash -TRUST_REMOTE_CODE=True openllm start NousResearch/llama-2-7b-hf +openllm start NousResearch/llama-2-7b-hf --trust-remote-code ``` -In a different terminal, run the following command to interact with the server: - -```bash -export OPENLLM_ENDPOINT=http://localhost:3000 -openllm query 'What are large language models?' +You can run the following code in a different terminal to interact with the server: +```python +import openllm_client +client = openllm_client.HTTPClient('http://localhost:3000') +client.generate('What are large language models?') ``` @@ -441,24 +427,16 @@ You can specify any of the following Llama models via `openllm start`: ### Quickstart - - -> **Note:** Mistral requires to install with: -> ```bash -> pip install "openllm[mistral]" -> ``` - - Run the following command to quickly spin up a Mistral server: ```bash -TRUST_REMOTE_CODE=True openllm start mistralai/Mistral-7B-Instruct-v0.1 +openllm start mistralai/Mistral-7B-Instruct-v0.1 --trust-remote-code ``` -In a different terminal, run the following command to interact with the server: - -```bash -export OPENLLM_ENDPOINT=http://localhost:3000 -openllm query 'What are large language models?' +You can run the following code in a different terminal to interact with the server: +```python +import openllm_client +client = openllm_client.HTTPClient('http://localhost:3000') +client.generate('What are large language models?') ``` @@ -486,24 +464,16 @@ You can specify any of the following Mistral models via `openllm start`: ### Quickstart - - -> **Note:** Mixtral requires to install with: -> ```bash -> pip install "openllm[mixtral]" -> ``` - - Run the following command to quickly spin up a Mixtral server: ```bash -TRUST_REMOTE_CODE=True openllm start mistralai/Mixtral-8x7B-Instruct-v0.1 +openllm start mistralai/Mixtral-8x7B-Instruct-v0.1 --trust-remote-code ``` -In a different terminal, run the following command to interact with the server: - -```bash -export OPENLLM_ENDPOINT=http://localhost:3000 -openllm query 'What are large language models?' 
+You can run the following code in a different terminal to interact with the server: +```python +import openllm_client +client = openllm_client.HTTPClient('http://localhost:3000') +client.generate('What are large language models?') ``` @@ -528,24 +498,16 @@ You can specify any of the following Mixtral models via `openllm start`: ### Quickstart - - -> **Note:** MPT requires to install with: -> ```bash -> pip install "openllm[mpt]" -> ``` - - Run the following command to quickly spin up a MPT server: ```bash -TRUST_REMOTE_CODE=True openllm start mosaicml/mpt-7b-instruct +openllm start mosaicml/mpt-7b-instruct --trust-remote-code ``` -In a different terminal, run the following command to interact with the server: - -```bash -export OPENLLM_ENDPOINT=http://localhost:3000 -openllm query 'What are large language models?' +You can run the following code in a different terminal to interact with the server: +```python +import openllm_client +client = openllm_client.HTTPClient('http://localhost:3000') +client.generate('What are large language models?') ``` @@ -575,24 +537,16 @@ You can specify any of the following MPT models via `openllm start`: ### Quickstart - - -> **Note:** OPT requires to install with: -> ```bash -> pip install "openllm[opt]" -> ``` - - Run the following command to quickly spin up a OPT server: ```bash openllm start facebook/opt-1.3b ``` -In a different terminal, run the following command to interact with the server: - -```bash -export OPENLLM_ENDPOINT=http://localhost:3000 -openllm query 'What are large language models?' +You can run the following code in a different terminal to interact with the server: +```python +import openllm_client +client = openllm_client.HTTPClient('http://localhost:3000') +client.generate('What are large language models?') ``` @@ -621,24 +575,16 @@ You can specify any of the following OPT models via `openllm start`: ### Quickstart - - -> **Note:** Phi requires to install with: -> ```bash -> pip install "openllm[phi]" -> ``` - - Run the following command to quickly spin up a Phi server: ```bash -TRUST_REMOTE_CODE=True openllm start microsoft/Phi-3-mini-4k-instruct +openllm start microsoft/Phi-3-mini-4k-instruct --trust-remote-code ``` -In a different terminal, run the following command to interact with the server: - -```bash -export OPENLLM_ENDPOINT=http://localhost:3000 -openllm query 'What are large language models?' +You can run the following code in a different terminal to interact with the server: +```python +import openllm_client +client = openllm_client.HTTPClient('http://localhost:3000') +client.generate('What are large language models?') ``` @@ -667,24 +613,16 @@ You can specify any of the following Phi models via `openllm start`: ### Quickstart - - -> **Note:** Qwen requires to install with: -> ```bash -> pip install "openllm[qwen]" -> ``` - - Run the following command to quickly spin up a Qwen server: ```bash -TRUST_REMOTE_CODE=True openllm start qwen/Qwen-7B-Chat +openllm start qwen/Qwen-7B-Chat --trust-remote-code ``` -In a different terminal, run the following command to interact with the server: - -```bash -export OPENLLM_ENDPOINT=http://localhost:3000 -openllm query 'What are large language models?' 
+You can run the following code in a different terminal to interact with the server: +```python +import openllm_client +client = openllm_client.HTTPClient('http://localhost:3000') +client.generate('What are large language models?') ``` @@ -713,24 +651,16 @@ You can specify any of the following Qwen models via `openllm start`: ### Quickstart - - -> **Note:** StableLM requires to install with: -> ```bash -> pip install "openllm[stablelm]" -> ``` - - Run the following command to quickly spin up a StableLM server: ```bash -TRUST_REMOTE_CODE=True openllm start stabilityai/stablelm-tuned-alpha-3b +openllm start stabilityai/stablelm-tuned-alpha-3b --trust-remote-code ``` -In a different terminal, run the following command to interact with the server: - -```bash -export OPENLLM_ENDPOINT=http://localhost:3000 -openllm query 'What are large language models?' +You can run the following code in a different terminal to interact with the server: +```python +import openllm_client +client = openllm_client.HTTPClient('http://localhost:3000') +client.generate('What are large language models?') ``` @@ -757,24 +687,16 @@ You can specify any of the following StableLM models via `openllm start`: ### Quickstart - - -> **Note:** StarCoder requires to install with: -> ```bash -> pip install "openllm[starcoder]" -> ``` - - Run the following command to quickly spin up a StarCoder server: ```bash -TRUST_REMOTE_CODE=True openllm start bigcode/starcoder +openllm start bigcode/starcoder --trust-remote-code ``` -In a different terminal, run the following command to interact with the server: - -```bash -export OPENLLM_ENDPOINT=http://localhost:3000 -openllm query 'What are large language models?' +You can run the following code in a different terminal to interact with the server: +```python +import openllm_client +client = openllm_client.HTTPClient('http://localhost:3000') +client.generate('What are large language models?') ``` @@ -799,24 +721,16 @@ You can specify any of the following StarCoder models via `openllm start`: ### Quickstart - - -> **Note:** Yi requires to install with: -> ```bash -> pip install "openllm[yi]" -> ``` - - Run the following command to quickly spin up a Yi server: ```bash -TRUST_REMOTE_CODE=True openllm start 01-ai/Yi-6B +openllm start 01-ai/Yi-6B --trust-remote-code ``` -In a different terminal, run the following command to interact with the server: - -```bash -export OPENLLM_ENDPOINT=http://localhost:3000 -openllm query 'What are large language models?' +You can run the following code in a different terminal to interact with the server: +```python +import openllm_client +client = openllm_client.HTTPClient('http://localhost:3000') +client.generate('What are large language models?') ``` diff --git a/openllm-python/pyproject.toml b/openllm-python/pyproject.toml index b548bd6a..483631b0 100644 --- a/openllm-python/pyproject.toml +++ b/openllm-python/pyproject.toml @@ -39,21 +39,16 @@ classifiers = [ ] dependencies = [ "bentoml[io]>=1.2.16", - "transformers[torch,tokenizers]>=4.36.0", "openllm-client>=0.5.4", "openllm-core>=0.5.4", "safetensors", "vllm>=0.4.2", - "optimum>=1.12.0", - "accelerate", "ghapi", "einops", "sentencepiece", "scipy", - "build[virtualenv]<1", "click>=8.1.3", "cuda-python;platform_system!=\"Darwin\"", - "bitsandbytes<0.42", ] description = "OpenLLM: Run any open-source LLMs, such as Llama 2, Mistral, as OpenAI compatible API endpoint in the cloud." 
dynamic = ["version", "readme"] @@ -94,38 +89,6 @@ Homepage = "https://bentoml.com" Tracker = "https://github.com/bentoml/OpenLLM/issues" Twitter = "https://twitter.com/bentomlai" -[project.optional-dependencies] -agents = ["transformers[agents]>=4.36.0", "diffusers", "soundfile"] -all = ["openllm[full]"] -awq = ["autoawq"] -baichuan = ["cpm-kernels"] -chatglm = ["cpm-kernels"] -dbrx = ["cpm-kernels"] -dolly-v2 = ["cpm-kernels"] -falcon = ["xformers"] -fine-tune = ["peft>=0.6.0", "datasets", "trl", "huggingface-hub"] -full = [ - "openllm[agents,awq,baichuan,chatglm,dbrx,dolly-v2,falcon,fine-tune,gemma,ggml,gpt-neox,gptq,grpc,llama,mistral,mixtral,mpt,openai,opt,phi,playground,qwen,stablelm,starcoder,vllm,yi]", -] -gemma = ["xformers"] -ggml = ["ctransformers"] -gpt-neox = ["xformers"] -gptq = ["auto-gptq[triton]>=0.4.2"] -grpc = ["bentoml[grpc]>=1.2.16"] -llama = ["xformers"] -mistral = ["xformers"] -mixtral = ["xformers"] -mpt = ["triton"] -openai = ["openai[datalib]>=1", "tiktoken", "fastapi"] -opt = ["triton"] -phi = ["triton"] -playground = ["jupyter", "notebook", "ipython", "jupytext", "nbformat"] -qwen = ["cpm-kernels", "tiktoken"] -stablelm = ["cpm-kernels", "tiktoken"] -starcoder = ["bitsandbytes"] -vllm = ["vllm==0.4.2"] -yi = ["bitsandbytes"] - [tool.hatch.version] fallback-version = "0.0.0" source = "vcs" diff --git a/openllm-python/src/_openllm_tiny/_entrypoint.py b/openllm-python/src/_openllm_tiny/_entrypoint.py index 65a65e78..b730ea0c 100644 --- a/openllm-python/src/_openllm_tiny/_entrypoint.py +++ b/openllm-python/src/_openllm_tiny/_entrypoint.py @@ -105,7 +105,7 @@ def optimization_decorator(fn: t.Callable[..., t.Any]): '--quantise', '--quantize', 'quantise', - type=str, + type=click.Choice(get_literal_args(LiteralQuantise)), default=None, envvar='QUANTIZE', show_envvar=True, diff --git a/openllm-python/src/_openllm_tiny/_llm.py b/openllm-python/src/_openllm_tiny/_llm.py index f314fde6..938c167f 100644 --- a/openllm-python/src/_openllm_tiny/_llm.py +++ b/openllm-python/src/_openllm_tiny/_llm.py @@ -1,6 +1,6 @@ from __future__ import annotations -import inspect, orjson, dataclasses, bentoml, functools, attr, openllm_core, traceback, openllm, typing as t +import inspect, orjson, logging, dataclasses, bentoml, functools, attr, os, openllm_core, traceback, openllm, typing as t from openllm_core.utils import ( get_debug_mode, @@ -10,11 +10,13 @@ from openllm_core.utils import ( dict_filter_none, Counter, ) -from openllm_core._typing_compat import LiteralQuantise, LiteralSerialisation, LiteralDtype +from openllm_core._typing_compat import LiteralQuantise, LiteralSerialisation, LiteralDtype, get_literal_args from openllm_core._schemas import GenerationOutput Dtype = t.Union[LiteralDtype, t.Literal['auto', 'half', 'float']] +logger = logging.getLogger(__name__) + if t.TYPE_CHECKING: from vllm import AsyncEngineArgs, EngineArgs, RequestOutput @@ -30,11 +32,24 @@ def check_engine_args(_, attr: attr.Attribute[dict[str, t.Any]], v: dict[str, t. 
def check_quantization(_, attr: attr.Attribute[LiteralQuantise], v: str | None) -> LiteralQuantise | None: - if v is not None and v not in {'gptq', 'awq', 'squeezellm'}: + if v is not None and v not in get_literal_args(LiteralQuantise): raise ValueError(f'Invalid quantization method: {v}') return v +def update_engine_args(v: t.Dict[str, t.Any]) -> t.Dict[str, t.Any]: + env_json_string = os.environ.get('ENGINE_CONFIG', None) + + config_from_env = {} + if env_json_string is not None: + try: + config_from_env = orjson.loads(env_json_string) + except orjson.JSONDecodeError as e: + raise RuntimeError("Failed to parse 'ENGINE_CONFIG' as valid JSON string.") from e + config_from_env.update(v) + return config_from_env + + @attr.define(init=False) class LLM: model_id: str @@ -44,7 +59,7 @@ class LLM: dtype: Dtype quantise: t.Optional[LiteralQuantise] = attr.field(default=None, validator=check_quantization) trust_remote_code: bool = attr.field(default=False) - engine_args: t.Dict[str, t.Any] = attr.field(factory=dict, validator=check_engine_args) + engine_args: t.Dict[str, t.Any] = attr.field(factory=dict, validator=check_engine_args, converter=update_engine_args) _mode: t.Literal['batch', 'async'] = attr.field(default='async', repr=False) _path: str = attr.field( @@ -117,18 +132,27 @@ class LLM: num_gpus, dev = 1, openllm.utils.device_count() if dev >= 2: num_gpus = min(dev // 2 * 2, dev) - dtype = 'float16' if self.quantise == 'gptq' else self.dtype # NOTE: quantise GPTQ doesn't support bfloat16 yet. - self.engine_args.update({ - 'worker_use_ray': False, - 'tokenizer_mode': 'auto', + overriden_dict = { 'tensor_parallel_size': num_gpus, 'model': self._path, 'tokenizer': self._path, 'trust_remote_code': self.trust_remote_code, - 'dtype': dtype, + 'dtype': self.dtype, 'quantization': self.quantise, - }) + } + if any(k in self.engine_args for k in overriden_dict.keys()): + logger.warning( + 'The following key will be overriden by openllm: %s (got %s set)', + list(overriden_dict), + [k for k in overriden_dict if k in self.engine_args], + ) + + self.engine_args.update(overriden_dict) + if 'worker_use_ray' not in self.engine_args: + self.engine_args['worker_use_ray'] = False + if 'tokenizer_mode' not in self.engine_args: + self.engine_args['tokenizer_mode'] = 'auto' if 'disable_log_stats' not in self.engine_args: self.engine_args['disable_log_stats'] = not get_debug_mode() if 'gpu_memory_utilization' not in self.engine_args: diff --git a/openllm-python/src/openllm/__init__.pyi b/openllm-python/src/openllm/__init__.pyi index 2138637c..f8e09b7d 100644 --- a/openllm-python/src/openllm/__init__.pyi +++ b/openllm-python/src/openllm/__init__.pyi @@ -13,7 +13,7 @@ Fine-tune, serve, deploy, and monitor any LLMs with ease. 
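The `update_engine_args` converter added to `_llm.py` above merges vLLM engine arguments from an `ENGINE_CONFIG` JSON environment variable with whatever is passed to the `LLM` constructor, and the constructor values win on conflicting keys. A minimal, self-contained sketch of that merge behaviour (it mirrors the hunk above rather than importing OpenLLM; the in-process `os.environ` assignment is purely illustrative):

```python
import os
import orjson

# Illustrative only: stands in for ENGINE_CONFIG being set in the serving environment.
os.environ['ENGINE_CONFIG'] = orjson.dumps({'gpu_memory_utilization': 0.9, 'max_model_len': 4096}).decode()

def update_engine_args(v: dict) -> dict:
    # Same precedence as the converter in the hunk above: engine_args passed
    # explicitly to LLM(...) override keys coming from ENGINE_CONFIG.
    raw = os.environ.get('ENGINE_CONFIG')
    merged = orjson.loads(raw) if raw is not None else {}
    merged.update(v)
    return merged

print(update_engine_args({'max_model_len': 8192}))
# -> {'gpu_memory_utilization': 0.9, 'max_model_len': 8192}
```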
# fmt: off # update-config-stubs.py: import stubs start from openllm_client import AsyncHTTPClient as AsyncHTTPClient, HTTPClient as HTTPClient -from openlm_core.config import CONFIG_MAPPING as CONFIG_MAPPING, CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES, AutoConfig as AutoConfig, BaichuanConfig as BaichuanConfig, ChatGLMConfig as ChatGLMConfig, DbrxConfig as DbrxConfig, DollyV2Config as DollyV2Config, FalconConfig as FalconConfig, GemmaConfig as GemmaConfig, GPTNeoXConfig as GPTNeoXConfig, LlamaConfig as LlamaConfig, MistralConfig as MistralConfig, MixtralConfig as MixtralConfig, MPTConfig as MPTConfig, OPTConfig as OPTConfig, PhiConfig as PhiConfig, QwenConfig as QwenConfig, StableLMConfig as StableLMConfig, StarCoderConfig as StarCoderConfig, YiConfig as YiConfig +from openlm_core.config import CONFIG_MAPPING as CONFIG_MAPPING, CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES, AutoConfig as AutoConfig, BaichuanConfig as BaichuanConfig, ChatGLMConfig as ChatGLMConfig, CohereConfig as CohereConfig, DbrxConfig as DbrxConfig, DollyV2Config as DollyV2Config, FalconConfig as FalconConfig, GemmaConfig as GemmaConfig, GPTNeoXConfig as GPTNeoXConfig, LlamaConfig as LlamaConfig, MistralConfig as MistralConfig, MixtralConfig as MixtralConfig, MPTConfig as MPTConfig, OPTConfig as OPTConfig, PhiConfig as PhiConfig, QwenConfig as QwenConfig, StableLMConfig as StableLMConfig, StarCoderConfig as StarCoderConfig, YiConfig as YiConfig from openllm_core._configuration import GenerationConfig as GenerationConfig, LLMConfig as LLMConfig from openllm_core._schemas import GenerationInput as GenerationInput, GenerationOutput as GenerationOutput, MetadataOutput as MetadataOutput, MessageParam as MessageParam from openllm_core.utils import api as api diff --git a/openllm-python/src/openllm/utils.pyi b/openllm-python/src/openllm/utils.pyi index 21307fdc..0690cbde 100644 --- a/openllm-python/src/openllm/utils.pyi +++ b/openllm-python/src/openllm/utils.pyi @@ -6,7 +6,6 @@ from openllm_core.utils import ( DEV_DEBUG_VAR as DEV_DEBUG_VAR, ENV_VARS_TRUE_VALUES as ENV_VARS_TRUE_VALUES, MYPY as MYPY, - OPTIONAL_DEPENDENCIES as OPTIONAL_DEPENDENCIES, QUIET_ENV_VAR as QUIET_ENV_VAR, SHOW_CODEGEN as SHOW_CODEGEN, LazyLoader as LazyLoader, diff --git a/tools/dependencies.py b/tools/dependencies.py index 9ceb9e43..0fcec2b8 100755 --- a/tools/dependencies.py +++ b/tools/dependencies.py @@ -13,10 +13,9 @@ if t.TYPE_CHECKING: from tomlkit.items import Array, Table ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) -sys.path.insert(0, os.path.join(ROOT, 'openllm-python', 'src')) -sys.path.insert(1, os.path.join(ROOT, 'openllm-core', 'src')) +sys.path.insert(0, os.path.join(ROOT, 'openllm-core', 'src')) -import openllm +import openllm_core as core _OWNER, _REPO = 'bentoml', 'openllm' @@ -143,35 +142,7 @@ class Dependencies: _LOWER_BENTOML_CONSTRAINT = '1.2.16' _BENTOML_EXT = ['io'] -_TRANSFORMERS_EXT = ['torch', 'tokenizers'] -_TRANSFORMERS_CONSTRAINTS = '4.36.0' -FINE_TUNE_DEPS = ['peft>=0.6.0', 'datasets', 'trl', 'huggingface-hub'] -GRPC_DEPS = [f'bentoml[grpc]>={_LOWER_BENTOML_CONSTRAINT}'] -OPENAI_DEPS = ['openai[datalib]>=1', 'tiktoken', 'fastapi'] -AGENTS_DEPS = [f'transformers[agents]>={_TRANSFORMERS_CONSTRAINTS}', 'diffusers', 'soundfile'] -PLAYGROUND_DEPS = ['jupyter', 'notebook', 'ipython', 'jupytext', 'nbformat'] -GGML_DEPS = ['ctransformers'] -AWQ_DEPS = ['autoawq'] -GPTQ_DEPS = ['auto-gptq[triton]>=0.4.2'] -VLLM_DEPS = ['vllm==0.4.2'] - -_base_requirements: dict[str, t.Any] = { - 
inflection.dasherize(name): config_cls()['requirements'] - for name, config_cls in openllm.CONFIG_MAPPING.items() - if 'requirements' in config_cls() -} - -# shallow copy from locals() -_locals = locals().copy() - -# NOTE: update this table when adding new external dependencies -# sync with openllm.utils.OPTIONAL_DEPENDENCIES -_base_requirements.update({ - v: _locals.get(f'{inflection.underscore(v).upper()}_DEPS') for v in openllm.utils.OPTIONAL_DEPENDENCIES -}) - -_base_requirements = {k: v for k, v in sorted(_base_requirements.items())} fname = f'{os.path.basename(os.path.dirname(__file__))}/{os.path.basename(__file__)}' @@ -201,21 +172,6 @@ def create_classifiers() -> Array: return arr.multiline(True) -def create_optional_table() -> Table: - all_array = tomlkit.array() - all_array.append(f"openllm[{','.join([k for k, v in _base_requirements.items() if v])}]") - - table = tomlkit.table(is_super_table=True) - _base_requirements.update({ - 'full': correct_style(all_array.multiline(True)), - 'all': tomlkit.array('["openllm[full]"]'), - }) - table.update({k: v for k, v in sorted(_base_requirements.items()) if v}) - table.add(tomlkit.nl()) - - return table - - def create_url_table(_info: t.Any) -> Table: table = tomlkit.table() _urls = { @@ -282,25 +238,23 @@ def main(args) -> int: if args.release_version is not None: release_version = args.release_version else: - release_version = openllm.bundle.RefResolver.from_strategy('release').version + try: + release_version = api.repos.get_latest_release()['name'].lstrip('v') + except Exception as err: + raise err _BASE_DEPENDENCIES = [ Dependencies(name='bentoml', extensions=_BENTOML_EXT, lower_constraint=_LOWER_BENTOML_CONSTRAINT), - Dependencies(name='transformers', extensions=_TRANSFORMERS_EXT, lower_constraint=_TRANSFORMERS_CONSTRAINTS), Dependencies(name='openllm-client', lower_constraint=release_version), Dependencies(name='openllm-core', lower_constraint=release_version), Dependencies(name='safetensors'), Dependencies(name='vllm', lower_constraint='0.4.2'), - Dependencies(name='optimum', lower_constraint='1.12.0'), - Dependencies(name='accelerate'), Dependencies(name='ghapi'), Dependencies(name='einops'), Dependencies(name='sentencepiece'), Dependencies(name='scipy'), - Dependencies(name='build', upper_constraint='1', extensions=['virtualenv']), Dependencies(name='click', lower_constraint='8.1.3'), Dependencies(name='cuda-python', platform=('Darwin', 'ne')), - Dependencies(name='bitsandbytes', upper_constraint='0.42'), # 0.41 works with CUDA 11.8 ] dependencies_array = correct_style(tomlkit.array()) @@ -321,7 +275,6 @@ def main(args) -> int: pyproject['project']['urls'] = create_url_table(_info) pyproject['project']['scripts'] = build_cli_extensions() - pyproject['project']['optional-dependencies'] = create_optional_table() with open(os.path.join(ROOT, 'openllm-python', 'pyproject.toml'), 'w') as f: f.write(tomlkit.dumps(pyproject)) diff --git a/tools/update-config-stubs.py b/tools/update-config-stubs.py index bce06fc8..672abd68 100755 --- a/tools/update-config-stubs.py +++ b/tools/update-config-stubs.py @@ -159,7 +159,6 @@ def main() -> int: 'def is_vllm_available() -> bool: ...\n', *[f'def {k}() -> bool: ...\n' for k in iutils.caller], 'ENV_VARS_TRUE_VALUES: t.Set[str] = ...\n', - 'OPTIONAL_DEPENDENCIES: t.Set[str] = ...\n', ] with _TARGET_IMPORT_UTILS_FILE.open('w') as f: f.writelines(lines) diff --git a/tools/update-readme.py b/tools/update-readme.py index a8369c57..01625fb3 100755 --- a/tools/update-readme.py +++ b/tools/update-readme.py 
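In the `tools/dependencies.py` hunk above, the lower-bound version for `openllm-client` and `openllm-core` is now taken from the latest GitHub release rather than from `openllm.bundle.RefResolver`. A small sketch of that lookup, assuming `api` is a `ghapi` client built from the `_OWNER`/`_REPO` pair defined earlier in the file (the client construction itself is not shown in this hunk):

```python
from ghapi.all import GhApi

# Assumed construction; dependencies.py defines _OWNER, _REPO = 'bentoml', 'openllm'.
api = GhApi(owner='bentoml', repo='openllm')

# Mirrors the call in the hunk above: take the latest release name and strip the leading 'v'.
release_version = api.repos.get_latest_release()['name'].lstrip('v')
print(release_version)
```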
@@ -19,8 +19,6 @@ def markdown_importantblock(text: str): def main() -> int: - with open(os.path.join(ROOT, 'openllm-python', 'pyproject.toml'), 'r') as f: - deps = tomlkit.parse(f.read()).value['project']['optional-dependencies'] with open(os.path.join(ROOT, 'README.md'), 'r') as f: readme = f.readlines() @@ -33,20 +31,18 @@ def main() -> int: architecture_name = it.__class__.__name__[:-6] details_block = ['
\n', f'{architecture_name}\n\n', '### Quickstart\n'] nitem = CONFIG_TO_ALIAS_NAMES[it.__class__.__name__] - if nitem in deps: - instruction = f'> ```bash\n> pip install "openllm[{nitem}]"\n> ```' - details_block.extend(markdown_noteblock(f'{architecture_name} requires to install with:\n{instruction}\n')) details_block.extend([ f'Run the following command to quickly spin up a {architecture_name} server:\n', f"""\ ```bash -{'' if not it['trust_remote_code'] else 'TRUST_REMOTE_CODE=True '}openllm start {it['default_id']} +openllm start {it['default_id']}{'' if not it['trust_remote_code'] else ' --trust-remote-code'} ```""", - 'In a different terminal, run the following command to interact with the server:\n', + 'You can run the following code in a different terminal to interact with the server:', """\ -```bash -export OPENLLM_ENDPOINT=http://localhost:3000 -openllm query 'What are large language models?' +```python +import openllm_client +client = openllm_client.HTTPClient('http://localhost:3000') +client.generate('What are large language models?') ```""", *markdown_noteblock( f'Any {architecture_name} variants can be deployed with OpenLLM. Visit the [HuggingFace Model Hub](https://huggingface.co/models?sort=trending&search={nitem}) to see more {architecture_name}-compatible models.\n'