From 3ffb25a8727d51a93e5dbcc162ab5f15db37696e Mon Sep 17 00:00:00 2001 From: Aaron Pham <29749331+aarnphm@users.noreply.github.com> Date: Tue, 22 Aug 2023 08:55:46 -0400 Subject: [PATCH] refactor: packages (#249) --- .gitattributes | 2 +- .github/workflows/binary-releases.yml | 4 +- .github/workflows/build-embedding.yml | 2 +- .github/workflows/ci.yml | 2 +- .github/workflows/compile-pypi.yml | 47 ++- .gitignore | 2 +- DEVELOPMENT.md | 2 +- changelog.d/249.refactor.md | 5 + compile.sh | 3 + cz.py | 10 +- hatch.toml | 13 +- local.sh | 10 + openllm-client/.git_archival.txt | 1 + openllm-client/.python-version-default | 1 + openllm-client/LICENSE.md | 194 +++++++++++ openllm-client/README.md | 86 +++++ openllm-client/pyproject.toml | 172 ++++++++++ openllm-client/src/openllm_client/__init__.py | 5 + .../src/openllm_client/_adapters.py | 0 openllm-client/src/openllm_client/_base.py | 203 ++++++++++++ .../src/openllm_client/benmin/__init__.py | 87 +++++ .../src/openllm_client/benmin/_grpc.py | 187 +++++++++++ .../src/openllm_client/benmin/_http.py | 137 ++++++++ openllm-client/src/openllm_client/client.py | 33 ++ openllm-core/.git_archival.txt | 1 + openllm-core/.python-version-default | 1 + openllm-core/LICENSE.md | 194 +++++++++++ openllm-core/README.md | 75 +++++ openllm-core/pyproject.toml | 195 +++++++++++ openllm-core/src/openllm_core/__init__.py | 7 + .../src/openllm_core}/_configuration.py | 78 +++-- .../src/openllm_core}/_prompt.py | 0 .../src/openllm_core}/_schema.py | 10 +- .../src/openllm_core}/_strategies.py | 9 +- .../src/openllm_core}/_typing_compat.py | 14 +- .../src/openllm_core/config/__init__.py | 13 + .../config}/configuration_auto.py | 24 +- .../config}/configuration_baichuan.py | 41 +-- .../config}/configuration_chatglm.py | 66 ++-- .../config}/configuration_dolly_v2.py | 51 +-- .../config}/configuration_falcon.py | 41 +-- .../config}/configuration_flan_t5.py | 39 ++- .../config}/configuration_gpt_neox.py | 53 +-- .../config}/configuration_llama.py | 61 ++-- .../openllm_core/config}/configuration_mpt.py | 56 ++-- .../openllm_core/config}/configuration_opt.py | 53 +-- .../config}/configuration_stablelm.py | 54 ++-- .../config}/configuration_starcoder.py | 53 +-- openllm-core/src/openllm_core/exceptions.py | 19 ++ .../src/openllm_core/utils/__init__.py | 304 +++++++++++++++++ .../src/openllm_core}/utils/analytics.py | 13 +- .../src/openllm_core}/utils/codegen.py | 55 +--- .../src/openllm_core}/utils/dantic.py | 6 +- .../src/openllm_core}/utils/import_utils.py | 20 +- .../src/openllm_core}/utils/lazy.py | 5 +- .../src/openllm_core}/utils/representation.py | 7 +- openllm-python/ADDING_NEW_MODEL.md | 17 +- openllm-python/pyproject.toml | 54 +--- openllm-python/src/openllm/__init__.py | 53 ++- openllm-python/src/openllm/_llm.py | 97 +++--- openllm-python/src/openllm/_quantisation.py | 11 +- openllm-python/src/openllm/bundle/__init__.py | 15 +- openllm-python/src/openllm/bundle/_package.py | 69 ++-- .../src/openllm/bundle/oci/__init__.py | 27 +- openllm-python/src/openllm/cli/_factory.py | 20 +- openllm-python/src/openllm/cli/_sdk.py | 12 +- openllm-python/src/openllm/cli/entrypoint.py | 30 +- .../cli/extension/build_base_container.py | 27 +- .../src/openllm/cli/extension/dive_bentos.py | 22 +- .../cli/extension/get_containerfile.py | 18 +- .../src/openllm/cli/extension/get_prompt.py | 21 +- .../src/openllm/cli/extension/list_bentos.py | 14 +- .../src/openllm/cli/extension/list_models.py | 9 +- .../src/openllm/cli/extension/playground.py | 9 +- 
openllm-python/src/openllm/cli/termui.py | 2 +- openllm-python/src/openllm/client.py | 17 + openllm-python/src/openllm/client/__init__.py | 22 -- .../src/openllm/client/runtimes/__init__.py | 15 - .../src/openllm/client/runtimes/base.py | 238 -------------- .../src/openllm/client/runtimes/grpc.py | 93 ------ .../src/openllm/client/runtimes/http.py | 96 ------ openllm-python/src/openllm/exceptions.py | 18 +- openllm-python/src/openllm/models/__init__.py | 10 +- .../src/openllm/models/auto/__init__.py | 10 +- .../src/openllm/models/auto/factory.py | 4 +- .../src/openllm/models/auto/modeling_auto.py | 2 +- .../openllm/models/auto/modeling_flax_auto.py | 2 +- .../openllm/models/auto/modeling_tf_auto.py | 2 +- .../openllm/models/auto/modeling_vllm_auto.py | 2 +- .../src/openllm/models/baichuan/__init__.py | 13 +- .../models/baichuan/modeling_baichuan.py | 9 +- .../models/baichuan/modeling_vllm_baichuan.py | 3 - .../src/openllm/models/chatglm/__init__.py | 13 +- .../models/chatglm/modeling_chatglm.py | 21 +- .../src/openllm/models/dolly_v2/__init__.py | 13 +- .../models/dolly_v2/modeling_dolly_v2.py | 7 +- .../models/dolly_v2/modeling_vllm_dolly_v2.py | 3 - .../src/openllm/models/falcon/__init__.py | 13 +- .../openllm/models/falcon/modeling_falcon.py | 4 - .../models/falcon/modeling_vllm_falcon.py | 3 - .../src/openllm/models/flan_t5/__init__.py | 13 +- .../models/flan_t5/modeling_flan_t5.py | 9 +- .../models/flan_t5/modeling_flax_flan_t5.py | 5 +- .../models/flan_t5/modeling_tf_flan_t5.py | 4 - .../src/openllm/models/gpt_neox/__init__.py | 13 +- .../models/gpt_neox/modeling_gpt_neox.py | 13 +- .../models/gpt_neox/modeling_vllm_gpt_neox.py | 6 +- .../src/openllm/models/llama/__init__.py | 15 +- .../openllm/models/llama/modeling_llama.py | 15 +- .../models/llama/modeling_vllm_llama.py | 7 +- .../src/openllm/models/mpt/__init__.py | 15 +- .../src/openllm/models/mpt/modeling_mpt.py | 27 +- .../openllm/models/mpt/modeling_vllm_mpt.py | 16 +- .../src/openllm/models/opt/__init__.py | 13 +- .../openllm/models/opt/modeling_flax_opt.py | 6 +- .../src/openllm/models/opt/modeling_opt.py | 15 +- .../src/openllm/models/opt/modeling_tf_opt.py | 15 +- .../openllm/models/opt/modeling_vllm_opt.py | 8 +- .../src/openllm/models/stablelm/__init__.py | 13 +- .../models/stablelm/modeling_stablelm.py | 26 +- .../models/stablelm/modeling_vllm_stablelm.py | 10 - .../src/openllm/models/starcoder/__init__.py | 13 +- .../models/starcoder/modeling_starcoder.py | 24 +- .../starcoder/modeling_vllm_starcoder.py | 13 - .../src/openllm/serialisation/__init__.py | 2 +- .../src/openllm/serialisation/ggml.py | 2 +- .../serialisation/transformers/__init__.py | 2 +- .../serialisation/transformers/_helpers.py | 8 +- .../serialisation/transformers/weights.py | 2 +- openllm-python/src/openllm/utils/__init__.py | 305 +----------------- .../src/openllm/utils/dummy_flax_objects.py | 2 +- .../src/openllm/utils/dummy_pt_objects.py | 2 +- .../src/openllm/utils/dummy_tf_objects.py | 2 +- .../src/openllm/utils/dummy_vllm_objects.py | 2 +- .../tests/_strategies/_configuration.py | 8 +- openllm-python/tests/compiled_test.py | 11 - openllm-python/tests/configuration_test.py | 31 +- openllm-python/tests/conftest.py | 2 +- openllm-python/tests/models/conftest.py | 2 +- openllm-python/tests/models_test.py | 8 +- openllm-python/tests/package_test.py | 12 +- openllm-python/tests/strategies_test.py | 16 +- package.json | 2 +- pyproject.toml | 64 ++-- tools/dependencies.py | 22 +- tools/update-config-stubs.py | 12 +- tools/update-dummy.py | 7 +- 
tools/update-models-import.py | 10 +- 148 files changed, 2899 insertions(+), 1937 deletions(-) create mode 100644 changelog.d/249.refactor.md create mode 100755 local.sh create mode 120000 openllm-client/.git_archival.txt create mode 120000 openllm-client/.python-version-default create mode 100644 openllm-client/LICENSE.md create mode 100644 openllm-client/README.md create mode 100644 openllm-client/pyproject.toml create mode 100644 openllm-client/src/openllm_client/__init__.py create mode 100644 openllm-client/src/openllm_client/_adapters.py create mode 100644 openllm-client/src/openllm_client/_base.py create mode 100644 openllm-client/src/openllm_client/benmin/__init__.py create mode 100644 openllm-client/src/openllm_client/benmin/_grpc.py create mode 100644 openllm-client/src/openllm_client/benmin/_http.py create mode 100644 openllm-client/src/openllm_client/client.py create mode 120000 openllm-core/.git_archival.txt create mode 120000 openllm-core/.python-version-default create mode 100644 openllm-core/LICENSE.md create mode 100644 openllm-core/README.md create mode 100644 openllm-core/pyproject.toml create mode 100644 openllm-core/src/openllm_core/__init__.py rename {openllm-python/src/openllm => openllm-core/src/openllm_core}/_configuration.py (96%) rename {openllm-python/src/openllm => openllm-core/src/openllm_core}/_prompt.py (100%) rename {openllm-python/src/openllm => openllm-core/src/openllm_core}/_schema.py (78%) rename {openllm-python/src/openllm => openllm-core/src/openllm_core}/_strategies.py (96%) rename {openllm-python/src/openllm => openllm-core/src/openllm_core}/_typing_compat.py (94%) create mode 100644 openllm-core/src/openllm_core/config/__init__.py rename {openllm-python/src/openllm/models/auto => openllm-core/src/openllm_core/config}/configuration_auto.py (85%) rename {openllm-python/src/openllm/models/baichuan => openllm-core/src/openllm_core/config}/configuration_baichuan.py (73%) rename {openllm-python/src/openllm/models/chatglm => openllm-core/src/openllm_core/config}/configuration_chatglm.py (54%) rename {openllm-python/src/openllm/models/dolly_v2 => openllm-core/src/openllm_core/config}/configuration_dolly_v2.py (78%) rename {openllm-python/src/openllm/models/falcon => openllm-core/src/openllm_core/config}/configuration_falcon.py (71%) rename {openllm-python/src/openllm/models/flan_t5 => openllm-core/src/openllm_core/config}/configuration_flan_t5.py (68%) rename {openllm-python/src/openllm/models/gpt_neox => openllm-core/src/openllm_core/config}/configuration_gpt_neox.py (71%) rename {openllm-python/src/openllm/models/llama => openllm-core/src/openllm_core/config}/configuration_llama.py (81%) rename {openllm-python/src/openllm/models/mpt => openllm-core/src/openllm_core/config}/configuration_mpt.py (61%) rename {openllm-python/src/openllm/models/opt => openllm-core/src/openllm_core/config}/configuration_opt.py (66%) rename {openllm-python/src/openllm/models/stablelm => openllm-core/src/openllm_core/config}/configuration_stablelm.py (72%) rename {openllm-python/src/openllm/models/starcoder => openllm-core/src/openllm_core/config}/configuration_starcoder.py (63%) create mode 100644 openllm-core/src/openllm_core/exceptions.py create mode 100644 openllm-core/src/openllm_core/utils/__init__.py rename {openllm-python/src/openllm => openllm-core/src/openllm_core}/utils/analytics.py (83%) rename {openllm-python/src/openllm => openllm-core/src/openllm_core}/utils/codegen.py (68%) rename {openllm-python/src/openllm => openllm-core/src/openllm_core}/utils/dantic.py (99%) 
rename {openllm-python/src/openllm => openllm-core/src/openllm_core}/utils/import_utils.py (95%) rename {openllm-python/src/openllm => openllm-core/src/openllm_core}/utils/lazy.py (96%) rename {openllm-python/src/openllm => openllm-core/src/openllm_core}/utils/representation.py (91%) create mode 100644 openllm-python/src/openllm/client.py delete mode 100644 openllm-python/src/openllm/client/__init__.py delete mode 100644 openllm-python/src/openllm/client/runtimes/__init__.py delete mode 100644 openllm-python/src/openllm/client/runtimes/base.py delete mode 100644 openllm-python/src/openllm/client/runtimes/grpc.py delete mode 100644 openllm-python/src/openllm/client/runtimes/http.py delete mode 100644 openllm-python/tests/compiled_test.py diff --git a/.gitattributes b/.gitattributes index 77abb8ba..a6e8db87 100644 --- a/.gitattributes +++ b/.gitattributes @@ -2,7 +2,7 @@ contrib/clojure/pnpm-lock.yaml linguist-generated=true contrib/clojure/src/generated/** linguist-generated=true -# Python core +# Python sdk openllm-python/tests/models/__snapshots__/* linguist-generated=true openllm-python/src/openllm/utils/dummy_*.py linguist-generated=true openllm-python/src/openllm/models/__init__.py linguist-generated=true diff --git a/.github/workflows/binary-releases.yml b/.github/workflows/binary-releases.yml index 16b0bd69..4d1eb38d 100644 --- a/.github/workflows/binary-releases.yml +++ b/.github/workflows/binary-releases.yml @@ -75,7 +75,9 @@ jobs: - name: Install build frontend run: python -m pip install --upgrade build - name: Build - run: python -m build -sw openllm-python/ + run: | + bash local.sh + python -m build -sw openllm-python/ - name: Upload artifacts uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce # ratchet:actions/upload-artifact@v3 with: diff --git a/.github/workflows/build-embedding.yml b/.github/workflows/build-embedding.yml index 61a36d93..82d5205b 100644 --- a/.github/workflows/build-embedding.yml +++ b/.github/workflows/build-embedding.yml @@ -112,7 +112,7 @@ jobs: - name: Build OPT Bento with base embeddings id: bento-tag run: | - pip install -e ${{ github.workspace }}/openllm-python + bash local.sh openllm build opt --serialisation legacy --bento-version sha-${{ env.GITHUB_SHA_SHORT }} --machine --dockerfile-template - <> $GITHUB_OUTPUT echo github.ref ${{ github.ref }} pure-wheels-sdist: - name: Pure wheels and sdist distribution + name: Pure wheels and sdist distribution (${{ matrix.directory }}) runs-on: ubuntu-latest needs: get_commit_message if: >- contains(needs.get_commit_message.outputs.message, '[wheel build]') || github.event_name == 'workflow_dispatch' || github.event_name == 'workflow_call' || (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, '02 - Wheel Build')) || (github.event_name == 'push' && (startsWith(github.ref, 'refs/tags/v') || startsWith(github.ref, 'refs/heads/main'))) + strategy: + fail-fast: false + matrix: + directory: ["openllm-core", "openllm-python", "openllm-client"] steps: - uses: actions/checkout@c85c95e3d7251135ab7dc9ce3241c5835cc595a9 # ratchet:actions/checkout@v3 with: @@ -82,37 +86,27 @@ jobs: python-version-file: .python-version-default - name: Build run: hatch build - working-directory: openllm-python + working-directory: ${{ matrix.directory }} - name: Upload artifacts uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce # ratchet:actions/upload-artifact@v3 with: name: python-artefacts - path: openllm-python/dist/* + path: ${{ matrix.directory }}/dist/* 
if-no-files-found: error mypyc: - name: Compiled mypyc wheels (${{ matrix.name }}) - runs-on: ${{ matrix.os }} - defaults: - run: - working-directory: ./openllm-python + name: Compiled wheels (${{ matrix.buildplatform[1] }}-${{ matrix.directory }}) + runs-on: ${{ matrix.buildplatform[0] }} strategy: fail-fast: false matrix: - include: - - os: ubuntu-latest - name: linux-x86_64 - # NOTE: darwin amd64 - - os: macos-latest - name: macos-x86_64 - macos_arch: "x86_64" - # NOTE: darwin arm64 - - os: macos-latest - name: macos-arm64 - macos_arch: "arm64" - # NOTE: darwin universal2 - - os: macos-latest - name: macos-universal2 - macos_arch: "universal2" + # Github Actions doesn't support pairing matrix values together, let's improvise + # https://github.com/github/feedback/discussions/7835#discussioncomment-1769026 + buildplatform: + - [ubuntu-latest, linux-x86_64, ""] + - [macos-latest, macos-x86_64, "x86_64"] + - [macos-latest, macos-arm64, "arm64"] + - [macos-latest, macos-universal2, "universal2"] + directory: ["openllm-core", "openllm-python", "openllm-client"] needs: get_commit_message if: >- contains(needs.get_commit_message.outputs.message, '[wheel build]') || github.event_name == 'workflow_dispatch' || github.event_name == 'workflow_call' || (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, '02 - Wheel Build')) || (github.event_name == 'push' && (startsWith(github.ref, 'refs/tags/v') || startsWith(github.ref, 'refs/heads/main'))) @@ -128,17 +122,16 @@ jobs: - name: Build wheels via cibuildwheel uses: pypa/cibuildwheel@39a63b5912f086dd459cf6fcb13dcdd3fe3bc24d # ratchet:pypa/cibuildwheel@v2.15.0 with: - package-dir: openllm-python + package-dir: ${{ matrix.directory }} config-file: pyproject.toml env: CIBW_PRERELEASE_PYTHONS: True - CIBW_BEFORE_BUILD_MACOS: "rustup target add aarch64-apple-darwin" - CIBW_ARCHS_MACOS: "${{ matrix.macos_arch }}" + CIBW_ARCHS_MACOS: "${{ matrix.buildplatform[2] }}" MYPYPATH: /project/typings - name: Upload wheels as workflow artifacts uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce # ratchet:actions/upload-artifact@v3 with: - name: ${{ matrix.name }}-mypyc-wheels + name: ${{ matrix.buildplatform[1] }}-mypyc-wheels path: ./wheelhouse/*.whl check-download-artefacts: name: dry-run for downloading artefacts diff --git a/.gitignore b/.gitignore index 5496bede..a488593d 100644 --- a/.gitignore +++ b/.gitignore @@ -141,4 +141,4 @@ pyapp /target .pdm-python -/openllm-python/src/openllm/_version.py +**/_version.py diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index 3334dbdf..45156c29 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -72,7 +72,7 @@ Before you can start developing, you'll need to set up your environment: dependencies. 
> [!NOTE] -> If you want to install editable, make sure to install it from `openllm-python` folder +> If you don't want to work with hatch, you can use the editable install workflow by running `bash local.sh` ## Project Structure diff --git a/changelog.d/249.refactor.md b/changelog.d/249.refactor.md new file mode 100644 index 00000000..fcc6f2da --- /dev/null +++ b/changelog.d/249.refactor.md @@ -0,0 +1,5 @@ +OpenLLM now comprises three packages: + +- openllm-core: the main building blocks of OpenLLM, which do not depend on transformers or other heavy DL libraries +- openllm-client: the implementation of `openllm.client` +- openllm: openllm-core + openllm-client + DL features diff --git a/compile.sh b/compile.sh index 8b1e843a..f7e0ac2c 100644 --- a/compile.sh +++ b/compile.sh @@ -1,4 +1,7 @@ #!/usr/bin/env bash SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &> /dev/null && pwd) +pip install -U mypy==1.5.1 build==0.10.0 HATCH_VERBOSE=3 MYPY_CONFIG_FILE_DIR="${SCRIPT_DIR}" HATCH_BUILD_HOOKS_ENABLE=1 MYPYPATH="${SCRIPT_DIR}/typings" python -m build openllm-python -w -C--global-option=--verbose "$@" +HATCH_VERBOSE=3 MYPY_CONFIG_FILE_DIR="${SCRIPT_DIR}" HATCH_BUILD_HOOKS_ENABLE=1 MYPYPATH="${SCRIPT_DIR}/typings" python -m build openllm-core -w -C--global-option=--verbose "$@" +HATCH_VERBOSE=3 MYPY_CONFIG_FILE_DIR="${SCRIPT_DIR}" HATCH_BUILD_HOOKS_ENABLE=1 MYPYPATH="${SCRIPT_DIR}/typings" python -m build openllm-client -w -C--global-option=--verbose "$@" hatch clean diff --git a/cz.py b/cz.py index 8068b77a..ac560eaf 100755 --- a/cz.py +++ b/cz.py @@ -4,21 +4,25 @@ import itertools, os, token, tokenize from tabulate import tabulate TOKEN_WHITELIST = [token.OP, token.NAME, token.NUMBER, token.STRING] -def main() -> int: +def run_cz(dir: str, package: str): headers = ["Name", "Lines", "Tokens/Line"] table = [] - for path, _, files in os.walk(os.path.join("openllm-python", "src", "openllm")): + for path, _, files in os.walk(os.path.join(dir, "src", package)): for name in files: if not name.endswith(".py"): continue filepath = os.path.join(path, name) with tokenize.open(filepath) as file_: tokens = [t for t in tokenize.generate_tokens(file_.readline) if t.type in TOKEN_WHITELIST] token_count, line_count = len(tokens), len(set([t.start[0] for t in tokens])) - table.append([filepath.replace(os.path.join("openllm-python","src"), ""), line_count, token_count / line_count if line_count != 0 else 0]) + table.append([filepath.replace(os.path.join(dir ,"src"), ""), line_count, token_count / line_count if line_count != 0 else 0]) print(tabulate([headers, *sorted(table, key=lambda x: -x[1])], headers="firstrow", floatfmt=".1f") + "\n") for dir_name, group in itertools.groupby(sorted([(x[0].rsplit("/", 1)[0], x[1]) for x in table]), key=lambda x: x[0]): print(f"{dir_name:35s} : {sum([x[1] for x in group]):6d}") print(f"\ntotal line count: {sum([x[1] for x in table])}") +def main() -> int: + run_cz("openllm-python", "openllm") + run_cz("openllm-core", "openllm_core") + run_cz("openllm-client", "openllm_client") return 0 if __name__ == "__main__": raise SystemExit(main()) diff --git a/hatch.toml b/hatch.toml index bdc45979..3146b9d0 100644 --- a/hatch.toml +++ b/hatch.toml @@ -1,5 +1,7 @@ [envs.default] dependencies = [ + "openllm-core @ {root:uri}/openllm-core", + "openllm-client @ {root:uri}/openllm-client", "openllm[opt,chatglm,fine-tune] @ {root:uri}/openllm-python", # NOTE: To run all hooks "pre-commit", @@ -26,7 +28,6 @@ check-stubs = [ "./tools/update-models-import.py", "update-dummy", ] -compile = "bash
./compile.sh {args}" inplace-changelog = "towncrier build --version main --keep" quality = [ "./tools/dependencies.py", @@ -36,13 +37,12 @@ quality = [ "check-stubs", "- pre-commit run --all-files", ] -recompile = ["bash ./clean.sh", "compile"] setup = [ "pre-commit install", "- ln -s .python-version-default .python-version", "curl -fsSL https://raw.githubusercontent.com/clj-kondo/clj-kondo/master/script/install-clj-kondo | bash -", ] -tool = ["quality", "recompile -nx"] +tool = ["quality", "bash ./clean.sh", "bash ./compile.sh {args}"] typing = [ "- pre-commit run mypy {args:-a}", "- pre-commit run pyright {args:-a}", @@ -50,6 +50,8 @@ typing = [ update-dummy = ["- ./tools/update-dummy.py", "./tools/update-dummy.py"] [envs.tests] dependencies = [ + "openllm-core @ {root:uri}/openllm-core", + "openllm-client @ {root:uri}/openllm-client", "openllm[opt,chatglm,fine-tune] @ {root:uri}/openllm-python", # NOTE: interact with docker for container tests. "docker", @@ -94,7 +96,10 @@ clojure = ["bash tools/run-clojure-ui.sh"] [envs.ci] detached = true [envs.ci.scripts] +compile = "bash ./compile.sh {args}" +recompile = ["bash ./clean.sh", "compile"] +edi = "bash local.sh" lock = [ - "bash tools/lock-actions", + "bash tools/lock-actions.sh", "pushd contrib/clojure && pnpm i --frozen-lockfile", ] diff --git a/local.sh b/local.sh new file mode 100755 index 00000000..bca72336 --- /dev/null +++ b/local.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash + +set -ex + +GIT_ROOT=$(git rev-parse --show-toplevel) +cd "$GIT_ROOT" || exit 1 + +pip install -e "$GIT_ROOT/openllm-core" -v +pip install -e "$GIT_ROOT/openllm-client" -v +pip install -e "$GIT_ROOT/openllm-python" -v diff --git a/openllm-client/.git_archival.txt b/openllm-client/.git_archival.txt new file mode 120000 index 00000000..d7a42b25 --- /dev/null +++ b/openllm-client/.git_archival.txt @@ -0,0 +1 @@ +../.git_archival.txt \ No newline at end of file diff --git a/openllm-client/.python-version-default b/openllm-client/.python-version-default new file mode 120000 index 00000000..40488ba7 --- /dev/null +++ b/openllm-client/.python-version-default @@ -0,0 +1 @@ +../.python-version-default \ No newline at end of file diff --git a/openllm-client/LICENSE.md b/openllm-client/LICENSE.md new file mode 100644 index 00000000..7fd0f057 --- /dev/null +++ b/openllm-client/LICENSE.md @@ -0,0 +1,194 @@ +Apache License +============== + +_Version 2.0, January 2004_ +_<>_ + +### Terms and Conditions for use, reproduction, and distribution + +#### 1. Definitions + +“License” shall mean the terms and conditions for use, reproduction, and +distribution as defined by Sections 1 through 9 of this document. + +“Licensor” shall mean the copyright owner or entity authorized by the copyright +owner that is granting the License. + +“Legal Entity” shall mean the union of the acting entity and all other entities +that control, are controlled by, or are under common control with that entity. +For the purposes of this definition, “control” means **(i)** the power, direct or +indirect, to cause the direction or management of such entity, whether by +contract or otherwise, or **(ii)** ownership of fifty percent (50%) or more of the +outstanding shares, or **(iii)** beneficial ownership of such entity. + +“You” (or “Your”) shall mean an individual or Legal Entity exercising +permissions granted by this License. + +“Source” form shall mean the preferred form for making modifications, including +but not limited to software source code, documentation source, and configuration +files. 
+ +“Object” form shall mean any form resulting from mechanical transformation or +translation of a Source form, including but not limited to compiled object code, +generated documentation, and conversions to other media types. + +“Work” shall mean the work of authorship, whether in Source or Object form, made +available under the License, as indicated by a copyright notice that is included +in or attached to the work (an example is provided in the Appendix below). + +“Derivative Works” shall mean any work, whether in Source or Object form, that +is based on (or derived from) the Work and for which the editorial revisions, +annotations, elaborations, or other modifications represent, as a whole, an +original work of authorship. For the purposes of this License, Derivative Works +shall not include works that remain separable from, or merely link (or bind by +name) to the interfaces of, the Work and Derivative Works thereof. + +“Contribution” shall mean any work of authorship, including the original version +of the Work and any modifications or additions to that Work or Derivative Works +thereof, that is intentionally submitted to Licensor for inclusion in the Work +by the copyright owner or by an individual or Legal Entity authorized to submit +on behalf of the copyright owner. For the purposes of this definition, +“submitted” means any form of electronic, verbal, or written communication sent +to the Licensor or its representatives, including but not limited to +communication on electronic mailing lists, source code control systems, and +issue tracking systems that are managed by, or on behalf of, the Licensor for +the purpose of discussing and improving the Work, but excluding communication +that is conspicuously marked or otherwise designated in writing by the copyright +owner as “Not a Contribution.” + +“Contributor” shall mean Licensor and any individual or Legal Entity on behalf +of whom a Contribution has been received by Licensor and subsequently +incorporated within the Work. + +#### 2. Grant of Copyright License + +Subject to the terms and conditions of this License, each Contributor hereby +grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, +irrevocable copyright license to reproduce, prepare Derivative Works of, +publicly display, publicly perform, sublicense, and distribute the Work and such +Derivative Works in Source or Object form. + +#### 3. Grant of Patent License + +Subject to the terms and conditions of this License, each Contributor hereby +grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, +irrevocable (except as stated in this section) patent license to make, have +made, use, offer to sell, sell, import, and otherwise transfer the Work, where +such license applies only to those patent claims licensable by such Contributor +that are necessarily infringed by their Contribution(s) alone or by combination +of their Contribution(s) with the Work to which such Contribution(s) was +submitted. If You institute patent litigation against any entity (including a +cross-claim or counterclaim in a lawsuit) alleging that the Work or a +Contribution incorporated within the Work constitutes direct or contributory +patent infringement, then any patent licenses granted to You under this License +for that Work shall terminate as of the date such litigation is filed. + +#### 4. 
Redistribution + +You may reproduce and distribute copies of the Work or Derivative Works thereof +in any medium, with or without modifications, and in Source or Object form, +provided that You meet the following conditions: + +* **(a)** You must give any other recipients of the Work or Derivative Works a copy of +this License; and +* **(b)** You must cause any modified files to carry prominent notices stating that You +changed the files; and +* **(c)** You must retain, in the Source form of any Derivative Works that You distribute, +all copyright, patent, trademark, and attribution notices from the Source form +of the Work, excluding those notices that do not pertain to any part of the +Derivative Works; and +* **(d)** If the Work includes a “NOTICE” text file as part of its distribution, then any +Derivative Works that You distribute must include a readable copy of the +attribution notices contained within such NOTICE file, excluding those notices +that do not pertain to any part of the Derivative Works, in at least one of the +following places: within a NOTICE text file distributed as part of the +Derivative Works; within the Source form or documentation, if provided along +with the Derivative Works; or, within a display generated by the Derivative +Works, if and wherever such third-party notices normally appear. The contents of +the NOTICE file are for informational purposes only and do not modify the +License. You may add Your own attribution notices within Derivative Works that +You distribute, alongside or as an addendum to the NOTICE text from the Work, +provided that such additional attribution notices cannot be construed as +modifying the License. + +You may add Your own copyright statement to Your modifications and may provide +additional or different license terms and conditions for use, reproduction, or +distribution of Your modifications, or for any such Derivative Works as a whole, +provided Your use, reproduction, and distribution of the Work otherwise complies +with the conditions stated in this License. + +#### 5. Submission of Contributions + +Unless You explicitly state otherwise, any Contribution intentionally submitted +for inclusion in the Work by You to the Licensor shall be under the terms and +conditions of this License, without any additional terms or conditions. +Notwithstanding the above, nothing herein shall supersede or modify the terms of +any separate license agreement you may have executed with Licensor regarding +such Contributions. + +#### 6. Trademarks + +This License does not grant permission to use the trade names, trademarks, +service marks, or product names of the Licensor, except as required for +reasonable and customary use in describing the origin of the Work and +reproducing the content of the NOTICE file. + +#### 7. Disclaimer of Warranty + +Unless required by applicable law or agreed to in writing, Licensor provides the +Work (and each Contributor provides its Contributions) on an “AS IS” BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, +including, without limitation, any warranties or conditions of TITLE, +NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are +solely responsible for determining the appropriateness of using or +redistributing the Work and assume any risks associated with Your exercise of +permissions under this License. + +#### 8. 
Limitation of Liability + +In no event and under no legal theory, whether in tort (including negligence), +contract, or otherwise, unless required by applicable law (such as deliberate +and grossly negligent acts) or agreed to in writing, shall any Contributor be +liable to You for damages, including any direct, indirect, special, incidental, +or consequential damages of any character arising as a result of this License or +out of the use or inability to use the Work (including but not limited to +damages for loss of goodwill, work stoppage, computer failure or malfunction, or +any and all other commercial damages or losses), even if such Contributor has +been advised of the possibility of such damages. + +#### 9. Accepting Warranty or Additional Liability + +While redistributing the Work or Derivative Works thereof, You may choose to +offer, and charge a fee for, acceptance of support, warranty, indemnity, or +other liability obligations and/or rights consistent with this License. However, +in accepting such obligations, You may act only on Your own behalf and on Your +sole responsibility, not on behalf of any other Contributor, and only if You +agree to indemnify, defend, and hold each Contributor harmless for any liability +incurred by, or claims asserted against, such Contributor by reason of your +accepting any such warranty or additional liability. + +_END OF TERMS AND CONDITIONS_ + +### APPENDIX: How to apply the Apache License to your work + +To apply the Apache License to your work, attach the following boilerplate +notice, with the fields enclosed by brackets `[]` replaced with your own +identifying information. (Don't include the brackets!) The text should be +enclosed in the appropriate comment syntax for the file format. We also +recommend that a file or class name and description of purpose be included on +the same “printed page” as the copyright notice for easier identification within +third-party archives. + + Copyright 2023 Atalaya Tech Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/openllm-client/README.md b/openllm-client/README.md new file mode 100644 index 00000000..417dc2e7 --- /dev/null +++ b/openllm-client/README.md @@ -0,0 +1,86 @@ +![Banner for OpenLLM](/.github/assets/main-banner.png) + + + +
+

👾 OpenLLM Client

+ + pypi_status + + test_pypi_status + + Twitter + + Discord + + ci + + pre-commit.ci status +
+ + python_version + + Hatch + + code style + + Ruff + + types - mypy + + types - pyright +
+

OpenLLM Client: Interacting with OpenLLM HTTP/gRPC server, or any BentoML server.

+ +
+ +## 📖 Introduction + +With OpenLLM, you can run inference with any open-source large language model, +deploy it to the cloud or on-premises, and build powerful AI applications. + +To learn more about OpenLLM, please visit OpenLLM's README.md + +This package holds the underlying client implementation for OpenLLM. If you are +coming from OpenLLM, the client can be accessed via `openllm.client`. + +It provides an API similar to [`bentoml.Client`](https://docs.bentoml.com/en/latest/guides/client.html) +(via `openllm_client.benmin`) for interacting with an OpenLLM server. It can also be used with a general +BentoML server. + +> [!NOTE] +> The interop layer for generic BentoML servers is considered experimental and may be merged back into BentoML. +> If you are only using this package to interact with an OpenLLM server, nothing changes from the `openllm.client` namespace. + +```python +import openllm + +client = openllm.client.HTTPClient() + +client.query('Explain to me the difference between "further" and "farther"') +``` + + + +![Gif showing OpenLLM Intro](/.github/assets/output.gif) + +
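+
+The same clients can also be used from `openllm_client` directly. The snippet below is a minimal
+sketch of the async flavour: it assumes an OpenLLM server is already running at `http://localhost:3000`,
+and the constructor arguments mirror the base client in `_base.py` (the concrete `AsyncHTTPClient`
+may apply its own defaults).
+
+```python
+import asyncio
+
+import openllm_client
+
+async def main() -> None:
+  # address and timeout of a running OpenLLM server (assumed values)
+  client = openllm_client.AsyncHTTPClient('http://localhost:3000', timeout=30)
+  # returns the post-processed response text by default
+  print(await client.query('What is the difference between a llama and an alpaca?'))
+
+asyncio.run(main())
+```
+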
+ + + +## 📔 Citation + +If you use OpenLLM in your research, we provide a [citation](../CITATION.cff) to use: + +```bibtex +@software{Pham_OpenLLM_Operating_LLMs_2023, +author = {Pham, Aaron and Yang, Chaoyu and Sheng, Sean and Zhao, Shenyang and Lee, Sauyon and Jiang, Bo and Dong, Fog and Guan, Xipeng and Ming, Frost}, +license = {Apache-2.0}, +month = jun, +title = {{OpenLLM: Operating LLMs in production}}, +url = {https://github.com/bentoml/OpenLLM}, +year = {2023} +} +``` + + diff --git a/openllm-client/pyproject.toml b/openllm-client/pyproject.toml new file mode 100644 index 00000000..f4466905 --- /dev/null +++ b/openllm-client/pyproject.toml @@ -0,0 +1,172 @@ +[build-system] +build-backend = "hatchling.build" +requires = [ + "hatchling==1.18.0", + "hatch-vcs==0.3.0", + "hatch-fancy-pypi-readme==23.1.0", +] + +[project] +authors = [ + { name = "Aaron Pham", email = "aarnphm@bentoml.com" }, + { name = "BentoML Team", email = "contact@bentoml.com" }, +] +dynamic = ['readme', 'version'] +classifiers = [ + "Development Status :: 5 - Production/Stable", + "Environment :: GPU :: NVIDIA CUDA", + "Environment :: GPU :: NVIDIA CUDA :: 12", + "Environment :: GPU :: NVIDIA CUDA :: 11.8", + "Environment :: GPU :: NVIDIA CUDA :: 11.7", + "License :: OSI Approved :: Apache Software License", + "Topic :: Scientific/Engineering", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "Topic :: Software Development :: Libraries", + "Operating System :: OS Independent", + "Intended Audience :: Developers", + "Intended Audience :: Science/Research", + "Intended Audience :: System Administrators", + "Typing :: Typed", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: Implementation :: PyPy", +] +description = "OpenLLM Client: Interacting with OpenLLM HTTP/gRPC server, or any BentoML server." +keywords = [ + "MLOps", + "AI", + "BentoML", + "Model Serving", + "Model Deployment", + "LLMOps", + "Falcon", + "Vicuna", + "Llama 2", + "Fine tuning", + "Serverless", + "Large Language Model", + "Generative AI", + "StableLM", + "Alpaca", + "PyTorch", + "Transformers", +] +dependencies = ["openllm-core", "httpx"] +license = "Apache-2.0" +name = "openllm-client" +requires-python = ">=3.8" +[project.urls] +Blog = "https://modelserving.com" +Chat = "https://discord.gg/openllm" +Documentation = "https://github.com/bentoml/OpenLLM/blob/main/openllm-client/README.md" +GitHub = "https://github.com/bentoml/OpenLLM/blob/main/openllm-client" +History = "https://github.com/bentoml/OpenLLM/blob/main/CHANGELOG.md" +Homepage = "https://bentoml.com" +Tracker = "https://github.com/bentoml/OpenLLM/issues" +Twitter = "https://twitter.com/bentomlai" +[project.optional-dependencies] +full = ["openllm-client[grpc,agents]"] +grpc = ["bentoml[grpc]>=1.0.25"] +agents = ["transformers[agents]>=4.30", "diffusers", "soundfile"] + +[tool.hatch.version] +fallback-version = "0.0.0" +source = "vcs" +[tool.hatch.build.hooks.vcs] +version-file = "src/openllm_client/_version.py" +[tool.hatch.version.raw-options] +git_describe_command = [ + "git", + "describe", + "--dirty", + "--tags", + "--long", + "--first-parent", +] +local_scheme = "no-local-version" +root = ".." 
+[tool.hatch.metadata] +allow-direct-references = true +[tool.hatch.build.targets.wheel] +only-include = ["src/openllm_client"] +sources = ["src"] +[tool.hatch.build.targets.sdist] +exclude = ["/.git_archival.txt", "tests", "/.python-version-default"] +[tool.hatch.build.targets.wheel.hooks.mypyc] +dependencies = [ + "hatch-mypyc==0.16.0", + "mypy==1.5.1", + # avoid https://github.com/pallets/click/issues/2558 + "click==8.1.3", + "bentoml==1.1.2", + "transformers>=4.31.0", + "pandas-stubs", + "types-psutil", + "types-tabulate", + "types-PyYAML", + "types-protobuf", +] +enable-by-default = false +include = ["src/openllm_client/__init__.py", "src/openllm_client/client.py"] +# NOTE: This is consistent with pyproject.toml +mypy-args = [ + "--strict", + # this is because all transient library doesn't have types + "--allow-subclassing-any", + "--check-untyped-defs", + "--ignore-missing-imports", + "--no-warn-return-any", + "--warn-unreachable", + "--no-warn-no-return", + "--no-warn-unused-ignores", +] +options = { verbose = true, strip_asserts = true, debug_level = "2", opt_level = "3", include_runtime_files = true } +require-runtime-dependencies = true + +[tool.hatch.metadata.hooks.fancy-pypi-readme] +content-type = "text/markdown" +# PyPI doesn't support the tag. +[[tool.hatch.metadata.hooks.fancy-pypi-readme.fragments]] +text = """ +

+ + Banner for OpenLLM + +

+ +""" +[[tool.hatch.metadata.hooks.fancy-pypi-readme.fragments]] +end-before = "\n" +path = "README.md" +start-after = "\n" +[[tool.hatch.metadata.hooks.fancy-pypi-readme.fragments]] +text = """ + +

+ Gif showing OpenLLM Intro +

+""" +[[tool.hatch.metadata.hooks.fancy-pypi-readme.fragments]] +text = """ + +

+ Gif showing Agent integration +

+""" +[[tool.hatch.metadata.hooks.fancy-pypi-readme.fragments]] +end-before = "\n" +path = "README.md" +start-after = "\n" +[[tool.hatch.metadata.hooks.fancy-pypi-readme.fragments]] +text = """ + +--- + +[Click me for full changelog](https://github.com/bentoml/openllm/blob/main/CHANGELOG.md) +""" diff --git a/openllm-client/src/openllm_client/__init__.py b/openllm-client/src/openllm_client/__init__.py new file mode 100644 index 00000000..0dd79140 --- /dev/null +++ b/openllm-client/src/openllm_client/__init__.py @@ -0,0 +1,5 @@ +from __future__ import annotations + +from . import benmin as benmin +from ._base import BaseAsyncClient as BaseAsyncClient, BaseClient as BaseClient +from .client import AsyncHTTPClient as AsyncHTTPClient, HTTPClient as HTTPClient, GrpcClient as GrpcClient, AsyncGrpcClient as AsyncGrpcClient diff --git a/openllm-client/src/openllm_client/_adapters.py b/openllm-client/src/openllm_client/_adapters.py new file mode 100644 index 00000000..e69de29b diff --git a/openllm-client/src/openllm_client/_base.py b/openllm-client/src/openllm_client/_base.py new file mode 100644 index 00000000..5a4b90c0 --- /dev/null +++ b/openllm-client/src/openllm_client/_base.py @@ -0,0 +1,203 @@ +# mypy: disable-error-code="override,no-redef" +from __future__ import annotations +import typing as t, functools, openllm_core, logging, httpx, orjson, attr, abc +from http import HTTPStatus +from urllib.parse import urljoin +from .benmin import Client as BentoClient, AsyncClient as AsyncBentoClient +from openllm_core.utils import is_transformers_supports_agent, is_transformers_available, bentoml_cattr, ensure_exec_coro +from openllm_core._typing_compat import overload, LiteralString +if t.TYPE_CHECKING: + from openllm_core._typing_compat import LiteralRuntime, DictStrAny + import transformers + +logger = logging.getLogger(__name__) + +@attr.define(slots=False, init=False) +class _ClientAttr: + _address: str + _timeout: float = attr.field(default=30) + _api_version: str = attr.field(default="v1") + def __init__(self, address: str, timeout: float = 30, api_version: str = "v1"): self.__attrs_init__(address, timeout, api_version) + + @abc.abstractmethod + def call(self, api_name: str, *args: t.Any, **attrs: t.Any) -> t.Any: raise NotImplementedError + @abc.abstractmethod + def _run_hf_agent(self, *args: t.Any, **kwargs: t.Any) -> t.Any: raise NotImplementedError + @overload + @abc.abstractmethod + def query(self, prompt: str, *, return_response: t.Literal["processed"], **attrs: t.Any) -> str: ... + @overload + @abc.abstractmethod + def query(self, prompt: str, *, return_response: t.Literal["raw"], **attrs: t.Any) -> DictStrAny: ... + @overload + @abc.abstractmethod + def query(self, prompt: str, *, return_response: t.Literal["attrs"], **attrs: t.Any) -> openllm_core.GenerationOutput: ... + @abc.abstractmethod + def query(self, prompt: str, return_response: t.Literal["attrs", "raw", "processed"] = "processed", **attrs: t.Any) -> t.Any: raise NotImplementedError + # NOTE: Scikit interface + @overload + @abc.abstractmethod + def predict(self, prompt: str, *, return_response: t.Literal["processed"], **attrs: t.Any) -> str: ... + @overload + @abc.abstractmethod + def predict(self, prompt: str, *, return_response: t.Literal["raw"], **attrs: t.Any) -> DictStrAny: ... + @overload + @abc.abstractmethod + def predict(self, prompt: str, *, return_response: t.Literal["attrs"], **attrs: t.Any) -> openllm_core.GenerationOutput: ... 
+ @abc.abstractmethod + def predict(self, prompt: str, **attrs: t.Any) -> t.Any: raise NotImplementedError + + @functools.cached_property + def _hf_agent(self) -> transformers.HfAgent: + if not is_transformers_available(): raise RuntimeError("transformers is required to use HF agent. Install with 'pip install \"openllm-client[agents]\"'.") + if not self.supports_hf_agent: raise RuntimeError(f"{self.model_name} ({self.framework}) does not support running HF agent.") + if not is_transformers_supports_agent(): raise RuntimeError("Current 'transformers' does not support Agent. Make sure to upgrade to at least 4.29: 'pip install -U \"transformers>=4.29\"'") + import transformers + return transformers.HfAgent(urljoin(self._address, "/hf/agent")) + @property + def _metadata(self) -> t.Any: return self.call("metadata") + @property + def model_name(self) -> str: + try: return self._metadata["model_name"] + except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None + @property + def model_id(self) -> str: + try: return self._metadata["model_id"] + except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None + @property + def framework(self) -> LiteralRuntime: + try: return self._metadata["framework"] + except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None + @property + def timeout(self) -> int: + try: return self._metadata["timeout"] + except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None + @property + def configuration(self) -> dict[str, t.Any]: + try: return orjson.loads(self._metadata["configuration"]) + except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None + @property + def supports_embeddings(self) -> bool: + try: return self._metadata.get("supports_embeddings", False) + except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None + @property + def supports_hf_agent(self) -> bool: + try: return self._metadata.get("supports_hf_agent", False) + except KeyError: raise RuntimeError("Malformed service endpoint. 
(Possible malicious)") from None + @property + def config(self) -> openllm_core.LLMConfig: return openllm_core.AutoConfig.for_model(self.model_name).model_construct_env(**self.configuration) + @functools.cached_property + def inner(self) -> t.Any: raise NotImplementedError("'inner' client is not implemented.") + +class _Client(_ClientAttr): + _host: str + _port: str + def call(self, api_name: str, *args: t.Any, **attrs: t.Any) -> t.Any: return self.inner.call(f"{api_name}_{self._api_version}", *args, **attrs) + def health(self) -> t.Any: return self.inner.health() + @functools.cached_property + def inner(self) -> BentoClient: + BentoClient.wait_until_server_ready(self._host, int(self._port), timeout=self._timeout) + return BentoClient.from_url(self._address) + + # Agent integration + def ask_agent(self, task: str, *, return_code: bool = False, remote: bool = False, agent_type: LiteralString = "hf", **attrs: t.Any) -> t.Any: + if agent_type == "hf": return self._run_hf_agent(task, return_code=return_code, remote=remote, **attrs) + else: raise RuntimeError(f"Unknown 'agent_type={agent_type}'") + def _run_hf_agent(self, *args: t.Any, **kwargs: t.Any) -> t.Any: + if len(args) > 1: raise ValueError("'args' should only take one positional argument.") + task = kwargs.pop("task", args[0]) + return_code = kwargs.pop("return_code", False) + remote = kwargs.pop("remote", False) + try: return self._hf_agent.run(task, return_code=return_code, remote=remote, **kwargs) + except Exception as err: + logger.error("Exception caught while sending instruction to HF agent: %s", err, exc_info=err) + logger.info("Tip: LLMServer at '%s' might not support 'generate_one'.", self._address) + +class _AsyncClient(_ClientAttr): + _host: str + _port: str + def __init__(self, address: str, timeout: float = 30): self._address,self._timeout = address,timeout + async def call(self, api_name: str, *args: t.Any, **attrs: t.Any) -> t.Any: return await self.inner.call(f"{api_name}_{self._api_version}", *args, **attrs) + async def health(self) -> t.Any: return await self.inner.health() + @functools.cached_property + def inner(self) -> AsyncBentoClient: + ensure_exec_coro(AsyncBentoClient.wait_until_server_ready(self._host, int(self._port), timeout=self._timeout)) + return ensure_exec_coro(AsyncBentoClient.from_url(self._address)) + + # Agent integration + async def ask_agent(self, task: str, *, return_code: bool = False, remote: bool = False, agent_type: LiteralString = "hf", **attrs: t.Any) -> t.Any: + """Async version of agent.run.""" + if agent_type == "hf": return await self._run_hf_agent(task, return_code=return_code, remote=remote, **attrs) + else: raise RuntimeError(f"Unknown 'agent_type={agent_type}'") + async def _run_hf_agent(self, *args: t.Any, **kwargs: t.Any) -> t.Any: + if not is_transformers_supports_agent(): raise RuntimeError("This version of transformers does not support agent.run. 
Make sure to upgrade to transformers>4.30.0") + if len(args) > 1: raise ValueError("'args' should only take one positional argument.") + from transformers.tools.agents import clean_code_for_run, get_tool_creation_code, resolve_tools + from transformers.tools.python_interpreter import evaluate + + task = kwargs.pop("task", args[0]) + return_code = kwargs.pop("return_code", False) + remote = kwargs.pop("remote", False) + stop = ["Task:"] + prompt = t.cast(str, self._hf_agent.format_prompt(task)) + async with httpx.AsyncClient(timeout=httpx.Timeout(self.timeout)) as client: + response = await client.post(self._hf_agent.url_endpoint, json={"inputs": prompt, "parameters": {"max_new_tokens": 200, "return_full_text": False, "stop": stop}}) + if response.status_code != HTTPStatus.OK: raise ValueError(f"Error {response.status_code}: {response.json()}") + + result = response.json()[0]["generated_text"] + # Inference API returns the stop sequence + for stop_seq in stop: + if result.endswith(stop_seq): + result = result[:-len(stop_seq)] + break + # the below have the same logic as agent.run API + explanation, code = clean_code_for_run(result) + self._hf_agent.log(f"==Explanation from the agent==\n{explanation}") + self._hf_agent.log(f"\n\n==Code generated by the agent==\n{code}") + if not return_code: + self._hf_agent.log("\n\n==Result==") + self._hf_agent.cached_tools = resolve_tools(code, self._hf_agent.toolbox, remote=remote, cached_tools=self._hf_agent.cached_tools) + return evaluate(code, self._hf_agent.cached_tools, state=kwargs.copy()) + else: + tool_code = get_tool_creation_code(code, self._hf_agent.toolbox, remote=remote) + return f"{tool_code}\n{code}" + +class BaseClient(_Client): + def chat(self, prompt: str, history: list[str], **attrs: t.Any) -> str: raise NotImplementedError + def embed(self, prompt: t.Sequence[str] | str) -> openllm_core.EmbeddingsOutput: return openllm_core.EmbeddingsOutput(**self.call("embeddings", list([prompt] if isinstance(prompt, str) else prompt))) + def predict(self, prompt: str, **attrs: t.Any) -> openllm_core.GenerationOutput | DictStrAny | str: return self.query(prompt, **attrs) + def query(self, prompt: str, return_response: t.Literal["attrs", "raw", "processed"] = "processed", **attrs: t.Any) -> t.Any: + return_raw_response = attrs.pop("return_raw_response", None) + if return_raw_response is not None: + logger.warning("'return_raw_response' is now deprecated. Please use 'return_response=\"raw\"' instead.") + if return_raw_response is True: return_response = "raw" + return_attrs = attrs.pop("return_attrs", None) + if return_attrs is not None: + logger.warning("'return_attrs' is now deprecated. 
Please use 'return_response=\"attrs\"' instead.") + if return_attrs is True: return_response = "attrs" + use_default_prompt_template = attrs.pop("use_default_prompt_template", False) + prompt, generate_kwargs, postprocess_kwargs = self.config.sanitize_parameters(prompt, use_default_prompt_template=use_default_prompt_template, **attrs) + r = openllm_core.GenerationOutput(**self.call("generate", openllm_core.GenerationInput(prompt=prompt, llm_config=self.config.model_construct_env(**generate_kwargs)).model_dump())) + if return_response == "attrs": return r + elif return_response == "raw": return bentoml_cattr.unstructure(r) + else: return self.config.postprocess_generate(prompt, r.responses, **postprocess_kwargs) + +class BaseAsyncClient(_AsyncClient): + async def chat(self, prompt: str, history: list[str], **attrs: t.Any) -> str: raise NotImplementedError + async def embed(self, prompt: t.Sequence[str] | str) -> openllm_core.EmbeddingsOutput: return openllm_core.EmbeddingsOutput(**(await self.call("embeddings", list([prompt] if isinstance(prompt, str) else prompt)))) + async def predict(self, prompt: str, **attrs: t.Any) -> t.Any: return await self.query(prompt, **attrs) + async def query(self, prompt: str, return_response: t.Literal["attrs", "raw", "processed"] = "processed", **attrs: t.Any) -> t.Any: + return_raw_response = attrs.pop("return_raw_response", None) + if return_raw_response is not None: + logger.warning("'return_raw_response' is now deprecated. Please use 'return_response=\"raw\"' instead.") + if return_raw_response is True: return_response = "raw" + return_attrs = attrs.pop("return_attrs", None) + if return_attrs is not None: + logger.warning("'return_attrs' is now deprecated. Please use 'return_response=\"attrs\"' instead.") + if return_attrs is True: return_response = "attrs" + use_default_prompt_template = attrs.pop("use_default_prompt_template", False) + prompt, generate_kwargs, postprocess_kwargs = self.config.sanitize_parameters(prompt, use_default_prompt_template=use_default_prompt_template, **attrs) + r = openllm_core.GenerationOutput(**(await self.call("generate", openllm_core.GenerationInput(prompt=prompt, llm_config=self.config.model_construct_env(**generate_kwargs)).model_dump()))) + if return_response == "attrs": return r + elif return_response == "raw": return bentoml_cattr.unstructure(r) + else: return self.config.postprocess_generate(prompt, r.responses, **postprocess_kwargs) diff --git a/openllm-client/src/openllm_client/benmin/__init__.py b/openllm-client/src/openllm_client/benmin/__init__.py new file mode 100644 index 00000000..885b7648 --- /dev/null +++ b/openllm-client/src/openllm_client/benmin/__init__.py @@ -0,0 +1,87 @@ +"""This holds a simple client implementation, somewhat similar to `bentoml.client`. + +This module is subjected to change and to be merged upstream to BentoML. + +```python +import openllm_client + +client = openllm_client.benmin.Client.from_url("http://localhost:3000") +``` + +The client implementation won't include a dynamic assignment of the service endpoints, rather this should be called +via `client.call` or `await client.call`. 
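+
+For example, with the `client` constructed above (`classify` is only a placeholder for whatever
+API the target service actually exposes, and the payload is whatever that API expects):
+
+```python
+result = client.call("classify", {"text": "hello world"})
+```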
+""" +from __future__ import annotations +import typing as t, bentoml, attr, httpx +from abc import abstractmethod +if t.TYPE_CHECKING: from bentoml._internal.service.inference_api import InferenceAPI + +__all__ = ["Client", "AsyncClient"] + +@attr.define(init=False) +class Client: + server_url: str + endpoints: t.List[str] + svc: bentoml.Service + timeout: int = attr.field(default=30) + def __init__(self, server_url: str, svc: bentoml.Service, **kwargs: t.Any) -> None: + if len(svc.apis) == 0: raise bentoml.exceptions.BentoMLException("No APIs was found while constructing clients.") + self.__attrs_init__(server_url=server_url, endpoints=list(svc.apis), svc=svc) + for it, val in kwargs.items(): object.__setattr__(self, it, val) + def call(self, bentoml_api_name: str, data: t.Any = None, **kwargs: t.Any) -> t.Any: return self._call(data, _inference_api=self.svc.apis[bentoml_api_name], **kwargs) + @abstractmethod + def _call(self, data: t.Any, /, *, _inference_api: InferenceAPI[t.Any], **kwargs: t.Any) -> t.Any: raise NotImplementedError + @abstractmethod + def health(self) -> t.Any: raise NotImplementedError + @classmethod + def from_url(cls, url: str, **kwargs: t.Any) -> Client: + try: + from ._http import HttpClient + return HttpClient.from_url(url, **kwargs) + except httpx.RemoteProtocolError: + from ._grpc import GrpcClient + return GrpcClient.from_url(url, **kwargs) + except Exception as err: raise bentoml.exceptions.BentoMLException("Failed to create client from url: %s" % url) from err + @staticmethod + def wait_until_server_ready(host: str, port: int, timeout: float = 30, **kwargs: t.Any) -> None: + try: + from ._http import HttpClient + return HttpClient.wait_until_server_ready(host, port, timeout, **kwargs) + except httpx.RemoteProtocolError: + from ._grpc import GrpcClient + return GrpcClient.wait_until_server_ready(host, port, timeout, **kwargs) + except Exception as err: raise bentoml.exceptions.BentoMLException("Failed to wait until server ready: %s:%d" % (host, port)) from err + +@attr.define(init=False) +class AsyncClient: + server_url: str + endpoints: t.List[str] + svc: bentoml.Service + timeout: int = attr.field(default=30) + def __init__(self, server_url: str, svc: bentoml.Service, **kwargs: t.Any) -> None: + if len(svc.apis) == 0: raise bentoml.exceptions.BentoMLException("No APIs was found while constructing clients.") + self.__attrs_init__(server_url=server_url, endpoints=list(svc.apis), svc=svc) + for it, val in kwargs.items(): object.__setattr__(self, it, val) + async def call(self, bentoml_api_name: str, data: t.Any = None, **kwargs: t.Any) -> t.Any: return await self._call(data, _inference_api=self.svc.apis[bentoml_api_name], **kwargs) + @abstractmethod + async def _call(self, data: t.Any, /, *, _inference_api: InferenceAPI[t.Any], **kwargs: t.Any) -> t.Any: raise NotImplementedError + @abstractmethod + async def health(self) -> t.Any: raise NotImplementedError + @classmethod + async def from_url(cls, url: str, **kwargs: t.Any) -> AsyncClient: + try: + from ._http import AsyncHttpClient + return await AsyncHttpClient.from_url(url, **kwargs) + except httpx.RemoteProtocolError: + from ._grpc import AsyncGrpcClient + return await AsyncGrpcClient.from_url(url, **kwargs) + except Exception as err: raise bentoml.exceptions.BentoMLException("Failed to create client from url: %s" % url) from err + @staticmethod + async def wait_until_server_ready(host: str, port: int, timeout: float = 30, **kwargs: t.Any) -> None: + try: + from ._http import AsyncHttpClient + await 
AsyncHttpClient.wait_until_server_ready(host, port, timeout, **kwargs) + except httpx.RemoteProtocolError: + from ._grpc import AsyncGrpcClient + await AsyncGrpcClient.wait_until_server_ready(host, port, timeout, **kwargs) + except Exception as err: raise bentoml.exceptions.BentoMLException("Failed to wait until server ready: %s:%d" % (host, port)) from err diff --git a/openllm-client/src/openllm_client/benmin/_grpc.py b/openllm-client/src/openllm_client/benmin/_grpc.py new file mode 100644 index 00000000..d13a0c82 --- /dev/null +++ b/openllm-client/src/openllm_client/benmin/_grpc.py @@ -0,0 +1,187 @@ +# mypy: disable-error-code="no-redef" +from __future__ import annotations +import typing as t, logging, time, functools, bentoml +from bentoml._internal.service.inference_api import InferenceAPI +from openllm_client.benmin import Client, AsyncClient +from openllm_core.utils import is_grpc_available, is_grpc_health_available, ensure_exec_coro +from openllm_core._typing_compat import NotRequired, overload +from bentoml.grpc.utils import load_from_file, import_generated_stubs +if not is_grpc_available() or not is_grpc_health_available(): raise ImportError("gRPC is required to use gRPC client. Install with 'pip install \"openllm-client[grpc]\"'.") +from grpc import aio +from google.protobuf import json_format +import grpc, grpc_health.v1.health_pb2 as pb_health, grpc_health.v1.health_pb2_grpc as services_health + +pb, services = import_generated_stubs("v1") + +if t.TYPE_CHECKING: + from bentoml.grpc.v1.service_pb2 import ServiceMetadataResponse + +logger = logging.getLogger(__name__) + +class ClientCredentials(t.TypedDict): + root_certificates: NotRequired[t.Union[bytes, str]] + private_key: NotRequired[t.Union[bytes, str]] + certificate_chain: NotRequired[t.Union[bytes, str]] + +@overload +def dispatch_channel(server_url: str, typ: t.Literal["async"], ssl: bool = ..., ssl_client_credentials: ClientCredentials | None = ..., options: t.Any | None = ..., compression: grpc.Compression | None = ..., interceptors: t.Sequence[aio.ClientInterceptor] | None = ...) -> aio.Channel: ... +@overload +def dispatch_channel(server_url: str, typ: t.Literal["sync"], ssl: bool = ..., ssl_client_credentials: ClientCredentials | None = ..., options: t.Any | None = ..., compression: grpc.Compression | None = ..., interceptors: t.Sequence[aio.ClientInterceptor] | None = None) -> grpc.Channel: ... 
+def dispatch_channel(server_url: str, typ: t.Literal["async", "sync"] = "sync", ssl: bool = False, ssl_client_credentials: ClientCredentials | None = None, options: t.Any | None = None, compression: grpc.Compression | None = None, interceptors: t.Sequence[aio.ClientInterceptor] | None = None) -> aio.Channel | grpc.Channel: + credentials = None + if ssl: + if ssl_client_credentials is None: raise RuntimeError("'ssl=True' requires 'ssl_client_credentials'") + credentials = grpc.ssl_channel_credentials(**{k: load_from_file(v) if isinstance(v, str) else v for k, v in ssl_client_credentials.items()}) + + if typ == "async" and ssl: return aio.secure_channel(server_url, credentials=credentials, options=options, compression=compression, interceptors=interceptors) + elif typ == "async": return aio.insecure_channel(server_url, options=options, compression=compression, interceptors=interceptors) + elif typ == "sync" and ssl: return grpc.secure_channel(server_url, credentials=credentials, options=options, compression=compression) + elif typ == "sync": return grpc.insecure_channel(server_url, options=options, compression=compression) + else: raise ValueError(f"Unknown type: {typ}") + +class GrpcClient(Client): + ssl: bool + ssl_client_credentials: t.Optional[ClientCredentials] + options: t.Any + compression: t.Optional[grpc.Compression] + + def __init__( + self, + server_url: str, + svc: bentoml.Service, + # gRPC specific options + ssl: bool = False, + options: t.Any | None = None, + compression: grpc.Compression | None = None, + ssl_client_credentials: ClientCredentials | None = None, + **kwargs: t.Any) -> None: + self.ssl, self.ssl_client_credentials, self.options, self.compression = ssl, ssl_client_credentials, options, compression + super().__init__(server_url, svc, **kwargs) + @functools.cached_property + def inner(self) -> grpc.Channel: + if self.ssl: + if self.ssl_client_credentials is None: raise RuntimeError("'ssl=True' requires 'ssl_client_credentials'") + credentials = grpc.ssl_channel_credentials(**{k: load_from_file(v) if isinstance(v, str) else v for k, v in self.ssl_client_credentials.items()}) + return grpc.secure_channel(self.server_url, credentials=credentials, options=self.options, compression=self.compression) + return grpc.insecure_channel(self.server_url, options=self.options, compression=self.compression) + + @staticmethod + def wait_until_server_ready(host: str, port: int, timeout: float = 30, check_interval: int = 1, **kwargs: t.Any) -> None: + with dispatch_channel(f"{host.replace(r'localhost', '0.0.0.0')}:{port}", typ="sync", options=kwargs.get("options", None), compression=kwargs.get("compression", None), ssl=kwargs.get("ssl", False), ssl_client_credentials=kwargs.get("ssl_client_credentials", None)) as channel: + req = pb_health.HealthCheckRequest() + req.service = "bentoml.grpc.v1.BentoService" + health_stub = services_health.HealthStub(channel) + start_time = time.time() + while time.time() - start_time < timeout: + try: + resp = health_stub.Check(req) + if resp.status == pb_health.HealthCheckResponse.SERVING: break + else: time.sleep(check_interval) + except grpc.RpcError: + logger.debug("Waiting for server to be ready...") + time.sleep(check_interval) + try: + resp = health_stub.Check(req) + if resp.status != pb_health.HealthCheckResponse.SERVING: raise TimeoutError(f"Timed out waiting {timeout} seconds for server at '{host}:{port}' to be ready.") + except grpc.RpcError as err: + logger.error("Caught RpcError while connecting to %s:%s:\n", host, port) + 
logger.error(err) + raise + @classmethod + def from_url(cls, url: str, **kwargs: t.Any) -> GrpcClient: + with dispatch_channel(url.replace(r"localhost", "0.0.0.0"), typ="sync", options=kwargs.get("options", None), compression=kwargs.get("compression", None), ssl=kwargs.get("ssl", False), ssl_client_credentials=kwargs.get("ssl_client_credentials", None)) as channel: metadata = t.cast("ServiceMetadataResponse", channel.unary_unary("/bentoml.grpc.v1.BentoService/ServiceMetadata", request_serializer=pb.ServiceMetadataRequest.SerializeToString, response_deserializer=pb.ServiceMetadataResponse.FromString)(pb.ServiceMetadataRequest())) + reflection = bentoml.Service(metadata.name) + for api in metadata.apis: + try: reflection.apis[api.name] = InferenceAPI[t.Any](None, + bentoml.io.from_spec({"id": api.input.descriptor_id, "args": json_format.MessageToDict(api.input.attributes).get("args", None)}), + bentoml.io.from_spec({"id": api.output.descriptor_id, "args": json_format.MessageToDict(api.output.attributes).get("args", None)}), + name=api.name, doc=api.docs) + except Exception as e: logger.error("Failed to instantiate client for API %s: ", api.name, e) + return cls(url, reflection, **kwargs) + def health(self) -> t.Any: return services_health.HealthStub(self.inner).Check(pb_health.HealthCheckRequest(service="")) + def _call(self, data: t.Any, /, *, _inference_api: InferenceAPI[t.Any], **kwargs: t.Any) -> t.Any: + channel_kwargs = {k: kwargs.pop(f"_grpc_channel_{k}", None) for k in {"timeout", "metadata", "credentials", "wait_for_ready", "compression"}} + if _inference_api.multi_input: + if data is not None: raise ValueError(f"'{_inference_api.name}' takes multiple inputs, and thus required to pass as keyword arguments.") + fake_resp = ensure_exec_coro(_inference_api.input.to_proto(kwargs)) + else: fake_resp = ensure_exec_coro(_inference_api.input.to_proto(data)) + api_fn = {v: k for k, v in self.svc.apis.items()} + stubs = services.BentoServiceStub(self.inner) + proto = stubs.Call(pb.Request(**{"api_name": api_fn[_inference_api], _inference_api.input.proto_fields[0]: fake_resp}), **channel_kwargs) + return ensure_exec_coro(_inference_api.output.from_proto(getattr(proto, proto.WhichOneof("content")))) + +class AsyncGrpcClient(AsyncClient): + ssl: bool + ssl_client_credentials: t.Optional[ClientCredentials] + options: aio.ChannelArgumentType + interceptors: t.Optional[t.Sequence[aio.ClientInterceptor]] + compression: t.Optional[grpc.Compression] + + def __init__( + self, + server_url: str, + svc: bentoml.Service, + # gRPC specific options + ssl: bool = False, + options: aio.ChannelArgumentType | None = None, + interceptors: t.Sequence[aio.ClientInterceptor] | None = None, + compression: grpc.Compression | None = None, + ssl_client_credentials: ClientCredentials | None = None, + **kwargs: t.Any) -> None: + self.ssl, self.ssl_client_credentials, self.options, self.interceptors, self.compression = ssl, ssl_client_credentials, options, interceptors, compression + super().__init__(server_url, svc, **kwargs) + @functools.cached_property + def inner(self) -> aio.Channel: + if self.ssl: + if self.ssl_client_credentials is None: raise RuntimeError("'ssl=True' requires 'ssl_client_credentials'") + credentials = grpc.ssl_channel_credentials(**{k: load_from_file(v) if isinstance(v, str) else v for k, v in self.ssl_client_credentials.items()}) + return aio.secure_channel(self.server_url, credentials=credentials, options=self.options, compression=self.compression, interceptors=self.interceptors) + return 
aio.insecure_channel(self.server_url, options=self.options, compression=self.compression, interceptors=self.interceptors) + + @staticmethod + async def wait_until_server_ready(host: str, port: int, timeout: float = 30, check_interval: int = 1, **kwargs: t.Any) -> None: + async with dispatch_channel(f"{host.replace(r'localhost', '0.0.0.0')}:{port}", typ="async", options=kwargs.get("options", None), compression=kwargs.get("compression", None), ssl=kwargs.get("ssl", False), ssl_client_credentials=kwargs.get("ssl_client_credentials", None)) as channel: + req = pb_health.HealthCheckRequest() + req.service = "bentoml.grpc.v1.BentoService" + health_stub = services_health.HealthStub(channel) + start_time = time.time() + while time.time() - start_time < timeout: + try: + resp = health_stub.Check(req) + if resp.status == pb_health.HealthCheckResponse.SERVING: break + else: time.sleep(check_interval) + except grpc.RpcError: + logger.debug("Waiting for server to be ready...") + time.sleep(check_interval) + try: + resp = health_stub.Check(req) + if resp.status != pb_health.HealthCheckResponse.SERVING: raise TimeoutError(f"Timed out waiting {timeout} seconds for server at '{host}:{port}' to be ready.") + except grpc.RpcError as err: + logger.error("Caught RpcError while connecting to %s:%s:\n", host, port) + logger.error(err) + raise + @classmethod + async def from_url(cls, url: str, **kwargs: t.Any) -> AsyncGrpcClient: + async with dispatch_channel(url.replace(r"localhost", "0.0.0.0"), typ="async", options=kwargs.get("options", None), compression=kwargs.get("compression", None), ssl=kwargs.get("ssl", False), ssl_client_credentials=kwargs.get("ssl_client_credentials", None), interceptors=kwargs.get("interceptors", None)) as channel: metadata = t.cast("ServiceMetadataResponse", channel.unary_unary("/bentoml.grpc.v1.BentoService/ServiceMetadata", request_serializer=pb.ServiceMetadataRequest.SerializeToString, response_deserializer=pb.ServiceMetadataResponse.FromString)(pb.ServiceMetadataRequest())) + reflection = bentoml.Service(metadata.name) + for api in metadata.apis: + try: reflection.apis[api.name] = InferenceAPI[t.Any](None, + bentoml.io.from_spec({"id": api.input.descriptor_id, "args": json_format.MessageToDict(api.input.attributes).get("args", None)}), + bentoml.io.from_spec({"id": api.output.descriptor_id, "args": json_format.MessageToDict(api.output.attributes).get("args", None)}), + name=api.name, doc=api.docs) + except Exception as e: logger.error("Failed to instantiate client for API %s: ", api.name, e) + return cls(url, reflection, **kwargs) + async def health(self) -> t.Any: return await services_health.HealthStub(self.inner).Check(pb_health.HealthCheckRequest(service="")) + async def _call(self, data: t.Any, /, *, _inference_api: InferenceAPI[t.Any], **kwargs: t.Any) -> t.Any: + channel_kwargs = {k: kwargs.pop(f"_grpc_channel_{k}", None) for k in {"timeout", "metadata", "credentials", "wait_for_ready", "compression"}} + state = self.inner.get_state(try_to_connect=True) + if state != grpc.ChannelConnectivity.READY: await self.inner.channel_ready() + if _inference_api.multi_input: + if data is not None: raise ValueError(f"'{_inference_api.name}' takes multiple inputs, and thus required to pass as keyword arguments.") + fake_resp = await _inference_api.input.to_proto(kwargs) + else: fake_resp = await _inference_api.input.to_proto(data) + api_fn = {v: k for k, v in self.svc.apis.items()} + async with self.inner: + stubs = services.BentoServiceStub(self.inner) + proto = await 
stubs.Call(pb.Request(**{"api_name": api_fn[_inference_api], _inference_api.input.proto_fields[0]: fake_resp}), **channel_kwargs) + return await _inference_api.output.from_proto(getattr(proto, proto.WhichOneof("content"))) diff --git a/openllm-client/src/openllm_client/benmin/_http.py b/openllm-client/src/openllm_client/benmin/_http.py new file mode 100644 index 00000000..447b2f1b --- /dev/null +++ b/openllm-client/src/openllm_client/benmin/_http.py @@ -0,0 +1,137 @@ +from __future__ import annotations +import functools, httpx, time, logging, urllib.error, typing as t, orjson, bentoml, starlette.requests, starlette.datastructures, starlette.responses, asyncio +from bentoml._internal.service.inference_api import InferenceAPI +from urllib.parse import urlparse +from openllm_client.benmin import Client, AsyncClient +from openllm_core.utils import ensure_exec_coro + +logger = logging.getLogger(__name__) + +class HttpClient(Client): + @functools.cached_property + def inner(self) -> httpx.Client: + if not urlparse(self.server_url).netloc: raise ValueError(f"Invalid server url: {self.server_url}") + return httpx.Client(base_url=self.server_url) + @staticmethod + def wait_until_server_ready(host: str, port: int, timeout: float = 30, check_interval: int = 1, **kwargs: t.Any) -> None: + host = host if "://" in host else "http://" + host + logger.debug("Waiting for server @ `%s:%d` to be ready...", host, port) + start = time.time() + while time.time() - start < timeout: + try: + status = httpx.get(f"{host}:{port}/readyz").status_code + if status == 200: break + else: time.sleep(check_interval) + except (httpx.ConnectError, urllib.error.URLError, ConnectionError): + logger.debug("Server is not ready yet, retrying in %d seconds...", check_interval) + time.sleep(check_interval) + # Try once more and raise for exception + try: httpx.get(f"{host}:{port}/readyz").raise_for_status() + except httpx.HTTPStatusError as err: + logger.error("Failed to wait until server ready: %s:%d", host, port) + logger.error(err) + raise + def health(self) -> httpx.Response: return self.inner.get("/readyz") + @classmethod + def from_url(cls, url: str, **kwargs: t.Any) -> HttpClient: + url = url if "://" in url else "http://" + url + resp = httpx.get(f"{url}/docs.json") + if resp.status_code != 200: raise ValueError(f"Failed to get OpenAPI schema from the server: {resp.status_code} {resp.reason_phrase}:\n{resp.content.decode()}") + _spec = orjson.loads(resp.content) + + reflection = bentoml.Service(_spec["info"]["title"]) + + for route, spec in _spec["paths"].items(): + for meth_spec in spec.values(): + if "tags" in meth_spec and "Service APIs" in meth_spec["tags"]: + if "x-bentoml-io-descriptor" not in meth_spec["requestBody"]: raise ValueError(f"Malformed BentoML spec received from BentoML server {url}") + if "x-bentoml-io-descriptor" not in meth_spec["responses"]["200"]: raise ValueError(f"Malformed BentoML spec received from BentoML server {url}") + if "x-bentoml-name" not in meth_spec: raise ValueError(f"Malformed BentoML spec received from BentoML server {url}") + try: reflection.apis[meth_spec["x-bentoml-name"]] = InferenceAPI[t.Any](None, bentoml.io.from_spec(meth_spec["requestBody"]["x-bentoml-io-descriptor"]), bentoml.io.from_spec(meth_spec["responses"]["200"]["x-bentoml-io-descriptor"]), name=meth_spec["x-bentoml-name"], doc=meth_spec["description"], route=route.lstrip("/")) + except Exception as e: logger.error("Failed to instantiate client for API %s: ", meth_spec["x-bentoml-name"], e) + return cls(url, reflection) 
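+  # Illustrative sketch only: `from_url` reflects the server's OpenAPI spec into a `bentoml.Service`,
+  # after which endpoints are invoked by name, e.g.
+  #
+  #   client = HttpClient.from_url("http://localhost:3000")
+  #   client.health()
+  #   client.call("generate", {"prompt": "What is OpenLLM?"})
+  #
+  # The address and the payload shape above are assumptions; they depend on the service being reflected.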
+ + def _call(self, data: t.Any, /, *, _inference_api: InferenceAPI[t.Any], **kwargs: t.Any) -> t.Any: + # All gRPC kwargs should be popped out. + kwargs = {k: v for k, v in kwargs.items() if not k.startswith("_grpc_")} + if _inference_api.multi_input: + if data is not None: raise ValueError(f"'{_inference_api.name}' takes multiple inputs, and thus required to pass as keyword arguments.") + fake_resp = ensure_exec_coro(_inference_api.input.to_http_response(kwargs, None)) + else: fake_resp = ensure_exec_coro(_inference_api.input.to_http_response(data, None)) + + # XXX: hack around StreamingResponse, since now we only have Text, for metadata so it is fine to do this. + if isinstance(fake_resp, starlette.responses.StreamingResponse): body = None + else: body = fake_resp.body + + resp = self.inner.post("/" + _inference_api.route if not _inference_api.route.startswith("/") else _inference_api.route, data=body, headers={"content-type": fake_resp.headers["content-type"]}, timeout=self.timeout) + if resp.status_code != 200: raise ValueError(f"Error while making request: {resp.status_code}: {resp.content!s}") + fake_req = starlette.requests.Request(scope={"type": "http"}) + headers = starlette.datastructures.Headers(headers=resp.headers) + fake_req._body = resp.content + # Request.headers sets a _headers variable. We will need to set this value to our fake request object. + fake_req._headers = headers + return ensure_exec_coro(_inference_api.output.from_http_request(fake_req)) + +class AsyncHttpClient(AsyncClient): + @functools.cached_property + def inner(self) -> httpx.AsyncClient: + if not urlparse(self.server_url).netloc: raise ValueError(f"Invalid server url: {self.server_url}") + return httpx.AsyncClient(base_url=self.server_url) + @staticmethod + async def wait_until_server_ready(host: str, port: int, timeout: float = 30, check_interval: int = 1, **kwargs: t.Any) -> None: + host = host if "://" in host else "http://" + host + logger.debug("Waiting for server @ `%s:%d` to be ready...", host, port) + start = time.time() + while time.time() - start < timeout: + try: + async with httpx.AsyncClient(base_url=f"{host}:{port}") as sess: + resp = await sess.get("/readyz") + if resp.status_code == 200: break + else: await asyncio.sleep(check_interval) + except (httpx.ConnectError, urllib.error.URLError, ConnectionError): + logger.debug("Server is not ready yet, retrying in %d seconds...", check_interval) + await asyncio.sleep(check_interval) + # Try once more and raise for exception + async with httpx.AsyncClient(base_url=f"{host}:{port}") as sess: + resp = await sess.get("/readyz") + if resp.status_code != 200: raise TimeoutError(f"Timeout while waiting for server @ `{host}:{port}` to be ready: {resp.status_code}: {resp.content!s}") + async def health(self) -> httpx.Response: return await self.inner.get("/readyz") + @classmethod + async def from_url(cls, url: str, **kwargs: t.Any) -> AsyncHttpClient: + url = url if "://" in url else "http://" + url + async with httpx.AsyncClient(base_url=url) as session: + resp = await session.get("/docs.json") + if resp.status_code != 200: raise ValueError(f"Failed to get OpenAPI schema from the server: {resp.status_code} {resp.reason_phrase}:\n{(await resp.aread()).decode()}") + _spec = orjson.loads(await resp.aread()) + + reflection = bentoml.Service(_spec["info"]["title"]) + + for route, spec in _spec["paths"].items(): + for meth_spec in spec.values(): + if "tags" in meth_spec and "Service APIs" in meth_spec["tags"]: + if "x-bentoml-io-descriptor" not in 
meth_spec["requestBody"]: raise ValueError(f"Malformed BentoML spec received from BentoML server {url}") + if "x-bentoml-io-descriptor" not in meth_spec["responses"]["200"]: raise ValueError(f"Malformed BentoML spec received from BentoML server {url}") + if "x-bentoml-name" not in meth_spec: raise ValueError(f"Malformed BentoML spec received from BentoML server {url}") + try: reflection.apis[meth_spec["x-bentoml-name"]] = InferenceAPI[t.Any](None, bentoml.io.from_spec(meth_spec["requestBody"]["x-bentoml-io-descriptor"]), bentoml.io.from_spec(meth_spec["responses"]["200"]["x-bentoml-io-descriptor"]), name=meth_spec["x-bentoml-name"], doc=meth_spec["description"], route=route.lstrip("/")) + except ValueError as e: logger.error("Failed to instantiate client for API %s: ", meth_spec["x-bentoml-name"], e) + return cls(url, reflection) + async def _call(self, data: t.Any, /, *, _inference_api: InferenceAPI[t.Any], **kwargs: t.Any) -> t.Any: + # All gRPC kwargs should be popped out. + kwargs = {k: v for k, v in kwargs.items() if not k.startswith("_grpc_")} + if _inference_api.multi_input: + if data is not None: raise ValueError(f"'{_inference_api.name}' takes multiple inputs, and thus required to pass as keyword arguments.") + fake_resp = await _inference_api.input.to_http_response(kwargs, None) + else: fake_resp = await _inference_api.input.to_http_response(data, None) + + # XXX: hack around StreamingResponse, since now we only have Text, for metadata so it is fine to do this. + if isinstance(fake_resp, starlette.responses.StreamingResponse): body = None + else: body = t.cast(t.Any, fake_resp.body) + + resp = await self.inner.post("/" + _inference_api.route if not _inference_api.route.startswith("/") else _inference_api.route, data=body, headers={"content-type": fake_resp.headers["content-type"]}, timeout=self.timeout) + if resp.status_code != 200: raise ValueError(f"Error making request: {resp.status_code}: {(await resp.aread())!s}") + fake_req = starlette.requests.Request(scope={"type": "http"}) + headers = starlette.datastructures.Headers(headers=resp.headers) + fake_req._body = resp.content + # Request.headers sets a _headers variable. We will need to set this value to our fake request object. 
+ fake_req._headers = headers + return await _inference_api.output.from_http_request(fake_req) diff --git a/openllm-client/src/openllm_client/client.py b/openllm-client/src/openllm_client/client.py new file mode 100644 index 00000000..3b7e3400 --- /dev/null +++ b/openllm-client/src/openllm_client/client.py @@ -0,0 +1,33 @@ +from __future__ import annotations +import logging +from urllib.parse import urlparse +from ._base import BaseClient, BaseAsyncClient + +logger = logging.getLogger(__name__) + +def process_http_address(self: AsyncHTTPClient | HTTPClient, address: str) -> None: + address = address if "://" in address else "http://" + address + parsed = urlparse(address) + self._host, *_port = parsed.netloc.split(":") + if len(_port) == 0: self._port = "80" if parsed.scheme == "http" else "443" + else: self._port = next(iter(_port)) + +class HTTPClient(BaseClient): + def __init__(self, address: str, timeout: int = 30): + process_http_address(self, address) + super().__init__(address, timeout) + +class AsyncHTTPClient(BaseAsyncClient): + def __init__(self, address: str, timeout: int = 30): + process_http_address(self, address) + super().__init__(address, timeout) + +class GrpcClient(BaseClient): + def __init__(self, address: str, timeout: int = 30): + self._host, self._port = address.split(":") + super().__init__(address, timeout) + +class AsyncGrpcClient(BaseAsyncClient): + def __init__(self, address: str, timeout: int = 30): + self._host, self._port = address.split(":") + super().__init__(address, timeout) diff --git a/openllm-core/.git_archival.txt b/openllm-core/.git_archival.txt new file mode 120000 index 00000000..d7a42b25 --- /dev/null +++ b/openllm-core/.git_archival.txt @@ -0,0 +1 @@ +../.git_archival.txt \ No newline at end of file diff --git a/openllm-core/.python-version-default b/openllm-core/.python-version-default new file mode 120000 index 00000000..40488ba7 --- /dev/null +++ b/openllm-core/.python-version-default @@ -0,0 +1 @@ +../.python-version-default \ No newline at end of file diff --git a/openllm-core/LICENSE.md b/openllm-core/LICENSE.md new file mode 100644 index 00000000..7fd0f057 --- /dev/null +++ b/openllm-core/LICENSE.md @@ -0,0 +1,194 @@ +Apache License +============== + +_Version 2.0, January 2004_ +_<>_ + +### Terms and Conditions for use, reproduction, and distribution + +#### 1. Definitions + +“License” shall mean the terms and conditions for use, reproduction, and +distribution as defined by Sections 1 through 9 of this document. + +“Licensor” shall mean the copyright owner or entity authorized by the copyright +owner that is granting the License. + +“Legal Entity” shall mean the union of the acting entity and all other entities +that control, are controlled by, or are under common control with that entity. +For the purposes of this definition, “control” means **(i)** the power, direct or +indirect, to cause the direction or management of such entity, whether by +contract or otherwise, or **(ii)** ownership of fifty percent (50%) or more of the +outstanding shares, or **(iii)** beneficial ownership of such entity. + +“You” (or “Your”) shall mean an individual or Legal Entity exercising +permissions granted by this License. + +“Source” form shall mean the preferred form for making modifications, including +but not limited to software source code, documentation source, and configuration +files. 
+ +“Object” form shall mean any form resulting from mechanical transformation or +translation of a Source form, including but not limited to compiled object code, +generated documentation, and conversions to other media types. + +“Work” shall mean the work of authorship, whether in Source or Object form, made +available under the License, as indicated by a copyright notice that is included +in or attached to the work (an example is provided in the Appendix below). + +“Derivative Works” shall mean any work, whether in Source or Object form, that +is based on (or derived from) the Work and for which the editorial revisions, +annotations, elaborations, or other modifications represent, as a whole, an +original work of authorship. For the purposes of this License, Derivative Works +shall not include works that remain separable from, or merely link (or bind by +name) to the interfaces of, the Work and Derivative Works thereof. + +“Contribution” shall mean any work of authorship, including the original version +of the Work and any modifications or additions to that Work or Derivative Works +thereof, that is intentionally submitted to Licensor for inclusion in the Work +by the copyright owner or by an individual or Legal Entity authorized to submit +on behalf of the copyright owner. For the purposes of this definition, +“submitted” means any form of electronic, verbal, or written communication sent +to the Licensor or its representatives, including but not limited to +communication on electronic mailing lists, source code control systems, and +issue tracking systems that are managed by, or on behalf of, the Licensor for +the purpose of discussing and improving the Work, but excluding communication +that is conspicuously marked or otherwise designated in writing by the copyright +owner as “Not a Contribution.” + +“Contributor” shall mean Licensor and any individual or Legal Entity on behalf +of whom a Contribution has been received by Licensor and subsequently +incorporated within the Work. + +#### 2. Grant of Copyright License + +Subject to the terms and conditions of this License, each Contributor hereby +grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, +irrevocable copyright license to reproduce, prepare Derivative Works of, +publicly display, publicly perform, sublicense, and distribute the Work and such +Derivative Works in Source or Object form. + +#### 3. Grant of Patent License + +Subject to the terms and conditions of this License, each Contributor hereby +grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, +irrevocable (except as stated in this section) patent license to make, have +made, use, offer to sell, sell, import, and otherwise transfer the Work, where +such license applies only to those patent claims licensable by such Contributor +that are necessarily infringed by their Contribution(s) alone or by combination +of their Contribution(s) with the Work to which such Contribution(s) was +submitted. If You institute patent litigation against any entity (including a +cross-claim or counterclaim in a lawsuit) alleging that the Work or a +Contribution incorporated within the Work constitutes direct or contributory +patent infringement, then any patent licenses granted to You under this License +for that Work shall terminate as of the date such litigation is filed. + +#### 4. 
Redistribution + +You may reproduce and distribute copies of the Work or Derivative Works thereof +in any medium, with or without modifications, and in Source or Object form, +provided that You meet the following conditions: + +* **(a)** You must give any other recipients of the Work or Derivative Works a copy of +this License; and +* **(b)** You must cause any modified files to carry prominent notices stating that You +changed the files; and +* **(c)** You must retain, in the Source form of any Derivative Works that You distribute, +all copyright, patent, trademark, and attribution notices from the Source form +of the Work, excluding those notices that do not pertain to any part of the +Derivative Works; and +* **(d)** If the Work includes a “NOTICE” text file as part of its distribution, then any +Derivative Works that You distribute must include a readable copy of the +attribution notices contained within such NOTICE file, excluding those notices +that do not pertain to any part of the Derivative Works, in at least one of the +following places: within a NOTICE text file distributed as part of the +Derivative Works; within the Source form or documentation, if provided along +with the Derivative Works; or, within a display generated by the Derivative +Works, if and wherever such third-party notices normally appear. The contents of +the NOTICE file are for informational purposes only and do not modify the +License. You may add Your own attribution notices within Derivative Works that +You distribute, alongside or as an addendum to the NOTICE text from the Work, +provided that such additional attribution notices cannot be construed as +modifying the License. + +You may add Your own copyright statement to Your modifications and may provide +additional or different license terms and conditions for use, reproduction, or +distribution of Your modifications, or for any such Derivative Works as a whole, +provided Your use, reproduction, and distribution of the Work otherwise complies +with the conditions stated in this License. + +#### 5. Submission of Contributions + +Unless You explicitly state otherwise, any Contribution intentionally submitted +for inclusion in the Work by You to the Licensor shall be under the terms and +conditions of this License, without any additional terms or conditions. +Notwithstanding the above, nothing herein shall supersede or modify the terms of +any separate license agreement you may have executed with Licensor regarding +such Contributions. + +#### 6. Trademarks + +This License does not grant permission to use the trade names, trademarks, +service marks, or product names of the Licensor, except as required for +reasonable and customary use in describing the origin of the Work and +reproducing the content of the NOTICE file. + +#### 7. Disclaimer of Warranty + +Unless required by applicable law or agreed to in writing, Licensor provides the +Work (and each Contributor provides its Contributions) on an “AS IS” BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, +including, without limitation, any warranties or conditions of TITLE, +NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are +solely responsible for determining the appropriateness of using or +redistributing the Work and assume any risks associated with Your exercise of +permissions under this License. + +#### 8. 
Limitation of Liability + +In no event and under no legal theory, whether in tort (including negligence), +contract, or otherwise, unless required by applicable law (such as deliberate +and grossly negligent acts) or agreed to in writing, shall any Contributor be +liable to You for damages, including any direct, indirect, special, incidental, +or consequential damages of any character arising as a result of this License or +out of the use or inability to use the Work (including but not limited to +damages for loss of goodwill, work stoppage, computer failure or malfunction, or +any and all other commercial damages or losses), even if such Contributor has +been advised of the possibility of such damages. + +#### 9. Accepting Warranty or Additional Liability + +While redistributing the Work or Derivative Works thereof, You may choose to +offer, and charge a fee for, acceptance of support, warranty, indemnity, or +other liability obligations and/or rights consistent with this License. However, +in accepting such obligations, You may act only on Your own behalf and on Your +sole responsibility, not on behalf of any other Contributor, and only if You +agree to indemnify, defend, and hold each Contributor harmless for any liability +incurred by, or claims asserted against, such Contributor by reason of your +accepting any such warranty or additional liability. + +_END OF TERMS AND CONDITIONS_ + +### APPENDIX: How to apply the Apache License to your work + +To apply the Apache License to your work, attach the following boilerplate +notice, with the fields enclosed by brackets `[]` replaced with your own +identifying information. (Don't include the brackets!) The text should be +enclosed in the appropriate comment syntax for the file format. We also +recommend that a file or class name and description of purpose be included on +the same “printed page” as the copyright notice for easier identification within +third-party archives. + + Copyright 2023 Atalaya Tech Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/openllm-core/README.md b/openllm-core/README.md new file mode 100644 index 00000000..57e74b65 --- /dev/null +++ b/openllm-core/README.md @@ -0,0 +1,75 @@ +![Banner for OpenLLM](/.github/assets/main-banner.png) + + + +
+

🦑 OpenLLM Core

+ + pypi_status + + test_pypi_status + + Twitter + + Discord + + ci + + pre-commit.ci status +
+ + python_version + + Hatch + + code style + + Ruff + + types - mypy + + types - pyright +
+

OpenLLM Core: Core components for OpenLLM.

+ +
+ +## 📖 Introduction + +With OpenLLM, you can run inference with any open-source large language model, +deploy to the cloud or on-premises, and build powerful AI apps. + +To learn more about OpenLLM, please visit OpenLLM's README.md. + +This package holds the core components of OpenLLM and is considered internal. + +Components include: + +- Configuration generation. +- Utilities for interacting with the OpenLLM server. +- Schema and generation utilities for the OpenLLM server. + + + +![Gif showing OpenLLM Intro](/.github/assets/output.gif) + +
+ + + +## 📔 Citation + +If you use OpenLLM in your research, we provide a [citation](../CITATION.cff) to use: + +```bibtex +@software{Pham_OpenLLM_Operating_LLMs_2023, +author = {Pham, Aaron and Yang, Chaoyu and Sheng, Sean and Zhao, Shenyang and Lee, Sauyon and Jiang, Bo and Dong, Fog and Guan, Xipeng and Ming, Frost}, +license = {Apache-2.0}, +month = jun, +title = {{OpenLLM: Operating LLMs in production}}, +url = {https://github.com/bentoml/OpenLLM}, +year = {2023} +} +``` + + diff --git a/openllm-core/pyproject.toml b/openllm-core/pyproject.toml new file mode 100644 index 00000000..f519700a --- /dev/null +++ b/openllm-core/pyproject.toml @@ -0,0 +1,195 @@ +[build-system] +build-backend = "hatchling.build" +requires = [ + "hatchling==1.18.0", + "hatch-vcs==0.3.0", + "hatch-fancy-pypi-readme==23.1.0", +] + +[project] +authors = [ + { name = "Aaron Pham", email = "aarnphm@bentoml.com" }, + { name = "BentoML Team", email = "contact@bentoml.com" }, +] +dynamic = ['readme', 'version'] +classifiers = [ + "Development Status :: 5 - Production/Stable", + "Environment :: GPU :: NVIDIA CUDA", + "Environment :: GPU :: NVIDIA CUDA :: 12", + "Environment :: GPU :: NVIDIA CUDA :: 11.8", + "Environment :: GPU :: NVIDIA CUDA :: 11.7", + "License :: OSI Approved :: Apache Software License", + "Topic :: Scientific/Engineering", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "Topic :: Software Development :: Libraries", + "Operating System :: OS Independent", + "Intended Audience :: Developers", + "Intended Audience :: Science/Research", + "Intended Audience :: System Administrators", + "Typing :: Typed", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: Implementation :: PyPy", +] +description = "OpenLLM Core: Core components for OpenLLM." 
+keywords = [ + "MLOps", + "AI", + "BentoML", + "Model Serving", + "Model Deployment", + "LLMOps", + "Falcon", + "Vicuna", + "Llama 2", + "Fine tuning", + "Serverless", + "Large Language Model", + "Generative AI", + "StableLM", + "Alpaca", + "PyTorch", + "Transformers", +] +dependencies = [ + "bentoml>=1.1.2", + "attrs>=23.1.0", + "cattrs>=23.1.0", + "orjson", + "inflection", + "typing_extensions", + "mypy_extensions", +] +license = "Apache-2.0" +name = "openllm-core" +requires-python = ">=3.8" +[project.urls] +Blog = "https://modelserving.com" +Chat = "https://discord.gg/openllm" +Documentation = "https://github.com/bentoml/OpenLLM/blob/main/openllm-core/README.md" +GitHub = "https://github.com/bentoml/OpenLLM/blob/main/openllm-core" +History = "https://github.com/bentoml/OpenLLM/blob/main/CHANGELOG.md" +Homepage = "https://bentoml.com" +Tracker = "https://github.com/bentoml/OpenLLM/issues" +Twitter = "https://twitter.com/bentomlai" +[project.optional-dependencies] +full = ["openllm-core[vllm,fine-tune]"] +vllm = ["vllm", "ray"] +fine-tune = ["transformers[torch,tokenizers,accelerate]>=4.29.0", "peft>=0.4.0", "bitsandbytes", "datasets", "accelerate", "trl"] + +[tool.hatch.version] +fallback-version = "0.0.0" +source = "vcs" +[tool.hatch.build.hooks.vcs] +version-file = "src/openllm_core/_version.py" +[tool.hatch.version.raw-options] +git_describe_command = [ + "git", + "describe", + "--dirty", + "--tags", + "--long", + "--first-parent", +] +local_scheme = "no-local-version" +root = ".." +[tool.hatch.metadata] +allow-direct-references = true +[tool.hatch.build.targets.wheel] +only-include = ["src/openllm_core"] +sources = ["src"] +[tool.hatch.build.targets.sdist] +exclude = [ + "/.git_archival.txt", + "tests", + "/.python-version-default", + "ADDING_NEW_MODEL.md", +] +[tool.hatch.build.targets.wheel.hooks.mypyc] +dependencies = [ + "hatch-mypyc==0.16.0", + "mypy==1.5.1", + # avoid https://github.com/pallets/click/issues/2558 + "click==8.1.3", + "bentoml==1.1.2", + "transformers>=4.31.0", + "pandas-stubs", + "types-psutil", + "types-tabulate", + "types-PyYAML", + "types-protobuf", +] +enable-by-default = false +exclude = ["src/openllm_core/_typing_compat.py"] +include = [ + "src/openllm_core/utils/__init__.py", + "src/openllm_core/__init__.py", + "src/openllm_core/_prompt.py", + "src/openllm_core/_schema.py", + "src/openllm_core/_strategies.py", + "src/openllm_core/exceptions.py", +] +# NOTE: This is consistent with pyproject.toml +mypy-args = [ + "--strict", + # this is because all transient library doesn't have types + "--follow-imports=skip", + "--allow-subclassing-any", + "--check-untyped-defs", + "--ignore-missing-imports", + "--no-warn-return-any", + "--warn-unreachable", + "--no-warn-no-return", + "--no-warn-unused-ignores", + "--exclude='/src\\/openllm_core\\/_typing_compat\\.py$'", +] +options = { verbose = true, strip_asserts = true, debug_level = "2", opt_level = "3", include_runtime_files = true } +require-runtime-dependencies = true + +[tool.hatch.metadata.hooks.fancy-pypi-readme] +content-type = "text/markdown" +# PyPI doesn't support the tag. +[[tool.hatch.metadata.hooks.fancy-pypi-readme.fragments]] +text = """ +

+ + Banner for OpenLLM + +

+ +""" +[[tool.hatch.metadata.hooks.fancy-pypi-readme.fragments]] +end-before = "\n" +path = "README.md" +start-after = "\n" +[[tool.hatch.metadata.hooks.fancy-pypi-readme.fragments]] +text = """ + +

+ Gif showing OpenLLM Intro +

+""" +[[tool.hatch.metadata.hooks.fancy-pypi-readme.fragments]] +text = """ + +

+ Gif showing Agent integration +

+""" +[[tool.hatch.metadata.hooks.fancy-pypi-readme.fragments]] +end-before = "\n" +path = "README.md" +start-after = "\n" +[[tool.hatch.metadata.hooks.fancy-pypi-readme.fragments]] +text = """ + +--- + +[Click me for full changelog](https://github.com/bentoml/openllm/blob/main/CHANGELOG.md) +""" diff --git a/openllm-core/src/openllm_core/__init__.py b/openllm-core/src/openllm_core/__init__.py new file mode 100644 index 00000000..93fa5491 --- /dev/null +++ b/openllm-core/src/openllm_core/__init__.py @@ -0,0 +1,7 @@ +from __future__ import annotations +from . import utils as utils +from . import exceptions as exceptions +from ._configuration import LLMConfig as LLMConfig, GenerationConfig as GenerationConfig, SamplingParams as SamplingParams +from ._strategies import CascadingResourceStrategy as CascadingResourceStrategy, get_resource as get_resource, available_resource_spec as available_resource_spec, LiteralResourceSpec as LiteralResourceSpec, NvidiaGpuResource as NvidiaGpuResource, AmdGpuResource as AmdGpuResource +from ._schema import EmbeddingsOutput as EmbeddingsOutput, GenerationInput as GenerationInput, GenerationOutput as GenerationOutput, HfAgentInput as HfAgentInput, MetadataOutput as MetadataOutput, unmarshal_vllm_outputs as unmarshal_vllm_outputs +from .config import AutoConfig as AutoConfig, CONFIG_MAPPING as CONFIG_MAPPING, CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES, BaichuanConfig as BaichuanConfig, START_BAICHUAN_COMMAND_DOCSTRING as START_BAICHUAN_COMMAND_DOCSTRING, ChatGLMConfig as ChatGLMConfig, START_CHATGLM_COMMAND_DOCSTRING as START_CHATGLM_COMMAND_DOCSTRING, DollyV2Config as DollyV2Config, START_DOLLY_V2_COMMAND_DOCSTRING as START_DOLLY_V2_COMMAND_DOCSTRING, FalconConfig as FalconConfig, START_FALCON_COMMAND_DOCSTRING as START_FALCON_COMMAND_DOCSTRING, FlanT5Config as FlanT5Config, START_FLAN_T5_COMMAND_DOCSTRING as START_FLAN_T5_COMMAND_DOCSTRING, GPTNeoXConfig as GPTNeoXConfig, START_GPT_NEOX_COMMAND_DOCSTRING as START_GPT_NEOX_COMMAND_DOCSTRING, LlamaConfig as LlamaConfig, START_LLAMA_COMMAND_DOCSTRING as START_LLAMA_COMMAND_DOCSTRING, MPTConfig as MPTConfig, START_MPT_COMMAND_DOCSTRING as START_MPT_COMMAND_DOCSTRING, OPTConfig as OPTConfig, START_OPT_COMMAND_DOCSTRING as START_OPT_COMMAND_DOCSTRING, StableLMConfig as StableLMConfig, START_STABLELM_COMMAND_DOCSTRING as START_STABLELM_COMMAND_DOCSTRING, StarCoderConfig as StarCoderConfig, START_STARCODER_COMMAND_DOCSTRING as START_STARCODER_COMMAND_DOCSTRING diff --git a/openllm-python/src/openllm/_configuration.py b/openllm-core/src/openllm_core/_configuration.py similarity index 96% rename from openllm-python/src/openllm/_configuration.py rename to openllm-core/src/openllm_core/_configuration.py index 22cb65c9..3553a3fe 100644 --- a/openllm-python/src/openllm/_configuration.py +++ b/openllm-core/src/openllm_core/_configuration.py @@ -34,8 +34,7 @@ dynamically during serve, ahead-of-serve or per requests. Refer to ``openllm.LLMConfig`` docstring for more information. 
""" from __future__ import annotations -import copy, enum, logging, os, sys, types, typing as t -import attr, click_option_group as cog, inflection, orjson, openllm +import copy, enum, logging, os, sys, types, typing as t, attr, click_option_group as cog, inflection, orjson, openllm_core from cattr.gen import make_dict_structure_fn, make_dict_unstructure_fn, override from deepmerge.merger import Merger from ._strategies import LiteralResourceSpec, available_resource_spec, resource_spec @@ -51,9 +50,10 @@ from .utils import ( field_env_key, first_not_none, lenient_issubclass, + LazyLoader ) from .utils.import_utils import BACKENDS_MAPPING -# NOTE: Using internal API from attr here, since we are actually allowing subclass of openllm.LLMConfig to become 'attrs'-ish +# NOTE: Using internal API from attr here, since we are actually allowing subclass of openllm_core.LLMConfig to become 'attrs'-ish from attr._compat import set_closure_cell from attr._make import _CountingAttr, _make_init, _transform_attrs from ._typing_compat import AnyCallable, At, Self, ListStr, DictStrAny @@ -63,11 +63,11 @@ if t.TYPE_CHECKING: from transformers.generation.beam_constraints import Constraint else: Constraint = t.Any - vllm = openllm.utils.LazyLoader("vllm", globals(), "vllm") - transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers") - peft = openllm.utils.LazyLoader("peft", globals(), "peft") + vllm = LazyLoader("vllm", globals(), "vllm") + transformers = LazyLoader("transformers", globals(), "transformers") + peft = LazyLoader("peft", globals(), "peft") -__all__ = ["LLMConfig", "GenerationConfig", "SamplingParams"] +__all__ = ["LLMConfig", "GenerationConfig", "SamplingParams", "field_env_key"] logger = logging.getLogger(__name__) config_merger = Merger([(dict, "merge")], ["override"], ["override"]) @@ -225,15 +225,12 @@ class GenerationConfig(ReprMixin): if t.TYPE_CHECKING and not MYPY: # stubs this for pyright as mypy already has a attr plugin builtin def __attrs_init__(self, *args: t.Any, **attrs: t.Any) -> None: ... 
- def __init__(self, *, _internal: bool = False, **attrs: t.Any): if not _internal: raise RuntimeError("GenerationConfig is not meant to be used directly, but you can access this via a LLMConfig.generation_config") self.__attrs_init__(**attrs) - def __getitem__(self, item: str) -> t.Any: if hasattr(self, item): return getattr(self, item) raise KeyError(f"'{self.__class__.__name__}' has no attribute {item}.") - @property def __repr_keys__(self) -> set[str]: return {i.name for i in attr.fields(self.__class__)} @@ -334,7 +331,7 @@ class ModelSettings(t.TypedDict, total=False): name_type: NotRequired[t.Optional[t.Literal["dasherize", "lowercase"]]] model_name: NotRequired[str] start_name: NotRequired[str] - env: NotRequired[openllm.utils.EnvVarMixin] + env: NotRequired[openllm_core.utils.EnvVarMixin] # serving configuration timeout: int workers_per_resource: t.Union[int, float] @@ -376,7 +373,7 @@ class _ModelSettingsAttr: name_type: t.Optional[t.Literal["dasherize", "lowercase"]] model_name: str start_name: str - env: openllm.utils.EnvVarMixin + env: openllm_core.utils.EnvVarMixin timeout: int workers_per_resource: t.Union[int, float] fine_tune_strategies: t.Dict[AdapterType, FineTuneConfig] @@ -414,7 +411,7 @@ def structure_settings(cl_: type[LLMConfig], cls: type[_ModelSettingsAttr]) -> _ if not BACKENDS_MAPPING[library_stub][0](): default_implementation[rs] = "pt" _final_value_dct["default_implementation"] = default_implementation - env = openllm.utils.EnvVarMixin(model_name, get_default_implementation(default_implementation), model_id=_settings_attr.default_id, bettertransformer=_settings_attr.bettertransformer) + env = openllm_core.utils.EnvVarMixin(model_name, get_default_implementation(default_implementation), model_id=_settings_attr.default_id, bettertransformer=_settings_attr.bettertransformer) _final_value_dct["env"] = env # bettertransformer support @@ -459,7 +456,8 @@ _reserved_namespace = {"__config__", "GenerationConfig", "SamplingParams"} @attr.define(slots=True) class _ConfigAttr: - Field = dantic.Field + @staticmethod + def Field(default: t.Any = None, **attrs: t.Any) -> t.Any: return dantic.Field(default, **attrs) """Field is a alias to the internal dantic utilities to easily create attrs.fields with pydantic-compatible interface. For example: @@ -513,11 +511,11 @@ class _ConfigAttr: """The accepted keys for this LLMConfig.""" __openllm_extras__: DictStrAny = Field(None, init=False) """Extra metadata for this LLMConfig.""" - __openllm_generation_class__: type[openllm._configuration.GenerationConfig] = Field(None) + __openllm_generation_class__: type[openllm_core._configuration.GenerationConfig] = Field(None) """The result generated GenerationConfig class for this LLMConfig. This will be used to create the generation_config argument that can be used throughout the lifecycle. This class will also be managed internally by OpenLLM.""" - __openllm_sampling_class__: type[openllm._configuration.SamplingParams] = Field(None) + __openllm_sampling_class__: type[openllm_core._configuration.SamplingParams] = Field(None) """The result generated SamplingParams class for this LLMConfig. This will be used to create arguments for vLLM LLMEngine that can be used throughout the lifecycle. 
This class will also be managed internally by OpenLLM.""" @@ -587,7 +585,7 @@ class _ConfigAttr: """The normalized version of __openllm_start_name__, determined by __openllm_name_type__""" __openllm_start_name__: str = Field(None) """Default name to be used with `openllm start`""" - __openllm_env__: openllm.utils.EnvVarMixin = Field(None) + __openllm_env__: openllm_core.utils.EnvVarMixin = Field(None) """A EnvVarMixin instance for this LLMConfig.""" __openllm_timeout__: int = Field(None) """The default timeout to be set for this given LLM.""" @@ -697,7 +695,7 @@ class _ConfigBuilder: if not closure_cells: continue # Catch None or the empty list. for cell in closure_cells: try: match = cell.cell_contents is self._cls - except ValueError: pass # noqa: PERF203 # ValueError: Cell is empty + except ValueError: pass # ValueError: Cell is empty else: if match: set_closure_cell(cell, cls) return cls @@ -705,7 +703,6 @@ class _ConfigBuilder: def add_attrs_init(self) -> Self: self._cls_dict["__attrs_init__"] = codegen.add_method_dunders(self._cls, _make_init(self._cls, self._attrs, self._has_pre_init, self._has_post_init, False, True, False, self._base_attr_map, False, None, True)) return self - def add_repr(self) -> Self: for key, fn in ReprMixin.__dict__.items(): if key in ("__repr__", "__str__", "__repr_name__", "__repr_str__", "__repr_args__"): self._cls_dict[key] = codegen.add_method_dunders(self._cls, fn) @@ -849,7 +846,7 @@ class LLMConfig(_ConfigAttr): unannotated = ca_names - annotated_names if len(unannotated) > 0: missing_annotated = sorted(unannotated, key=lambda n: t.cast("_CountingAttr", cd.get(n)).counter) - raise openllm.exceptions.MissingAnnotationAttributeError(f"The following field doesn't have a type annotation: {missing_annotated}") + raise openllm_core.exceptions.MissingAnnotationAttributeError(f"The following field doesn't have a type annotation: {missing_annotated}") # We need to set the accepted key before generation_config # as generation_config is a special field that users shouldn't pass. cls.__openllm_accepted_keys__ = set(these.keys()) | {a.name for a in attr.fields(cls.__openllm_generation_class__)} | {a.name for a in attr.fields(cls.__openllm_sampling_class__)} @@ -925,7 +922,7 @@ class LLMConfig(_ConfigAttr): @overload def __getitem__(self, item: t.Literal["start_name"]) -> str: ... @overload - def __getitem__(self, item: t.Literal["env"]) -> openllm.utils.EnvVarMixin: ... + def __getitem__(self, item: t.Literal["env"]) -> openllm_core.utils.EnvVarMixin: ... @overload def __getitem__(self, item: t.Literal["timeout"]) -> int: ... @overload @@ -936,9 +933,9 @@ class LLMConfig(_ConfigAttr): def __getitem__(self, item: t.Literal["tokenizer_class"]) -> t.Optional[str]: ... # NOTE: generation_class, sampling_class and extras arguments @overload - def __getitem__(self, item: t.Literal["generation_class"]) -> t.Type[openllm.GenerationConfig]: ... + def __getitem__(self, item: t.Literal["generation_class"]) -> t.Type[openllm_core.GenerationConfig]: ... @overload - def __getitem__(self, item: t.Literal["sampling_class"]) -> t.Type[openllm.SamplingParams]: ... + def __getitem__(self, item: t.Literal["sampling_class"]) -> t.Type[openllm_core.SamplingParams]: ... @overload def __getitem__(self, item: t.Literal["extras"]) -> t.Dict[str, t.Any]: ... 
# NOTE: GenerationConfig arguments @@ -1120,7 +1117,6 @@ class LLMConfig(_ConfigAttr): try: new_cls.__module__ = sys._getframe(1).f_globals.get("__name__", "__main__") except (AttributeError, ValueError): pass return new_cls(**attrs) - def model_dump(self, flatten: bool = False, **_: t.Any) -> DictStrAny: dumped = bentoml_cattr.unstructure(self) generation_config = bentoml_cattr.unstructure(self.generation_config) @@ -1129,15 +1125,13 @@ class LLMConfig(_ConfigAttr): else: dumped["generation_config"] = generation_config dumped.update(sampling_config) return dumped - def model_dump_json(self, **kwargs: t.Any) -> bytes: return orjson.dumps(self.model_dump(**kwargs)) @classmethod def model_construct_json(cls, json_str: str | bytes) -> Self: try: attrs = orjson.loads(json_str) - except orjson.JSONDecodeError as err: raise openllm.exceptions.ValidationError(f"Failed to load JSON: {err}") from None + except orjson.JSONDecodeError as err: raise openllm_core.exceptions.ValidationError(f"Failed to load JSON: {err}") from None return bentoml_cattr.structure(attrs, cls) - @classmethod def model_construct_env(cls, **attrs: t.Any) -> Self: """A helpers that respect configuration values environment variables.""" @@ -1186,7 +1180,6 @@ class LLMConfig(_ConfigAttr): config = transformers.GenerationConfig(**bentoml_cattr.unstructure(self.generation_config)) return config.to_dict() if return_as_dict else config def to_sampling_config(self) -> vllm.SamplingParams: return self.sampling_config.to_vllm() - @classmethod def to_click_options(cls, f: AnyCallable) -> click.Command: """Convert current configuration to click options. @@ -1228,6 +1221,32 @@ class LLMConfig(_ConfigAttr): @classmethod def default_implementation(cls) -> LiteralRuntime: return first_not_none(cls.__openllm_env__["framework_value"], default=get_default_implementation(cls.__openllm_default_implementation__)) + def sanitize_parameters(self, prompt: str, **attrs: t.Any) -> tuple[str, DictStrAny, DictStrAny]: + """This handler will sanitize all attrs and setup prompt text. + + It takes a prompt that is given by the user, attrs that can be parsed with the prompt. + + Returns a tuple of three items: + - The attributes dictionary that can be passed into LLMConfig to generate a GenerationConfig + - The attributes dictionary that will be passed into `self.postprocess_generate`. + + `openllm.LLM` also has a sanitize_parameters that will just call this method. + """ + return prompt, attrs, attrs + + def postprocess_generate(self, prompt: str, generation_result: t.Any, **attrs: t.Any) -> t.Any: + """This handler will postprocess generation results from LLM.generate and then output nicely formatted results (if the LLM decide to do so.). + + You can customize how the output of the LLM looks with this hook. By default, it is a simple echo. + + > [!NOTE] + > This will be used from the client side. + + `openllm.LLM` also has a postprocess_generate that will just call this method. 
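+
+    A minimal sketch of how `sanitize_parameters` and `postprocess_generate` fit together on the client
+    side (the model name, the generation kwarg and the raw result below are illustrative assumptions):
+
+    ```python
+    config = openllm_core.AutoConfig.for_model("opt")
+    prompt, generate_kwargs, postprocess_kwargs = config.sanitize_parameters("What is OpenLLM?", max_new_tokens=128)
+    # ... send `prompt` and `generate_kwargs` to a server and collect its responses ...
+    result = config.postprocess_generate(prompt, ["<raw model responses>"], **postprocess_kwargs)
+    ```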
+ """ + return generation_result + + bentoml_cattr.register_unstructure_hook_factory(lambda cls: lenient_issubclass(cls, LLMConfig), lambda cls: make_dict_unstructure_fn(cls, bentoml_cattr, _cattrs_omit_if_default=False, _cattrs_use_linecache=True)) def structure_llm_config(data: t.Any, cls: type[LLMConfig]) -> LLMConfig: @@ -1253,7 +1272,4 @@ def structure_llm_config(data: t.Any, cls: type[LLMConfig]) -> LLMConfig: return cls(generation_config=generation_config, __openllm_extras__=data, **cls_attrs) bentoml_cattr.register_structure_hook_func(lambda cls: lenient_issubclass(cls, LLMConfig), structure_llm_config) - openllm_home = os.path.expanduser(os.environ.get("OPENLLM_HOME", os.path.join(os.environ.get("XDG_CACHE_HOME", os.path.join(os.path.expanduser("~"), ".cache")), "openllm"))) - -__all__ = ["LLMConfig", "field_env_key"] diff --git a/openllm-python/src/openllm/_prompt.py b/openllm-core/src/openllm_core/_prompt.py similarity index 100% rename from openllm-python/src/openllm/_prompt.py rename to openllm-core/src/openllm_core/_prompt.py diff --git a/openllm-python/src/openllm/_schema.py b/openllm-core/src/openllm_core/_schema.py similarity index 78% rename from openllm-python/src/openllm/_schema.py rename to openllm-core/src/openllm_core/_schema.py index e5ddad2a..d8accd08 100644 --- a/openllm-python/src/openllm/_schema.py +++ b/openllm-core/src/openllm_core/_schema.py @@ -1,8 +1,8 @@ """Schema definition for OpenLLM. This can be use for client interaction.""" from __future__ import annotations import functools, typing as t -import attr, inflection, openllm -from ._configuration import GenerationConfig, LLMConfig +import attr, inflection +from openllm_core._configuration import GenerationConfig, LLMConfig from .utils import bentoml_cattr if t.TYPE_CHECKING: import vllm @@ -19,9 +19,11 @@ class GenerationInput: if cls is None: raise ValueError("'cls' must pass if given data is a dictionary.") return cls(**data) @classmethod - def for_model(cls, model_name: str, **attrs: t.Any) -> type[GenerationInput]: return cls.from_llm_config(openllm.AutoConfig.for_model(model_name, **attrs)) + def for_model(cls, model_name: str, **attrs: t.Any) -> type[GenerationInput]: + import openllm + return cls.from_llm_config(openllm.AutoConfig.for_model(model_name, **attrs)) @classmethod - def from_llm_config(cls, llm_config: openllm.LLMConfig) -> type[GenerationInput]: return attr.make_class(inflection.camelize(llm_config["model_name"]) + "GenerationInput", attrs={"prompt": attr.field(type=str), "llm_config": attr.field(type=llm_config.__class__, default=llm_config, converter=functools.partial(cls.convert_llm_config, cls=llm_config.__class__)), "adapter_name": attr.field(default=None, type=str)}) + def from_llm_config(cls, llm_config: LLMConfig) -> type[GenerationInput]: return attr.make_class(inflection.camelize(llm_config["model_name"]) + "GenerationInput", attrs={"prompt": attr.field(type=str), "llm_config": attr.field(type=llm_config.__class__, default=llm_config, converter=functools.partial(cls.convert_llm_config, cls=llm_config.__class__)), "adapter_name": attr.field(default=None, type=str)}) @attr.frozen(slots=True) class GenerationOutput: responses: t.List[t.Any] diff --git a/openllm-python/src/openllm/_strategies.py b/openllm-core/src/openllm_core/_strategies.py similarity index 96% rename from openllm-python/src/openllm/_strategies.py rename to openllm-core/src/openllm_core/_strategies.py index 63255fa0..df2a44f1 100644 --- a/openllm-python/src/openllm/_strategies.py +++ 
b/openllm-core/src/openllm_core/_strategies.py @@ -1,10 +1,10 @@ +# mypy: disable-error-code="no-redef" from __future__ import annotations import functools, inspect, logging, math, os, sys, types, typing as t, warnings, psutil, bentoml from bentoml._internal.resource import get_resource, system_resources from bentoml._internal.runner.strategy import THREAD_ENVS from .utils import DEBUG, ReprMixin -if sys.version_info[:2] >= (3, 11): from typing import overload -else: from typing_extensions import overload +from ._typing_compat import overload class DynResource(t.Protocol): resource_id: t.ClassVar[str] @@ -176,10 +176,7 @@ def _validate(cls: type[DynResource], val: list[t.Any]) -> None: except (ImportError, RuntimeError): pass -def _make_resource_class(name: str, resource_kind: str, docstring: str) -> type[DynResource]: - return types.new_class( - name, (bentoml.Resource[t.List[str]], ReprMixin), {"resource_id": resource_kind}, lambda ns: ns.update({"resource_id": resource_kind, "from_spec": classmethod(_from_spec), "from_system": classmethod(_from_system), "validate": classmethod(_validate), "__repr_keys__": property(lambda _: {"resource_id"}), "__doc__": inspect.cleandoc(docstring), "__module__": "openllm._strategies"}), - ) +def _make_resource_class(name: str, resource_kind: str, docstring: str) -> type[DynResource]: return types.new_class(name, (bentoml.Resource[t.List[str]], ReprMixin), {"resource_id": resource_kind}, lambda ns: ns.update({"resource_id": resource_kind, "from_spec": classmethod(_from_spec), "from_system": classmethod(_from_system), "validate": classmethod(_validate), "__repr_keys__": property(lambda _: {"resource_id"}), "__doc__": inspect.cleandoc(docstring), "__module__": "openllm._strategies"})) # NOTE: we need to hint these t.Literal since mypy is to dumb to infer this as literal :facepalm: _TPU_RESOURCE: t.Literal["cloud-tpus.google.com/v2"] = "cloud-tpus.google.com/v2" diff --git a/openllm-python/src/openllm/_typing_compat.py b/openllm-core/src/openllm_core/_typing_compat.py similarity index 94% rename from openllm-python/src/openllm/_typing_compat.py rename to openllm-core/src/openllm_core/_typing_compat.py index 5e1c731b..4d0a7d18 100644 --- a/openllm-python/src/openllm/_typing_compat.py +++ b/openllm-core/src/openllm_core/_typing_compat.py @@ -1,13 +1,14 @@ +# mypy: disable-error-code="type-arg,valid-type" from __future__ import annotations import sys, typing as t, bentoml, attr, abc from bentoml._internal.types import ModelSignatureDict as ModelSignatureDict + if t.TYPE_CHECKING: import openllm, peft, transformers, auto_gptq as autogptq, vllm from bentoml._internal.runner.runnable import RunnableMethod from bentoml._internal.runner.runner import RunnerMethod from bentoml._internal.runner.strategy import Strategy - from .bundle.oci import LiteralContainerVersionStrategy from .utils.lazy import VersionInfo M = t.TypeVar("M", bound="t.Union[transformers.PreTrainedModel, transformers.Pipeline, transformers.TFPreTrainedModel, transformers.FlaxPreTrainedModel, vllm.LLMEngine, vllm.AsyncLLMEngine, peft.PeftModel, autogptq.modeling.BaseGPTQForCausalLM]") @@ -23,6 +24,10 @@ At = t.TypeVar("At", bound=attr.AttrsInstance) LiteralRuntime = t.Literal["pt", "tf", "flax", "vllm"] AdapterType = t.Literal["lora", "adalora", "adaption_prompt", "prefix_tuning", "p_tuning", "prompt_tuning", "ia3"] +# TODO: support quay +LiteralContainerRegistry = t.Literal["docker", "gh", "ecr"] +LiteralContainerVersionStrategy = t.Literal["release", "nightly", "latest", "custom"] + if 
sys.version_info[:2] >= (3,11): from typing import LiteralString as LiteralString, Self as Self, overload as overload from typing import NotRequired as NotRequired, Required as Required, dataclass_transform as dataclass_transform @@ -35,12 +40,7 @@ if sys.version_info[:2] >= (3,10): else: from typing_extensions import TypeAlias as TypeAlias, ParamSpec as ParamSpec, Concatenate as Concatenate -if sys.version_info[:2] >= (3,9): - from typing import TypedDict as TypedDict -else: - from typing_extensions import TypedDict as TypedDict - -class PeftAdapterOutput(TypedDict): +class PeftAdapterOutput(t.TypedDict): success: bool result: t.Dict[str, peft.PeftConfig] error_msg: str diff --git a/openllm-core/src/openllm_core/config/__init__.py b/openllm-core/src/openllm_core/config/__init__.py new file mode 100644 index 00000000..2edcadd2 --- /dev/null +++ b/openllm-core/src/openllm_core/config/__init__.py @@ -0,0 +1,13 @@ +from __future__ import annotations +from .configuration_auto import AutoConfig as AutoConfig, CONFIG_MAPPING as CONFIG_MAPPING, CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES +from .configuration_baichuan import BaichuanConfig as BaichuanConfig, START_BAICHUAN_COMMAND_DOCSTRING as START_BAICHUAN_COMMAND_DOCSTRING +from .configuration_chatglm import ChatGLMConfig as ChatGLMConfig, START_CHATGLM_COMMAND_DOCSTRING as START_CHATGLM_COMMAND_DOCSTRING +from .configuration_dolly_v2 import DollyV2Config as DollyV2Config, START_DOLLY_V2_COMMAND_DOCSTRING as START_DOLLY_V2_COMMAND_DOCSTRING +from .configuration_falcon import FalconConfig as FalconConfig, START_FALCON_COMMAND_DOCSTRING as START_FALCON_COMMAND_DOCSTRING +from .configuration_flan_t5 import FlanT5Config as FlanT5Config, START_FLAN_T5_COMMAND_DOCSTRING as START_FLAN_T5_COMMAND_DOCSTRING +from .configuration_gpt_neox import GPTNeoXConfig as GPTNeoXConfig, START_GPT_NEOX_COMMAND_DOCSTRING as START_GPT_NEOX_COMMAND_DOCSTRING +from .configuration_llama import LlamaConfig as LlamaConfig, START_LLAMA_COMMAND_DOCSTRING as START_LLAMA_COMMAND_DOCSTRING +from .configuration_mpt import MPTConfig as MPTConfig, START_MPT_COMMAND_DOCSTRING as START_MPT_COMMAND_DOCSTRING +from .configuration_opt import OPTConfig as OPTConfig, START_OPT_COMMAND_DOCSTRING as START_OPT_COMMAND_DOCSTRING +from .configuration_stablelm import StableLMConfig as StableLMConfig, START_STABLELM_COMMAND_DOCSTRING as START_STABLELM_COMMAND_DOCSTRING +from .configuration_starcoder import StarCoderConfig as StarCoderConfig, START_STARCODER_COMMAND_DOCSTRING as START_STARCODER_COMMAND_DOCSTRING diff --git a/openllm-python/src/openllm/models/auto/configuration_auto.py b/openllm-core/src/openllm_core/config/configuration_auto.py similarity index 85% rename from openllm-python/src/openllm/models/auto/configuration_auto.py rename to openllm-core/src/openllm_core/config/configuration_auto.py index ff7e7c70..7c1a9d6c 100644 --- a/openllm-python/src/openllm/models/auto/configuration_auto.py +++ b/openllm-core/src/openllm_core/config/configuration_auto.py @@ -1,18 +1,16 @@ # mypy: disable-error-code="type-arg" from __future__ import annotations -import typing as t +import inflection, openllm_core, importlib, typing as t from collections import OrderedDict - -import inflection, openllm -from openllm.utils import ReprMixin +from openllm_core.utils import ReprMixin if t.TYPE_CHECKING: import types - from openllm._typing_compat import LiteralString + from openllm_core._typing_compat import LiteralString from collections import _odict_items, _odict_keys, _odict_values - ConfigKeysView = 
_odict_keys[str, type[openllm.LLMConfig]] - ConfigValuesView = _odict_values[str, type[openllm.LLMConfig]] - ConfigItemsView = _odict_items[str, type[openllm.LLMConfig]] + ConfigKeysView = _odict_keys[str, type[openllm_core.LLMConfig]] + ConfigValuesView = _odict_values[str, type[openllm_core.LLMConfig]] + ConfigItemsView = _odict_items[str, type[openllm_core.LLMConfig]] # NOTE: This is the entrypoint when adding new model config CONFIG_MAPPING_NAMES = OrderedDict([("chatglm", "ChatGLMConfig"), ("dolly_v2", "DollyV2Config"), ("falcon", "FalconConfig"), ("flan_t5", "FlanT5Config"), ("gpt_neox", "GPTNeoXConfig"), ("llama", "LlamaConfig"), ("mpt", "MPTConfig"), ("opt", "OPTConfig"), ("stablelm", "StableLMConfig"), ("starcoder", "StarCoderConfig"), ("baichuan", "BaichuanConfig")]) @@ -28,10 +26,10 @@ class _LazyConfigMapping(OrderedDict, ReprMixin): if inflection.underscore(key) in self._mapping: return self.__getitem__(inflection.underscore(key)) raise KeyError(key) value, module_name = self._mapping[key], inflection.underscore(key) - if module_name not in self._modules: self._modules[module_name] = openllm.utils.EnvVarMixin(module_name).module + if module_name not in self._modules: self._modules[module_name] = openllm_core.utils.EnvVarMixin(module_name).module if hasattr(self._modules[module_name], value): return getattr(self._modules[module_name], value) # Some of the mappings have entries model_type -> config of another model type. In that case we try to grab the object at the top level. - return getattr(openllm, value) + return getattr(importlib.import_module("openllm"), value) @property def __repr_keys__(self) -> set[str]: return set(self._mapping.keys()) def __repr__(self) -> str: return ReprMixin.__repr__(self) @@ -45,19 +43,19 @@ class _LazyConfigMapping(OrderedDict, ReprMixin): if key in self._mapping.keys(): raise ValueError(f"'{key}' is already used by a OpenLLM config, pick another name.") self._extra_content[key] = value -CONFIG_MAPPING: dict[str, type[openllm.LLMConfig]] = _LazyConfigMapping(CONFIG_MAPPING_NAMES) +CONFIG_MAPPING: dict[str, type[openllm_core.LLMConfig]] = _LazyConfigMapping(CONFIG_MAPPING_NAMES) # The below handle special alias when we call underscore to the name directly without processing camelcase first. CONFIG_NAME_ALIASES: dict[str, str] = {"chat_glm": "chatglm", "stable_lm": "stablelm", "star_coder": "starcoder", "gpt_neo_x": "gpt_neox",} class AutoConfig: def __init__(self, *_: t.Any, **__: t.Any): raise EnvironmentError("Cannot instantiate AutoConfig directly. Please use `AutoConfig.for_model(model_name)` instead.") @classmethod - def for_model(cls, model_name: str, **attrs: t.Any) -> openllm.LLMConfig: + def for_model(cls, model_name: str, **attrs: t.Any) -> openllm_core.LLMConfig: model_name = inflection.underscore(model_name) if model_name in CONFIG_MAPPING: return CONFIG_MAPPING[model_name].model_construct_env(**attrs) raise ValueError(f"Unrecognized configuration class for {model_name}. 
Model name should be one of {', '.join(CONFIG_MAPPING.keys())}.") @classmethod - def infer_class_from_name(cls, name: str) -> type[openllm.LLMConfig]: + def infer_class_from_name(cls, name: str) -> type[openllm_core.LLMConfig]: model_name = inflection.underscore(name) if model_name in CONFIG_NAME_ALIASES: model_name = CONFIG_NAME_ALIASES[model_name] if model_name in CONFIG_MAPPING: return CONFIG_MAPPING[model_name] diff --git a/openllm-python/src/openllm/models/baichuan/configuration_baichuan.py b/openllm-core/src/openllm_core/config/configuration_baichuan.py similarity index 73% rename from openllm-python/src/openllm/models/baichuan/configuration_baichuan.py rename to openllm-core/src/openllm_core/config/configuration_baichuan.py index 9f7b4122..844128d0 100644 --- a/openllm-python/src/openllm/models/baichuan/configuration_baichuan.py +++ b/openllm-core/src/openllm_core/config/configuration_baichuan.py @@ -1,22 +1,6 @@ from __future__ import annotations -import openllm - -class BaichuanConfig(openllm.LLMConfig): - """Baichuan-7B is an open-source, large-scale pre-trained language model developed by Baichuan Intelligent Technology. - - Baichuan-7B is based on Transformer architecture, - which contains 7 billion parameters and trained on approximately 1.2 trillion tokens. - It supports both Chinese and English languages with a context window length of 4096. - It has achieved the best performance among models of the same size on standard Chinese - and English benchmarks (C-Eval, MMLU, etc). - Refer to [Baichuan-7B's GitHub page](https://github.com/baichuan-inc/Baichuan-7B) for more information. - """ - __config__ = {"name_type": "lowercase", "trust_remote_code": True, "timeout": 3600000, "requires_gpu": True, "url": "https://github.com/baichuan-inc/Baichuan-7B", "requirements": ["cpm-kernels", "sentencepiece"], "architecture": "BaiChuanForCausalLM", - "default_id": "baichuan-inc/baichuan-7b", "model_ids": ["baichuan-inc/baichuan-7b", "baichuan-inc/baichuan-13b-base", "baichuan-inc/baichuan-13b-chat", "fireballoon/baichuan-vicuna-chinese-7b", "fireballoon/baichuan-vicuna-7b", "hiyouga/baichuan-7b-sft"]} - class GenerationConfig: - max_new_tokens: int = 2048 - top_p: float = 0.7 - temperature: float = 0.95 +import openllm_core, typing as t +from openllm_core._prompt import process_prompt START_BAICHUAN_COMMAND_DOCSTRING = """\ Run a LLMServer for Baichuan model. @@ -38,3 +22,24 @@ or provide `--model-id` flag when running ``openllm start baichuan``: $ openllm start baichuan --model-id='fireballoon/baichuan-vicuna-chinese-7b' """ DEFAULT_PROMPT_TEMPLATE = """{instruction}""" + + +class BaichuanConfig(openllm_core.LLMConfig): + """Baichuan-7B is an open-source, large-scale pre-trained language model developed by Baichuan Intelligent Technology. + + Baichuan-7B is based on Transformer architecture, + which contains 7 billion parameters and trained on approximately 1.2 trillion tokens. + It supports both Chinese and English languages with a context window length of 4096. + It has achieved the best performance among models of the same size on standard Chinese + and English benchmarks (C-Eval, MMLU, etc). + Refer to [Baichuan-7B's GitHub page](https://github.com/baichuan-inc/Baichuan-7B) for more information. 
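# Example of the relocated registry in use (assuming the new openllm-core package is installed);
# both names are re-exported from openllm_core.config as shown in the new __init__ above.
from openllm_core.config import CONFIG_MAPPING, AutoConfig

assert "baichuan" in CONFIG_MAPPING        # keys are inflection.underscore'd model names
config = AutoConfig.for_model("baichuan")  # lazily resolves BaichuanConfig and calls model_construct_env()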
+ """ + __config__ = {"name_type": "lowercase", "trust_remote_code": True, "timeout": 3600000, "requires_gpu": True, "url": "https://github.com/baichuan-inc/Baichuan-7B", "requirements": ["cpm-kernels", "sentencepiece"], "architecture": "BaiChuanForCausalLM", + "default_id": "baichuan-inc/baichuan-7b", "model_ids": ["baichuan-inc/baichuan-7b", "baichuan-inc/baichuan-13b-base", "baichuan-inc/baichuan-13b-chat", "fireballoon/baichuan-vicuna-chinese-7b", "fireballoon/baichuan-vicuna-7b", "hiyouga/baichuan-7b-sft"]} + class GenerationConfig: + max_new_tokens: int = 2048 + top_p: float = 0.7 + temperature: float = 0.95 + def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, top_p: float | None = None, temperature: float | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: + return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "top_p": top_p, "temperature": temperature, **attrs}, {} + def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str: return generation_result[0] diff --git a/openllm-python/src/openllm/models/chatglm/configuration_chatglm.py b/openllm-core/src/openllm_core/config/configuration_chatglm.py similarity index 54% rename from openllm-python/src/openllm/models/chatglm/configuration_chatglm.py rename to openllm-core/src/openllm_core/config/configuration_chatglm.py index a8e7e651..19030a8a 100644 --- a/openllm-python/src/openllm/models/chatglm/configuration_chatglm.py +++ b/openllm-core/src/openllm_core/config/configuration_chatglm.py @@ -1,29 +1,6 @@ from __future__ import annotations -import openllm - -class ChatGLMConfig(openllm.LLMConfig): - """ChatGLM is an open bilingual language model based on [General Language Model (GLM)](https://github.com/THUDM/GLM) framework. - - With the quantization technique, users can deploy locally on consumer-grade graphics cards - (only 6GB of GPU memory is required at the INT4 quantization level). - - ChatGLM-6B uses technology similar to ChatGPT, optimized for Chinese QA and dialogue. - The model is trained for about 1T tokens of Chinese and English corpus, supplemented by supervised fine-tuning, - feedback bootstrap, and reinforcement learning wit human feedback. - With only about 6.2 billion parameters, the model is able to generate answers that are in line - with human preference. - - Refer to [ChatGLM's GitHub page](https://github.com/THUDM/ChatGLM-6B) for more information. - """ - __config__ = {"name_type": "lowercase", "trust_remote_code": True, "timeout": 3600000, "requires_gpu": True, "url": "https://github.com/THUDM/ChatGLM-6B", "requirements": ["cpm-kernels", "sentencepiece"], "architecture": "ChatGLMForConditionalGeneration", - "default_id": "thudm/chatglm-6b", "model_ids": ["thudm/chatglm-6b", "thudm/chatglm-6b-int8", "thudm/chatglm-6b-int4", "thudm/chatglm2-6b", "thudm/chatglm2-6b-int4"]} - retain_history: bool = openllm.LLMConfig.Field(False, description="Whether to retain history given to the model. 
If set to True, then the model will retain given history.") - use_half_precision: bool = openllm.LLMConfig.Field(True, description="Whether to use half precision for model.") - class GenerationConfig: - max_new_tokens: int = 2048 - num_beams: int = 1 - top_p: float = 0.7 - temperature: float = 0.95 +import openllm_core, typing as t +from openllm_core.utils import dantic START_CHATGLM_COMMAND_DOCSTRING = """\ Run a LLMServer for ChatGLM model. @@ -45,3 +22,42 @@ or provide `--model-id` flag when running ``openllm start chatglm``: $ openllm start chatglm --model-id='thudm/chatglm-6b-int8' """ DEFAULT_PROMPT_TEMPLATE = """{instruction}""" + +class ChatGLMConfig(openllm_core.LLMConfig): + """ChatGLM is an open bilingual language model based on [General Language Model (GLM)](https://github.com/THUDM/GLM) framework. + + With the quantization technique, users can deploy locally on consumer-grade graphics cards + (only 6GB of GPU memory is required at the INT4 quantization level). + + ChatGLM-6B uses technology similar to ChatGPT, optimized for Chinese QA and dialogue. + The model is trained for about 1T tokens of Chinese and English corpus, supplemented by supervised fine-tuning, + feedback bootstrap, and reinforcement learning wit human feedback. + With only about 6.2 billion parameters, the model is able to generate answers that are in line + with human preference. + + Refer to [ChatGLM's GitHub page](https://github.com/THUDM/ChatGLM-6B) for more information. + """ + __config__ = {"name_type": "lowercase", "trust_remote_code": True, "timeout": 3600000, "requires_gpu": True, "url": "https://github.com/THUDM/ChatGLM-6B", "requirements": ["cpm-kernels", "sentencepiece"], "architecture": "ChatGLMForConditionalGeneration", + "default_id": "thudm/chatglm-6b", "model_ids": ["thudm/chatglm-6b", "thudm/chatglm-6b-int8", "thudm/chatglm-6b-int4", "thudm/chatglm2-6b", "thudm/chatglm2-6b-int4"]} + retain_history: bool = dantic.Field(False, description="Whether to retain history given to the model. 
If set to True, then the model will retain given history.") + use_half_precision: bool = dantic.Field(True, description="Whether to use half precision for model.") + class GenerationConfig: + max_new_tokens: int = 2048 + num_beams: int = 1 + top_p: float = 0.7 + temperature: float = 0.95 + + def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, num_beams: int | None = None, top_p: float | None = None, temperature: float | None = None, chat_history: list[tuple[str, str]] | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: + prompt_text = "" + if use_default_prompt_template and chat_history is not None: + for i, (old_query, response) in enumerate(chat_history): prompt_text += f"[Round {i}]\n问:{old_query}\n答:{response}\n" + prompt_text += f"[Round {len(chat_history)}]\n问:{prompt}\n答:" + else: prompt_text = prompt + postprocess_generate_kwargs = {"chat_history": chat_history if chat_history is not None else None} + return prompt_text, {"max_new_tokens": max_new_tokens, "num_beams": num_beams, "top_p": top_p, "temperature": temperature, **attrs}, postprocess_generate_kwargs + def postprocess_generate(self, prompt: str, generation_result: tuple[str, list[tuple[str, str]]], *, chat_history: list[tuple[str, str]] | None = None, **attrs: t.Any) -> str: + generated, history = generation_result + if self.config.retain_history: + if chat_history is None: raise ValueError("'retain_history' is True while there is no history provided.") + chat_history.extend(history) + return generated diff --git a/openllm-python/src/openllm/models/dolly_v2/configuration_dolly_v2.py b/openllm-core/src/openllm_core/config/configuration_dolly_v2.py similarity index 78% rename from openllm-python/src/openllm/models/dolly_v2/configuration_dolly_v2.py rename to openllm-core/src/openllm_core/config/configuration_dolly_v2.py index 1b3026a8..ce047cd0 100644 --- a/openllm-python/src/openllm/models/dolly_v2/configuration_dolly_v2.py +++ b/openllm-core/src/openllm_core/config/configuration_dolly_v2.py @@ -1,29 +1,9 @@ from __future__ import annotations -import typing as t, openllm +import typing as t, openllm_core +from openllm_core._prompt import process_prompt +from openllm_core.utils import dantic if t.TYPE_CHECKING: import transformers -class DollyV2Config(openllm.LLMConfig): - """Databricks` Dolly is an instruction-following large language model trained on the Databricks machine learning platform that is licensed for commercial use. - - Based on pythia-12b, Dolly is trained on ~15k instruction/response fine tuning records databricks-dolly-15k - generated by Databricks employees in capability domains from the InstructGPT paper, including brainstorming, - classification, closed QA, generation, information extraction, open QA and summarization. - - dolly-v2-12b is not a state-of-the-art model, but does exhibit surprisingly high quality instruction - following behavior not characteristic of the foundation model on which it is based. - - Refer to [Databricks's Dolly page](https://github.com/databrickslabs/dolly) for more information. 
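# Worked example of the ChatGLM history rendering in the hunk above; the loop body is copied
# from sanitize_parameters, while the sample conversation is made up.
chat_history = [("你好", "你好!我是 ChatGLM。")]
prompt = "今天天气怎么样?"
prompt_text = ""
for i, (old_query, response) in enumerate(chat_history):
  prompt_text += f"[Round {i}]\n问:{old_query}\n答:{response}\n"
prompt_text += f"[Round {len(chat_history)}]\n问:{prompt}\n答:"
# prompt_text == "[Round 0]\n问:你好\n答:你好!我是 ChatGLM。\n[Round 1]\n问:今天天气怎么样?\n答:"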
- """ - __config__ = {"timeout": 3600000, "url": "https://github.com/databrickslabs/dolly", "architecture": "GPTNeoXForCausalLM", - "default_id": "databricks/dolly-v2-3b", "model_ids": ["databricks/dolly-v2-3b", "databricks/dolly-v2-7b", "databricks/dolly-v2-12b"]} - return_full_text: bool = openllm.LLMConfig.Field(False, description="Whether to return the full prompt to the users.") - class GenerationConfig: - temperature: float = 0.9 - top_p: float = 0.92 - top_k: int = 5 - max_new_tokens: int = 256 - eos_token_id: int = 50277 # NOTE: from get_special_token_id(self.tokenizer, END_KEY) - START_DOLLY_V2_COMMAND_DOCSTRING = """\ Run a LLMServer for dolly-v2 model. @@ -74,3 +54,28 @@ def get_special_token_id(tokenizer: transformers.PreTrainedTokenizer, key: str) token_ids = tokenizer.encode(key) if len(token_ids) > 1: raise ValueError(f"Expected only a single token for '{key}' but found {token_ids}") return token_ids[0] + +class DollyV2Config(openllm_core.LLMConfig): + """Databricks` Dolly is an instruction-following large language model trained on the Databricks machine learning platform that is licensed for commercial use. + + Based on pythia-12b, Dolly is trained on ~15k instruction/response fine tuning records databricks-dolly-15k + generated by Databricks employees in capability domains from the InstructGPT paper, including brainstorming, + classification, closed QA, generation, information extraction, open QA and summarization. + + dolly-v2-12b is not a state-of-the-art model, but does exhibit surprisingly high quality instruction + following behavior not characteristic of the foundation model on which it is based. + + Refer to [Databricks's Dolly page](https://github.com/databrickslabs/dolly) for more information. + """ + __config__ = {"timeout": 3600000, "url": "https://github.com/databrickslabs/dolly", "architecture": "GPTNeoXForCausalLM", + "default_id": "databricks/dolly-v2-3b", "model_ids": ["databricks/dolly-v2-3b", "databricks/dolly-v2-7b", "databricks/dolly-v2-12b"]} + return_full_text: bool = dantic.Field(False, description="Whether to return the full prompt to the users.") + class GenerationConfig: + temperature: float = 0.9 + top_p: float = 0.92 + top_k: int = 5 + max_new_tokens: int = 256 + eos_token_id: int = 50277 # NOTE: from get_special_token_id(self.tokenizer, END_KEY) + def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_k: int | None = None, top_p: float | None = None, use_default_prompt_template: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: + return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "top_k": top_k, "top_p": top_p, "temperature": temperature, **attrs}, {} + def postprocess_generate(self, prompt: str, generation_result: list[dict[t.Literal["generated_text"], str]], **_: t.Any) -> str: return generation_result[0]["generated_text"] diff --git a/openllm-python/src/openllm/models/falcon/configuration_falcon.py b/openllm-core/src/openllm_core/config/configuration_falcon.py similarity index 71% rename from openllm-python/src/openllm/models/falcon/configuration_falcon.py rename to openllm-core/src/openllm_core/config/configuration_falcon.py index 07101fb1..e645f679 100644 --- a/openllm-python/src/openllm/models/falcon/configuration_falcon.py +++ b/openllm-core/src/openllm_core/config/configuration_falcon.py @@ -1,22 +1,6 @@ from __future__ import annotations -import openllm - -class 
FalconConfig(openllm.LLMConfig): - """Falcon-7B is a 7B parameters causal decoder-only model built by TII and trained on 1,500B tokens of [RefinedWeb](https://huggingface.co/datasets/tiiuae/falcon-refinedweb) enhanced with curated corpora. - - It is made available under the TII Falcon LLM License. - - Refer to [Falcon's HuggingFace page](https://huggingface.co/tiiuae/falcon-7b) for more information. - """ - __config__ = {"name_type": "lowercase", "trust_remote_code": True, "requires_gpu": True, "timeout": int(36e6), "url": "https://falconllm.tii.ae/", "requirements": ["einops", "xformers"], "architecture": "FalconForCausalLM", - "default_id": "tiiuae/falcon-7b", "model_ids": ["tiiuae/falcon-7b", "tiiuae/falcon-40b", "tiiuae/falcon-7b-instruct", "tiiuae/falcon-40b-instruct"], - "fine_tune_strategies": ({"adapter_type": "lora", "r": 64, "lora_alpha": 16, "lora_dropout": 0.1, "bias": "none", "target_modules": ["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"]},)} - class GenerationConfig: - max_new_tokens: int = 200 - top_k: int = 10 - num_return_sequences: int = 1 - num_beams: int = 4 - early_stopping: bool = True +import openllm_core, typing as t +from openllm_core._prompt import process_prompt START_FALCON_COMMAND_DOCSTRING = """\ Run a LLMServer for FalconLM model. @@ -43,3 +27,24 @@ DEFAULT_PROMPT_TEMPLATE = """{context} {user_name}: {instruction} {agent}: """ + +class FalconConfig(openllm_core.LLMConfig): + """Falcon-7B is a 7B parameters causal decoder-only model built by TII and trained on 1,500B tokens of [RefinedWeb](https://huggingface.co/datasets/tiiuae/falcon-refinedweb) enhanced with curated corpora. + + It is made available under the TII Falcon LLM License. + + Refer to [Falcon's HuggingFace page](https://huggingface.co/tiiuae/falcon-7b) for more information. 
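# Quick illustration of Falcon's DEFAULT_PROMPT_TEMPLATE above: it expects four keys, which are
# presumably filled from **attrs by process_prompt (sample values are made up; line breaks come
# from the template definition in the hunk).
from openllm_core.config.configuration_falcon import DEFAULT_PROMPT_TEMPLATE

filled = DEFAULT_PROMPT_TEMPLATE.format(context="You are a helpful assistant.", user_name="User", instruction="Summarise the report.", agent="Falcon")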
+ """ + __config__ = {"name_type": "lowercase", "trust_remote_code": True, "requires_gpu": True, "timeout": int(36e6), "url": "https://falconllm.tii.ae/", "requirements": ["einops", "xformers"], "architecture": "FalconForCausalLM", + "default_id": "tiiuae/falcon-7b", "model_ids": ["tiiuae/falcon-7b", "tiiuae/falcon-40b", "tiiuae/falcon-7b-instruct", "tiiuae/falcon-40b-instruct"], + "fine_tune_strategies": ({"adapter_type": "lora", "r": 64, "lora_alpha": 16, "lora_dropout": 0.1, "bias": "none", "target_modules": ["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"]},)} + class GenerationConfig: + max_new_tokens: int = 200 + top_k: int = 10 + num_return_sequences: int = 1 + num_beams: int = 4 + early_stopping: bool = True + + def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, top_k: int | None = None, num_return_sequences: int | None = None, eos_token_id: int | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: + return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "top_k": top_k, "num_return_sequences": num_return_sequences, "eos_token_id": eos_token_id, **attrs}, {} + def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str: return generation_result[0] diff --git a/openllm-python/src/openllm/models/flan_t5/configuration_flan_t5.py b/openllm-core/src/openllm_core/config/configuration_flan_t5.py similarity index 68% rename from openllm-python/src/openllm/models/flan_t5/configuration_flan_t5.py rename to openllm-core/src/openllm_core/config/configuration_flan_t5.py index e0cd167d..7fffddb7 100644 --- a/openllm-python/src/openllm/models/flan_t5/configuration_flan_t5.py +++ b/openllm-core/src/openllm_core/config/configuration_flan_t5.py @@ -1,21 +1,6 @@ from __future__ import annotations -import openllm - -class FlanT5Config(openllm.LLMConfig): - """FLAN-T5 was released in the paper [Scaling Instruction-Finetuned Language Models](https://arxiv.org/pdf/2210.11416.pdf). - - It is an enhanced version of T5 that has been finetuned in a mixture of tasks. - - Refer to [FLAN-T5's page](https://huggingface.co/docs/transformers/model_doc/flan-t5) for more information. - """ - __config__ = {"url": "https://huggingface.co/docs/transformers/model_doc/flan-t5", "architecture": "T5ForConditionalGeneration", "model_type": "seq2seq_lm", - "default_id": "google/flan-t5-large", "model_ids": ["google/flan-t5-small", "google/flan-t5-base", "google/flan-t5-large", "google/flan-t5-xl", "google/flan-t5-xxl",]} - class GenerationConfig: - temperature: float = 0.9 - max_new_tokens: int = 2048 - top_k: int = 50 - top_p: float = 0.4 - repetition_penalty = 1.0 +import openllm_core, typing as t +from openllm_core._prompt import process_prompt START_FLAN_T5_COMMAND_DOCSTRING = """\ Run a LLMServer for FLAN-T5 model. @@ -43,3 +28,23 @@ or provide `--model-id` flag when running ``openllm start flan-t5``: $ openllm start flan-t5 --model-id google/flan-t5-xxl """ DEFAULT_PROMPT_TEMPLATE = """Answer the following question:\nQuestion: {instruction}\nAnswer:""" + +class FlanT5Config(openllm_core.LLMConfig): + """FLAN-T5 was released in the paper [Scaling Instruction-Finetuned Language Models](https://arxiv.org/pdf/2210.11416.pdf). + + It is an enhanced version of T5 that has been finetuned in a mixture of tasks. 
+ + Refer to [FLAN-T5's page](https://huggingface.co/docs/transformers/model_doc/flan-t5) for more information. + """ + __config__ = {"url": "https://huggingface.co/docs/transformers/model_doc/flan-t5", "architecture": "T5ForConditionalGeneration", "model_type": "seq2seq_lm", + "default_id": "google/flan-t5-large", "model_ids": ["google/flan-t5-small", "google/flan-t5-base", "google/flan-t5-large", "google/flan-t5-xl", "google/flan-t5-xxl",]} + class GenerationConfig: + temperature: float = 0.9 + max_new_tokens: int = 2048 + top_k: int = 50 + top_p: float = 0.4 + repetition_penalty = 1.0 + + def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_k: int | None = None, top_p: float | None = None, repetition_penalty: float | None = None, use_default_prompt_template: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: + return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_k": top_k, "top_p": top_p, "repetition_penalty": repetition_penalty}, {} + def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str: return generation_result[0] diff --git a/openllm-python/src/openllm/models/gpt_neox/configuration_gpt_neox.py b/openllm-core/src/openllm_core/config/configuration_gpt_neox.py similarity index 71% rename from openllm-python/src/openllm/models/gpt_neox/configuration_gpt_neox.py rename to openllm-core/src/openllm_core/config/configuration_gpt_neox.py index 8346d05b..891002c5 100644 --- a/openllm-python/src/openllm/models/gpt_neox/configuration_gpt_neox.py +++ b/openllm-core/src/openllm_core/config/configuration_gpt_neox.py @@ -1,28 +1,7 @@ from __future__ import annotations -import openllm - -class GPTNeoXConfig(openllm.LLMConfig): - """GPTNeoX is an autoregressive language model trained on the Pile, whose weights will be made freely and openly available to the public through a permissive license. - - It is, to the best of our knowledge, the largest dense autoregressive model - that has publicly available weights at the time of submission. The training and evaluation code, as well as the model weights, - can be found at https://github.com/EleutherAI/gpt-neox. - - GPTNeoX has been used to fine-tune on various models, such as Dolly, StableLM, and Pythia. - - Note that OpenLLM provides first-class support for all of the aforementioned model. Users can - also use `openllm start gpt-neox` to run all of the GPTNeoX variant's model - - Refer to [GPTNeoX's model card](https://huggingface.co/docs/transformers/model_doc/gpt_neox) - for more information. - """ - __config__ = {"model_name": "gpt_neox", "start_name": "gpt-neox", "requires_gpu": True, "architecture": "GPTNeoXForCausalLM", "url": "https://github.com/EleutherAI/gpt-neox", - "default_id": "eleutherai/gpt-neox-20b", "model_ids": ["eleutherai/gpt-neox-20b"]} - use_half_precision: bool = openllm.LLMConfig.Field(True, description="Whether to use half precision for model.") - - class GenerationConfig: - temperature: float = 0.9 - max_new_tokens: int = 100 +import openllm_core, typing as t +from openllm_core._prompt import process_prompt +from openllm_core.utils import dantic START_GPT_NEOX_COMMAND_DOCSTRING = """\ Run a LLMServer for GPTNeoX model. 
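# The configs above all delegate templating to openllm_core._prompt.process_prompt. Its exact
# implementation is not part of this hunk; a rough equivalent inferred from the call sites would be:
from __future__ import annotations
import typing as t

def process_prompt_sketch(prompt: str, template: str | None, use_prompt_template: bool, **attrs: t.Any) -> str:
  # return the raw prompt when the default template is disabled, otherwise render the
  # template with the instruction plus any extra template keys
  if not use_prompt_template or template is None: return prompt
  return template.format(instruction=prompt, **attrs)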
@@ -44,3 +23,29 @@ or provide `--model-id` flag when running ``openllm start gpt-neox``: $ openllm start gpt-neox --model-id 'stabilityai/stablelm-tuned-alpha-3b' """ DEFAULT_PROMPT_TEMPLATE = """{instruction}""" + +class GPTNeoXConfig(openllm_core.LLMConfig): + """GPTNeoX is an autoregressive language model trained on the Pile, whose weights will be made freely and openly available to the public through a permissive license. + + It is, to the best of our knowledge, the largest dense autoregressive model + that has publicly available weights at the time of submission. The training and evaluation code, as well as the model weights, + can be found at https://github.com/EleutherAI/gpt-neox. + + GPTNeoX has been used to fine-tune on various models, such as Dolly, StableLM, and Pythia. + + Note that OpenLLM provides first-class support for all of the aforementioned model. Users can + also use `openllm start gpt-neox` to run all of the GPTNeoX variant's model + + Refer to [GPTNeoX's model card](https://huggingface.co/docs/transformers/model_doc/gpt_neox) + for more information. + """ + __config__ = {"model_name": "gpt_neox", "start_name": "gpt-neox", "requires_gpu": True, "architecture": "GPTNeoXForCausalLM", "url": "https://github.com/EleutherAI/gpt-neox", + "default_id": "eleutherai/gpt-neox-20b", "model_ids": ["eleutherai/gpt-neox-20b"]} + use_half_precision: bool = dantic.Field(True, description="Whether to use half precision for model.") + + class GenerationConfig: + temperature: float = 0.9 + max_new_tokens: int = 100 + def sanitize_parameters(self, prompt: str, temperature: float | None = None, max_new_tokens: int | None = None, use_default_prompt_template: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: + return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "temperature": temperature}, {} + def postprocess_generate(self, prompt: str, generation_result: list[str], **_: t.Any) -> str: return generation_result[0] diff --git a/openllm-python/src/openllm/models/llama/configuration_llama.py b/openllm-core/src/openllm_core/config/configuration_llama.py similarity index 81% rename from openllm-python/src/openllm/models/llama/configuration_llama.py rename to openllm-core/src/openllm_core/config/configuration_llama.py index 10dc5b31..bb6b9b29 100644 --- a/openllm-python/src/openllm/models/llama/configuration_llama.py +++ b/openllm-core/src/openllm_core/config/configuration_llama.py @@ -1,32 +1,7 @@ from __future__ import annotations -import typing as t, openllm - -class LlamaConfig(openllm.LLMConfig): - """LLaMA model was proposed in [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) by Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample. - - It is a collection of foundation language models ranging from 7B to 65B parameters. - - Llama also include support for the recent propsed [Llama-2](https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/) - - Note that all variants of Llama including fine-tuning, quantisation format are all supported with ``openllm.Llama``. - - Refer to [Llama's model card](https://huggingface.co/docs/transformers/main/model_doc/llama) - for more information. 
- """ - use_llama2_prompt: bool = openllm.LLMConfig.Field(False, description="Whether to use the prompt format for Llama 2. Disable this when working with Llama 1.") - __config__ = {"name_type": "lowercase", "url": "https://github.com/facebookresearch/llama", "default_implementation": {"cpu": "pt", "nvidia.com/gpu": "pt"}, "architecture": "LlamaForCausalLM", "requirements": ["fairscale", "sentencepiece"], "tokenizer_class": "LlamaTokenizerFast", - "default_id": "NousResearch/llama-2-7b-hf", "model_ids": ["meta-llama/Llama-2-70b-chat-hf", "meta-llama/Llama-2-13b-chat-hf", "meta-llama/Llama-2-7b-chat-hf", "meta-llama/Llama-2-70b-hf", "meta-llama/Llama-2-13b-hf", - "meta-llama/Llama-2-7b-hf", "NousResearch/llama-2-70b-chat-hf", "NousResearch/llama-2-13b-chat-hf", "NousResearch/llama-2-7b-chat-hf", "NousResearch/llama-2-70b-hf", "NousResearch/llama-2-13b-hf", "NousResearch/llama-2-7b-hf", - "openlm-research/open_llama_7b_v2", "openlm-research/open_llama_3b_v2", "openlm-research/open_llama_13b", "huggyllama/llama-65b", "huggyllama/llama-30b", "huggyllama/llama-13b", "huggyllama/llama-7b"], - "fine_tune_strategies": ({"adapter_type": "lora", "r": 64, "lora_alpha": 16, "lora_dropout": 0.1, "bias": "none"},)} - class GenerationConfig: - max_new_tokens: int = 128 - temperature: float = 0.6 - top_p: float = 0.9 - top_k: int = 12 - class SamplingParams: - best_of: int = 1 - presence_penalty: float = 0.5 +import typing as t, openllm_core +from openllm_core._prompt import process_prompt +from openllm_core.utils import dantic START_LLAMA_COMMAND_DOCSTRING = """\ Run a LLMServer for Llama model. @@ -68,3 +43,33 @@ _v1_prompt, _v2_prompt = """{instruction}""", """{start_key} {sys_key}\n{system_ PROMPT_MAPPING = {"v1": _v1_prompt, "v2": _v2_prompt} def _get_prompt(model_type: t.Literal["v1", "v2"]) -> str: return PROMPT_MAPPING[model_type] DEFAULT_PROMPT_TEMPLATE = _get_prompt + +class LlamaConfig(openllm_core.LLMConfig): + """LLaMA model was proposed in [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) by Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample. + + It is a collection of foundation language models ranging from 7B to 65B parameters. + + Llama also include support for the recent propsed [Llama-2](https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/) + + Note that all variants of Llama including fine-tuning, quantisation format are all supported with ``openllm.Llama``. + + Refer to [Llama's model card](https://huggingface.co/docs/transformers/main/model_doc/llama) + for more information. + """ + use_llama2_prompt: bool = dantic.Field(False, description="Whether to use the prompt format for Llama 2. 
Disable this when working with Llama 1.") + __config__ = {"name_type": "lowercase", "url": "https://github.com/facebookresearch/llama", "default_implementation": {"cpu": "pt", "nvidia.com/gpu": "pt"}, "architecture": "LlamaForCausalLM", "requirements": ["fairscale", "sentencepiece"], "tokenizer_class": "LlamaTokenizerFast", + "default_id": "NousResearch/llama-2-7b-hf", "model_ids": ["meta-llama/Llama-2-70b-chat-hf", "meta-llama/Llama-2-13b-chat-hf", "meta-llama/Llama-2-7b-chat-hf", "meta-llama/Llama-2-70b-hf", "meta-llama/Llama-2-13b-hf", + "meta-llama/Llama-2-7b-hf", "NousResearch/llama-2-70b-chat-hf", "NousResearch/llama-2-13b-chat-hf", "NousResearch/llama-2-7b-chat-hf", "NousResearch/llama-2-70b-hf", "NousResearch/llama-2-13b-hf", "NousResearch/llama-2-7b-hf", + "openlm-research/open_llama_7b_v2", "openlm-research/open_llama_3b_v2", "openlm-research/open_llama_13b", "huggyllama/llama-65b", "huggyllama/llama-30b", "huggyllama/llama-13b", "huggyllama/llama-7b"], + "fine_tune_strategies": ({"adapter_type": "lora", "r": 64, "lora_alpha": 16, "lora_dropout": 0.1, "bias": "none"},)} + class GenerationConfig: + max_new_tokens: int = 128 + temperature: float = 0.6 + top_p: float = 0.9 + top_k: int = 12 + class SamplingParams: + best_of: int = 1 + presence_penalty: float = 0.5 + def sanitize_parameters(self, prompt: str, top_k: int | None = None, top_p: float | None = None, temperature: float | None = None, max_new_tokens: int | None = None, use_default_prompt_template: bool = False, use_llama2_prompt: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: + return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE("v2" if use_llama2_prompt else "v1") if use_default_prompt_template else None, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_p": top_p, "top_k": top_k}, {} + def postprocess_generate(self, prompt: str, generation_result: list[str], **_: t.Any) -> str: return generation_result[0] diff --git a/openllm-python/src/openllm/models/mpt/configuration_mpt.py b/openllm-core/src/openllm_core/config/configuration_mpt.py similarity index 61% rename from openllm-python/src/openllm/models/mpt/configuration_mpt.py rename to openllm-core/src/openllm_core/config/configuration_mpt.py index 9930597b..dde1dbb9 100644 --- a/openllm-python/src/openllm/models/mpt/configuration_mpt.py +++ b/openllm-core/src/openllm_core/config/configuration_mpt.py @@ -1,28 +1,9 @@ from __future__ import annotations -import sys, typing as t +import typing as t, openllm_core +from openllm_core.utils import dantic +from openllm_core._prompt import process_prompt -import openllm - -if t.TYPE_CHECKING: MPTPromptType = t.Literal["default", "instruct", "chat", "storywriter"] -else: MPTPromptType = str - -class MPTConfig(openllm.LLMConfig): - """MPT is a decoder-style transformer pretrained from scratch on English text and code. - - This model was trained by [MosaicML](https://www.mosaicml.com/). - - ``openllm.MPT`` encapsulate a family of MPT variants that is publicly available - on HuggingFace. Refers [HuggingFace's MosaicML page](https://huggingface.co/mosaicml) - for more details on specific models. 
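# Note that Llama's DEFAULT_PROMPT_TEMPLATE in the hunk above is a callable (_get_prompt), not a
# string: sanitize_parameters selects the v1 or v2 template per request via use_llama2_prompt.
from openllm_core.config.configuration_llama import DEFAULT_PROMPT_TEMPLATE, PROMPT_MAPPING

assert DEFAULT_PROMPT_TEMPLATE("v1") == PROMPT_MAPPING["v1"]  # plain "{instruction}" passthrough for Llama 1
llama2_template = DEFAULT_PROMPT_TEMPLATE("v2")               # Llama-2 chat format defined above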
- """ - __config__ = {"name_type": "lowercase", "trust_remote_code": True, "url": "https://huggingface.co/mosaicml", "timeout": int(36e6), "requirements": ["triton", "einops"], "architecture": "MPTForCausalLM", - "default_id": "mosaicml/mpt-7b-instruct", "model_ids": ["mosaicml/mpt-7b", "mosaicml/mpt-7b-instruct", "mosaicml/mpt-7b-chat", "mosaicml/mpt-7b-storywriter", "mosaicml/mpt-30b", "mosaicml/mpt-30b-instruct", "mosaicml/mpt-30b-chat"]} - prompt_type: MPTPromptType = openllm.LLMConfig.Field('"default"', description="Given prompt type for running MPT. Default will be inferred from model name if pretrained.") - max_sequence_length: int = openllm.LLMConfig.Field(2048, description="Max sequence length to run MPT with. Note that MPT is trained ith sequence length of 2048, but with [ALiBi](https://arxiv.org/abs/2108.12409) it can set up to 4096 (for 7b models) and 16384 (for 30b models)") - class GenerationConfig: - max_new_tokens: int = 128 - temperature: float = 0 - top_p: float = 0.8 +MPTPromptType = t.Literal["default", "instruct", "chat", "storywriter"] START_MPT_COMMAND_DOCSTRING = """\ Run a LLMServer for MPT model. @@ -63,3 +44,32 @@ _chat_prompt, _default_prompt, _instruct_prompt = """{instruction}""", """{instr PROMPT_MAPPING = {"default": _default_prompt, "instruct": _instruct_prompt, "storywriter": _default_prompt, "chat": _chat_prompt} def _get_prompt(model_type: str) -> str: return PROMPT_MAPPING[model_type] DEFAULT_PROMPT_TEMPLATE = _get_prompt + +class MPTConfig(openllm_core.LLMConfig): + """MPT is a decoder-style transformer pretrained from scratch on English text and code. + + This model was trained by [MosaicML](https://www.mosaicml.com/). + + ``openllm.MPT`` encapsulate a family of MPT variants that is publicly available + on HuggingFace. Refers [HuggingFace's MosaicML page](https://huggingface.co/mosaicml) + for more details on specific models. + """ + __config__ = {"name_type": "lowercase", "trust_remote_code": True, "url": "https://huggingface.co/mosaicml", "timeout": int(36e6), "requirements": ["triton", "einops"], "architecture": "MPTForCausalLM", + "default_id": "mosaicml/mpt-7b-instruct", "model_ids": ["mosaicml/mpt-7b", "mosaicml/mpt-7b-instruct", "mosaicml/mpt-7b-chat", "mosaicml/mpt-7b-storywriter", "mosaicml/mpt-30b", "mosaicml/mpt-30b-instruct", "mosaicml/mpt-30b-chat"]} + prompt_type: MPTPromptType = dantic.Field('"default"', description="Given prompt type for running MPT. Default will be inferred from model name if pretrained.") + max_sequence_length: int = dantic.Field(2048, description="Max sequence length to run MPT with. 
Note that MPT is trained ith sequence length of 2048, but with [ALiBi](https://arxiv.org/abs/2108.12409) it can set up to 4096 (for 7b models) and 16384 (for 30b models)") + class GenerationConfig: + max_new_tokens: int = 128 + temperature: float = 0 + top_p: float = 0.8 + def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_p: float | None = None, prompt_type: MPTPromptType | None = None, use_default_prompt_template: bool = True, **attrs: t.Any,) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: + _template = None + if use_default_prompt_template: + if prompt_type is None: + if "instruct" in self.model_id: prompt_type = "instruct" + elif "storywriter" in self.model_id: prompt_type = "storywriter" + elif "chat" in self.model_id: prompt_type = "chat" + else: prompt_type = "default" + _template = DEFAULT_PROMPT_TEMPLATE(prompt_type) + return process_prompt(prompt, _template, use_default_prompt_template), {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_p": top_p}, {} + def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **attrs: t.Any) -> str: return generation_result[0] diff --git a/openllm-python/src/openllm/models/opt/configuration_opt.py b/openllm-core/src/openllm_core/config/configuration_opt.py similarity index 66% rename from openllm-python/src/openllm/models/opt/configuration_opt.py rename to openllm-core/src/openllm_core/config/configuration_opt.py index 0238445f..c4fdd028 100644 --- a/openllm-python/src/openllm/models/opt/configuration_opt.py +++ b/openllm-core/src/openllm_core/config/configuration_opt.py @@ -1,27 +1,7 @@ from __future__ import annotations -import openllm - -class OPTConfig(openllm.LLMConfig): - """OPT was first introduced in [Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) and first released in [metaseq's repository](https://github.com/facebookresearch/metaseq) on May 3rd 2022 by Meta AI. - - OPT was predominantly pretrained with English text, but a small amount of non-English data is still present - within the training corpus via CommonCrawl. The model was pretrained using a causal language modeling (CLM) - objective. OPT belongs to the same family of decoder-only models like GPT-3. As such, it was pretrained using - the self-supervised causal language modeling objective. - - Refer to [OPT's HuggingFace page](https://huggingface.co/docs/transformers/model_doc/opt) for more information. - """ - __config__ = { - "name_type": "lowercase", "trust_remote_code": False, "url": "https://huggingface.co/docs/transformers/model_doc/opt", - "default_id": "facebook/opt-1.3b", "architecture": "OPTForCausalLM", "model_ids": ["facebook/opt-125m", "facebook/opt-350m", "facebook/opt-1.3b", "facebook/opt-2.7b", "facebook/opt-6.7b", "facebook/opt-66b"], - "fine_tune_strategies": ({"adapter_type": "lora", "r": 16, "lora_alpha": 32, "target_modules": ["q_proj", "v_proj"], "lora_dropout": 0.05, "bias": "none"},) - } - format_outputs: bool = openllm.LLMConfig.Field(False, description="""Whether to format the outputs. This can be used when num_return_sequences > 1.""") - class GenerationConfig: - top_k: int = 15 - temperature: float = 0.75 - max_new_tokens: int = 1024 - num_return_sequences: int = 1 +import openllm_core, typing as t +from openllm_core.utils import dantic +from openllm_core._prompt import process_prompt START_OPT_COMMAND_DOCSTRING = """\ Run a LLMServer for OPT model. 
@@ -49,3 +29,30 @@ or provide `--model-id` flag when running ``openllm start opt``: $ openllm start opt --model-id facebook/opt-6.7b """ DEFAULT_PROMPT_TEMPLATE = """{instruction}""" + +class OPTConfig(openllm_core.LLMConfig): + """OPT was first introduced in [Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) and first released in [metaseq's repository](https://github.com/facebookresearch/metaseq) on May 3rd 2022 by Meta AI. + + OPT was predominantly pretrained with English text, but a small amount of non-English data is still present + within the training corpus via CommonCrawl. The model was pretrained using a causal language modeling (CLM) + objective. OPT belongs to the same family of decoder-only models like GPT-3. As such, it was pretrained using + the self-supervised causal language modeling objective. + + Refer to [OPT's HuggingFace page](https://huggingface.co/docs/transformers/model_doc/opt) for more information. + """ + __config__ = { + "name_type": "lowercase", "trust_remote_code": False, "url": "https://huggingface.co/docs/transformers/model_doc/opt", + "default_id": "facebook/opt-1.3b", "architecture": "OPTForCausalLM", "model_ids": ["facebook/opt-125m", "facebook/opt-350m", "facebook/opt-1.3b", "facebook/opt-2.7b", "facebook/opt-6.7b", "facebook/opt-66b"], + "fine_tune_strategies": ({"adapter_type": "lora", "r": 16, "lora_alpha": 32, "target_modules": ["q_proj", "v_proj"], "lora_dropout": 0.05, "bias": "none"},) + } + format_outputs: bool = dantic.Field(False, description="""Whether to format the outputs. This can be used when num_return_sequences > 1.""") + class GenerationConfig: + top_k: int = 15 + temperature: float = 0.75 + max_new_tokens: int = 1024 + num_return_sequences: int = 1 + def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_k: int | None = None, num_return_sequences: int | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_k": top_k, "num_return_sequences": num_return_sequences}, {} + def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **attrs: t.Any) -> str: + if len(generation_result) == 1: return generation_result[0] + if self.config.format_outputs: return "Generated result:\n" + "\n -".join(generation_result) + else: return "\n".join(generation_result) diff --git a/openllm-python/src/openllm/models/stablelm/configuration_stablelm.py b/openllm-core/src/openllm_core/config/configuration_stablelm.py similarity index 72% rename from openllm-python/src/openllm/models/stablelm/configuration_stablelm.py rename to openllm-core/src/openllm_core/config/configuration_stablelm.py index 553cef61..9498f2dc 100644 --- a/openllm-python/src/openllm/models/stablelm/configuration_stablelm.py +++ b/openllm-core/src/openllm_core/config/configuration_stablelm.py @@ -1,27 +1,6 @@ from __future__ import annotations -import openllm - -class StableLMConfig(openllm.LLMConfig): - """StableLM-Base-Alpha is a suite of 3B and 7B parameter decoder-only language models. - - It is pre-trained on a diverse collection of English datasets with a sequence - length of 4096 to push beyond the context window limitations of existing open-source language models. 
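# Worked example of the OPT postprocess branch above (sample generations are made up): with more
# than one returned sequence, format_outputs=True prepends a header, otherwise the sequences are
# simply newline-joined.
generation_result = ["first completion", "second completion"]
formatted = "Generated result:\n" + "\n -".join(generation_result)  # format_outputs=True
plain = "\n".join(generation_result)                                # format_outputs=False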
- - StableLM-Tuned-Alpha is a suite of 3B and 7B parameter decoder-only language models - built on top of the StableLM-Base-Alpha models and further fine-tuned on various chat and - instruction-following datasets. - - Refer to [StableLM-tuned's model card](https://huggingface.co/stabilityai/stablelm-tuned-alpha-7b) - and [StableLM-base's model card](https://huggingface.co/stabilityai/stablelm-base-alpha-7b) - for more information. - """ - __config__ = {"name_type": "lowercase", "url": "https://github.com/Stability-AI/StableLM", "architecture": "GPTNeoXForCausalLM", - "default_id": "stabilityai/stablelm-tuned-alpha-3b", "model_ids": ["stabilityai/stablelm-tuned-alpha-3b", "stabilityai/stablelm-tuned-alpha-7b", "stabilityai/stablelm-base-alpha-3b", "stabilityai/stablelm-base-alpha-7b"]} - class GenerationConfig: - temperature: float = 0.9 - max_new_tokens: int = 128 - top_k: int = 0 - top_p: float = 0.9 +import openllm_core, typing as t +from openllm_core._prompt import process_prompt START_STABLELM_COMMAND_DOCSTRING = """\ Run a LLMServer for StableLM model. @@ -49,3 +28,32 @@ SYSTEM_PROMPT = """<|SYSTEM|># StableLM Tuned (Alpha version) - StableLM will refuse to participate in anything that could harm a human. """ DEFAULT_PROMPT_TEMPLATE = """{system_prompt}<|USER|>{instruction}<|ASSISTANT|>""" + +class StableLMConfig(openllm_core.LLMConfig): + """StableLM-Base-Alpha is a suite of 3B and 7B parameter decoder-only language models. + + It is pre-trained on a diverse collection of English datasets with a sequence + length of 4096 to push beyond the context window limitations of existing open-source language models. + + StableLM-Tuned-Alpha is a suite of 3B and 7B parameter decoder-only language models + built on top of the StableLM-Base-Alpha models and further fine-tuned on various chat and + instruction-following datasets. + + Refer to [StableLM-tuned's model card](https://huggingface.co/stabilityai/stablelm-tuned-alpha-7b) + and [StableLM-base's model card](https://huggingface.co/stabilityai/stablelm-base-alpha-7b) + for more information. 
+ """ + __config__ = {"name_type": "lowercase", "url": "https://github.com/Stability-AI/StableLM", "architecture": "GPTNeoXForCausalLM", + "default_id": "stabilityai/stablelm-tuned-alpha-3b", "model_ids": ["stabilityai/stablelm-tuned-alpha-3b", "stabilityai/stablelm-tuned-alpha-7b", "stabilityai/stablelm-base-alpha-3b", "stabilityai/stablelm-base-alpha-7b"]} + class GenerationConfig: + temperature: float = 0.9 + max_new_tokens: int = 128 + top_k: int = 0 + top_p: float = 0.9 + def sanitize_parameters(self, prompt: str, temperature: float | None = None, max_new_tokens: int | None = None, top_k: int | None = None, top_p: float | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: + if "tuned" in self._model_id and use_default_prompt_template: + system_prompt = attrs.pop("system_prompt", SYSTEM_PROMPT) + prompt_text = process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, system_prompt=system_prompt, **attrs) + else: prompt_text = prompt + return prompt_text, {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_k": top_k, "top_p": top_p}, {} + def postprocess_generate(self, prompt: str, generation_result: list[str], **_: t.Any) -> str: return generation_result[0] diff --git a/openllm-python/src/openllm/models/starcoder/configuration_starcoder.py b/openllm-core/src/openllm_core/config/configuration_starcoder.py similarity index 63% rename from openllm-python/src/openllm/models/starcoder/configuration_starcoder.py rename to openllm-core/src/openllm_core/config/configuration_starcoder.py index 1c9b9ce0..a0bc5e68 100644 --- a/openllm-python/src/openllm/models/starcoder/configuration_starcoder.py +++ b/openllm-core/src/openllm_core/config/configuration_starcoder.py @@ -1,26 +1,5 @@ from __future__ import annotations - -import openllm - -class StarCoderConfig(openllm.LLMConfig): - """The StarCoder models are 15.5B parameter models trained on 80+ programming languages from [The Stack (v1.2)](https://huggingface.co/datasets/bigcode/the-stack), with opt-out requests excluded. - - The model uses [Multi Query Attention](https://arxiv.org/abs/1911.02150), - [a context window of 8192 tokens](https://arxiv.org/abs/2205.14135), and was trained using the - [Fill-in-the-Middle](https://arxiv.org/abs/2207.14255) objective on 1 trillion tokens. - - Refer to [StarCoder's model card](https://huggingface.co/bigcode/starcoder) for more information. - """ - __config__ = {"name_type": "lowercase", "requires_gpu": True, "url": "https://github.com/bigcode-project/starcoder", "architecture": "GPTBigCodeForCausalLM", "requirements": ["bitsandbytes"], "workers_per_resource": 0.5, - "default_id": "bigcode/starcoder", "model_ids": ["bigcode/starcoder", "bigcode/starcoderbase"]} - class GenerationConfig: - temperature: float = 0.2 - max_new_tokens: int = 256 - min_new_tokens: int = 32 - top_k: float = 50 - top_p: float = 0.95 - pad_token_id: int = 49152 - repetition_penalty: float = 1.2 +import openllm_core, typing as t START_STARCODER_COMMAND_DOCSTRING = """\ Run a LLMServer for StarCoder model. 
@@ -43,3 +22,33 @@ $ openllm start starcoder --model-id 'bigcode/starcoder' """ DEFAULT_PROMPT_TEMPLATE = """{instruction}""" FIM_PREFIX, FIM_MIDDLE, FIM_SUFFIX, FIM_PAD, EOD, FIM_INDICATOR = "", "", "", "", "<|endoftext|>", "" + +class StarCoderConfig(openllm_core.LLMConfig): + """The StarCoder models are 15.5B parameter models trained on 80+ programming languages from [The Stack (v1.2)](https://huggingface.co/datasets/bigcode/the-stack), with opt-out requests excluded. + + The model uses [Multi Query Attention](https://arxiv.org/abs/1911.02150), + [a context window of 8192 tokens](https://arxiv.org/abs/2205.14135), and was trained using the + [Fill-in-the-Middle](https://arxiv.org/abs/2207.14255) objective on 1 trillion tokens. + + Refer to [StarCoder's model card](https://huggingface.co/bigcode/starcoder) for more information. + """ + __config__ = {"name_type": "lowercase", "requires_gpu": True, "url": "https://github.com/bigcode-project/starcoder", "architecture": "GPTBigCodeForCausalLM", "requirements": ["bitsandbytes"], "workers_per_resource": 0.5, + "default_id": "bigcode/starcoder", "model_ids": ["bigcode/starcoder", "bigcode/starcoderbase"]} + class GenerationConfig: + temperature: float = 0.2 + max_new_tokens: int = 256 + min_new_tokens: int = 32 + top_k: float = 50 + top_p: float = 0.95 + pad_token_id: int = 49152 + repetition_penalty: float = 1.2 + def sanitize_parameters(self, prompt: str, temperature: float | None = None, top_p: float | None = None, max_new_tokens: int | None = None, repetition_penalty: float | None = None, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: + fim_mode, prefix, suffix = FIM_INDICATOR in prompt, None, None + if fim_mode: + try: prefix, suffix = prompt.split(FIM_INDICATOR) + except Exception as err: raise ValueError(f"Only one {FIM_INDICATOR} allowed in prompt") from err + prompt_text = f"{FIM_PREFIX}{prefix}{FIM_SUFFIX}{suffix}{FIM_MIDDLE}" + else: prompt_text = prompt + # XXX: This value for pad_token_id is currently a hack; needs more investigation into why the default starcoder doesn't include the same value as santacoder EOD + return prompt_text, {"temperature": temperature, "top_p": top_p, "max_new_tokens": max_new_tokens, "repetition_penalty": repetition_penalty, "pad_token_id": 49152, **attrs}, {} + def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str: return generation_result[0] diff --git a/openllm-core/src/openllm_core/exceptions.py b/openllm-core/src/openllm_core/exceptions.py new file mode 100644 index 00000000..86e5d294 --- /dev/null +++ b/openllm-core/src/openllm_core/exceptions.py @@ -0,0 +1,19 @@ +"""Base exceptions for OpenLLM. This extends BentoML exceptions.""" +from __future__ import annotations +import bentoml +class OpenLLMException(bentoml.exceptions.BentoMLException): + """Base class for all OpenLLM exceptions.
This extends BentoMLException.""" +class GpuNotAvailableError(OpenLLMException): + """Raised when there is no GPU available in a given system.""" +class ValidationError(OpenLLMException): + """Raised when validation fails.""" +class ForbiddenAttributeError(OpenLLMException): + """Raised when using an _internal field.""" +class MissingAnnotationAttributeError(OpenLLMException): + """Raised when a field under openllm.LLMConfig is missing annotations.""" +class MissingDependencyError(BaseException): + """Raised when a dependency is missing.""" +class Error(BaseException): + """To be used instead of a naked raise.""" +class FineTuneStrategyNotSupportedError(OpenLLMException): + """Raised when a fine-tune strategy is not supported for a given LLM.""" diff --git a/openllm-core/src/openllm_core/utils/__init__.py b/openllm-core/src/openllm_core/utils/__init__.py new file mode 100644 index 00000000..46995fd0 --- /dev/null +++ b/openllm-core/src/openllm_core/utils/__init__.py @@ -0,0 +1,304 @@ +"""Utility functions for OpenLLM. + +Users can import these functions for convenience, but we don't guarantee backward compatibility for them, so use with caution. +""" +from __future__ import annotations +import contextlib, functools, hashlib, logging, logging.config, os, sys, types, typing as t, openllm_core, asyncio +from pathlib import Path +from circus.exc import ConflictError +from bentoml._internal.configuration import ( + DEBUG_ENV_VAR as DEBUG_ENV_VAR, + GRPC_DEBUG_ENV_VAR as _GRPC_DEBUG_ENV_VAR, + QUIET_ENV_VAR as QUIET_ENV_VAR, + get_debug_mode as _get_debug_mode, + get_quiet_mode as _get_quiet_mode, + set_quiet_mode as set_quiet_mode, +) +from bentoml._internal.models.model import ModelContext as _ModelContext +from bentoml._internal.types import LazyType as LazyType +from bentoml._internal.utils import ( + LazyLoader as LazyLoader, + bentoml_cattr as bentoml_cattr, + calc_dir_size as calc_dir_size, + first_not_none as first_not_none, + pkg as pkg, + reserve_free_port as reserve_free_port, + resolve_user_filepath as resolve_user_filepath, +) +from openllm_core.utils.lazy import ( + LazyModule as LazyModule, + VersionInfo as VersionInfo, +) + +if t.TYPE_CHECKING: + from openllm_core._typing_compat import AnyCallable + +logger = logging.getLogger(__name__) +try: from typing import GenericAlias as _TypingGenericAlias # type: ignore +except ImportError: _TypingGenericAlias = () # type: ignore # python < 3.9 does not have GenericAlias (list[int], tuple[str, ...] and so on) +if sys.version_info < (3, 10): _WithArgsTypes = (_TypingGenericAlias,) +else: _WithArgsTypes: t.Any = (t._GenericAlias, types.GenericAlias, types.UnionType) # type: ignore # _GenericAlias is the actual GenericAlias implementation + +DEV_DEBUG_VAR = "OPENLLMDEVDEBUG" + +def set_debug_mode(enabled: bool, level: int = 1) -> None: + # monkeypatch bentoml._internal.configuration.set_debug_mode to remove unused logs + if enabled: os.environ[DEV_DEBUG_VAR] = str(level) + os.environ[DEBUG_ENV_VAR] = str(enabled) + os.environ[_GRPC_DEBUG_ENV_VAR] = "DEBUG" if enabled else "ERROR" + +def lenient_issubclass(cls: t.Any, class_or_tuple: type[t.Any] | tuple[type[t.Any], ...]
| None) -> bool: + try: + return isinstance(cls, type) and issubclass(cls, class_or_tuple) # type: ignore[arg-type] + except TypeError: + if isinstance(cls, _WithArgsTypes): return False + raise + +def ensure_exec_coro(coro: t.Coroutine[t.Any, t.Any, t.Any]) -> t.Any: + loop = asyncio.get_event_loop() + if loop.is_running(): return asyncio.run_coroutine_threadsafe(coro, loop).result() + else: return loop.run_until_complete(coro) + +def available_devices() -> tuple[str, ...]: + """Return the available GPUs on the system. Currently only supports NVIDIA GPUs.""" + from openllm_core._strategies import NvidiaGpuResource + return tuple(NvidiaGpuResource.from_system()) + +@functools.lru_cache(maxsize=128) +def generate_hash_from_file(f: str, algorithm: t.Literal["md5", "sha1"] = "sha1") -> str: + """Generate a hash from the given file's modification time. + + Args: + f: The file to generate the hash from. + algorithm: The hashing algorithm to use. Defaults to 'sha1' (similar to how Git generates its commit hash.) + + Returns: + The generated hash. + """ + return getattr(hashlib, algorithm)(str(os.path.getmtime(resolve_filepath(f))).encode()).hexdigest() + +@functools.lru_cache(maxsize=1) +def device_count() -> int: return len(available_devices()) + +# equivocal setattr to save one lookup per assignment +_object_setattr = object.__setattr__ + +def non_intrusive_setattr(obj: t.Any, name: str, value: t.Any) -> None: + """This makes sure that we don't overwrite any existing attributes on the object.""" + _setattr = functools.partial(setattr, obj) if isinstance(obj, type) else _object_setattr.__get__(obj) + if not hasattr(obj, name): _setattr(name, value) + +def field_env_key(model_name: str, key: str, suffix: str | None = None) -> str: return "_".join(filter(None, map(str.upper, ["OPENLLM", model_name, suffix.strip("_") if suffix else "", key]))) + +# Special debug flag controlled via OPENLLMDEVDEBUG +DEBUG: bool = sys.flags.dev_mode or (not sys.flags.ignore_environment and bool(os.environ.get(DEV_DEBUG_VAR))) +# MYPY is like t.TYPE_CHECKING, but reserved for Mypy plugins +MYPY = False +SHOW_CODEGEN: bool = DEBUG and int(os.environ.get("OPENLLMDEVDEBUG", str(0))) > 3 + +def get_debug_mode() -> bool: return DEBUG or _get_debug_mode() +def get_quiet_mode() -> bool: return not DEBUG and _get_quiet_mode() + +class ExceptionFilter(logging.Filter): + def __init__(self, exclude_exceptions: list[type[Exception]] | None = None, **kwargs: t.Any): + """A logging filter that excludes the configured exception types.""" + if exclude_exceptions is None: exclude_exceptions = [ConflictError] + if ConflictError not in exclude_exceptions: exclude_exceptions.append(ConflictError) + super(ExceptionFilter, self).__init__(**kwargs) + self.EXCLUDE_EXCEPTIONS = exclude_exceptions + + def filter(self, record: logging.LogRecord) -> bool: + if record.exc_info: + etype, _, _ = record.exc_info + if etype is not None: + for exc in self.EXCLUDE_EXCEPTIONS: + if issubclass(etype, exc): return False + return True + +class InfoFilter(logging.Filter): + def filter(self, record: logging.LogRecord) -> bool: return logging.INFO <= record.levelno < logging.WARNING + +_LOGGING_CONFIG: dict[str, t.Any] = { + "version": 1, "disable_existing_loggers": True, + "filters": {"excfilter": {"()": "openllm_core.utils.ExceptionFilter"}, "infofilter": {"()": "openllm_core.utils.InfoFilter"}}, + "handlers": {"bentomlhandler": {"class": "logging.StreamHandler", "filters": ["excfilter", "infofilter"], "stream": "ext://sys.stdout"}, "defaulthandler": {"class": "logging.StreamHandler", "level":
logging.WARNING}}, + "loggers": {"bentoml": {"handlers": ["bentomlhandler", "defaulthandler"], "level": logging.INFO, "propagate": False}, "openllm": {"handlers": ["bentomlhandler", "defaulthandler"], "level": logging.INFO, "propagate": False}}, + "root": {"level": logging.WARNING}, +} + +def configure_logging() -> None: + """Configure logging for OpenLLM. + + Behaves similar to how BentoML loggers are being configured. + """ + if get_quiet_mode(): + _LOGGING_CONFIG["loggers"]["openllm"]["level"] = logging.ERROR + _LOGGING_CONFIG["loggers"]["bentoml"]["level"] = logging.ERROR + _LOGGING_CONFIG["root"]["level"] = logging.ERROR + elif get_debug_mode() or DEBUG: + _LOGGING_CONFIG["loggers"]["openllm"]["level"] = logging.DEBUG + _LOGGING_CONFIG["loggers"]["bentoml"]["level"] = logging.DEBUG + _LOGGING_CONFIG["root"]["level"] = logging.DEBUG + else: + _LOGGING_CONFIG["loggers"]["openllm"]["level"] = logging.INFO + _LOGGING_CONFIG["loggers"]["bentoml"]["level"] = logging.INFO + _LOGGING_CONFIG["root"]["level"] = logging.INFO + + logging.config.dictConfig(_LOGGING_CONFIG) + +@functools.lru_cache(maxsize=1) +def in_notebook() -> bool: + try: + from IPython.core.getipython import get_ipython + if t.TYPE_CHECKING: + from IPython.core.interactiveshell import InteractiveShell + return "IPKernelApp" in t.cast("dict[str, t.Any]", t.cast(t.Callable[[], "InteractiveShell"], get_ipython)().config) + except (ImportError, AttributeError): return False + +_dockerenv, _cgroup = Path("/.dockerenv"), Path("/proc/self/cgroup") + +class suppress(contextlib.suppress, contextlib.ContextDecorator): + """A version of contextlib.suppress with decorator support. + + >>> @suppress(KeyError) + ... def key_error(): + ... {}[''] + >>> key_error() + """ + +def compose(*funcs: AnyCallable) -> AnyCallable: + """Compose any number of unary functions into a single unary function. + + >>> import textwrap + >>> expected = str.strip(textwrap.dedent(compose.__doc__)) + >>> strip_and_dedent = compose(str.strip, textwrap.dedent) + >>> strip_and_dedent(compose.__doc__) == expected + True + + Compose also allows the innermost function to take arbitrary arguments. + + >>> round_three = lambda x: round(x, ndigits=3) + >>> f = compose(round_three, int.__truediv__) + >>> [f(3*x, x+1) for x in range(1,10)] + [1.5, 2.0, 2.25, 2.4, 2.5, 2.571, 2.625, 2.667, 2.7] + """ + def compose_two(f1: AnyCallable, f2: AnyCallable) -> AnyCallable: return lambda *args, **kwargs: f1(f2(*args, **kwargs)) + return functools.reduce(compose_two, funcs) + +def apply(transform: AnyCallable) -> t.Callable[[AnyCallable], AnyCallable]: + """Decorate a function with a transform function that is invoked on results returned from the decorated function. + + ```python + @apply(reversed) + def get_numbers(start): + "doc for get_numbers" + return range(start, start+3) + list(get_numbers(4)) + # [6, 5, 4] + ``` + ```python + get_numbers.__doc__ + # 'doc for get_numbers' + ``` + """ + return lambda func: functools.wraps(func)(compose(transform, func)) + +@apply(bool) +@suppress(FileNotFoundError) +def _text_in_file(text: str, filename: Path) -> bool: + return any(text in line for line in filename.open()) + +def in_docker() -> bool: + """Is this current environment running in docker? 
+ + ```python + type(in_docker()) + ``` + """ + return _dockerenv.exists() or _text_in_file("docker", _cgroup) + +T, K = t.TypeVar("T"), t.TypeVar("K") + +def resolve_filepath(path: str, ctx: str | None = None) -> str: + """Resolve a file path to an absolute path, expanding user and environment variables.""" + try: return resolve_user_filepath(path, ctx) + except FileNotFoundError: return path + +def validate_is_path(maybe_path: str) -> bool: return os.path.exists(os.path.dirname(resolve_filepath(maybe_path))) + +def generate_context(framework_name: str) -> _ModelContext: + framework_versions = {"transformers": pkg.get_pkg_version("transformers")} + if openllm_core.utils.is_torch_available(): framework_versions["torch"] = pkg.get_pkg_version("torch") + if openllm_core.utils.is_tf_available(): + from bentoml._internal.frameworks.utils.tensorflow import get_tf_version + framework_versions["tensorflow"] = get_tf_version() + if openllm_core.utils.is_flax_available(): framework_versions.update({"flax": pkg.get_pkg_version("flax"), "jax": pkg.get_pkg_version("jax"), "jaxlib": pkg.get_pkg_version("jaxlib")}) + return _ModelContext(framework_name=framework_name, framework_versions=framework_versions) + +_TOKENIZER_PREFIX = "_tokenizer_" + +def normalize_attrs_to_model_tokenizer_pair(**attrs: t.Any) -> tuple[dict[str, t.Any], dict[str, t.Any]]: + """Normalize the given attrs into model and tokenizer kwargs accordingly.""" + tokenizer_attrs = {k[len(_TOKENIZER_PREFIX):]: v for k, v in attrs.items() if k.startswith(_TOKENIZER_PREFIX)} + for k in tuple(attrs.keys()): + if k.startswith(_TOKENIZER_PREFIX): del attrs[k] + return attrs, tokenizer_attrs + +# NOTE: This set contains the names of modules +# that are available above and are whitelisted +# to be included in the extra_objects map. +_whitelist_modules = {"pkg"} + +# XXX: define all classes and functions imported above this line, +# since _extras will be built from this file's locals(). +_extras: dict[str, t.Any] = {k: v for k, v in locals().items() if k in _whitelist_modules or (not isinstance(v, types.ModuleType) and not k.startswith("_"))} +_extras["__openllm_migration__"] = {"ModelEnv": "EnvVarMixin"} +_import_structure: dict[str, list[str]] = { + "analytics": [], "codegen": [], "dantic": [], "representation": ["ReprMixin"], "lazy": ["LazyModule"], + "import_utils": ["OPTIONAL_DEPENDENCIES", "ENV_VARS_TRUE_VALUES", "DummyMetaclass", "EnvVarMixin", "require_backends", + "is_cpm_kernels_available", "is_einops_available", "is_flax_available", "is_tf_available", "is_vllm_available", "is_torch_available", "is_bitsandbytes_available", "is_peft_available", "is_datasets_available", + "is_transformers_supports_kbit", "is_transformers_supports_agent", "is_jupyter_available", "is_jupytext_available", "is_notebook_available", "is_triton_available", "is_autogptq_available", "is_sentencepiece_available", + "is_xformers_available", "is_fairscale_available", "is_grpc_available", "is_grpc_health_available", "is_transformers_available"]} + +if t.TYPE_CHECKING: + # NOTE: The following exports useful utils from bentoml + from .
import ( + analytics as analytics, + codegen as codegen, + dantic as dantic, + ) + from .import_utils import ( + ENV_VARS_TRUE_VALUES as ENV_VARS_TRUE_VALUES, + OPTIONAL_DEPENDENCIES as OPTIONAL_DEPENDENCIES, + DummyMetaclass as DummyMetaclass, + EnvVarMixin as EnvVarMixin, + is_autogptq_available as is_autogptq_available, + is_bitsandbytes_available as is_bitsandbytes_available, + is_cpm_kernels_available as is_cpm_kernels_available, + is_datasets_available as is_datasets_available, + is_einops_available as is_einops_available, + is_fairscale_available as is_fairscale_available, + is_flax_available as is_flax_available, + is_jupyter_available as is_jupyter_available, + is_jupytext_available as is_jupytext_available, + is_notebook_available as is_notebook_available, + is_peft_available as is_peft_available, + is_sentencepiece_available as is_sentencepiece_available, + is_tf_available as is_tf_available, + is_torch_available as is_torch_available, + is_transformers_supports_agent as is_transformers_supports_agent, + is_transformers_supports_kbit as is_transformers_supports_kbit, + is_triton_available as is_triton_available, + is_vllm_available as is_vllm_available, + is_xformers_available as is_xformers_available, + is_grpc_available as is_grpc_available, + is_grpc_health_available as is_grpc_health_available, + is_transformers_available as is_transformers_available, + require_backends as require_backends, + ) + from .representation import ReprMixin as ReprMixin + +__lazy = LazyModule(__name__, globals()["__file__"], _import_structure, extra_objects=_extras) +__all__ = __lazy.__all__ +__dir__ = __lazy.__dir__ +__getattr__ = __lazy.__getattr__ diff --git a/openllm-python/src/openllm/utils/analytics.py b/openllm-core/src/openllm_core/utils/analytics.py similarity index 83% rename from openllm-python/src/openllm/utils/analytics.py rename to openllm-core/src/openllm_core/utils/analytics.py index e19c0872..3d861a4c 100644 --- a/openllm-python/src/openllm/utils/analytics.py +++ b/openllm-core/src/openllm_core/utils/analytics.py @@ -3,10 +3,9 @@ Users can disable this with OPENLLM_DO_NOT_TRACK envvar. 
""" from __future__ import annotations -import contextlib, functools, logging, os, re, typing as t, importlib.metadata -import attr, openllm +import contextlib, functools, logging, os, re, typing as t, importlib.metadata, attr, openllm_core from bentoml._internal.utils import analytics as _internal_analytics -from openllm._typing_compat import ParamSpec +from openllm_core._typing_compat import ParamSpec P = ParamSpec("P") T = t.TypeVar("T") @@ -17,7 +16,7 @@ OPENLLM_DO_NOT_TRACK = "OPENLLM_DO_NOT_TRACK" DO_NOT_TRACK = os.environ.get(OPENLLM_DO_NOT_TRACK, str(False)).upper() @functools.lru_cache(maxsize=1) -def do_not_track() -> bool: return DO_NOT_TRACK in openllm.utils.ENV_VARS_TRUE_VALUES +def do_not_track() -> bool: return DO_NOT_TRACK in openllm_core.utils.ENV_VARS_TRUE_VALUES @functools.lru_cache(maxsize=1) def _usage_event_debugging() -> bool: return os.environ.get("__BENTOML_DEBUG_USAGE", str(False)).lower() == "true" @@ -27,7 +26,7 @@ def silent(func: t.Callable[P, T]) -> t.Callable[P, T]: try: return func(*args, **kwargs) except Exception as err: if _usage_event_debugging(): - if openllm.utils.get_debug_mode(): logger.error("Tracking Error: %s", err, stack_info=True, stacklevel=3) + if openllm_core.utils.get_debug_mode(): logger.error("Tracking Error: %s", err, stack_info=True, stacklevel=3) else: logger.info("Tracking Error: %s", err) else: logger.debug("Tracking Error: %s", err) return wrapper @@ -73,8 +72,8 @@ class StartInitEvent(EventMeta): model_name: str llm_config: t.Dict[str, t.Any] = attr.field(default=None) @staticmethod - def handler(llm_config: openllm.LLMConfig) -> StartInitEvent: return StartInitEvent(model_name=llm_config["model_name"], llm_config=llm_config.model_dump()) + def handler(llm_config: openllm_core.LLMConfig) -> StartInitEvent: return StartInitEvent(model_name=llm_config["model_name"], llm_config=llm_config.model_dump()) -def track_start_init(llm_config: openllm.LLMConfig) -> None: +def track_start_init(llm_config: openllm_core.LLMConfig) -> None: if do_not_track(): return track(StartInitEvent.handler(llm_config)) diff --git a/openllm-python/src/openllm/utils/codegen.py b/openllm-core/src/openllm_core/utils/codegen.py similarity index 68% rename from openllm-python/src/openllm/utils/codegen.py rename to openllm-core/src/openllm_core/utils/codegen.py index 994e8818..8cd3e630 100644 --- a/openllm-python/src/openllm/utils/codegen.py +++ b/openllm-core/src/openllm_core/utils/codegen.py @@ -1,49 +1,14 @@ from __future__ import annotations -import functools, inspect, linecache, os, logging, string, types, typing as t +import functools, inspect, linecache, logging, types, typing as t, orjson from operator import itemgetter -from pathlib import Path -import orjson if t.TYPE_CHECKING: - from fs.base import FS - - import openllm - from openllm._typing_compat import LiteralString, AnyCallable, DictStrAny, ListStr + import openllm_core + from openllm_core._typing_compat import LiteralString, AnyCallable, DictStrAny, ListStr PartialAny = functools.partial[t.Any] _T = t.TypeVar("_T", bound=t.Callable[..., t.Any]) logger = logging.getLogger(__name__) -OPENLLM_MODEL_NAME = "# openllm: model name" -OPENLLM_MODEL_ADAPTER_MAP = "# openllm: model adapter map" -class ModelNameFormatter(string.Formatter): - model_keyword: LiteralString = "__model_name__" - def __init__(self, model_name: str): - """The formatter that extends model_name to be formatted the 'service.py'.""" - super().__init__() - self.model_name = model_name - def vformat(self, format_string: str, *args: t.Any, 
**attrs: t.Any) -> t.Any: return super().vformat(format_string, (), {self.model_keyword: self.model_name}) - def can_format(self, value: str) -> bool: - try: - self.parse(value) - return True - except ValueError: return False -class ModelIdFormatter(ModelNameFormatter): - model_keyword: LiteralString = "__model_id__" -class ModelAdapterMapFormatter(ModelNameFormatter): - model_keyword: LiteralString = "__model_adapter_map__" - -_service_file = Path(os.path.abspath(__file__)).parent.parent/"_service.py" -def write_service(llm: openllm.LLM[t.Any, t.Any], adapter_map: dict[str, str | None] | None, llm_fs: FS) -> None: - from openllm.utils import DEBUG - model_name = llm.config["model_name"] - logger.debug("Generating service file for %s at %s (dir=%s)", model_name, llm.config["service_name"], llm_fs.getsyspath("/")) - with open(_service_file.__fspath__(), "r") as f: src_contents = f.readlines() - for it in src_contents: - if OPENLLM_MODEL_NAME in it: src_contents[src_contents.index(it)] = (ModelNameFormatter(model_name).vformat(it)[:-(len(OPENLLM_MODEL_NAME) + 3)] + "\n") - elif OPENLLM_MODEL_ADAPTER_MAP in it: src_contents[src_contents.index(it)] = (ModelAdapterMapFormatter(orjson.dumps(adapter_map).decode()).vformat(it)[:-(len(OPENLLM_MODEL_ADAPTER_MAP) + 3)] + "\n") - script = f"# GENERATED BY 'openllm build {model_name}'. DO NOT EDIT\n\n" + "".join(src_contents) - if DEBUG: logger.info("Generated script:\n%s", script) - llm_fs.writetext(llm.config["service_name"], script) # sentinel object for unequivocal object() getattr _sentinel = object() @@ -72,7 +37,7 @@ def add_method_dunders(cls: type[t.Any], method_or_cls: _T, _overwrite_doc: str try: method_or_cls.__doc__ = _overwrite_doc or "Generated by ``openllm.LLMConfig`` for class " f"{cls.__qualname__}." except AttributeError: pass return method_or_cls -def _compile_and_eval(script: str, globs: DictStrAny, locs: t.Any = None, filename: str = "") -> None: eval(compile(script, filename, "exec"), globs, locs) # noqa: S307 +def _compile_and_eval(script: str, globs: DictStrAny, locs: t.Any = None, filename: str = "") -> None: eval(compile(script, filename, "exec"), globs, locs) def _make_method(name: str, script: str, filename: str, globs: DictStrAny) -> AnyCallable: locs: DictStrAny = {} # In order of debuggers like PDB being able to step through the code, we add a fake linecache entry. @@ -111,15 +76,15 @@ def make_attr_tuple_class(cls_name: str, attr_names: t.Sequence[str]) -> type[t. def generate_unique_filename(cls: type[t.Any], func_name: str) -> str: return f"<{cls.__name__} generated {func_name} {cls.__module__}.{getattr(cls, '__qualname__', cls.__name__)}>" def generate_function(typ: type[t.Any], func_name: str, lines: list[str] | None, args: tuple[str, ...] 
| None, globs: dict[str, t.Any], annotations: dict[str, t.Any] | None = None) -> AnyCallable: - from openllm.utils import SHOW_CODEGEN + from openllm_core.utils import SHOW_CODEGEN script = "def %s(%s):\n %s\n" % (func_name, ", ".join(args) if args is not None else "", "\n ".join(lines) if lines else "pass") meth = _make_method(func_name, script, generate_unique_filename(typ, func_name), globs) if annotations: meth.__annotations__ = annotations if SHOW_CODEGEN: logger.info("Generated script for %s:\n\n%s", typ, script) return meth -def make_env_transformer(cls: type[openllm.LLMConfig], model_name: str, suffix: LiteralString | None = None, default_callback: t.Callable[[str, t.Any], t.Any] | None = None, globs: DictStrAny | None = None,) -> AnyCallable: - from openllm.utils import dantic, field_env_key +def make_env_transformer(cls: type[openllm_core.LLMConfig], model_name: str, suffix: LiteralString | None = None, default_callback: t.Callable[[str, t.Any], t.Any] | None = None, globs: DictStrAny | None = None,) -> AnyCallable: + from openllm_core.utils import dantic, field_env_key def identity(_: str, x_value: t.Any) -> t.Any: return x_value default_callback = identity if default_callback is None else default_callback globs = {} if globs is None else globs @@ -129,13 +94,13 @@ def make_env_transformer(cls: type[openllm.LLMConfig], model_name: str, suffix: return generate_function(cls, "__auto_env", lines, args=("_", "fields"), globs=globs, annotations={"_": "type[LLMConfig]", "fields": fields_ann, "return": fields_ann}) def gen_sdk(func: _T, name: str | None = None, **attrs: t.Any) -> _T: """Enhance sdk with nice repr that plays well with your brain.""" - from openllm.utils import ReprMixin + from openllm_core.utils import ReprMixin if name is None: name = func.__name__.strip("_") _signatures = inspect.signature(func).parameters def _repr(self: ReprMixin) -> str: return f"" def _repr_args(self: ReprMixin) -> t.Iterator[t.Tuple[str, t.Any]]: return ((k, _signatures[k].annotation) for k in self.__repr_keys__) if func.__doc__ is None: doc = f"Generated SDK for {func.__name__}" else: doc = func.__doc__ - return t.cast(_T, functools.update_wrapper(types.new_class(name, (t.cast("PartialAny", functools.partial), ReprMixin), exec_body=lambda ns: ns.update({"__repr_keys__": property(lambda _: [i for i in _signatures.keys() if not i.startswith("_")]), "__repr_args__": _repr_args, "__repr__": _repr, "__doc__": inspect.cleandoc(doc), "__module__": "openllm",}),)(func, **attrs), func,)) + return t.cast(_T, functools.update_wrapper(types.new_class(name, (t.cast("PartialAny", functools.partial), ReprMixin), exec_body=lambda ns: ns.update({"__repr_keys__": property(lambda _: [i for i in _signatures.keys() if not i.startswith("_")]), "__repr_args__": _repr_args, "__repr__": _repr, "__doc__": inspect.cleandoc(doc), "__module__": "openllm"}),)(func, **attrs), func,)) -__all__ = ["gen_sdk", "make_attr_tuple_class", "make_env_transformer", "generate_unique_filename", "generate_function", "OPENLLM_MODEL_NAME", "OPENLLM_MODEL_ADAPTER_MAP"] +__all__ = ["gen_sdk", "make_attr_tuple_class", "make_env_transformer", "generate_unique_filename", "generate_function"] diff --git a/openllm-python/src/openllm/utils/dantic.py b/openllm-core/src/openllm_core/utils/dantic.py similarity index 99% rename from openllm-python/src/openllm/utils/dantic.py rename to openllm-core/src/openllm_core/utils/dantic.py index 960620c3..97d320b5 100644 --- a/openllm-python/src/openllm/utils/dantic.py +++ 
b/openllm-core/src/openllm_core/utils/dantic.py @@ -335,7 +335,6 @@ class CudaValueType(ParamType): name = "cuda" envvar_list_splitter = "," is_composite = True - typ = click_types.convert_type(str) def split_envvar_value(self, rv: str) -> t.Sequence[str]: var = tuple(i for i in rv.split(self.envvar_list_splitter)) @@ -352,10 +351,11 @@ class CudaValueType(ParamType): param: The parameter that is requesting completion. incomplete: Value being completed. May be empty. """ - from openllm.utils import available_devices + from openllm_core.utils import available_devices mapping = incomplete.split(self.envvar_list_splitter) if incomplete else available_devices() return [sc.CompletionItem(str(i), help=f"CUDA device index {i}") for i in mapping] def convert(self, value: t.Any, param: click.Parameter | None, ctx: click.Context | None) -> t.Any: + typ = click_types.convert_type(str) if isinstance(value, bytes): enc = _get_argv_encoding() try: value = value.decode(enc) @@ -365,7 +365,7 @@ class CudaValueType(ParamType): try: value = value.decode(fs_enc) except UnicodeError: value = value.decode("utf-8", "replace") else: value = value.decode("utf-8", "replace") - return tuple(self.typ(x, param, ctx) for x in value.split(",")) + return tuple(typ(x, param, ctx) for x in value.split(",")) def __repr__(self) -> str: return "STRING" diff --git a/openllm-python/src/openllm/utils/import_utils.py b/openllm-core/src/openllm_core/utils/import_utils.py similarity index 95% rename from openllm-python/src/openllm/utils/import_utils.py rename to openllm-core/src/openllm_core/utils/import_utils.py index 90b148ca..6179dbd5 100644 --- a/openllm-python/src/openllm/utils/import_utils.py +++ b/openllm-core/src/openllm_core/utils/import_utils.py @@ -1,19 +1,19 @@ """Some imports utils are vendorred from transformers/utils/import_utils.py for performance reasons.""" from __future__ import annotations -import importlib, importlib.metadata, importlib.util, logging, os, abc, typing as t +import importlib, importlib.metadata, importlib.util, logging, os, abc, typing as t, openllm_core from collections import OrderedDict import inflection, packaging.version from bentoml._internal.utils import LazyLoader, pkg -from openllm._typing_compat import overload, LiteralString +from openllm_core._typing_compat import overload, LiteralString from .representation import ReprMixin if t.TYPE_CHECKING: BackendOrderedDict = OrderedDict[str, t.Tuple[t.Callable[[], bool], str]] - from openllm._typing_compat import LiteralRuntime + from openllm_core._typing_compat import LiteralRuntime logger = logging.getLogger(__name__) -OPTIONAL_DEPENDENCIES = {"opt", "flan-t5", "vllm", "fine-tune", "ggml", "agents", "openai", "playground", "gptq",} +OPTIONAL_DEPENDENCIES = {"opt", "flan-t5", "vllm", "fine-tune", "ggml", "agents", "openai", "playground", "gptq"} ENV_VARS_TRUE_VALUES = {"1", "ON", "YES", "TRUE"} ENV_VARS_TRUE_AND_AUTO_VALUES = ENV_VARS_TRUE_VALUES.union({"AUTO"}) USE_TF = os.environ.get("USE_TF", "AUTO").upper() @@ -32,6 +32,9 @@ _torch_available = importlib.util.find_spec("torch") is not None _tf_available = importlib.util.find_spec("tensorflow") is not None _flax_available = importlib.util.find_spec("jax") is not None and importlib.util.find_spec("flax") is not None _vllm_available = importlib.util.find_spec("vllm") is not None +_transformers_available = _is_package_available("transformers") +_grpc_available = importlib.util.find_spec("grpc") is not None +_grpc_health_available = importlib.util.find_spec("grpc_health") is not None 
_peft_available = _is_package_available("peft") _einops_available = _is_package_available("einops") _cpm_kernel_available = _is_package_available("cpm_kernels") @@ -46,6 +49,9 @@ _sentencepiece_available = _is_package_available("sentencepiece") _xformers_available = _is_package_available("xformers") _fairscale_available = _is_package_available("fairscale") +def is_transformers_available() -> bool: return _transformers_available +def is_grpc_available() -> bool: return _grpc_available +def is_grpc_health_available() -> bool: return _grpc_health_available def is_transformers_supports_kbit() -> bool: return pkg.pkg_version_info("transformers")[:2] >= (4, 30) def is_transformers_supports_agent() -> bool: return pkg.pkg_version_info("transformers")[:2] >= (4, 29) def is_jupyter_available() -> bool: return _jupyter_available @@ -86,7 +92,7 @@ def is_tf_available() -> bool: try: _tf_version = importlib.metadata.version(_pkg) break - except importlib.metadata.PackageNotFoundError: pass # noqa: PERF203 # Ok to ignore here since we actually need to check for all possible tensorflow distribution. + except importlib.metadata.PackageNotFoundError: pass # Ok to ignore here since we actually need to check for all possible tensorflow distribution. _tf_available = _tf_version is not None if _tf_available: if _tf_version and packaging.version.parse(_tf_version) < packaging.version.parse("2"): @@ -281,7 +287,7 @@ class EnvVarMixin(ReprMixin): raise KeyError(f"Key {item} not found in {self}") def __init__(self, model_name: str, implementation: LiteralRuntime = "pt", model_id: str | None = None, bettertransformer: bool | None = None, quantize: LiteralString | None = None, runtime: t.Literal["ggml", "transformers"] = "transformers") -> None: """EnvVarMixin is a mixin class that returns the value extracted from environment variables.""" - from openllm._configuration import field_env_key + from openllm_core.utils import field_env_key self.model_name = inflection.underscore(model_name) self._implementation = implementation self._model_id = model_id @@ -307,6 +313,6 @@ class EnvVarMixin(ReprMixin): @property def __repr_keys__(self) -> set[str]: return {"config", "model_id", "quantize", "framework", "bettertransformer", "runtime"} @property - def start_docstring(self) -> str: return getattr(self.module, f"START_{self.model_name.upper()}_COMMAND_DOCSTRING") + def start_docstring(self) -> str: return getattr(openllm_core.config, f"START_{self.model_name.upper()}_COMMAND_DOCSTRING") @property def module(self) -> LazyLoader: return LazyLoader(self.model_name, globals(), f"openllm.models.{self.model_name}") diff --git a/openllm-python/src/openllm/utils/lazy.py b/openllm-core/src/openllm_core/utils/lazy.py similarity index 96% rename from openllm-python/src/openllm/utils/lazy.py rename to openllm-core/src/openllm_core/utils/lazy.py index da8c46dd..ab1dcc41 100644 --- a/openllm-python/src/openllm/utils/lazy.py +++ b/openllm-core/src/openllm_core/utils/lazy.py @@ -1,6 +1,5 @@ from __future__ import annotations -import functools, importlib, importlib.machinery, importlib.metadata, importlib.util, itertools, os, time, types, warnings, typing as t -import attr, openllm +import functools, importlib, importlib.machinery, importlib.metadata, importlib.util, itertools, os, time, types, warnings, typing as t, attr, openllm_core __all__ = ["VersionInfo", "LazyModule"] # vendorred from attrs @@ -75,7 +74,7 @@ class LazyModule(types.ModuleType): It also contains a special case for all of the metadata information, such as __version__ 
and __version_info__. """ - if name in _reserved_namespace: raise openllm.exceptions.ForbiddenAttributeError(f"'{name}' is a reserved namespace for {self._name} and should not be access nor modified.") + if name in _reserved_namespace: raise openllm_core.exceptions.ForbiddenAttributeError(f"'{name}' is a reserved namespace for {self._name} and should not be accessed nor modified.") dunder_to_metadata = {"__title__": "Name", "__copyright__": "", "__version__": "version", "__version_info__": "version", "__description__": "summary", "__uri__": "", "__url__": "", "__author__": "", "__email__": "", "__license__": "license", "__homepage__": ""} if name in dunder_to_metadata: if name not in {"__version_info__", "__copyright__", "__version__"}: warnings.warn(f"Accessing '{self._name}.{name}' is deprecated. Please consider using 'importlib.metadata' directly to query for openllm packaging metadata.", DeprecationWarning, stacklevel=2) diff --git a/openllm-python/src/openllm/utils/representation.py b/openllm-core/src/openllm_core/utils/representation.py similarity index 91% rename from openllm-python/src/openllm/utils/representation.py rename to openllm-core/src/openllm_core/utils/representation.py index f644b77f..b7f33eb1 100644 --- a/openllm-python/src/openllm/utils/representation.py +++ b/openllm-core/src/openllm_core/utils/representation.py @@ -1,9 +1,8 @@ from __future__ import annotations -import typing as t from abc import abstractmethod -import attr, orjson -from openllm import utils -if t.TYPE_CHECKING: from openllm._typing_compat import TypeAlias +import attr, orjson, typing as t +from openllm_core import utils +if t.TYPE_CHECKING: from openllm_core._typing_compat import TypeAlias ReprArgs: TypeAlias = t.Generator[t.Tuple[t.Optional[str], t.Any], None, None] class ReprMixin: diff --git a/openllm-python/ADDING_NEW_MODEL.md b/openllm-python/ADDING_NEW_MODEL.md index fb6c9bd2..426cea75 100644 --- a/openllm-python/ADDING_NEW_MODEL.md +++ b/openllm-python/ADDING_NEW_MODEL.md @@ -8,21 +8,24 @@ environment by referring to our ## Procedure All the relevant code for incorporating a new model resides within -[`src/openllm/models`](./src/openllm/models/) `model_name` in snake_case. +[`$GIT_ROOT/openllm-python/src/openllm/models`](./src/openllm/models/) `model_name` in snake_case. Here's your roadmap: - [ ] Generate model configuration file: - `src/openllm/models/{model_name}/configuration_{model_name}.py` + `$GIT_ROOT/openllm-core/src/openllm_core/config/configuration_{model_name}.py` - [ ] Establish model implementation files: - `src/openllm/models/{model_name}/modeling_{runtime}_{model_name}.py` + `$GIT_ROOT/openllm-python/src/openllm/models/{model_name}/modeling_{runtime}_{model_name}.py` - [ ] Create module's `__init__.py`: - `src/openllm/models/{model_name}/__init__.py` -- [ ] Adjust the entrypoints for files at `src/openllm/models/auto/*` If it is a - new runtime, then add it a `src/openllm/models/auto/modeling_{runtime}_auto.py`. + `$GIT_ROOT/openllm-python/src/openllm/models/{model_name}/__init__.py` +- [ ] Adjust the entrypoints for files at `$GIT_ROOT/openllm-python/src/openllm/models/auto/*`. If it is a + new runtime, then add it at `$GIT_ROOT/openllm-python/src/openllm/models/auto/modeling_{runtime}_auto.py`. See the other auto runtime for example.
-- [ ] Modify the main `__init__.py`: `src/openllm/models/__init__.py` +- [ ] Run the following script: `$GIT_ROOT/tools/update-models-import.py` - [ ] Run the following to update stubs: `hatch run check-stubs` +> [!NOTE] +> `$GIT_ROOT` refers to `$(git rev-parse --show-toplevel)` + For a working example, check out any existing model. ### Model Configuration diff --git a/openllm-python/pyproject.toml b/openllm-python/pyproject.toml index 6451bc64..4e161c9d 100644 --- a/openllm-python/pyproject.toml +++ b/openllm-python/pyproject.toml @@ -5,7 +5,6 @@ requires = [ "hatchling==1.18.0", "hatch-vcs==0.3.0", "hatch-fancy-pypi-readme==23.1.0", - "hatch-mypyc==0.16.0", ] [project] @@ -39,20 +38,14 @@ classifiers = [ "Programming Language :: Python :: Implementation :: PyPy", ] dependencies = [ - "bentoml[grpc,io]>=1.0.25", + "bentoml[grpc,io]>=1.1.2", "transformers[torch,tokenizers,accelerate]>=4.29.0", + "openllm-client", "safetensors", "optimum", - "attrs>=23.1.0", - "cattrs>=23.1.0", - "orjson", - "inflection", - "tabulate[widechars]>=0.9.0", - "httpx", - "click>=8.1.3", - "typing_extensions", - "mypy_extensions", "ghapi", + "tabulate[widechars]>=0.9.0", + "click>=8.1.3", "cuda-python;platform_system!=\"Darwin\"", "bitsandbytes<0.42", ] @@ -102,34 +95,21 @@ Twitter = "https://twitter.com/bentomlai" [project.optional-dependencies] agents = ["transformers[agents]>=4.30", "diffusers", "soundfile"] -all = [ - "openllm[agents]", - "openllm[baichuan]", - "openllm[chatglm]", - "openllm[falcon]", - "openllm[fine-tune]", - "openllm[flan-t5]", - "openllm[ggml]", - "openllm[gptq]", - "openllm[llama]", - "openllm[mpt]", - "openllm[openai]", - "openllm[opt]", - "openllm[playground]", - "openllm[starcoder]", - "openllm[vllm]", -] +all = ["openllm[full]"] baichuan = ["cpm-kernels", "sentencepiece"] chatglm = ["cpm-kernels", "sentencepiece"] falcon = ["einops", "xformers"] fine-tune = ["peft>=0.4.0", "bitsandbytes", "datasets", "accelerate", "trl"] -flan-t5 = ["flax", "jax", "jaxlib", "tensorflow", "keras"] +flan-t5 = ["flax>=0.7", "jax", "jaxlib", "tensorflow", "keras"] +full = [ + "openllm[agents,baichuan,chatglm,falcon,fine-tune,flan-t5,ggml,gptq,llama,mpt,openai,opt,playground,starcoder,vllm]", +] ggml = ["ctransformers"] gptq = ["auto-gptq[triton]"] llama = ["fairscale", "sentencepiece"] mpt = ["triton", "einops"] openai = ["openai", "tiktoken"] -opt = ["flax", "jax", "jaxlib", "tensorflow", "keras"] +opt = ["flax>=0.7", "jax", "jaxlib", "tensorflow", "keras"] playground = ["jupyter", "notebook", "ipython", "jupytext", "nbformat"] starcoder = ["bitsandbytes"] vllm = ["vllm", "ray"] @@ -165,10 +145,10 @@ exclude = [ [tool.hatch.build.targets.wheel.hooks.mypyc] dependencies = [ "hatch-mypyc==0.16.0", - "mypy==1.4.1", + "mypy==1.5.1", # avoid https://github.com/pallets/click/issues/2558 "click==8.1.3", - "bentoml==1.1.1", + "bentoml==1.1.2", "transformers>=4.31.0", "pandas-stubs", "types-psutil", @@ -177,19 +157,14 @@ dependencies = [ "types-protobuf", ] enable-by-default = false -exclude = ["src/openllm/_service.py", "src/openllm/_typing_compat.py"] +exclude = ["src/openllm/_service.py", "src/openllm/utils/__init__.py"] include = [ - "src/openllm/models/__init__.py", - "src/openllm/models/auto/__init__.py", - "src/openllm/utils/__init__.py", "src/openllm/__init__.py", - "src/openllm/_prompt.py", - "src/openllm/_schema.py", "src/openllm/_quantisation.py", "src/openllm/_generation.py", - "src/openllm/_strategies.py", "src/openllm/exceptions.py", "src/openllm/testing.py", + "src/openllm/utils", ] # NOTE: This 
is consistent with pyproject.toml mypy-args = [ @@ -204,7 +179,6 @@ mypy-args = [ "--no-warn-no-return", "--no-warn-unused-ignores", "--exclude='/src\\/openllm\\/playground\\/**'", - "--exclude='/src\\/openllm\\/_typing_compat\\.py$'", ] options = { verbose = true, strip_asserts = true, debug_level = "2", opt_level = "3", include_runtime_files = true } require-runtime-dependencies = true diff --git a/openllm-python/src/openllm/__init__.py b/openllm-python/src/openllm/__init__.py index eb9048d5..4ce32f0b 100644 --- a/openllm-python/src/openllm/__init__.py +++ b/openllm-python/src/openllm/__init__.py @@ -9,13 +9,18 @@ deploy, and monitor any LLMs with ease. * Native integration with BentoML and LangChain for custom LLM apps """ from __future__ import annotations -import logging as _logging, os as _os, typing as _t, warnings as _warnings +import logging as _logging, os as _os, typing as _t, warnings as _warnings, openllm_core from pathlib import Path as _Path from . import exceptions as exceptions, utils as utils -if utils.DEBUG: - utils.set_debug_mode(True) - utils.set_quiet_mode(False) +from openllm_core._configuration import GenerationConfig as GenerationConfig, LLMConfig as LLMConfig, SamplingParams as SamplingParams +from openllm_core._strategies import CascadingResourceStrategy as CascadingResourceStrategy, get_resource as get_resource +from openllm_core._schema import EmbeddingsOutput as EmbeddingsOutput, GenerationInput as GenerationInput, GenerationOutput as GenerationOutput, HfAgentInput as HfAgentInput, MetadataOutput as MetadataOutput, unmarshal_vllm_outputs as unmarshal_vllm_outputs +from openllm_core.config import AutoConfig as AutoConfig, CONFIG_MAPPING as CONFIG_MAPPING, CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES, BaichuanConfig as BaichuanConfig, ChatGLMConfig as ChatGLMConfig, DollyV2Config as DollyV2Config, FalconConfig as FalconConfig, FlanT5Config as FlanT5Config, GPTNeoXConfig as GPTNeoXConfig, LlamaConfig as LlamaConfig, MPTConfig as MPTConfig, OPTConfig as OPTConfig, StableLMConfig as StableLMConfig, StarCoderConfig as StarCoderConfig + +if openllm_core.utils.DEBUG: + openllm_core.utils.set_debug_mode(True) + openllm_core.utils.set_quiet_mode(False) _logging.basicConfig(level=_logging.NOTSET) else: # configuration for bitsandbytes before import @@ -28,40 +33,26 @@ else: _warnings.filterwarnings("ignore", message="Neither GITHUB_TOKEN nor GITHUB_JWT_TOKEN found: running as unauthenticated") _import_structure: dict[str, list[str]] = { - "exceptions": [], "models": [], "client": [], "bundle": [], "playground": [], "testing": [], "utils": ["infer_auto_class"], "serialisation": ["ggml", "transformers"], "cli._sdk": ["start", "start_grpc", "build", "import_model", "list_models"], - "_llm": ["LLM", "Runner", "LLMRunner", "LLMRunnable", "LLMEmbeddings"], "_configuration": ["LLMConfig", "GenerationConfig", "SamplingParams"], "_generation": ["StopSequenceCriteria", "StopOnTokens", "LogitsProcessorList", "StoppingCriteriaList", "prepare_logits_processor"], - "_quantisation": ["infer_quantisation_config"], "_schema": ["GenerationInput", "GenerationOutput", "MetadataOutput", "EmbeddingsOutput", "unmarshal_vllm_outputs", "HfAgentInput"], "_embeddings": ["GenericEmbeddingRunnable"], "_strategies": ["CascadingResourceStrategy", "get_resource"], - "models.auto": ["AutoConfig", "CONFIG_MAPPING", "MODEL_MAPPING_NAMES", "MODEL_FLAX_MAPPING_NAMES", "MODEL_TF_MAPPING_NAMES", "MODEL_VLLM_MAPPING_NAMES"], "models.chatglm": ["ChatGLMConfig"], "models.baichuan": ["BaichuanConfig"], 
"models.dolly_v2": ["DollyV2Config"], "models.falcon": ["FalconConfig"], "models.flan_t5": ["FlanT5Config"], "models.gpt_neox": ["GPTNeoXConfig"], "models.llama": ["LlamaConfig"], "models.mpt": ["MPTConfig"], "models.opt": ["OPTConfig"], "models.stablelm": ["StableLMConfig"], "models.starcoder": ["StarCoderConfig"] + "exceptions": [], "models": [], "client": [], "bundle": [], "playground": [], "testing": [], + "utils": ["infer_auto_class"], "serialisation": ["ggml", "transformers"], "cli._sdk": ["start", "start_grpc", "build", "import_model", "list_models"], "_quantisation": ["infer_quantisation_config"], "_embeddings": ["GenericEmbeddingRunnable"], + "_llm": ["LLM", "Runner", "LLMRunner", "LLMRunnable", "LLMEmbeddings"], "_generation": ["StopSequenceCriteria", "StopOnTokens", "LogitsProcessorList", "StoppingCriteriaList", "prepare_logits_processor"], + "models.auto": ["MODEL_MAPPING_NAMES", "MODEL_FLAX_MAPPING_NAMES", "MODEL_TF_MAPPING_NAMES", "MODEL_VLLM_MAPPING_NAMES"], "models.chatglm": [], "models.baichuan": [], "models.dolly_v2": [], "models.falcon": [], "models.flan_t5": [], "models.gpt_neox": [], "models.llama": [], "models.mpt": [], "models.opt": [], "models.stablelm": [], "models.starcoder": [] } COMPILED = _Path(__file__).suffix in (".pyd", ".so") if _t.TYPE_CHECKING: from . import bundle as bundle, cli as cli, client as client, models as models, playground as playground, serialisation as serialisation, testing as testing - from ._configuration import GenerationConfig as GenerationConfig, LLMConfig as LLMConfig, SamplingParams as SamplingParams from ._generation import LogitsProcessorList as LogitsProcessorList, StopOnTokens as StopOnTokens, StoppingCriteriaList as StoppingCriteriaList, StopSequenceCriteria as StopSequenceCriteria, prepare_logits_processor as prepare_logits_processor from ._llm import LLM as LLM, LLMEmbeddings as LLMEmbeddings, LLMRunnable as LLMRunnable, LLMRunner as LLMRunner, Runner as Runner from ._quantisation import infer_quantisation_config as infer_quantisation_config - from ._schema import EmbeddingsOutput as EmbeddingsOutput, GenerationInput as GenerationInput, GenerationOutput as GenerationOutput, HfAgentInput as HfAgentInput, MetadataOutput as MetadataOutput, unmarshal_vllm_outputs as unmarshal_vllm_outputs from ._embeddings import GenericEmbeddingRunnable as GenericEmbeddingRunnable - from ._strategies import CascadingResourceStrategy as CascadingResourceStrategy, get_resource as get_resource from .cli._sdk import build as build, import_model as import_model, list_models as list_models, start as start, start_grpc as start_grpc - from .models.auto import CONFIG_MAPPING as CONFIG_MAPPING, MODEL_FLAX_MAPPING_NAMES as MODEL_FLAX_MAPPING_NAMES, MODEL_MAPPING_NAMES as MODEL_MAPPING_NAMES, MODEL_TF_MAPPING_NAMES as MODEL_TF_MAPPING_NAMES, MODEL_VLLM_MAPPING_NAMES as MODEL_VLLM_MAPPING_NAMES, AutoConfig as AutoConfig - from .models.baichuan import BaichuanConfig as BaichuanConfig - from .models.chatglm import ChatGLMConfig as ChatGLMConfig - from .models.dolly_v2 import DollyV2Config as DollyV2Config - from .models.falcon import FalconConfig as FalconConfig - from .models.flan_t5 import FlanT5Config as FlanT5Config - from .models.gpt_neox import GPTNeoXConfig as GPTNeoXConfig - from .models.llama import LlamaConfig as LlamaConfig - from .models.mpt import MPTConfig as MPTConfig - from .models.opt import OPTConfig as OPTConfig - from .models.stablelm import StableLMConfig as StableLMConfig - from .models.starcoder import StarCoderConfig as StarCoderConfig + 
from .models.auto import MODEL_FLAX_MAPPING_NAMES as MODEL_FLAX_MAPPING_NAMES, MODEL_MAPPING_NAMES as MODEL_MAPPING_NAMES, MODEL_TF_MAPPING_NAMES as MODEL_TF_MAPPING_NAMES, MODEL_VLLM_MAPPING_NAMES as MODEL_VLLM_MAPPING_NAMES from .serialisation import ggml as ggml, transformers as transformers from .utils import infer_auto_class as infer_auto_class try: - if not (utils.is_torch_available() and utils.is_cpm_kernels_available()): raise exceptions.MissingDependencyError + if not (openllm_core.utils.is_torch_available() and openllm_core.utils.is_cpm_kernels_available()): raise exceptions.MissingDependencyError except exceptions.MissingDependencyError: _import_structure["utils.dummy_pt_objects"] = ["ChatGLM", "Baichuan"] else: @@ -71,7 +62,7 @@ else: from .models.baichuan import Baichuan as Baichuan from .models.chatglm import ChatGLM as ChatGLM try: - if not (utils.is_torch_available() and utils.is_triton_available()): raise exceptions.MissingDependencyError + if not (openllm_core.utils.is_torch_available() and openllm_core.utils.is_triton_available()): raise exceptions.MissingDependencyError except exceptions.MissingDependencyError: if "utils.dummy_pt_objects" in _import_structure: _import_structure["utils.dummy_pt_objects"].extend(["MPT"]) else: _import_structure["utils.dummy_pt_objects"] = ["MPT"] @@ -79,7 +70,7 @@ else: _import_structure["models.mpt"].extend(["MPT"]) if _t.TYPE_CHECKING: from .models.mpt import MPT as MPT try: - if not (utils.is_torch_available() and utils.is_einops_available()): raise exceptions.MissingDependencyError + if not (openllm_core.utils.is_torch_available() and openllm_core.utils.is_einops_available()): raise exceptions.MissingDependencyError except exceptions.MissingDependencyError: if "utils.dummy_pt_objects" in _import_structure: _import_structure["utils.dummy_pt_objects"].extend(["Falcon"]) else: _import_structure["utils.dummy_pt_objects"] = ["Falcon"] @@ -88,7 +79,7 @@ else: if _t.TYPE_CHECKING: from .models.falcon import Falcon as Falcon try: - if not utils.is_torch_available(): raise exceptions.MissingDependencyError + if not openllm_core.utils.is_torch_available(): raise exceptions.MissingDependencyError except exceptions.MissingDependencyError: _import_structure["utils.dummy_pt_objects"] = [name for name in dir(utils.dummy_pt_objects) if not name.startswith("_") and name not in ("ChatGLM", "Baichuan", "MPT", "Falcon", "annotations")] else: @@ -110,7 +101,7 @@ else: from .models.stablelm import StableLM as StableLM from .models.starcoder import StarCoder as StarCoder try: - if not utils.is_vllm_available(): raise exceptions.MissingDependencyError + if not openllm_core.utils.is_vllm_available(): raise exceptions.MissingDependencyError except exceptions.MissingDependencyError: _import_structure["utils.dummy_vllm_objects"] = [name for name in dir(utils.dummy_vllm_objects) if not name.startswith("_") and name not in ("annotations",)] else: @@ -136,7 +127,7 @@ else: from .models.stablelm import VLLMStableLM as VLLMStableLM from .models.starcoder import VLLMStarCoder as VLLMStarCoder try: - if not utils.is_flax_available(): raise exceptions.MissingDependencyError + if not openllm_core.utils.is_flax_available(): raise exceptions.MissingDependencyError except exceptions.MissingDependencyError: _import_structure["utils.dummy_flax_objects"] = [name for name in dir(utils.dummy_flax_objects) if not name.startswith("_") and name not in ("annotations",)] else: @@ -148,7 +139,7 @@ else: from .models.flan_t5 import FlaxFlanT5 as FlaxFlanT5 from .models.opt import 
FlaxOPT as FlaxOPT try: - if not utils.is_tf_available(): raise exceptions.MissingDependencyError + if not openllm_core.utils.is_tf_available(): raise exceptions.MissingDependencyError except exceptions.MissingDependencyError: _import_structure["utils.dummy_tf_objects"] = [name for name in dir(utils.dummy_tf_objects) if not name.startswith("_") and name not in ("annotations",)] else: @@ -161,7 +152,7 @@ else: from .models.opt import TFOPT as TFOPT # NOTE: update this to sys.modules[__name__] once mypy_extensions can recognize __spec__ -__lazy = utils.LazyModule(__name__, _os.path.abspath("__file__"), _import_structure, extra_objects={"COMPILED": COMPILED}) +__lazy = openllm_core.utils.LazyModule(__name__, globals()["__file__"], _import_structure, extra_objects={"COMPILED": COMPILED}) __all__ = __lazy.__all__ __dir__ = __lazy.__dir__ __getattr__ = __lazy.__getattr__ diff --git a/openllm-python/src/openllm/_llm.py b/openllm-python/src/openllm/_llm.py index deb6fd45..54bd3678 100644 --- a/openllm-python/src/openllm/_llm.py +++ b/openllm-python/src/openllm/_llm.py @@ -1,22 +1,21 @@ from __future__ import annotations -import functools, inspect, logging, os, re, traceback, types, typing as t, uuid +import functools, inspect, logging, os, re, traceback, types, typing as t, uuid, attr, fs.path, inflection, orjson, bentoml, openllm, openllm_core, gc from abc import ABC, abstractmethod from pathlib import Path -import attr, fs.path, inflection, orjson, bentoml, openllm, gc from huggingface_hub import hf_hub_download from bentoml._internal.models.model import ModelSignature -from ._configuration import ( +from openllm_core._configuration import ( FineTuneConfig, LLMConfig, _object_getattribute, _setattr_class, ) from ._quantisation import infer_quantisation_config -from ._schema import unmarshal_vllm_outputs +from openllm_core._schema import unmarshal_vllm_outputs from .exceptions import ForbiddenAttributeError, GpuNotAvailableError, OpenLLMException from .models.auto import AutoConfig -from .utils import ( +from openllm_core.utils import ( DEBUG, ENV_VARS_TRUE_VALUES, MYPY, @@ -29,7 +28,6 @@ from .utils import ( device_count, first_not_none, generate_hash_from_file, - infer_auto_class, is_peft_available, is_torch_available, non_intrusive_setattr, @@ -37,8 +35,8 @@ from .utils import ( resolve_filepath, validate_is_path, ) - -from ._typing_compat import ( +from .utils import infer_auto_class +from openllm_core._typing_compat import ( AdaptersMapping, AdaptersTuple, AnyCallable, @@ -57,8 +55,8 @@ from ._typing_compat import ( if t.TYPE_CHECKING: import auto_gptq as autogptq, peft, torch, transformers, vllm - from ._configuration import PeftType - from .utils.representation import ReprArgs + from openllm_core._configuration import PeftType + from openllm_core.utils.representation import ReprArgs else: autogptq = LazyLoader("autogptq", globals(), "auto_gptq") vllm = LazyLoader("vllm", globals(), "vllm") @@ -156,27 +154,6 @@ class LLMInterface(ABC, t.Generic[M, T]): """The iterator version of `generate` function.""" raise NotImplementedError("Currently generate_iterator requires SSE (Server-side events) support, which is not yet implemented.") - def sanitize_parameters(self, prompt: str, **attrs: t.Any) -> tuple[str, DictStrAny, DictStrAny]: - """This handler will sanitize all attrs and setup prompt text. - - It takes a prompt that is given by the user, attrs that can be parsed with the prompt. 
- - Returns a tuple of three items: - - The attributes dictionary that can be passed into LLMConfig to generate a GenerationConfig - - The attributes dictionary that will be passed into `self.postprocess_generate`. - """ - return prompt, attrs, attrs - - def postprocess_generate(self, prompt: str, generation_result: t.Any, **attrs: t.Any) -> t.Any: - """This handler will postprocess generation results from LLM.generate and then output nicely formatted results (if the LLM decide to do so.). - - You can customize how the output of the LLM looks with this hook. By default, it is a simple echo. - - > [!NOTE] - > This will be used from the client side. - """ - return generation_result - def llm_post_init(self) -> None: """This function can be implemented if you need to initialized any additional variables that doesn't concern OpenLLM internals.""" pass @@ -380,9 +357,7 @@ def _make_assignment_script(cls: type[LLM[M, T]]) -> t.Callable[[type[LLM[M, T]] anns[key] = interface_anns.get(key) return codegen.generate_function(cls, "__assign_llm_attr", lines, args=("cls", *args), globs=globs, annotations=anns) -def vllm_postprocess_generate(self: LLM["vllm.LLMEngine", T], prompt: str, generation_result: list[dict[str, t.Any]], **_: t.Any) -> str: - return generation_result[0]["outputs"][0]["text"] - +def vllm_postprocess_generate(self: LLM["vllm.LLMEngine", T], prompt: str, generation_result: list[dict[str, t.Any]], **_: t.Any) -> str: return generation_result[0]["outputs"][0]["text"] def vllm_generate(self: LLM["vllm.LLMEngine", T], prompt: str, **attrs: t.Any) -> list[dict[str, t.Any]]: outputs: list[vllm.RequestOutput] = [] # TODO: support prompt_token_ids @@ -430,8 +405,6 @@ class LLM(LLMInterface[M, T], ReprMixin): elif "config_class" not in cd: raise RuntimeError("Missing required key 'config_class'. Make sure to define it within the LLM subclass.") _make_assignment_script(cls)(cls) if "tokenizer_id" not in cd and cls.__llm_implementation__ == "vllm": cls.tokenizer_id = _DEFAULT_TOKENIZER - - # fmt: off @overload def __getitem__(self, item: t.Literal["trust_remote_code"]) -> bool: ... @overload @@ -459,24 +432,14 @@ class LLM(LLMInterface[M, T], ReprMixin): if hasattr(self, internal_attributes): return getattr(self, internal_attributes) elif hasattr(self, item): return getattr(self, item) else: raise KeyError(item) - @classmethod @overload - def from_pretrained( - cls, model_id: str | None = ..., model_version: str | None = ..., llm_config: LLMConfig | None = ..., *args: t.Any, runtime: t.Literal["ggml", "transformers"] | None = ..., quantize: t.Literal["int8", "int4"] = ..., bettertransformer: str | bool | None = ..., adapter_id: str | None = ..., adapter_name: str | None = ..., adapter_map: dict[str, str | None] | None = ..., - quantization_config: transformers.BitsAndBytesConfig | None = ..., serialisation: t.Literal["safetensors", "legacy"] = ..., **attrs: t.Any - ) -> LLM[M, T]: ... @classmethod + def from_pretrained(cls, model_id: str | None = ..., model_version: str | None = ..., llm_config: LLMConfig | None = ..., *args: t.Any, runtime: t.Literal["ggml", "transformers"] | None = ..., quantize: t.Literal["int8", "int4"] = ..., bettertransformer: str | bool | None = ..., adapter_id: str | None = ..., adapter_name: str | None = ..., adapter_map: dict[str, str | None] | None = ..., quantization_config: transformers.BitsAndBytesConfig | None = ..., serialisation: t.Literal["safetensors", "legacy"] = ..., **attrs: t.Any) -> LLM[M, T]: ... 
@overload - def from_pretrained( - cls, model_id: str | None = ..., model_version: str | None = ..., llm_config: LLMConfig | None = ..., *args: t.Any, runtime: t.Literal["ggml", "transformers"] | None = ..., quantize: t.Literal["gptq"] = ..., bettertransformer: str | bool | None = ..., adapter_id: str | None = ..., adapter_name: str | None = ..., adapter_map: dict[str, str | None] | None = ..., - quantization_config: autogptq.BaseQuantizeConfig | None = ..., serialisation: t.Literal["safetensors", "legacy"] = ..., **attrs: t.Any - ) -> LLM[M, T]: ... - # fmt: on @classmethod - def from_pretrained( - cls, model_id: str | None = None, model_version: str | None = None, llm_config: LLMConfig | None = None, *args: t.Any, runtime: t.Literal["ggml", "transformers"] | None = None, quantize: t.Literal["int8", "int4", "gptq"] | None = None, bettertransformer: str | bool | None = None, adapter_id: str | None = None, adapter_name: str | None = None, - adapter_map: dict[str, str | None] | None = None, quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None = None, serialisation: t.Literal["safetensors", "legacy"] = "safetensors", **attrs: t.Any, - ) -> LLM[M, T]: + def from_pretrained(cls, model_id: str | None = ..., model_version: str | None = ..., llm_config: LLMConfig | None = ..., *args: t.Any, runtime: t.Literal["ggml", "transformers"] | None = ..., quantize: t.Literal["gptq"] = ..., bettertransformer: str | bool | None = ..., adapter_id: str | None = ..., adapter_name: str | None = ..., adapter_map: dict[str, str | None] | None = ..., quantization_config: autogptq.BaseQuantizeConfig | None = ..., serialisation: t.Literal["safetensors", "legacy"] = ..., **attrs: t.Any) -> LLM[M, T]: ... + @classmethod + def from_pretrained(cls, model_id: str | None = None, model_version: str | None = None, llm_config: LLMConfig | None = None, *args: t.Any, runtime: t.Literal["ggml", "transformers"] | None = None, quantize: t.Literal["int8", "int4", "gptq"] | None = None, bettertransformer: str | bool | None = None, adapter_id: str | None = None, adapter_name: str | None = None, adapter_map: dict[str, str | None] | None = None, quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None = None, serialisation: t.Literal["safetensors", "legacy"] = "safetensors", **attrs: t.Any) -> LLM[M, T]: """Instantiate a pretrained LLM. ``LLM.from_pretrained`` follows the same design principle as HuggingFace's `from_pretrained` method, plus the following: @@ -708,7 +671,6 @@ class LLM(LLMInterface[M, T], ReprMixin): normalized_model_kwds, normalized_tokenizer_kwds = normalize_attrs_to_model_tokenizer_pair(**attrs) # NOTE: Save the args and kwargs for latter load self.__attrs_init__(llm_config, quantization_config, model_id, _runtime, args, {**model_kwds, **normalized_model_kwds}, {**tokenizer_kwds, **normalized_tokenizer_kwds}, _tag, _adapters_mapping, _model_version, _quantize_method, _serialisation_format, _local) - # handle trust_remote_code _from_env = os.getenv("TRUST_REMOTE_CODE", None) self.__llm_trust_remote_code__ = first_not_none(str(_from_env).upper() in ENV_VARS_TRUE_VALUES if _from_env else None, default=self._model_attrs.pop("trust_remote_code", self.config["trust_remote_code"])) @@ -723,7 +685,6 @@ class LLM(LLMInterface[M, T], ReprMixin): def __setattr__(self, attr: str, value: t.Any) -> None: if attr in _reserved_namespace: raise ForbiddenAttributeError(f"{attr} should not be set during runtime as these value will be reflected during runtime. 
Instead, you can create a custom LLM subclass {self.__class__.__name__}.") super().__setattr__(attr, value) - @property def adapters_mapping(self) -> AdaptersMapping | None: return self._adapters_mapping @adapters_mapping.setter @@ -740,6 +701,7 @@ class LLM(LLMInterface[M, T], ReprMixin): def runtime(self) -> t.Literal["ggml", "transformers"]: return self._runtime @property def runner_name(self) -> str: return f"llm-{self.config['start_name']}-runner" + # NOTE: The section below defines a loose contract with langchain's LLM interface. @property def llm_type(self) -> str: return normalise_model_name(self._model_id) @property @@ -755,6 +717,27 @@ class LLM(LLMInterface[M, T], ReprMixin): if self.__llm_bentomodel__ is None: self.__llm_bentomodel__ = openllm.serialisation.get(self) return self.__llm_bentomodel__ + def sanitize_parameters(self, prompt: str, **attrs: t.Any) -> tuple[str, DictStrAny, DictStrAny]: + """This handler will sanitize all attrs and setup prompt text. + + It takes a prompt that is given by the user, attrs that can be parsed with the prompt. + + Returns a tuple of three items: + - The attributes dictionary that can be passed into LLMConfig to generate a GenerationConfig + - The attributes dictionary that will be passed into `self.postprocess_generate`. + """ + return self.config.sanitize_parameters(prompt, **attrs) + def postprocess_generate(self, prompt: str, generation_result: t.Any, **attrs: t.Any) -> t.Any: + """This handler will postprocess generation results from LLM.generate and then output nicely formatted results (if the LLM decide to do so.). + + You can customize how the output of the LLM looks with this hook. By default, it is a simple echo. + + > [!NOTE] + > This will be used from the client side. + """ + if isinstance(generation_result, dict): return generation_result["text"] + return self.config.postprocess_generate(prompt, generation_result, **attrs) + @property def model(self) -> M: # Run check for GPU @@ -868,7 +851,7 @@ class LLM(LLMInterface[M, T], ReprMixin): # order of these fields matter here, make sure to sync it with # openllm.models.auto.factory.BaseAutoLLMClass.for_model - def to_runner(self, models: list[bentoml.Model] | None = None, max_batch_size: int | None = None, max_latency_ms: int | None = None, scheduling_strategy: type[bentoml.Strategy] | None = None) -> LLMRunner[M, T]: + def to_runner(self, models: list[bentoml.Model] | None = None, max_batch_size: int | None = None, max_latency_ms: int | None = None, scheduling_strategy: type[bentoml.Strategy] = openllm_core.CascadingResourceStrategy) -> LLMRunner[M, T]: """Convert this LLM into a Runner. 
Args: @@ -894,10 +877,6 @@ class LLM(LLMInterface[M, T], ReprMixin): try: models.append(self._bentomodel) except bentoml.exceptions.NotFound as err: raise RuntimeError(f"Failed to locate {self._bentomodel}:{err}") from None - if scheduling_strategy is None: - from ._strategies import CascadingResourceStrategy - scheduling_strategy = CascadingResourceStrategy - generate_sig = ModelSignature.from_dict(t.cast("_ModelSignatureDict", ModelSignatureDict(batchable=False))) embeddings_sig = ModelSignature.from_dict(t.cast("_ModelSignatureDict", ModelSignatureDict(batchable=True, batch_dim=0))) generate_iterator_sig = ModelSignature.from_dict(t.cast("_ModelSignatureDict", ModelSignatureDict(batchable=False))) @@ -932,10 +911,6 @@ class LLM(LLMInterface[M, T], ReprMixin): for it in self.generate_iterator(prompt, **attrs): pass return [it] - def postprocess_generate(self, prompt: str, generation_result: t.Any, **attrs: t.Any) -> str: - if isinstance(generation_result, dict): return generation_result["text"] - return generation_result - def generate_iterator(self, prompt: str, /, *, context_length: int | None = None, echo: bool = True, stream_interval: int = 2, stop: str | t.Iterable[str] | None = None, stop_token_ids: list[int] | None = None, **attrs: t.Any) -> t.Iterator[t.Any]: # NOTE: encoder-decoder models will need to implement their own generate_iterator for now diff --git a/openllm-python/src/openllm/_quantisation.py b/openllm-python/src/openllm/_quantisation.py index 06f110f8..bdc8290f 100644 --- a/openllm-python/src/openllm/_quantisation.py +++ b/openllm-python/src/openllm/_quantisation.py @@ -1,12 +1,11 @@ -# mypy: disable-error-code="name-defined" +# mypy: disable-error-code="name-defined,no-redef" from __future__ import annotations -import logging, sys, typing as t -from .utils import LazyLoader, is_autogptq_available, is_bitsandbytes_available, is_transformers_supports_kbit, pkg -if sys.version_info[:2] >= (3, 11): from typing import overload -else: from typing_extensions import overload +import logging, typing as t +from openllm_core.utils import LazyLoader, is_autogptq_available, is_bitsandbytes_available, is_transformers_supports_kbit, pkg +from openllm_core._typing_compat import overload if t.TYPE_CHECKING: from ._llm import LLM - from ._typing_compat import DictStrAny + from openllm_core._typing_compat import DictStrAny autogptq, torch, transformers = LazyLoader("autogptq", globals(), "auto_gptq"), LazyLoader("torch", globals(), "torch"), LazyLoader("transformers", globals(), "transformers") diff --git a/openllm-python/src/openllm/bundle/__init__.py b/openllm-python/src/openllm/bundle/__init__.py index dc276cd4..7f66d890 100644 --- a/openllm-python/src/openllm/bundle/__init__.py +++ b/openllm-python/src/openllm/bundle/__init__.py @@ -4,15 +4,12 @@ These utilities will stay internal, and its API can be changed or updated withou """ from __future__ import annotations import os, typing as t -from openllm.utils import LazyModule +from openllm_core.utils import LazyModule _import_structure: dict[str, list[str]] = {"_package": ["create_bento", "build_editable", "construct_python_options", "construct_docker_options"], "oci": ["CONTAINER_NAMES", "get_base_container_tag", "build_container", "get_base_container_name", "supported_registries", "RefResolver"]} if t.TYPE_CHECKING: - from . import ( - _package as _package, - oci as oci, - ) + from . 
import _package as _package, oci as oci from ._package import ( build_editable as build_editable, construct_docker_options as construct_docker_options, @@ -28,7 +25,7 @@ if t.TYPE_CHECKING: supported_registries as supported_registries, ) -__lazy=LazyModule(__name__, os.path.abspath("__file__"), _import_structure) -__all__=__lazy.__all__ -__dir__=__lazy.__dir__ -__getattr__=__lazy.__getattr__ +__lazy = LazyModule(__name__, os.path.abspath("__file__"), _import_structure) +__all__ = __lazy.__all__ +__dir__ = __lazy.__dir__ +__getattr__ = __lazy.__getattr__ diff --git a/openllm-python/src/openllm/bundle/_package.py b/openllm-python/src/openllm/bundle/_package.py index 9e3bf20d..8b667834 100644 --- a/openllm-python/src/openllm/bundle/_package.py +++ b/openllm-python/src/openllm/bundle/_package.py @@ -1,35 +1,34 @@ # mypy: disable-error-code="misc" from __future__ import annotations -import importlib.metadata, inspect, logging, os, typing as t +import fs, fs.copy, fs.errors, orjson, bentoml, openllm_core, importlib.metadata, inspect, logging, os, typing as t, string from pathlib import Path -import fs, fs.copy, fs.errors, orjson, bentoml, openllm from simple_di import Provide, inject from bentoml._internal.bento.build_config import BentoBuildConfig, DockerOptions, ModelSpec, PythonOptions from bentoml._internal.configuration.containers import BentoMLContainer from . import oci if t.TYPE_CHECKING: + import openllm from fs.base import FS - from openllm._typing_compat import LiteralString + from openllm_core._typing_compat import LiteralString, LiteralContainerRegistry, LiteralContainerVersionStrategy from bentoml._internal.bento import BentoStore from bentoml._internal.models.model import ModelStore - from .oci import LiteralContainerRegistry, LiteralContainerVersionStrategy logger = logging.getLogger(__name__) OPENLLM_DEV_BUILD = "OPENLLM_DEV_BUILD" -def build_editable(path: str) -> str | None: +def build_editable(path: str, package: t.Literal["openllm", "openllm_core", "openllm_client"] = "openllm") -> str | None: """Build OpenLLM if the OPENLLM_DEV_BUILD environment variable is set.""" if str(os.environ.get(OPENLLM_DEV_BUILD, False)).lower() != "true": return None # We need to build the package in editable mode, so that we can import it from build import ProjectBuilder from build.env import IsolatedEnvBuilder - module_location = openllm.utils.pkg.source_locations("openllm") + module_location = openllm_core.utils.pkg.source_locations(package) if not module_location: raise RuntimeError("Could not find the source location of OpenLLM. Make sure to unset OPENLLM_DEV_BUILD if you are developing OpenLLM.") pyproject_path = Path(module_location).parent.parent/"pyproject.toml" if os.path.isfile(pyproject_path.__fspath__()): - logger.info("OpenLLM is installed in editable mode. 
Generating built wheels...") + logger.info("Generating built wheels for package %s...", package) with IsolatedEnvBuilder() as env: builder = ProjectBuilder(pyproject_path.parent) builder.python_executable = env.executable @@ -49,15 +48,15 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any], llm_fs: FS, extra_d req = llm.config["requirements"] if req is not None: packages.extend(req) - if str(os.environ.get("BENTOML_BUNDLE_LOCAL_BUILD", False)).lower() == "false": packages.append(f"bentoml>={'.'.join([str(i) for i in openllm.utils.pkg.pkg_version_info('bentoml')])}") + if str(os.environ.get("BENTOML_BUNDLE_LOCAL_BUILD", False)).lower() == "false": packages.append(f"bentoml>={'.'.join([str(i) for i in openllm_core.utils.pkg.pkg_version_info('bentoml')])}") env = llm.config["env"] framework_envvar = env["framework_value"] if framework_envvar == "flax": - if not openllm.utils.is_flax_available(): raise ValueError(f"Flax is not available, while {env.framework} is set to 'flax'") + if not openllm_core.utils.is_flax_available(): raise ValueError(f"Flax is not available, while {env.framework} is set to 'flax'") packages.extend([importlib.metadata.version("flax"), importlib.metadata.version("jax"), importlib.metadata.version("jaxlib")]) elif framework_envvar == "tf": - if not openllm.utils.is_tf_available(): raise ValueError(f"TensorFlow is not available, while {env.framework} is set to 'tf'") + if not openllm_core.utils.is_tf_available(): raise ValueError(f"TensorFlow is not available, while {env.framework} is set to 'tf'") candidates = ("tensorflow", "tensorflow-cpu", "tensorflow-gpu", "tf-nightly", "tf-nightly-cpu", "tf-nightly-gpu", "intel-tensorflow", "intel-tensorflow-avx512", "tensorflow-rocm", "tensorflow-macos",) # For the metadata, we have to look for both tensorflow and tensorflow-cpu for candidate in candidates: @@ -68,19 +67,19 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any], llm_fs: FS, extra_d _tf_version = importlib.metadata.version(candidate) packages.extend([f"tensorflow>={_tf_version}"]) break - except importlib.metadata.PackageNotFoundError: pass # noqa: PERF203 # Ok to ignore here since we actually need to check for all possible tensorflow distribution. + except importlib.metadata.PackageNotFoundError: pass # Ok to ignore here since we actually need to check for all possible tensorflow distribution. else: - if not openllm.utils.is_torch_available(): raise ValueError("PyTorch is not available. Make sure to have it locally installed.") + if not openllm_core.utils.is_torch_available(): raise ValueError("PyTorch is not available. 
Make sure to have it locally installed.") packages.extend([f'torch>={importlib.metadata.version("torch")}']) wheels: list[str] = [] - built_wheels = build_editable(llm_fs.getsyspath("/")) - if built_wheels is not None: wheels.append(llm_fs.getsyspath(f"/{built_wheels.split('/')[-1]}")) + built_wheels: list[str | None] = [build_editable(llm_fs.getsyspath("/"), t.cast(t.Literal["openllm", "openllm_core", "openllm_client"], p)) for p in ("openllm_core", "openllm_client", "openllm")] + if all(i for i in built_wheels): wheels.extend([llm_fs.getsyspath(f"/{i.split('/')[-1]}") for i in t.cast(t.List[str], built_wheels)]) return PythonOptions(packages=packages, wheels=wheels, lock_packages=False, extra_index_url=["https://download.pytorch.org/whl/cu118"]) def construct_docker_options(llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_resource: float, quantize: LiteralString | None, bettertransformer: bool | None, adapter_map: dict[str, str | None] | None, dockerfile_template: str | None, runtime: t.Literal["ggml", "transformers"], serialisation_format: t.Literal["safetensors", "legacy"], container_registry: LiteralContainerRegistry, container_version_strategy: LiteralContainerVersionStrategy) -> DockerOptions: from openllm.cli._factory import parse_config_options environ = parse_config_options(llm.config, llm.config["timeout"], workers_per_resource, None, True, os.environ.copy()) - env: openllm.utils.EnvVarMixin = llm.config["env"] + env: openllm_core.utils.EnvVarMixin = llm.config["env"] if env["framework_value"] == "vllm": serialisation_format = "legacy" env_dict = { env.framework: env["framework_value"], env.config: f"'{llm.config.model_dump_json().decode()}'", @@ -91,13 +90,45 @@ def construct_docker_options(llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_ if adapter_map: env_dict["BITSANDBYTES_NOWELCOME"] = os.environ.get("BITSANDBYTES_NOWELCOME", "1") # We need to handle None separately here, as env from subprocess doesn't accept None value. 
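The `built_wheels` change above swaps the single editable `openllm` wheel for editable builds of all three sibling packages, shipping them only when every build succeeds. A minimal sketch of that all-or-nothing collection pattern, assuming a hypothetical `fake_build_editable` stand-in rather than the real `build_editable`:

```python
# Illustrative only: `fake_build_editable` is a hypothetical stand-in, not part of openllm's API.
from __future__ import annotations
import os
import typing as t

def fake_build_editable(dest: str, package: str) -> str | None:
  """Pretend to build `package` into `dest`; return the wheel path, or None when skipped."""
  if os.environ.get("OPENLLM_DEV_BUILD", "").lower() != "true": return None
  return os.path.join(dest, f"{package.replace('_', '-')}-0.0.0-py3-none-any.whl")

def collect_wheels(dest: str) -> list[str]:
  built: list[str | None] = [fake_build_editable(dest, p) for p in ("openllm_core", "openllm_client", "openllm")]
  # All-or-nothing: only include wheels when every sibling package produced one,
  # mirroring `if all(i for i in built_wheels): wheels.extend(...)` in the hunk above.
  if all(built): return [os.path.join(dest, os.path.basename(t.cast(str, w))) for w in built]
  return []

if __name__ == "__main__":
  os.environ["OPENLLM_DEV_BUILD"] = "true"
  print(collect_wheels("/tmp/llm-fs"))
```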
- _env = openllm.utils.EnvVarMixin(llm.config["model_name"], bettertransformer=bettertransformer, quantize=quantize, runtime=runtime) + _env = openllm_core.utils.EnvVarMixin(llm.config["model_name"], bettertransformer=bettertransformer, quantize=quantize, runtime=runtime) env_dict[_env.bettertransformer] = str(_env["bettertransformer_value"]) if _env["quantize_value"] is not None: env_dict[_env.quantize] = t.cast(str, _env["quantize_value"]) env_dict[_env.runtime] = _env["runtime_value"] return DockerOptions(base_image=f"{oci.CONTAINER_NAMES[container_registry]}:{oci.get_base_container_tag(container_version_strategy)}", env=env_dict, dockerfile_template=dockerfile_template) +OPENLLM_MODEL_NAME = "# openllm: model name" +OPENLLM_MODEL_ADAPTER_MAP = "# openllm: model adapter map" +class ModelNameFormatter(string.Formatter): + model_keyword: LiteralString = "__model_name__" + def __init__(self, model_name: str): + """The formatter that extends model_name to be formatted the 'service.py'.""" + super().__init__() + self.model_name = model_name + def vformat(self, format_string: str, *args: t.Any, **attrs: t.Any) -> t.Any: return super().vformat(format_string, (), {self.model_keyword: self.model_name}) + def can_format(self, value: str) -> bool: + try: + self.parse(value) + return True + except ValueError: return False +class ModelIdFormatter(ModelNameFormatter): + model_keyword: LiteralString = "__model_id__" +class ModelAdapterMapFormatter(ModelNameFormatter): + model_keyword: LiteralString = "__model_adapter_map__" + +_service_file = Path(os.path.abspath(__file__)).parent.parent/"_service.py" +def write_service(llm: openllm.LLM[t.Any, t.Any], adapter_map: dict[str, str | None] | None, llm_fs: FS) -> None: + from openllm_core.utils import DEBUG + model_name = llm.config["model_name"] + logger.debug("Generating service file for %s at %s (dir=%s)", model_name, llm.config["service_name"], llm_fs.getsyspath("/")) + with open(_service_file.__fspath__(), "r") as f: src_contents = f.readlines() + for it in src_contents: + if OPENLLM_MODEL_NAME in it: src_contents[src_contents.index(it)] = (ModelNameFormatter(model_name).vformat(it)[:-(len(OPENLLM_MODEL_NAME) + 3)] + "\n") + elif OPENLLM_MODEL_ADAPTER_MAP in it: src_contents[src_contents.index(it)] = (ModelAdapterMapFormatter(orjson.dumps(adapter_map).decode()).vformat(it)[:-(len(OPENLLM_MODEL_ADAPTER_MAP) + 3)] + "\n") + script = f"# GENERATED BY 'openllm build {model_name}'. DO NOT EDIT\n\n" + "".join(src_contents) + if DEBUG: logger.info("Generated script:\n%s", script) + llm_fs.writetext(llm.config["service_name"], script) + @inject def create_bento(bento_tag: bentoml.Tag, llm_fs: FS, llm: openllm.LLM[t.Any, t.Any], workers_per_resource: str | float, quantize: LiteralString | None, bettertransformer: bool | None, dockerfile_template: str | None, adapter_map: dict[str, str | None] | None = None, extra_dependencies: tuple[str, ...] 
| None = None, runtime: t.Literal[ "ggml", "transformers"] = "transformers", serialisation_format: t.Literal["safetensors", "legacy"] = "safetensors", container_registry: LiteralContainerRegistry = "ecr", container_version_strategy: LiteralContainerVersionStrategy = "release", @@ -108,14 +139,14 @@ def create_bento(bento_tag: bentoml.Tag, llm_fs: FS, llm: openllm.LLM[t.Any, t.A if adapter_map: labels.update(adapter_map) if isinstance(workers_per_resource, str): if workers_per_resource == "round_robin": workers_per_resource = 1.0 - elif workers_per_resource == "conserved": workers_per_resource = 1.0 if openllm.utils.device_count() == 0 else float(1 / openllm.utils.device_count()) + elif workers_per_resource == "conserved": workers_per_resource = 1.0 if openllm_core.utils.device_count() == 0 else float(1 / openllm_core.utils.device_count()) else: try: workers_per_resource = float(workers_per_resource) except ValueError: raise ValueError("'workers_per_resource' only accept ['round_robin', 'conserved'] as possible strategies.") from None elif isinstance(workers_per_resource, int): workers_per_resource = float(workers_per_resource) logger.info("Building Bento for '%s'", llm.config["start_name"]) # add service.py definition to this temporary folder - openllm.utils.codegen.write_service(llm, adapter_map, llm_fs) + write_service(llm, adapter_map, llm_fs) llm_spec = ModelSpec.from_item({"tag": str(llm.tag), "alias": llm.tag.name}) build_config = BentoBuildConfig( @@ -134,7 +165,7 @@ def create_bento(bento_tag: bentoml.Tag, llm_fs: FS, llm: openllm.LLM[t.Any, t.A if "__bento_name__" in it: service_contents[service_contents.index(it)] = it.format(__bento_name__=str(bento.tag)) script = "".join(service_contents) - if openllm.utils.DEBUG: logger.info("Generated script:\n%s", script) + if openllm_core.utils.DEBUG: logger.info("Generated script:\n%s", script) bento._fs.writetext(service_fs_path, script) if "model_store" in inspect.signature(bento.save).parameters: return bento.save(bento_store=_bento_store, model_store=_model_store) diff --git a/openllm-python/src/openllm/bundle/oci/__init__.py b/openllm-python/src/openllm/bundle/oci/__init__.py index b2ece9dc..c7f2cd9d 100644 --- a/openllm-python/src/openllm/bundle/oci/__init__.py +++ b/openllm-python/src/openllm/bundle/oci/__init__.py @@ -1,26 +1,23 @@ # mypy: disable-error-code="misc" """OCI-related utilities for OpenLLM. 
This module is considered to be internal and API are subjected to change.""" from __future__ import annotations -import functools, importlib, logging, os, pathlib, shutil, subprocess, typing as t +import functools, importlib, logging, os, pathlib, shutil, subprocess, typing as t, openllm_core from datetime import datetime, timedelta, timezone import attr, orjson, bentoml, openllm -from openllm.utils.lazy import VersionInfo +from openllm_core.utils.lazy import VersionInfo if t.TYPE_CHECKING: + from openllm_core._typing_compat import LiteralContainerRegistry, LiteralContainerVersionStrategy from ghapi import all - from openllm._typing_compat import RefTuple, LiteralString + from openllm_core._typing_compat import RefTuple, LiteralString -all = openllm.utils.LazyLoader("all", globals(), "ghapi.all") # noqa: F811 +all = openllm_core.utils.LazyLoader("all", globals(), "ghapi.all") # noqa: F811 logger = logging.getLogger(__name__) _BUILDER = bentoml.container.get_backend("buildx") ROOT_DIR = pathlib.Path(os.path.abspath("__file__")).parent.parent.parent -# TODO: support quay -LiteralContainerRegistry = t.Literal["docker", "gh", "ecr"] -LiteralContainerVersionStrategy = t.Literal["release", "nightly", "latest", "custom"] - # XXX: This registry will be hard code for now for easier to maintain # but in the future, we can infer based on git repo and everything to make it more options for users # to build the base image. For now, all of the base image will be /bentoml/openllm:... @@ -31,10 +28,10 @@ _CONTAINER_REGISTRY: dict[LiteralContainerRegistry, str] = {"docker": "docker.io _OWNER = "bentoml" _REPO = "openllm" -_module_location = openllm.utils.pkg.source_locations("openllm") +_module_location = openllm_core.utils.pkg.source_locations("openllm") @functools.lru_cache -@openllm.utils.apply(str.lower) +@openllm_core.utils.apply(str.lower) def get_base_container_name(reg: LiteralContainerRegistry) -> str: return _CONTAINER_REGISTRY[reg] def _convert_version_from_string(s: str) -> VersionInfo: return VersionInfo.from_version_string(s) @@ -43,7 +40,7 @@ def _commit_time_range(r: int = 5) -> str: return (datetime.now(timezone.utc) - class VersionNotSupported(openllm.exceptions.OpenLLMException): """Raised when the stable release is too low that it doesn't include OpenLLM base container.""" -_RefTuple: type[RefTuple] = openllm.utils.codegen.make_attr_tuple_class("_RefTuple", ["git_hash", "version", "strategy"]) +_RefTuple: type[RefTuple] = openllm_core.utils.codegen.make_attr_tuple_class("_RefTuple", ["git_hash", "version", "strategy"]) def nightly_resolver(cls: type[RefResolver]) -> str: # NOTE: all openllm container will have sha- @@ -60,7 +57,7 @@ def nightly_resolver(cls: type[RefResolver]) -> str: @attr.attrs(eq=False, order=False, slots=True, frozen=True) class RefResolver: git_hash: str = attr.field() - version: openllm.utils.VersionInfo = attr.field(converter=_convert_version_from_string) + version: openllm_core.utils.VersionInfo = attr.field(converter=_convert_version_from_string) strategy: LiteralContainerVersionStrategy = attr.field() _ghapi: t.ClassVar[all.GhApi] = all.GhApi(owner=_OWNER, repo=_REPO) @classmethod @@ -74,7 +71,7 @@ class RefResolver: version_str = meta["name"].lstrip("v") version: tuple[str, str | None] = (cls._ghapi.git.get_ref(ref=f"tags/{meta['name']}")["object"]["sha"], version_str) else: version = ("", version_str) - if openllm.utils.VersionInfo.from_version_string(t.cast(str, version_str)) < (0, 2, 12): raise VersionNotSupported(f"Version {version_str} doesn't support 
OpenLLM base container. Consider using 'nightly' or upgrade 'openllm>=0.2.12'") + if openllm_core.utils.VersionInfo.from_version_string(t.cast(str, version_str)) < (0, 2, 12): raise VersionNotSupported(f"Version {version_str} doesn't support OpenLLM base container. Consider using 'nightly' or upgrade 'openllm>=0.2.12'") return _RefTuple((*version, "release" if _use_base_strategy else "custom")) @classmethod @functools.lru_cache(maxsize=64) @@ -101,7 +98,7 @@ def build_container(registries: LiteralContainerRegistry | t.Sequence[LiteralCon try: if not _BUILDER.health(): raise openllm.exceptions.Error except (openllm.exceptions.Error, subprocess.CalledProcessError): raise RuntimeError("Building base container requires BuildKit (via Buildx) to be installed. See https://docs.docker.com/build/buildx/install/ for instalation instruction.") from None - if openllm.utils.device_count() == 0: raise RuntimeError("Building base container requires GPUs (None available)") + if openllm_core.utils.device_count() == 0: raise RuntimeError("Building base container requires GPUs (None available)") if not shutil.which("nvidia-container-runtime"): raise RuntimeError("NVIDIA Container Toolkit is required to compile CUDA kernel in container.") if not _module_location: raise RuntimeError("Failed to determine source location of 'openllm'. (Possible broken installation)") pyproject_path = pathlib.Path(_module_location).parent.parent / "pyproject.toml" @@ -111,7 +108,7 @@ def build_container(registries: LiteralContainerRegistry | t.Sequence[LiteralCon registries = [registries] if isinstance(registries, str) else list(registries) tags = {name: f"{_CONTAINER_REGISTRY[name]}:{get_base_container_tag(version_strategy)}" for name in registries} try: - outputs = _BUILDER.build(file=pathlib.Path(__file__).parent.joinpath("Dockerfile").resolve().__fspath__(), context_path=pyproject_path.parent.__fspath__(), tag=tuple(tags.values()), push=push, progress="plain" if openllm.utils.get_debug_mode() else "auto", quiet=machine) + outputs = _BUILDER.build(file=pathlib.Path(__file__).parent.joinpath("Dockerfile").resolve().__fspath__(), context_path=pyproject_path.parent.__fspath__(), tag=tuple(tags.values()), push=push, progress="plain" if openllm_core.utils.get_debug_mode() else "auto", quiet=machine) if machine and outputs is not None: tags["image_sha"] = outputs.decode("utf-8").strip() except Exception as err: raise openllm.exceptions.OpenLLMException(f"Failed to containerize base container images (Scroll up to see error above, or set OPENLLMDEVDEBUG=True for more traceback):\n{err}") from err return tags diff --git a/openllm-python/src/openllm/cli/_factory.py b/openllm-python/src/openllm/cli/_factory.py index 53b974ab..8f0acafa 100644 --- a/openllm-python/src/openllm/cli/_factory.py +++ b/openllm-python/src/openllm/cli/_factory.py @@ -1,16 +1,16 @@ from __future__ import annotations -import functools, importlib.util, os, typing as t, logging -import click, click_option_group as cog, inflection, orjson, bentoml, openllm +import functools, importlib.util, os, typing as t, logging, click, click_option_group as cog, inflection, orjson, bentoml, openllm +from click import shell_completion as sc from bentoml_cli.utils import BentoMLCommandGroup from click.shell_completion import CompletionItem -from openllm.utils import DEBUG +from openllm_core.utils import DEBUG from bentoml._internal.configuration.containers import BentoMLContainer -from openllm._typing_compat import LiteralString, DictStrAny, ParamSpec, Concatenate +from 
openllm_core._typing_compat import LiteralString, DictStrAny, ParamSpec, Concatenate from . import termui if t.TYPE_CHECKING: import subprocess - from openllm._configuration import LLMConfig + from openllm_core._configuration import LLMConfig logger = logging.getLogger(__name__) @@ -20,6 +20,12 @@ LiteralOutput = t.Literal["json", "pretty", "porcelain"] _AnyCallable = t.Callable[..., t.Any] FC = t.TypeVar("FC", bound=t.Union[_AnyCallable, click.Command]) +def bento_complete_envvar(ctx: click.Context, param: click.Parameter, incomplete: str) -> list[sc.CompletionItem]: + return [sc.CompletionItem(str(it.tag), help="Bento") for it in bentoml.list() if str(it.tag).startswith(incomplete) and all(k in it.info.labels for k in {"start_name", "bundler"})] + +def model_complete_envvar(ctx: click.Context, param: click.Parameter, incomplete: str) -> list[sc.CompletionItem]: + return [sc.CompletionItem(inflection.dasherize(it), help="Model") for it in openllm.CONFIG_MAPPING if it.startswith(incomplete)] + def parse_config_options(config: LLMConfig, server_timeout: int, workers_per_resource: float, device: t.Tuple[str, ...] | None, cors: bool, environ: DictStrAny) -> DictStrAny: # TODO: Support amd.com/gpu on k8s _bentoml_config_options_env = environ.pop("BENTOML_CONFIG_OPTIONS", "") @@ -316,7 +322,7 @@ def cors_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC def machine_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]: return cli_option("--machine", is_flag=True, default=False, hidden=True, **attrs)(f) def model_id_option(f: _AnyCallable | None = None, *, model_env: openllm.utils.EnvVarMixin | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]: return cli_option("--model-id", type=click.STRING, default=None, envvar=model_env.model_id if model_env is not None else None, show_envvar=model_env is not None, help="Optional model_id name or path for (fine-tune) weight.", **attrs)(f) def model_version_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]: return cli_option("--model-version", type=click.STRING, default=None, help="Optional model version to save for this model. It will be inferred automatically from model-id.", **attrs)(f) -def model_name_argument(f: _AnyCallable | None = None, required: bool = True) -> t.Callable[[FC], FC]: return cli_argument("model_name", type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING]), required=required)(f) +def model_name_argument(f: _AnyCallable | None = None, required: bool = True, **attrs: t.Any) -> t.Callable[[FC], FC]: return cli_argument("model_name", type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING]), required=required, **attrs)(f) def quantize_option(f: _AnyCallable | None = None, *, build: bool = False, model_env: openllm.utils.EnvVarMixin | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]: return cli_option( "--quantise", "--quantize", "quantize", type=click.Choice(["int8", "int4", "gptq"]), default=None, envvar=model_env.quantize if model_env is not None else None, show_envvar=model_env is not None, help="""Dynamic quantization for running this LLM. 
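The `_factory.py` hunk above adds `bento_complete_envvar`/`model_complete_envvar` and threads `**attrs` through helpers such as `model_name_argument` so completions can be attached to arguments. A small self-contained sketch of that wiring; the `describe` command and model list are made up, only the `shell_complete=` hookup mirrors the diff:

```python
from __future__ import annotations
import click
from click.shell_completion import CompletionItem

_KNOWN_MODELS = ("flan-t5", "llama", "opt")  # hypothetical stand-in for openllm.CONFIG_MAPPING

def model_complete_envvar(ctx: click.Context, param: click.Parameter, incomplete: str) -> list[CompletionItem]:
  # Offer only the names matching what the user has typed so far.
  return [CompletionItem(name, help="Model") for name in _KNOWN_MODELS if name.startswith(incomplete)]

@click.command("describe")
@click.argument("model_name", shell_complete=model_complete_envvar)
def cli(model_name: str) -> None:
  click.echo(f"selected: {model_name}")

if __name__ == "__main__": cli()
```

With shell completion installed, typing a partial model name would then suggest the dasherized names, which is the behaviour the new helpers give to `MODEL_NAME` arguments and Bento tags in the CLI.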
@@ -382,7 +388,7 @@ def serialisation_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Cal )(f) def container_registry_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]: return cli_option( - "--container-registry", "container_registry", type=str, default="ecr", show_default=True, show_envvar=True, envvar="OPENLLM_CONTAINER_REGISTRY", callback=container_registry_callback, help="""The default container registry to get the base image for building BentoLLM. + "--container-registry", "container_registry", type=click.Choice(list(openllm.bundle.CONTAINER_NAMES)), default="ecr", show_default=True, show_envvar=True, envvar="OPENLLM_CONTAINER_REGISTRY", callback=container_registry_callback, help="""The default container registry to get the base image for building BentoLLM. Currently, it supports 'ecr', 'ghcr.io', 'docker.io' diff --git a/openllm-python/src/openllm/cli/_sdk.py b/openllm-python/src/openllm/cli/_sdk.py index 30bdc717..512dc49e 100644 --- a/openllm-python/src/openllm/cli/_sdk.py +++ b/openllm-python/src/openllm/cli/_sdk.py @@ -1,6 +1,5 @@ from __future__ import annotations -import itertools, logging, os, re, subprocess, sys, typing as t -import bentoml, openllm +import itertools, logging, os, re, subprocess, sys, typing as t, bentoml, openllm, openllm_core from simple_di import Provide, inject from bentoml._internal.configuration.containers import BentoMLContainer from openllm.exceptions import OpenLLMException @@ -8,10 +7,9 @@ from . import termui from ._factory import start_command_factory if t.TYPE_CHECKING: - from openllm._typing_compat import LiteralString, LiteralRuntime + from openllm_core._configuration import LLMConfig + from openllm_core._typing_compat import LiteralString, LiteralRuntime, LiteralContainerRegistry, LiteralContainerVersionStrategy from bentoml._internal.bento import BentoStore - from openllm._configuration import LLMConfig - from openllm.bundle.oci import LiteralContainerRegistry, LiteralContainerVersionStrategy logger = logging.getLogger(__name__) @@ -58,7 +56,7 @@ def _start(model_name: str, /, *, model_id: str | None = None, timeout: int = 30 """ from .entrypoint import start_command, start_grpc_command llm_config = openllm.AutoConfig.for_model(model_name) - _ModelEnv = openllm.utils.EnvVarMixin(model_name, openllm.utils.first_not_none(framework, default=llm_config.default_implementation()), model_id=model_id, bettertransformer=bettertransformer, quantize=quantize, runtime=runtime) + _ModelEnv = openllm_core.utils.EnvVarMixin(model_name, openllm_core.utils.first_not_none(framework, default=llm_config.default_implementation()), model_id=model_id, bettertransformer=bettertransformer, quantize=quantize, runtime=runtime) os.environ[_ModelEnv.framework] = _ModelEnv["framework_value"] args: list[str] = ["--runtime", runtime] @@ -203,5 +201,5 @@ def _list_models() -> dict[str, t.Any]: return models_command.main(args=["-o", "json", "--show-available", "--machine"], standalone_mode=False) -start, start_grpc, build, import_model, list_models = openllm.utils.codegen.gen_sdk(_start, _serve_grpc=False), openllm.utils.codegen.gen_sdk(_start, _serve_grpc=True), openllm.utils.codegen.gen_sdk(_build), openllm.utils.codegen.gen_sdk(_import_model), openllm.utils.codegen.gen_sdk(_list_models) +start, start_grpc, build, import_model, list_models = openllm_core.utils.codegen.gen_sdk(_start, _serve_grpc=False), openllm_core.utils.codegen.gen_sdk(_start, _serve_grpc=True), openllm_core.utils.codegen.gen_sdk(_build), 
openllm_core.utils.codegen.gen_sdk(_import_model), openllm_core.utils.codegen.gen_sdk(_list_models) __all__ = ["start", "start_grpc", "build", "import_model", "list_models"] diff --git a/openllm-python/src/openllm/cli/entrypoint.py b/openllm-python/src/openllm/cli/entrypoint.py index 5091cd47..70dcc11b 100644 --- a/openllm-python/src/openllm/cli/entrypoint.py +++ b/openllm-python/src/openllm/cli/entrypoint.py @@ -20,10 +20,9 @@ bentomodel = openllm.import_model("falcon", model_id='tiiuae/falcon-7b-instruct' ``` """ from __future__ import annotations -import functools, http.client, inspect, itertools, logging, os, platform, re, subprocess, sys, time, traceback, typing as t -import attr, click, click_option_group as cog, fs, fs.copy, fs.errors, inflection, orjson, bentoml, openllm -from bentoml_cli.utils import BentoMLCommandGroup, opt_callback +import functools, http.client, inspect, itertools, logging, os, platform, re, subprocess, sys, time, traceback, typing as t, attr, click, click_option_group as cog, fs, fs.copy, fs.errors, inflection, orjson, bentoml, openllm from simple_di import Provide, inject +from bentoml_cli.utils import BentoMLCommandGroup, opt_callback from bentoml._internal.configuration.containers import BentoMLContainer from bentoml._internal.models.model import ModelStore from . import termui @@ -56,8 +55,8 @@ from openllm.models.auto import ( AutoConfig, AutoLLM, ) -from openllm._typing_compat import DictStrAny, ParamSpec, Concatenate, LiteralString, Self, LiteralRuntime -from openllm.utils import ( +from openllm_core._typing_compat import DictStrAny, ParamSpec, Concatenate, LiteralString, Self, LiteralRuntime +from openllm_core.utils import ( DEBUG, DEBUG_ENV_VAR, OPTIONAL_DEPENDENCIES, @@ -72,21 +71,20 @@ from openllm.utils import ( first_not_none, get_debug_mode, get_quiet_mode, - infer_auto_class, is_torch_available, is_transformers_supports_agent, resolve_user_filepath, set_debug_mode, set_quiet_mode, ) +from openllm.utils import infer_auto_class if t.TYPE_CHECKING: import torch from bentoml._internal.bento import BentoStore from bentoml._internal.container import DefaultBuilder - from openllm.client import BaseClient - from openllm._schema import EmbeddingsOutput - from openllm.bundle.oci import LiteralContainerRegistry, LiteralContainerVersionStrategy + from openllm_core._schema import EmbeddingsOutput + from openllm_core._typing_compat import LiteralContainerRegistry, LiteralContainerVersionStrategy else: torch = LazyLoader("torch", globals(), "torch") P = ParamSpec("P") @@ -271,7 +269,7 @@ def cli() -> None: \b An open platform for operating large language models in production. Fine-tune, serve, deploy, and monitor any LLMs with ease. - """ # noqa: D205 + """ @cli.group(cls=OpenLLMCommandGroup, context_settings=termui.CONTEXT_SETTINGS, name="start", aliases=["start-http"]) def start_command() -> None: @@ -670,10 +668,8 @@ def instruct_command(endpoint: str, timeout: int, agent: LiteralString, output: """ client = openllm.client.HTTPClient(endpoint, timeout=timeout) - try: - client.call("metadata") - except http.client.BadStatusLine: - raise click.ClickException(f"{endpoint} is neither a HTTP server nor reachable.") from None + try: client.call("metadata") + except http.client.BadStatusLine: raise click.ClickException(f"{endpoint} is neither a HTTP server nor reachable.") from None if agent == "hf": if not is_transformers_supports_agent(): raise click.UsageError("Transformers version should be at least 4.29 to support HfAgent. 
Upgrade with 'pip install -U transformers'") _memoized = {k: v[0] for k, v in _memoized.items() if v} @@ -700,7 +696,7 @@ def embed_command(ctx: click.Context, text: tuple[str, ...], endpoint: str, time $ openllm embed --endpoint http://12.323.2.1:3000 "What is the meaning of life?" "How many stars are there in the sky?" ``` """ - client = t.cast("BaseClient[t.Any]", openllm.client.HTTPClient(endpoint, timeout=timeout) if server_type == "http" else openllm.client.GrpcClient(endpoint, timeout=timeout)) + client = openllm.client.HTTPClient(endpoint, timeout=timeout) if server_type == "http" else openllm.client.GrpcClient(endpoint, timeout=timeout) try: gen_embed = client.embed(text) except ValueError: @@ -733,14 +729,14 @@ def query_command(ctx: click.Context, /, prompt: str, endpoint: str, timeout: in """ _memoized = {k: orjson.loads(v[0]) for k, v in _memoized.items() if v} if server_type == "grpc": endpoint = re.sub(r"http://", "", endpoint) - client = t.cast("BaseClient[t.Any]", openllm.client.HTTPClient(endpoint, timeout=timeout) if server_type == "http" else openllm.client.GrpcClient(endpoint, timeout=timeout)) + client = openllm.client.HTTPClient(endpoint, timeout=timeout) if server_type == "http" else openllm.client.GrpcClient(endpoint, timeout=timeout) input_fg, generated_fg = "magenta", "cyan" if output != "porcelain": termui.echo("==Input==\n", fg="white") termui.echo(f"{prompt}", fg=input_fg) res = client.query(prompt, return_response="raw", **{**client.configuration, **_memoized}) if output == "pretty": - response = client.llm.postprocess_generate(prompt, res["responses"]) + response = client.config.postprocess_generate(prompt, res["responses"]) termui.echo("\n\n==Responses==\n", fg="white") termui.echo(response, fg=generated_fg) elif output == "json": diff --git a/openllm-python/src/openllm/cli/extension/build_base_container.py b/openllm-python/src/openllm/cli/extension/build_base_container.py index ecc85349..587d8d38 100644 --- a/openllm-python/src/openllm/cli/extension/build_base_container.py +++ b/openllm-python/src/openllm/cli/extension/build_base_container.py @@ -1,37 +1,26 @@ - from __future__ import annotations -import typing as t - -import click -import orjson - -import openllm - -from .. import termui -from .._factory import machine_option - -if t.TYPE_CHECKING: - from openllm.bundle.oci import LiteralContainerRegistry, LiteralContainerVersionStrategy - +import typing as t, click, orjson, openllm +from openllm.cli import termui +from openllm.cli._factory import machine_option, container_registry_option +if t.TYPE_CHECKING: from openllm_core._typing_compat import LiteralContainerRegistry, LiteralContainerVersionStrategy @click.command( "build_base_container", context_settings=termui.CONTEXT_SETTINGS, help="""Base image builder for BentoLLM. By default, the base image will include custom kernels (PagedAttention via vllm, FlashAttention-v2, etc.) built with CUDA 11.8, Python 3.9 on Ubuntu22.04. - Optionally, this can also be pushed directly to remote registry. Currently support ``docker.io``, ``ghcr.io`` and ``quay.io``. + \b If '--machine' is passed, then it will run the process quietly, and output a JSON to the current running terminal. - This command is only useful for debugging and for building custom base image for extending BentoML with custom base images and custom kernels. Note that we already release images on our CI to ECR and GHCR, so you don't need to build it yourself. 
""" ) -@click.option("--registry", multiple=True, type=click.Choice(list(openllm.bundle.CONTAINER_NAMES)), help="Target registry to create image tag on.", default=None) +@container_registry_option @click.option("--version-strategy", type=click.Choice(["release", "latest", "nightly"]), default="nightly", help="Version strategy to use for tagging the image.") @click.option("--push/--no-push", help="Whether to push to remote repository", is_flag=True, default=False) @machine_option -def cli(registry: tuple[LiteralContainerRegistry, ...] | None, version_strategy: LiteralContainerVersionStrategy, push: bool, machine: bool) -> dict[str, str]: - mapping = openllm.bundle.build_container(registry, version_strategy, push, machine) +def cli(container_registry: tuple[LiteralContainerRegistry, ...] | None, version_strategy: LiteralContainerVersionStrategy, push: bool, machine: bool) -> dict[str, str]: + mapping = openllm.bundle.build_container(container_registry, version_strategy, push, machine) if machine: termui.echo(orjson.dumps(mapping, option=orjson.OPT_INDENT_2).decode(), fg="white") return mapping diff --git a/openllm-python/src/openllm/cli/extension/dive_bentos.py b/openllm-python/src/openllm/cli/extension/dive_bentos.py index e74d06c7..94ec8914 100644 --- a/openllm-python/src/openllm/cli/extension/dive_bentos.py +++ b/openllm-python/src/openllm/cli/extension/dive_bentos.py @@ -1,24 +1,16 @@ - from __future__ import annotations -import shutil -import subprocess -import typing as t - -import click -import psutil +import shutil, subprocess, typing as t, click, psutil, bentoml from simple_di import Provide, inject - -import bentoml from bentoml._internal.configuration.containers import BentoMLContainer -from .. import termui +from openllm.cli import termui +from openllm.cli._factory import bento_complete_envvar, machine_option -if t.TYPE_CHECKING: - from bentoml._internal.bento import BentoStore +if t.TYPE_CHECKING: from bentoml._internal.bento import BentoStore @click.command("dive_bentos", context_settings=termui.CONTEXT_SETTINGS) -@click.argument("bento", type=str) -@click.option("--machine", is_flag=True, default=False, hidden=True) +@click.argument("bento", type=str, shell_complete=bento_complete_envvar) +@machine_option @click.pass_context @inject def cli(ctx: click.Context, bento: str, machine: bool, _bento_store: BentoStore = Provide[BentoMLContainer.bento_store]) -> str | None: @@ -32,5 +24,5 @@ def cli(ctx: click.Context, bento: str, machine: bool, _bento_store: BentoStore if machine: return bentomodel.path # copy and paste this into a new shell if psutil.WINDOWS: subprocess.check_call([shutil.which("dir") or "dir"], cwd=bentomodel.path) - else: subprocess.check_call([shutil.which("tree") or "tree"], cwd=bentomodel.path) + else: subprocess.check_call([shutil.which("ls") or "ls", "-Rrthla"], cwd=bentomodel.path) ctx.exit(0) diff --git a/openllm-python/src/openllm/cli/extension/get_containerfile.py b/openllm-python/src/openllm/cli/extension/get_containerfile.py index 1e21b4be..8facabaf 100644 --- a/openllm-python/src/openllm/cli/extension/get_containerfile.py +++ b/openllm-python/src/openllm/cli/extension/get_containerfile.py @@ -1,24 +1,18 @@ - from __future__ import annotations -import typing as t - -import click +import typing as t, click, bentoml from simple_di import Provide, inject - -import bentoml from bentoml._internal.bento.bento import BentoInfo from bentoml._internal.bento.build_config import DockerOptions from bentoml._internal.configuration.containers import BentoMLContainer 
from bentoml._internal.container.generate import generate_containerfile +from openllm.cli import termui +from openllm.cli._factory import bento_complete_envvar +from openllm_core.utils import bentoml_cattr -from .. import termui -from ...utils import bentoml_cattr - -if t.TYPE_CHECKING: - from bentoml._internal.bento import BentoStore +if t.TYPE_CHECKING: from bentoml._internal.bento import BentoStore @click.command("get_containerfile", context_settings=termui.CONTEXT_SETTINGS, help="Return Containerfile of any given Bento.") -@click.argument("bento", type=str) +@click.argument("bento", type=str, shell_complete=bento_complete_envvar) @click.pass_context @inject def cli(ctx: click.Context, bento: str, _bento_store: BentoStore = Provide[BentoMLContainer.bento_store]) -> str: diff --git a/openllm-python/src/openllm/cli/extension/get_prompt.py b/openllm-python/src/openllm/cli/extension/get_prompt.py index 8610e3dd..bd848e7b 100644 --- a/openllm-python/src/openllm/cli/extension/get_prompt.py +++ b/openllm-python/src/openllm/cli/extension/get_prompt.py @@ -1,25 +1,18 @@ - from __future__ import annotations -import typing as t - -import click -import inflection -import orjson +import typing as t, click, inflection, orjson, openllm from bentoml_cli.utils import opt_callback - -import openllm - -from .. import termui -from ..._prompt import process_prompt +from openllm.cli import termui +from openllm.cli._factory import model_complete_envvar, output_option, machine_option +from openllm_core._prompt import process_prompt LiteralOutput = t.Literal["json", "pretty", "porcelain"] @click.command("get_prompt", context_settings=termui.CONTEXT_SETTINGS) -@click.argument("model_name", type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING.keys()])) +@click.argument("model_name", type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING.keys()]), shell_complete=model_complete_envvar) @click.argument("prompt", type=click.STRING) -@click.option("-o", "--output", "output", type=click.Choice(["json", "pretty", "porcelain"]), default="pretty", help="Showing output type.", show_default=True, envvar="OPENLLM_OUTPUT", show_envvar=True) +@output_option @click.option("--format", type=click.STRING, default=None) -@click.option("--machine", is_flag=True, default=False, hidden=True) +@machine_option @click.option("--opt", help="Define additional prompt variables. (format: ``--opt system_prompt='You are a useful assistant'``)", required=False, multiple=True, callback=opt_callback, metavar="ARG=VALUE[,ARG=VALUE]") @click.pass_context def cli(ctx: click.Context, /, model_name: str, prompt: str, format: str | None, output: LiteralOutput, machine: bool, _memoized: dict[str, t.Any], **_: t.Any) -> str | None: diff --git a/openllm-python/src/openllm/cli/extension/list_bentos.py b/openllm-python/src/openllm/cli/extension/list_bentos.py index c52071fa..e8130090 100644 --- a/openllm-python/src/openllm/cli/extension/list_bentos.py +++ b/openllm-python/src/openllm/cli/extension/list_bentos.py @@ -1,16 +1,8 @@ - from __future__ import annotations - -import click -import inflection -import orjson - -import bentoml -import openllm +import click, inflection, orjson, bentoml, openllm from bentoml._internal.utils import human_readable_size - -from .. 
import termui -from .._factory import LiteralOutput, output_option +from openllm.cli import termui +from openllm.cli._factory import LiteralOutput, output_option @click.command("list_bentos", context_settings=termui.CONTEXT_SETTINGS) @output_option(default_value="json") diff --git a/openllm-python/src/openllm/cli/extension/list_models.py b/openllm-python/src/openllm/cli/extension/list_models.py index e1265f8c..5f61609b 100644 --- a/openllm-python/src/openllm/cli/extension/list_models.py +++ b/openllm-python/src/openllm/cli/extension/list_models.py @@ -1,14 +1,13 @@ from __future__ import annotations import typing as t, bentoml, openllm, orjson, inflection ,click -from bentoml._internal.utils import human_readable_size - from openllm.cli import termui -from openllm.cli._factory import LiteralOutput, model_name_argument, output_option +from bentoml._internal.utils import human_readable_size +from openllm.cli._factory import LiteralOutput, model_name_argument, output_option, model_complete_envvar -if t.TYPE_CHECKING: from openllm._typing_compat import DictStrAny +if t.TYPE_CHECKING: from openllm_core._typing_compat import DictStrAny @click.command("list_models", context_settings=termui.CONTEXT_SETTINGS) -@model_name_argument(required=False) +@model_name_argument(required=False, shell_complete=model_complete_envvar) @output_option(default_value="json") def cli(model_name: str | None, output: LiteralOutput) -> DictStrAny: """This is equivalent to openllm models --show-available less the nice table.""" diff --git a/openllm-python/src/openllm/cli/extension/playground.py b/openllm-python/src/openllm/cli/extension/playground.py index f9ce0e51..42343ccd 100644 --- a/openllm-python/src/openllm/cli/extension/playground.py +++ b/openllm-python/src/openllm/cli/extension/playground.py @@ -1,13 +1,12 @@ from __future__ import annotations -import importlib.machinery, logging, os, pkgutil, subprocess, sys, tempfile, typing as t -import click, yaml +import importlib.machinery, logging, os, pkgutil, subprocess, sys, tempfile, typing as t, click, yaml from openllm.cli import termui from openllm import playground -from openllm.utils import is_jupyter_available, is_jupytext_available, is_notebook_available +from openllm_core.utils import is_jupyter_available, is_jupytext_available, is_notebook_available if t.TYPE_CHECKING: import jupytext, nbformat - from openllm._typing_compat import DictStrAny + from openllm_core._typing_compat import DictStrAny logger = logging.getLogger(__name__) @@ -38,7 +37,7 @@ def cli(ctx: click.Context, output_dir: str | None, port: int) -> None: \b > [!NOTE] > This command requires Jupyter to be installed. Install it with 'pip install "openllm[playground]"' - """ # noqa: D301 + """ if not is_jupyter_available() or not is_jupytext_available() or not is_notebook_available(): raise RuntimeError("Playground requires 'jupyter', 'jupytext', and 'notebook'. 
Install it with 'pip install \"openllm[playground]\"'") metadata = load_notebook_metadata() diff --git a/openllm-python/src/openllm/cli/termui.py b/openllm-python/src/openllm/cli/termui.py index 05de8746..c6ec9a59 100644 --- a/openllm-python/src/openllm/cli/termui.py +++ b/openllm-python/src/openllm/cli/termui.py @@ -1,6 +1,6 @@ from __future__ import annotations import os, typing as t, click, inflection, openllm -if t.TYPE_CHECKING: from openllm._typing_compat import DictStrAny +if t.TYPE_CHECKING: from openllm_core._typing_compat import DictStrAny def echo(text: t.Any, fg: str = "green", _with_style: bool = True, **attrs: t.Any) -> None: attrs["fg"] = fg if not openllm.utils.get_debug_mode() else None diff --git a/openllm-python/src/openllm/client.py b/openllm-python/src/openllm/client.py new file mode 100644 index 00000000..4b94ad61 --- /dev/null +++ b/openllm-python/src/openllm/client.py @@ -0,0 +1,17 @@ +"""OpenLLM Python client. + +```python +client = openllm.client.HTTPClient("http://localhost:8080") +client.query("What is the difference between gather and scatter?") +``` + +If the server has embedding supports, use it via `client.embed`: +```python +client.embed("What is the difference between gather and scatter?") +``` +""" +from __future__ import annotations +import openllm_client, typing as t +if t.TYPE_CHECKING: from openllm_client import AsyncHTTPClient as AsyncHTTPClient, BaseAsyncClient as BaseAsyncClient, BaseClient as BaseClient, HTTPClient as HTTPClient, GrpcClient as GrpcClient, AsyncGrpcClient as AsyncGrpcClient +def __dir__() -> t.Sequence[str]: return sorted(dir(openllm_client)) +def __getattr__(it: str) -> t.Any: return getattr(openllm_client, it) diff --git a/openllm-python/src/openllm/client/__init__.py b/openllm-python/src/openllm/client/__init__.py deleted file mode 100644 index 4cb5cfd5..00000000 --- a/openllm-python/src/openllm/client/__init__.py +++ /dev/null @@ -1,22 +0,0 @@ -"""OpenLLM Python client. 
- -```python -client = openllm.client.HTTPClient("http://localhost:8080") -client.query("What is the difference between gather and scatter?") -``` - -If the server has embedding supports, use it via `client.embed`: -```python -client.embed("What is the difference between gather and scatter?") -``` -""" -from __future__ import annotations - -from openllm.client.runtimes import ( - AsyncGrpcClient as AsyncGrpcClient, - AsyncHTTPClient as AsyncHTTPClient, - BaseAsyncClient as BaseAsyncClient, - BaseClient as BaseClient, - GrpcClient as GrpcClient, - HTTPClient as HTTPClient, -) diff --git a/openllm-python/src/openllm/client/runtimes/__init__.py b/openllm-python/src/openllm/client/runtimes/__init__.py deleted file mode 100644 index c0ee21e5..00000000 --- a/openllm-python/src/openllm/client/runtimes/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -"""Client that supports REST/gRPC protocol to interact with a LLMServer.""" -from __future__ import annotations - -from openllm.client.runtimes.base import ( - BaseAsyncClient as BaseAsyncClient, - BaseClient as BaseClient, -) -from openllm.client.runtimes.grpc import ( - AsyncGrpcClient as AsyncGrpcClient, - GrpcClient as GrpcClient, -) -from openllm.client.runtimes.http import ( - AsyncHTTPClient as AsyncHTTPClient, - HTTPClient as HTTPClient, -) diff --git a/openllm-python/src/openllm/client/runtimes/base.py b/openllm-python/src/openllm/client/runtimes/base.py deleted file mode 100644 index 52a85285..00000000 --- a/openllm-python/src/openllm/client/runtimes/base.py +++ /dev/null @@ -1,238 +0,0 @@ -# mypy: disable-error-code="name-defined" -from __future__ import annotations -import asyncio, logging, typing as t -import bentoml, bentoml.client, openllm, httpx -from abc import abstractmethod -from http import HTTPStatus -from urllib.parse import urljoin -from openllm._typing_compat import overload, LiteralString - -T = t.TypeVar("T") -T_co = t.TypeVar("T_co", covariant=True) - -if t.TYPE_CHECKING: - import transformers - from openllm._typing_compat import DictStrAny, LiteralRuntime -else: transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers") - -class AnnotatedClient(t.Protocol[T_co]): - server_url: str - _svc: bentoml.Service - endpoints: list[str] - def health(self, *args: t.Any, **attrs: t.Any) -> t.Any: ... - async def async_health(self) -> t.Any: ... - def generate_v1(self, qa: openllm.GenerationInput) -> T_co: ... - def metadata_v1(self) -> T_co: ... - def embeddings_v1(self) -> t.Sequence[float]: ... - def call(self, name: str, *args: t.Any, **attrs: t.Any) -> T_co: ... - async def async_call(self, name: str, *args: t.Any, **attrs: t.Any) -> T_co: ... - @staticmethod - def wait_until_server_ready(host: str, port: int, timeout: float = 30, **kwargs: t.Any) -> None: ... - @staticmethod - def from_url(server_url: str) -> AnnotatedClient[t.Any]: ... 
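The removed `AnnotatedClient` above typed the underlying `bentoml.client` object structurally rather than by inheritance. A condensed, self-contained sketch of that `t.Protocol` pattern, using a toy client instead of the real BentoML one:

```python
from __future__ import annotations
import typing as t

T_co = t.TypeVar("T_co", covariant=True)

class AnnotatedClient(t.Protocol[T_co]):
  server_url: str
  def health(self) -> t.Any: ...
  def call(self, name: str, *args: t.Any, **attrs: t.Any) -> T_co: ...

class DummyClient:
  """Toy stand-in: anything with the right attributes satisfies the protocol, no subclassing needed."""
  server_url = "http://localhost:3000"
  def health(self) -> str: return "ok"
  def call(self, name: str, *args: t.Any, **attrs: t.Any) -> dict: return {"endpoint": name, "args": args, "attrs": attrs}

def fetch_metadata(client: AnnotatedClient[dict]) -> dict:
  return client.call("metadata_v1")

if __name__ == "__main__": print(fetch_metadata(DummyClient()))
```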
- -logger = logging.getLogger(__name__) - -def in_async_context() -> bool: - try: - _ = asyncio.get_running_loop() - return True - except RuntimeError: return False - -class ClientMeta(t.Generic[T]): - _api_version: str - _client_type: t.Literal["GrpcClient", "HTTPClient"] - _host: str - _port: str - - __client__: AnnotatedClient[T] | None = None - __agent__: transformers.HfAgent | None = None - __llm__: openllm.LLM[t.Any, t.Any] | None = None - - def __init__(self, address: str, timeout: int = 30): self._address,self._timeout = address,timeout - def __init_subclass__(cls, *, client_type: t.Literal["http", "grpc"] = "http", api_version: str = "v1"): cls._client_type, cls._api_version = "HTTPClient" if client_type == "http" else "GrpcClient", api_version - @property - def _hf_agent(self) -> transformers.HfAgent: - if not self.supports_hf_agent: raise openllm.exceptions.OpenLLMException(f"{self.model_name} ({self.framework}) does not support running HF agent.") - if self.__agent__ is None: - if not openllm.utils.is_transformers_supports_agent(): raise RuntimeError("Current 'transformers' does not support Agent. Make sure to upgrade to at least 4.29: 'pip install -U \"transformers>=4.29\"'") - self.__agent__ = transformers.HfAgent(urljoin(self._address, "/hf/agent")) - return self.__agent__ - @property - def _metadata(self) -> T: return httpx.post(urljoin(self._address, f"/{self._api_version}/metadata")).json() if in_async_context() else self.call("metadata") - @property - @abstractmethod - def model_name(self) -> str: raise NotImplementedError - @property - @abstractmethod - def framework(self) -> LiteralRuntime: raise NotImplementedError - @property - @abstractmethod - def timeout(self) -> int: raise NotImplementedError - @property - @abstractmethod - def model_id(self) -> str: raise NotImplementedError - @property - @abstractmethod - def configuration(self) -> dict[str, t.Any]: raise NotImplementedError - @property - @abstractmethod - def supports_embeddings(self) -> bool: raise NotImplementedError - @property - @abstractmethod - def supports_hf_agent(self) -> bool: raise NotImplementedError - @abstractmethod - def postprocess(self, result: t.Any) -> openllm.GenerationOutput: ... - @abstractmethod - def _run_hf_agent(self, *args: t.Any, **kwargs: t.Any) -> t.Any: ... - - @property - def config(self) -> openllm.LLMConfig: return self.llm.config - @property - def llm(self) -> openllm.LLM[t.Any, t.Any]: - # XXX: if the server runs vllm or any framework that is not available from the user client, client will fail. 
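`ClientMeta` above picks its BentoML client class ("HTTPClient" vs "GrpcClient") at class-definition time via `__init_subclass__`, then builds and caches that client lazily in `_cached`. A stripped-down sketch of the same two mechanisms, with illustrative names rather than the real BentoML classes:

```python
from __future__ import annotations

class BaseMeta:
    _client_type: str
    _api_version: str
    __backend__: object | None = None

    def __init_subclass__(cls, *, client_type: str = "http", api_version: str = "v1") -> None:
        # Subclasses declare their protocol once; the class records the choice.
        cls._client_type = "HTTPClient" if client_type == "http" else "GrpcClient"
        cls._api_version = api_version

    @property
    def _cached(self) -> object:
        # Build the (expensive) backend object only once, on first access.
        if self.__backend__ is None:
            self.__backend__ = f"<{self._client_type} {self._api_version}>"
        return self.__backend__

class HTTPFlavour(BaseMeta, client_type="http"): ...
class GrpcFlavour(BaseMeta, client_type="grpc"): ...

print(HTTPFlavour()._cached, GrpcFlavour()._cached)
```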
- if self.__llm__ is None: self.__llm__ = openllm.infer_auto_class(self.framework).for_model(self.model_name) - return self.__llm__ - - def call(self, name: str, *args: t.Any, **attrs: t.Any) -> T: return self._cached.call(f"{name}_{self._api_version}", *args, **attrs) - async def acall(self, name: str, *args: t.Any, **attrs: t.Any) -> T: return await self._cached.async_call(f"{name}_{self._api_version}", *args, **attrs) - @property - def _cached(self) -> AnnotatedClient[T]: - client_class = t.cast(AnnotatedClient[T], getattr(bentoml.client, self._client_type)) - if self.__client__ is None: - client_class.wait_until_server_ready(self._host, int(self._port), timeout=self._timeout) - self.__client__ = client_class.from_url(self._address) - return self.__client__ - -class BaseClient(ClientMeta[T]): - def health(self) -> t.Any: raise NotImplementedError - def chat(self, prompt: str, history: list[str], **attrs: t.Any) -> str: raise NotImplementedError - def embed(self, prompt: t.Sequence[str] | str) -> openllm.EmbeddingsOutput: raise NotImplementedError - @overload - def query(self, prompt: str, *, return_response: t.Literal["processed"], **attrs: t.Any) -> str: ... - @overload - def query(self, prompt: str, *, return_response: t.Literal["raw"], **attrs: t.Any) -> DictStrAny: ... - @overload - def query(self, prompt: str, *, return_response: t.Literal["attrs"], **attrs: t.Any) -> openllm.GenerationOutput: ... - def query(self, prompt: str, return_response: t.Literal["attrs", "raw", "processed"] = "processed", **attrs: t.Any) -> openllm.GenerationOutput | DictStrAny | str: - return_raw_response = attrs.pop("return_raw_response", None) - if return_raw_response is not None: - logger.warning("'return_raw_response' is now deprecated. Please use 'return_response=\"raw\"' instead.") - if return_raw_response is True: return_response = "raw" - return_attrs = attrs.pop("return_attrs", None) - if return_attrs is not None: - logger.warning("'return_attrs' is now deprecated. Please use 'return_response=\"attrs\"' instead.") - if return_attrs is True: return_response = "attrs" - use_default_prompt_template = attrs.pop("use_default_prompt_template", False) - prompt, generate_kwargs, postprocess_kwargs = self.llm.sanitize_parameters(prompt, use_default_prompt_template=use_default_prompt_template, **attrs) - - inputs = openllm.GenerationInput(prompt=prompt, llm_config=self.config.model_construct_env(**generate_kwargs)) - if in_async_context(): result = httpx.post(urljoin(self._address, f"/{self._api_version}/generate"), json=inputs.model_dump(), timeout=self.timeout).json() - else: result = self.call("generate", inputs.model_dump()) - r = self.postprocess(result) - if return_response == "attrs": return r - elif return_response == "raw": return openllm.utils.bentoml_cattr.unstructure(r) - else: return self.llm.postprocess_generate(prompt, r.responses, **postprocess_kwargs) - - # NOTE: Scikit interface - @overload - def predict(self, prompt: str, *, return_response: t.Literal["processed"], **attrs: t.Any) -> str: ... - @overload - def predict(self, prompt: str, *, return_response: t.Literal["raw"], **attrs: t.Any) -> DictStrAny: ... - @overload - def predict(self, prompt: str, *, return_response: t.Literal["attrs"], **attrs: t.Any) -> openllm.GenerationOutput: ... 
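The `query()` method above keeps backwards compatibility by mapping the deprecated boolean kwargs (`return_raw_response`, `return_attrs`) onto the newer `return_response` value while logging a warning. A minimal sketch of that keyword-deprecation shim, simplified to only the flag handling:

```python
from __future__ import annotations
import logging
import typing as t

logger = logging.getLogger(__name__)

def resolve_return_response(return_response: str = "processed", **attrs: t.Any) -> tuple[str, dict[str, t.Any]]:
    # Old boolean flags are consumed here so they never reach the generation kwargs.
    if attrs.pop("return_raw_response", None):
        logger.warning("'return_raw_response' is deprecated. Use return_response=\"raw\" instead.")
        return_response = "raw"
    if attrs.pop("return_attrs", None):
        logger.warning("'return_attrs' is deprecated. Use return_response=\"attrs\" instead.")
        return_response = "attrs"
    return return_response, attrs  # remaining attrs are forwarded to generation

print(resolve_return_response(return_raw_response=True, temperature=0.7))
```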
- def predict(self, prompt: str, **attrs: t.Any) -> openllm.GenerationOutput | DictStrAny | str: return t.cast(t.Union[openllm.GenerationOutput, DictStrAny, str], self.query(prompt, **attrs)) - - def ask_agent(self, task: str, *, return_code: bool = False, remote: bool = False, agent_type: LiteralString = "hf", **attrs: t.Any) -> t.Any: - if agent_type == "hf": return self._run_hf_agent(task, return_code=return_code, remote=remote, **attrs) - else: raise RuntimeError(f"Unknown 'agent_type={agent_type}'") - - def _run_hf_agent(self, *args: t.Any, **kwargs: t.Any) -> t.Any: - if len(args) > 1: raise ValueError("'args' should only take one positional argument.") - task = kwargs.pop("task", args[0]) - return_code = kwargs.pop("return_code", False) - remote = kwargs.pop("remote", False) - try: return self._hf_agent.run(task, return_code=return_code, remote=remote, **kwargs) - except Exception as err: - logger.error("Exception caught while sending instruction to HF agent: %s", err, exc_info=err) - logger.info("Tip: LLMServer at '%s' might not support 'generate_one'.", self._address) - -class BaseAsyncClient(ClientMeta[T]): - async def health(self) -> t.Any: raise NotImplementedError - async def chat(self, prompt: str, history: list[str], **attrs: t.Any) -> str: raise NotImplementedError - async def embed(self, prompt: t.Sequence[str] | str) -> openllm.EmbeddingsOutput: raise NotImplementedError - @overload - async def query(self, prompt: str, *, return_response: t.Literal["processed"], **attrs: t.Any) -> str: ... - @overload - async def query(self, prompt: str, *, return_response: t.Literal["raw"], **attrs: t.Any) -> DictStrAny: ... - @overload - async def query(self, prompt: str, *, return_response: t.Literal["attrs"], **attrs: t.Any) -> openllm.GenerationOutput: ... - async def query(self, prompt: str, return_response: t.Literal["attrs", "raw", "processed"] = "processed", **attrs: t.Any) -> openllm.GenerationOutput | DictStrAny | str: - return_raw_response = attrs.pop("return_raw_response", None) - if return_raw_response is not None: - logger.warning("'return_raw_response' is now deprecated. Please use 'return_response=\"raw\"' instead.") - if return_raw_response is True: return_response = "raw" - return_attrs = attrs.pop("return_attrs", None) - if return_attrs is not None: - logger.warning("'return_attrs' is now deprecated. Please use 'return_response=\"attrs\"' instead.") - if return_attrs is True: return_response = "attrs" - use_default_prompt_template = attrs.pop("use_default_prompt_template", False) - prompt, generate_kwargs, postprocess_kwargs = self.llm.sanitize_parameters(prompt, use_default_prompt_template=use_default_prompt_template, **attrs) - - inputs = openllm.GenerationInput(prompt=prompt, llm_config=self.config.model_construct_env(**generate_kwargs)) - res = await self.acall("generate", inputs.model_dump()) - r = self.postprocess(res) - - if return_response == "attrs": return r - elif return_response == "raw": return openllm.utils.bentoml_cattr.unstructure(r) - else: return self.llm.postprocess_generate(prompt, r.responses, **postprocess_kwargs) - - # NOTE: Scikit interface - @overload - async def predict(self, prompt: str, *, return_response: t.Literal["processed"], **attrs: t.Any) -> str: ... - @overload - async def predict(self, prompt: str, *, return_response: t.Literal["raw"], **attrs: t.Any) -> DictStrAny: ... - @overload - async def predict(self, prompt: str, *, return_response: t.Literal["attrs"], **attrs: t.Any) -> openllm.GenerationOutput: ... 
- async def predict(self, prompt: str, **attrs: t.Any) -> openllm.GenerationOutput | DictStrAny | str: return t.cast(t.Union[openllm.GenerationOutput, DictStrAny, str], await self.query(prompt, **attrs)) - async def ask_agent(self, task: str, *, return_code: bool = False, remote: bool = False, agent_type: LiteralString = "hf", **attrs: t.Any) -> t.Any: - """Async version of agent.run.""" - if agent_type == "hf": return await self._run_hf_agent(task, return_code=return_code, remote=remote, **attrs) - else: raise RuntimeError(f"Unknown 'agent_type={agent_type}'") - async def _run_hf_agent(self, *args: t.Any, **kwargs: t.Any) -> t.Any: - if not openllm.utils.is_transformers_supports_agent(): raise RuntimeError("This version of transformers does not support agent.run. Make sure to upgrade to transformers>4.30.0") - if len(args) > 1: raise ValueError("'args' should only take one positional argument.") - task = kwargs.pop("task", args[0]) - return_code = kwargs.pop("return_code", False) - remote = kwargs.pop("remote", False) - - from transformers.tools.agents import clean_code_for_run, get_tool_creation_code, resolve_tools - from transformers.tools.python_interpreter import evaluate - - _hf_agent = self._hf_agent - - prompt = t.cast(str, _hf_agent.format_prompt(task)) - stop = ["Task:"] - async with httpx.AsyncClient(timeout=httpx.Timeout(self.timeout)) as client: - response = await client.post(_hf_agent.url_endpoint, json={"inputs": prompt, "parameters": {"max_new_tokens": 200, "return_full_text": False, "stop": stop},},) - if response.status_code != HTTPStatus.OK: - raise ValueError(f"Error {response.status_code}: {response.json()}") - - result = response.json()[0]["generated_text"] - # Inference API returns the stop sequence - for stop_seq in stop: - if result.endswith(stop_seq): - result = result[:-len(stop_seq)] - break - - # the below have the same logic as agent.run API - explanation, code = clean_code_for_run(result) - _hf_agent.log(f"==Explanation from the agent==\n{explanation}") - _hf_agent.log(f"\n\n==Code generated by the agent==\n{code}") - if not return_code: - _hf_agent.log("\n\n==Result==") - _hf_agent.cached_tools = resolve_tools(code, _hf_agent.toolbox, remote=remote, cached_tools=_hf_agent.cached_tools) - return evaluate(code, _hf_agent.cached_tools, state=kwargs.copy()) - else: - tool_code = get_tool_creation_code(code, _hf_agent.toolbox, remote=remote) - return f"{tool_code}\n{code}" diff --git a/openllm-python/src/openllm/client/runtimes/grpc.py b/openllm-python/src/openllm/client/runtimes/grpc.py deleted file mode 100644 index 0115d80d..00000000 --- a/openllm-python/src/openllm/client/runtimes/grpc.py +++ /dev/null @@ -1,93 +0,0 @@ -from __future__ import annotations -import asyncio, logging, typing as t -import orjson, openllm -from openllm._typing_compat import LiteralRuntime -from .base import BaseAsyncClient, BaseClient - -if t.TYPE_CHECKING: - from grpc_health.v1 import health_pb2 - from bentoml.grpc.v1.service_pb2 import Response - -logger = logging.getLogger(__name__) - -class GrpcClient(BaseClient["Response"], client_type="grpc"): - def __init__(self, address: str, timeout: int = 30): - self._host, self._port = address.split(":") - super().__init__(address, timeout) - def health(self) -> health_pb2.HealthCheckResponse: return asyncio.run(self._cached.health("bentoml.grpc.v1.BentoService")) - @property - def model_name(self) -> str: - try: return self._metadata.json.struct_value.fields["model_name"].string_value - except KeyError: raise RuntimeError("Malformed 
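The async HF-agent path above posts the formatted prompt to the agent endpoint and then trims any echoed stop sequence from the end of the generated text before handing it to the agent's code evaluator. A small sketch of just that trimming step:

```python
from __future__ import annotations
import typing as t

def strip_stop_sequences(generated: str, stop: t.Sequence[str]) -> str:
    # If the model echoes a stop string at the end of its output, drop it.
    for stop_seq in stop:
        if generated.endswith(stop_seq):
            return generated[: -len(stop_seq)]
    return generated

print(strip_stop_sequences("print('hi')\nTask:", ["Task:"]))  # -> "print('hi')\n"
```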
service endpoint. (Possible malicious)") from None - @property - def framework(self) -> LiteralRuntime: - try: - value = t.cast(LiteralRuntime, self._metadata.json.struct_value.fields["framework"].string_value) - if value not in ("pt", "flax", "tf", "vllm"): raise KeyError - return value - except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None - @property - def timeout(self) -> int: - try: return int(self._metadata.json.struct_value.fields["timeout"].number_value) - except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None - @property - def model_id(self) -> str: - try: return self._metadata.json.struct_value.fields["model_id"].string_value - except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None - @property - def configuration(self) -> dict[str, t.Any]: - try: return orjson.loads(self._metadata.json.struct_value.fields["configuration"].string_value) - except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None - @property - def supports_embeddings(self) -> bool: - try: return self._metadata.json.struct_value.fields["supports_embeddings"].bool_value - except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None - @property - def supports_hf_agent(self) -> bool: - try: return self._metadata.json.struct_value.fields["supports_hf_agent"].bool_value - except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None - def postprocess(self, result: Response | dict[str, t.Any]) -> openllm.GenerationOutput: - from google.protobuf.json_format import MessageToDict - if isinstance(result, dict): return openllm.GenerationOutput(**result) - return openllm.GenerationOutput(**MessageToDict(result.json, preserving_proto_field_name=True)) - -class AsyncGrpcClient(BaseAsyncClient["Response"], client_type="grpc"): - def __init__(self, address: str, timeout: int = 30): - self._host, self._port = address.split(":") - super().__init__(address, timeout) - async def health(self) -> health_pb2.HealthCheckResponse: return await self._cached.health("bentoml.grpc.v1.BentoService") - @property - def model_name(self) -> str: - try: return self._metadata.json.struct_value.fields["model_name"].string_value - except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None - @property - def framework(self) -> LiteralRuntime: - try: - value = t.cast(LiteralRuntime, self._metadata.json.struct_value.fields["framework"].string_value) - if value not in ("pt", "flax", "tf", "vllm"): raise KeyError - return value - except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None - @property - def timeout(self) -> int: - try: return int(self._metadata.json.struct_value.fields["timeout"].number_value) - except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None - @property - def model_id(self) -> str: - try: return self._metadata.json.struct_value.fields["model_id"].string_value - except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None - @property - def configuration(self) -> dict[str, t.Any]: - try: return orjson.loads(self._metadata.json.struct_value.fields["configuration"].string_value) - except KeyError: raise RuntimeError("Malformed service endpoint. 
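Every metadata property on the gRPC client above follows the same defensive shape: read a field from the server's metadata payload and re-raise a missing key as a uniform `RuntimeError` rather than a bare `KeyError`. A sketch of that pattern with a plain dict standing in for the protobuf `Struct` fields:

```python
from __future__ import annotations
import typing as t

def metadata_field(metadata: dict[str, t.Any], key: str) -> t.Any:
    # Funnel every lookup through one helper so a malformed payload fails uniformly.
    try:
        return metadata[key]
    except KeyError:
        raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None

meta = {"model_name": "opt", "timeout": 30}
print(metadata_field(meta, "model_name"))
```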
(Possible malicious)") from None - @property - def supports_embeddings(self) -> bool: - try: return self._metadata.json.struct_value.fields["supports_embeddings"].bool_value - except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None - @property - def supports_hf_agent(self) -> bool: - try: return self._metadata.json.struct_value.fields["supports_hf_agent"].bool_value - except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None - def postprocess(self, result: Response | dict[str, t.Any]) -> openllm.GenerationOutput: - from google.protobuf.json_format import MessageToDict - if isinstance(result, dict): return openllm.GenerationOutput(**result) - return openllm.GenerationOutput(**MessageToDict(result.json, preserving_proto_field_name=True)) diff --git a/openllm-python/src/openllm/client/runtimes/http.py b/openllm-python/src/openllm/client/runtimes/http.py deleted file mode 100644 index a8a73bf7..00000000 --- a/openllm-python/src/openllm/client/runtimes/http.py +++ /dev/null @@ -1,96 +0,0 @@ -from __future__ import annotations -import logging, typing as t -from urllib.parse import urljoin, urlparse -import httpx, orjson, openllm -from .base import BaseAsyncClient, BaseClient, in_async_context -from openllm._typing_compat import DictStrAny, LiteralRuntime - -logger = logging.getLogger(__name__) -def process_address(self: AsyncHTTPClient | HTTPClient, address: str) -> None: - address = address if "://" in address else "http://" + address - parsed = urlparse(address) - self._host, *_port = parsed.netloc.split(":") - if len(_port) == 0: self._port = "80" if parsed.scheme == "http" else "443" - else: self._port = next(iter(_port)) - -class HTTPClient(BaseClient[DictStrAny]): - def __init__(self, address: str, timeout: int = 30): - process_address(self, address) - super().__init__(address, timeout) - - def health(self) -> t.Any: return self._cached.health() - def embed(self, prompt: t.Sequence[str] | str) -> openllm.EmbeddingsOutput: - if isinstance(prompt, str): prompt = [prompt] - result = httpx.post(urljoin(self._address, f"/{self._api_version}/embeddings"), json=list(prompt), timeout=self.timeout).json() if in_async_context() else self.call("embeddings", list(prompt)) - return openllm.EmbeddingsOutput(**result) - - @property - def model_name(self) -> str: - try: return self._metadata["model_name"] - except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None - @property - def model_id(self) -> str: - try: return self._metadata["model_name"] - except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None - @property - def framework(self) -> LiteralRuntime: - try: return self._metadata["framework"] - except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None - @property - def timeout(self) -> int: - try: return self._metadata["timeout"] - except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None - @property - def configuration(self) -> dict[str, t.Any]: - try: return orjson.loads(self._metadata["configuration"]) - except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None - @property - def supports_embeddings(self) -> bool: - try: return self._metadata.get("supports_embeddings", False) - except KeyError: raise RuntimeError("Malformed service endpoint. 
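`process_address()` in the HTTP client above normalises a bare `host[:port]` string by assuming an `http://` scheme and filling in the default port from the scheme when none is given. A self-contained sketch of that normalisation:

```python
from __future__ import annotations
from urllib.parse import urlparse

def split_host_port(address: str) -> tuple[str, str]:
    # Bare addresses get an http:// scheme so urlparse puts the host in netloc.
    address = address if "://" in address else "http://" + address
    parsed = urlparse(address)
    host, *rest = parsed.netloc.split(":")
    port = rest[0] if rest else ("443" if parsed.scheme == "https" else "80")
    return host, port

print(split_host_port("localhost:3000"))       # ('localhost', '3000')
print(split_host_port("https://example.com"))  # ('example.com', '443')
```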
(Possible malicious)") from None - @property - def supports_hf_agent(self) -> bool: - try: return self._metadata.get("supports_hf_agent", False) - except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None - def postprocess(self, result: dict[str, t.Any]) -> openllm.GenerationOutput: return openllm.GenerationOutput(**result) - -class AsyncHTTPClient(BaseAsyncClient[DictStrAny]): - def __init__(self, address: str, timeout: int = 30): - process_address(self, address) - super().__init__(address, timeout) - - async def health(self) -> t.Any: return await self._cached.async_health() - async def embed(self, prompt: t.Sequence[str] | str) -> openllm.EmbeddingsOutput: - if isinstance(prompt, str): prompt = [prompt] - res = await self.acall("embeddings", list(prompt)) - return openllm.EmbeddingsOutput(**res) - - @property - def model_name(self) -> str: - try: return self._metadata["model_name"] - except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None - @property - def model_id(self) -> str: - try: return self._metadata["model_name"] - except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None - @property - def framework(self) -> LiteralRuntime: - try: return self._metadata["framework"] - except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None - @property - def timeout(self) -> int: - try: return self._metadata["timeout"] - except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None - @property - def configuration(self) -> dict[str, t.Any]: - try: return orjson.loads(self._metadata["configuration"]) - except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None - @property - def supports_embeddings(self) -> bool: - try: return self._metadata.get("supports_embeddings", False) - except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None - @property - def supports_hf_agent(self) -> bool: - try: return self._metadata.get("supports_hf_agent", False) - except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None - def postprocess(self, result: dict[str, t.Any]) -> openllm.GenerationOutput: return openllm.GenerationOutput(**result) diff --git a/openllm-python/src/openllm/exceptions.py b/openllm-python/src/openllm/exceptions.py index 86e5d294..eba9b31a 100644 --- a/openllm-python/src/openllm/exceptions.py +++ b/openllm-python/src/openllm/exceptions.py @@ -1,19 +1,3 @@ """Base exceptions for OpenLLM. This extends BentoML exceptions.""" from __future__ import annotations -import bentoml -class OpenLLMException(bentoml.exceptions.BentoMLException): - """Base class for all OpenLLM exceptions. 
This extends BentoMLException.""" -class GpuNotAvailableError(OpenLLMException): - """Raised when there is no GPU available in given system.""" -class ValidationError(OpenLLMException): - """Raised when a validation fails.""" -class ForbiddenAttributeError(OpenLLMException): - """Raised when using an _internal field.""" -class MissingAnnotationAttributeError(OpenLLMException): - """Raised when a field under openllm.LLMConfig is missing annotations.""" -class MissingDependencyError(BaseException): - """Raised when a dependency is missing.""" -class Error(BaseException): - """To be used instead of naked raise.""" -class FineTuneStrategyNotSupportedError(OpenLLMException): - """Raised when a fine-tune strategy is not supported for given LLM.""" +from openllm_core.exceptions import OpenLLMException as OpenLLMException, GpuNotAvailableError as GpuNotAvailableError, ValidationError as ValidationError, ForbiddenAttributeError as ForbiddenAttributeError, MissingAnnotationAttributeError as MissingAnnotationAttributeError, MissingDependencyError as MissingDependencyError, Error as Error, FineTuneStrategyNotSupportedError as FineTuneStrategyNotSupportedError diff --git a/openllm-python/src/openllm/models/__init__.py b/openllm-python/src/openllm/models/__init__.py index 2fea005e..a2ee45f3 100644 --- a/openllm-python/src/openllm/models/__init__.py +++ b/openllm-python/src/openllm/models/__init__.py @@ -1,11 +1,11 @@ # This file is generated by tools/update-models-import.py. DO NOT EDIT MANUALLY! # To update this, run ./tools/update-models-import.py from __future__ import annotations -import typing as t, os -from openllm.utils import LazyModule -_MODELS: set[str] = {"auto", "baichuan", "chatglm", "dolly_v2", "falcon", "flan_t5", "gpt_neox", "llama", "mpt", "opt", "stablelm", "starcoder"} -if t.TYPE_CHECKING: from . import auto as auto,baichuan as baichuan,chatglm as chatglm,dolly_v2 as dolly_v2,falcon as falcon,flan_t5 as flan_t5,gpt_neox as gpt_neox,llama as llama,mpt as mpt,opt as opt,stablelm as stablelm,starcoder as starcoder -__lazy=LazyModule(__name__, os.path.abspath("__file__"), {k: [] for k in _MODELS}) +import typing as t +from openllm_core.utils import LazyModule +_MODELS:set[str]={"auto", "baichuan", "chatglm", "dolly_v2", "falcon", "flan_t5", "gpt_neox", "llama", "mpt", "opt", "stablelm", "starcoder"} +if t.TYPE_CHECKING:from . 
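`openllm/models/__init__.py` above is rebuilt around `LazyModule` from `openllm_core.utils`, which defers importing each model subpackage until it is first accessed. A toy, simplified stand-in for that idea (not the `openllm_core` implementation) using stdlib modules as the lazily loaded targets:

```python
from __future__ import annotations
import importlib
import types
import typing as t

class TinyLazyModule(types.ModuleType):
    def __init__(self, name: str, import_structure: dict[str, list[str]]):
        super().__init__(name)
        self._import_structure = import_structure
        self.__all__ = list(import_structure)

    def __getattr__(self, item: str) -> t.Any:
        # Import the submodule only on first attribute access, then cache it.
        if item in self._import_structure:
            module = importlib.import_module(item)
            setattr(self, item, module)
            return module
        raise AttributeError(f"module {self.__name__!r} has no attribute {item!r}")

lazy = TinyLazyModule("demo", {"json": [], "csv": []})
print(lazy.json.dumps({"lazy": True}))  # 'json' is imported at this point, not before
```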
import auto as auto,baichuan as baichuan,chatglm as chatglm,dolly_v2 as dolly_v2,falcon as falcon,flan_t5 as flan_t5,gpt_neox as gpt_neox,llama as llama,mpt as mpt,opt as opt,stablelm as stablelm,starcoder as starcoder +__lazy=LazyModule(__name__, globals()["__file__"], {k: [] for k in _MODELS}) __all__=__lazy.__all__ __dir__=__lazy.__dir__ __getattr__=__lazy.__getattr__ diff --git a/openllm-python/src/openllm/models/auto/__init__.py b/openllm-python/src/openllm/models/auto/__init__.py index 016e96f3..0552bcec 100644 --- a/openllm-python/src/openllm/models/auto/__init__.py +++ b/openllm-python/src/openllm/models/auto/__init__.py @@ -1,15 +1,11 @@ from __future__ import annotations import typing as t, os import openllm -from openllm.utils import LazyModule, is_flax_available, is_tf_available, is_torch_available, is_vllm_available +from openllm_core.utils import LazyModule, is_flax_available, is_tf_available, is_torch_available, is_vllm_available +from openllm_core.config import AutoConfig as AutoConfig, CONFIG_MAPPING as CONFIG_MAPPING, CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES -_import_structure: dict[str, list[str]] = {"configuration_auto": ["AutoConfig", "CONFIG_MAPPING", "CONFIG_MAPPING_NAMES"], "modeling_auto": ["MODEL_MAPPING_NAMES"], "modeling_flax_auto": ["MODEL_FLAX_MAPPING_NAMES"], "modeling_tf_auto": ["MODEL_TF_MAPPING_NAMES"], "modeling_vllm_auto": ["MODEL_VLLM_MAPPING_NAMES"]} +_import_structure: dict[str, list[str]] = {"modeling_auto": ["MODEL_MAPPING_NAMES"], "modeling_flax_auto": ["MODEL_FLAX_MAPPING_NAMES"], "modeling_tf_auto": ["MODEL_TF_MAPPING_NAMES"], "modeling_vllm_auto": ["MODEL_VLLM_MAPPING_NAMES"]} if t.TYPE_CHECKING: - from .configuration_auto import ( - CONFIG_MAPPING as CONFIG_MAPPING, - CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES, - AutoConfig as AutoConfig, - ) from .modeling_auto import MODEL_MAPPING_NAMES as MODEL_MAPPING_NAMES from .modeling_flax_auto import MODEL_FLAX_MAPPING_NAMES as MODEL_FLAX_MAPPING_NAMES from .modeling_tf_auto import MODEL_TF_MAPPING_NAMES as MODEL_TF_MAPPING_NAMES diff --git a/openllm-python/src/openllm/models/auto/factory.py b/openllm-python/src/openllm/models/auto/factory.py index 1e82124b..48274184 100644 --- a/openllm-python/src/openllm/models/auto/factory.py +++ b/openllm-python/src/openllm/models/auto/factory.py @@ -3,10 +3,10 @@ from __future__ import annotations import importlib, inspect, logging, typing as t from collections import OrderedDict import inflection, openllm -from openllm.utils import ReprMixin +from openllm_core.utils import ReprMixin if t.TYPE_CHECKING: - from openllm._typing_compat import LiteralString, LLMRunner + from openllm_core._typing_compat import LiteralString, LLMRunner import types from collections import _odict_items, _odict_keys, _odict_values diff --git a/openllm-python/src/openllm/models/auto/modeling_auto.py b/openllm-python/src/openllm/models/auto/modeling_auto.py index 13d3cd1d..9b08b8e1 100644 --- a/openllm-python/src/openllm/models/auto/modeling_auto.py +++ b/openllm-python/src/openllm/models/auto/modeling_auto.py @@ -1,8 +1,8 @@ from __future__ import annotations import typing as t from collections import OrderedDict -from .configuration_auto import CONFIG_MAPPING_NAMES from .factory import BaseAutoLLMClass, _LazyAutoMapping +from openllm_core.config import CONFIG_MAPPING_NAMES MODEL_MAPPING_NAMES = OrderedDict([("chatglm", "ChatGLM"), ("dolly_v2", "DollyV2"), ("falcon", "Falcon"), ("flan_t5", "FlanT5"), ("gpt_neox", "GPTNeoX"), ("llama", "Llama"), ("mpt", "MPT"), ("opt", "OPT"), 
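The auto-model tables above now pull `CONFIG_MAPPING_NAMES` from `openllm_core.config` and pair it with per-backend `MODEL_*_MAPPING_NAMES` tables keyed by the same short model type. A toy lookup showing how the two tables line up; the real `_LazyAutoMapping` additionally defers the class imports, which this sketch does not attempt:

```python
from collections import OrderedDict

# Illustrative subset; the real tables cover every supported architecture.
CONFIG_MAPPING_NAMES = OrderedDict([("opt", "OPTConfig"), ("llama", "LlamaConfig")])
MODEL_MAPPING_NAMES = OrderedDict([("opt", "OPT"), ("llama", "Llama")])

def resolve(model_type: str) -> tuple[str, str]:
    # Both tables share the model-type key, so one key yields the config/model pair.
    return CONFIG_MAPPING_NAMES[model_type], MODEL_MAPPING_NAMES[model_type]

print(resolve("llama"))  # ('LlamaConfig', 'Llama')
```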
("stablelm", "StableLM"), ("starcoder", "StarCoder"), ("baichuan", "Baichuan")]) MODEL_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_MAPPING_NAMES) diff --git a/openllm-python/src/openllm/models/auto/modeling_flax_auto.py b/openllm-python/src/openllm/models/auto/modeling_flax_auto.py index 20d45f6f..4d1d7e98 100644 --- a/openllm-python/src/openllm/models/auto/modeling_flax_auto.py +++ b/openllm-python/src/openllm/models/auto/modeling_flax_auto.py @@ -1,8 +1,8 @@ from __future__ import annotations import typing as t from collections import OrderedDict -from .configuration_auto import CONFIG_MAPPING_NAMES from .factory import BaseAutoLLMClass, _LazyAutoMapping +from openllm_core.config import CONFIG_MAPPING_NAMES MODEL_FLAX_MAPPING_NAMES = OrderedDict([("flan_t5", "FlaxFlanT5"), ("opt", "FlaxOPT")]) MODEL_FLAX_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FLAX_MAPPING_NAMES) diff --git a/openllm-python/src/openllm/models/auto/modeling_tf_auto.py b/openllm-python/src/openllm/models/auto/modeling_tf_auto.py index 9aa6a0a4..0d5b4ed9 100644 --- a/openllm-python/src/openllm/models/auto/modeling_tf_auto.py +++ b/openllm-python/src/openllm/models/auto/modeling_tf_auto.py @@ -1,8 +1,8 @@ from __future__ import annotations import typing as t from collections import OrderedDict -from .configuration_auto import CONFIG_MAPPING_NAMES from .factory import BaseAutoLLMClass, _LazyAutoMapping +from openllm_core.config import CONFIG_MAPPING_NAMES MODEL_TF_MAPPING_NAMES = OrderedDict([("flan_t5", "TFFlanT5"), ("opt", "TFOPT")]) MODEL_TF_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_TF_MAPPING_NAMES) diff --git a/openllm-python/src/openllm/models/auto/modeling_vllm_auto.py b/openllm-python/src/openllm/models/auto/modeling_vllm_auto.py index 12778a65..94b1bae5 100644 --- a/openllm-python/src/openllm/models/auto/modeling_vllm_auto.py +++ b/openllm-python/src/openllm/models/auto/modeling_vllm_auto.py @@ -1,8 +1,8 @@ from __future__ import annotations import typing as t from collections import OrderedDict -from .configuration_auto import CONFIG_MAPPING_NAMES from .factory import BaseAutoLLMClass, _LazyAutoMapping +from openllm_core.config import CONFIG_MAPPING_NAMES MODEL_VLLM_MAPPING_NAMES = OrderedDict([("baichuan", "VLLMBaichuan"), ("dolly_v2", "VLLMDollyV2"), ("falcon", "VLLMFalcon"), ("gpt_neox", "VLLMGPTNeoX"), ("mpt", "VLLMMPT"), ("opt", "VLLMOPT"), ("stablelm", "VLLMStableLM"), ("starcoder", "VLLMStarCoder"), ("llama", "VLLMLlama")]) MODEL_VLLM_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_VLLM_MAPPING_NAMES) diff --git a/openllm-python/src/openllm/models/baichuan/__init__.py b/openllm-python/src/openllm/models/baichuan/__init__.py index f201ef91..cabb57e7 100644 --- a/openllm-python/src/openllm/models/baichuan/__init__.py +++ b/openllm-python/src/openllm/models/baichuan/__init__.py @@ -2,14 +2,13 @@ from __future__ import annotations import sys, typing as t from openllm.exceptions import MissingDependencyError from openllm.utils import LazyModule, is_cpm_kernels_available, is_torch_available, is_vllm_available +from openllm_core.config.configuration_baichuan import ( + DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE, + START_BAICHUAN_COMMAND_DOCSTRING as START_BAICHUAN_COMMAND_DOCSTRING, + BaichuanConfig as BaichuanConfig, +) -_import_structure: dict[str, list[str]] = {"configuration_baichuan": ["BaichuanConfig", "START_BAICHUAN_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"]} -if t.TYPE_CHECKING: - from .configuration_baichuan import ( - DEFAULT_PROMPT_TEMPLATE as 
DEFAULT_PROMPT_TEMPLATE, - START_BAICHUAN_COMMAND_DOCSTRING as START_BAICHUAN_COMMAND_DOCSTRING, - BaichuanConfig as BaichuanConfig, - ) +_import_structure: dict[str, list[str]] = {} try: if not is_torch_available() or not is_cpm_kernels_available(): raise MissingDependencyError except MissingDependencyError: pass diff --git a/openllm-python/src/openllm/models/baichuan/modeling_baichuan.py b/openllm-python/src/openllm/models/baichuan/modeling_baichuan.py index 6b4cd5b9..c9fecdcf 100644 --- a/openllm-python/src/openllm/models/baichuan/modeling_baichuan.py +++ b/openllm-python/src/openllm/models/baichuan/modeling_baichuan.py @@ -1,16 +1,11 @@ from __future__ import annotations import typing as t, openllm -from openllm._prompt import process_prompt -from .configuration_baichuan import DEFAULT_PROMPT_TEMPLATE - -if t.TYPE_CHECKING: import torch, transformers -else: torch, transformers = openllm.utils.LazyLoader("torch", globals(), "torch"), openllm.utils.LazyLoader("transformers", globals(), "transformers") +if t.TYPE_CHECKING: import transformers class Baichuan(openllm.LLM["transformers.PreTrainedModel", "transformers.PreTrainedTokenizerBase"]): __openllm_internal__ = True - def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, top_p: float | None = None, temperature: float | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "top_p": top_p, "temperature": temperature, **attrs}, {} - def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str: return generation_result[0] def generate(self, prompt: str, **attrs: t.Any) -> list[str]: + import torch inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device) with torch.inference_mode(), torch.autocast("cuda", dtype=torch.float16): # type: ignore[attr-defined] outputs = self.model.generate(**inputs, generation_config=self.config.model_construct_env(**attrs).to_generation_config()) diff --git a/openllm-python/src/openllm/models/baichuan/modeling_vllm_baichuan.py b/openllm-python/src/openllm/models/baichuan/modeling_vllm_baichuan.py index 1e9e73d6..602c361f 100644 --- a/openllm-python/src/openllm/models/baichuan/modeling_vllm_baichuan.py +++ b/openllm-python/src/openllm/models/baichuan/modeling_vllm_baichuan.py @@ -1,10 +1,7 @@ from __future__ import annotations import typing as t, openllm -from openllm._prompt import process_prompt -from .configuration_baichuan import DEFAULT_PROMPT_TEMPLATE if t.TYPE_CHECKING: import vllm, transformers class VLLMBaichuan(openllm.LLM["vllm.LLMEngine", "transformers.PreTrainedTokenizerBase"]): __openllm_internal__ = True tokenizer_id = "local" - def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, top_p: float | None = None, temperature: float | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "top_p": top_p, "temperature": temperature, **attrs}, {} diff --git a/openllm-python/src/openllm/models/chatglm/__init__.py b/openllm-python/src/openllm/models/chatglm/__init__.py index 90bacaed..dc194a18 100644 --- a/openllm-python/src/openllm/models/chatglm/__init__.py +++ b/openllm-python/src/openllm/models/chatglm/__init__.py @@ 
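The per-model `__init__.py` files above keep the try/except gate around backend availability: an implementation module is only registered in `_import_structure` when its backend (torch, vLLM, cpm_kernels, ...) can actually be imported. A sketch of that optional-dependency gating, with `importlib.util.find_spec` standing in for the `is_*_available` helpers:

```python
from __future__ import annotations
import importlib.util

_import_structure: dict[str, list[str]] = {}

def _backend_available(name: str) -> bool:
    # Cheap availability probe; the real helpers add extra checks and caching.
    return importlib.util.find_spec(name) is not None

if _backend_available("torch"):
    # Only expose the PyTorch implementation when torch is importable.
    _import_structure["modeling_baichuan"] = ["Baichuan"]
if _backend_available("vllm"):
    _import_structure["modeling_vllm_baichuan"] = ["VLLMBaichuan"]

print(_import_structure)
```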
-2,14 +2,13 @@ from __future__ import annotations import sys, typing as t from openllm.exceptions import MissingDependencyError from openllm.utils import LazyModule, is_cpm_kernels_available, is_torch_available +from openllm_core.config.configuration_chatglm import ( + DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE, + START_CHATGLM_COMMAND_DOCSTRING as START_CHATGLM_COMMAND_DOCSTRING, + ChatGLMConfig as ChatGLMConfig, +) -_import_structure: dict[str, list[str]] = {"configuration_chatglm": ["ChatGLMConfig", "START_CHATGLM_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"]} -if t.TYPE_CHECKING: - from .configuration_chatglm import ( - DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE, - START_CHATGLM_COMMAND_DOCSTRING as START_CHATGLM_COMMAND_DOCSTRING, - ChatGLMConfig as ChatGLMConfig, - ) +_import_structure: dict[str, list[str]] = {} try: if not is_torch_available() or not is_cpm_kernels_available(): raise MissingDependencyError except MissingDependencyError: pass diff --git a/openllm-python/src/openllm/models/chatglm/modeling_chatglm.py b/openllm-python/src/openllm/models/chatglm/modeling_chatglm.py index ebcaa35e..ddf54a62 100644 --- a/openllm-python/src/openllm/models/chatglm/modeling_chatglm.py +++ b/openllm-python/src/openllm/models/chatglm/modeling_chatglm.py @@ -1,32 +1,17 @@ from __future__ import annotations import typing as t, openllm -if t.TYPE_CHECKING: import torch, transformers, torch.nn.functional as F -else: torch, transformers, F = openllm.utils.LazyLoader("torch", globals(), "torch"), openllm.utils.LazyLoader("transformers", globals(), "transformers"), openllm.utils.LazyLoader("F", globals(), "torch.nn.functional") - +if t.TYPE_CHECKING: import transformers class ChatGLM(openllm.LLM["transformers.PreTrainedModel", "transformers.PreTrainedTokenizerFast"]): __openllm_internal__ = True - - def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, num_beams: int | None = None, top_p: float | None = None, temperature: float | None = None, chat_history: list[tuple[str, str]] | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: - prompt_text = "" - if use_default_prompt_template and chat_history is not None: - for i, (old_query, response) in enumerate(chat_history): prompt_text += f"[Round {i}]\n问:{old_query}\n答:{response}\n" - prompt_text += f"[Round {len(chat_history)}]\n问:{prompt}\n答:" - else: prompt_text = prompt - postprocess_generate_kwargs = {"chat_history": chat_history if chat_history is not None else None} - return prompt_text, {"max_new_tokens": max_new_tokens, "num_beams": num_beams, "top_p": top_p, "temperature": temperature, **attrs}, postprocess_generate_kwargs - def postprocess_generate(self, prompt: str, generation_result: tuple[str, list[tuple[str, str]]], *, chat_history: list[tuple[str, str]] | None = None, **attrs: t.Any) -> str: - generated, history = generation_result - if self.config.retain_history: - if chat_history is None: raise ValueError("'retain_history' is True while there is no history provided.") - chat_history.extend(history) - return generated def generate(self, prompt: str, **attrs: t.Any) -> tuple[str, list[tuple[str, str]]]: + import torch with torch.inference_mode(): self.model.eval() # Only use half precision if the model is not yet quantized if self.config.use_half_precision: self.model.half() return self.model.chat(self.tokenizer, prompt, generation_config=self.config.model_construct_env(**attrs).to_generation_config()) def embeddings(self, 
prompts: list[str]) -> openllm.LLMEmbeddings: + import torch, torch.nn.functional as F embeddings: list[list[float]] = [] num_tokens = 0 for prompt in prompts: diff --git a/openllm-python/src/openllm/models/dolly_v2/__init__.py b/openllm-python/src/openllm/models/dolly_v2/__init__.py index fb8ce3d2..77b33c86 100644 --- a/openllm-python/src/openllm/models/dolly_v2/__init__.py +++ b/openllm-python/src/openllm/models/dolly_v2/__init__.py @@ -2,14 +2,13 @@ from __future__ import annotations import sys, typing as t from openllm.exceptions import MissingDependencyError from openllm.utils import LazyModule, is_torch_available, is_vllm_available +from openllm_core.config.configuration_dolly_v2 import ( + DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE, + START_DOLLY_V2_COMMAND_DOCSTRING as START_DOLLY_V2_COMMAND_DOCSTRING, + DollyV2Config as DollyV2Config, +) -_import_structure: dict[str, list[str]] = {"configuration_dolly_v2": ["DollyV2Config", "START_DOLLY_V2_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"]} -if t.TYPE_CHECKING: - from .configuration_dolly_v2 import ( - DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE, - START_DOLLY_V2_COMMAND_DOCSTRING as START_DOLLY_V2_COMMAND_DOCSTRING, - DollyV2Config as DollyV2Config, - ) +_import_structure: dict[str, list[str]] = {} try: if not is_torch_available(): raise MissingDependencyError except MissingDependencyError: pass diff --git a/openllm-python/src/openllm/models/dolly_v2/modeling_dolly_v2.py b/openllm-python/src/openllm/models/dolly_v2/modeling_dolly_v2.py index e9df5328..fdc5bcd0 100644 --- a/openllm-python/src/openllm/models/dolly_v2/modeling_dolly_v2.py +++ b/openllm-python/src/openllm/models/dolly_v2/modeling_dolly_v2.py @@ -1,8 +1,7 @@ from __future__ import annotations import logging, re, typing as t, openllm -from openllm._prompt import process_prompt -from openllm._typing_compat import overload -from .configuration_dolly_v2 import DEFAULT_PROMPT_TEMPLATE, END_KEY, RESPONSE_KEY, get_special_token_id +from openllm_core._typing_compat import overload +from openllm_core.config.configuration_dolly_v2 import DEFAULT_PROMPT_TEMPLATE, END_KEY, RESPONSE_KEY, get_special_token_id if t.TYPE_CHECKING: import torch, transformers, tensorflow as tf else: torch, transformers, tf = openllm.utils.LazyLoader("torch", globals(), "torch"), openllm.utils.LazyLoader("transformers", globals(), "transformers"), openllm.utils.LazyLoader("tf", globals(), "tensorflow") @@ -102,8 +101,6 @@ class DollyV2(openllm.LLM["transformers.Pipeline", "transformers.PreTrainedToken @property def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: return {"device_map": "auto" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None, "torch_dtype": torch.bfloat16}, {} def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.Pipeline: return get_pipeline(transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, *args, **attrs), self.tokenizer, _init=True, return_full_text=self.config.return_full_text) - def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_k: int | None = None, top_p: float | None = None, use_default_prompt_template: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "top_k": top_k, "top_p": top_p, "temperature": temperature, **attrs}, {} - def postprocess_generate(self, prompt: str, 
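A recurring change in the modeling files above is moving `import torch` (and `torch.nn.functional`) out of module scope and into the methods that actually need them, so importing the model package stays cheap when the backend is absent. A minimal sketch of that deferred-import pattern, using a stdlib module in place of torch so it runs anywhere:

```python
from __future__ import annotations

def rolling_mean(values: list[float]) -> float:
    # Deferred import: the cost (torch, in the real code) is paid only when
    # the function is called, not when the defining module is imported.
    import statistics
    return statistics.fmean(values)

print(rolling_mean([1.0, 2.0, 3.0, 5.0]))
```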
generation_result: list[dict[t.Literal["generated_text"], str]], **_: t.Any) -> str: return generation_result[0]["generated_text"] def generate(self, prompt: str, **attrs: t.Any) -> list[dict[t.Literal["generated_text"], str]]: llm_config = self.config.model_construct_env(**attrs) with torch.inference_mode(): return self.model(prompt, return_full_text=llm_config.return_full_text, generation_config=llm_config.to_generation_config()) diff --git a/openllm-python/src/openllm/models/dolly_v2/modeling_vllm_dolly_v2.py b/openllm-python/src/openllm/models/dolly_v2/modeling_vllm_dolly_v2.py index 3694ae08..9f916dfa 100644 --- a/openllm-python/src/openllm/models/dolly_v2/modeling_vllm_dolly_v2.py +++ b/openllm-python/src/openllm/models/dolly_v2/modeling_vllm_dolly_v2.py @@ -1,11 +1,8 @@ from __future__ import annotations import logging, typing as t, openllm -from openllm._prompt import process_prompt -from .configuration_dolly_v2 import DEFAULT_PROMPT_TEMPLATE if t.TYPE_CHECKING: import vllm, transformers logger = logging.getLogger(__name__) class VLLMDollyV2(openllm.LLM["vllm.LLMEngine", "transformers.PreTrainedTokenizer"]): __openllm_internal__ = True tokenizer_id = "local" - def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_k: int | None = None, top_p: float | None = None, use_default_prompt_template: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "top_k": top_k, "top_p": top_p, "temperature": temperature, **attrs}, {} diff --git a/openllm-python/src/openllm/models/falcon/__init__.py b/openllm-python/src/openllm/models/falcon/__init__.py index 0d6f3138..2e2ae8e5 100644 --- a/openllm-python/src/openllm/models/falcon/__init__.py +++ b/openllm-python/src/openllm/models/falcon/__init__.py @@ -2,14 +2,13 @@ from __future__ import annotations import sys, typing as t from openllm.exceptions import MissingDependencyError from openllm.utils import LazyModule, is_torch_available, is_vllm_available +from openllm_core.config.configuration_falcon import ( + DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE, + START_FALCON_COMMAND_DOCSTRING as START_FALCON_COMMAND_DOCSTRING, + FalconConfig as FalconConfig, +) -_import_structure: dict[str, list[str]] = {"configuration_falcon": ["FalconConfig", "START_FALCON_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"]} -if t.TYPE_CHECKING: - from .configuration_falcon import ( - DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE, - START_FALCON_COMMAND_DOCSTRING as START_FALCON_COMMAND_DOCSTRING, - FalconConfig as FalconConfig, - ) +_import_structure: dict[str, list[str]] = {} try: if not is_torch_available(): raise MissingDependencyError except MissingDependencyError: pass diff --git a/openllm-python/src/openllm/models/falcon/modeling_falcon.py b/openllm-python/src/openllm/models/falcon/modeling_falcon.py index 1351744c..533b027b 100644 --- a/openllm-python/src/openllm/models/falcon/modeling_falcon.py +++ b/openllm-python/src/openllm/models/falcon/modeling_falcon.py @@ -1,7 +1,5 @@ from __future__ import annotations import typing as t, openllm -from openllm._prompt import process_prompt -from .configuration_falcon import DEFAULT_PROMPT_TEMPLATE if t.TYPE_CHECKING: import torch, transformers else: torch, transformers = openllm.utils.LazyLoader("torch", globals(), "torch"), openllm.utils.LazyLoader("transformers", globals(), "transformers") @@ -9,8 
+7,6 @@ class Falcon(openllm.LLM["transformers.PreTrainedModel", "transformers.PreTraine __openllm_internal__ = True @property def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: return {"torch_dtype": torch.bfloat16, "device_map": "auto" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None}, {} - def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, top_k: int | None = None, num_return_sequences: int | None = None, eos_token_id: int | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "top_k": top_k, "num_return_sequences": num_return_sequences, "eos_token_id": eos_token_id, **attrs}, {} - def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str: return generation_result[0] def generate(self, prompt: str, **attrs: t.Any) -> list[str]: eos_token_id, inputs = attrs.pop("eos_token_id", self.tokenizer.eos_token_id), self.tokenizer(prompt, return_tensors="pt").to(self.device) with torch.inference_mode(), torch.autocast("cuda", dtype=torch.float16): # type: ignore[attr-defined] diff --git a/openllm-python/src/openllm/models/falcon/modeling_vllm_falcon.py b/openllm-python/src/openllm/models/falcon/modeling_vllm_falcon.py index 14a5af2e..e07cc8d1 100644 --- a/openllm-python/src/openllm/models/falcon/modeling_vllm_falcon.py +++ b/openllm-python/src/openllm/models/falcon/modeling_vllm_falcon.py @@ -1,11 +1,8 @@ from __future__ import annotations import logging, typing as t, openllm -from openllm._prompt import process_prompt -from .configuration_falcon import DEFAULT_PROMPT_TEMPLATE if t.TYPE_CHECKING: import vllm, transformers logger = logging.getLogger(__name__) class VLLMFalcon(openllm.LLM["vllm.LLMEngine", "transformers.PreTrainedTokenizerBase"]): __openllm_internal__ = True tokenizer_id = "local" - def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, top_k: int | None = None, num_return_sequences: int | None = None, eos_token_id: int | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "top_k": top_k, "num_return_sequences": num_return_sequences, "eos_token_id": eos_token_id, **attrs}, {} diff --git a/openllm-python/src/openllm/models/flan_t5/__init__.py b/openllm-python/src/openllm/models/flan_t5/__init__.py index 189bcc10..4d9a05a3 100644 --- a/openllm-python/src/openllm/models/flan_t5/__init__.py +++ b/openllm-python/src/openllm/models/flan_t5/__init__.py @@ -2,14 +2,13 @@ from __future__ import annotations import sys, typing as t from openllm.exceptions import MissingDependencyError from openllm.utils import LazyModule, is_flax_available, is_tf_available, is_torch_available +from openllm_core.config.configuration_flan_t5 import ( + DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE, + START_FLAN_T5_COMMAND_DOCSTRING as START_FLAN_T5_COMMAND_DOCSTRING, + FlanT5Config as FlanT5Config, +) -_import_structure: dict[str, list[str]] = {"configuration_flan_t5": ["FlanT5Config", "START_FLAN_T5_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"]} -if t.TYPE_CHECKING: - from .configuration_flan_t5 import ( - DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE, - 
START_FLAN_T5_COMMAND_DOCSTRING as START_FLAN_T5_COMMAND_DOCSTRING, - FlanT5Config as FlanT5Config, - ) +_import_structure: dict[str, list[str]] = {} try: if not is_torch_available(): raise MissingDependencyError except MissingDependencyError: pass diff --git a/openllm-python/src/openllm/models/flan_t5/modeling_flan_t5.py b/openllm-python/src/openllm/models/flan_t5/modeling_flan_t5.py index be61ec1f..ee75d8e3 100644 --- a/openllm-python/src/openllm/models/flan_t5/modeling_flan_t5.py +++ b/openllm-python/src/openllm/models/flan_t5/modeling_flan_t5.py @@ -1,17 +1,14 @@ from __future__ import annotations import typing as t, openllm -from openllm._prompt import process_prompt -from .configuration_flan_t5 import DEFAULT_PROMPT_TEMPLATE -if t.TYPE_CHECKING: import torch, transformers, torch.nn.functional as F -else: torch, transformers, F = openllm.utils.LazyLoader("torch", globals(), "torch"), openllm.utils.LazyLoader("transformers", globals(), "transformers"), openllm.utils.LazyLoader("F", globals(), "torch.nn.functional") +if t.TYPE_CHECKING: import transformers class FlanT5(openllm.LLM["transformers.T5ForConditionalGeneration", "transformers.T5TokenizerFast"]): __openllm_internal__ = True - def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_k: int | None = None, top_p: float | None = None, repetition_penalty: float | None = None, use_default_prompt_template: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_k": top_k, "top_p": top_p, "repetition_penalty": repetition_penalty}, {} - def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str: return generation_result[0] def generate(self, prompt: str, **attrs: t.Any) -> list[str]: + import torch with torch.inference_mode(): return self.tokenizer.batch_decode(self.model.generate(**self.tokenizer(prompt, return_tensors="pt").to(self.device), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()), skip_special_tokens=True) def embeddings(self, prompts: list[str]) -> openllm.LLMEmbeddings: + import torch, torch.nn.functional as F embeddings: list[list[float]] = [] num_tokens = 0 for prompt in prompts: diff --git a/openllm-python/src/openllm/models/flan_t5/modeling_flax_flan_t5.py b/openllm-python/src/openllm/models/flan_t5/modeling_flax_flan_t5.py index 537d6f27..3c4ca703 100644 --- a/openllm-python/src/openllm/models/flan_t5/modeling_flax_flan_t5.py +++ b/openllm-python/src/openllm/models/flan_t5/modeling_flax_flan_t5.py @@ -1,7 +1,7 @@ from __future__ import annotations import typing as t, openllm -from openllm._prompt import process_prompt -from .configuration_flan_t5 import DEFAULT_PROMPT_TEMPLATE +from openllm_core._prompt import process_prompt +from openllm_core.config.configuration_flan_t5 import DEFAULT_PROMPT_TEMPLATE if t.TYPE_CHECKING: import transformers class FlaxFlanT5(openllm.LLM["transformers.FlaxT5ForConditionalGeneration", "transformers.T5TokenizerFast"]): @@ -9,7 +9,6 @@ class FlaxFlanT5(openllm.LLM["transformers.FlaxT5ForConditionalGeneration", "tra def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_k: int | None = None, top_p: float | None = None, repetition_penalty: float | None = None, decoder_start_token_id: int | None = None, 
use_default_prompt_template: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: if decoder_start_token_id is None: decoder_start_token_id = 0 return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_k": top_k, "top_p": top_p, "repetition_penalty": repetition_penalty, "decoder_start_token_id": decoder_start_token_id}, {} - def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str: return generation_result[0] def generate(self, prompt: str, **attrs: t.Any) -> list[str]: # NOTE: decoder_start_token_id is extracted from https://huggingface.co/google/flan-t5-small/tree/main as it is required for encoder-decoder generation. decoder_start_token_id = attrs.pop("decoder_start_token_id", 0) diff --git a/openllm-python/src/openllm/models/flan_t5/modeling_tf_flan_t5.py b/openllm-python/src/openllm/models/flan_t5/modeling_tf_flan_t5.py index a3dfaba6..1a542d69 100644 --- a/openllm-python/src/openllm/models/flan_t5/modeling_tf_flan_t5.py +++ b/openllm-python/src/openllm/models/flan_t5/modeling_tf_flan_t5.py @@ -1,11 +1,7 @@ from __future__ import annotations import typing as t, openllm -from openllm._prompt import process_prompt -from .configuration_flan_t5 import DEFAULT_PROMPT_TEMPLATE if t.TYPE_CHECKING: import transformers class TFFlanT5(openllm.LLM["transformers.TFT5ForConditionalGeneration", "transformers.T5TokenizerFast"]): __openllm_internal__ = True - def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_k: int | None = None, top_p: float | None = None, repetition_penalty: float | None = None, use_default_prompt_template: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_k": top_k, "top_p": top_p, "repetition_penalty": repetition_penalty}, {} - def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str: return generation_result[0] def generate(self, prompt: str, **attrs: t.Any) -> list[str]: return self.tokenizer.batch_decode(self.model.generate(self.tokenizer(prompt, return_tensors="tf").input_ids, do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()), skip_special_tokens=True) diff --git a/openllm-python/src/openllm/models/gpt_neox/__init__.py b/openllm-python/src/openllm/models/gpt_neox/__init__.py index dbf164c7..7997dbd5 100644 --- a/openllm-python/src/openllm/models/gpt_neox/__init__.py +++ b/openllm-python/src/openllm/models/gpt_neox/__init__.py @@ -2,14 +2,13 @@ from __future__ import annotations import sys, typing as t from openllm.exceptions import MissingDependencyError from openllm.utils import LazyModule, is_torch_available, is_vllm_available +from openllm_core.config.configuration_gpt_neox import ( + DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE, + START_GPT_NEOX_COMMAND_DOCSTRING as START_GPT_NEOX_COMMAND_DOCSTRING, + GPTNeoXConfig as GPTNeoXConfig, +) -_import_structure: dict[str, list[str]] = {"configuration_gpt_neox": ["GPTNeoXConfig", "START_GPT_NEOX_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"]} -if t.TYPE_CHECKING: - from .configuration_gpt_neox import ( - DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE, - START_GPT_NEOX_COMMAND_DOCSTRING as 
START_GPT_NEOX_COMMAND_DOCSTRING, - GPTNeoXConfig as GPTNeoXConfig, - ) +_import_structure: dict[str, list[str]] = {} try: if not is_torch_available(): raise MissingDependencyError except MissingDependencyError: pass diff --git a/openllm-python/src/openllm/models/gpt_neox/modeling_gpt_neox.py b/openllm-python/src/openllm/models/gpt_neox/modeling_gpt_neox.py index e0deff47..6016c325 100644 --- a/openllm-python/src/openllm/models/gpt_neox/modeling_gpt_neox.py +++ b/openllm-python/src/openllm/models/gpt_neox/modeling_gpt_neox.py @@ -1,20 +1,19 @@ from __future__ import annotations import logging, typing as t, openllm -from openllm._prompt import process_prompt -from .configuration_gpt_neox import DEFAULT_PROMPT_TEMPLATE -if t.TYPE_CHECKING: import torch, transformers -else: torch, transformers = openllm.utils.LazyLoader("torch", globals(), "torch"), openllm.utils.LazyLoader("transformers", globals(), "transformers") +if t.TYPE_CHECKING: import transformers logger = logging.getLogger(__name__) class GPTNeoX(openllm.LLM["transformers.GPTNeoXForCausalLM", "transformers.GPTNeoXTokenizerFast"]): __openllm_internal__ = True - def sanitize_parameters(self, prompt: str, temperature: float | None = None, max_new_tokens: int | None = None, use_default_prompt_template: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "temperature": temperature}, {} @property - def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: return {"device_map": "auto" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None}, {} - def postprocess_generate(self, prompt: str, generation_result: list[str], **_: t.Any) -> str: return generation_result[0] + def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: + import torch + return {"device_map": "auto" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None}, {} def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.GPTNeoXForCausalLM: + import transformers model = transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, *args, **attrs) if self.config.use_half_precision: model.half() return model def generate(self, prompt: str, **attrs: t.Any) -> list[str]: + import torch with torch.inference_mode(): return self.tokenizer.batch_decode(self.model.generate(self.tokenizer(prompt, return_tensors="pt").to(self.device).input_ids, do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config(), pad_token_id=self.tokenizer.eos_token_id, stopping_criteria=openllm.StoppingCriteriaList([openllm.StopOnTokens()]))) diff --git a/openllm-python/src/openllm/models/gpt_neox/modeling_vllm_gpt_neox.py b/openllm-python/src/openllm/models/gpt_neox/modeling_vllm_gpt_neox.py index 8582b575..35e45015 100644 --- a/openllm-python/src/openllm/models/gpt_neox/modeling_vllm_gpt_neox.py +++ b/openllm-python/src/openllm/models/gpt_neox/modeling_vllm_gpt_neox.py @@ -1,11 +1,7 @@ from __future__ import annotations -import typing as t, openllm, logging -from openllm._prompt import process_prompt -from .configuration_gpt_neox import DEFAULT_PROMPT_TEMPLATE +import typing as t, openllm if t.TYPE_CHECKING: import vllm, transformers -logger = logging.getLogger(__name__) class VLLMGPTNeoX(openllm.LLM["vllm.LLMEngine", "transformers.GPTNeoXTokenizerFast"]): __openllm_internal__ = True tokenizer_id = "local" - def sanitize_parameters(self, 
prompt: str, temperature: float | None = None, max_new_tokens: int | None = None, use_default_prompt_template: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "temperature": temperature}, {} diff --git a/openllm-python/src/openllm/models/llama/__init__.py b/openllm-python/src/openllm/models/llama/__init__.py index a630485a..6f2f03a3 100644 --- a/openllm-python/src/openllm/models/llama/__init__.py +++ b/openllm-python/src/openllm/models/llama/__init__.py @@ -2,15 +2,14 @@ from __future__ import annotations import sys, typing as t from openllm.exceptions import MissingDependencyError from openllm.utils import LazyModule, is_torch_available, is_vllm_available +from openllm_core.config.configuration_llama import ( + DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE, + PROMPT_MAPPING as PROMPT_MAPPING, + START_LLAMA_COMMAND_DOCSTRING as START_LLAMA_COMMAND_DOCSTRING, + LlamaConfig as LlamaConfig, +) -_import_structure: dict[str, list[str]] = {"configuration_llama": ["LlamaConfig", "START_LLAMA_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE", "PROMPT_MAPPING"]} -if t.TYPE_CHECKING: - from .configuration_llama import ( - DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE, - PROMPT_MAPPING as PROMPT_MAPPING, - START_LLAMA_COMMAND_DOCSTRING as START_LLAMA_COMMAND_DOCSTRING, - LlamaConfig as LlamaConfig, - ) +_import_structure: dict[str, list[str]] = {} try: if not is_vllm_available(): raise MissingDependencyError except MissingDependencyError: pass diff --git a/openllm-python/src/openllm/models/llama/modeling_llama.py b/openllm-python/src/openllm/models/llama/modeling_llama.py index 60226f03..24610581 100644 --- a/openllm-python/src/openllm/models/llama/modeling_llama.py +++ b/openllm-python/src/openllm/models/llama/modeling_llama.py @@ -1,17 +1,14 @@ from __future__ import annotations -import logging, typing as t, openllm -from openllm._prompt import process_prompt -from .configuration_llama import DEFAULT_PROMPT_TEMPLATE -if t.TYPE_CHECKING: import torch, transformers, torch.nn.functional as F -else: torch, transformers, F = openllm.utils.LazyLoader("torch", globals(), "torch"), openllm.utils.LazyLoader("transformers", globals(), "transformers"), openllm.utils.LazyLoader("F", globals(), "torch.nn.functional") - -logger = logging.getLogger(__name__) +import typing as t, openllm +if t.TYPE_CHECKING: import transformers class Llama(openllm.LLM["transformers.LlamaForCausalLM", "transformers.LlamaTokenizerFast"]): __openllm_internal__ = True @property - def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: return {"torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32}, {} - def sanitize_parameters(self, prompt: str, top_k: int | None = None, top_p: float | None = None, temperature: float | None = None, max_new_tokens: int | None = None, use_default_prompt_template: bool = False, use_llama2_prompt: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE("v2" if use_llama2_prompt else "v1") if use_default_prompt_template else None, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_p": top_p, "top_k": top_k}, {} + def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: + import torch + return {"torch_dtype": torch.float16 if torch.cuda.is_available() else 
torch.float32}, {} def embeddings(self, prompts: list[str]) -> openllm.LLMEmbeddings: + import torch, torch.nn.functional as F encoding = self.tokenizer(prompts, padding=True, return_tensors="pt").to(self.device) input_ids, attention_mask = encoding["input_ids"], encoding["attention_mask"] with torch.inference_mode(): diff --git a/openllm-python/src/openllm/models/llama/modeling_vllm_llama.py b/openllm-python/src/openllm/models/llama/modeling_vllm_llama.py index 11981597..22660a64 100644 --- a/openllm-python/src/openllm/models/llama/modeling_vllm_llama.py +++ b/openllm-python/src/openllm/models/llama/modeling_vllm_llama.py @@ -1,10 +1,5 @@ from __future__ import annotations -import logging, typing as t, openllm -from openllm._prompt import process_prompt -from .configuration_llama import DEFAULT_PROMPT_TEMPLATE +import typing as t, openllm if t.TYPE_CHECKING: import vllm, transformers - -logger = logging.getLogger(__name__) class VLLMLlama(openllm.LLM["vllm.LLMEngine", "transformers.LlamaTokenizerFast"]): __openllm_internal__ = True - def sanitize_parameters(self, prompt: str, top_k: int | None = None, top_p: float | None = None, temperature: float | None = None, max_new_tokens: int | None = None, use_default_prompt_template: bool = False, use_llama2_prompt: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE("v2" if use_llama2_prompt else "v1") if use_default_prompt_template else None, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_p": top_p, "top_k": top_k}, {} diff --git a/openllm-python/src/openllm/models/mpt/__init__.py b/openllm-python/src/openllm/models/mpt/__init__.py index 9f7fe1f5..004abd64 100644 --- a/openllm-python/src/openllm/models/mpt/__init__.py +++ b/openllm-python/src/openllm/models/mpt/__init__.py @@ -2,15 +2,14 @@ from __future__ import annotations import sys, typing as t from openllm.exceptions import MissingDependencyError from openllm.utils import LazyModule, is_torch_available, is_vllm_available +from openllm_core.config.configuration_mpt import ( + DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE, + PROMPT_MAPPING as PROMPT_MAPPING, + START_MPT_COMMAND_DOCSTRING as START_MPT_COMMAND_DOCSTRING, + MPTConfig as MPTConfig, +) -_import_structure: dict[str, list[str]] = {"configuration_mpt": ["MPTConfig", "START_MPT_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE", "PROMPT_MAPPING"]} -if t.TYPE_CHECKING: - from .configuration_mpt import ( - DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE, - PROMPT_MAPPING as PROMPT_MAPPING, - START_MPT_COMMAND_DOCSTRING as START_MPT_COMMAND_DOCSTRING, - MPTConfig as MPTConfig, - ) +_import_structure: dict[str, list[str]] = {} try: if not is_torch_available(): raise MissingDependencyError except MissingDependencyError: pass diff --git a/openllm-python/src/openllm/models/mpt/modeling_mpt.py b/openllm-python/src/openllm/models/mpt/modeling_mpt.py index 214cfb28..827c5e9f 100644 --- a/openllm-python/src/openllm/models/mpt/modeling_mpt.py +++ b/openllm-python/src/openllm/models/mpt/modeling_mpt.py @@ -1,14 +1,11 @@ from __future__ import annotations import logging, typing as t, bentoml, openllm -from openllm._prompt import process_prompt from openllm.utils import generate_labels, is_triton_available -from .configuration_mpt import DEFAULT_PROMPT_TEMPLATE, MPTPromptType - if t.TYPE_CHECKING: import transformers, torch -else: transformers, torch = openllm.utils.LazyLoader("transformers", globals(), 
"transformers"), openllm.utils.LazyLoader("torch", globals(), "torch") logger = logging.getLogger(__name__) def get_mpt_config(model_id_or_path: str, max_sequence_length: int, device: torch.device | str | int | None, device_map: str | None = None, trust_remote_code: bool = True) -> transformers.PretrainedConfig: + import torch config = transformers.AutoConfig.from_pretrained(model_id_or_path, trust_remote_code=trust_remote_code) if hasattr(config, "init_device") and device_map is None and isinstance(device, (str, torch.device)): config.init_device = str(device) if hasattr(config, "attn_config") and is_triton_available(): config.attn_config["attn_impl"] = "triton" @@ -18,10 +15,15 @@ def get_mpt_config(model_id_or_path: str, max_sequence_length: int, device: torc return config class MPT(openllm.LLM["transformers.PreTrainedModel", "transformers.GPTNeoXTokenizerFast"]): __openllm_internal__ = True - def llm_post_init(self) -> None: self.dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32 + def llm_post_init(self) -> None: + import torch + self.dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32 @property - def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: return {"device_map": "auto" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None, "torch_dtype": torch.bfloat16 if torch.cuda.is_available() else torch.float32}, {} + def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: + import torch + return {"device_map": "auto" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None, "torch_dtype": torch.bfloat16 if torch.cuda.is_available() else torch.float32}, {} def import_model(self, *args: t.Any, trust_remote_code: bool = True, **attrs: t.Any) -> bentoml.Model: + import torch, transformers _, tokenizer_attrs = self.llm_parameters torch_dtype = attrs.pop("torch_dtype", self.dtype) device_map = attrs.pop("device_map", None) @@ -33,6 +35,7 @@ class MPT(openllm.LLM["transformers.PreTrainedModel", "transformers.GPTNeoXToken try: return bentoml.transformers.save_model(self.tag, model, custom_objects={"tokenizer": tokenizer}, labels=generate_labels(self)) finally: torch.cuda.empty_cache() def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.PreTrainedModel: + import transformers torch_dtype = attrs.pop("torch_dtype", self.dtype) device_map = attrs.pop("device_map", None) trust_remote_code = attrs.pop("trust_remote_code", True) @@ -40,18 +43,8 @@ class MPT(openllm.LLM["transformers.PreTrainedModel", "transformers.GPTNeoXToken model = transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, config=config, trust_remote_code=trust_remote_code, torch_dtype=torch_dtype, device_map=device_map, **attrs) model.tie_weights() return model - def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_p: float | None = None, prompt_type: MPTPromptType | None = None, use_default_prompt_template: bool = True, **attrs: t.Any,) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: - _template = None - if use_default_prompt_template: - if prompt_type is None: - if "instruct" in self.model_id: prompt_type = "instruct" - elif "storywriter" in self.model_id: prompt_type = "storywriter" - elif "chat" in self.model_id: prompt_type = "chat" - else: prompt_type = "default" - _template = DEFAULT_PROMPT_TEMPLATE(prompt_type) - return process_prompt(prompt, _template, use_default_prompt_template), {"max_new_tokens": max_new_tokens, 
"temperature": temperature, "top_p": top_p}, {} - def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **attrs: t.Any) -> str: return generation_result[0] def generate(self, prompt: str, **attrs: t.Any) -> list[str]: + import torch llm_config = self.config.model_construct_env(**attrs) inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device) attrs = {"do_sample": False if llm_config["temperature"] == 0 else True, "eos_token_id": self.tokenizer.eos_token_id, "pad_token_id": self.tokenizer.pad_token_id, "generation_config": llm_config.to_generation_config()} diff --git a/openllm-python/src/openllm/models/mpt/modeling_vllm_mpt.py b/openllm-python/src/openllm/models/mpt/modeling_vllm_mpt.py index 35ba16ba..37f539c7 100644 --- a/openllm-python/src/openllm/models/mpt/modeling_vllm_mpt.py +++ b/openllm-python/src/openllm/models/mpt/modeling_vllm_mpt.py @@ -1,20 +1,6 @@ from __future__ import annotations -import logging, typing as t, openllm -from openllm._prompt import process_prompt -from .configuration_mpt import DEFAULT_PROMPT_TEMPLATE, MPTPromptType +import typing as t, openllm if t.TYPE_CHECKING: import transformers, vllm - -logger = logging.getLogger(__name__) class VLLMMPT(openllm.LLM["vllm.LLMEngine", "transformers.GPTNeoXTokenizerFast"]): __openllm_internal__ = True tokenizer_id = "local" - def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_p: float | None = None, prompt_type: MPTPromptType | None = None, use_default_prompt_template: bool = True, **attrs: t.Any,) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: - _template = None - if use_default_prompt_template: - if prompt_type is None: - if "instruct" in self.model_id: prompt_type = "instruct" - elif "storywriter" in self.model_id: prompt_type = "storywriter" - elif "chat" in self.model_id: prompt_type = "chat" - else: prompt_type = "default" - _template = DEFAULT_PROMPT_TEMPLATE(prompt_type) - return process_prompt(prompt, _template, use_default_prompt_template), {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_p": top_p}, {} diff --git a/openllm-python/src/openllm/models/opt/__init__.py b/openllm-python/src/openllm/models/opt/__init__.py index ebd225e4..87cb0b2d 100644 --- a/openllm-python/src/openllm/models/opt/__init__.py +++ b/openllm-python/src/openllm/models/opt/__init__.py @@ -2,14 +2,13 @@ from __future__ import annotations import sys, typing as t from openllm.exceptions import MissingDependencyError from openllm.utils import LazyModule, is_flax_available, is_tf_available, is_torch_available, is_vllm_available +from openllm_core.config.configuration_opt import ( + DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE, + START_OPT_COMMAND_DOCSTRING as START_OPT_COMMAND_DOCSTRING, + OPTConfig as OPTConfig, +) -_import_structure: dict[str, list[str]] = {"configuration_opt": ["OPTConfig", "START_OPT_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"]} -if t.TYPE_CHECKING: - from .configuration_opt import ( - DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE, - START_OPT_COMMAND_DOCSTRING as START_OPT_COMMAND_DOCSTRING, - OPTConfig as OPTConfig, - ) +_import_structure: dict[str, list[str]] = {} try: if not is_torch_available(): raise MissingDependencyError except MissingDependencyError: pass diff --git a/openllm-python/src/openllm/models/opt/modeling_flax_opt.py b/openllm-python/src/openllm/models/opt/modeling_flax_opt.py index 089959fd..7858b42d 100644 --- 
a/openllm-python/src/openllm/models/opt/modeling_flax_opt.py +++ b/openllm-python/src/openllm/models/opt/modeling_flax_opt.py @@ -2,7 +2,7 @@ from __future__ import annotations import logging, typing as t, bentoml, openllm from openllm._prompt import process_prompt from openllm.utils import generate_labels -from .configuration_opt import DEFAULT_PROMPT_TEMPLATE +from openllm_core.config.configuration_opt import DEFAULT_PROMPT_TEMPLATE if t.TYPE_CHECKING: import transformers else: transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers") @@ -14,8 +14,4 @@ class FlaxOPT(openllm.LLM["transformers.TFOPTForCausalLM", "transformers.GPT2Tok tokenizer.pad_token_id = config.pad_token_id return bentoml.transformers.save_model(self.tag, transformers.FlaxAutoModelForCausalLM.from_pretrained(self.model_id, **attrs), custom_objects={"tokenizer": tokenizer}, labels=generate_labels(self)) def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_k: int | None = None, num_return_sequences: int | None = None, repetition_penalty: float | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_k": top_k, "num_return_sequences": num_return_sequences, "repetition_penalty": repetition_penalty}, {} - def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **attrs: t.Any) -> str: - if len(generation_result) == 1: return generation_result[0] - if self.config.format_outputs: return "Generated result:\n" + "\n -".join(generation_result) - else: return "\n".join(generation_result) def generate(self, prompt: str, **attrs: t.Any) -> list[str]: return self.tokenizer.batch_decode(self.model.generate(**self.tokenizer(prompt, return_tensors="np"), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()).sequences, skip_special_tokens=True) diff --git a/openllm-python/src/openllm/models/opt/modeling_opt.py b/openllm-python/src/openllm/models/opt/modeling_opt.py index 9c9456e9..32b27713 100644 --- a/openllm-python/src/openllm/models/opt/modeling_opt.py +++ b/openllm-python/src/openllm/models/opt/modeling_opt.py @@ -1,19 +1,14 @@ from __future__ import annotations import logging, typing as t, openllm -from openllm._prompt import process_prompt -from .configuration_opt import DEFAULT_PROMPT_TEMPLATE -if t.TYPE_CHECKING: import torch, transformers -else: torch, transformers = openllm.utils.LazyLoader("torch", globals(), "torch"), openllm.utils.LazyLoader("transformers", globals(), "transformers") +if t.TYPE_CHECKING: import transformers logger = logging.getLogger(__name__) class OPT(openllm.LLM["transformers.OPTForCausalLM", "transformers.GPT2Tokenizer"]): __openllm_internal__ = True @property - def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: return {"torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32}, {} - def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_k: int | None = None, num_return_sequences: int | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": 
max_new_tokens, "temperature": temperature, "top_k": top_k, "num_return_sequences": num_return_sequences}, {} - def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **attrs: t.Any) -> str: - if len(generation_result) == 1: return generation_result[0] - if self.config.format_outputs: return "Generated result:\n" + "\n -".join(generation_result) - else: return "\n".join(generation_result) + def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: + import torch + return {"torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32}, {} def generate(self, prompt: str, **attrs: t.Any) -> list[str]: + import torch with torch.inference_mode(): return self.tokenizer.batch_decode(self.model.generate(**self.tokenizer(prompt, return_tensors="pt").to(self.device), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()), skip_special_tokens=True) diff --git a/openllm-python/src/openllm/models/opt/modeling_tf_opt.py b/openllm-python/src/openllm/models/opt/modeling_tf_opt.py index a53dd871..8ab0eaa7 100644 --- a/openllm-python/src/openllm/models/opt/modeling_tf_opt.py +++ b/openllm-python/src/openllm/models/opt/modeling_tf_opt.py @@ -1,21 +1,12 @@ from __future__ import annotations -import logging, typing as t, bentoml, openllm -from openllm._prompt import process_prompt -from openllm.utils import generate_labels -from .configuration_opt import DEFAULT_PROMPT_TEMPLATE +import typing as t, bentoml, openllm +from openllm_core.utils import generate_labels if t.TYPE_CHECKING: import transformers -else: transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers") - -logger = logging.getLogger(__name__) class TFOPT(openllm.LLM["transformers.TFOPTForCausalLM", "transformers.GPT2Tokenizer"]): __openllm_internal__ = True def import_model(self, *args: t.Any, trust_remote_code: bool = False, **attrs: t.Any) -> bentoml.Model: + import transformers config, tokenizer = transformers.AutoConfig.from_pretrained(self.model_id), transformers.AutoTokenizer.from_pretrained(self.model_id, **self.llm_parameters[-1]) tokenizer.pad_token_id = config.pad_token_id return bentoml.transformers.save_model(self.tag, transformers.TFOPTForCausalLM.from_pretrained(self.model_id, trust_remote_code=trust_remote_code, **attrs), custom_objects={"tokenizer": tokenizer}, labels=generate_labels(self)) - def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_k: int | None = None, num_return_sequences: int | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_k": top_k, "num_return_sequences": num_return_sequences}, {} - def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **attrs: t.Any) -> str: - if len(generation_result) == 1: return generation_result[0] - if self.config.format_outputs: return "Generated result:\n" + "\n -".join(generation_result) - else: return "\n".join(generation_result) def generate(self, prompt: str, **attrs: t.Any) -> list[str]: return self.tokenizer.batch_decode(self.model.generate(**self.tokenizer(prompt, return_tensors="tf"), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()), skip_special_tokens=True) diff --git 
a/openllm-python/src/openllm/models/opt/modeling_vllm_opt.py b/openllm-python/src/openllm/models/opt/modeling_vllm_opt.py index 6591ae5e..5ca8c77d 100644 --- a/openllm-python/src/openllm/models/opt/modeling_vllm_opt.py +++ b/openllm-python/src/openllm/models/opt/modeling_vllm_opt.py @@ -1,10 +1,8 @@ from __future__ import annotations -import logging, typing as t, openllm -from openllm._prompt import process_prompt -from .configuration_opt import DEFAULT_PROMPT_TEMPLATE +import typing as t, openllm +from openllm_core._prompt import process_prompt +from openllm_core.config.configuration_opt import DEFAULT_PROMPT_TEMPLATE if t.TYPE_CHECKING: import vllm, transformers - -logger = logging.getLogger(__name__) class VLLMOPT(openllm.LLM["vllm.LLMEngine", "transformers.GPT2Tokenizer"]): __openllm_internal__ = True tokenizer_id = "local" diff --git a/openllm-python/src/openllm/models/stablelm/__init__.py b/openllm-python/src/openllm/models/stablelm/__init__.py index 2927aaec..3f7f1258 100644 --- a/openllm-python/src/openllm/models/stablelm/__init__.py +++ b/openllm-python/src/openllm/models/stablelm/__init__.py @@ -2,14 +2,13 @@ from __future__ import annotations import sys, typing as t from openllm.exceptions import MissingDependencyError from openllm.utils import LazyModule, is_torch_available, is_vllm_available +from openllm_core.config.configuration_stablelm import ( + DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE, + START_STABLELM_COMMAND_DOCSTRING as START_STABLELM_COMMAND_DOCSTRING, + StableLMConfig as StableLMConfig, +) -_import_structure: dict[str, list[str]] = {"configuration_stablelm": ["StableLMConfig", "START_STABLELM_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"]} -if t.TYPE_CHECKING: - from .configuration_stablelm import ( - DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE, - START_STABLELM_COMMAND_DOCSTRING as START_STABLELM_COMMAND_DOCSTRING, - StableLMConfig as StableLMConfig, - ) +_import_structure: dict[str, list[str]] = {} try: if not is_torch_available(): raise MissingDependencyError except MissingDependencyError: pass diff --git a/openllm-python/src/openllm/models/stablelm/modeling_stablelm.py b/openllm-python/src/openllm/models/stablelm/modeling_stablelm.py index ddeb6cb3..77db1d0c 100644 --- a/openllm-python/src/openllm/models/stablelm/modeling_stablelm.py +++ b/openllm-python/src/openllm/models/stablelm/modeling_stablelm.py @@ -1,23 +1,15 @@ from __future__ import annotations -import logging, typing as t, openllm -from openllm._prompt import process_prompt -from .configuration_stablelm import DEFAULT_PROMPT_TEMPLATE, SYSTEM_PROMPT - -if t.TYPE_CHECKING: import transformers, torch -else: transformers, torch = openllm.utils.LazyLoader("transformers", globals(), "transformers"), openllm.utils.LazyLoader("torch", globals(), "torch") - -logger = logging.getLogger(__name__) +import typing as t, openllm +if t.TYPE_CHECKING: import transformers class StableLM(openllm.LLM["transformers.GPTNeoXForCausalLM", "transformers.GPTNeoXTokenizerFast"]): __openllm_internal__ = True - def llm_post_init(self) -> None: self.bettertransformer = True if not torch.cuda.is_available() else False + def llm_post_init(self) -> None: + import torch + self.bettertransformer = True if not torch.cuda.is_available() else False @property - def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: return {"torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32}, {} - def sanitize_parameters(self, prompt: str, temperature: float | None = None, max_new_tokens: int | None 
= None, top_k: int | None = None, top_p: float | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: - if "tuned" in self._model_id and use_default_prompt_template: - system_prompt = attrs.pop("system_prompt", SYSTEM_PROMPT) - prompt_text = process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, system_prompt=system_prompt, **attrs) - else: prompt_text = prompt - return prompt_text, {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_k": top_k, "top_p": top_p}, {} - def postprocess_generate(self, prompt: str, generation_result: list[str], **_: t.Any) -> str: return generation_result[0] + def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: + import torch + return {"torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32}, {} def generate(self, prompt: str, **attrs: t.Any) -> list[str]: + import torch with torch.inference_mode(): return [self.tokenizer.decode(self.model.generate(**self.tokenizer(prompt, return_tensors="pt").to(self.device), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config(), pad_token_id=self.tokenizer.eos_token_id, stopping_criteria=openllm.StoppingCriteriaList([openllm.StopOnTokens()]))[0], skip_special_tokens=True)] diff --git a/openllm-python/src/openllm/models/stablelm/modeling_vllm_stablelm.py b/openllm-python/src/openllm/models/stablelm/modeling_vllm_stablelm.py index 56eddfe5..f35fd071 100644 --- a/openllm-python/src/openllm/models/stablelm/modeling_vllm_stablelm.py +++ b/openllm-python/src/openllm/models/stablelm/modeling_vllm_stablelm.py @@ -1,16 +1,6 @@ from __future__ import annotations import logging, typing as t, openllm -from openllm._prompt import process_prompt -from .configuration_stablelm import DEFAULT_PROMPT_TEMPLATE, SYSTEM_PROMPT if t.TYPE_CHECKING: import vllm, transformers - -logger = logging.getLogger(__name__) class VLLMStableLM(openllm.LLM["vllm.LLMEngine", "transformers.GPTNeoXTokenizerFast"]): __openllm_internal__ = True tokenizer_id = "local" - def sanitize_parameters(self, prompt: str, temperature: float | None = None, max_new_tokens: int | None = None, top_k: int | None = None, top_p: float | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: - if "tuned" in self._model_id and use_default_prompt_template: - system_prompt = attrs.pop("system_prompt", SYSTEM_PROMPT) - prompt_text = process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, system_prompt=system_prompt, **attrs) - else: prompt_text = prompt - return prompt_text, {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_k": top_k, "top_p": top_p}, {} diff --git a/openllm-python/src/openllm/models/starcoder/__init__.py b/openllm-python/src/openllm/models/starcoder/__init__.py index 6cc2c524..f6e68aff 100644 --- a/openllm-python/src/openllm/models/starcoder/__init__.py +++ b/openllm-python/src/openllm/models/starcoder/__init__.py @@ -2,14 +2,13 @@ from __future__ import annotations import sys, typing as t from openllm.exceptions import MissingDependencyError from openllm.utils import LazyModule, is_torch_available, is_vllm_available +from openllm_core.config.configuration_starcoder import ( + DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE, + START_STARCODER_COMMAND_DOCSTRING as START_STARCODER_COMMAND_DOCSTRING, + StarCoderConfig as StarCoderConfig, +) -_import_structure: dict[str, 
list[str]] = {"configuration_starcoder": ["StarCoderConfig", "START_STARCODER_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"]} -if t.TYPE_CHECKING: - from .configuration_starcoder import ( - DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE, - START_STARCODER_COMMAND_DOCSTRING as START_STARCODER_COMMAND_DOCSTRING, - StarCoderConfig as StarCoderConfig, - ) +_import_structure: dict[str, list[str]] = {} try: if not is_torch_available(): raise MissingDependencyError except MissingDependencyError: pass diff --git a/openllm-python/src/openllm/models/starcoder/modeling_starcoder.py b/openllm-python/src/openllm/models/starcoder/modeling_starcoder.py index 4abb20ab..83e7fc17 100644 --- a/openllm-python/src/openllm/models/starcoder/modeling_starcoder.py +++ b/openllm-python/src/openllm/models/starcoder/modeling_starcoder.py @@ -1,34 +1,24 @@ from __future__ import annotations import logging, typing as t, bentoml, openllm from openllm.utils import generate_labels -from .configuration_starcoder import EOD, FIM_INDICATOR, FIM_MIDDLE, FIM_PAD, FIM_PREFIX, FIM_SUFFIX -if t.TYPE_CHECKING: import torch, transformers -else: torch, transformers = openllm.utils.LazyLoader("torch", globals(), "torch"), openllm.utils.LazyLoader("transformers", globals(), "transformers") - -logger = logging.getLogger(__name__) +from openllm_core.config.configuration_starcoder import EOD, FIM_MIDDLE, FIM_PAD, FIM_PREFIX, FIM_SUFFIX +if t.TYPE_CHECKING: import transformers class StarCoder(openllm.LLM["transformers.GPTBigCodeForCausalLM", "transformers.GPT2TokenizerFast"]): __openllm_internal__ = True @property - def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: return {"device_map": "auto" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None, "torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32}, {} + def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: + import torch + return {"device_map": "auto" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None, "torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32}, {} def import_model(self, *args: t.Any, trust_remote_code: bool = False, **attrs: t.Any) -> bentoml.Model: + import torch, transformers torch_dtype, device_map = attrs.pop("torch_dtype", torch.float16), attrs.pop("device_map", "auto") tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_id, **self.llm_parameters[-1]) tokenizer.add_special_tokens({"additional_special_tokens": [EOD, FIM_PREFIX, FIM_MIDDLE, FIM_SUFFIX, FIM_PAD], "pad_token": EOD}) model = transformers.AutoModelForCausalLM.from_pretrained(self.model_id, torch_dtype=torch_dtype, device_map=device_map, **attrs) try: return bentoml.transformers.save_model(self.tag, model, custom_objects={"tokenizer": tokenizer}, labels=generate_labels(self)) finally: torch.cuda.empty_cache() - def sanitize_parameters(self, prompt: str, temperature: float | None = None, top_p: float | None = None, max_new_tokens: int | None = None, repetition_penalty: float | None = None, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: - fim_mode, prefix, suffix = FIM_INDICATOR in prompt, None, None - if fim_mode: - try: prefix, suffix = prompt.split(FIM_INDICATOR) - except Exception as err: raise ValueError(f"Only one {FIM_INDICATOR} allowed in prompt") from err - prompt_text = f"{FIM_PREFIX}{prefix}{FIM_SUFFIX}{suffix}{FIM_MIDDLE}" - else: prompt_text = prompt - # XXX: This value for pad_token_id is currently a hack, need more investigate why the - 
# default starcoder doesn't include the same value as santacoder EOD - return prompt_text, {"temperature": temperature, "top_p": top_p, "max_new_tokens": max_new_tokens, "repetition_penalty": repetition_penalty, "pad_token_id": 49152, **attrs}, {} - def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str: return generation_result[0] def generate(self, prompt: str, **attrs: t.Any) -> list[str]: + import torch with torch.inference_mode(): # eos_token_id=self.tokenizer.convert_tokens_to_ids("<|end|>"), # NOTE: this is for finetuning starcoder # NOTE: support fine-tuning starcoder diff --git a/openllm-python/src/openllm/models/starcoder/modeling_vllm_starcoder.py b/openllm-python/src/openllm/models/starcoder/modeling_vllm_starcoder.py index b54aa63d..f3145546 100644 --- a/openllm-python/src/openllm/models/starcoder/modeling_vllm_starcoder.py +++ b/openllm-python/src/openllm/models/starcoder/modeling_vllm_starcoder.py @@ -1,19 +1,6 @@ from __future__ import annotations import logging, typing as t, openllm -from .configuration_starcoder import EOD, FIM_INDICATOR, FIM_MIDDLE, FIM_PAD, FIM_PREFIX, FIM_SUFFIX if t.TYPE_CHECKING: import vllm, transformers - -logger = logging.getLogger(__name__) class VLLMStarCoder(openllm.LLM["vllm.LLMEngine", "transformers.GPT2TokenizerFast"]): __openllm_internal__ = True tokenizer_id = "local" - def sanitize_parameters(self, prompt: str, temperature: float | None = None, top_p: float | None = None, max_new_tokens: int | None = None, repetition_penalty: float | None = None, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: - fim_mode, prefix, suffix = FIM_INDICATOR in prompt, None, None - if fim_mode: - try: prefix, suffix = prompt.split(FIM_INDICATOR) - except Exception as err: raise ValueError(f"Only one {FIM_INDICATOR} allowed in prompt") from err - prompt_text = f"{FIM_PREFIX}{prefix}{FIM_SUFFIX}{suffix}{FIM_MIDDLE}" - else: prompt_text = prompt - # XXX: This value for pad_token_id is currently a hack, need more investigate why the - # default starcoder doesn't include the same value as santacoder EOD - return prompt_text, {"temperature": temperature, "top_p": top_p, "max_new_tokens": max_new_tokens, "repetition_penalty": repetition_penalty, "pad_token_id": 49152, **attrs}, {} diff --git a/openllm-python/src/openllm/serialisation/__init__.py b/openllm-python/src/openllm/serialisation/__init__.py index 4adadbe2..226f4c43 100644 --- a/openllm-python/src/openllm/serialisation/__init__.py +++ b/openllm-python/src/openllm/serialisation/__init__.py @@ -26,7 +26,7 @@ from __future__ import annotations import importlib, typing as t import cloudpickle, fs, openllm from bentoml._internal.models.model import CUSTOM_OBJECTS_FILENAME -from openllm._typing_compat import M, T, ParamSpec +from openllm_core._typing_compat import M, T, ParamSpec if t.TYPE_CHECKING: import bentoml diff --git a/openllm-python/src/openllm/serialisation/ggml.py b/openllm-python/src/openllm/serialisation/ggml.py index c7dc1ffb..33aa6754 100644 --- a/openllm-python/src/openllm/serialisation/ggml.py +++ b/openllm-python/src/openllm/serialisation/ggml.py @@ -6,7 +6,7 @@ from __future__ import annotations import typing as t import bentoml, openllm -if t.TYPE_CHECKING: from openllm._typing_compat import M +if t.TYPE_CHECKING: from openllm_core._typing_compat import M _conversion_strategy = {"pt": "ggml"} diff --git a/openllm-python/src/openllm/serialisation/transformers/__init__.py 
b/openllm-python/src/openllm/serialisation/transformers/__init__.py index 87afefff..84d5d403 100644 --- a/openllm-python/src/openllm/serialisation/transformers/__init__.py +++ b/openllm-python/src/openllm/serialisation/transformers/__init__.py @@ -23,7 +23,7 @@ if t.TYPE_CHECKING: import torch.nn from bentoml._internal.models import ModelStore - from openllm._typing_compat import DictStrAny, M, T + from openllm_core._typing_compat import DictStrAny, M, T else: vllm = openllm.utils.LazyLoader("vllm", globals(), "vllm") autogptq = openllm.utils.LazyLoader("autogptq", globals(), "auto_gptq") diff --git a/openllm-python/src/openllm/serialisation/transformers/_helpers.py b/openllm-python/src/openllm/serialisation/transformers/_helpers.py index 92f0d4a3..9251c2e3 100644 --- a/openllm-python/src/openllm/serialisation/transformers/_helpers.py +++ b/openllm-python/src/openllm/serialisation/transformers/_helpers.py @@ -1,5 +1,5 @@ from __future__ import annotations -import copy, typing as t, openllm +import copy, typing as t, openllm_core, openllm from bentoml._internal.models.model import ModelInfo, ModelSignature from openllm.serialisation.constants import FRAMEWORK_TO_AUTOCLASS_MAPPING, HUB_ATTRS @@ -7,8 +7,8 @@ if t.TYPE_CHECKING: import torch, transformers, bentoml from transformers.models.auto.auto_factory import _BaseAutoModelClass from bentoml._internal.models.model import ModelSignaturesType - from openllm._typing_compat import DictStrAny, M, T -else: transformers, torch = openllm.utils.LazyLoader("transformers", globals(), "transformers"), openllm.utils.LazyLoader("torch", globals(), "torch") + from openllm_core._typing_compat import DictStrAny, M, T +else: transformers, torch = openllm_core.utils.LazyLoader("transformers", globals(), "transformers"), openllm_core.utils.LazyLoader("torch", globals(), "torch") _object_setattr = object.__setattr__ @@ -33,7 +33,7 @@ def process_config(model_id: str, trust_remote_code: bool, **attrs: t.Any) -> tu return config, hub_attrs, attrs def infer_tokenizers_from_llm(__llm: openllm.LLM[t.Any, T], /) -> T: - __cls = getattr(transformers, openllm.utils.first_not_none(__llm.config["tokenizer_class"], default="AutoTokenizer"), None) + __cls = getattr(transformers, openllm_core.utils.first_not_none(__llm.config["tokenizer_class"], default="AutoTokenizer"), None) if __cls is None: raise ValueError(f"Cannot infer correct tokenizer class for {__llm}. 
Make sure to unset `tokenizer_class`") return __cls diff --git a/openllm-python/src/openllm/serialisation/transformers/weights.py b/openllm-python/src/openllm/serialisation/transformers/weights.py index ee5372e2..f6017893 100644 --- a/openllm-python/src/openllm/serialisation/transformers/weights.py +++ b/openllm-python/src/openllm/serialisation/transformers/weights.py @@ -3,7 +3,7 @@ import typing as t, attr from huggingface_hub import HfApi if t.TYPE_CHECKING: import openllm - from openllm._typing_compat import M, T + from openllm_core._typing_compat import M, T def has_safetensors_weights(model_id: str, revision: str | None = None) -> bool: return any(s.rfilename.endswith(".safetensors") for s in HfApi().model_info(model_id, revision=revision).siblings) @attr.define(slots=True) diff --git a/openllm-python/src/openllm/utils/__init__.py b/openllm-python/src/openllm/utils/__init__.py index 147a1b16..414baea8 100644 --- a/openllm-python/src/openllm/utils/__init__.py +++ b/openllm-python/src/openllm/utils/__init__.py @@ -4,244 +4,19 @@ User can import these function for convenience, but we won't ensure backward compatibility for these functions. So use with caution. """ from __future__ import annotations -import contextlib, functools, hashlib, logging, logging.config, os, sys, types, typing as t, openllm -from pathlib import Path -from circus.exc import ConflictError -from bentoml._internal.configuration import ( - DEBUG_ENV_VAR as DEBUG_ENV_VAR, - GRPC_DEBUG_ENV_VAR as _GRPC_DEBUG_ENV_VAR, - QUIET_ENV_VAR as QUIET_ENV_VAR, - get_debug_mode as _get_debug_mode, - get_quiet_mode as _get_quiet_mode, - set_quiet_mode as set_quiet_mode, -) -from bentoml._internal.models.model import ModelContext as _ModelContext -from bentoml._internal.types import LazyType as LazyType -from bentoml._internal.utils import ( - LazyLoader as LazyLoader, - bentoml_cattr as bentoml_cattr, - calc_dir_size as calc_dir_size, - first_not_none as first_not_none, - pkg as pkg, - reserve_free_port as reserve_free_port, - resolve_user_filepath as resolve_user_filepath, -) -from openllm.utils.lazy import ( - LazyModule as LazyModule, - VersionInfo as VersionInfo, +import typing as t, openllm_core +from . import ( + dummy_flax_objects as dummy_flax_objects, + dummy_pt_objects as dummy_pt_objects, + dummy_tf_objects as dummy_tf_objects, + dummy_vllm_objects as dummy_vllm_objects, ) if t.TYPE_CHECKING: - from openllm._typing_compat import AnyCallable, LiteralRuntime - -logger = logging.getLogger(__name__) -try: from typing import GenericAlias as _TypingGenericAlias # type: ignore -except ImportError: _TypingGenericAlias = () # type: ignore # python < 3.9 does not have GenericAlias (list[int], tuple[str, ...] and so on) -if sys.version_info < (3, 10): _WithArgsTypes = (_TypingGenericAlias,) -else: _WithArgsTypes: t.Any = (t._GenericAlias, types.GenericAlias, types.UnionType) # type: ignore # _GenericAlias is the actual GenericAlias implementation - -DEV_DEBUG_VAR = "OPENLLMDEVDEBUG" - -def set_debug_mode(enabled: bool, level: int = 1) -> None: - # monkeypatch bentoml._internal.configuration.set_debug_mode to remove unused logs - if enabled: os.environ[DEV_DEBUG_VAR] = str(level) - os.environ[DEBUG_ENV_VAR] = str(enabled) - os.environ[_GRPC_DEBUG_ENV_VAR] = "DEBUG" if enabled else "ERROR" - -def lenient_issubclass(cls: t.Any, class_or_tuple: type[t.Any] | tuple[type[t.Any], ...] 
| None) -> bool: - try: - return isinstance(cls, type) and issubclass(cls, class_or_tuple) # type: ignore[arg-type] - except TypeError: - if isinstance(cls, _WithArgsTypes): return False - raise - -def available_devices() -> tuple[str, ...]: - """Return available GPU under system. Currently only supports NVIDIA GPUs.""" - from openllm._strategies import NvidiaGpuResource - return tuple(NvidiaGpuResource.from_system()) - -@functools.lru_cache(maxsize=128) -def generate_hash_from_file(f: str, algorithm: t.Literal["md5", "sha1"] = "sha1") -> str: - """Generate a hash from given file's modification time. - - Args: - f: The file to generate the hash from. - algorithm: The hashing algorithm to use. Defaults to 'sha1' (similar to how Git generate its commit hash.) - - Returns: - The generated hash. - """ - return getattr(hashlib, algorithm)(str(os.path.getmtime(resolve_filepath(f))).encode()).hexdigest() - -@functools.lru_cache(maxsize=1) -def device_count() -> int: return len(available_devices()) - -# equivocal setattr to save one lookup per assignment -_object_setattr = object.__setattr__ - -def non_intrusive_setattr(obj: t.Any, name: str, value: t.Any) -> None: - """This makes sure that we don't overwrite any existing attributes on the object.""" - _setattr = functools.partial(setattr, obj) if isinstance(obj, type) else _object_setattr.__get__(obj) - if not hasattr(obj, name): _setattr(name, value) - -def field_env_key(model_name: str, key: str, suffix: str | None = None) -> str: return "_".join(filter(None, map(str.upper, ["OPENLLM", model_name, suffix.strip("_") if suffix else "", key]))) - -# Special debug flag controled via OPENLLMDEVDEBUG -DEBUG: bool = sys.flags.dev_mode or (not sys.flags.ignore_environment and bool(os.environ.get(DEV_DEBUG_VAR))) -# MYPY is like t.TYPE_CHECKING, but reserved for Mypy plugins -MYPY = False -SHOW_CODEGEN: bool = DEBUG and int(os.environ.get("OPENLLMDEVDEBUG", str(0))) > 3 - -def get_debug_mode() -> bool: return DEBUG or _get_debug_mode() -def get_quiet_mode() -> bool: return not DEBUG and _get_quiet_mode() - -class ExceptionFilter(logging.Filter): - def __init__(self, exclude_exceptions: list[type[Exception]] | None = None, **kwargs: t.Any): - """A filter of all exception.""" - if exclude_exceptions is None: exclude_exceptions = [ConflictError] - if ConflictError not in exclude_exceptions: exclude_exceptions.append(ConflictError) - super(ExceptionFilter, self).__init__(**kwargs) - self.EXCLUDE_EXCEPTIONS = exclude_exceptions - - def filter(self, record: logging.LogRecord) -> bool: - if record.exc_info: - etype, _, _ = record.exc_info - if etype is not None: - for exc in self.EXCLUDE_EXCEPTIONS: - if issubclass(etype, exc): return False - return True - -class InfoFilter(logging.Filter): - def filter(self, record: logging.LogRecord) -> bool: return logging.INFO <= record.levelno < logging.WARNING - -_LOGGING_CONFIG: dict[str, t.Any] = { - "version": 1, "disable_existing_loggers": True, - "filters": {"excfilter": {"()": "openllm.utils.ExceptionFilter"}, "infofilter": {"()": "openllm.utils.InfoFilter"}}, - "handlers": {"bentomlhandler": {"class": "logging.StreamHandler", "filters": ["excfilter", "infofilter"], "stream": "ext://sys.stdout"}, "defaulthandler": {"class": "logging.StreamHandler", "level": logging.WARNING}}, - "loggers": {"bentoml": {"handlers": ["bentomlhandler", "defaulthandler"], "level": logging.INFO, "propagate": False}, "openllm": {"handlers": ["bentomlhandler", "defaulthandler"], "level": logging.INFO, "propagate": False,}}, - "root": 
{"level": logging.WARNING}, -} - -def configure_logging() -> None: - """Configure logging for OpenLLM. - - Behaves similar to how BentoML loggers are being configured. - """ - if get_quiet_mode(): - _LOGGING_CONFIG["loggers"]["openllm"]["level"] = logging.ERROR - _LOGGING_CONFIG["loggers"]["bentoml"]["level"] = logging.ERROR - _LOGGING_CONFIG["root"]["level"] = logging.ERROR - elif get_debug_mode() or DEBUG: - _LOGGING_CONFIG["loggers"]["openllm"]["level"] = logging.DEBUG - _LOGGING_CONFIG["loggers"]["bentoml"]["level"] = logging.DEBUG - _LOGGING_CONFIG["root"]["level"] = logging.DEBUG - else: - _LOGGING_CONFIG["loggers"]["openllm"]["level"] = logging.INFO - _LOGGING_CONFIG["loggers"]["bentoml"]["level"] = logging.INFO - _LOGGING_CONFIG["root"]["level"] = logging.INFO - - logging.config.dictConfig(_LOGGING_CONFIG) - -@functools.lru_cache(maxsize=1) -def in_notebook() -> bool: - try: - from IPython.core.getipython import get_ipython - if t.TYPE_CHECKING: - from IPython.core.interactiveshell import InteractiveShell - return "IPKernelApp" in t.cast("dict[str, t.Any]", t.cast(t.Callable[[], "InteractiveShell"], get_ipython)().config) - except (ImportError, AttributeError): return False - -_dockerenv, _cgroup = Path("/.dockerenv"), Path("/proc/self/cgroup") - -class suppress(contextlib.suppress, contextlib.ContextDecorator): - """A version of contextlib.suppress with decorator support. - - >>> @suppress(KeyError) - ... def key_error(): - ... {}[''] - >>> key_error() - """ - -def compose(*funcs: AnyCallable) -> AnyCallable: - """Compose any number of unary functions into a single unary function. - - >>> import textwrap - >>> expected = str.strip(textwrap.dedent(compose.__doc__)) - >>> strip_and_dedent = compose(str.strip, textwrap.dedent) - >>> strip_and_dedent(compose.__doc__) == expected - True - - Compose also allows the innermost function to take arbitrary arguments. - - >>> round_three = lambda x: round(x, ndigits=3) - >>> f = compose(round_three, int.__truediv__) - >>> [f(3*x, x+1) for x in range(1,10)] - [1.5, 2.0, 2.25, 2.4, 2.5, 2.571, 2.625, 2.667, 2.7] - """ - def compose_two(f1: AnyCallable, f2: AnyCallable) -> AnyCallable: return lambda *args, **kwargs: f1(f2(*args, **kwargs)) - return functools.reduce(compose_two, funcs) - -def apply(transform: AnyCallable) -> t.Callable[[AnyCallable], AnyCallable]: - """Decorate a function with a transform function that is invoked on results returned from the decorated function. - - ```python - @apply(reversed) - def get_numbers(start): - "doc for get_numbers" - return range(start, start+3) - list(get_numbers(4)) - # [6, 5, 4] - ``` - ```python - get_numbers.__doc__ - # 'doc for get_numbers' - ``` - """ - return lambda func: functools.wraps(func)(compose(transform, func)) - -@apply(bool) -@suppress(FileNotFoundError) -def _text_in_file(text: str, filename: Path) -> bool: - return any(text in line for line in filename.open()) - -def in_docker() -> bool: - """Is this current environment running in docker? 
- - ```python - type(in_docker()) - ``` - """ - return _dockerenv.exists() or _text_in_file("docker", _cgroup) - -T, K = t.TypeVar("T"), t.TypeVar("K") - -def resolve_filepath(path: str, ctx: str | None = None) -> str: - """Resolve a file path to an absolute path, expand user and environment variables.""" - try: return resolve_user_filepath(path, ctx) - except FileNotFoundError: return path - -def validate_is_path(maybe_path: str) -> bool: return os.path.exists(os.path.dirname(resolve_filepath(maybe_path))) - -def generate_context(framework_name: str) -> _ModelContext: - framework_versions = {"transformers": pkg.get_pkg_version("transformers")} - if openllm.utils.is_torch_available(): framework_versions["torch"] = pkg.get_pkg_version("torch") - if openllm.utils.is_tf_available(): - from bentoml._internal.frameworks.utils.tensorflow import get_tf_version - framework_versions["tensorflow"] = get_tf_version() - if openllm.utils.is_flax_available(): framework_versions.update({"flax": pkg.get_pkg_version("flax"), "jax": pkg.get_pkg_version("jax"), "jaxlib": pkg.get_pkg_version("jaxlib")}) - return _ModelContext(framework_name=framework_name, framework_versions=framework_versions) + from openllm_core._typing_compat import LiteralRuntime + import openllm def generate_labels(llm: openllm.LLM[t.Any, t.Any]) -> dict[str, t.Any]: return {"runtime": llm.runtime, "framework": "openllm", "model_name": llm.config["model_name"], "architecture": llm.config["architecture"], "serialisation_format": llm._serialisation_format} - -_TOKENIZER_PREFIX = "_tokenizer_" - -def normalize_attrs_to_model_tokenizer_pair(**attrs: t.Any) -> tuple[dict[str, t.Any], dict[str, t.Any]]: - """Normalize the given attrs to a model and tokenizer kwargs accordingly.""" - tokenizer_attrs = {k[len(_TOKENIZER_PREFIX):]: v for k, v in attrs.items() if k.startswith(_TOKENIZER_PREFIX)} - for k in tuple(attrs.keys()): - if k.startswith(_TOKENIZER_PREFIX): del attrs[k] - return attrs, tokenizer_attrs - def infer_auto_class(implementation: LiteralRuntime) -> type[openllm.AutoLLM | openllm.AutoTFLLM | openllm.AutoFlaxLLM | openllm.AutoVLLM]: import openllm if implementation == "tf": return openllm.AutoTFLLM @@ -250,62 +25,8 @@ def infer_auto_class(implementation: LiteralRuntime) -> type[openllm.AutoLLM | o elif implementation == "vllm": return openllm.AutoVLLM else: raise RuntimeError(f"Unknown implementation: {implementation} (supported: 'pt', 'flax', 'tf', 'vllm')") -# NOTE: The set marks contains a set of modules name -# that are available above and are whitelisted -# to be included in the extra_objects map. -_whitelist_modules = {"pkg"} - -# XXX: define all classes, functions import above this line -# since _extras will be the locals() import from this file. 
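The `+` lines at the end of this hunk replace the removed LazyModule/_import_structure machinery with a module-level `__getattr__` that falls back to `openllm_core.utils`, so existing imports such as `from openllm.utils import is_torch_available` keep resolving, now against the core package. Below is a minimal sketch of that PEP 562 delegation pattern; `mypkg_core` is a hypothetical stand-in for the backing package, not the real `openllm_core` API.

```python
# Sketch of delegating a package's public helpers to a backing module via PEP 562.
# `mypkg_core` is a hypothetical package name used purely for illustration.
from __future__ import annotations
import typing as t

import mypkg_core.utils  # hypothetical backing module that owns the real helpers

__all__ = ["local_helper"]

def local_helper() -> str:
    # Helpers that genuinely belong to this package stay defined here.
    return "local"

def __dir__() -> t.Sequence[str]:
    # Advertise the local names; delegated names still resolve through __getattr__.
    return sorted(__all__)

def __getattr__(name: str) -> t.Any:
    # Called only when normal module attribute lookup fails (PEP 562, Python 3.7+).
    if hasattr(mypkg_core.utils, name):
        return getattr(mypkg_core.utils, name)
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
```

Compared with a hand-maintained import map, this keeps backwards-compatible re-exports without listing every symbol twice; the trade-off is that typos in attribute names are only caught at lookup time.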
-_extras: dict[str, t.Any] = {k: v for k, v in locals().items() if k in _whitelist_modules or (not isinstance(v, types.ModuleType) and not k.startswith("_"))} -_extras["__openllm_migration__"] = {"ModelEnv": "EnvVarMixin"} -_import_structure: dict[str, list[str]] = { - "analytics": [], "codegen": [], "dantic": [], "dummy_flax_objects": [], "dummy_pt_objects": [], "dummy_tf_objects": [], "dummy_vllm_objects": [], "representation": ["ReprMixin"], "lazy": ["LazyModule"], - "import_utils": ["OPTIONAL_DEPENDENCIES", "ENV_VARS_TRUE_VALUES", "DummyMetaclass", "EnvVarMixin", "require_backends", - "is_cpm_kernels_available", "is_einops_available", "is_flax_available", "is_tf_available", "is_vllm_available", "is_torch_available", "is_bitsandbytes_available", "is_peft_available", "is_datasets_available", - "is_transformers_supports_kbit", "is_transformers_supports_agent", "is_jupyter_available", "is_jupytext_available", "is_notebook_available", "is_triton_available", "is_autogptq_available", "is_sentencepiece_available", - "is_xformers_available", "is_fairscale_available"]} - -if t.TYPE_CHECKING: - # NOTE: The following exports useful utils from bentoml - from . import ( - analytics as analytics, - codegen as codegen, - dantic as dantic, - dummy_flax_objects as dummy_flax_objects, - dummy_pt_objects as dummy_pt_objects, - dummy_tf_objects as dummy_tf_objects, - dummy_vllm_objects as dummy_vllm_objects, - ) - from .import_utils import ( - ENV_VARS_TRUE_VALUES as ENV_VARS_TRUE_VALUES, - OPTIONAL_DEPENDENCIES as OPTIONAL_DEPENDENCIES, - DummyMetaclass as DummyMetaclass, - EnvVarMixin as EnvVarMixin, - is_autogptq_available as is_autogptq_available, - is_bitsandbytes_available as is_bitsandbytes_available, - is_cpm_kernels_available as is_cpm_kernels_available, - is_datasets_available as is_datasets_available, - is_einops_available as is_einops_available, - is_fairscale_available as is_fairscale_available, - is_flax_available as is_flax_available, - is_jupyter_available as is_jupyter_available, - is_jupytext_available as is_jupytext_available, - is_notebook_available as is_notebook_available, - is_peft_available as is_peft_available, - is_sentencepiece_available as is_sentencepiece_available, - is_tf_available as is_tf_available, - is_torch_available as is_torch_available, - is_transformers_supports_agent as is_transformers_supports_agent, - is_transformers_supports_kbit as is_transformers_supports_kbit, - is_triton_available as is_triton_available, - is_vllm_available as is_vllm_available, - is_xformers_available as is_xformers_available, - require_backends as require_backends, - ) - from .representation import ReprMixin as ReprMixin - -__lazy = LazyModule(__name__, globals()["__file__"], _import_structure, extra_objects=_extras) -__all__ = __lazy.__all__ -__dir__ = __lazy.__dir__ -__getattr__ = __lazy.__getattr__ +__all__ = ["generate_labels", "infer_auto_class", "dummy_flax_objects", "dummy_pt_objects", "dummy_tf_objects", "dummy_vllm_objects"] +def __dir__() -> t.Sequence[str]: return sorted(__all__) +def __getattr__(it: str) -> t.Any: + if hasattr(openllm_core.utils, it): return getattr(openllm_core.utils, it) + else: raise AttributeError(f"module {__name__} has no attribute {it}") diff --git a/openllm-python/src/openllm/utils/dummy_flax_objects.py b/openllm-python/src/openllm/utils/dummy_flax_objects.py index e6e06644..92bdbe52 100644 --- a/openllm-python/src/openllm/utils/dummy_flax_objects.py +++ b/openllm-python/src/openllm/utils/dummy_flax_objects.py @@ -2,7 +2,7 @@ # To update this, run 
./tools/update-dummy.py from __future__ import annotations import typing as _t -from openllm.utils import DummyMetaclass as _DummyMetaclass, require_backends as _require_backends +from openllm_core.utils import DummyMetaclass as _DummyMetaclass, require_backends as _require_backends class FlaxFlanT5(metaclass=_DummyMetaclass): _backends=["flax"] def __init__(self,*param_decls:_t.Any,**attrs: _t.Any):_require_backends(self,["flax"]) diff --git a/openllm-python/src/openllm/utils/dummy_pt_objects.py b/openllm-python/src/openllm/utils/dummy_pt_objects.py index b76d58cb..4242f0b9 100644 --- a/openllm-python/src/openllm/utils/dummy_pt_objects.py +++ b/openllm-python/src/openllm/utils/dummy_pt_objects.py @@ -2,7 +2,7 @@ # To update this, run ./tools/update-dummy.py from __future__ import annotations import typing as _t -from openllm.utils import DummyMetaclass as _DummyMetaclass, require_backends as _require_backends +from openllm_core.utils import DummyMetaclass as _DummyMetaclass, require_backends as _require_backends class ChatGLM(metaclass=_DummyMetaclass): _backends=["torch","cpm_kernels","sentencepiece"] def __init__(self,*param_decls:_t.Any,**attrs: _t.Any):_require_backends(self,["torch","cpm_kernels","sentencepiece"]) diff --git a/openllm-python/src/openllm/utils/dummy_tf_objects.py b/openllm-python/src/openllm/utils/dummy_tf_objects.py index f1a4e8c0..8e05b101 100644 --- a/openllm-python/src/openllm/utils/dummy_tf_objects.py +++ b/openllm-python/src/openllm/utils/dummy_tf_objects.py @@ -2,7 +2,7 @@ # To update this, run ./tools/update-dummy.py from __future__ import annotations import typing as _t -from openllm.utils import DummyMetaclass as _DummyMetaclass, require_backends as _require_backends +from openllm_core.utils import DummyMetaclass as _DummyMetaclass, require_backends as _require_backends class TFFlanT5(metaclass=_DummyMetaclass): _backends=["tensorflow"] def __init__(self,*param_decls:_t.Any,**attrs: _t.Any):_require_backends(self,["tensorflow"]) diff --git a/openllm-python/src/openllm/utils/dummy_vllm_objects.py b/openllm-python/src/openllm/utils/dummy_vllm_objects.py index d1ca58c8..048a4771 100644 --- a/openllm-python/src/openllm/utils/dummy_vllm_objects.py +++ b/openllm-python/src/openllm/utils/dummy_vllm_objects.py @@ -2,7 +2,7 @@ # To update this, run ./tools/update-dummy.py from __future__ import annotations import typing as _t -from openllm.utils import DummyMetaclass as _DummyMetaclass, require_backends as _require_backends +from openllm_core.utils import DummyMetaclass as _DummyMetaclass, require_backends as _require_backends class VLLMBaichuan(metaclass=_DummyMetaclass): _backends=["vllm","cpm_kernels","sentencepiece"] def __init__(self,*param_decls:_t.Any,**attrs: _t.Any):_require_backends(self,["vllm","cpm_kernels","sentencepiece"]) diff --git a/openllm-python/tests/_strategies/_configuration.py b/openllm-python/tests/_strategies/_configuration.py index fecdce38..860e5e7f 100644 --- a/openllm-python/tests/_strategies/_configuration.py +++ b/openllm-python/tests/_strategies/_configuration.py @@ -1,12 +1,8 @@ from __future__ import annotations -import logging -import typing as t - +import logging, typing as t, openllm +from openllm_core._configuration import ModelSettings from hypothesis import strategies as st -import openllm -from openllm._configuration import ModelSettings - logger = logging.getLogger(__name__) env_strats = st.sampled_from([openllm.utils.EnvVarMixin(model_name) for model_name in openllm.CONFIG_MAPPING.keys()]) diff --git 
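The `dummy_*_objects.py` modules above now import `DummyMetaclass` and `require_backends` from `openllm_core.utils`; each generated placeholder class raises a helpful error as soon as it is used without its optional backend (torch, tensorflow, flax, vllm, ...) installed. The actual implementations of those helpers are not part of this patch, so the following is only a hypothetical sketch of how such a guard is commonly written; the `find_spec`-based check is an assumption:

```python
# hypothetical backend guard; the real DummyMetaclass/require_backends live in
# openllm_core.utils and may be implemented differently
from __future__ import annotations
import importlib.util
import typing as t

def require_backends(obj: t.Any, backends: t.Iterable[str]) -> None:
  missing = [b for b in backends if importlib.util.find_spec(b) is None]
  if missing:
    name = getattr(obj, "__name__", type(obj).__name__)
    raise ImportError(f"{name} requires the following backends to be installed: {missing}")

class DummyMetaclass(type):
  _backends: t.List[str] = []
  def __getattr__(cls, item: str) -> t.Any:
    require_backends(cls, cls._backends)  # surface the missing backend before anything else
    raise AttributeError(item)

# a generated placeholder then looks like the classes in the hunks above:
class FlaxFlanT5(metaclass=DummyMetaclass):
  _backends = ["flax"]
  def __init__(self, *param_decls: t.Any, **attrs: t.Any):
    require_backends(self, ["flax"])
```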
a/openllm-python/tests/compiled_test.py b/openllm-python/tests/compiled_test.py deleted file mode 100644 index d2970ecd..00000000 --- a/openllm-python/tests/compiled_test.py +++ /dev/null @@ -1,11 +0,0 @@ -from __future__ import annotations - -import pytest - -import openllm - -@pytest.mark.parametrize("name", ["AutoConfig", "AutoLLM", "AutoVLLM", "AutoFlaxLLM", "AutoTFLLM", "LLM", "LLMRunner", "LLMRunnable", "LLMEmbeddings", - "Runner", "client", "exceptions", "bundle", "build", "ggml", "transformers", "import_model", "infer_auto_class", - "infer_quantisation_config", "models", "list_models", "start", "start_grpc", "build", "serialisation"]) -def test_compiled_imports(name: str): - assert getattr(openllm, name) is not None diff --git a/openllm-python/tests/configuration_test.py b/openllm-python/tests/configuration_test.py index 6d40dcfa..f7d8e7d6 100644 --- a/openllm-python/tests/configuration_test.py +++ b/openllm-python/tests/configuration_test.py @@ -1,32 +1,10 @@ from __future__ import annotations -import contextlib -import logging -import os -import sys -import typing as t +import contextlib, os, sys, typing as t, attr, pytest, transformers, openllm from unittest import mock - -import attr -import pytest -import transformers -from hypothesis import ( - assume, - given, - strategies as st, -) - -import openllm -from openllm._configuration import GenerationConfig, ModelSettings, field_env_key - +from openllm_core._configuration import GenerationConfig, ModelSettings, field_env_key +from hypothesis import assume, given, strategies as st from ._strategies._configuration import make_llm_config, model_settings -logger = logging.getLogger(__name__) - -if t.TYPE_CHECKING: - DictStrAny = dict[str, t.Any] -else: - DictStrAny = dict - # XXX: @aarnphm fixes TypedDict behaviour in 3.11 @pytest.mark.skipif(sys.version_info[:2] == (3, 11), reason="TypedDict in 3.11 behaves differently, so we need to fix this") def test_missing_default(): @@ -43,7 +21,6 @@ def test_forbidden_access(): assert pytest.raises(openllm.exceptions.ForbiddenAttributeError, cl_.__getattribute__, cl_(), "__config__",) assert pytest.raises(openllm.exceptions.ForbiddenAttributeError, cl_.__getattribute__, cl_(), "GenerationConfig",) assert pytest.raises(openllm.exceptions.ForbiddenAttributeError, cl_.__getattribute__, cl_(), "SamplingParams",) - assert openllm.utils.lenient_issubclass(cl_.__openllm_generation_class__, GenerationConfig) @given(model_settings()) @@ -133,7 +110,7 @@ def test_struct_envvar_with_overwrite_provided_env(monkeypatch: pytest.MonkeyPat assert sent.field1 == 20.0 @given(model_settings()) -@pytest.mark.parametrize(("return_dict", "typ"), [(True, DictStrAny), (False, transformers.GenerationConfig)]) +@pytest.mark.parametrize(("return_dict", "typ"), [(True, dict), (False, transformers.GenerationConfig)]) def test_conversion_to_transformers(return_dict: bool, typ: type[t.Any], gen_settings: ModelSettings): cl_ = make_llm_config("ConversionLLM", gen_settings) assert isinstance(cl_().to_generation_config(return_as_dict=return_dict), typ) diff --git a/openllm-python/tests/conftest.py b/openllm-python/tests/conftest.py index 1c04cdfb..e5106f9e 100644 --- a/openllm-python/tests/conftest.py +++ b/openllm-python/tests/conftest.py @@ -1,6 +1,6 @@ from __future__ import annotations import itertools, os, typing as t, pytest, openllm -if t.TYPE_CHECKING: from openllm._configuration import LiteralRuntime +if t.TYPE_CHECKING: from openllm_core._typing_compat import LiteralRuntime _FRAMEWORK_MAPPING = {"flan_t5": 
"google/flan-t5-small", "opt": "facebook/opt-125m", "baichuan": "baichuan-inc/Baichuan-7B",} _PROMPT_MAPPING = {"qa": "Answer the following yes/no question by reasoning step-by-step. Can you write a whole Haiku in a single tweet?",} diff --git a/openllm-python/tests/models/conftest.py b/openllm-python/tests/models/conftest.py index f5404db2..3f55000c 100644 --- a/openllm-python/tests/models/conftest.py +++ b/openllm-python/tests/models/conftest.py @@ -4,7 +4,7 @@ from abc import ABC, abstractmethod import attr, docker, docker.errors, docker.types, orjson, pytest, openllm from syrupy.extensions.json import JSONSnapshotExtension from openllm._llm import normalise_model_name -from openllm._typing_compat import DictStrAny, ListAny +from openllm_core._typing_compat import DictStrAny, ListAny logger = logging.getLogger(__name__) diff --git a/openllm-python/tests/models_test.py b/openllm-python/tests/models_test.py index 0017da89..d0b35219 100644 --- a/openllm-python/tests/models_test.py +++ b/openllm-python/tests/models_test.py @@ -1,11 +1,7 @@ from __future__ import annotations -import os -import typing as t +import os, typing as t, pytest -import pytest - -if t.TYPE_CHECKING: - import openllm +if t.TYPE_CHECKING: import openllm @pytest.mark.skipif(os.getenv("GITHUB_ACTIONS") is not None, reason="Model is too large for CI") def test_flan_t5_implementation(prompt: str, llm: openllm.LLM[t.Any, t.Any]): diff --git a/openllm-python/tests/package_test.py b/openllm-python/tests/package_test.py index 5fc6bd17..a50a8b13 100644 --- a/openllm-python/tests/package_test.py +++ b/openllm-python/tests/package_test.py @@ -1,15 +1,7 @@ from __future__ import annotations -import functools -import os -import typing as t - -import pytest - -import openllm +import functools, os, typing as t, pytest, openllm from bentoml._internal.configuration.containers import BentoMLContainer - -if t.TYPE_CHECKING: - from pathlib import Path +if t.TYPE_CHECKING: from pathlib import Path HF_INTERNAL_T5_TESTING = "hf-internal-testing/tiny-random-t5" diff --git a/openllm-python/tests/strategies_test.py b/openllm-python/tests/strategies_test.py index d9520a5a..162dbde4 100644 --- a/openllm-python/tests/strategies_test.py +++ b/openllm-python/tests/strategies_test.py @@ -1,16 +1,8 @@ from __future__ import annotations -import os -import typing as t - -import pytest - -if t.TYPE_CHECKING: - from _pytest.monkeypatch import MonkeyPatch - -import bentoml -from bentoml._internal.resource import get_resource -from openllm import _strategies as strategy -from openllm._strategies import CascadingResourceStrategy, NvidiaGpuResource +import os, typing as t, pytest, bentoml +from openllm_core import _strategies as strategy +from openllm_core._strategies import CascadingResourceStrategy, NvidiaGpuResource, get_resource +if t.TYPE_CHECKING: from _pytest.monkeypatch import MonkeyPatch def test_nvidia_gpu_resource_from_env(monkeypatch: pytest.MonkeyPatch): with monkeypatch.context() as mcls: diff --git a/package.json b/package.json index f5895535..7e29744a 100644 --- a/package.json +++ b/package.json @@ -1,5 +1,5 @@ { - "name": "openllm", + "name": "openllm-monorepo", "version": "0.2.27.dev0", "description": "OpenLLM: Operating LLMs in production", "repository": "git@github.com:bentoml/OpenLLM.git", diff --git a/pyproject.toml b/pyproject.toml index f426f9c6..a7e9b5f4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,6 @@ requires = [ "hatchling==1.18.0", "hatch-vcs==0.3.0", "hatch-fancy-pypi-readme==23.1.0", - "hatch-mypyc==0.16.0", ] 
[project] authors = [ @@ -102,6 +101,9 @@ MYPYC_OPT_LEVEL = "3" [tool.cibuildwheel.linux] before-build = ["yum install -y clang gcc"] +[tool.cibuildwheel.macos] +before-build = ["rustup target add aarch64-apple-darwin"] + [tool.cibuildwheel.linux.environment] HATCH_BUILD_HOOKS_ENABLE = "1" MYPYC_DEBUG_LEVEL = "1" @@ -167,11 +169,9 @@ extend-include = ["*.ipynb"] extend-select = [ "I", # isort "G", # flake8-logging-format - "D", # pydocstyle "W", # pycodestyle "Q", # flake8-quotes "FA", # flake8-future-annotations - "S", # flake8-bandit "TCH", # flake8-type-checking "PLW", # pylint-warning "PLR", # pylint-refactor @@ -184,10 +184,6 @@ extend-select = [ fix = true ignore = [ "FBT003", # Allow boolean positional values in function calls, like `dict.get(... True)` - "S105", # Ignore checks for possible passwords - "S106", - "S107", - "S603", # ignore subprocess.call "PLR0911", "PLR0912", "PLR0913", @@ -197,26 +193,23 @@ ignore = [ "E401", # ignore multiple line import "E702", "I001", # unsorted imports - "D103", # Just missing docstring for magic methods. - "D102", - "D101", - "D100", "TCH004", # don't move runtime import out, just warn about it "RUF012", # mutable attributes to be used with ClassVar - "D105", # magic docstring + "B905", # zip warning about strict, only applicable for 3.10+ "E701", # multiple statement on single line ] line-length = 768 -target-version = "py38" -typing-modules = ["openllm._typing_compat"] +target-version = "py312" +typing-modules = ["openllm_core._typing_compat"] unfixable = ["TCH004"] [tool.ruff.flake8-type-checking] -exempt-modules = ["typing", "typing_extensions", "openllm._typing_compat"] +exempt-modules = ["typing", "typing_extensions", "openllm_core._typing_compat"] runtime-evaluated-base-classes = [ "pydantic.BaseModel", - "openllm._configuration.LLMConfig", - "openllm._configuration.GenerationConfig", - "openllm._configuration.ModelSettings", + "openllm_core._configuration.LLMConfig", + "openllm_core._configuration.GenerationConfig", + "openllm_core._configuration.ModelSettings", + "openllm.LLMConfig", ] runtime-evaluated-decorators = ["attrs.define", "attrs.frozen", "trait"] [tool.ruff.pydocstyle] @@ -227,7 +220,7 @@ ignore-overlong-task-comments = true combine-as-imports = true force-single-line = false force-wrap-aliases = true -known-first-party = ["openllm", "bentoml"] +known-first-party = ["openllm", "bentoml", "openllm_core", "openllm_client"] known-third-party = [ "transformers", "click", @@ -245,13 +238,13 @@ required-imports = ["from __future__ import annotations"] avoid-escape = false [tool.ruff.extend-per-file-ignores] "openllm-python/src/openllm/_service.py" = ["E401"] -"openllm-python/src/openllm/cli/entrypoint.py" = ["D301"] -"openllm-python/src/openllm/client/runtimes/*" = ["D107"] -"openllm-python/src/openllm/models/**" = ["E", "D", "F"] -"openllm-python/src/openllm/utils/import_utils.py" = ["PLW0603"] -"openllm-python/src/openllm/_configuration.py" = ["F811"] -"openllm-python/tests/**/*" = ["S101", "TID252", "D", "PT011", "S307"] -"typings/**" = ["D", "F", "E"] +"openllm-python/src/openllm/models/**" = ["E", "F"] +"openllm-python/tests/**/*" = ["S101", "TID252", "PT011", "S307"] +"openllm-python/src/openllm/_llm.py" = ["F811"] +"openllm-core/src/openllm_core/utils/import_utils.py" = ["PLW0603", "F811"] +"openllm-core/src/openllm_core/_configuration.py" = ["F811"] +"openllm-client/src/openllm_client/bentoclient/_http.py" = ["PERF203"] +"typings/**" = ["F", "E"] [tool.yapf] ALIGN_CLOSING_BRACKET_WITH_VISUAL_INDENT = true @@ -312,7 
+305,7 @@ omit = [ "openllm-python/src/openllm/__init__.py", "openllm-python/src/openllm/__main__.py", "openllm-python/src/openllm/utils/dummy_*.py", - "openllm-python/src/openllm/_typing_compat.py", + "openllm-core/src/openllm_core/_typing_compat.py", ] source_pkgs = ["openllm"] [tool.coverage.report] @@ -328,6 +321,7 @@ exclude_lines = [ 'if DEBUG:', 'if utils.DEBUG', 'if openllm.utils.DEBUG', + 'if openllm_core.utils.DEBUG', '@_overload', '@overload', '@t.overload', @@ -343,7 +337,7 @@ omit = [ "openllm-python/src/openllm/__init__.py", "openllm-python/src/openllm/__main__.py", "openllm-python/src/openllm/utils/dummy_*.py", - "openllm-python/src/openllm/_typing_compat.py", + "openllm-core/src/openllm_core/_typing_compat.py", ] precision = 2 show_missing = true @@ -360,15 +354,19 @@ exclude = [ "openllm-python/src/openllm/__init__.py", "openllm-python/src/openllm/__main__.py", "openllm-python/src/openllm/utils/dummy_*.py", - "openllm-python/src/openllm/_typing_compat.py", "openllm-python/tests", + "openllm-core/src/openllm_core/_typing_compat.py", "cz.py", "hatch_build.py", "tools", "examples", "typings", ] -executionEnvironments = [{ root = "openllm-python/src" }] +executionEnvironments = [ + { root = "openllm-python/src" }, + { root = "openllm-core/src" }, + { root = "openllm-client/src" }, +] pythonVersion = "3.12" enableExperimentalFeatures = true reportMissingImports = "warning" @@ -390,10 +388,10 @@ exclude = [ "openllm-python/src/openllm/playground", "openllm-python/src/openllm/utils/dummy_*.py", "openllm-python/src/openllm/models", - "openllm-python/src/openllm/_typing_compat.py", "openllm-python/src/openllm/_service.py", + "openllm-core/src/openllm_core/_typing_compat.py", ] -modules = ["openllm"] +modules = ["openllm", "openllm-core", "openllm-client"] mypy_path = "typings" pretty = true python_version = "3.8" @@ -427,4 +425,4 @@ module = [ ] [[tool.mypy.overrides]] ignore_errors = true -module = ["openllm.models.*", "openllm.playground.*", "openllm._typing_compat"] +module = ["openllm.models.*", "openllm.playground.*", "openllm_core._typing_compat"] diff --git a/tools/dependencies.py b/tools/dependencies.py index 98270b73..dbffd404 100755 --- a/tools/dependencies.py +++ b/tools/dependencies.py @@ -74,25 +74,19 @@ _BENTOML_EXT = ["grpc", "io"] _TRANSFORMERS_EXT = ["torch", "tokenizers", "accelerate"] _BASE_DEPENDENCIES = [ - Dependencies(name="bentoml", extensions=_BENTOML_EXT, lower_constraint="1.0.25"), + Dependencies(name="bentoml", extensions=_BENTOML_EXT, lower_constraint="1.1.2"), Dependencies(name="transformers", extensions=_TRANSFORMERS_EXT, lower_constraint="4.29.0"), + Dependencies(name="openllm-client"), Dependencies(name="safetensors"), Dependencies(name="optimum"), - Dependencies(name="attrs", lower_constraint="23.1.0"), - Dependencies(name="cattrs", lower_constraint="23.1.0"), - Dependencies(name="orjson"), - Dependencies(name="inflection"), - Dependencies(name="tabulate", extensions=["widechars"], lower_constraint="0.9.0"), - Dependencies(name="httpx"), - Dependencies(name="click", lower_constraint="8.1.3"), - Dependencies(name="typing_extensions"), - Dependencies(name="mypy_extensions"), # for mypyc compilation Dependencies(name="ghapi"), + Dependencies(name="tabulate", extensions=["widechars"], lower_constraint="0.9.0"), + Dependencies(name="click", lower_constraint="8.1.3"), Dependencies(name="cuda-python", platform=("Darwin", "ne")), Dependencies(name="bitsandbytes", upper_constraint="0.42"), # 0.41 works with CUDA 11.8 ] -_ALL_RUNTIME_DEPS = ["flax", "jax", 
"jaxlib", "tensorflow", "keras"] +_ALL_RUNTIME_DEPS = ["flax>=0.7", "jax", "jaxlib", "tensorflow", "keras"] FINE_TUNE_DEPS = ["peft>=0.4.0", "bitsandbytes", "datasets", "accelerate", "trl"] FLAN_T5_DEPS = _ALL_RUNTIME_DEPS OPT_DEPS = _ALL_RUNTIME_DEPS @@ -137,10 +131,10 @@ def create_classifiers() -> Array: def create_optional_table() -> Table: all_array = tomlkit.array() - all_array.extend([f"openllm[{k}]" for k in _base_requirements]) + all_array.append(f"openllm[{','.join(_base_requirements)}]") table = tomlkit.table(is_super_table=True) - _base_requirements.update({"all": all_array.multiline(True)}) + _base_requirements.update({"full": all_array.multiline(True), "all": tomlkit.array('["openllm[full]"]')}) table.update({k: v for k, v in sorted(_base_requirements.items())}) table.add(tomlkit.nl()) @@ -163,7 +157,7 @@ def build_system() -> Table: table = tomlkit.table() table.add("build-backend", "hatchling.build") requires_array = tomlkit.array() - requires_array.extend(["hatchling==1.18.0", "hatch-vcs==0.3.0", "hatch-fancy-pypi-readme==23.1.0", "hatch-mypyc==0.16.0"]) + requires_array.extend(["hatchling==1.18.0", "hatch-vcs==0.3.0", "hatch-fancy-pypi-readme==23.1.0"]) table.add("requires", requires_array.multiline(True)) return table diff --git a/tools/update-config-stubs.py b/tools/update-config-stubs.py index dd4bad3e..fbfdc2ef 100755 --- a/tools/update-config-stubs.py +++ b/tools/update-config-stubs.py @@ -12,11 +12,11 @@ START_ATTRS_COMMENT = f"# {os.path.basename(__file__)}: attrs start\n" END_ATTRS_COMMENT = f"# {os.path.basename(__file__)}: attrs stop\n" ROOT = Path(__file__).parent.parent -_TARGET_FILE = ROOT/"openllm-python"/"src"/"openllm"/"_configuration.py" +_TARGET_FILE = ROOT/"openllm-core"/"src"/"openllm_core"/"_configuration.py" -sys.path.insert(0, (ROOT/"openllm-python"/"src").__fspath__()) -from openllm._configuration import GenerationConfig, ModelSettings, PeftType, SamplingParams -from openllm.utils import codegen +sys.path.insert(0, (ROOT/"openllm-core"/"src").__fspath__()) +from openllm_core._configuration import GenerationConfig, ModelSettings, PeftType, SamplingParams +from openllm_core.utils import codegen def process_annotations(annotations: str) -> str: if "NotRequired" in annotations: return annotations[len("NotRequired["):-1] @@ -106,8 +106,8 @@ def main() -> int: lines.append(" "*2 + "# NOTE: generation_class, sampling_class and extras arguments\n") lines.extend([ " "*2 + line for line in [ - "@overload\n", 'def __getitem__(self, item: t.Literal["generation_class"]) -> t.Type[openllm.GenerationConfig]: ...\n', - "@overload\n", 'def __getitem__(self, item: t.Literal["sampling_class"]) -> t.Type[openllm.SamplingParams]: ...\n', + "@overload\n", 'def __getitem__(self, item: t.Literal["generation_class"]) -> t.Type[openllm_core.GenerationConfig]: ...\n', + "@overload\n", 'def __getitem__(self, item: t.Literal["sampling_class"]) -> t.Type[openllm_core.SamplingParams]: ...\n', "@overload\n", 'def __getitem__(self, item: t.Literal["extras"]) -> t.Dict[str, t.Any]: ...\n', ]]) lines.append(" "*2 + "# NOTE: GenerationConfig arguments\n") diff --git a/tools/update-dummy.py b/tools/update-dummy.py index 78038ee3..3714f561 100755 --- a/tools/update-dummy.py +++ b/tools/update-dummy.py @@ -4,8 +4,9 @@ import os, typing as t, sys from pathlib import Path _ROOT = Path(__file__).parent.parent -sys.path.insert(0, (_ROOT/"openllm-python"/"src").__fspath__()) -from openllm._configuration import LiteralRuntime +sys.path.insert(0, (_ROOT/"openllm-core"/"src").__fspath__()) 
+sys.path.insert(1, (_ROOT/"openllm-python"/"src").__fspath__()) +from openllm_core._configuration import LiteralRuntime from openllm.models import auto from openllm import CONFIG_MAPPING @@ -31,7 +32,7 @@ def write_stub(framework: LiteralRuntime, _path: str) -> list[str]: f"# To update this, run ./{_path}", "from __future__ import annotations", "import typing as _t", - "from openllm.utils import DummyMetaclass as _DummyMetaclass, require_backends as _require_backends", + "from openllm_core.utils import DummyMetaclass as _DummyMetaclass, require_backends as _require_backends", ] base.extend([v for it in [make_class_stub(k, framework) for k in get_mapping(framework)] for v in it]) # autoclass diff --git a/tools/update-models-import.py b/tools/update-models-import.py index 32022db2..1f201090 100755 --- a/tools/update-models-import.py +++ b/tools/update-models-import.py @@ -7,9 +7,9 @@ _TARGET_FILE = Path(__file__).parent.parent/"openllm-python"/"src"/"openllm"/"mo def create_module_import() -> str: r = [f'"{p.name}"' for p in _TARGET_FILE.parent.glob('*/') if p.name not in ['__pycache__', '__init__.py', '.DS_Store']] - return f"_MODELS: set[str] = {{{', '.join(sorted(r))}}}" -def create_stubs_import() -> list[str]: return ["if t.TYPE_CHECKING: from . import "+",".join([f"{p.name} as {p.name}" for p in sorted(_TARGET_FILE.parent.glob("*/")) if p.name not in {"__pycache__", "__init__.py", ".DS_Store"}]), - '__lazy=LazyModule(__name__, os.path.abspath("__file__"), {k: [] for k in _MODELS})', "__all__=__lazy.__all__", "__dir__=__lazy.__dir__", "__getattr__=__lazy.__getattr__\n"] + return f"_MODELS:set[str]={{{', '.join(sorted(r))}}}" +def create_stubs_import() -> list[str]: return ["if t.TYPE_CHECKING:from . import "+",".join([f"{p.name} as {p.name}" for p in sorted(_TARGET_FILE.parent.glob("*/")) if p.name not in {"__pycache__", "__init__.py", ".DS_Store"}]), + '__lazy=LazyModule(__name__, globals()["__file__"], {k: [] for k in _MODELS})', "__all__=__lazy.__all__", "__dir__=__lazy.__dir__", "__getattr__=__lazy.__getattr__\n"] def main() -> int: _path = os.path.join(os.path.basename(os.path.dirname(__file__)), os.path.basename(__file__)) @@ -17,8 +17,8 @@ def main() -> int: f"# This file is generated by {_path}. DO NOT EDIT MANUALLY!", f"# To update this, run ./{_path}", "from __future__ import annotations", - "import typing as t, os", - "from openllm.utils import LazyModule", + "import typing as t", + "from openllm_core.utils import LazyModule", create_module_import(), *create_stubs_import(), ]))
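`tools/update-models-import.py` above regenerates the `openllm.models` package `__init__` from whatever model subpackages exist on disk: it globs the directory, emits a `_MODELS` set, `TYPE_CHECKING`-only stub imports, and the `LazyModule` wiring. A simplified sketch of that codegen idea, with the models directory path hard-coded for illustration (the real script resolves it relative to its own location):

```python
# simplified sketch of the update-models-import.py codegen shown above
from __future__ import annotations
from pathlib import Path

_MODELS_DIR = Path("openllm-python/src/openllm/models")  # illustrative path
_EXCLUDED = {"__pycache__", "__init__.py", ".DS_Store"}

def generated_init() -> str:
  names = sorted(p.name for p in _MODELS_DIR.glob("*") if p.is_dir() and p.name not in _EXCLUDED)
  lines = [
    "from __future__ import annotations",
    "import typing as t",
    "from openllm_core.utils import LazyModule",
    f"_MODELS:set[str]={{{', '.join(repr(n) for n in names)}}}",
    "if t.TYPE_CHECKING:from . import " + ",".join(f"{n} as {n}" for n in names),
    '__lazy=LazyModule(__name__, globals()["__file__"], {k: [] for k in _MODELS})',
    "__all__=__lazy.__all__",
    "__dir__=__lazy.__dir__",
    "__getattr__=__lazy.__getattr__",
  ]
  return "\n".join(lines) + "\n"

print(generated_init())
```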