refactor: packages (#249)

2026-05-19 14:16:22 -04:00 · 2023-08-22 08:55:46 -04:00
parent a964e659c1
commit 3ffb25a872
148 changed files with 2899 additions and 1937 deletions
--- a/.gitattributes
+++ b/.gitattributes
@@ -2,7 +2,7 @@
 contrib/clojure/pnpm-lock.yaml linguist-generated=true
 contrib/clojure/src/generated/** linguist-generated=true

-# Python core
+# Python sdk
 openllm-python/tests/models/__snapshots__/* linguist-generated=true
 openllm-python/src/openllm/utils/dummy_*.py linguist-generated=true
 openllm-python/src/openllm/models/__init__.py linguist-generated=true
--- a/.github/workflows/binary-releases.yml
+++ b/.github/workflows/binary-releases.yml
@@ -75,7 +75,9 @@ jobs:
      - name: Install build frontend
        run: python -m pip install --upgrade build
      - name: Build
-        run: python -m build -sw openllm-python/
+        run: |
+          bash local.sh
+          python -m build -sw openllm-python/
      - name: Upload artifacts
        uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce # ratchet:actions/upload-artifact@v3
        with:
--- a/.github/workflows/build-embedding.yml
+++ b/.github/workflows/build-embedding.yml
@@ -112,7 +112,7 @@ jobs:
      - name: Build OPT Bento with base embeddings
        id: bento-tag
        run: |
-          pip install -e ${{ github.workspace }}/openllm-python
+          bash local.sh
          openllm build opt --serialisation legacy --bento-version sha-${{ env.GITHUB_SHA_SHORT }} --machine --dockerfile-template - <<EOF
            {% extends "python_debian.j2" %}
            {% block SETUP_BENTO_BASE_IMAGE %}
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -116,7 +116,7 @@ jobs:
          bentoml-version: 'main'
          python-version-file: .python-version-default
      - name: Install self
-        run: pip install ./openllm-python
+        run: bash local.sh
      - name: Speed
        run: hyperfine -m 100 --warmup 10 openllm
  brew-dry-run:
--- a/.github/workflows/compile-pypi.yml
+++ b/.github/workflows/compile-pypi.yml
@@ -66,11 +66,15 @@ jobs:
          echo "message=$COMMIT_MSG" >> $GITHUB_OUTPUT
          echo github.ref ${{ github.ref }}
  pure-wheels-sdist:
-    name: Pure wheels and sdist distribution
+    name: Pure wheels and sdist distribution (${{ matrix.directory }})
    runs-on: ubuntu-latest
    needs: get_commit_message
    if: >-
      contains(needs.get_commit_message.outputs.message, '[wheel build]') || github.event_name == 'workflow_dispatch' || github.event_name == 'workflow_call' || (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, '02 - Wheel Build')) || (github.event_name == 'push' && (startsWith(github.ref, 'refs/tags/v') || startsWith(github.ref, 'refs/heads/main')))
+    strategy:
+      fail-fast: false
+      matrix:
+        directory: ["openllm-core", "openllm-python", "openllm-client"]
    steps:
      - uses: actions/checkout@c85c95e3d7251135ab7dc9ce3241c5835cc595a9 # ratchet:actions/checkout@v3
        with:
@@ -82,37 +86,27 @@ jobs:
          python-version-file: .python-version-default
      - name: Build
        run: hatch build
-        working-directory: openllm-python
+        working-directory: ${{ matrix.directory }}
      - name: Upload artifacts
        uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce # ratchet:actions/upload-artifact@v3
        with:
          name: python-artefacts
-          path: openllm-python/dist/*
+          path: ${{ matrix.directory }}/dist/*
          if-no-files-found: error
  mypyc:
-    name: Compiled mypyc wheels (${{ matrix.name }})
-    runs-on: ${{ matrix.os }}
-    defaults:
-      run:
-        working-directory: ./openllm-python
+    name: Compiled wheels (${{ matrix.buildplatform[1] }}-${{ matrix.directory }})
+    runs-on: ${{ matrix.buildplatform[0] }}
    strategy:
      fail-fast: false
      matrix:
-        include:
-          - os: ubuntu-latest
-            name: linux-x86_64
-          # NOTE: darwin amd64
-          - os: macos-latest
-            name: macos-x86_64
-            macos_arch: "x86_64"
-          # NOTE: darwin arm64
-          - os: macos-latest
-            name: macos-arm64
-            macos_arch: "arm64"
-          # NOTE: darwin universal2
-          - os: macos-latest
-            name: macos-universal2
-            macos_arch: "universal2"
+        # Github Actions doesn't support pairing matrix values together, let's improvise
+        # https://github.com/github/feedback/discussions/7835#discussioncomment-1769026
+        buildplatform:
+          - [ubuntu-latest, linux-x86_64, ""]
+          - [macos-latest, macos-x86_64, "x86_64"]
+          - [macos-latest, macos-arm64, "arm64"]
+          - [macos-latest, macos-universal2, "universal2"]
+        directory: ["openllm-core", "openllm-python", "openllm-client"]
    needs: get_commit_message
    if: >-
      contains(needs.get_commit_message.outputs.message, '[wheel build]') || github.event_name == 'workflow_dispatch' || github.event_name == 'workflow_call' || (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, '02 - Wheel Build')) || (github.event_name == 'push' && (startsWith(github.ref, 'refs/tags/v') || startsWith(github.ref, 'refs/heads/main')))
@@ -128,17 +122,16 @@ jobs:
      - name: Build wheels via cibuildwheel
        uses: pypa/cibuildwheel@39a63b5912f086dd459cf6fcb13dcdd3fe3bc24d # ratchet:pypa/cibuildwheel@v2.15.0
        with:
-          package-dir: openllm-python
+          package-dir: ${{ matrix.directory }}
          config-file: pyproject.toml
        env:
          CIBW_PRERELEASE_PYTHONS: True
-          CIBW_BEFORE_BUILD_MACOS: "rustup target add aarch64-apple-darwin"
-          CIBW_ARCHS_MACOS: "${{ matrix.macos_arch }}"
+          CIBW_ARCHS_MACOS: "${{ matrix.buildplatform[2] }}"
          MYPYPATH: /project/typings
      - name: Upload wheels as workflow artifacts
        uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce # ratchet:actions/upload-artifact@v3
        with:
-          name: ${{ matrix.name }}-mypyc-wheels
+          name: ${{ matrix.buildplatform[1] }}-mypyc-wheels
          path: ./wheelhouse/*.whl
  check-download-artefacts:
    name: dry-run for downloading artefacts
--- a/.gitignore
+++ b/.gitignore
@@ -141,4 +141,4 @@ pyapp
 /target

 .pdm-python
-/openllm-python/src/openllm/_version.py
+**/_version.py
--- a/DEVELOPMENT.md
+++ b/DEVELOPMENT.md
@@ -72,7 +72,7 @@ Before you can start developing, you'll need to set up your environment:
   dependencies.

 > [!NOTE]
-> If you want to install editable, make sure to install it from `openllm-python` folder
+> If you don't want to work with hatch, you can use the editable workflow by running `bash local.sh`.

 ## Project Structure

--- a/changelog.d/249.refactor.md
+++ b/changelog.d/249.refactor.md
@@ -0,0 +1,5 @@
+OpenLLM now comprises three packages:
+
+- openllm-core: the main building blocks of OpenLLM, which don't depend on transformers or other heavy DL libraries
+- openllm-client: The implementation of `openllm.client`
+- openllm: = openllm-core + openllm-client + DL features
--- a/compile.sh
+++ b/compile.sh
@@ -1,4 +1,7 @@
 #!/usr/bin/env bash
 SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &> /dev/null && pwd)
+pip install -U mypy==1.5.1 build==0.10.0
 HATCH_VERBOSE=3 MYPY_CONFIG_FILE_DIR="${SCRIPT_DIR}" HATCH_BUILD_HOOKS_ENABLE=1 MYPYPATH="${SCRIPT_DIR}/typings" python -m build openllm-python -w -C--global-option=--verbose "$@"
+HATCH_VERBOSE=3 MYPY_CONFIG_FILE_DIR="${SCRIPT_DIR}" HATCH_BUILD_HOOKS_ENABLE=1 MYPYPATH="${SCRIPT_DIR}/typings" python -m build openllm-core -w -C--global-option=--verbose "$@"
+HATCH_VERBOSE=3 MYPY_CONFIG_FILE_DIR="${SCRIPT_DIR}" HATCH_BUILD_HOOKS_ENABLE=1 MYPYPATH="${SCRIPT_DIR}/typings" python -m build openllm-client -w -C--global-option=--verbose "$@"
 hatch clean
--- a/cz.py
+++ b/cz.py
@@ -4,21 +4,25 @@ import itertools, os, token, tokenize
 from tabulate import tabulate

 TOKEN_WHITELIST = [token.OP, token.NAME, token.NUMBER, token.STRING]
-def main() -> int:
+def run_cz(dir: str, package: str):
  headers = ["Name", "Lines", "Tokens/Line"]
  table = []
-  for path, _, files in os.walk(os.path.join("openllm-python", "src", "openllm")):
+  for path, _, files in os.walk(os.path.join(dir, "src", package)):
    for name in files:
      if not name.endswith(".py"): continue
      filepath = os.path.join(path, name)
      with tokenize.open(filepath) as file_:
        tokens = [t for t in tokenize.generate_tokens(file_.readline) if t.type in TOKEN_WHITELIST]
        token_count, line_count = len(tokens), len(set([t.start[0] for t in tokens]))
-        table.append([filepath.replace(os.path.join("openllm-python","src"), ""), line_count, token_count / line_count if line_count != 0 else 0])
+        table.append([filepath.replace(os.path.join(dir ,"src"), ""), line_count, token_count / line_count if line_count != 0 else 0])
  print(tabulate([headers, *sorted(table, key=lambda x: -x[1])], headers="firstrow", floatfmt=".1f") + "\n")
  for dir_name, group in itertools.groupby(sorted([(x[0].rsplit("/", 1)[0], x[1]) for x in table]), key=lambda x: x[0]):
    print(f"{dir_name:35s} : {sum([x[1] for x in group]):6d}")
  print(f"\ntotal line count: {sum([x[1] for x in table])}")
+def main() -> int:
+  run_cz("openllm-python", "openllm")
+  run_cz("openllm-core", "openllm_core")
+  run_cz("openllm-client", "openllm_client")
  return 0

 if __name__ == "__main__": raise SystemExit(main())
--- a/hatch.toml
+++ b/hatch.toml
@@ -1,5 +1,7 @@
 [envs.default]
 dependencies = [
+    "openllm-core @ {root:uri}/openllm-core",
+    "openllm-client @ {root:uri}/openllm-client",
    "openllm[opt,chatglm,fine-tune] @ {root:uri}/openllm-python",
    # NOTE: To run all hooks
    "pre-commit",
@@ -26,7 +28,6 @@ check-stubs = [
    "./tools/update-models-import.py",
    "update-dummy",
 ]
-compile = "bash ./compile.sh {args}"
 inplace-changelog = "towncrier build --version main --keep"
 quality = [
    "./tools/dependencies.py",
@@ -36,13 +37,12 @@ quality = [
    "check-stubs",
    "- pre-commit run --all-files",
 ]
-recompile = ["bash ./clean.sh", "compile"]
 setup = [
    "pre-commit install",
    "- ln -s .python-version-default .python-version",
    "curl -fsSL https://raw.githubusercontent.com/clj-kondo/clj-kondo/master/script/install-clj-kondo | bash -",
 ]
-tool = ["quality", "recompile -nx"]
+tool = ["quality", "bash ./clean.sh", "bash ./compile.sh {args}"]
 typing = [
    "- pre-commit run mypy {args:-a}",
    "- pre-commit run pyright {args:-a}",
@@ -50,6 +50,8 @@ typing = [
 update-dummy = ["- ./tools/update-dummy.py", "./tools/update-dummy.py"]
 [envs.tests]
 dependencies = [
+    "openllm-core @ {root:uri}/openllm-core",
+    "openllm-client @ {root:uri}/openllm-client",
    "openllm[opt,chatglm,fine-tune] @ {root:uri}/openllm-python",
    # NOTE: interact with docker for container tests.
    "docker",
@@ -94,7 +96,10 @@ clojure = ["bash tools/run-clojure-ui.sh"]
 [envs.ci]
 detached = true
 [envs.ci.scripts]
+compile = "bash ./compile.sh {args}"
+recompile = ["bash ./clean.sh", "compile"]
+edi = "bash local.sh"
 lock = [
-    "bash tools/lock-actions",
+    "bash tools/lock-actions.sh",
    "pushd contrib/clojure && pnpm i --frozen-lockfile",
 ]
--- a/local.sh
+++ b/local.sh
@@ -0,0 +1,10 @@
+#!/usr/bin/env bash
+
+set -ex
+
+GIT_ROOT=$(git rev-parse --show-toplevel)
+cd "$GIT_ROOT" || exit 1
+
+pip install -e "$GIT_ROOT/openllm-core" -v
+pip install -e "$GIT_ROOT/openllm-client" -v
+pip install -e "$GIT_ROOT/openllm-python" -v
--- a/openllm-client/.git_archival.txt
+++ b/openllm-client/.git_archival.txt
@@ -0,0 +1 @@
+../.git_archival.txt
--- a/openllm-client/.python-version-default
+++ b/openllm-client/.python-version-default
@@ -0,0 +1 @@
+../.python-version-default
--- a/openllm-client/LICENSE.md
+++ b/openllm-client/LICENSE.md
@@ -0,0 +1,194 @@
+Apache License
+==============
+
+_Version 2.0, January 2004_
+_&lt;<http://www.apache.org/licenses/>&gt;_
+
+### Terms and Conditions for use, reproduction, and distribution
+
+#### 1. Definitions
+
+“License” shall mean the terms and conditions for use, reproduction, and
+distribution as defined by Sections 1 through 9 of this document.
+
+“Licensor” shall mean the copyright owner or entity authorized by the copyright
+owner that is granting the License.
+
+“Legal Entity” shall mean the union of the acting entity and all other entities
+that control, are controlled by, or are under common control with that entity.
+For the purposes of this definition, “control” means **(i)** the power, direct or
+indirect, to cause the direction or management of such entity, whether by
+contract or otherwise, or **(ii)** ownership of fifty percent (50%) or more of the
+outstanding shares, or **(iii)** beneficial ownership of such entity.
+
+“You” (or “Your”) shall mean an individual or Legal Entity exercising
+permissions granted by this License.
+
+“Source” form shall mean the preferred form for making modifications, including
+but not limited to software source code, documentation source, and configuration
+files.
+
+“Object” form shall mean any form resulting from mechanical transformation or
+translation of a Source form, including but not limited to compiled object code,
+generated documentation, and conversions to other media types.
+
+“Work” shall mean the work of authorship, whether in Source or Object form, made
+available under the License, as indicated by a copyright notice that is included
+in or attached to the work (an example is provided in the Appendix below).
+
+“Derivative Works” shall mean any work, whether in Source or Object form, that
+is based on (or derived from) the Work and for which the editorial revisions,
+annotations, elaborations, or other modifications represent, as a whole, an
+original work of authorship. For the purposes of this License, Derivative Works
+shall not include works that remain separable from, or merely link (or bind by
+name) to the interfaces of, the Work and Derivative Works thereof.
+
+“Contribution” shall mean any work of authorship, including the original version
+of the Work and any modifications or additions to that Work or Derivative Works
+thereof, that is intentionally submitted to Licensor for inclusion in the Work
+by the copyright owner or by an individual or Legal Entity authorized to submit
+on behalf of the copyright owner. For the purposes of this definition,
+“submitted” means any form of electronic, verbal, or written communication sent
+to the Licensor or its representatives, including but not limited to
+communication on electronic mailing lists, source code control systems, and
+issue tracking systems that are managed by, or on behalf of, the Licensor for
+the purpose of discussing and improving the Work, but excluding communication
+that is conspicuously marked or otherwise designated in writing by the copyright
+owner as “Not a Contribution.”
+
+“Contributor” shall mean Licensor and any individual or Legal Entity on behalf
+of whom a Contribution has been received by Licensor and subsequently
+incorporated within the Work.
+
+#### 2. Grant of Copyright License
+
+Subject to the terms and conditions of this License, each Contributor hereby
+grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free,
+irrevocable copyright license to reproduce, prepare Derivative Works of,
+publicly display, publicly perform, sublicense, and distribute the Work and such
+Derivative Works in Source or Object form.
+
+#### 3. Grant of Patent License
+
+Subject to the terms and conditions of this License, each Contributor hereby
+grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free,
+irrevocable (except as stated in this section) patent license to make, have
+made, use, offer to sell, sell, import, and otherwise transfer the Work, where
+such license applies only to those patent claims licensable by such Contributor
+that are necessarily infringed by their Contribution(s) alone or by combination
+of their Contribution(s) with the Work to which such Contribution(s) was
+submitted. If You institute patent litigation against any entity (including a
+cross-claim or counterclaim in a lawsuit) alleging that the Work or a
+Contribution incorporated within the Work constitutes direct or contributory
+patent infringement, then any patent licenses granted to You under this License
+for that Work shall terminate as of the date such litigation is filed.
+
+#### 4. Redistribution
+
+You may reproduce and distribute copies of the Work or Derivative Works thereof
+in any medium, with or without modifications, and in Source or Object form,
+provided that You meet the following conditions:
+
+* **(a)** You must give any other recipients of the Work or Derivative Works a copy of
+this License; and
+* **(b)** You must cause any modified files to carry prominent notices stating that You
+changed the files; and
+* **(c)** You must retain, in the Source form of any Derivative Works that You distribute,
+all copyright, patent, trademark, and attribution notices from the Source form
+of the Work, excluding those notices that do not pertain to any part of the
+Derivative Works; and
+* **(d)** If the Work includes a “NOTICE” text file as part of its distribution, then any
+Derivative Works that You distribute must include a readable copy of the
+attribution notices contained within such NOTICE file, excluding those notices
+that do not pertain to any part of the Derivative Works, in at least one of the
+following places: within a NOTICE text file distributed as part of the
+Derivative Works; within the Source form or documentation, if provided along
+with the Derivative Works; or, within a display generated by the Derivative
+Works, if and wherever such third-party notices normally appear. The contents of
+the NOTICE file are for informational purposes only and do not modify the
+License. You may add Your own attribution notices within Derivative Works that
+You distribute, alongside or as an addendum to the NOTICE text from the Work,
+provided that such additional attribution notices cannot be construed as
+modifying the License.
+
+You may add Your own copyright statement to Your modifications and may provide
+additional or different license terms and conditions for use, reproduction, or
+distribution of Your modifications, or for any such Derivative Works as a whole,
+provided Your use, reproduction, and distribution of the Work otherwise complies
+with the conditions stated in this License.
+
+#### 5. Submission of Contributions
+
+Unless You explicitly state otherwise, any Contribution intentionally submitted
+for inclusion in the Work by You to the Licensor shall be under the terms and
+conditions of this License, without any additional terms or conditions.
+Notwithstanding the above, nothing herein shall supersede or modify the terms of
+any separate license agreement you may have executed with Licensor regarding
+such Contributions.
+
+#### 6. Trademarks
+
+This License does not grant permission to use the trade names, trademarks,
+service marks, or product names of the Licensor, except as required for
+reasonable and customary use in describing the origin of the Work and
+reproducing the content of the NOTICE file.
+
+#### 7. Disclaimer of Warranty
+
+Unless required by applicable law or agreed to in writing, Licensor provides the
+Work (and each Contributor provides its Contributions) on an “AS IS” BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied,
+including, without limitation, any warranties or conditions of TITLE,
+NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are
+solely responsible for determining the appropriateness of using or
+redistributing the Work and assume any risks associated with Your exercise of
+permissions under this License.
+
+#### 8. Limitation of Liability
+
+In no event and under no legal theory, whether in tort (including negligence),
+contract, or otherwise, unless required by applicable law (such as deliberate
+and grossly negligent acts) or agreed to in writing, shall any Contributor be
+liable to You for damages, including any direct, indirect, special, incidental,
+or consequential damages of any character arising as a result of this License or
+out of the use or inability to use the Work (including but not limited to
+damages for loss of goodwill, work stoppage, computer failure or malfunction, or
+any and all other commercial damages or losses), even if such Contributor has
+been advised of the possibility of such damages.
+
+#### 9. Accepting Warranty or Additional Liability
+
+While redistributing the Work or Derivative Works thereof, You may choose to
+offer, and charge a fee for, acceptance of support, warranty, indemnity, or
+other liability obligations and/or rights consistent with this License. However,
+in accepting such obligations, You may act only on Your own behalf and on Your
+sole responsibility, not on behalf of any other Contributor, and only if You
+agree to indemnify, defend, and hold each Contributor harmless for any liability
+incurred by, or claims asserted against, such Contributor by reason of your
+accepting any such warranty or additional liability.
+
+_END OF TERMS AND CONDITIONS_
+
+### APPENDIX: How to apply the Apache License to your work
+
+To apply the Apache License to your work, attach the following boilerplate
+notice, with the fields enclosed by brackets `[]` replaced with your own
+identifying information. (Don't include the brackets!) The text should be
+enclosed in the appropriate comment syntax for the file format. We also
+recommend that a file or class name and description of purpose be included on
+the same “printed page” as the copyright notice for easier identification within
+third-party archives.
+
+    Copyright 2023 Atalaya Tech Inc.
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
--- a/openllm-client/README.md
+++ b/openllm-client/README.md
@@ -0,0 +1,86 @@
+![Banner for OpenLLM](/.github/assets/main-banner.png)
+
+<!-- hatch-fancy-pypi-readme intro start -->
+
+<div align="center">
+    <h1 align="center">👾 OpenLLM Client</h1>
+    <a href="https://pypi.org/project/openllm-client">
+        <img src="https://img.shields.io/pypi/v/openllm-client.svg?logo=pypi&label=PyPI&logoColor=gold" alt="pypi_status" />
+    </a><a href="https://test.pypi.org/project/openllm-client/">
+        <img src="https://img.shields.io/badge/Nightly-PyPI?logo=pypi&label=PyPI&color=gray&link=https%3A%2F%2Ftest.pypi.org%2Fproject%2Fopenllm%2F" alt="test_pypi_status" />
+    </a><a href="https://twitter.com/bentomlai">
+        <img src="https://badgen.net/badge/icon/@bentomlai/1DA1F2?icon=twitter&label=Follow%20Us" alt="Twitter" />
+    </a><a href="https://l.bentoml.com/join-openllm-discord">
+        <img src="https://badgen.net/badge/icon/OpenLLM/7289da?icon=discord&label=Join%20Us" alt="Discord" />
+    </a><a href="https://github.com/bentoml/OpenLLM/actions/workflows/ci.yml">
+        <img src="https://github.com/bentoml/OpenLLM/actions/workflows/ci.yml/badge.svg?branch=main" alt="ci" />
+    </a><a href="https://results.pre-commit.ci/latest/github/bentoml/OpenLLM/main">
+        <img src="https://results.pre-commit.ci/badge/github/bentoml/OpenLLM/main.svg" alt="pre-commit.ci status" />
+    </a><br>
+    <a href="https://pypi.org/project/openllm-client">
+        <img src="https://img.shields.io/pypi/pyversions/openllm-client.svg?logo=python&label=Python&logoColor=gold" alt="python_version" />
+    </a><a href="https://github.com/pypa/hatch">
+        <img src="https://img.shields.io/badge/%F0%9F%A5%9A-Hatch-4051b5.svg" alt="Hatch" />
+    </a><a href="https://github.com/bentoml/OpenLLM/blob/main/STYLE.md">
+        <img src="https://img.shields.io/badge/code%20style-experimental-000000.svg" alt="code style" />
+    </a><a href="https://github.com/astral-sh/ruff">
+        <img src="https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/charliermarsh/ruff/main/assets/badge/v2.json" alt="Ruff" />
+    </a><a href="https://github.com/python/mypy">
+        <img src="https://img.shields.io/badge/types-mypy-blue.svg" alt="types - mypy" />
+    </a><a href="https://github.com/microsoft/pyright">
+        <img src="https://img.shields.io/badge/types-pyright-yellow.svg" alt="types - pyright" />
+    </a><br>
+    <p>OpenLLM Client: Interacting with OpenLLM HTTP/gRPC server, or any BentoML server.<br/></p>
+    <i></i>
+</div>
+
+## 📖 Introduction
+
+With OpenLLM, you can run inference with any open-source large-language models,
+deploy to the cloud or on-premises, and build powerful AI apps, and more.
+
+To learn more about OpenLLM, please visit <a href="https://github.com/bentoml/OpenLLM">OpenLLM's README.md</a>
+
+This package holds the underlying client implementation for OpenLLM. If you are
+coming from OpenLLM, the client can be accessed via `openllm.client`.
+
+It provides APIs similar to [`bentoml.Client`](https://docs.bentoml.com/en/latest/guides/client.html)
+(via `openllm_client.benmin`) for interacting with an OpenLLM server. It can also be extended for use with a general
+BentoML server.
+
+> [!NOTE]
+> The interop component for generic BentoML servers is considered experimental and may be merged back into BentoML.
+> If you are only using this package to interact with an OpenLLM server, nothing changes in the `openllm.client` namespace.
+
+```python
+import openllm
+
+client = openllm.client.HTTPClient()
+
+client.query('Explain to me the difference between "further" and "farther"')
+```
+
+<!-- hatch-fancy-pypi-readme intro stop -->
+
+![Gif showing OpenLLM Intro](/.github/assets/output.gif)
+
+<br/>
+
+<!-- hatch-fancy-pypi-readme interim start -->
+
+## 📔 Citation
+
+If you use OpenLLM in your research, we provide a [citation](../CITATION.cff) to use:
+
+```bibtex
+@software{Pham_OpenLLM_Operating_LLMs_2023,
+author = {Pham, Aaron and Yang, Chaoyu and Sheng, Sean and  Zhao, Shenyang and Lee, Sauyon and Jiang, Bo and Dong, Fog and Guan, Xipeng and Ming, Frost},
+license = {Apache-2.0},
+month = jun,
+title = {{OpenLLM: Operating LLMs in production}},
+url = {https://github.com/bentoml/OpenLLM},
+year = {2023}
+}
+```
+
+<!-- hatch-fancy-pypi-readme interim stop -->
--- a/openllm-client/pyproject.toml
+++ b/openllm-client/pyproject.toml
@@ -0,0 +1,172 @@
+[build-system]
+build-backend = "hatchling.build"
+requires = [
+  "hatchling==1.18.0",
+  "hatch-vcs==0.3.0",
+  "hatch-fancy-pypi-readme==23.1.0",
+]
+
+[project]
+authors = [
+  { name = "Aaron Pham", email = "aarnphm@bentoml.com" },
+  { name = "BentoML Team", email = "contact@bentoml.com" },
+]
+dynamic = ['readme', 'version']
+classifiers = [
+  "Development Status :: 5 - Production/Stable",
+  "Environment :: GPU :: NVIDIA CUDA",
+  "Environment :: GPU :: NVIDIA CUDA :: 12",
+  "Environment :: GPU :: NVIDIA CUDA :: 11.8",
+  "Environment :: GPU :: NVIDIA CUDA :: 11.7",
+  "License :: OSI Approved :: Apache Software License",
+  "Topic :: Scientific/Engineering",
+  "Topic :: Scientific/Engineering :: Artificial Intelligence",
+  "Topic :: Software Development :: Libraries",
+  "Operating System :: OS Independent",
+  "Intended Audience :: Developers",
+  "Intended Audience :: Science/Research",
+  "Intended Audience :: System Administrators",
+  "Typing :: Typed",
+  "Programming Language :: Python",
+  "Programming Language :: Python :: 3",
+  "Programming Language :: Python :: 3 :: Only",
+  "Programming Language :: Python :: 3.8",
+  "Programming Language :: Python :: 3.9",
+  "Programming Language :: Python :: 3.10",
+  "Programming Language :: Python :: 3.11",
+  "Programming Language :: Python :: Implementation :: CPython",
+  "Programming Language :: Python :: Implementation :: PyPy",
+]
+description = "OpenLLM Client: Interacting with OpenLLM HTTP/gRPC server, or any BentoML server."
+keywords = [
+  "MLOps",
+  "AI",
+  "BentoML",
+  "Model Serving",
+  "Model Deployment",
+  "LLMOps",
+  "Falcon",
+  "Vicuna",
+  "Llama 2",
+  "Fine tuning",
+  "Serverless",
+  "Large Language Model",
+  "Generative AI",
+  "StableLM",
+  "Alpaca",
+  "PyTorch",
+  "Transformers",
+]
+dependencies = ["openllm-core", "httpx"]
+license = "Apache-2.0"
+name = "openllm-client"
+requires-python = ">=3.8"
+[project.urls]
+Blog = "https://modelserving.com"
+Chat = "https://discord.gg/openllm"
+Documentation = "https://github.com/bentoml/OpenLLM/blob/main/openllm-client/README.md"
+GitHub = "https://github.com/bentoml/OpenLLM/blob/main/openllm-client"
+History = "https://github.com/bentoml/OpenLLM/blob/main/CHANGELOG.md"
+Homepage = "https://bentoml.com"
+Tracker = "https://github.com/bentoml/OpenLLM/issues"
+Twitter = "https://twitter.com/bentomlai"
+[project.optional-dependencies]
+full = ["openllm-client[grpc,agents]"]
+grpc = ["bentoml[grpc]>=1.0.25"]
+agents = ["transformers[agents]>=4.30", "diffusers", "soundfile"]
+
+[tool.hatch.version]
+fallback-version = "0.0.0"
+source = "vcs"
+[tool.hatch.build.hooks.vcs]
+version-file = "src/openllm_client/_version.py"
+[tool.hatch.version.raw-options]
+git_describe_command = [
+  "git",
+  "describe",
+  "--dirty",
+  "--tags",
+  "--long",
+  "--first-parent",
+]
+local_scheme = "no-local-version"
+root = ".."
+[tool.hatch.metadata]
+allow-direct-references = true
+[tool.hatch.build.targets.wheel]
+only-include = ["src/openllm_client"]
+sources = ["src"]
+[tool.hatch.build.targets.sdist]
+exclude = ["/.git_archival.txt", "tests", "/.python-version-default"]
+[tool.hatch.build.targets.wheel.hooks.mypyc]
+dependencies = [
+  "hatch-mypyc==0.16.0",
+  "mypy==1.5.1",
+  # avoid https://github.com/pallets/click/issues/2558
+  "click==8.1.3",
+  "bentoml==1.1.2",
+  "transformers>=4.31.0",
+  "pandas-stubs",
+  "types-psutil",
+  "types-tabulate",
+  "types-PyYAML",
+  "types-protobuf",
+]
+enable-by-default = false
+include = ["src/openllm_client/__init__.py", "src/openllm_client/client.py"]
+# NOTE: This is consistent with pyproject.toml
+mypy-args = [
+  "--strict",
+  # this is because the transitive dependencies don't ship type information
+  "--allow-subclassing-any",
+  "--check-untyped-defs",
+  "--ignore-missing-imports",
+  "--no-warn-return-any",
+  "--warn-unreachable",
+  "--no-warn-no-return",
+  "--no-warn-unused-ignores",
+]
+options = { verbose = true, strip_asserts = true, debug_level = "2", opt_level = "3", include_runtime_files = true }
+require-runtime-dependencies = true
+
+[tool.hatch.metadata.hooks.fancy-pypi-readme]
+content-type = "text/markdown"
+# PyPI doesn't support the <picture> tag.
+[[tool.hatch.metadata.hooks.fancy-pypi-readme.fragments]]
+text = """
+<p align="center">
+  <a href="https://github.com/bentoml/openllm">
+    <img src="https://raw.githubusercontent.com/bentoml/openllm/main/.github/assets/main-banner.png" alt="Banner for OpenLLM" />
+  </a>
+</p>
+
+"""
+[[tool.hatch.metadata.hooks.fancy-pypi-readme.fragments]]
+end-before = "\n<!-- hatch-fancy-pypi-readme intro stop -->"
+path = "README.md"
+start-after = "<!-- hatch-fancy-pypi-readme intro start -->\n"
+[[tool.hatch.metadata.hooks.fancy-pypi-readme.fragments]]
+text = """
+
+<p align="center">
+  <img src="https://raw.githubusercontent.com/bentoml/openllm/main/.github/assets/output.gif" alt="Gif showing OpenLLM Intro" />
+</p>
+"""
+[[tool.hatch.metadata.hooks.fancy-pypi-readme.fragments]]
+text = """
+
+<p align="center">
+  <img src="https://raw.githubusercontent.com/bentoml/openllm/main/.github/assets/agent.gif" alt="Gif showing Agent integration" />
+</p>
+"""
+[[tool.hatch.metadata.hooks.fancy-pypi-readme.fragments]]
+end-before = "\n<!-- hatch-fancy-pypi-readme interim stop -->"
+path = "README.md"
+start-after = "<!-- hatch-fancy-pypi-readme interim start -->\n"
+[[tool.hatch.metadata.hooks.fancy-pypi-readme.fragments]]
+text = """
+
+---
+
+[Click me for full changelog](https://github.com/bentoml/openllm/blob/main/CHANGELOG.md)
+"""
--- a/openllm-client/src/openllm_client/__init__.py
+++ b/openllm-client/src/openllm_client/__init__.py
@@ -0,0 +1,5 @@
+from __future__ import annotations
+
+from . import benmin as benmin
+from ._base import BaseAsyncClient as BaseAsyncClient, BaseClient as BaseClient
+from .client import AsyncHTTPClient as AsyncHTTPClient, HTTPClient as HTTPClient, GrpcClient as GrpcClient, AsyncGrpcClient as AsyncGrpcClient
--- a/openllm-client/src/openllm_client/_adapters.py
+++ b/openllm-client/src/openllm_client/_adapters.py
--- a/openllm-client/src/openllm_client/_base.py
+++ b/openllm-client/src/openllm_client/_base.py
@@ -0,0 +1,203 @@
+# mypy: disable-error-code="override,no-redef"
+from __future__ import annotations
+import typing as t, functools, openllm_core, logging, httpx, orjson, attr, abc
+from http import HTTPStatus
+from urllib.parse import urljoin
+from .benmin import Client as BentoClient, AsyncClient as AsyncBentoClient
+from openllm_core.utils import is_transformers_supports_agent, is_transformers_available, bentoml_cattr, ensure_exec_coro
+from openllm_core._typing_compat import overload, LiteralString
+if t.TYPE_CHECKING:
+  from openllm_core._typing_compat import LiteralRuntime, DictStrAny
+  import transformers
+
+logger = logging.getLogger(__name__)
+
+@attr.define(slots=False, init=False)
+class _ClientAttr:
+  _address: str
+  _timeout: float = attr.field(default=30)
+  _api_version: str = attr.field(default="v1")
+  def __init__(self, address: str, timeout: float = 30, api_version: str = "v1"): self.__attrs_init__(address, timeout, api_version)
+
+  @abc.abstractmethod
+  def call(self, api_name: str, *args: t.Any, **attrs: t.Any) -> t.Any: raise NotImplementedError
+  @abc.abstractmethod
+  def _run_hf_agent(self, *args: t.Any, **kwargs: t.Any) -> t.Any: raise NotImplementedError
+  @overload
+  @abc.abstractmethod
+  def query(self, prompt: str, *, return_response: t.Literal["processed"], **attrs: t.Any) -> str: ...
+  @overload
+  @abc.abstractmethod
+  def query(self, prompt: str, *, return_response: t.Literal["raw"], **attrs: t.Any) -> DictStrAny: ...
+  @overload
+  @abc.abstractmethod
+  def query(self, prompt: str, *, return_response: t.Literal["attrs"], **attrs: t.Any) -> openllm_core.GenerationOutput: ...
+  @abc.abstractmethod
+  def query(self, prompt: str, return_response: t.Literal["attrs", "raw", "processed"] = "processed", **attrs: t.Any) -> t.Any: raise NotImplementedError
+  # NOTE: Scikit interface
+  @overload
+  @abc.abstractmethod
+  def predict(self, prompt: str, *, return_response: t.Literal["processed"], **attrs: t.Any) -> str: ...
+  @overload
+  @abc.abstractmethod
+  def predict(self, prompt: str, *, return_response: t.Literal["raw"], **attrs: t.Any) -> DictStrAny: ...
+  @overload
+  @abc.abstractmethod
+  def predict(self, prompt: str, *, return_response: t.Literal["attrs"], **attrs: t.Any) -> openllm_core.GenerationOutput: ...
+  @abc.abstractmethod
+  def predict(self, prompt: str, **attrs: t.Any) -> t.Any: raise NotImplementedError
+
+  @functools.cached_property
+  def _hf_agent(self) -> transformers.HfAgent:
+    if not is_transformers_available(): raise RuntimeError("transformers is required to use HF agent. Install with 'pip install \"openllm-client[agents]\"'.")
+    if not self.supports_hf_agent: raise RuntimeError(f"{self.model_name} ({self.framework}) does not support running HF agent.")
+    if not is_transformers_supports_agent(): raise RuntimeError("Current 'transformers' does not support Agent. Make sure to upgrade to at least 4.29: 'pip install -U \"transformers>=4.29\"'")
+    import transformers
+    return transformers.HfAgent(urljoin(self._address, "/hf/agent"))
+  @property
+  def _metadata(self) -> t.Any: return self.call("metadata")
+  @property
+  def model_name(self) -> str:
+    try: return self._metadata["model_name"]
+    except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None
+  @property
+  def model_id(self) -> str:
+    try: return self._metadata["model_id"]
+    except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None
+  @property
+  def framework(self) -> LiteralRuntime:
+    try: return self._metadata["framework"]
+    except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None
+  @property
+  def timeout(self) -> int:
+    try: return self._metadata["timeout"]
+    except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None
+  @property
+  def configuration(self) -> dict[str, t.Any]:
+    try: return orjson.loads(self._metadata["configuration"])
+    except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None
+  @property
+  def supports_embeddings(self) -> bool:
+    try: return self._metadata.get("supports_embeddings", False)
+    except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None
+  @property
+  def supports_hf_agent(self) -> bool:
+    try: return self._metadata.get("supports_hf_agent", False)
+    except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None
+  @property
+  def config(self) -> openllm_core.LLMConfig: return openllm_core.AutoConfig.for_model(self.model_name).model_construct_env(**self.configuration)
+  @functools.cached_property
+  def inner(self) -> t.Any: raise NotImplementedError("'inner' client is not implemented.")
+
+class _Client(_ClientAttr):
+  _host: str
+  _port: str
+  def call(self, api_name: str, *args: t.Any, **attrs: t.Any) -> t.Any: return self.inner.call(f"{api_name}_{self._api_version}", *args, **attrs)
+  def health(self) -> t.Any: return self.inner.health()
+  @functools.cached_property
+  def inner(self) -> BentoClient:
+    BentoClient.wait_until_server_ready(self._host, int(self._port), timeout=self._timeout)
+    return BentoClient.from_url(self._address)
+
+  # Agent integration
+  def ask_agent(self, task: str, *, return_code: bool = False, remote: bool = False, agent_type: LiteralString = "hf", **attrs: t.Any) -> t.Any:
+    if agent_type == "hf": return self._run_hf_agent(task, return_code=return_code, remote=remote, **attrs)
+    else: raise RuntimeError(f"Unknown 'agent_type={agent_type}'")
+  def _run_hf_agent(self, *args: t.Any, **kwargs: t.Any) -> t.Any:  # runs 'task' through the HF agent; errors are logged and None is returned
+    if len(args) > 1: raise ValueError("'args' should only take one positional argument.")
+    task = kwargs.pop("task") if "task" in kwargs else args[0]  # only read args[0] when 'task' is not passed by keyword (an eager default raised IndexError on empty args)
+    return_code = kwargs.pop("return_code", False)
+    remote = kwargs.pop("remote", False)
+    try: return self._hf_agent.run(task, return_code=return_code, remote=remote, **kwargs)
+    except Exception as err:
+      logger.error("Exception caught while sending instruction to HF agent: %s", err, exc_info=err)
+      logger.info("Tip: LLMServer at '%s' might not support 'generate_one'.", self._address)
+
+class _AsyncClient(_ClientAttr):
+  _host: str
+  _port: str
+  def __init__(self, address: str, timeout: float = 30, api_version: str = "v1"): self._address,self._timeout,self._api_version = address,timeout,api_version  # 'call' reads self._api_version, so it has to be assigned here too
+  async def call(self, api_name: str, *args: t.Any, **attrs: t.Any) -> t.Any: return await self.inner.call(f"{api_name}_{self._api_version}", *args, **attrs)
+  async def health(self) -> t.Any: return await self.inner.health()
+  @functools.cached_property
+  def inner(self) -> AsyncBentoClient:
+    ensure_exec_coro(AsyncBentoClient.wait_until_server_ready(self._host, int(self._port), timeout=self._timeout))
+    return ensure_exec_coro(AsyncBentoClient.from_url(self._address))
+
+  # Agent integration
+  async def ask_agent(self, task: str, *, return_code: bool = False, remote: bool = False, agent_type: LiteralString = "hf", **attrs: t.Any) -> t.Any:
+    """Async version of agent.run."""
+    if agent_type == "hf": return await self._run_hf_agent(task, return_code=return_code, remote=remote, **attrs)
+    else: raise RuntimeError(f"Unknown 'agent_type={agent_type}'")
+  async def _run_hf_agent(self, *args: t.Any, **kwargs: t.Any) -> t.Any:  # async counterpart of agent.run: posts the formatted prompt, then evaluates (or returns) the generated code
+    if not is_transformers_supports_agent(): raise RuntimeError("This version of transformers does not support agent.run. Make sure to upgrade to transformers>4.30.0")
+    if len(args) > 1: raise ValueError("'args' should only take one positional argument.")
+    from transformers.tools.agents import clean_code_for_run, get_tool_creation_code, resolve_tools
+    from transformers.tools.python_interpreter import evaluate
+
+    task = kwargs.pop("task") if "task" in kwargs else args[0]  # only read args[0] when 'task' is not passed by keyword (an eager default raised IndexError on empty args)
+    return_code = kwargs.pop("return_code", False)
+    remote = kwargs.pop("remote", False)
+    stop = ["Task:"]
+    prompt = t.cast(str, self._hf_agent.format_prompt(task))
+    async with httpx.AsyncClient(timeout=httpx.Timeout(self.timeout)) as client:
+      response = await client.post(self._hf_agent.url_endpoint, json={"inputs": prompt, "parameters": {"max_new_tokens": 200, "return_full_text": False, "stop": stop}})
+      if response.status_code != HTTPStatus.OK: raise ValueError(f"Error {response.status_code}: {response.json()}")
+
+    result = response.json()[0]["generated_text"]
+    # Inference API returns the stop sequence
+    for stop_seq in stop:
+      if result.endswith(stop_seq):
+        result = result[:-len(stop_seq)]
+        break
+    # the below have the same logic as agent.run API
+    explanation, code = clean_code_for_run(result)
+    self._hf_agent.log(f"==Explanation from the agent==\n{explanation}")
+    self._hf_agent.log(f"\n\n==Code generated by the agent==\n{code}")
+    if not return_code:
+      self._hf_agent.log("\n\n==Result==")
+      self._hf_agent.cached_tools = resolve_tools(code, self._hf_agent.toolbox, remote=remote, cached_tools=self._hf_agent.cached_tools)
+      return evaluate(code, self._hf_agent.cached_tools, state=kwargs.copy())
+    else:
+      tool_code = get_tool_creation_code(code, self._hf_agent.toolbox, remote=remote)
+      return f"{tool_code}\n{code}"
+
+class BaseClient(_Client):
+  def chat(self, prompt: str, history: list[str], **attrs: t.Any) -> str: raise NotImplementedError
+  def embed(self, prompt: t.Sequence[str] | str) -> openllm_core.EmbeddingsOutput: return openllm_core.EmbeddingsOutput(**self.call("embeddings", list([prompt] if isinstance(prompt, str) else prompt)))
+  def predict(self, prompt: str, **attrs: t.Any) -> openllm_core.GenerationOutput | DictStrAny | str: return self.query(prompt, **attrs)
+  def query(self, prompt: str, return_response: t.Literal["attrs", "raw", "processed"] = "processed", **attrs: t.Any) -> t.Any:
+    return_raw_response = attrs.pop("return_raw_response", None)
+    if return_raw_response is not None:
+      logger.warning("'return_raw_response' is now deprecated. Please use 'return_response=\"raw\"' instead.")
+      if return_raw_response is True: return_response = "raw"
+    return_attrs = attrs.pop("return_attrs", None)
+    if return_attrs is not None:
+      logger.warning("'return_attrs' is now deprecated. Please use 'return_response=\"attrs\"' instead.")
+      if return_attrs is True: return_response = "attrs"
+    use_default_prompt_template = attrs.pop("use_default_prompt_template", False)
+    prompt, generate_kwargs, postprocess_kwargs = self.config.sanitize_parameters(prompt, use_default_prompt_template=use_default_prompt_template, **attrs)
+    r = openllm_core.GenerationOutput(**self.call("generate", openllm_core.GenerationInput(prompt=prompt, llm_config=self.config.model_construct_env(**generate_kwargs)).model_dump()))
+    if return_response == "attrs": return r
+    elif return_response == "raw": return bentoml_cattr.unstructure(r)
+    else: return self.config.postprocess_generate(prompt, r.responses, **postprocess_kwargs)
+
+class BaseAsyncClient(_AsyncClient):
+  async def chat(self, prompt: str, history: list[str], **attrs: t.Any) -> str: raise NotImplementedError
+  async def embed(self, prompt: t.Sequence[str] | str) -> openllm_core.EmbeddingsOutput: return openllm_core.EmbeddingsOutput(**(await self.call("embeddings", list([prompt] if isinstance(prompt, str) else prompt))))
+  async def predict(self, prompt: str, **attrs: t.Any) -> t.Any: return await self.query(prompt, **attrs)
+  async def query(self, prompt: str, return_response: t.Literal["attrs", "raw", "processed"] = "processed", **attrs: t.Any) -> t.Any:
+    return_raw_response = attrs.pop("return_raw_response", None)
+    if return_raw_response is not None:
+      logger.warning("'return_raw_response' is now deprecated. Please use 'return_response=\"raw\"' instead.")
+      if return_raw_response is True: return_response = "raw"
+    return_attrs = attrs.pop("return_attrs", None)
+    if return_attrs is not None:
+      logger.warning("'return_attrs' is now deprecated. Please use 'return_response=\"attrs\"' instead.")
+      if return_attrs is True: return_response = "attrs"
+    use_default_prompt_template = attrs.pop("use_default_prompt_template", False)
+    prompt, generate_kwargs, postprocess_kwargs = self.config.sanitize_parameters(prompt, use_default_prompt_template=use_default_prompt_template, **attrs)
+    r = openllm_core.GenerationOutput(**(await self.call("generate", openllm_core.GenerationInput(prompt=prompt, llm_config=self.config.model_construct_env(**generate_kwargs)).model_dump())))
+    if return_response == "attrs": return r
+    elif return_response == "raw": return bentoml_cattr.unstructure(r)
+    else: return self.config.postprocess_generate(prompt, r.responses, **postprocess_kwargs)
--- a/openllm-client/src/openllm_client/benmin/__init__.py
+++ b/openllm-client/src/openllm_client/benmin/__init__.py
@@ -0,0 +1,87 @@
+"""This holds a simple client implementation, somewhat similar to `bentoml.client`.
+
+This module is subject to change and is intended to be merged upstream into BentoML.
+
+```python
+import openllm_client
+
+client = openllm_client.benmin.Client.from_url("http://localhost:3000")
+```
+
+The client implementation won't include a dynamic assignment of the service endpoints, rather this should be called
+via `client.call` or `await client.call`.
+"""
+from __future__ import annotations
+import typing as t, bentoml, attr, httpx
+from abc import abstractmethod
+if t.TYPE_CHECKING: from bentoml._internal.service.inference_api import InferenceAPI
+
+__all__ = ["Client", "AsyncClient"]
+
+@attr.define(init=False)
+class Client:
+  server_url: str
+  endpoints: t.List[str]
+  svc: bentoml.Service
+  timeout: int = attr.field(default=30)
+  def __init__(self, server_url: str, svc: bentoml.Service, **kwargs: t.Any) -> None:
+    if len(svc.apis) == 0: raise bentoml.exceptions.BentoMLException("No APIs was found while constructing clients.")
+    self.__attrs_init__(server_url=server_url, endpoints=list(svc.apis), svc=svc)
+    for it, val in kwargs.items(): object.__setattr__(self, it, val)
+  def call(self, bentoml_api_name: str, data: t.Any = None, **kwargs: t.Any) -> t.Any: return self._call(data, _inference_api=self.svc.apis[bentoml_api_name], **kwargs)
+  @abstractmethod
+  def _call(self, data: t.Any, /, *, _inference_api: InferenceAPI[t.Any], **kwargs: t.Any) -> t.Any: raise NotImplementedError
+  @abstractmethod
+  def health(self) -> t.Any: raise NotImplementedError
+  @classmethod
+  def from_url(cls, url: str, **kwargs: t.Any) -> Client:
+    try:
+      from ._http import HttpClient
+      return HttpClient.from_url(url, **kwargs)
+    except httpx.RemoteProtocolError:
+      from ._grpc import GrpcClient
+      return GrpcClient.from_url(url, **kwargs)
+    except Exception as err: raise bentoml.exceptions.BentoMLException("Failed to create client from url: %s" % url) from err
+  @staticmethod
+  def wait_until_server_ready(host: str, port: int, timeout: float = 30, **kwargs: t.Any) -> None:
+    try:
+      from ._http import HttpClient
+      return HttpClient.wait_until_server_ready(host, port, timeout, **kwargs)
+    except httpx.RemoteProtocolError:
+      from ._grpc import GrpcClient
+      return GrpcClient.wait_until_server_ready(host, port, timeout, **kwargs)
+    except Exception as err: raise bentoml.exceptions.BentoMLException("Failed to wait until server ready: %s:%d" % (host, port)) from err
+
+@attr.define(init=False)
+class AsyncClient:
+  server_url: str
+  endpoints: t.List[str]
+  svc: bentoml.Service
+  timeout: int = attr.field(default=30)
+  def __init__(self, server_url: str, svc: bentoml.Service, **kwargs: t.Any) -> None:
+    if len(svc.apis) == 0: raise bentoml.exceptions.BentoMLException("No APIs was found while constructing clients.")
+    self.__attrs_init__(server_url=server_url, endpoints=list(svc.apis), svc=svc)
+    for it, val in kwargs.items(): object.__setattr__(self, it, val)
+  async def call(self, bentoml_api_name: str, data: t.Any = None, **kwargs: t.Any) -> t.Any: return await self._call(data, _inference_api=self.svc.apis[bentoml_api_name], **kwargs)
+  @abstractmethod
+  async def _call(self, data: t.Any, /, *, _inference_api: InferenceAPI[t.Any], **kwargs: t.Any) -> t.Any: raise NotImplementedError
+  @abstractmethod
+  async def health(self) -> t.Any: raise NotImplementedError
+  @classmethod
+  async def from_url(cls, url: str, **kwargs: t.Any) -> AsyncClient:
+    try:
+      from ._http import AsyncHttpClient
+      return await AsyncHttpClient.from_url(url, **kwargs)
+    except httpx.RemoteProtocolError:
+      from ._grpc import AsyncGrpcClient
+      return await AsyncGrpcClient.from_url(url, **kwargs)
+    except Exception as err: raise bentoml.exceptions.BentoMLException("Failed to create client from url: %s" % url) from err
+  @staticmethod
+  async def wait_until_server_ready(host: str, port: int, timeout: float = 30, **kwargs: t.Any) -> None:
+    try:
+      from ._http import AsyncHttpClient
+      await AsyncHttpClient.wait_until_server_ready(host, port, timeout, **kwargs)
+    except httpx.RemoteProtocolError:
+      from ._grpc import AsyncGrpcClient
+      await AsyncGrpcClient.wait_until_server_ready(host, port, timeout, **kwargs)
+    except Exception as err: raise bentoml.exceptions.BentoMLException("Failed to wait until server ready: %s:%d" % (host, port)) from err
--- a/openllm-client/src/openllm_client/benmin/_grpc.py
+++ b/openllm-client/src/openllm_client/benmin/_grpc.py
@@ -0,0 +1,187 @@
+# mypy: disable-error-code="no-redef"
+from __future__ import annotations
+import typing as t, logging, time, functools, bentoml, asyncio
+from bentoml._internal.service.inference_api import InferenceAPI
+from openllm_client.benmin import Client, AsyncClient
+from openllm_core.utils import is_grpc_available, is_grpc_health_available, ensure_exec_coro
+from openllm_core._typing_compat import NotRequired, overload
+from bentoml.grpc.utils import load_from_file, import_generated_stubs
+if not is_grpc_available() or not is_grpc_health_available(): raise ImportError("gRPC is required to use gRPC client. Install with 'pip install \"openllm-client[grpc]\"'.")
+from grpc import aio
+from google.protobuf import json_format
+import grpc, grpc_health.v1.health_pb2 as pb_health, grpc_health.v1.health_pb2_grpc as services_health
+
+pb, services = import_generated_stubs("v1")
+
+if t.TYPE_CHECKING:
+  from bentoml.grpc.v1.service_pb2 import ServiceMetadataResponse
+
+logger = logging.getLogger(__name__)
+
+class ClientCredentials(t.TypedDict):
+  root_certificates: NotRequired[t.Union[bytes, str]]
+  private_key: NotRequired[t.Union[bytes, str]]
+  certificate_chain: NotRequired[t.Union[bytes, str]]
+
+@overload
+def dispatch_channel(server_url: str, typ: t.Literal["async"], ssl: bool = ..., ssl_client_credentials: ClientCredentials | None = ..., options: t.Any | None = ..., compression: grpc.Compression | None = ..., interceptors: t.Sequence[aio.ClientInterceptor] | None = ...) -> aio.Channel: ...
+@overload
+def dispatch_channel(server_url: str, typ: t.Literal["sync"], ssl: bool = ..., ssl_client_credentials: ClientCredentials | None = ..., options: t.Any | None = ..., compression: grpc.Compression | None = ..., interceptors: t.Sequence[aio.ClientInterceptor] | None = None) -> grpc.Channel: ...
+def dispatch_channel(server_url: str, typ: t.Literal["async", "sync"] = "sync", ssl: bool = False, ssl_client_credentials: ClientCredentials | None = None, options: t.Any | None = None, compression: grpc.Compression | None = None, interceptors: t.Sequence[aio.ClientInterceptor] | None = None) -> aio.Channel | grpc.Channel:
+  credentials = None
+  if ssl:
+    if ssl_client_credentials is None: raise RuntimeError("'ssl=True' requires 'ssl_client_credentials'")
+    credentials = grpc.ssl_channel_credentials(**{k: load_from_file(v) if isinstance(v, str) else v for k, v in ssl_client_credentials.items()})
+
+  if typ == "async" and ssl: return aio.secure_channel(server_url, credentials=credentials, options=options, compression=compression, interceptors=interceptors)
+  elif typ == "async": return aio.insecure_channel(server_url, options=options, compression=compression, interceptors=interceptors)
+  elif typ == "sync" and ssl: return grpc.secure_channel(server_url, credentials=credentials, options=options, compression=compression)
+  elif typ == "sync": return grpc.insecure_channel(server_url, options=options, compression=compression)
+  else: raise ValueError(f"Unknown type: {typ}")
+
+class GrpcClient(Client):
+  ssl: bool
+  ssl_client_credentials: t.Optional[ClientCredentials]
+  options: t.Any
+  compression: t.Optional[grpc.Compression]
+
+  def __init__(
+    self,
+    server_url: str,
+    svc: bentoml.Service,
+    # gRPC specific options
+    ssl: bool = False,
+    options: t.Any | None = None,
+    compression: grpc.Compression | None = None,
+    ssl_client_credentials: ClientCredentials | None = None,
+    **kwargs: t.Any) -> None:
+    self.ssl, self.ssl_client_credentials, self.options, self.compression = ssl, ssl_client_credentials, options, compression
+    super().__init__(server_url, svc, **kwargs)
+  @functools.cached_property
+  def inner(self) -> grpc.Channel:
+    if self.ssl:
+      if self.ssl_client_credentials is None: raise RuntimeError("'ssl=True' requires 'ssl_client_credentials'")
+      credentials = grpc.ssl_channel_credentials(**{k: load_from_file(v) if isinstance(v, str) else v for k, v in self.ssl_client_credentials.items()})
+      return grpc.secure_channel(self.server_url, credentials=credentials, options=self.options, compression=self.compression)
+    return grpc.insecure_channel(self.server_url, options=self.options, compression=self.compression)
+
+  @staticmethod
+  def wait_until_server_ready(host: str, port: int, timeout: float = 30, check_interval: int = 1, **kwargs: t.Any) -> None:
+    with dispatch_channel(f"{host.replace(r'localhost', '0.0.0.0')}:{port}", typ="sync", options=kwargs.get("options", None), compression=kwargs.get("compression", None), ssl=kwargs.get("ssl", False), ssl_client_credentials=kwargs.get("ssl_client_credentials", None)) as channel:
+      req = pb_health.HealthCheckRequest()
+      req.service = "bentoml.grpc.v1.BentoService"
+      health_stub = services_health.HealthStub(channel)
+      start_time = time.time()
+      while time.time() - start_time < timeout:
+        try:
+          resp = health_stub.Check(req)
+          if resp.status == pb_health.HealthCheckResponse.SERVING: break
+          else: time.sleep(check_interval)
+        except grpc.RpcError:
+          logger.debug("Waiting for server to be ready...")
+          time.sleep(check_interval)
+      try:
+        resp = health_stub.Check(req)
+        if resp.status != pb_health.HealthCheckResponse.SERVING: raise TimeoutError(f"Timed out waiting {timeout} seconds for server at '{host}:{port}' to be ready.")
+      except grpc.RpcError as err:
+        logger.error("Caught RpcError while connecting to %s:%s:\n", host, port)
+        logger.error(err)
+        raise
+  @classmethod
+  def from_url(cls, url: str, **kwargs: t.Any) -> GrpcClient:  # builds a client by reflecting the server's ServiceMetadata into a local bentoml.Service
+    with dispatch_channel(url.replace(r"localhost", "0.0.0.0"), typ="sync", options=kwargs.get("options", None), compression=kwargs.get("compression", None), ssl=kwargs.get("ssl", False), ssl_client_credentials=kwargs.get("ssl_client_credentials", None)) as channel: metadata = t.cast("ServiceMetadataResponse", channel.unary_unary("/bentoml.grpc.v1.BentoService/ServiceMetadata", request_serializer=pb.ServiceMetadataRequest.SerializeToString, response_deserializer=pb.ServiceMetadataResponse.FromString)(pb.ServiceMetadataRequest()))
+    reflection = bentoml.Service(metadata.name)
+    for api in metadata.apis:
+      try: reflection.apis[api.name] = InferenceAPI[t.Any](None,
+            bentoml.io.from_spec({"id": api.input.descriptor_id, "args": json_format.MessageToDict(api.input.attributes).get("args", None)}),
+            bentoml.io.from_spec({"id": api.output.descriptor_id, "args": json_format.MessageToDict(api.output.attributes).get("args", None)}),
+            name=api.name, doc=api.docs)
+      except Exception as e: logger.error("Failed to instantiate client for API %s: %s", api.name, e)  # one placeholder per argument; a failing API is logged and skipped
+    return cls(url, reflection, **kwargs)
+  def health(self) -> t.Any: return services_health.HealthStub(self.inner).Check(pb_health.HealthCheckRequest(service=""))
+  def _call(self, data: t.Any, /, *, _inference_api: InferenceAPI[t.Any], **kwargs: t.Any) -> t.Any:
+    channel_kwargs = {k: kwargs.pop(f"_grpc_channel_{k}", None) for k in {"timeout", "metadata", "credentials", "wait_for_ready", "compression"}}
+    if _inference_api.multi_input:
+      if data is not None: raise ValueError(f"'{_inference_api.name}' takes multiple inputs, and thus required to pass as keyword arguments.")
+      fake_resp = ensure_exec_coro(_inference_api.input.to_proto(kwargs))
+    else: fake_resp = ensure_exec_coro(_inference_api.input.to_proto(data))
+    api_fn = {v: k for k, v in self.svc.apis.items()}
+    stubs = services.BentoServiceStub(self.inner)
+    proto = stubs.Call(pb.Request(**{"api_name": api_fn[_inference_api], _inference_api.input.proto_fields[0]: fake_resp}), **channel_kwargs)
+    return ensure_exec_coro(_inference_api.output.from_proto(getattr(proto, proto.WhichOneof("content"))))
+
+class AsyncGrpcClient(AsyncClient):
+  ssl: bool
+  ssl_client_credentials: t.Optional[ClientCredentials]
+  options: aio.ChannelArgumentType
+  interceptors: t.Optional[t.Sequence[aio.ClientInterceptor]]
+  compression: t.Optional[grpc.Compression]
+
+  def __init__(
+    self,
+    server_url: str,
+    svc: bentoml.Service,
+    # gRPC specific options
+    ssl: bool = False,
+    options: aio.ChannelArgumentType | None = None,
+    interceptors: t.Sequence[aio.ClientInterceptor] | None = None,
+    compression: grpc.Compression | None = None,
+    ssl_client_credentials: ClientCredentials | None = None,
+    **kwargs: t.Any) -> None:
+    self.ssl, self.ssl_client_credentials, self.options, self.interceptors, self.compression = ssl, ssl_client_credentials, options, interceptors, compression
+    super().__init__(server_url, svc, **kwargs)
+  @functools.cached_property
+  def inner(self) -> aio.Channel:
+    if self.ssl:
+      if self.ssl_client_credentials is None: raise RuntimeError("'ssl=True' requires 'ssl_client_credentials'")
+      credentials = grpc.ssl_channel_credentials(**{k: load_from_file(v) if isinstance(v, str) else v for k, v in self.ssl_client_credentials.items()})
+      return aio.secure_channel(self.server_url, credentials=credentials, options=self.options, compression=self.compression, interceptors=self.interceptors)
+    return aio.insecure_channel(self.server_url, options=self.options, compression=self.compression, interceptors=self.interceptors)
+
+  @staticmethod
+  async def wait_until_server_ready(host: str, port: int, timeout: float = 30, check_interval: int = 1, **kwargs: t.Any) -> None:  # polls the gRPC health service until SERVING or until 'timeout' seconds elapse
+    async with dispatch_channel(f"{host.replace(r'localhost', '0.0.0.0')}:{port}", typ="async", options=kwargs.get("options", None), compression=kwargs.get("compression", None), ssl=kwargs.get("ssl", False), ssl_client_credentials=kwargs.get("ssl_client_credentials", None)) as channel:
+      req = pb_health.HealthCheckRequest()
+      req.service = "bentoml.grpc.v1.BentoService"
+      health_stub = services_health.HealthStub(channel)
+      start_time = time.time()
+      while time.time() - start_time < timeout:
+        try:
+          resp = await health_stub.Check(req)  # on an aio channel the stub returns an awaitable call, not the response
+          if resp.status == pb_health.HealthCheckResponse.SERVING: break
+          else: await asyncio.sleep(check_interval)  # sleep without blocking the event loop
+        except grpc.RpcError:
+          logger.debug("Waiting for server to be ready...")
+          await asyncio.sleep(check_interval)
+      try:
+        resp = await health_stub.Check(req)  # final probe: raises TimeoutError if the server is still not SERVING
+        if resp.status != pb_health.HealthCheckResponse.SERVING: raise TimeoutError(f"Timed out waiting {timeout} seconds for server at '{host}:{port}' to be ready.")
+      except grpc.RpcError as err:
+        logger.error("Caught RpcError while connecting to %s:%s:\n", host, port)
+        logger.error(err)
+        raise
+  @classmethod
+  async def from_url(cls, url: str, **kwargs: t.Any) -> AsyncGrpcClient:  # builds a client by reflecting the server's ServiceMetadata into a local bentoml.Service
+    async with dispatch_channel(url.replace(r"localhost", "0.0.0.0"), typ="async", options=kwargs.get("options", None), compression=kwargs.get("compression", None), ssl=kwargs.get("ssl", False), ssl_client_credentials=kwargs.get("ssl_client_credentials", None), interceptors=kwargs.get("interceptors", None)) as channel: metadata = t.cast("ServiceMetadataResponse", await channel.unary_unary("/bentoml.grpc.v1.BentoService/ServiceMetadata", request_serializer=pb.ServiceMetadataRequest.SerializeToString, response_deserializer=pb.ServiceMetadataResponse.FromString)(pb.ServiceMetadataRequest()))  # the aio call must be awaited to obtain the response message
+    reflection = bentoml.Service(metadata.name)
+    for api in metadata.apis:
+      try: reflection.apis[api.name] = InferenceAPI[t.Any](None,
+            bentoml.io.from_spec({"id": api.input.descriptor_id, "args": json_format.MessageToDict(api.input.attributes).get("args", None)}),
+            bentoml.io.from_spec({"id": api.output.descriptor_id, "args": json_format.MessageToDict(api.output.attributes).get("args", None)}),
+            name=api.name, doc=api.docs)
+      except Exception as e: logger.error("Failed to instantiate client for API %s: %s", api.name, e)  # one placeholder per argument; a failing API is logged and skipped
+    return cls(url, reflection, **kwargs)
+  async def health(self) -> t.Any: return await services_health.HealthStub(self.inner).Check(pb_health.HealthCheckRequest(service=""))
+  async def _call(self, data: t.Any, /, *, _inference_api: InferenceAPI[t.Any], **kwargs: t.Any) -> t.Any:
+    channel_kwargs = {k: kwargs.pop(f"_grpc_channel_{k}", None) for k in {"timeout", "metadata", "credentials", "wait_for_ready", "compression"}}
+    state = self.inner.get_state(try_to_connect=True)
+    if state != grpc.ChannelConnectivity.READY: await self.inner.channel_ready()
+    if _inference_api.multi_input:
+      if data is not None: raise ValueError(f"'{_inference_api.name}' takes multiple inputs, and thus required to pass as keyword arguments.")
+      fake_resp = await _inference_api.input.to_proto(kwargs)
+    else: fake_resp = await _inference_api.input.to_proto(data)
+    api_fn = {v: k for k, v in self.svc.apis.items()}
+    async with self.inner:
+      stubs = services.BentoServiceStub(self.inner)
+      proto = await stubs.Call(pb.Request(**{"api_name": api_fn[_inference_api], _inference_api.input.proto_fields[0]: fake_resp}), **channel_kwargs)
+    return await _inference_api.output.from_proto(getattr(proto, proto.WhichOneof("content")))
--- a/openllm-client/src/openllm_client/benmin/_http.py
+++ b/openllm-client/src/openllm_client/benmin/_http.py
@@ -0,0 +1,137 @@
+from __future__ import annotations
+import functools, httpx, time, logging, urllib.error, typing as t, orjson, bentoml, starlette.requests, starlette.datastructures, starlette.responses, asyncio
+from bentoml._internal.service.inference_api import InferenceAPI
+from urllib.parse import urlparse
+from openllm_client.benmin import Client, AsyncClient
+from openllm_core.utils import ensure_exec_coro
+
+logger = logging.getLogger(__name__)
+
+class HttpClient(Client):
+  @functools.cached_property
+  def inner(self) -> httpx.Client:
+    if not urlparse(self.server_url).netloc: raise ValueError(f"Invalid server url: {self.server_url}")
+    return httpx.Client(base_url=self.server_url)
+  @staticmethod
+  def wait_until_server_ready(host: str, port: int, timeout: float = 30, check_interval: int = 1, **kwargs: t.Any) -> None:
+    host = host if "://" in host else "http://" + host
+    logger.debug("Waiting for server @ `%s:%d` to be ready...", host, port)
+    start = time.time()
+    while time.time() - start < timeout:
+      try:
+        status = httpx.get(f"{host}:{port}/readyz").status_code
+        if status == 200: break
+        else: time.sleep(check_interval)
+      except (httpx.ConnectError, urllib.error.URLError, ConnectionError):
+        logger.debug("Server is not ready yet, retrying in %d seconds...", check_interval)
+        time.sleep(check_interval)
+    # Try once more and raise for exception
+    try: httpx.get(f"{host}:{port}/readyz").raise_for_status()
+    except httpx.HTTPStatusError as err:
+      logger.error("Failed to wait until server ready: %s:%d", host, port)
+      logger.error(err)
+      raise
+  def health(self) -> httpx.Response: return self.inner.get("/readyz")
+  @classmethod
+  def from_url(cls, url: str, **kwargs: t.Any) -> HttpClient:
+    url = url if "://" in url else "http://" + url
+    resp = httpx.get(f"{url}/docs.json")
+    if resp.status_code != 200: raise ValueError(f"Failed to get OpenAPI schema from the server: {resp.status_code} {resp.reason_phrase}:\n{resp.content.decode()}")
+    _spec = orjson.loads(resp.content)
+
+    reflection = bentoml.Service(_spec["info"]["title"])
+
+    for route, spec in _spec["paths"].items():
+      for meth_spec in spec.values():
+        if "tags" in meth_spec and "Service APIs" in meth_spec["tags"]:
+          if "x-bentoml-io-descriptor" not in meth_spec["requestBody"]: raise ValueError(f"Malformed BentoML spec received from BentoML server {url}")
+          if "x-bentoml-io-descriptor" not in meth_spec["responses"]["200"]: raise ValueError(f"Malformed BentoML spec received from BentoML server {url}")
+          if "x-bentoml-name" not in meth_spec: raise ValueError(f"Malformed BentoML spec received from BentoML server {url}")
+          try: reflection.apis[meth_spec["x-bentoml-name"]] = InferenceAPI[t.Any](None, bentoml.io.from_spec(meth_spec["requestBody"]["x-bentoml-io-descriptor"]), bentoml.io.from_spec(meth_spec["responses"]["200"]["x-bentoml-io-descriptor"]), name=meth_spec["x-bentoml-name"], doc=meth_spec["description"], route=route.lstrip("/"))
+          except Exception as e: logger.error("Failed to instantiate client for API %s: ", meth_spec["x-bentoml-name"], e)
+    return cls(url, reflection)
+
+  def _call(self, data: t.Any, /, *, _inference_api: InferenceAPI[t.Any], **kwargs: t.Any) -> t.Any:
+    # All gRPC kwargs should be popped out.
+    kwargs = {k: v for k, v in kwargs.items() if not k.startswith("_grpc_")}
+    if _inference_api.multi_input:
+      if data is not None: raise ValueError(f"'{_inference_api.name}' takes multiple inputs, and thus required to pass as keyword arguments.")
+      fake_resp = ensure_exec_coro(_inference_api.input.to_http_response(kwargs, None))
+    else: fake_resp = ensure_exec_coro(_inference_api.input.to_http_response(data, None))
+
+    # XXX: hack around StreamingResponse, since now we only have Text, for metadata so it is fine to do this.
+    if isinstance(fake_resp, starlette.responses.StreamingResponse): body = None
+    else: body = fake_resp.body
+
+    resp = self.inner.post("/" + _inference_api.route if not _inference_api.route.startswith("/") else _inference_api.route, data=body, headers={"content-type": fake_resp.headers["content-type"]}, timeout=self.timeout)
+    if resp.status_code != 200: raise ValueError(f"Error while making request: {resp.status_code}: {resp.content!s}")
+    fake_req = starlette.requests.Request(scope={"type": "http"})
+    headers = starlette.datastructures.Headers(headers=resp.headers)
+    fake_req._body = resp.content
+    # Request.headers sets a _headers variable. We will need to set this value to our fake request object.
+    fake_req._headers = headers
+    return ensure_exec_coro(_inference_api.output.from_http_request(fake_req))
+
+class AsyncHttpClient(AsyncClient):
+  @functools.cached_property
+  def inner(self) -> httpx.AsyncClient:
+    if not urlparse(self.server_url).netloc: raise ValueError(f"Invalid server url: {self.server_url}")
+    return httpx.AsyncClient(base_url=self.server_url)
+  @staticmethod
+  async def wait_until_server_ready(host: str, port: int, timeout: float = 30, check_interval: int = 1, **kwargs: t.Any) -> None:
+    host = host if "://" in host else "http://" + host
+    logger.debug("Waiting for server @ `%s:%d` to be ready...", host, port)
+    start = time.time()
+    while time.time() - start < timeout:
+      try:
+        async with httpx.AsyncClient(base_url=f"{host}:{port}") as sess:
+          resp = await sess.get("/readyz")
+          if resp.status_code == 200: break
+          else: await asyncio.sleep(check_interval)
+      except (httpx.ConnectError, urllib.error.URLError, ConnectionError):
+        logger.debug("Server is not ready yet, retrying in %d seconds...", check_interval)
+        await asyncio.sleep(check_interval)
+    # Try once more and raise for exception
+    async with httpx.AsyncClient(base_url=f"{host}:{port}") as sess:
+      resp = await sess.get("/readyz")
+      if resp.status_code != 200: raise TimeoutError(f"Timeout while waiting for server @ `{host}:{port}` to be ready: {resp.status_code}: {resp.content!s}")
+  async def health(self) -> httpx.Response: return await self.inner.get("/readyz")
+  @classmethod
+  async def from_url(cls, url: str, **kwargs: t.Any) -> AsyncHttpClient:
+    url = url if "://" in url else "http://" + url
+    async with httpx.AsyncClient(base_url=url) as session:
+        resp = await session.get("/docs.json")
+        if resp.status_code != 200: raise ValueError(f"Failed to get OpenAPI schema from the server: {resp.status_code} {resp.reason_phrase}:\n{(await resp.aread()).decode()}")
+        _spec = orjson.loads(await resp.aread())
+
+    reflection = bentoml.Service(_spec["info"]["title"])
+
+    for route, spec in _spec["paths"].items():
+      for meth_spec in spec.values():
+        if "tags" in meth_spec and "Service APIs" in meth_spec["tags"]:
+          if "x-bentoml-io-descriptor" not in meth_spec["requestBody"]: raise ValueError(f"Malformed BentoML spec received from BentoML server {url}")
+          if "x-bentoml-io-descriptor" not in meth_spec["responses"]["200"]: raise ValueError(f"Malformed BentoML spec received from BentoML server {url}")
+          if "x-bentoml-name" not in meth_spec: raise ValueError(f"Malformed BentoML spec received from BentoML server {url}")
+          try: reflection.apis[meth_spec["x-bentoml-name"]] = InferenceAPI[t.Any](None, bentoml.io.from_spec(meth_spec["requestBody"]["x-bentoml-io-descriptor"]), bentoml.io.from_spec(meth_spec["responses"]["200"]["x-bentoml-io-descriptor"]), name=meth_spec["x-bentoml-name"], doc=meth_spec["description"], route=route.lstrip("/"))
+          except ValueError as e: logger.error("Failed to instantiate client for API %s: ", meth_spec["x-bentoml-name"], e)
+    return cls(url, reflection)
+  async def _call(self, data: t.Any, /, *, _inference_api: InferenceAPI[t.Any], **kwargs: t.Any) -> t.Any:
+    # All gRPC kwargs should be popped out.
+    kwargs = {k: v for k, v in kwargs.items() if not k.startswith("_grpc_")}
+    if _inference_api.multi_input:
+      if data is not None: raise ValueError(f"'{_inference_api.name}' takes multiple inputs, and thus required to pass as keyword arguments.")
+      fake_resp = await _inference_api.input.to_http_response(kwargs, None)
+    else: fake_resp = await _inference_api.input.to_http_response(data, None)
+
+    # XXX: hack around StreamingResponse, since now we only have Text, for metadata so it is fine to do this.
+    if isinstance(fake_resp, starlette.responses.StreamingResponse): body = None
+    else: body = t.cast(t.Any, fake_resp.body)
+
+    resp = await self.inner.post("/" + _inference_api.route if not _inference_api.route.startswith("/") else _inference_api.route, data=body, headers={"content-type": fake_resp.headers["content-type"]}, timeout=self.timeout)
+    if resp.status_code != 200: raise ValueError(f"Error making request: {resp.status_code}: {(await resp.aread())!s}")
+    fake_req = starlette.requests.Request(scope={"type": "http"})
+    headers = starlette.datastructures.Headers(headers=resp.headers)
+    fake_req._body = resp.content
+    # Request.headers sets a _headers variable. We will need to set this value to our fake request object.
+    fake_req._headers = headers
+    return await _inference_api.output.from_http_request(fake_req)
--- a/openllm-client/src/openllm_client/client.py
+++ b/openllm-client/src/openllm_client/client.py
@@ -0,0 +1,33 @@
+from __future__ import annotations
+import logging
+from urllib.parse import urlparse
+from ._base import BaseClient, BaseAsyncClient
+
+logger = logging.getLogger(__name__)
+
+def process_http_address(self: AsyncHTTPClient | HTTPClient, address: str) -> None:
+  address = address if "://" in address else "http://" + address
+  parsed = urlparse(address)
+  self._host, *_port = parsed.netloc.split(":")
+  if len(_port) == 0: self._port = "80" if parsed.scheme == "http" else "443"
+  else: self._port = next(iter(_port))
+
+class HTTPClient(BaseClient):  # HTTP flavour of BaseClient: host/port are derived from ``address`` before the base initialiser runs.
+  def __init__(self, address: str, timeout: int = 30):
+    process_http_address(self, address)  # sets self._host / self._port
+    super().__init__(address, timeout)
+
+class AsyncHTTPClient(BaseAsyncClient):  # HTTP flavour of BaseAsyncClient: host/port are derived from ``address`` before the base initialiser runs.
+  def __init__(self, address: str, timeout: int = 30):
+    process_http_address(self, address)  # sets self._host / self._port
+    super().__init__(address, timeout)
+
+class GrpcClient(BaseClient):  # gRPC flavour of BaseClient; ``address`` must be exactly ``host:port``.
+  def __init__(self, address: str, timeout: int = 30):
+    self._host, self._port = address.split(":")  # unpacking raises ValueError unless there is exactly one ":" in ``address``
+    super().__init__(address, timeout)
+
+class AsyncGrpcClient(BaseAsyncClient):  # gRPC flavour of BaseAsyncClient; ``address`` must be exactly ``host:port``.
+  def __init__(self, address: str, timeout: int = 30):
+    self._host, self._port = address.split(":")  # unpacking raises ValueError unless there is exactly one ":" in ``address``
+    super().__init__(address, timeout)
--- a/openllm-core/.git_archival.txt
+++ b/openllm-core/.git_archival.txt
@@ -0,0 +1 @@
+../.git_archival.txt
--- a/openllm-core/.python-version-default
+++ b/openllm-core/.python-version-default
@@ -0,0 +1 @@
+../.python-version-default
--- a/openllm-core/LICENSE.md
+++ b/openllm-core/LICENSE.md
@@ -0,0 +1,194 @@
+Apache License
+==============
+
+_Version 2.0, January 2004_
+_&lt;<http://www.apache.org/licenses/>&gt;_
+
+### Terms and Conditions for use, reproduction, and distribution
+
+#### 1. Definitions
+
+“License” shall mean the terms and conditions for use, reproduction, and
+distribution as defined by Sections 1 through 9 of this document.
+
+“Licensor” shall mean the copyright owner or entity authorized by the copyright
+owner that is granting the License.
+
+“Legal Entity” shall mean the union of the acting entity and all other entities
+that control, are controlled by, or are under common control with that entity.
+For the purposes of this definition, “control” means **(i)** the power, direct or
+indirect, to cause the direction or management of such entity, whether by
+contract or otherwise, or **(ii)** ownership of fifty percent (50%) or more of the
+outstanding shares, or **(iii)** beneficial ownership of such entity.
+
+“You” (or “Your”) shall mean an individual or Legal Entity exercising
+permissions granted by this License.
+
+“Source” form shall mean the preferred form for making modifications, including
+but not limited to software source code, documentation source, and configuration
+files.
+
+“Object” form shall mean any form resulting from mechanical transformation or
+translation of a Source form, including but not limited to compiled object code,
+generated documentation, and conversions to other media types.
+
+“Work” shall mean the work of authorship, whether in Source or Object form, made
+available under the License, as indicated by a copyright notice that is included
+in or attached to the work (an example is provided in the Appendix below).
+
+“Derivative Works” shall mean any work, whether in Source or Object form, that
+is based on (or derived from) the Work and for which the editorial revisions,
+annotations, elaborations, or other modifications represent, as a whole, an
+original work of authorship. For the purposes of this License, Derivative Works
+shall not include works that remain separable from, or merely link (or bind by
+name) to the interfaces of, the Work and Derivative Works thereof.
+
+“Contribution” shall mean any work of authorship, including the original version
+of the Work and any modifications or additions to that Work or Derivative Works
+thereof, that is intentionally submitted to Licensor for inclusion in the Work
+by the copyright owner or by an individual or Legal Entity authorized to submit
+on behalf of the copyright owner. For the purposes of this definition,
+“submitted” means any form of electronic, verbal, or written communication sent
+to the Licensor or its representatives, including but not limited to
+communication on electronic mailing lists, source code control systems, and
+issue tracking systems that are managed by, or on behalf of, the Licensor for
+the purpose of discussing and improving the Work, but excluding communication
+that is conspicuously marked or otherwise designated in writing by the copyright
+owner as “Not a Contribution.”
+
+“Contributor” shall mean Licensor and any individual or Legal Entity on behalf
+of whom a Contribution has been received by Licensor and subsequently
+incorporated within the Work.
+
+#### 2. Grant of Copyright License
+
+Subject to the terms and conditions of this License, each Contributor hereby
+grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free,
+irrevocable copyright license to reproduce, prepare Derivative Works of,
+publicly display, publicly perform, sublicense, and distribute the Work and such
+Derivative Works in Source or Object form.
+
+#### 3. Grant of Patent License
+
+Subject to the terms and conditions of this License, each Contributor hereby
+grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free,
+irrevocable (except as stated in this section) patent license to make, have
+made, use, offer to sell, sell, import, and otherwise transfer the Work, where
+such license applies only to those patent claims licensable by such Contributor
+that are necessarily infringed by their Contribution(s) alone or by combination
+of their Contribution(s) with the Work to which such Contribution(s) was
+submitted. If You institute patent litigation against any entity (including a
+cross-claim or counterclaim in a lawsuit) alleging that the Work or a
+Contribution incorporated within the Work constitutes direct or contributory
+patent infringement, then any patent licenses granted to You under this License
+for that Work shall terminate as of the date such litigation is filed.
+
+#### 4. Redistribution
+
+You may reproduce and distribute copies of the Work or Derivative Works thereof
+in any medium, with or without modifications, and in Source or Object form,
+provided that You meet the following conditions:
+
+* **(a)** You must give any other recipients of the Work or Derivative Works a copy of
+this License; and
+* **(b)** You must cause any modified files to carry prominent notices stating that You
+changed the files; and
+* **(c)** You must retain, in the Source form of any Derivative Works that You distribute,
+all copyright, patent, trademark, and attribution notices from the Source form
+of the Work, excluding those notices that do not pertain to any part of the
+Derivative Works; and
+* **(d)** If the Work includes a “NOTICE” text file as part of its distribution, then any
+Derivative Works that You distribute must include a readable copy of the
+attribution notices contained within such NOTICE file, excluding those notices
+that do not pertain to any part of the Derivative Works, in at least one of the
+following places: within a NOTICE text file distributed as part of the
+Derivative Works; within the Source form or documentation, if provided along
+with the Derivative Works; or, within a display generated by the Derivative
+Works, if and wherever such third-party notices normally appear. The contents of
+the NOTICE file are for informational purposes only and do not modify the
+License. You may add Your own attribution notices within Derivative Works that
+You distribute, alongside or as an addendum to the NOTICE text from the Work,
+provided that such additional attribution notices cannot be construed as
+modifying the License.
+
+You may add Your own copyright statement to Your modifications and may provide
+additional or different license terms and conditions for use, reproduction, or
+distribution of Your modifications, or for any such Derivative Works as a whole,
+provided Your use, reproduction, and distribution of the Work otherwise complies
+with the conditions stated in this License.
+
+#### 5. Submission of Contributions
+
+Unless You explicitly state otherwise, any Contribution intentionally submitted
+for inclusion in the Work by You to the Licensor shall be under the terms and
+conditions of this License, without any additional terms or conditions.
+Notwithstanding the above, nothing herein shall supersede or modify the terms of
+any separate license agreement you may have executed with Licensor regarding
+such Contributions.
+
+#### 6. Trademarks
+
+This License does not grant permission to use the trade names, trademarks,
+service marks, or product names of the Licensor, except as required for
+reasonable and customary use in describing the origin of the Work and
+reproducing the content of the NOTICE file.
+
+#### 7. Disclaimer of Warranty
+
+Unless required by applicable law or agreed to in writing, Licensor provides the
+Work (and each Contributor provides its Contributions) on an “AS IS” BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied,
+including, without limitation, any warranties or conditions of TITLE,
+NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are
+solely responsible for determining the appropriateness of using or
+redistributing the Work and assume any risks associated with Your exercise of
+permissions under this License.
+
+#### 8. Limitation of Liability
+
+In no event and under no legal theory, whether in tort (including negligence),
+contract, or otherwise, unless required by applicable law (such as deliberate
+and grossly negligent acts) or agreed to in writing, shall any Contributor be
+liable to You for damages, including any direct, indirect, special, incidental,
+or consequential damages of any character arising as a result of this License or
+out of the use or inability to use the Work (including but not limited to
+damages for loss of goodwill, work stoppage, computer failure or malfunction, or
+any and all other commercial damages or losses), even if such Contributor has
+been advised of the possibility of such damages.
+
+#### 9. Accepting Warranty or Additional Liability
+
+While redistributing the Work or Derivative Works thereof, You may choose to
+offer, and charge a fee for, acceptance of support, warranty, indemnity, or
+other liability obligations and/or rights consistent with this License. However,
+in accepting such obligations, You may act only on Your own behalf and on Your
+sole responsibility, not on behalf of any other Contributor, and only if You
+agree to indemnify, defend, and hold each Contributor harmless for any liability
+incurred by, or claims asserted against, such Contributor by reason of your
+accepting any such warranty or additional liability.
+
+_END OF TERMS AND CONDITIONS_
+
+### APPENDIX: How to apply the Apache License to your work
+
+To apply the Apache License to your work, attach the following boilerplate
+notice, with the fields enclosed by brackets `[]` replaced with your own
+identifying information. (Don't include the brackets!) The text should be
+enclosed in the appropriate comment syntax for the file format. We also
+recommend that a file or class name and description of purpose be included on
+the same “printed page” as the copyright notice for easier identification within
+third-party archives.
+
+    Copyright 2023 Atalaya Tech Inc.
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
--- a/openllm-core/README.md
+++ b/openllm-core/README.md
@@ -0,0 +1,75 @@
+![Banner for OpenLLM](/.github/assets/main-banner.png)
+
+<!-- hatch-fancy-pypi-readme intro start -->
+
+<div align="center">
+    <h1 align="center">🦑 OpenLLM Core</h1>
+    <a href="https://pypi.org/project/openllm-core">
+        <img src="https://img.shields.io/pypi/v/openllm-core.svg?logo=pypi&label=PyPI&logoColor=gold" alt="pypi_status" />
+    </a><a href="https://test.pypi.org/project/openllm-core/">
+        <img src="https://img.shields.io/badge/Nightly-PyPI?logo=pypi&label=PyPI&color=gray&link=https%3A%2F%2Ftest.pypi.org%2Fproject%2Fopenllm%2F" alt="test_pypi_status" />
+    </a><a href="https://twitter.com/bentomlai">
+        <img src="https://badgen.net/badge/icon/@bentomlai/1DA1F2?icon=twitter&label=Follow%20Us" alt="Twitter" />
+    </a><a href="https://l.bentoml.com/join-openllm-discord">
+        <img src="https://badgen.net/badge/icon/OpenLLM/7289da?icon=discord&label=Join%20Us" alt="Discord" />
+    </a><a href="https://github.com/bentoml/OpenLLM/actions/workflows/ci.yml">
+        <img src="https://github.com/bentoml/OpenLLM/actions/workflows/ci.yml/badge.svg?branch=main" alt="ci" />
+    </a><a href="https://results.pre-commit.ci/latest/github/bentoml/OpenLLM/main">
+        <img src="https://results.pre-commit.ci/badge/github/bentoml/OpenLLM/main.svg" alt="pre-commit.ci status" />
+    </a><br>
+    <a href="https://pypi.org/project/openllm-core">
+        <img src="https://img.shields.io/pypi/pyversions/openllm-core.svg?logo=python&label=Python&logoColor=gold" alt="python_version" />
+    </a><a href="https://github.com/pypa/hatch">
+        <img src="https://img.shields.io/badge/%F0%9F%A5%9A-Hatch-4051b5.svg" alt="Hatch" />
+    </a><a href="https://github.com/bentoml/OpenLLM/blob/main/STYLE.md">
+        <img src="https://img.shields.io/badge/code%20style-experimental-000000.svg" alt="code style" />
+    </a><a href="https://github.com/astral-sh/ruff">
+        <img src="https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/charliermarsh/ruff/main/assets/badge/v2.json" alt="Ruff" />
+    </a><a href="https://github.com/python/mypy">
+        <img src="https://img.shields.io/badge/types-mypy-blue.svg" alt="types - mypy" />
+    </a><a href="https://github.com/microsoft/pyright">
+        <img src="https://img.shields.io/badge/types-pyright-yellow.svg" alt="types - pyright" />
+    </a><br>
+    <p>OpenLLM Core: Core components for OpenLLM.<br/></p>
+    <i></i>
+</div>
+
+## 📖 Introduction
+
+With OpenLLM, you can run inference with any open-source large language model,
+deploy to the cloud or on-premises, build powerful AI apps, and more.
+
+To learn more about OpenLLM, please visit <a href="https://github.com/bentoml/OpenLLM">OpenLLM's README.md</a>
+
+This package holds the core components of OpenLLM and is considered internal.
+
+Components include:
+
+- Configuration generation.
+- Utilities for interacting with OpenLLM server.
+- Schema and generation utilities for OpenLLM server.
+
+<!-- hatch-fancy-pypi-readme intro stop -->
+
+![Gif showing OpenLLM Intro](/.github/assets/output.gif)
+
+<br/>
+
+<!-- hatch-fancy-pypi-readme interim start -->
+
+## 📔 Citation
+
+If you use OpenLLM in your research, we provide a [citation](../CITATION.cff) to use:
+
+```bibtex
+@software{Pham_OpenLLM_Operating_LLMs_2023,
+author = {Pham, Aaron and Yang, Chaoyu and Sheng, Sean and  Zhao, Shenyang and Lee, Sauyon and Jiang, Bo and Dong, Fog and Guan, Xipeng and Ming, Frost},
+license = {Apache-2.0},
+month = jun,
+title = {{OpenLLM: Operating LLMs in production}},
+url = {https://github.com/bentoml/OpenLLM},
+year = {2023}
+}
+```
+
+<!-- hatch-fancy-pypi-readme interim stop -->
--- a/openllm-core/pyproject.toml
+++ b/openllm-core/pyproject.toml
@@ -0,0 +1,195 @@
+[build-system]
+build-backend = "hatchling.build"
+requires = [
+  "hatchling==1.18.0",
+  "hatch-vcs==0.3.0",
+  "hatch-fancy-pypi-readme==23.1.0",
+]
+
+[project]
+authors = [
+  { name = "Aaron Pham", email = "aarnphm@bentoml.com" },
+  { name = "BentoML Team", email = "contact@bentoml.com" },
+]
+dynamic = ['readme', 'version']
+classifiers = [
+  "Development Status :: 5 - Production/Stable",
+  "Environment :: GPU :: NVIDIA CUDA",
+  "Environment :: GPU :: NVIDIA CUDA :: 12",
+  "Environment :: GPU :: NVIDIA CUDA :: 11.8",
+  "Environment :: GPU :: NVIDIA CUDA :: 11.7",
+  "License :: OSI Approved :: Apache Software License",
+  "Topic :: Scientific/Engineering",
+  "Topic :: Scientific/Engineering :: Artificial Intelligence",
+  "Topic :: Software Development :: Libraries",
+  "Operating System :: OS Independent",
+  "Intended Audience :: Developers",
+  "Intended Audience :: Science/Research",
+  "Intended Audience :: System Administrators",
+  "Typing :: Typed",
+  "Programming Language :: Python",
+  "Programming Language :: Python :: 3",
+  "Programming Language :: Python :: 3 :: Only",
+  "Programming Language :: Python :: 3.8",
+  "Programming Language :: Python :: 3.9",
+  "Programming Language :: Python :: 3.10",
+  "Programming Language :: Python :: 3.11",
+  "Programming Language :: Python :: Implementation :: CPython",
+  "Programming Language :: Python :: Implementation :: PyPy",
+]
+description = "OpenLLM Core: Core components for OpenLLM."
+keywords = [
+  "MLOps",
+  "AI",
+  "BentoML",
+  "Model Serving",
+  "Model Deployment",
+  "LLMOps",
+  "Falcon",
+  "Vicuna",
+  "Llama 2",
+  "Fine tuning",
+  "Serverless",
+  "Large Language Model",
+  "Generative AI",
+  "StableLM",
+  "Alpaca",
+  "PyTorch",
+  "Transformers",
+]
+dependencies = [
+  "bentoml>=1.1.2",
+  "attrs>=23.1.0",
+  "cattrs>=23.1.0",
+  "orjson",
+  "inflection",
+  "typing_extensions",
+  "mypy_extensions",
+]
+license = "Apache-2.0"
+name = "openllm-core"
+requires-python = ">=3.8"
+[project.urls]
+Blog = "https://modelserving.com"
+Chat = "https://discord.gg/openllm"
+Documentation = "https://github.com/bentoml/OpenLLM/blob/main/openllm-core/README.md"
+GitHub = "https://github.com/bentoml/OpenLLM/blob/main/openllm-core"
+History = "https://github.com/bentoml/OpenLLM/blob/main/CHANGELOG.md"
+Homepage = "https://bentoml.com"
+Tracker = "https://github.com/bentoml/OpenLLM/issues"
+Twitter = "https://twitter.com/bentomlai"
+[project.optional-dependencies]
+full = ["openllm-core[vllm,fine-tune]"]
+vllm = ["vllm", "ray"]
+fine-tune = ["transformers[torch,tokenizers,accelerate]>=4.29.0", "peft>=0.4.0", "bitsandbytes", "datasets", "accelerate", "trl"]
+
+[tool.hatch.version]
+fallback-version = "0.0.0"
+source = "vcs"
+[tool.hatch.build.hooks.vcs]
+version-file = "src/openllm_core/_version.py"
+[tool.hatch.version.raw-options]
+git_describe_command = [
+  "git",
+  "describe",
+  "--dirty",
+  "--tags",
+  "--long",
+  "--first-parent",
+]
+local_scheme = "no-local-version"
+root = ".."
+[tool.hatch.metadata]
+allow-direct-references = true
+[tool.hatch.build.targets.wheel]
+only-include = ["src/openllm_core"]
+sources = ["src"]
+[tool.hatch.build.targets.sdist]
+exclude = [
+  "/.git_archival.txt",
+  "tests",
+  "/.python-version-default",
+  "ADDING_NEW_MODEL.md",
+]
+[tool.hatch.build.targets.wheel.hooks.mypyc]
+dependencies = [
+  "hatch-mypyc==0.16.0",
+  "mypy==1.5.1",
+  # avoid https://github.com/pallets/click/issues/2558
+  "click==8.1.3",
+  "bentoml==1.1.2",
+  "transformers>=4.31.0",
+  "pandas-stubs",
+  "types-psutil",
+  "types-tabulate",
+  "types-PyYAML",
+  "types-protobuf",
+]
+enable-by-default = false
+exclude = ["src/openllm_core/_typing_compat.py"]
+include = [
+  "src/openllm_core/utils/__init__.py",
+  "src/openllm_core/__init__.py",
+  "src/openllm_core/_prompt.py",
+  "src/openllm_core/_schema.py",
+  "src/openllm_core/_strategies.py",
+  "src/openllm_core/exceptions.py",
+]
+# NOTE: This is consistent with pyproject.toml
+mypy-args = [
+  "--strict",
+  # this is because the transitive libraries don't ship type information
+  "--follow-imports=skip",
+  "--allow-subclassing-any",
+  "--check-untyped-defs",
+  "--ignore-missing-imports",
+  "--no-warn-return-any",
+  "--warn-unreachable",
+  "--no-warn-no-return",
+  "--no-warn-unused-ignores",
+  "--exclude='/src\\/openllm_core\\/_typing_compat\\.py$'",
+]
+options = { verbose = true, strip_asserts = true, debug_level = "2", opt_level = "3", include_runtime_files = true }
+require-runtime-dependencies = true
+
+[tool.hatch.metadata.hooks.fancy-pypi-readme]
+content-type = "text/markdown"
+# PyPI doesn't support the <picture> tag.
+[[tool.hatch.metadata.hooks.fancy-pypi-readme.fragments]]
+text = """
+<p align="center">
+  <a href="https://github.com/bentoml/openllm">
+    <img src="https://raw.githubusercontent.com/bentoml/openllm/main/.github/assets/main-banner.png" alt="Banner for OpenLLM" />
+  </a>
+</p>
+
+"""
+[[tool.hatch.metadata.hooks.fancy-pypi-readme.fragments]]
+end-before = "\n<!-- hatch-fancy-pypi-readme intro stop -->"
+path = "README.md"
+start-after = "<!-- hatch-fancy-pypi-readme intro start -->\n"
+[[tool.hatch.metadata.hooks.fancy-pypi-readme.fragments]]
+text = """
+
+<p align="center">
+  <img src="https://raw.githubusercontent.com/bentoml/openllm/main/.github/assets/output.gif" alt="Gif showing OpenLLM Intro" />
+</p>
+"""
+[[tool.hatch.metadata.hooks.fancy-pypi-readme.fragments]]
+text = """
+
+<p align="center">
+  <img src="https://raw.githubusercontent.com/bentoml/openllm/main/.github/assets/agent.gif" alt="Gif showing Agent integration" />
+</p>
+"""
+[[tool.hatch.metadata.hooks.fancy-pypi-readme.fragments]]
+end-before = "\n<!-- hatch-fancy-pypi-readme interim stop -->"
+path = "README.md"
+start-after = "<!-- hatch-fancy-pypi-readme interim start -->\n"
+[[tool.hatch.metadata.hooks.fancy-pypi-readme.fragments]]
+text = """
+
+---
+
+[Click me for full changelog](https://github.com/bentoml/openllm/blob/main/CHANGELOG.md)
+"""
--- a/openllm-core/src/openllm_core/init.py
+++ b/openllm-core/src/openllm_core/init.py
@@ -0,0 +1,7 @@
+from __future__ import annotations
+from . import utils as utils
+from . import exceptions as exceptions
+from ._configuration import LLMConfig as LLMConfig, GenerationConfig as GenerationConfig, SamplingParams as SamplingParams
+from ._strategies import CascadingResourceStrategy as CascadingResourceStrategy, get_resource as get_resource, available_resource_spec as available_resource_spec, LiteralResourceSpec as LiteralResourceSpec, NvidiaGpuResource as NvidiaGpuResource, AmdGpuResource as AmdGpuResource
+from ._schema import EmbeddingsOutput as EmbeddingsOutput, GenerationInput as GenerationInput, GenerationOutput as GenerationOutput, HfAgentInput as HfAgentInput, MetadataOutput as MetadataOutput, unmarshal_vllm_outputs as unmarshal_vllm_outputs
+from .config import AutoConfig as AutoConfig, CONFIG_MAPPING as CONFIG_MAPPING, CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES, BaichuanConfig as BaichuanConfig, START_BAICHUAN_COMMAND_DOCSTRING as START_BAICHUAN_COMMAND_DOCSTRING, ChatGLMConfig as ChatGLMConfig, START_CHATGLM_COMMAND_DOCSTRING as START_CHATGLM_COMMAND_DOCSTRING, DollyV2Config as DollyV2Config, START_DOLLY_V2_COMMAND_DOCSTRING as START_DOLLY_V2_COMMAND_DOCSTRING, FalconConfig as FalconConfig, START_FALCON_COMMAND_DOCSTRING as START_FALCON_COMMAND_DOCSTRING, FlanT5Config as FlanT5Config, START_FLAN_T5_COMMAND_DOCSTRING as START_FLAN_T5_COMMAND_DOCSTRING, GPTNeoXConfig as GPTNeoXConfig, START_GPT_NEOX_COMMAND_DOCSTRING as START_GPT_NEOX_COMMAND_DOCSTRING, LlamaConfig as LlamaConfig, START_LLAMA_COMMAND_DOCSTRING as START_LLAMA_COMMAND_DOCSTRING, MPTConfig as MPTConfig, START_MPT_COMMAND_DOCSTRING as START_MPT_COMMAND_DOCSTRING, OPTConfig as OPTConfig, START_OPT_COMMAND_DOCSTRING as START_OPT_COMMAND_DOCSTRING, StableLMConfig as StableLMConfig, START_STABLELM_COMMAND_DOCSTRING as START_STABLELM_COMMAND_DOCSTRING, StarCoderConfig as StarCoderConfig, START_STARCODER_COMMAND_DOCSTRING as START_STARCODER_COMMAND_DOCSTRING
--- a/openllm-core/src/openllm_core/_configuration.py
+++ b/openllm-core/src/openllm_core/_configuration.py
@@ -34,8 +34,7 @@ dynamically during serve, ahead-of-serve or per requests.
 Refer to ``openllm.LLMConfig`` docstring for more information.
 """
 from __future__ import annotations
-import copy, enum, logging, os, sys, types, typing as t
-import attr, click_option_group as cog, inflection, orjson, openllm
+import copy, enum, logging, os, sys, types, typing as t, attr, click_option_group as cog, inflection, orjson, openllm_core
 from cattr.gen import make_dict_structure_fn, make_dict_unstructure_fn, override
 from deepmerge.merger import Merger
 from ._strategies import LiteralResourceSpec, available_resource_spec, resource_spec
@@ -51,9 +50,10 @@ from .utils import (
  field_env_key,
  first_not_none,
  lenient_issubclass,
+  LazyLoader
 )
 from .utils.import_utils import BACKENDS_MAPPING
-# NOTE: Using internal API from attr here, since we are actually allowing subclass of openllm.LLMConfig to become 'attrs'-ish
+# NOTE: Using internal API from attr here, since we are actually allowing subclass of openllm_core.LLMConfig to become 'attrs'-ish
 from attr._compat import set_closure_cell
 from attr._make import _CountingAttr, _make_init, _transform_attrs
 from ._typing_compat import AnyCallable, At, Self, ListStr, DictStrAny
@@ -63,11 +63,11 @@ if t.TYPE_CHECKING:
  from transformers.generation.beam_constraints import Constraint
 else:
  Constraint = t.Any
-  vllm = openllm.utils.LazyLoader("vllm", globals(), "vllm")
-  transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
-  peft = openllm.utils.LazyLoader("peft", globals(), "peft")
+  vllm = LazyLoader("vllm", globals(), "vllm")
+  transformers = LazyLoader("transformers", globals(), "transformers")
+  peft = LazyLoader("peft", globals(), "peft")

-__all__ = ["LLMConfig", "GenerationConfig", "SamplingParams"]
+__all__ = ["LLMConfig", "GenerationConfig", "SamplingParams", "field_env_key"]

 logger = logging.getLogger(__name__)
 config_merger = Merger([(dict, "merge")], ["override"], ["override"])
@@ -225,15 +225,12 @@ class GenerationConfig(ReprMixin):
  if t.TYPE_CHECKING and not MYPY:
    # stubs this for pyright as mypy already has a attr plugin builtin
    def __attrs_init__(self, *args: t.Any, **attrs: t.Any) -> None: ...
-
  def __init__(self, *, _internal: bool = False, **attrs: t.Any):
    if not _internal: raise RuntimeError("GenerationConfig is not meant to be used directly, but you can access this via a LLMConfig.generation_config")
    self.__attrs_init__(**attrs)
-
  def __getitem__(self, item: str) -> t.Any:
    if hasattr(self, item): return getattr(self, item)
    raise KeyError(f"'{self.__class__.__name__}' has no attribute {item}.")
-
  @property
  def __repr_keys__(self) -> set[str]: return {i.name for i in attr.fields(self.__class__)}

@@ -334,7 +331,7 @@ class ModelSettings(t.TypedDict, total=False):
  name_type: NotRequired[t.Optional[t.Literal["dasherize", "lowercase"]]]
  model_name: NotRequired[str]
  start_name: NotRequired[str]
-  env: NotRequired[openllm.utils.EnvVarMixin]
+  env: NotRequired[openllm_core.utils.EnvVarMixin]
  # serving configuration
  timeout: int
  workers_per_resource: t.Union[int, float]
@@ -376,7 +373,7 @@ class _ModelSettingsAttr:
    name_type: t.Optional[t.Literal["dasherize", "lowercase"]]
    model_name: str
    start_name: str
-    env: openllm.utils.EnvVarMixin
+    env: openllm_core.utils.EnvVarMixin
    timeout: int
    workers_per_resource: t.Union[int, float]
    fine_tune_strategies: t.Dict[AdapterType, FineTuneConfig]
@@ -414,7 +411,7 @@ def structure_settings(cl_: type[LLMConfig], cls: type[_ModelSettingsAttr]) -> _
    if not BACKENDS_MAPPING[library_stub][0](): default_implementation[rs] = "pt"
  _final_value_dct["default_implementation"] = default_implementation

-  env = openllm.utils.EnvVarMixin(model_name, get_default_implementation(default_implementation), model_id=_settings_attr.default_id, bettertransformer=_settings_attr.bettertransformer)
+  env = openllm_core.utils.EnvVarMixin(model_name, get_default_implementation(default_implementation), model_id=_settings_attr.default_id, bettertransformer=_settings_attr.bettertransformer)
  _final_value_dct["env"] = env

  # bettertransformer support
@@ -459,7 +456,8 @@ _reserved_namespace = {"__config__", "GenerationConfig", "SamplingParams"}

@attr.define(slots=True)
 class _ConfigAttr:
-  Field = dantic.Field
+  @staticmethod  # thin wrapper: forwards `default` (None when omitted) and all keyword attrs to dantic.Field
+  def Field(default: t.Any = None, **attrs: t.Any) -> t.Any: return dantic.Field(default, **attrs)
  """Field is a alias to the internal dantic utilities to easily create
    attrs.fields with pydantic-compatible interface. For example:

@@ -513,11 +511,11 @@ class _ConfigAttr:
    """The accepted keys for this LLMConfig."""
    __openllm_extras__: DictStrAny = Field(None, init=False)
    """Extra metadata for this LLMConfig."""
-    __openllm_generation_class__: type[openllm._configuration.GenerationConfig] = Field(None)
+    __openllm_generation_class__: type[openllm_core._configuration.GenerationConfig] = Field(None)
    """The result generated GenerationConfig class for this LLMConfig. This will be used
        to create the generation_config argument that can be used throughout the lifecycle.
        This class will also be managed internally by OpenLLM."""
-    __openllm_sampling_class__: type[openllm._configuration.SamplingParams] = Field(None)
+    __openllm_sampling_class__: type[openllm_core._configuration.SamplingParams] = Field(None)
    """The result generated SamplingParams class for this LLMConfig. This will be used
        to create arguments for vLLM LLMEngine that can be used throughout the lifecycle.
        This class will also be managed internally by OpenLLM."""
@@ -587,7 +585,7 @@ class _ConfigAttr:
    """The normalized version of __openllm_start_name__, determined by __openllm_name_type__"""
    __openllm_start_name__: str = Field(None)
    """Default name to be used with `openllm start`"""
-    __openllm_env__: openllm.utils.EnvVarMixin = Field(None)
+    __openllm_env__: openllm_core.utils.EnvVarMixin = Field(None)
    """A EnvVarMixin instance for this LLMConfig."""
    __openllm_timeout__: int = Field(None)
    """The default timeout to be set for this given LLM."""
@@ -697,7 +695,7 @@ class _ConfigBuilder:
      if not closure_cells: continue  # Catch None or the empty list.
      for cell in closure_cells:
        try: match = cell.cell_contents is self._cls
-        except ValueError: pass  # noqa: PERF203 # ValueError: Cell is empty
+        except ValueError: pass  # ValueError: Cell is empty
        else:
          if match: set_closure_cell(cell, cls)
    return cls
@@ -705,7 +703,6 @@ class _ConfigBuilder:
  def add_attrs_init(self) -> Self:
    self._cls_dict["__attrs_init__"] = codegen.add_method_dunders(self._cls, _make_init(self._cls, self._attrs, self._has_pre_init, self._has_post_init, False, True, False, self._base_attr_map, False, None, True))
    return self
-
  def add_repr(self) -> Self:
    for key, fn in ReprMixin.__dict__.items():
      if key in ("__repr__", "__str__", "__repr_name__", "__repr_str__", "__repr_args__"): self._cls_dict[key] = codegen.add_method_dunders(self._cls, fn)
@@ -849,7 +846,7 @@ class LLMConfig(_ConfigAttr):
    unannotated = ca_names - annotated_names
    if len(unannotated) > 0:
      missing_annotated = sorted(unannotated, key=lambda n: t.cast("_CountingAttr", cd.get(n)).counter)
-      raise openllm.exceptions.MissingAnnotationAttributeError(f"The following field doesn't have a type annotation: {missing_annotated}")
+      raise openllm_core.exceptions.MissingAnnotationAttributeError(f"The following field doesn't have a type annotation: {missing_annotated}")
    # We need to set the accepted key before generation_config
    # as generation_config is a special field that users shouldn't pass.
    cls.__openllm_accepted_keys__ = set(these.keys()) | {a.name for a in attr.fields(cls.__openllm_generation_class__)} | {a.name for a in attr.fields(cls.__openllm_sampling_class__)}
@@ -925,7 +922,7 @@ class LLMConfig(_ConfigAttr):
  @overload
  def __getitem__(self, item: t.Literal["start_name"]) -> str: ...
  @overload
-  def __getitem__(self, item: t.Literal["env"]) -> openllm.utils.EnvVarMixin: ...
+  def __getitem__(self, item: t.Literal["env"]) -> openllm_core.utils.EnvVarMixin: ...
  @overload
  def __getitem__(self, item: t.Literal["timeout"]) -> int: ...
  @overload
@@ -936,9 +933,9 @@ class LLMConfig(_ConfigAttr):
  def __getitem__(self, item: t.Literal["tokenizer_class"]) -> t.Optional[str]: ...
  # NOTE: generation_class, sampling_class and extras arguments
  @overload
-  def __getitem__(self, item: t.Literal["generation_class"]) -> t.Type[openllm.GenerationConfig]: ...
+  def __getitem__(self, item: t.Literal["generation_class"]) -> t.Type[openllm_core.GenerationConfig]: ...
  @overload
-  def __getitem__(self, item: t.Literal["sampling_class"]) -> t.Type[openllm.SamplingParams]: ...
+  def __getitem__(self, item: t.Literal["sampling_class"]) -> t.Type[openllm_core.SamplingParams]: ...
  @overload
  def __getitem__(self, item: t.Literal["extras"]) -> t.Dict[str, t.Any]: ...
  # NOTE: GenerationConfig arguments
@@ -1120,7 +1117,6 @@ class LLMConfig(_ConfigAttr):
    try: new_cls.__module__ = sys._getframe(1).f_globals.get("__name__", "__main__")
    except (AttributeError, ValueError): pass
    return new_cls(**attrs)
-
  def model_dump(self, flatten: bool = False, **_: t.Any) -> DictStrAny:
    dumped = bentoml_cattr.unstructure(self)
    generation_config = bentoml_cattr.unstructure(self.generation_config)
@@ -1129,15 +1125,13 @@ class LLMConfig(_ConfigAttr):
    else: dumped["generation_config"] = generation_config
    dumped.update(sampling_config)
    return dumped
-
  def model_dump_json(self, **kwargs: t.Any) -> bytes: return orjson.dumps(self.model_dump(**kwargs))

  @classmethod
  def model_construct_json(cls, json_str: str | bytes) -> Self:
    try: attrs = orjson.loads(json_str)
-    except orjson.JSONDecodeError as err: raise openllm.exceptions.ValidationError(f"Failed to load JSON: {err}") from None
+    except orjson.JSONDecodeError as err: raise openllm_core.exceptions.ValidationError(f"Failed to load JSON: {err}") from None
    return bentoml_cattr.structure(attrs, cls)
-
  @classmethod
  def model_construct_env(cls, **attrs: t.Any) -> Self:
    """A helpers that respect configuration values environment variables."""
@@ -1186,7 +1180,6 @@ class LLMConfig(_ConfigAttr):
    config = transformers.GenerationConfig(**bentoml_cattr.unstructure(self.generation_config))
    return config.to_dict() if return_as_dict else config
  def to_sampling_config(self) -> vllm.SamplingParams: return self.sampling_config.to_vllm()
-
  @classmethod
  def to_click_options(cls, f: AnyCallable) -> click.Command:
    """Convert current configuration to click options.
@@ -1228,6 +1221,32 @@ class LLMConfig(_ConfigAttr):
  @classmethod
  def default_implementation(cls) -> LiteralRuntime: return first_not_none(cls.__openllm_env__["framework_value"], default=get_default_implementation(cls.__openllm_default_implementation__))

+  def sanitize_parameters(self, prompt: str, **attrs: t.Any) -> tuple[str, DictStrAny, DictStrAny]:
+    """This handler will sanitize all attrs and set up the prompt text.
+
+    It takes a prompt given by the user and attrs that can be parsed together with the prompt.
+
+    Returns a tuple of three items, the first being the prompt text, followed by:
+    - The attributes dictionary that can be passed into LLMConfig to generate a GenerationConfig
+    - The attributes dictionary that will be passed into `self.postprocess_generate`.
+
+    `openllm.LLM` also has a sanitize_parameters that will just call this method.
+    """
+    return prompt, attrs, attrs  # default: pass-through; both returned dicts are the same `attrs` object
+
+  def postprocess_generate(self, prompt: str, generation_result: t.Any, **attrs: t.Any) -> t.Any:
+    """This handler will postprocess generation results from LLM.generate and then output nicely formatted results (if the LLM decides to do so).
+
+    You can customize how the output of the LLM looks with this hook. By default, it returns `generation_result` unchanged.
+
+    > [!NOTE]
+    > This will be used from the client side.
+
+    `openllm.LLM` also has a postprocess_generate that will just call this method.
+    """
+    return generation_result
+
+
 bentoml_cattr.register_unstructure_hook_factory(lambda cls: lenient_issubclass(cls, LLMConfig), lambda cls: make_dict_unstructure_fn(cls, bentoml_cattr, _cattrs_omit_if_default=False, _cattrs_use_linecache=True))

 def structure_llm_config(data: t.Any, cls: type[LLMConfig]) -> LLMConfig:
@@ -1253,7 +1272,4 @@ def structure_llm_config(data: t.Any, cls: type[LLMConfig]) -> LLMConfig:
  return cls(generation_config=generation_config, __openllm_extras__=data, **cls_attrs)

 bentoml_cattr.register_structure_hook_func(lambda cls: lenient_issubclass(cls, LLMConfig), structure_llm_config)
-
 openllm_home = os.path.expanduser(os.environ.get("OPENLLM_HOME", os.path.join(os.environ.get("XDG_CACHE_HOME", os.path.join(os.path.expanduser("~"), ".cache")), "openllm")))
-
-__all__ = ["LLMConfig", "field_env_key"]
--- a/openllm-core/src/openllm_core/_prompt.py
+++ b/openllm-core/src/openllm_core/_prompt.py
--- a/openllm-core/src/openllm_core/_schema.py
+++ b/openllm-core/src/openllm_core/_schema.py
@@ -1,8 +1,8 @@
 """Schema definition for OpenLLM. This can be use for client interaction."""
 from __future__ import annotations
 import functools, typing as t
-import attr, inflection, openllm
-from ._configuration import GenerationConfig, LLMConfig
+import attr, inflection
+from openllm_core._configuration import GenerationConfig, LLMConfig
 from .utils import bentoml_cattr
 if t.TYPE_CHECKING: import vllm

@@ -19,9 +19,11 @@ class GenerationInput:
      if cls is None: raise ValueError("'cls' must pass if given data is a dictionary.")
      return cls(**data)
  @classmethod
-  def for_model(cls, model_name: str, **attrs: t.Any) -> type[GenerationInput]: return cls.from_llm_config(openllm.AutoConfig.for_model(model_name, **attrs))
+  def for_model(cls, model_name: str, **attrs: t.Any) -> type[GenerationInput]:
+    import openllm  # deferred to call time; NOTE(review): presumably keeps openllm_core importable without loading `openllm` -- confirm
+    return cls.from_llm_config(openllm.AutoConfig.for_model(model_name, **attrs))
  @classmethod
-  def from_llm_config(cls, llm_config: openllm.LLMConfig) -> type[GenerationInput]: return attr.make_class(inflection.camelize(llm_config["model_name"]) + "GenerationInput", attrs={"prompt": attr.field(type=str), "llm_config": attr.field(type=llm_config.__class__, default=llm_config, converter=functools.partial(cls.convert_llm_config, cls=llm_config.__class__)), "adapter_name": attr.field(default=None, type=str)})
+  def from_llm_config(cls, llm_config: LLMConfig) -> type[GenerationInput]: return attr.make_class(inflection.camelize(llm_config["model_name"]) + "GenerationInput", attrs={"prompt": attr.field(type=str), "llm_config": attr.field(type=llm_config.__class__, default=llm_config, converter=functools.partial(cls.convert_llm_config, cls=llm_config.__class__)), "adapter_name": attr.field(default=None, type=str)})
@attr.frozen(slots=True)
 class GenerationOutput:
  responses: t.List[t.Any]
--- a/openllm-core/src/openllm_core/_strategies.py
+++ b/openllm-core/src/openllm_core/_strategies.py
@@ -1,10 +1,10 @@
+# mypy: disable-error-code="no-redef"
 from __future__ import annotations
 import functools, inspect, logging, math, os, sys, types, typing as t, warnings, psutil, bentoml
 from bentoml._internal.resource import get_resource, system_resources
 from bentoml._internal.runner.strategy import THREAD_ENVS
 from .utils import DEBUG, ReprMixin
-if sys.version_info[:2] >= (3, 11): from typing import overload
-else: from typing_extensions import overload
+from ._typing_compat import overload

 class DynResource(t.Protocol):
  resource_id: t.ClassVar[str]
@@ -176,10 +176,7 @@ def _validate(cls: type[DynResource], val: list[t.Any]) -> None:
  except (ImportError, RuntimeError):
    pass

-def _make_resource_class(name: str, resource_kind: str, docstring: str) -> type[DynResource]:
-  return types.new_class(
-    name, (bentoml.Resource[t.List[str]], ReprMixin), {"resource_id": resource_kind}, lambda ns: ns.update({"resource_id": resource_kind, "from_spec": classmethod(_from_spec), "from_system": classmethod(_from_system), "validate": classmethod(_validate), "__repr_keys__": property(lambda _: {"resource_id"}), "__doc__": inspect.cleandoc(docstring), "__module__": "openllm._strategies"}),
-  )
+def _make_resource_class(name: str, resource_kind: str, docstring: str) -> type[DynResource]: return types.new_class(name, (bentoml.Resource[t.List[str]], ReprMixin), {"resource_id": resource_kind}, lambda ns: ns.update({"resource_id": resource_kind, "from_spec": classmethod(_from_spec), "from_system": classmethod(_from_system), "validate": classmethod(_validate), "__repr_keys__": property(lambda _: {"resource_id"}), "__doc__": inspect.cleandoc(docstring), "__module__": "openllm._strategies"}))  # NOTE(review): "__module__" still names "openllm._strategies" although this file lives under openllm_core -- confirm this is intentional

 # NOTE: we need to hint these t.Literal since mypy is to dumb to infer this as literal :facepalm:
 _TPU_RESOURCE: t.Literal["cloud-tpus.google.com/v2"] = "cloud-tpus.google.com/v2"
--- a/openllm-core/src/openllm_core/_typing_compat.py
+++ b/openllm-core/src/openllm_core/_typing_compat.py
@@ -1,13 +1,14 @@
+# mypy: disable-error-code="type-arg,valid-type"
 from __future__ import annotations
 import sys, typing as t, bentoml, attr, abc
 from bentoml._internal.types import ModelSignatureDict as ModelSignatureDict
+
 if t.TYPE_CHECKING:
  import openllm, peft, transformers, auto_gptq as autogptq, vllm
  from bentoml._internal.runner.runnable import RunnableMethod
  from bentoml._internal.runner.runner import RunnerMethod
  from bentoml._internal.runner.strategy import Strategy

-  from .bundle.oci import LiteralContainerVersionStrategy
  from .utils.lazy import VersionInfo

 M = t.TypeVar("M", bound="t.Union[transformers.PreTrainedModel, transformers.Pipeline, transformers.TFPreTrainedModel, transformers.FlaxPreTrainedModel, vllm.LLMEngine, vllm.AsyncLLMEngine, peft.PeftModel, autogptq.modeling.BaseGPTQForCausalLM]")
@@ -23,6 +24,10 @@ At = t.TypeVar("At", bound=attr.AttrsInstance)
 LiteralRuntime = t.Literal["pt", "tf", "flax", "vllm"]
 AdapterType = t.Literal["lora", "adalora", "adaption_prompt", "prefix_tuning", "p_tuning", "prompt_tuning", "ia3"]

+# TODO: support quay
+LiteralContainerRegistry = t.Literal["docker", "gh", "ecr"]
+LiteralContainerVersionStrategy = t.Literal["release", "nightly", "latest", "custom"]
+
 if sys.version_info[:2] >= (3,11):
  from typing import LiteralString as LiteralString, Self as Self, overload as overload
  from typing import NotRequired as NotRequired, Required as Required, dataclass_transform as dataclass_transform
@@ -35,12 +40,7 @@ if sys.version_info[:2] >= (3,10):
 else:
  from typing_extensions import TypeAlias as TypeAlias, ParamSpec as ParamSpec, Concatenate as Concatenate

-if sys.version_info[:2] >= (3,9):
-  from typing import TypedDict as TypedDict
-else:
-  from typing_extensions import TypedDict as TypedDict
-
-class PeftAdapterOutput(TypedDict):
+class PeftAdapterOutput(t.TypedDict):
  success: bool
  result: t.Dict[str, peft.PeftConfig]
  error_msg: str
--- a/openllm-core/src/openllm_core/config/__init__.py
+++ b/openllm-core/src/openllm_core/config/__init__.py
@@ -0,0 +1,13 @@
+from __future__ import annotations
+from .configuration_auto import AutoConfig as AutoConfig, CONFIG_MAPPING as CONFIG_MAPPING, CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES
+from .configuration_baichuan import BaichuanConfig as BaichuanConfig, START_BAICHUAN_COMMAND_DOCSTRING as START_BAICHUAN_COMMAND_DOCSTRING
+from .configuration_chatglm import ChatGLMConfig as ChatGLMConfig, START_CHATGLM_COMMAND_DOCSTRING as START_CHATGLM_COMMAND_DOCSTRING
+from .configuration_dolly_v2 import DollyV2Config as DollyV2Config, START_DOLLY_V2_COMMAND_DOCSTRING as START_DOLLY_V2_COMMAND_DOCSTRING
+from .configuration_falcon import FalconConfig as FalconConfig, START_FALCON_COMMAND_DOCSTRING as START_FALCON_COMMAND_DOCSTRING
+from .configuration_flan_t5 import FlanT5Config as FlanT5Config, START_FLAN_T5_COMMAND_DOCSTRING as START_FLAN_T5_COMMAND_DOCSTRING
+from .configuration_gpt_neox import GPTNeoXConfig as GPTNeoXConfig, START_GPT_NEOX_COMMAND_DOCSTRING as START_GPT_NEOX_COMMAND_DOCSTRING
+from .configuration_llama import LlamaConfig as LlamaConfig, START_LLAMA_COMMAND_DOCSTRING as START_LLAMA_COMMAND_DOCSTRING
+from .configuration_mpt import MPTConfig as MPTConfig, START_MPT_COMMAND_DOCSTRING as START_MPT_COMMAND_DOCSTRING
+from .configuration_opt import OPTConfig as OPTConfig, START_OPT_COMMAND_DOCSTRING as START_OPT_COMMAND_DOCSTRING
+from .configuration_stablelm import StableLMConfig as StableLMConfig, START_STABLELM_COMMAND_DOCSTRING as START_STABLELM_COMMAND_DOCSTRING
+from .configuration_starcoder import StarCoderConfig as StarCoderConfig, START_STARCODER_COMMAND_DOCSTRING as START_STARCODER_COMMAND_DOCSTRING
--- a/openllm-python/src/openllm/models/auto/configuration_auto.py
+++ b/openllm-python/src/openllm/models/auto/configuration_auto.py
@@ -1,18 +1,16 @@
 # mypy: disable-error-code="type-arg"
 from __future__ import annotations
-import typing as t
+import inflection, openllm_core, importlib, typing as t
 from collections import OrderedDict
-
-import inflection, openllm
-from openllm.utils import ReprMixin
+from openllm_core.utils import ReprMixin

 if t.TYPE_CHECKING:
  import types
-  from openllm._typing_compat import LiteralString
+  from openllm_core._typing_compat import LiteralString
  from collections import _odict_items, _odict_keys, _odict_values
-  ConfigKeysView = _odict_keys[str, type[openllm.LLMConfig]]
-  ConfigValuesView = _odict_values[str, type[openllm.LLMConfig]]
-  ConfigItemsView = _odict_items[str, type[openllm.LLMConfig]]
+  ConfigKeysView = _odict_keys[str, type[openllm_core.LLMConfig]]
+  ConfigValuesView = _odict_values[str, type[openllm_core.LLMConfig]]
+  ConfigItemsView = _odict_items[str, type[openllm_core.LLMConfig]]

 # NOTE: This is the entrypoint when adding new model config
 CONFIG_MAPPING_NAMES = OrderedDict([("chatglm", "ChatGLMConfig"), ("dolly_v2", "DollyV2Config"), ("falcon", "FalconConfig"), ("flan_t5", "FlanT5Config"), ("gpt_neox", "GPTNeoXConfig"), ("llama", "LlamaConfig"), ("mpt", "MPTConfig"), ("opt", "OPTConfig"), ("stablelm", "StableLMConfig"), ("starcoder", "StarCoderConfig"), ("baichuan", "BaichuanConfig")])
@@ -28,10 +26,10 @@ class _LazyConfigMapping(OrderedDict, ReprMixin):
      if inflection.underscore(key) in self._mapping: return self.__getitem__(inflection.underscore(key))
      raise KeyError(key)
    value, module_name = self._mapping[key], inflection.underscore(key)
-    if module_name not in self._modules: self._modules[module_name] = openllm.utils.EnvVarMixin(module_name).module
+    if module_name not in self._modules: self._modules[module_name] = openllm_core.utils.EnvVarMixin(module_name).module
    if hasattr(self._modules[module_name], value): return getattr(self._modules[module_name], value)
    # Some of the mappings have entries model_type -> config of another model type. In that case we try to grab the object at the top level.
-    return getattr(openllm, value)
+    return getattr(importlib.import_module("openllm"), value)
  @property
  def __repr_keys__(self) -> set[str]: return set(self._mapping.keys())
  def __repr__(self) -> str: return ReprMixin.__repr__(self)
@@ -45,19 +43,19 @@ class _LazyConfigMapping(OrderedDict, ReprMixin):
    if key in self._mapping.keys(): raise ValueError(f"'{key}' is already used by a OpenLLM config, pick another name.")
    self._extra_content[key] = value

-CONFIG_MAPPING: dict[str, type[openllm.LLMConfig]] = _LazyConfigMapping(CONFIG_MAPPING_NAMES)
+CONFIG_MAPPING: dict[str, type[openllm_core.LLMConfig]] = _LazyConfigMapping(CONFIG_MAPPING_NAMES)
 # The below handle special alias when we call underscore to the name directly without processing camelcase first.
 CONFIG_NAME_ALIASES: dict[str, str] = {"chat_glm": "chatglm", "stable_lm": "stablelm", "star_coder": "starcoder", "gpt_neo_x": "gpt_neox",}

 class AutoConfig:
  def __init__(self, *_: t.Any, **__: t.Any): raise EnvironmentError("Cannot instantiate AutoConfig directly. Please use `AutoConfig.for_model(model_name)` instead.")
  @classmethod
-  def for_model(cls, model_name: str, **attrs: t.Any) -> openllm.LLMConfig:
+  def for_model(cls, model_name: str, **attrs: t.Any) -> openllm_core.LLMConfig:
    model_name = inflection.underscore(model_name)
    if model_name in CONFIG_MAPPING: return CONFIG_MAPPING[model_name].model_construct_env(**attrs)
    raise ValueError(f"Unrecognized configuration class for {model_name}. Model name should be one of {', '.join(CONFIG_MAPPING.keys())}.")
  @classmethod
-  def infer_class_from_name(cls, name: str) -> type[openllm.LLMConfig]:
+  def infer_class_from_name(cls, name: str) -> type[openllm_core.LLMConfig]:
    model_name = inflection.underscore(name)
    if model_name in CONFIG_NAME_ALIASES: model_name = CONFIG_NAME_ALIASES[model_name]
    if model_name in CONFIG_MAPPING: return CONFIG_MAPPING[model_name]
--- a/openllm-python/src/openllm/models/baichuan/configuration_baichuan.py
+++ b/openllm-python/src/openllm/models/baichuan/configuration_baichuan.py
@@ -1,22 +1,6 @@
 from __future__ import annotations
-import openllm
-
-class BaichuanConfig(openllm.LLMConfig):
-  """Baichuan-7B is an open-source, large-scale pre-trained language model developed by Baichuan Intelligent Technology.
-
-  Baichuan-7B is based on Transformer architecture,
-  which contains 7 billion parameters and trained on approximately 1.2 trillion tokens.
-  It supports both Chinese and English languages with a context window length of 4096.
-  It has achieved the best performance among models of the same size on standard Chinese
-  and English benchmarks (C-Eval, MMLU, etc).
-  Refer to [Baichuan-7B's GitHub page](https://github.com/baichuan-inc/Baichuan-7B) for more information.
-  """
-  __config__ = {"name_type": "lowercase", "trust_remote_code": True, "timeout": 3600000, "requires_gpu": True, "url": "https://github.com/baichuan-inc/Baichuan-7B", "requirements": ["cpm-kernels", "sentencepiece"], "architecture": "BaiChuanForCausalLM",
-                "default_id": "baichuan-inc/baichuan-7b", "model_ids": ["baichuan-inc/baichuan-7b", "baichuan-inc/baichuan-13b-base", "baichuan-inc/baichuan-13b-chat", "fireballoon/baichuan-vicuna-chinese-7b", "fireballoon/baichuan-vicuna-7b", "hiyouga/baichuan-7b-sft"]}
-  class GenerationConfig:
-    max_new_tokens: int = 2048
-    top_p: float = 0.7
-    temperature: float = 0.95
+import openllm_core, typing as t
+from openllm_core._prompt import process_prompt

 START_BAICHUAN_COMMAND_DOCSTRING = """\
 Run a LLMServer for Baichuan model.
@@ -38,3 +22,24 @@ or provide `--model-id` flag when running ``openllm start baichuan``:
 $ openllm start baichuan --model-id='fireballoon/baichuan-vicuna-chinese-7b'
 """
 DEFAULT_PROMPT_TEMPLATE = """{instruction}"""
+
+
+class BaichuanConfig(openllm_core.LLMConfig):
+  """Baichuan-7B is an open-source, large-scale pre-trained language model developed by Baichuan Intelligent Technology.
+
+  Baichuan-7B is based on Transformer architecture,
+  which contains 7 billion parameters and trained on approximately 1.2 trillion tokens.
+  It supports both Chinese and English languages with a context window length of 4096.
+  It has achieved the best performance among models of the same size on standard Chinese
+  and English benchmarks (C-Eval, MMLU, etc).
+  Refer to [Baichuan-7B's GitHub page](https://github.com/baichuan-inc/Baichuan-7B) for more information.
+  """
+  __config__ = {"name_type": "lowercase", "trust_remote_code": True, "timeout": 3600000, "requires_gpu": True, "url": "https://github.com/baichuan-inc/Baichuan-7B", "requirements": ["cpm-kernels", "sentencepiece"], "architecture": "BaiChuanForCausalLM",
+                "default_id": "baichuan-inc/baichuan-7b", "model_ids": ["baichuan-inc/baichuan-7b", "baichuan-inc/baichuan-13b-base", "baichuan-inc/baichuan-13b-chat", "fireballoon/baichuan-vicuna-chinese-7b", "fireballoon/baichuan-vicuna-7b", "hiyouga/baichuan-7b-sft"]}
+  class GenerationConfig:  # model-specific default values for the generation parameters below
+    max_new_tokens: int = 2048
+    top_p: float = 0.7
+    temperature: float = 0.95
+  def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, top_p: float | None = None, temperature: float | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
+    return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "top_p": top_p, "temperature": temperature, **attrs}, {}  # -> (prompt run through process_prompt, generation kwargs, empty dict as the third item)
+  def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str: return generation_result[0]  # keeps only the first element of the result sequence
--- a/openllm-python/src/openllm/models/chatglm/configuration_chatglm.py
+++ b/openllm-python/src/openllm/models/chatglm/configuration_chatglm.py
@@ -1,29 +1,6 @@
 from __future__ import annotations
-import openllm
-
-class ChatGLMConfig(openllm.LLMConfig):
-  """ChatGLM is an open bilingual language model based on [General Language Model (GLM)](https://github.com/THUDM/GLM) framework.
-
-  With the quantization technique, users can deploy locally on consumer-grade graphics cards
-  (only 6GB of GPU memory is required at the INT4 quantization level).
-
-  ChatGLM-6B uses technology similar to ChatGPT, optimized for Chinese QA and dialogue.
-  The model is trained for about 1T tokens of Chinese and English corpus, supplemented by supervised fine-tuning,
-  feedback bootstrap, and reinforcement learning wit human feedback.
-  With only about 6.2 billion parameters, the model is able to generate answers that are in line
-  with human preference.
-
-  Refer to [ChatGLM's GitHub page](https://github.com/THUDM/ChatGLM-6B) for more information.
-  """
-  __config__ = {"name_type": "lowercase", "trust_remote_code": True, "timeout": 3600000, "requires_gpu": True, "url": "https://github.com/THUDM/ChatGLM-6B", "requirements": ["cpm-kernels", "sentencepiece"], "architecture": "ChatGLMForConditionalGeneration",
-                "default_id": "thudm/chatglm-6b", "model_ids": ["thudm/chatglm-6b", "thudm/chatglm-6b-int8", "thudm/chatglm-6b-int4", "thudm/chatglm2-6b", "thudm/chatglm2-6b-int4"]}
-  retain_history: bool = openllm.LLMConfig.Field(False, description="Whether to retain history given to the model. If set to True, then the model will retain given history.")
-  use_half_precision: bool = openllm.LLMConfig.Field(True, description="Whether to use half precision for model.")
-  class GenerationConfig:
-    max_new_tokens: int = 2048
-    num_beams: int = 1
-    top_p: float = 0.7
-    temperature: float = 0.95
+import openllm_core, typing as t
+from openllm_core.utils import dantic

 START_CHATGLM_COMMAND_DOCSTRING = """\
 Run a LLMServer for ChatGLM model.
@@ -45,3 +22,42 @@ or provide `--model-id` flag when running ``openllm start chatglm``:
 $ openllm start chatglm --model-id='thudm/chatglm-6b-int8'
 """
 DEFAULT_PROMPT_TEMPLATE = """{instruction}"""
+
+class ChatGLMConfig(openllm_core.LLMConfig):
+  """ChatGLM is an open bilingual language model based on [General Language Model (GLM)](https://github.com/THUDM/GLM) framework.
+
+  With the quantization technique, users can deploy locally on consumer-grade graphics cards
+  (only 6GB of GPU memory is required at the INT4 quantization level).
+
+  ChatGLM-6B uses technology similar to ChatGPT, optimized for Chinese QA and dialogue.
+  The model is trained for about 1T tokens of Chinese and English corpus, supplemented by supervised fine-tuning,
+  feedback bootstrap, and reinforcement learning with human feedback.
+  With only about 6.2 billion parameters, the model is able to generate answers that are in line
+  with human preference.
+
+  Refer to [ChatGLM's GitHub page](https://github.com/THUDM/ChatGLM-6B) for more information.
+  """
+  __config__ = {"name_type": "lowercase", "trust_remote_code": True, "timeout": 3600000, "requires_gpu": True, "url": "https://github.com/THUDM/ChatGLM-6B", "requirements": ["cpm-kernels", "sentencepiece"], "architecture": "ChatGLMForConditionalGeneration",
+                "default_id": "thudm/chatglm-6b", "model_ids": ["thudm/chatglm-6b", "thudm/chatglm-6b-int8", "thudm/chatglm-6b-int4", "thudm/chatglm2-6b", "thudm/chatglm2-6b-int4"]}
+  retain_history: bool = dantic.Field(False, description="Whether to retain history given to the model. If set to True, then the model will retain given history.")
+  use_half_precision: bool = dantic.Field(True, description="Whether to use half precision for model.")
+  class GenerationConfig:
+    max_new_tokens: int = 2048
+    num_beams: int = 1
+    top_p: float = 0.7
+    temperature: float = 0.95
+
+  def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, num_beams: int | None = None, top_p: float | None = None, temperature: float | None = None, chat_history: list[tuple[str, str]] | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
+    prompt_text = ""  # only accumulated into when a chat history is rendered below
+    if use_default_prompt_template and chat_history is not None:
+      for i, (old_query, response) in enumerate(chat_history): prompt_text += f"[Round {i}]\n问:{old_query}\n答:{response}\n"
+      prompt_text += f"[Round {len(chat_history)}]\n问:{prompt}\n答:"
+    else: prompt_text = prompt  # template disabled or no history: the prompt is passed through unchanged
+    postprocess_generate_kwargs = {"chat_history": chat_history}  # same list object (or None) that postprocess_generate accepts as a keyword
+    return prompt_text, {"max_new_tokens": max_new_tokens, "num_beams": num_beams, "top_p": top_p, "temperature": temperature, **attrs}, postprocess_generate_kwargs
+  def postprocess_generate(self, prompt: str, generation_result: tuple[str, list[tuple[str, str]]], *, chat_history: list[tuple[str, str]] | None = None, **attrs: t.Any) -> str:
+    generated, history = generation_result  # (generated text, history entries produced alongside it)
+    if self.retain_history:  # this class is the config itself, so the field declared above is read directly rather than via self.config
+      if chat_history is None: raise ValueError("'retain_history' is True while there is no history provided.")
+      chat_history.extend(history)  # mutates the caller-supplied list in place
+    return generated
--- a/openllm-python/src/openllm/models/dolly_v2/configuration_dolly_v2.py
+++ b/openllm-python/src/openllm/models/dolly_v2/configuration_dolly_v2.py
@@ -1,29 +1,9 @@
 from __future__ import annotations
-import typing as t, openllm
+import typing as t, openllm_core
+from openllm_core._prompt import process_prompt
+from openllm_core.utils import dantic
 if t.TYPE_CHECKING: import transformers

-class DollyV2Config(openllm.LLMConfig):
-  """Databricks` Dolly is an instruction-following large language model trained on the Databricks machine learning platform that is licensed for commercial use.
-
-  Based on pythia-12b, Dolly is trained on ~15k instruction/response fine tuning records databricks-dolly-15k
-  generated by Databricks employees in capability domains from the InstructGPT paper, including brainstorming,
-  classification, closed QA, generation, information extraction, open QA and summarization.
-
-  dolly-v2-12b is not a state-of-the-art model, but does exhibit surprisingly high quality instruction
-  following behavior not characteristic of the foundation model on which it is based.
-
-  Refer to [Databricks's Dolly page](https://github.com/databrickslabs/dolly) for more information.
-  """
-  __config__ = {"timeout": 3600000, "url": "https://github.com/databrickslabs/dolly", "architecture": "GPTNeoXForCausalLM",
-                "default_id": "databricks/dolly-v2-3b", "model_ids": ["databricks/dolly-v2-3b", "databricks/dolly-v2-7b", "databricks/dolly-v2-12b"]}
-  return_full_text: bool = openllm.LLMConfig.Field(False, description="Whether to return the full prompt to the users.")
-  class GenerationConfig:
-    temperature: float = 0.9
-    top_p: float = 0.92
-    top_k: int = 5
-    max_new_tokens: int = 256
-    eos_token_id: int = 50277  # NOTE: from get_special_token_id(self.tokenizer, END_KEY)
-
 START_DOLLY_V2_COMMAND_DOCSTRING = """\
 Run a LLMServer for dolly-v2 model.

@@ -74,3 +54,28 @@ def get_special_token_id(tokenizer: transformers.PreTrainedTokenizer, key: str)
  token_ids = tokenizer.encode(key)
  if len(token_ids) > 1: raise ValueError(f"Expected only a single token for '{key}' but found {token_ids}")
  return token_ids[0]
+
+class DollyV2Config(openllm_core.LLMConfig):
+  """Databricks' Dolly is an instruction-following large language model trained on the Databricks machine learning platform that is licensed for commercial use.
+
+  Based on pythia-12b, Dolly is trained on ~15k instruction/response fine tuning records databricks-dolly-15k
+  generated by Databricks employees in capability domains from the InstructGPT paper, including brainstorming,
+  classification, closed QA, generation, information extraction, open QA and summarization.
+
+  dolly-v2-12b is not a state-of-the-art model, but does exhibit surprisingly high quality instruction
+  following behavior not characteristic of the foundation model on which it is based.
+
+  Refer to [Databricks's Dolly page](https://github.com/databrickslabs/dolly) for more information.
+  """
+  __config__ = {"timeout": 3600000, "url": "https://github.com/databrickslabs/dolly", "architecture": "GPTNeoXForCausalLM",
+                "default_id": "databricks/dolly-v2-3b", "model_ids": ["databricks/dolly-v2-3b", "databricks/dolly-v2-7b", "databricks/dolly-v2-12b"]}
+  return_full_text: bool = dantic.Field(False, description="Whether to return the full prompt to the users.")
+  class GenerationConfig:
+    temperature: float = 0.9
+    top_p: float = 0.92
+    top_k: int = 5
+    max_new_tokens: int = 256
+    eos_token_id: int = 50277  # NOTE: from get_special_token_id(self.tokenizer, END_KEY)
+  def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_k: int | None = None, top_p: float | None = None, use_default_prompt_template: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
+    return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "top_k": top_k, "top_p": top_p, "temperature": temperature, **attrs}, {}  # returns (prompt, generation kwargs, postprocess kwargs); **attrs goes to process_prompt AND into the generation kwargs
+  def postprocess_generate(self, prompt: str, generation_result: list[dict[t.Literal["generated_text"], str]], **_: t.Any) -> str: return generation_result[0]["generated_text"]  # only the first result's text is returned
--- a/openllm-python/src/openllm/models/falcon/configuration_falcon.py
+++ b/openllm-python/src/openllm/models/falcon/configuration_falcon.py
@@ -1,22 +1,6 @@
 from __future__ import annotations
-import openllm
-
-class FalconConfig(openllm.LLMConfig):
-  """Falcon-7B is a 7B parameters causal decoder-only model built by TII and trained on 1,500B tokens of [RefinedWeb](https://huggingface.co/datasets/tiiuae/falcon-refinedweb) enhanced with curated corpora.
-
-  It is made available under the TII Falcon LLM License.
-
-  Refer to [Falcon's HuggingFace page](https://huggingface.co/tiiuae/falcon-7b) for more information.
-  """
-  __config__ = {"name_type": "lowercase", "trust_remote_code": True, "requires_gpu": True, "timeout": int(36e6), "url": "https://falconllm.tii.ae/", "requirements": ["einops", "xformers"], "architecture": "FalconForCausalLM",
-                "default_id": "tiiuae/falcon-7b", "model_ids": ["tiiuae/falcon-7b", "tiiuae/falcon-40b", "tiiuae/falcon-7b-instruct", "tiiuae/falcon-40b-instruct"],
-                "fine_tune_strategies": ({"adapter_type": "lora", "r": 64, "lora_alpha": 16, "lora_dropout": 0.1, "bias": "none", "target_modules": ["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"]},)}
-  class GenerationConfig:
-    max_new_tokens: int = 200
-    top_k: int = 10
-    num_return_sequences: int = 1
-    num_beams: int = 4
-    early_stopping: bool = True
+import openllm_core, typing as t
+from openllm_core._prompt import process_prompt

 START_FALCON_COMMAND_DOCSTRING = """\
 Run a LLMServer for FalconLM model.
@@ -43,3 +27,24 @@ DEFAULT_PROMPT_TEMPLATE = """{context}
 {user_name}: {instruction}
 {agent}:
 """
+
+class FalconConfig(openllm_core.LLMConfig):
+  """Falcon-7B is a 7B parameters causal decoder-only model built by TII and trained on 1,500B tokens of [RefinedWeb](https://huggingface.co/datasets/tiiuae/falcon-refinedweb) enhanced with curated corpora.
+
+  It is made available under the TII Falcon LLM License.
+
+  Refer to [Falcon's HuggingFace page](https://huggingface.co/tiiuae/falcon-7b) for more information.
+  """
+  __config__ = {"name_type": "lowercase", "trust_remote_code": True, "requires_gpu": True, "timeout": int(36e6), "url": "https://falconllm.tii.ae/", "requirements": ["einops", "xformers"], "architecture": "FalconForCausalLM",
+                "default_id": "tiiuae/falcon-7b", "model_ids": ["tiiuae/falcon-7b", "tiiuae/falcon-40b", "tiiuae/falcon-7b-instruct", "tiiuae/falcon-40b-instruct"],
+                "fine_tune_strategies": ({"adapter_type": "lora", "r": 64, "lora_alpha": 16, "lora_dropout": 0.1, "bias": "none", "target_modules": ["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"]},)}
+  class GenerationConfig:
+    max_new_tokens: int = 200
+    top_k: int = 10
+    num_return_sequences: int = 1
+    num_beams: int = 4  # not a named parameter of sanitize_parameters below; can only arrive there through **attrs
+    early_stopping: bool = True  # likewise only reachable through **attrs
+
+  def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, top_k: int | None = None, num_return_sequences: int | None = None, eos_token_id: int | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
+    return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "top_k": top_k, "num_return_sequences": num_return_sequences, "eos_token_id": eos_token_id, **attrs}, {}  # returns (prompt, generation kwargs, postprocess kwargs); **attrs goes to process_prompt AND into the generation kwargs
+  def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str: return generation_result[0]  # only the first sequence is returned, whatever num_return_sequences was
--- a/openllm-python/src/openllm/models/flan_t5/configuration_flan_t5.py
+++ b/openllm-python/src/openllm/models/flan_t5/configuration_flan_t5.py
@@ -1,21 +1,6 @@
 from __future__ import annotations
-import openllm
-
-class FlanT5Config(openllm.LLMConfig):
-  """FLAN-T5 was released in the paper [Scaling Instruction-Finetuned Language Models](https://arxiv.org/pdf/2210.11416.pdf).
-
-  It is an enhanced version of T5 that has been finetuned in a mixture of tasks.
-
-  Refer to [FLAN-T5's page](https://huggingface.co/docs/transformers/model_doc/flan-t5) for more information.
-  """
-  __config__ = {"url": "https://huggingface.co/docs/transformers/model_doc/flan-t5", "architecture": "T5ForConditionalGeneration", "model_type": "seq2seq_lm",
-                "default_id": "google/flan-t5-large", "model_ids": ["google/flan-t5-small", "google/flan-t5-base", "google/flan-t5-large", "google/flan-t5-xl", "google/flan-t5-xxl",]}
-  class GenerationConfig:
-    temperature: float = 0.9
-    max_new_tokens: int = 2048
-    top_k: int = 50
-    top_p: float = 0.4
-    repetition_penalty = 1.0
+import openllm_core, typing as t
+from openllm_core._prompt import process_prompt

 START_FLAN_T5_COMMAND_DOCSTRING = """\
 Run a LLMServer for FLAN-T5 model.
@@ -43,3 +28,23 @@ or provide `--model-id` flag when running ``openllm start flan-t5``:
 $ openllm start flan-t5 --model-id google/flan-t5-xxl
 """
 DEFAULT_PROMPT_TEMPLATE = """Answer the following question:\nQuestion: {instruction}\nAnswer:"""
+
+class FlanT5Config(openllm_core.LLMConfig):
+  """FLAN-T5 was released in the paper [Scaling Instruction-Finetuned Language Models](https://arxiv.org/pdf/2210.11416.pdf).
+
+  It is an enhanced version of T5 that has been finetuned in a mixture of tasks.
+
+  Refer to [FLAN-T5's page](https://huggingface.co/docs/transformers/model_doc/flan-t5) for more information.
+  """
+  __config__ = {"url": "https://huggingface.co/docs/transformers/model_doc/flan-t5", "architecture": "T5ForConditionalGeneration", "model_type": "seq2seq_lm",
+                "default_id": "google/flan-t5-large", "model_ids": ["google/flan-t5-small", "google/flan-t5-base", "google/flan-t5-large", "google/flan-t5-xl", "google/flan-t5-xxl",]}
+  class GenerationConfig:
+    temperature: float = 0.9
+    max_new_tokens: int = 2048
+    top_k: int = 50
+    top_p: float = 0.4
+    repetition_penalty: float = 1.0
+
+  def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_k: int | None = None, top_p: float | None = None, repetition_penalty: float | None = None, use_default_prompt_template: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
+    return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_k": top_k, "top_p": top_p, "repetition_penalty": repetition_penalty}, {}  # returns (prompt, generation kwargs, postprocess kwargs); **attrs goes to process_prompt only and is not merged into the generation kwargs
+  def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str: return generation_result[0]  # only the first sequence is returned
--- a/openllm-python/src/openllm/models/gpt_neox/configuration_gpt_neox.py
+++ b/openllm-python/src/openllm/models/gpt_neox/configuration_gpt_neox.py
@@ -1,28 +1,7 @@
 from __future__ import annotations
-import openllm
-
-class GPTNeoXConfig(openllm.LLMConfig):
-  """GPTNeoX is an autoregressive language model trained on the Pile, whose weights will be made freely and openly available to the public through a permissive license.
-
-  It is, to the best of our knowledge, the largest dense autoregressive model
-  that has publicly available weights at the time of submission. The training and evaluation code, as well as the model weights,
-  can be found at https://github.com/EleutherAI/gpt-neox.
-
-  GPTNeoX has been used to fine-tune on various models, such as Dolly, StableLM, and Pythia.
-
-  Note that OpenLLM provides first-class support for all of the aforementioned model. Users can
-  also use `openllm start gpt-neox` to run all of the GPTNeoX variant's model
-
-  Refer to [GPTNeoX's model card](https://huggingface.co/docs/transformers/model_doc/gpt_neox)
-  for more information.
-  """
-  __config__ = {"model_name": "gpt_neox", "start_name": "gpt-neox", "requires_gpu": True, "architecture": "GPTNeoXForCausalLM", "url": "https://github.com/EleutherAI/gpt-neox",
-                "default_id": "eleutherai/gpt-neox-20b", "model_ids": ["eleutherai/gpt-neox-20b"]}
-  use_half_precision: bool = openllm.LLMConfig.Field(True, description="Whether to use half precision for model.")
-
-  class GenerationConfig:
-    temperature: float = 0.9
-    max_new_tokens: int = 100
+import openllm_core, typing as t
+from openllm_core._prompt import process_prompt
+from openllm_core.utils import dantic

 START_GPT_NEOX_COMMAND_DOCSTRING = """\
 Run a LLMServer for GPTNeoX model.
@@ -44,3 +23,29 @@ or provide `--model-id` flag when running ``openllm start gpt-neox``:
 $ openllm start gpt-neox --model-id 'stabilityai/stablelm-tuned-alpha-3b'
 """
 DEFAULT_PROMPT_TEMPLATE = """{instruction}"""
+
+class GPTNeoXConfig(openllm_core.LLMConfig):
+  """GPTNeoX is an autoregressive language model trained on the Pile, whose weights will be made freely and openly available to the public through a permissive license.
+
+  It is, to the best of our knowledge, the largest dense autoregressive model
+  that has publicly available weights at the time of submission. The training and evaluation code, as well as the model weights,
+  can be found at https://github.com/EleutherAI/gpt-neox.
+
+  GPTNeoX has been used as the base for fine-tuning various models, such as Dolly, StableLM, and Pythia.
+
+  Note that OpenLLM provides first-class support for all of the aforementioned models. Users can
+  also use `openllm start gpt-neox` to run any of the GPTNeoX variants.
+
+  Refer to [GPTNeoX's model card](https://huggingface.co/docs/transformers/model_doc/gpt_neox)
+  for more information.
+  """
+  __config__ = {"model_name": "gpt_neox", "start_name": "gpt-neox", "requires_gpu": True, "architecture": "GPTNeoXForCausalLM", "url": "https://github.com/EleutherAI/gpt-neox",
+                "default_id": "eleutherai/gpt-neox-20b", "model_ids": ["eleutherai/gpt-neox-20b"]}
+  use_half_precision: bool = dantic.Field(True, description="Whether to use half precision for model.")
+
+  class GenerationConfig:
+    temperature: float = 0.9
+    max_new_tokens: int = 100
+  def sanitize_parameters(self, prompt: str, temperature: float | None = None, max_new_tokens: int | None = None, use_default_prompt_template: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
+    return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "temperature": temperature}, {}  # returns (prompt, generation kwargs, postprocess kwargs); **attrs goes to process_prompt only and is not merged into the generation kwargs
+  def postprocess_generate(self, prompt: str, generation_result: list[str], **_: t.Any) -> str: return generation_result[0]  # only the first result is returned
--- a/openllm-python/src/openllm/models/llama/configuration_llama.py
+++ b/openllm-python/src/openllm/models/llama/configuration_llama.py
@@ -1,32 +1,7 @@
 from __future__ import annotations
-import typing as t, openllm
-
-class LlamaConfig(openllm.LLMConfig):
-  """LLaMA model was proposed in [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) by Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample.
-
-  It is a collection of foundation language models ranging from 7B to 65B parameters.
-
-  Llama also include support for the recent propsed [Llama-2](https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/)
-
-  Note that all variants of Llama including fine-tuning, quantisation format are all supported with ``openllm.Llama``.
-
-  Refer to [Llama's model card](https://huggingface.co/docs/transformers/main/model_doc/llama)
-  for more information.
-  """
-  use_llama2_prompt: bool = openllm.LLMConfig.Field(False, description="Whether to use the prompt format for Llama 2. Disable this when working with Llama 1.")
-  __config__ = {"name_type": "lowercase", "url": "https://github.com/facebookresearch/llama", "default_implementation": {"cpu": "pt", "nvidia.com/gpu": "pt"}, "architecture": "LlamaForCausalLM", "requirements": ["fairscale", "sentencepiece"], "tokenizer_class": "LlamaTokenizerFast",
-                "default_id": "NousResearch/llama-2-7b-hf", "model_ids": ["meta-llama/Llama-2-70b-chat-hf", "meta-llama/Llama-2-13b-chat-hf", "meta-llama/Llama-2-7b-chat-hf", "meta-llama/Llama-2-70b-hf", "meta-llama/Llama-2-13b-hf",
-                                                                          "meta-llama/Llama-2-7b-hf", "NousResearch/llama-2-70b-chat-hf", "NousResearch/llama-2-13b-chat-hf", "NousResearch/llama-2-7b-chat-hf", "NousResearch/llama-2-70b-hf", "NousResearch/llama-2-13b-hf", "NousResearch/llama-2-7b-hf",
-                                                                          "openlm-research/open_llama_7b_v2", "openlm-research/open_llama_3b_v2", "openlm-research/open_llama_13b", "huggyllama/llama-65b", "huggyllama/llama-30b", "huggyllama/llama-13b", "huggyllama/llama-7b"],
-                "fine_tune_strategies": ({"adapter_type": "lora", "r": 64, "lora_alpha": 16, "lora_dropout": 0.1, "bias": "none"},)}
-  class GenerationConfig:
-    max_new_tokens: int = 128
-    temperature: float = 0.6
-    top_p: float = 0.9
-    top_k: int = 12
-  class SamplingParams:
-    best_of: int = 1
-    presence_penalty: float = 0.5
+import typing as t, openllm_core
+from openllm_core._prompt import process_prompt
+from openllm_core.utils import dantic

 START_LLAMA_COMMAND_DOCSTRING = """\
 Run a LLMServer for Llama model.
@@ -68,3 +43,33 @@ _v1_prompt, _v2_prompt = """{instruction}""", """{start_key} {sys_key}\n{system_
 PROMPT_MAPPING = {"v1": _v1_prompt, "v2": _v2_prompt}
 def _get_prompt(model_type: t.Literal["v1", "v2"]) -> str: return PROMPT_MAPPING[model_type]
 DEFAULT_PROMPT_TEMPLATE = _get_prompt
+
+class LlamaConfig(openllm_core.LLMConfig):
+  """LLaMA model was proposed in [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) by Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample.
+
+  It is a collection of foundation language models ranging from 7B to 65B parameters.
+
+  Llama also includes support for the recently proposed [Llama-2](https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/)
+
+  Note that all variants of Llama, including fine-tuned and quantised formats, are supported with ``openllm.Llama``.
+
+  Refer to [Llama's model card](https://huggingface.co/docs/transformers/main/model_doc/llama)
+  for more information.
+  """
+  use_llama2_prompt: bool = dantic.Field(False, description="Whether to use the prompt format for Llama 2. Disable this when working with Llama 1.")
+  __config__ = {"name_type": "lowercase", "url": "https://github.com/facebookresearch/llama", "default_implementation": {"cpu": "pt", "nvidia.com/gpu": "pt"}, "architecture": "LlamaForCausalLM", "requirements": ["fairscale", "sentencepiece"], "tokenizer_class": "LlamaTokenizerFast",
+                "default_id": "NousResearch/llama-2-7b-hf", "model_ids": ["meta-llama/Llama-2-70b-chat-hf", "meta-llama/Llama-2-13b-chat-hf", "meta-llama/Llama-2-7b-chat-hf", "meta-llama/Llama-2-70b-hf", "meta-llama/Llama-2-13b-hf",
+                                                                          "meta-llama/Llama-2-7b-hf", "NousResearch/llama-2-70b-chat-hf", "NousResearch/llama-2-13b-chat-hf", "NousResearch/llama-2-7b-chat-hf", "NousResearch/llama-2-70b-hf", "NousResearch/llama-2-13b-hf", "NousResearch/llama-2-7b-hf",
+                                                                          "openlm-research/open_llama_7b_v2", "openlm-research/open_llama_3b_v2", "openlm-research/open_llama_13b", "huggyllama/llama-65b", "huggyllama/llama-30b", "huggyllama/llama-13b", "huggyllama/llama-7b"],
+                "fine_tune_strategies": ({"adapter_type": "lora", "r": 64, "lora_alpha": 16, "lora_dropout": 0.1, "bias": "none"},)}
+  class GenerationConfig:
+    max_new_tokens: int = 128
+    temperature: float = 0.6
+    top_p: float = 0.9
+    top_k: int = 12
+  class SamplingParams:
+    best_of: int = 1
+    presence_penalty: float = 0.5
+  def sanitize_parameters(self, prompt: str, top_k: int | None = None, top_p: float | None = None, temperature: float | None = None, max_new_tokens: int | None = None, use_default_prompt_template: bool = False, use_llama2_prompt: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:  # NOTE(review): use_llama2_prompt defaults to True here while the config field of the same name defaults to False, and the field is never read in this method — confirm which default is intended
+    return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE("v2" if use_llama2_prompt else "v1") if use_default_prompt_template else None, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_p": top_p, "top_k": top_k}, {}  # DEFAULT_PROMPT_TEMPLATE is a callable here: it maps "v1"/"v2" to a template string; **attrs goes to process_prompt only
+  def postprocess_generate(self, prompt: str, generation_result: list[str], **_: t.Any) -> str: return generation_result[0]  # only the first result is returned
--- a/openllm-python/src/openllm/models/mpt/configuration_mpt.py
+++ b/openllm-python/src/openllm/models/mpt/configuration_mpt.py
@@ -1,28 +1,9 @@
 from __future__ import annotations
-import sys, typing as t
+import typing as t, openllm_core
+from openllm_core.utils import dantic
+from openllm_core._prompt import process_prompt

-import openllm
-
-if t.TYPE_CHECKING: MPTPromptType = t.Literal["default", "instruct", "chat", "storywriter"]
-else: MPTPromptType = str
-
-class MPTConfig(openllm.LLMConfig):
-  """MPT is a decoder-style transformer pretrained from scratch on English text and code.
-
-  This model was trained by [MosaicML](https://www.mosaicml.com/).
-
-  ``openllm.MPT`` encapsulate a family of MPT variants that is publicly available
-  on HuggingFace. Refers [HuggingFace's MosaicML page](https://huggingface.co/mosaicml)
-  for more details on specific models.
-  """
-  __config__ = {"name_type": "lowercase", "trust_remote_code": True, "url": "https://huggingface.co/mosaicml", "timeout": int(36e6), "requirements": ["triton", "einops"], "architecture": "MPTForCausalLM",
-                "default_id": "mosaicml/mpt-7b-instruct", "model_ids": ["mosaicml/mpt-7b", "mosaicml/mpt-7b-instruct", "mosaicml/mpt-7b-chat", "mosaicml/mpt-7b-storywriter", "mosaicml/mpt-30b", "mosaicml/mpt-30b-instruct", "mosaicml/mpt-30b-chat"]}
-  prompt_type: MPTPromptType = openllm.LLMConfig.Field('"default"', description="Given prompt type for running MPT. Default will be inferred from model name if pretrained.")
-  max_sequence_length: int = openllm.LLMConfig.Field(2048, description="Max sequence length to run MPT with. Note that MPT is trained ith sequence length of 2048, but with [ALiBi](https://arxiv.org/abs/2108.12409) it can set up to 4096 (for 7b models) and 16384 (for 30b models)")
-  class GenerationConfig:
-    max_new_tokens: int = 128
-    temperature: float = 0
-    top_p: float = 0.8
+MPTPromptType = t.Literal["default", "instruct", "chat", "storywriter"]

 START_MPT_COMMAND_DOCSTRING = """\
 Run a LLMServer for MPT model.
@@ -63,3 +44,32 @@ _chat_prompt, _default_prompt, _instruct_prompt = """{instruction}""", """{instr
 PROMPT_MAPPING = {"default": _default_prompt, "instruct": _instruct_prompt, "storywriter": _default_prompt, "chat": _chat_prompt}
 def _get_prompt(model_type: str) -> str: return PROMPT_MAPPING[model_type]
 DEFAULT_PROMPT_TEMPLATE = _get_prompt
+
+class MPTConfig(openllm_core.LLMConfig):
+  """MPT is a decoder-style transformer pretrained from scratch on English text and code.
+
+  This model was trained by [MosaicML](https://www.mosaicml.com/).
+
+  ``openllm.MPT`` encapsulates a family of MPT variants that are publicly available
+  on HuggingFace. Refer to [HuggingFace's MosaicML page](https://huggingface.co/mosaicml)
+  for more details on specific models.
+  """
+  __config__ = {"name_type": "lowercase", "trust_remote_code": True, "url": "https://huggingface.co/mosaicml", "timeout": int(36e6), "requirements": ["triton", "einops"], "architecture": "MPTForCausalLM",
+                "default_id": "mosaicml/mpt-7b-instruct", "model_ids": ["mosaicml/mpt-7b", "mosaicml/mpt-7b-instruct", "mosaicml/mpt-7b-chat", "mosaicml/mpt-7b-storywriter", "mosaicml/mpt-30b", "mosaicml/mpt-30b-instruct", "mosaicml/mpt-30b-chat"]}
+  prompt_type: MPTPromptType = dantic.Field('"default"', description="Given prompt type for running MPT. Default will be inferred from model name if pretrained.")  # NOTE(review): the default value contains literal double quotes, so it is neither an MPTPromptType member nor a PROMPT_MAPPING key — confirm this is intended
+  max_sequence_length: int = dantic.Field(2048, description="Max sequence length to run MPT with. Note that MPT is trained ith sequence length of 2048, but with [ALiBi](https://arxiv.org/abs/2108.12409) it can set up to 4096 (for 7b models) and 16384 (for 30b models)")
+  class GenerationConfig:
+    max_new_tokens: int = 128
+    temperature: float = 0
+    top_p: float = 0.8
+  def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_p: float | None = None, prompt_type: MPTPromptType | None = None, use_default_prompt_template: bool = True, **attrs: t.Any,) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
+    _template = None  # stays None when the default prompt template is disabled
+    if use_default_prompt_template:
+      if prompt_type is None:  # no explicit type given: infer it from substrings of the model id (the prompt_type field on this config is not consulted)
+        if "instruct" in self.model_id: prompt_type = "instruct"  # NOTE(review): model_id is not declared on this config class — confirm LLMConfig actually exposes it
+        elif "storywriter" in self.model_id: prompt_type = "storywriter"
+        elif "chat" in self.model_id: prompt_type = "chat"
+        else: prompt_type = "default"
+      _template = DEFAULT_PROMPT_TEMPLATE(prompt_type)  # DEFAULT_PROMPT_TEMPLATE is a callable here: a PROMPT_MAPPING lookup, so an unknown prompt_type raises KeyError
+    return process_prompt(prompt, _template, use_default_prompt_template), {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_p": top_p}, {}  # returns (prompt, generation kwargs, postprocess kwargs); **attrs is accepted but not forwarded anywhere
+  def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **attrs: t.Any) -> str: return generation_result[0]  # only the first sequence is returned
--- a/openllm-python/src/openllm/models/opt/configuration_opt.py
+++ b/openllm-python/src/openllm/models/opt/configuration_opt.py
@@ -1,27 +1,7 @@
 from __future__ import annotations
-import openllm
-
-class OPTConfig(openllm.LLMConfig):
-  """OPT was first introduced in [Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) and first released in [metaseq's repository](https://github.com/facebookresearch/metaseq) on May 3rd 2022 by Meta AI.
-
-  OPT was predominantly pretrained with English text, but a small amount of non-English data is still present
-  within the training corpus via CommonCrawl. The model was pretrained using a causal language modeling (CLM)
-  objective. OPT belongs to the same family of decoder-only models like GPT-3. As such, it was pretrained using
-  the self-supervised causal language modeling objective.
-
-  Refer to [OPT's HuggingFace page](https://huggingface.co/docs/transformers/model_doc/opt) for more information.
-  """
-  __config__ = {
-      "name_type": "lowercase", "trust_remote_code": False, "url": "https://huggingface.co/docs/transformers/model_doc/opt",
-      "default_id": "facebook/opt-1.3b", "architecture": "OPTForCausalLM", "model_ids": ["facebook/opt-125m", "facebook/opt-350m", "facebook/opt-1.3b", "facebook/opt-2.7b", "facebook/opt-6.7b", "facebook/opt-66b"],
-      "fine_tune_strategies": ({"adapter_type": "lora", "r": 16, "lora_alpha": 32, "target_modules": ["q_proj", "v_proj"], "lora_dropout": 0.05, "bias": "none"},)
-  }
-  format_outputs: bool = openllm.LLMConfig.Field(False, description="""Whether to format the outputs. This can be used when num_return_sequences > 1.""")
-  class GenerationConfig:
-    top_k: int = 15
-    temperature: float = 0.75
-    max_new_tokens: int = 1024
-    num_return_sequences: int = 1
+import openllm_core, typing as t
+from openllm_core.utils import dantic
+from openllm_core._prompt import process_prompt

 START_OPT_COMMAND_DOCSTRING = """\
 Run a LLMServer for OPT model.
@@ -49,3 +29,30 @@ or provide `--model-id` flag when running ``openllm start opt``:
 $ openllm start opt --model-id facebook/opt-6.7b
 """
 DEFAULT_PROMPT_TEMPLATE = """{instruction}"""
+
+class OPTConfig(openllm_core.LLMConfig):
+  """OPT was first introduced in [Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) and first released in [metaseq's repository](https://github.com/facebookresearch/metaseq) on May 3rd 2022 by Meta AI.
+
+  OPT was predominantly pretrained with English text, but a small amount of non-English data is still present
+  within the training corpus via CommonCrawl. The model was pretrained using a causal language modeling (CLM)
+  objective. OPT belongs to the same family of decoder-only models like GPT-3. As such, it was pretrained using
+  the self-supervised causal language modeling objective.
+
+  Refer to [OPT's HuggingFace page](https://huggingface.co/docs/transformers/model_doc/opt) for more information.
+  """
+  __config__ = {
+      "name_type": "lowercase", "trust_remote_code": False, "url": "https://huggingface.co/docs/transformers/model_doc/opt",
+      "default_id": "facebook/opt-1.3b", "architecture": "OPTForCausalLM", "model_ids": ["facebook/opt-125m", "facebook/opt-350m", "facebook/opt-1.3b", "facebook/opt-2.7b", "facebook/opt-6.7b", "facebook/opt-66b"],
+      "fine_tune_strategies": ({"adapter_type": "lora", "r": 16, "lora_alpha": 32, "target_modules": ["q_proj", "v_proj"], "lora_dropout": 0.05, "bias": "none"},)
+  }
+  format_outputs: bool = dantic.Field(False, description="""Whether to format the outputs. This can be used when num_return_sequences > 1.""")
+  class GenerationConfig:  # generation parameter values declared for this model
+    top_k: int = 15
+    temperature: float = 0.75
+    max_new_tokens: int = 1024
+    num_return_sequences: int = 1
+  def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_k: int | None = None, num_return_sequences: int | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_k": top_k, "num_return_sequences": num_return_sequences}, {}  # returns (prompt from process_prompt, generation kwargs, empty dict)
+  def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **attrs: t.Any) -> str:
+    if len(generation_result) == 1: return generation_result[0]  # a single sequence is returned verbatim
+    if self.config.format_outputs: return "Generated result:\n" + "\n -".join(generation_result)  # NOTE(review): self is already the config object here - confirm LLMConfig exposes a .config attribute; also, join only inserts "\n -" between items, so the first entry gets no " -" marker
+    else: return "\n".join(generation_result)  # otherwise one generated sequence per line
--- a/openllm-python/src/openllm/models/stablelm/configuration_stablelm.py
+++ b/openllm-python/src/openllm/models/stablelm/configuration_stablelm.py
@@ -1,27 +1,6 @@
 from __future__ import annotations
-import openllm
-
-class StableLMConfig(openllm.LLMConfig):
-  """StableLM-Base-Alpha is a suite of 3B and 7B parameter decoder-only language models.
-
-  It is pre-trained on a diverse collection of English datasets with a sequence
-  length of 4096 to push beyond the context window limitations of existing open-source language models.
-
-  StableLM-Tuned-Alpha is a suite of 3B and 7B parameter decoder-only language models
-  built on top of the StableLM-Base-Alpha models and further fine-tuned on various chat and
-  instruction-following datasets.
-
-  Refer to [StableLM-tuned's model card](https://huggingface.co/stabilityai/stablelm-tuned-alpha-7b)
-  and [StableLM-base's model card](https://huggingface.co/stabilityai/stablelm-base-alpha-7b)
-  for more information.
-  """
-  __config__ = {"name_type": "lowercase", "url": "https://github.com/Stability-AI/StableLM", "architecture": "GPTNeoXForCausalLM",
-                "default_id": "stabilityai/stablelm-tuned-alpha-3b", "model_ids": ["stabilityai/stablelm-tuned-alpha-3b", "stabilityai/stablelm-tuned-alpha-7b", "stabilityai/stablelm-base-alpha-3b", "stabilityai/stablelm-base-alpha-7b"]}
-  class GenerationConfig:
-    temperature: float = 0.9
-    max_new_tokens: int = 128
-    top_k: int = 0
-    top_p: float = 0.9
+import openllm_core, typing as t
+from openllm_core._prompt import process_prompt

 START_STABLELM_COMMAND_DOCSTRING = """\
 Run a LLMServer for StableLM model.
@@ -49,3 +28,32 @@ SYSTEM_PROMPT = """<|SYSTEM|># StableLM Tuned (Alpha version)
 - StableLM will refuse to participate in anything that could harm a human.
 """
 DEFAULT_PROMPT_TEMPLATE = """{system_prompt}<|USER|>{instruction}<|ASSISTANT|>"""
+
+class StableLMConfig(openllm_core.LLMConfig):
+  """StableLM-Base-Alpha is a suite of 3B and 7B parameter decoder-only language models.
+
+  It is pre-trained on a diverse collection of English datasets with a sequence
+  length of 4096 to push beyond the context window limitations of existing open-source language models.
+
+  StableLM-Tuned-Alpha is a suite of 3B and 7B parameter decoder-only language models
+  built on top of the StableLM-Base-Alpha models and further fine-tuned on various chat and
+  instruction-following datasets.
+
+  Refer to [StableLM-tuned's model card](https://huggingface.co/stabilityai/stablelm-tuned-alpha-7b)
+  and [StableLM-base's model card](https://huggingface.co/stabilityai/stablelm-base-alpha-7b)
+  for more information.
+  """
+  __config__ = {"name_type": "lowercase", "url": "https://github.com/Stability-AI/StableLM", "architecture": "GPTNeoXForCausalLM",
+                "default_id": "stabilityai/stablelm-tuned-alpha-3b", "model_ids": ["stabilityai/stablelm-tuned-alpha-3b", "stabilityai/stablelm-tuned-alpha-7b", "stabilityai/stablelm-base-alpha-3b", "stabilityai/stablelm-base-alpha-7b"]}
+  class GenerationConfig:  # generation parameter values declared for this model
+    temperature: float = 0.9
+    max_new_tokens: int = 128
+    top_k: int = 0
+    top_p: float = 0.9
+  def sanitize_parameters(self, prompt: str, temperature: float | None = None, max_new_tokens: int | None = None, top_k: int | None = None, top_p: float | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
+    if "tuned" in self._model_id and use_default_prompt_template:  # NOTE(review): _model_id is not defined in this class - confirm LLMConfig provides it
+      system_prompt = attrs.pop("system_prompt", SYSTEM_PROMPT)  # a caller-supplied system prompt overrides the module default
+      prompt_text = process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, system_prompt=system_prompt, **attrs)
+    else: prompt_text = prompt  # otherwise the prompt is passed through unchanged
+    return prompt_text, {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_k": top_k, "top_p": top_p}, {}
+  def postprocess_generate(self, prompt: str, generation_result: list[str], **_: t.Any) -> str: return generation_result[0]  # only the first generated sequence is returned
--- a/openllm-python/src/openllm/models/starcoder/configuration_starcoder.py
+++ b/openllm-python/src/openllm/models/starcoder/configuration_starcoder.py
@@ -1,26 +1,5 @@
 from __future__ import annotations
-
-import openllm
-
-class StarCoderConfig(openllm.LLMConfig):
-  """The StarCoder models are 15.5B parameter models trained on 80+ programming languages from [The Stack (v1.2)](https://huggingface.co/datasets/bigcode/the-stack), with opt-out requests excluded.
-
-  The model uses [Multi Query Attention](https://arxiv.org/abs/1911.02150),
-  [a context window of 8192 tokens](https://arxiv.org/abs/2205.14135), and was trained using the
-  [Fill-in-the-Middle](https://arxiv.org/abs/2207.14255) objective on 1 trillion tokens.
-
-  Refer to [StarCoder's model card](https://huggingface.co/bigcode/starcoder) for more information.
-  """
-  __config__ = {"name_type": "lowercase", "requires_gpu": True, "url": "https://github.com/bigcode-project/starcoder", "architecture": "GPTBigCodeForCausalLM", "requirements": ["bitsandbytes"], "workers_per_resource": 0.5,
-                "default_id": "bigcode/starcoder", "model_ids": ["bigcode/starcoder", "bigcode/starcoderbase"]}
-  class GenerationConfig:
-    temperature: float = 0.2
-    max_new_tokens: int = 256
-    min_new_tokens: int = 32
-    top_k: float = 50
-    top_p: float = 0.95
-    pad_token_id: int = 49152
-    repetition_penalty: float = 1.2
+import openllm_core, typing as t

 START_STARCODER_COMMAND_DOCSTRING = """\
 Run a LLMServer for StarCoder model.
@@ -43,3 +22,33 @@ $ openllm start starcoder --model-id 'bigcode/starcoder'
 """
 DEFAULT_PROMPT_TEMPLATE = """{instruction}"""
 FIM_PREFIX, FIM_MIDDLE, FIM_SUFFIX, FIM_PAD, EOD, FIM_INDICATOR = "<fim-prefix>", "<fim-middle>", "<fim-suffix>", "<fim-pad>", "<|endoftext|>", "<FILL_HERE>"
+
+class StarCoderConfig(openllm_core.LLMConfig):
+  """The StarCoder models are 15.5B parameter models trained on 80+ programming languages from [The Stack (v1.2)](https://huggingface.co/datasets/bigcode/the-stack), with opt-out requests excluded.
+
+  The model uses [Multi Query Attention](https://arxiv.org/abs/1911.02150),
+  [a context window of 8192 tokens](https://arxiv.org/abs/2205.14135), and was trained using the
+  [Fill-in-the-Middle](https://arxiv.org/abs/2207.14255) objective on 1 trillion tokens.
+
+  Refer to [StarCoder's model card](https://huggingface.co/bigcode/starcoder) for more information.
+  """
+  __config__ = {"name_type": "lowercase", "requires_gpu": True, "url": "https://github.com/bigcode-project/starcoder", "architecture": "GPTBigCodeForCausalLM", "requirements": ["bitsandbytes"], "workers_per_resource": 0.5,
+                "default_id": "bigcode/starcoder", "model_ids": ["bigcode/starcoder", "bigcode/starcoderbase"]}
+  class GenerationConfig:  # generation parameter values declared for this model
+    temperature: float = 0.2
+    max_new_tokens: int = 256
+    min_new_tokens: int = 32
+    top_k: float = 50  # NOTE(review): annotated float, while the OPT and StableLM configs in this change annotate top_k as int - confirm
+    top_p: float = 0.95
+    pad_token_id: int = 49152
+    repetition_penalty: float = 1.2
+  def sanitize_parameters(self, prompt: str, temperature: float | None = None, top_p: float | None = None, max_new_tokens: int | None = None, repetition_penalty: float | None = None, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
+    # Fill-in-the-middle: a prompt containing FIM_INDICATOR is rewritten as FIM_PREFIX + prefix + FIM_SUFFIX + suffix + FIM_MIDDLE.
+    if FIM_INDICATOR in prompt:
+      try: prefix, suffix = prompt.split(FIM_INDICATOR)  # unpacking succeeds only when the indicator occurs exactly once
+      except ValueError as err: raise ValueError(f"Only one {FIM_INDICATOR} allowed in prompt") from err
+      prompt_text = f"{FIM_PREFIX}{prefix}{FIM_SUFFIX}{suffix}{FIM_MIDDLE}"
+    else: prompt_text = prompt
+    # XXX: This value for pad_token_id is currently a hack; we need to investigate why the default starcoder doesn't include the same value as santacoder EOD
+    return prompt_text, {"temperature": temperature, "top_p": top_p, "max_new_tokens": max_new_tokens, "repetition_penalty": repetition_penalty, "pad_token_id": 49152, **attrs}, {}  # **attrs comes last, so a caller-supplied key overrides the ones listed before it
+  def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str: return generation_result[0]  # only the first generated sequence is returned
--- a/openllm-core/src/openllm_core/exceptions.py
+++ b/openllm-core/src/openllm_core/exceptions.py
@@ -0,0 +1,19 @@
+"""Base exceptions for OpenLLM. This extends BentoML exceptions."""
+from __future__ import annotations
+import bentoml
+class OpenLLMException(bentoml.exceptions.BentoMLException):
+  """Base class for OpenLLM exceptions; it extends BentoMLException, so BentoML-level handlers also catch it."""
+class GpuNotAvailableError(OpenLLMException):
+  """Raised when no GPU is available on the current system."""
+class ValidationError(OpenLLMException):
+  """Raised when validation fails."""
+class ForbiddenAttributeError(OpenLLMException):
+  """Raised when an ``_internal`` field is used."""
+class MissingAnnotationAttributeError(OpenLLMException):
+  """Raised when a field declared under an LLMConfig is missing its annotation."""
+class MissingDependencyError(BaseException):  # NOTE(review): derives from BaseException, so ``except Exception`` (and ``except OpenLLMException``) will not catch it - confirm intended
+  """Raised when a dependency is missing."""
+class Error(BaseException):  # NOTE(review): derives from BaseException, so ``except Exception`` (and ``except OpenLLMException``) will not catch it - confirm intended
+  """To be used instead of a naked ``raise``."""
+class FineTuneStrategyNotSupportedError(OpenLLMException):
+  """Raised when a fine-tune strategy is not supported for the given LLM."""
--- a/openllm-core/src/openllm_core/utils/init.py
+++ b/openllm-core/src/openllm_core/utils/init.py
@@ -0,0 +1,304 @@
+"""Utilities function for OpenLLM.
+
+Users can import these functions for convenience, but we don't guarantee backward compatibility for them, so use with caution.
+"""
+from __future__ import annotations
+import contextlib, functools, hashlib, logging, logging.config, os, sys, types, typing as t, openllm_core, asyncio
+from pathlib import Path
+from circus.exc import ConflictError
+from bentoml._internal.configuration import (
+  DEBUG_ENV_VAR as DEBUG_ENV_VAR,
+  GRPC_DEBUG_ENV_VAR as _GRPC_DEBUG_ENV_VAR,
+  QUIET_ENV_VAR as QUIET_ENV_VAR,
+  get_debug_mode as _get_debug_mode,
+  get_quiet_mode as _get_quiet_mode,
+  set_quiet_mode as set_quiet_mode,
+)
+from bentoml._internal.models.model import ModelContext as _ModelContext
+from bentoml._internal.types import LazyType as LazyType
+from bentoml._internal.utils import (
+  LazyLoader as LazyLoader,
+  bentoml_cattr as bentoml_cattr,
+  calc_dir_size as calc_dir_size,
+  first_not_none as first_not_none,
+  pkg as pkg,
+  reserve_free_port as reserve_free_port,
+  resolve_user_filepath as resolve_user_filepath,
+)
+from openllm_core.utils.lazy import (
+  LazyModule as LazyModule,
+  VersionInfo as VersionInfo,
+)
+
+if t.TYPE_CHECKING:
+  from openllm_core._typing_compat import AnyCallable
+
+logger = logging.getLogger(__name__)
+try: from typing import GenericAlias as _TypingGenericAlias  # type: ignore
+except ImportError: _TypingGenericAlias = ()  # type: ignore # python < 3.9 does not have GenericAlias (list[int], tuple[str, ...] and so on)
+if sys.version_info < (3, 10): _WithArgsTypes = (_TypingGenericAlias,)
+else: _WithArgsTypes: t.Any = (t._GenericAlias, types.GenericAlias, types.UnionType)  # type: ignore #  _GenericAlias is the actual GenericAlias implementation
+
+DEV_DEBUG_VAR = "OPENLLMDEVDEBUG"
+
+def set_debug_mode(enabled: bool, level: int = 1) -> None:
+  """Stand-in for bentoml._internal.configuration.set_debug_mode (avoids its unused logs); ``level`` is written to the dev-debug variable only when enabling."""
+  grpc_verbosity = "DEBUG" if enabled else "ERROR"
+  if enabled: os.environ[DEV_DEBUG_VAR] = str(level)
+  os.environ.update({DEBUG_ENV_VAR: str(enabled), _GRPC_DEBUG_ENV_VAR: grpc_verbosity})
+
+def lenient_issubclass(cls: t.Any, class_or_tuple: type[t.Any] | tuple[type[t.Any], ...] | None) -> bool:
+  """``issubclass`` that tolerates non-class inputs: anything that is not a class yields False instead of raising."""
+  try: return isinstance(cls, type) and issubclass(cls, class_or_tuple)  # type: ignore[arg-type]
+  except TypeError:
+    if not isinstance(cls, _WithArgsTypes): raise  # only generic aliases are forgiven; any other TypeError still propagates
+    return False
+
+def ensure_exec_coro(coro: t.Coroutine[t.Any, t.Any, t.Any]) -> t.Any:  # run ``coro`` to completion from synchronous code and return its result
+    loop = asyncio.get_event_loop()
+    if loop.is_running(): return asyncio.run_coroutine_threadsafe(coro, loop).result()  # NOTE(review): run_coroutine_threadsafe is documented for calls from a thread other than the loop's; .result() blocks the caller - confirm this is never reached from the loop's own thread
+    else: return loop.run_until_complete(coro)  # loop not running: drive the coroutine on it directly
+
+def available_devices() -> tuple[str, ...]:
+  """List the GPUs reported by ``NvidiaGpuResource.from_system()``; only NVIDIA GPUs are currently supported."""
+  from openllm_core._strategies import NvidiaGpuResource as _nvidia  # imported at call time, not at module import
+  return (*_nvidia.from_system(),)
+
+@functools.lru_cache(maxsize=128)
+def generate_hash_from_file(f: str, algorithm: t.Literal["md5", "sha1"] = "sha1") -> str:
+  """Hash a file's modification time (the file contents are never read).
+
+  Args:
+  f: Path of the file; it is passed through ``resolve_filepath`` before its mtime is read.
+  algorithm: Name of the ``hashlib`` constructor to use. Defaults to 'sha1'.
+  Returns:
+  Hex digest of the stringified mtime. Results are memoised per ``(f, algorithm)`` by ``lru_cache``.
+  """
+  mtime_text = str(os.path.getmtime(resolve_filepath(f)))
+  return getattr(hashlib, algorithm)(mtime_text.encode()).hexdigest()
+
+@functools.lru_cache(maxsize=1)
+def device_count() -> int: return len(available_devices())  # number of entries from available_devices(); computed once, then served from the cache
+
+# cached reference to object.__setattr__, to save one lookup per assignment
+_object_setattr = object.__setattr__
+
+def non_intrusive_setattr(obj: t.Any, name: str, value: t.Any) -> None:
+  """Assign ``value`` to ``obj.name`` only when ``obj`` has no such attribute yet; existing attributes are never overwritten."""
+  setter = setattr if isinstance(obj, type) else _object_setattr  # classes use plain setattr, instances go through object.__setattr__
+  if not hasattr(obj, name): setter(obj, name, value)
+
+def field_env_key(model_name: str, key: str, suffix: str | None = None) -> str: return "_".join(part.upper() for part in ("OPENLLM", model_name, suffix.strip("_") if suffix else "", key) if part)  # upper-cased OPENLLM_<MODEL>[_<SUFFIX>]_<KEY>; empty parts are skipped
+
+# Special debug flag controlled via OPENLLMDEVDEBUG
+DEBUG: bool = sys.flags.dev_mode or (not sys.flags.ignore_environment and bool(os.environ.get(DEV_DEBUG_VAR)))
+# MYPY is like t.TYPE_CHECKING, but reserved for Mypy plugins
+MYPY = False
+SHOW_CODEGEN: bool = DEBUG and int(os.environ.get("OPENLLMDEVDEBUG", str(0))) > 3
+
+def get_debug_mode() -> bool: return DEBUG or _get_debug_mode()  # the module-level DEBUG flag forces debug mode on; otherwise defer to BentoML's setting
+def get_quiet_mode() -> bool: return not DEBUG and _get_quiet_mode()  # quiet mode is honoured only while the module-level DEBUG flag is off
+
+class ExceptionFilter(logging.Filter):
+  def __init__(self, exclude_exceptions: list[type[Exception]] | None = None, **kwargs: t.Any):
+    """Drop log records whose exception type is one of ``exclude_exceptions`` (or a subclass); ``ConflictError`` is always excluded."""
+    exclude_exceptions = list(exclude_exceptions or ())  # copy first, so the caller's list is never mutated
+    if ConflictError not in exclude_exceptions: exclude_exceptions.append(ConflictError)
+    super(ExceptionFilter, self).__init__(**kwargs)
+    self.EXCLUDE_EXCEPTIONS = exclude_exceptions
+
+  def filter(self, record: logging.LogRecord) -> bool:
+    """Return False to drop a record carrying an excluded exception, True to keep it."""
+    # exc_info, when set, is unpacked as (type, value, traceback); only the type matters here.
+    etype = record.exc_info[0] if record.exc_info else None
+    if etype is None: return True
+    # issubclass accepts a tuple of classes, so one call covers every excluded type.
+    return not issubclass(etype, tuple(self.EXCLUDE_EXCEPTIONS))
+
+class InfoFilter(logging.Filter):
+  def filter(self, record: logging.LogRecord) -> bool: return logging.INFO <= record.levelno < logging.WARNING  # keep only records from INFO up to, but not including, WARNING
+
+_LOGGING_CONFIG: dict[str, t.Any] = {
+    "version": 1, "disable_existing_loggers": True,
+    "filters": {"excfilter": {"()": "openllm_core.utils.ExceptionFilter"}, "infofilter": {"()": "openllm_core.utils.InfoFilter"}},  # "()" names the factory that dictConfig instantiates for each filter
+    "handlers": {"bentomlhandler": {"class": "logging.StreamHandler", "filters": ["excfilter", "infofilter"], "stream": "ext://sys.stdout"}, "defaulthandler": {"class": "logging.StreamHandler", "level": logging.WARNING}},  # NOTE(review): DEBUG records pass neither handler (infofilter needs >= INFO, defaulthandler needs >= WARNING) - confirm intended
+    "loggers": {"bentoml": {"handlers": ["bentomlhandler", "defaulthandler"], "level": logging.INFO, "propagate": False}, "openllm": {"handlers": ["bentomlhandler", "defaulthandler"], "level": logging.INFO, "propagate": False}},  # NOTE(review): loggers named "openllm_core.*" are not children of "openllm" in the logging hierarchy - confirm they are meant to fall through to root
+    "root": {"level": logging.WARNING},
+}
+
+def configure_logging() -> None:
+  """Set the level of the ``openllm``, ``bentoml`` and root loggers, then apply the config.
+
+  The level is picked in priority order: quiet mode gives ERROR, debug mode
+  gives DEBUG, anything else gives INFO. The shared ``_LOGGING_CONFIG`` dict is
+  updated in place and then handed to ``logging.config.dictConfig``.
+  """
+  # Quiet mode takes precedence over debug mode.
+  if get_quiet_mode():
+    level = logging.ERROR
+  elif get_debug_mode() or DEBUG:
+    level = logging.DEBUG
+  else:
+    level = logging.INFO
+  logger_settings = _LOGGING_CONFIG["loggers"]
+  logger_settings["openllm"]["level"] = level
+  logger_settings["bentoml"]["level"] = level
+  _LOGGING_CONFIG["root"]["level"] = level
+  logging.config.dictConfig(_LOGGING_CONFIG)
+
+@functools.lru_cache(maxsize=1)
+def in_notebook() -> bool:  # True when the active IPython shell's config contains "IPKernelApp"; the answer is cached after the first call
+  try:
+    from IPython.core.getipython import get_ipython
+    if t.TYPE_CHECKING:  # import needed only for the cast annotation below
+      from IPython.core.interactiveshell import InteractiveShell
+    return "IPKernelApp" in t.cast("dict[str, t.Any]", t.cast(t.Callable[[], "InteractiveShell"], get_ipython)().config)
+  except (ImportError, AttributeError): return False  # IPython is not importable, or the object returned by get_ipython() has no .config
+
+_dockerenv, _cgroup = Path("/.dockerenv"), Path("/proc/self/cgroup")
+
+class suppress(contextlib.suppress, contextlib.ContextDecorator):
+  """``contextlib.suppress`` that can also be applied as a function decorator.
+
+  >>> @suppress(KeyError)
+  ... def key_error():
+  ...     {}['']
+  >>> key_error()
+  """
+
+def compose(*funcs: AnyCallable) -> AnyCallable:
+  """Chain unary functions right-to-left into one callable: ``compose(f, g)(x)`` is ``f(g(x))``.
+
+  >>> import textwrap
+  >>> expected = str.strip(textwrap.dedent(compose.__doc__))
+  >>> strip_and_dedent = compose(str.strip, textwrap.dedent)
+  >>> strip_and_dedent(compose.__doc__) == expected
+  True
+
+  The last function given (the innermost one) may take arbitrary arguments.
+
+  >>> round_three = lambda x: round(x, ndigits=3)
+  >>> f = compose(round_three, int.__truediv__)
+  >>> [f(3*x, x+1) for x in range(1,10)]
+  [1.5, 2.0, 2.25, 2.4, 2.5, 2.571, 2.625, 2.667, 2.7]
+  """
+  def _chain(outer: AnyCallable, inner: AnyCallable) -> AnyCallable: return lambda *a, **kw: outer(inner(*a, **kw))
+  return functools.reduce(_chain, funcs)
+
+def apply(transform: AnyCallable) -> t.Callable[[AnyCallable], AnyCallable]:
+  """Build a decorator that passes the decorated function's return value through ``transform``.
+
+  ```python
+  @apply(reversed)
+  def get_numbers(start):
+      "doc for get_numbers"
+      return range(start, start+3)
+  list(get_numbers(4))
+  # [6, 5, 4]
+  get_numbers.__doc__
+  # 'doc for get_numbers'
+  ```
+  """
+  def decorator(func: AnyCallable) -> AnyCallable:
+    return functools.wraps(func)(compose(transform, func))
+  return decorator
+
+@apply(bool)
+@suppress(FileNotFoundError)
+def _text_in_file(text: str, filename: Path) -> bool:  # True when any line of ``filename`` contains ``text``; a missing file yields False via suppress + bool(None)
+  with filename.open() as fp: return any(text in line for line in fp)  # context manager closes the handle instead of leaking it
+
+def in_docker() -> bool:
+  """Report whether the current process appears to be running inside Docker.
+
+  The check is heuristic: it is True when the ``_dockerenv`` path exists, or
+  when any line of the ``_cgroup`` file contains the text "docker"
+  (as reported by ``_text_in_file``).
+  """
+  return _dockerenv.exists() or _text_in_file("docker", _cgroup)
+
+T, K = t.TypeVar("T"), t.TypeVar("K")
+
+def resolve_filepath(path: str, ctx: str | None = None) -> str:
+  """Resolve ``path`` with bentoml's ``resolve_user_filepath``; if that raises FileNotFoundError, ``path`` is returned unchanged."""
+  with contextlib.suppress(FileNotFoundError): return resolve_user_filepath(path, ctx)
+  return path
+
+def validate_is_path(maybe_path: str) -> bool: return os.path.exists(os.path.dirname(resolve_filepath(maybe_path)))  # True when the parent directory of the resolved path exists; the path itself need not exist
+
+def generate_context(framework_name: str) -> _ModelContext:  # build a bentoml ModelContext recording the versions of the frameworks found
+  framework_versions = {"transformers": pkg.get_pkg_version("transformers")}  # transformers is recorded unconditionally
+  if openllm_core.utils.is_torch_available(): framework_versions["torch"] = pkg.get_pkg_version("torch")
+  if openllm_core.utils.is_tf_available():
+    from bentoml._internal.frameworks.utils.tensorflow import get_tf_version  # imported only when the TensorFlow check passes
+    framework_versions["tensorflow"] = get_tf_version()
+  if openllm_core.utils.is_flax_available(): framework_versions.update({"flax": pkg.get_pkg_version("flax"), "jax": pkg.get_pkg_version("jax"), "jaxlib": pkg.get_pkg_version("jaxlib")})
+  return _ModelContext(framework_name=framework_name, framework_versions=framework_versions)
+
+_TOKENIZER_PREFIX = "_tokenizer_"
+
+def normalize_attrs_to_model_tokenizer_pair(**attrs: t.Any) -> tuple[dict[str, t.Any], dict[str, t.Any]]:
+  """Split ``attrs`` into ``(model_kwargs, tokenizer_kwargs)``; keys carrying the tokenizer prefix go to the tokenizer dict with the prefix removed."""
+  cut = len(_TOKENIZER_PREFIX)
+  tokenizer_attrs = {name[cut:]: value for name, value in attrs.items() if name.startswith(_TOKENIZER_PREFIX)}
+  model_attrs = {name: value for name, value in attrs.items() if not name.startswith(_TOKENIZER_PREFIX)}
+  return model_attrs, tokenizer_attrs
+
+# NOTE: This set contains the names of modules
+# that are imported above and are whitelisted
+# to be included in the extra_objects map.
+_whitelist_modules = {"pkg"}
+
+# XXX: define all classes, functions import above this line
+# since _extras will be the locals() import from this file.
+_extras: dict[str, t.Any] = {k: v for k, v in locals().items() if k in _whitelist_modules or (not isinstance(v, types.ModuleType) and not k.startswith("_"))}
+_extras["__openllm_migration__"] = {"ModelEnv": "EnvVarMixin"}
+_import_structure: dict[str, list[str]] = {
+  "analytics": [], "codegen": [], "dantic": [], "representation": ["ReprMixin"], "lazy": ["LazyModule"],
+  "import_utils": ["OPTIONAL_DEPENDENCIES", "ENV_VARS_TRUE_VALUES", "DummyMetaclass", "EnvVarMixin", "require_backends",
+                  "is_cpm_kernels_available", "is_einops_available", "is_flax_available", "is_tf_available", "is_vllm_available", "is_torch_available", "is_bitsandbytes_available", "is_peft_available", "is_datasets_available",
+                  "is_transformers_supports_kbit", "is_transformers_supports_agent", "is_jupyter_available", "is_jupytext_available", "is_notebook_available", "is_triton_available", "is_autogptq_available", "is_sentencepiece_available",
+                  "is_xformers_available", "is_fairscale_available", "is_grpc_available", "is_grpc_health_available", "is_transformers_available"]}
+
+if t.TYPE_CHECKING:
+  # NOTE: The following exports useful utils from bentoml
+  from . import (
+    analytics as analytics,
+    codegen as codegen,
+    dantic as dantic,
+  )
+  from .import_utils import (
+    ENV_VARS_TRUE_VALUES as ENV_VARS_TRUE_VALUES,
+    OPTIONAL_DEPENDENCIES as OPTIONAL_DEPENDENCIES,
+    DummyMetaclass as DummyMetaclass,
+    EnvVarMixin as EnvVarMixin,
+    is_autogptq_available as is_autogptq_available,
+    is_bitsandbytes_available as is_bitsandbytes_available,
+    is_cpm_kernels_available as is_cpm_kernels_available,
+    is_datasets_available as is_datasets_available,
+    is_einops_available as is_einops_available,
+    is_fairscale_available as is_fairscale_available,
+    is_flax_available as is_flax_available,
+    is_jupyter_available as is_jupyter_available,
+    is_jupytext_available as is_jupytext_available,
+    is_notebook_available as is_notebook_available,
+    is_peft_available as is_peft_available,
+    is_sentencepiece_available as is_sentencepiece_available,
+    is_tf_available as is_tf_available,
+    is_torch_available as is_torch_available,
+    is_transformers_supports_agent as is_transformers_supports_agent,
+    is_transformers_supports_kbit as is_transformers_supports_kbit,
+    is_triton_available as is_triton_available,
+    is_vllm_available as is_vllm_available,
+    is_xformers_available as is_xformers_available,
+    is_grpc_available as is_grpc_available,
+    is_grpc_health_available as is_grpc_health_available,
+    is_transformers_available as is_transformers_available,
+    require_backends as require_backends,
+  )
+  from .representation import ReprMixin as ReprMixin
+
+__lazy = LazyModule(__name__, globals()["__file__"], _import_structure, extra_objects=_extras)
+__all__ = __lazy.__all__
+__dir__ = __lazy.__dir__
+__getattr__ = __lazy.__getattr__
--- a/openllm-core/src/openllm_core/utils/analytics.py
+++ b/openllm-core/src/openllm_core/utils/analytics.py
@@ -3,10 +3,9 @@
 Users can disable this with OPENLLM_DO_NOT_TRACK envvar.
 """
 from __future__ import annotations
-import contextlib, functools, logging, os, re, typing as t, importlib.metadata
-import attr, openllm
+import contextlib, functools, logging, os, re, typing as t, importlib.metadata, attr, openllm_core
 from bentoml._internal.utils import analytics as _internal_analytics
-from openllm._typing_compat import ParamSpec
+from openllm_core._typing_compat import ParamSpec

 P = ParamSpec("P")
 T = t.TypeVar("T")
@@ -17,7 +16,7 @@ OPENLLM_DO_NOT_TRACK = "OPENLLM_DO_NOT_TRACK"
 DO_NOT_TRACK = os.environ.get(OPENLLM_DO_NOT_TRACK, str(False)).upper()

@functools.lru_cache(maxsize=1)
-def do_not_track() -> bool: return DO_NOT_TRACK in openllm.utils.ENV_VARS_TRUE_VALUES
+def do_not_track() -> bool: return DO_NOT_TRACK in openllm_core.utils.ENV_VARS_TRUE_VALUES
@functools.lru_cache(maxsize=1)
 def _usage_event_debugging() -> bool: return os.environ.get("__BENTOML_DEBUG_USAGE", str(False)).lower() == "true"

@@ -27,7 +26,7 @@ def silent(func: t.Callable[P, T]) -> t.Callable[P, T]:
    try: return func(*args, **kwargs)
    except Exception as err:
      if _usage_event_debugging():
-        if openllm.utils.get_debug_mode(): logger.error("Tracking Error: %s", err, stack_info=True, stacklevel=3)
+        if openllm_core.utils.get_debug_mode(): logger.error("Tracking Error: %s", err, stack_info=True, stacklevel=3)
        else: logger.info("Tracking Error: %s", err)
      else: logger.debug("Tracking Error: %s", err)
  return wrapper
@@ -73,8 +72,8 @@ class StartInitEvent(EventMeta):
  model_name: str
  llm_config: t.Dict[str, t.Any] = attr.field(default=None)
  @staticmethod
-  def handler(llm_config: openllm.LLMConfig) -> StartInitEvent: return StartInitEvent(model_name=llm_config["model_name"], llm_config=llm_config.model_dump())
+  def handler(llm_config: openllm_core.LLMConfig) -> StartInitEvent: return StartInitEvent(model_name=llm_config["model_name"], llm_config=llm_config.model_dump())

-def track_start_init(llm_config: openllm.LLMConfig) -> None:
+def track_start_init(llm_config: openllm_core.LLMConfig) -> None:
  if do_not_track(): return
  track(StartInitEvent.handler(llm_config))
--- a/openllm-core/src/openllm_core/utils/codegen.py
+++ b/openllm-core/src/openllm_core/utils/codegen.py
@@ -1,49 +1,14 @@
 from __future__ import annotations
-import functools, inspect, linecache, os, logging, string, types, typing as t
+import functools, inspect, linecache, logging, types, typing as t, orjson
 from operator import itemgetter
-from pathlib import Path
-import orjson

 if t.TYPE_CHECKING:
-  from fs.base import FS
-
-  import openllm
-  from openllm._typing_compat import LiteralString, AnyCallable, DictStrAny, ListStr
+  import openllm_core
+  from openllm_core._typing_compat import LiteralString, AnyCallable, DictStrAny, ListStr
  PartialAny = functools.partial[t.Any]

 _T = t.TypeVar("_T", bound=t.Callable[..., t.Any])
 logger = logging.getLogger(__name__)
-OPENLLM_MODEL_NAME = "# openllm: model name"
-OPENLLM_MODEL_ADAPTER_MAP = "# openllm: model adapter map"
-class ModelNameFormatter(string.Formatter):
-  model_keyword: LiteralString = "__model_name__"
-  def __init__(self, model_name: str):
-    """The formatter that extends model_name to be formatted the 'service.py'."""
-    super().__init__()
-    self.model_name = model_name
-  def vformat(self, format_string: str, *args: t.Any, **attrs: t.Any) -> t.Any: return super().vformat(format_string, (), {self.model_keyword: self.model_name})
-  def can_format(self, value: str) -> bool:
-    try:
-      self.parse(value)
-      return True
-    except ValueError: return False
-class ModelIdFormatter(ModelNameFormatter):
-  model_keyword: LiteralString = "__model_id__"
-class ModelAdapterMapFormatter(ModelNameFormatter):
-  model_keyword: LiteralString = "__model_adapter_map__"
-
-_service_file = Path(os.path.abspath(__file__)).parent.parent/"_service.py"
-def write_service(llm: openllm.LLM[t.Any, t.Any], adapter_map: dict[str, str | None] | None, llm_fs: FS) -> None:
-  from openllm.utils import DEBUG
-  model_name = llm.config["model_name"]
-  logger.debug("Generating service file for %s at %s (dir=%s)", model_name, llm.config["service_name"], llm_fs.getsyspath("/"))
-  with open(_service_file.__fspath__(), "r") as f: src_contents = f.readlines()
-  for it in src_contents:
-    if OPENLLM_MODEL_NAME in it: src_contents[src_contents.index(it)] = (ModelNameFormatter(model_name).vformat(it)[:-(len(OPENLLM_MODEL_NAME) + 3)] + "\n")
-    elif OPENLLM_MODEL_ADAPTER_MAP in it: src_contents[src_contents.index(it)] = (ModelAdapterMapFormatter(orjson.dumps(adapter_map).decode()).vformat(it)[:-(len(OPENLLM_MODEL_ADAPTER_MAP) + 3)] + "\n")
-  script = f"# GENERATED BY 'openllm build {model_name}'. DO NOT EDIT\n\n" + "".join(src_contents)
-  if DEBUG: logger.info("Generated script:\n%s", script)
-  llm_fs.writetext(llm.config["service_name"], script)

 # sentinel object for unequivocal object() getattr
 _sentinel = object()
@@ -72,7 +37,7 @@ def add_method_dunders(cls: type[t.Any], method_or_cls: _T, _overwrite_doc: str
  try: method_or_cls.__doc__ = _overwrite_doc or "Generated by ``openllm.LLMConfig`` for class " f"{cls.__qualname__}."
  except AttributeError: pass
  return method_or_cls
-def _compile_and_eval(script: str, globs: DictStrAny, locs: t.Any = None, filename: str = "") -> None: eval(compile(script, filename, "exec"), globs, locs)  # noqa: S307
+def _compile_and_eval(script: str, globs: DictStrAny, locs: t.Any = None, filename: str = "") -> None: eval(compile(script, filename, "exec"), globs, locs)
 def _make_method(name: str, script: str, filename: str, globs: DictStrAny) -> AnyCallable:
  locs: DictStrAny = {}
  # In order of debuggers like PDB being able to step through the code, we add a fake linecache entry.
@@ -111,15 +76,15 @@ def make_attr_tuple_class(cls_name: str, attr_names: t.Sequence[str]) -> type[t.

 def generate_unique_filename(cls: type[t.Any], func_name: str) -> str: return f"<{cls.__name__} generated {func_name} {cls.__module__}.{getattr(cls, '__qualname__', cls.__name__)}>"
 def generate_function(typ: type[t.Any], func_name: str, lines: list[str] | None, args: tuple[str, ...] | None, globs: dict[str, t.Any], annotations: dict[str, t.Any] | None = None) -> AnyCallable:
-  from openllm.utils import SHOW_CODEGEN
+  from openllm_core.utils import SHOW_CODEGEN
  script = "def %s(%s):\n    %s\n" % (func_name, ", ".join(args) if args is not None else "", "\n    ".join(lines) if lines else "pass")
  meth = _make_method(func_name, script, generate_unique_filename(typ, func_name), globs)
  if annotations: meth.__annotations__ = annotations
  if SHOW_CODEGEN: logger.info("Generated script for %s:\n\n%s", typ, script)
  return meth

-def make_env_transformer(cls: type[openllm.LLMConfig], model_name: str, suffix: LiteralString | None = None, default_callback: t.Callable[[str, t.Any], t.Any] | None = None, globs: DictStrAny | None = None,) -> AnyCallable:
-  from openllm.utils import dantic, field_env_key
+def make_env_transformer(cls: type[openllm_core.LLMConfig], model_name: str, suffix: LiteralString | None = None, default_callback: t.Callable[[str, t.Any], t.Any] | None = None, globs: DictStrAny | None = None,) -> AnyCallable:
+  from openllm_core.utils import dantic, field_env_key
  def identity(_: str, x_value: t.Any) -> t.Any: return x_value
  default_callback = identity if default_callback is None else default_callback
  globs = {} if globs is None else globs
@@ -129,13 +94,13 @@ def make_env_transformer(cls: type[openllm.LLMConfig], model_name: str, suffix:
  return generate_function(cls, "__auto_env", lines, args=("_", "fields"), globs=globs, annotations={"_": "type[LLMConfig]", "fields": fields_ann, "return": fields_ann})
 def gen_sdk(func: _T, name: str | None = None, **attrs: t.Any) -> _T:
  """Enhance sdk with nice repr that plays well with your brain."""
-  from openllm.utils import ReprMixin
+  from openllm_core.utils import ReprMixin
  if name is None: name = func.__name__.strip("_")
  _signatures = inspect.signature(func).parameters
  def _repr(self: ReprMixin) -> str: return f"<generated function {name} {orjson.dumps(dict(self.__repr_args__()), option=orjson.OPT_NON_STR_KEYS | orjson.OPT_INDENT_2).decode()}>"
  def _repr_args(self: ReprMixin) -> t.Iterator[t.Tuple[str, t.Any]]: return ((k, _signatures[k].annotation) for k in self.__repr_keys__)
  if func.__doc__ is None: doc = f"Generated SDK for {func.__name__}"
  else: doc = func.__doc__
-  return t.cast(_T, functools.update_wrapper(types.new_class(name, (t.cast("PartialAny", functools.partial), ReprMixin), exec_body=lambda ns: ns.update({"__repr_keys__": property(lambda _: [i for i in _signatures.keys() if not i.startswith("_")]), "__repr_args__": _repr_args, "__repr__": _repr, "__doc__": inspect.cleandoc(doc), "__module__": "openllm",}),)(func, **attrs), func,))
+  return t.cast(_T, functools.update_wrapper(types.new_class(name, (t.cast("PartialAny", functools.partial), ReprMixin), exec_body=lambda ns: ns.update({"__repr_keys__": property(lambda _: [i for i in _signatures.keys() if not i.startswith("_")]), "__repr_args__": _repr_args, "__repr__": _repr, "__doc__": inspect.cleandoc(doc), "__module__": "openllm"}),)(func, **attrs), func,))

-__all__ = ["gen_sdk", "make_attr_tuple_class", "make_env_transformer", "generate_unique_filename", "generate_function", "OPENLLM_MODEL_NAME", "OPENLLM_MODEL_ADAPTER_MAP"]
+__all__ = ["gen_sdk", "make_attr_tuple_class", "make_env_transformer", "generate_unique_filename", "generate_function"]
--- a/openllm-core/src/openllm_core/utils/dantic.py
+++ b/openllm-core/src/openllm_core/utils/dantic.py
@@ -335,7 +335,6 @@ class CudaValueType(ParamType):
  name = "cuda"
  envvar_list_splitter = ","
  is_composite = True
-  typ = click_types.convert_type(str)

  def split_envvar_value(self, rv: str) -> t.Sequence[str]:
    var = tuple(i for i in rv.split(self.envvar_list_splitter))
@@ -352,10 +351,11 @@ class CudaValueType(ParamType):
    param: The parameter that is requesting completion.
    incomplete: Value being completed. May be empty.
    """
-    from openllm.utils import available_devices
+    from openllm_core.utils import available_devices
    mapping = incomplete.split(self.envvar_list_splitter) if incomplete else available_devices()
    return [sc.CompletionItem(str(i), help=f"CUDA device index {i}") for i in mapping]
  def convert(self, value: t.Any, param: click.Parameter | None, ctx: click.Context | None) -> t.Any:
+    typ = click_types.convert_type(str)
    if isinstance(value, bytes):
      enc = _get_argv_encoding()
      try: value = value.decode(enc)
@@ -365,7 +365,7 @@ class CudaValueType(ParamType):
          try: value = value.decode(fs_enc)
          except UnicodeError: value = value.decode("utf-8", "replace")
        else: value = value.decode("utf-8", "replace")
-    return tuple(self.typ(x, param, ctx) for x in value.split(","))
+    return tuple(typ(x, param, ctx) for x in value.split(","))

  def __repr__(self) -> str: return "STRING"

--- a/openllm-core/src/openllm_core/utils/import_utils.py
+++ b/openllm-core/src/openllm_core/utils/import_utils.py
@@ -1,19 +1,19 @@
 """Some imports utils are vendorred from transformers/utils/import_utils.py for performance reasons."""
 from __future__ import annotations
-import importlib, importlib.metadata, importlib.util, logging, os, abc, typing as t
+import importlib, importlib.metadata, importlib.util, logging, os, abc, typing as t, openllm_core
 from collections import OrderedDict
 import inflection, packaging.version
 from bentoml._internal.utils import LazyLoader, pkg
-from openllm._typing_compat import overload, LiteralString
+from openllm_core._typing_compat import overload, LiteralString

 from .representation import ReprMixin

 if t.TYPE_CHECKING:
  BackendOrderedDict = OrderedDict[str, t.Tuple[t.Callable[[], bool], str]]
-  from openllm._typing_compat import LiteralRuntime
+  from openllm_core._typing_compat import LiteralRuntime

 logger = logging.getLogger(__name__)
-OPTIONAL_DEPENDENCIES = {"opt", "flan-t5", "vllm", "fine-tune", "ggml", "agents", "openai", "playground", "gptq",}
+OPTIONAL_DEPENDENCIES = {"opt", "flan-t5", "vllm", "fine-tune", "ggml", "agents", "openai", "playground", "gptq"}
 ENV_VARS_TRUE_VALUES = {"1", "ON", "YES", "TRUE"}
 ENV_VARS_TRUE_AND_AUTO_VALUES = ENV_VARS_TRUE_VALUES.union({"AUTO"})
 USE_TF = os.environ.get("USE_TF", "AUTO").upper()
@@ -32,6 +32,9 @@ _torch_available = importlib.util.find_spec("torch") is not None
 _tf_available = importlib.util.find_spec("tensorflow") is not None
 _flax_available = importlib.util.find_spec("jax") is not None and importlib.util.find_spec("flax") is not None
 _vllm_available = importlib.util.find_spec("vllm") is not None
+_transformers_available = _is_package_available("transformers")
+_grpc_available = importlib.util.find_spec("grpc") is not None
+_grpc_health_available = importlib.util.find_spec("grpc_health") is not None
 _peft_available = _is_package_available("peft")
 _einops_available = _is_package_available("einops")
 _cpm_kernel_available = _is_package_available("cpm_kernels")
@@ -46,6 +49,9 @@ _sentencepiece_available = _is_package_available("sentencepiece")
 _xformers_available = _is_package_available("xformers")
 _fairscale_available = _is_package_available("fairscale")

+def is_transformers_available() -> bool: return _transformers_available
+def is_grpc_available() -> bool: return _grpc_available
+def is_grpc_health_available() -> bool: return _grpc_health_available
 def is_transformers_supports_kbit() -> bool: return pkg.pkg_version_info("transformers")[:2] >= (4, 30)
 def is_transformers_supports_agent() -> bool: return pkg.pkg_version_info("transformers")[:2] >= (4, 29)
 def is_jupyter_available() -> bool: return _jupyter_available
@@ -86,7 +92,7 @@ def is_tf_available() -> bool:
          try:
            _tf_version = importlib.metadata.version(_pkg)
            break
-          except importlib.metadata.PackageNotFoundError: pass  # noqa: PERF203 # Ok to ignore here since we actually need to check for all possible tensorflow distribution.
+          except importlib.metadata.PackageNotFoundError: pass  # Ok to ignore here since we actually need to check for all possible tensorflow distribution.
        _tf_available = _tf_version is not None
      if _tf_available:
        if _tf_version and packaging.version.parse(_tf_version) < packaging.version.parse("2"):
@@ -281,7 +287,7 @@ class EnvVarMixin(ReprMixin):
    raise KeyError(f"Key {item} not found in {self}")
  def __init__(self, model_name: str, implementation: LiteralRuntime = "pt", model_id: str | None = None, bettertransformer: bool | None = None, quantize: LiteralString | None = None, runtime: t.Literal["ggml", "transformers"] = "transformers") -> None:
    """EnvVarMixin is a mixin class that returns the value extracted from environment variables."""
-    from openllm._configuration import field_env_key
+    from openllm_core.utils import field_env_key
    self.model_name = inflection.underscore(model_name)
    self._implementation = implementation
    self._model_id = model_id
@@ -307,6 +313,6 @@ class EnvVarMixin(ReprMixin):
  @property
  def __repr_keys__(self) -> set[str]: return {"config", "model_id", "quantize", "framework", "bettertransformer", "runtime"}
  @property
-  def start_docstring(self) -> str: return getattr(self.module, f"START_{self.model_name.upper()}_COMMAND_DOCSTRING")
+  def start_docstring(self) -> str: return getattr(openllm_core.config, f"START_{self.model_name.upper()}_COMMAND_DOCSTRING")
  @property
  def module(self) -> LazyLoader: return LazyLoader(self.model_name, globals(), f"openllm.models.{self.model_name}")
--- a/openllm-core/src/openllm_core/utils/lazy.py
+++ b/openllm-core/src/openllm_core/utils/lazy.py
@@ -1,6 +1,5 @@
 from __future__ import annotations
-import functools, importlib, importlib.machinery, importlib.metadata, importlib.util, itertools, os, time, types, warnings, typing as t
-import attr, openllm
+import functools, importlib, importlib.machinery, importlib.metadata, importlib.util, itertools, os, time, types, warnings, typing as t, attr, openllm_core

 __all__ = ["VersionInfo", "LazyModule"]
 # vendorred from attrs
@@ -75,7 +74,7 @@ class LazyModule(types.ModuleType):

    It also contains a special case for all of the metadata information, such as __version__ and __version_info__.
    """
-    if name in _reserved_namespace: raise openllm.exceptions.ForbiddenAttributeError(f"'{name}' is a reserved namespace for {self._name} and should not be access nor modified.")
+    if name in _reserved_namespace: raise openllm_core.exceptions.ForbiddenAttributeError(f"'{name}' is a reserved namespace for {self._name} and should not be access nor modified.")
    dunder_to_metadata = {"__title__": "Name", "__copyright__": "", "__version__": "version", "__version_info__": "version", "__description__": "summary", "__uri__": "", "__url__": "", "__author__": "", "__email__": "", "__license__": "license", "__homepage__": ""}
    if name in dunder_to_metadata:
      if name not in {"__version_info__", "__copyright__", "__version__"}: warnings.warn(f"Accessing '{self._name}.{name}' is deprecated. Please consider using 'importlib.metadata' directly to query for openllm packaging metadata.", DeprecationWarning, stacklevel=2)
--- a/openllm-core/src/openllm_core/utils/representation.py
+++ b/openllm-core/src/openllm_core/utils/representation.py
@@ -1,9 +1,8 @@
 from __future__ import annotations
-import typing as t
 from abc import abstractmethod
-import attr, orjson
-from openllm import utils
-if t.TYPE_CHECKING: from openllm._typing_compat import TypeAlias
+import attr, orjson, typing as t
+from openllm_core import utils
+if t.TYPE_CHECKING: from openllm_core._typing_compat import TypeAlias

 ReprArgs: TypeAlias = t.Generator[t.Tuple[t.Optional[str], t.Any], None, None]
 class ReprMixin:
--- a/openllm-python/ADDING_NEW_MODEL.md
+++ b/openllm-python/ADDING_NEW_MODEL.md
@@ -8,21 +8,24 @@ environment by referring to our
 ## Procedure

 All the relevant code for incorporating a new model resides within
-[`src/openllm/models`](./src/openllm/models/) `model_name` in snake_case.
+[`$GIT_ROOT/openllm-python/src/openllm/models`](./src/openllm/models/) `model_name` in snake_case.
 Here's your roadmap:

 - [ ] Generate model configuration file:
-      `src/openllm/models/{model_name}/configuration_{model_name}.py`
+      `$GIT_ROOT/openllm-core/src/openllm_core/config/configuration_{model_name}.py`
 - [ ] Establish model implementation files:
-      `src/openllm/models/{model_name}/modeling_{runtime}_{model_name}.py`
+      `$GIT_ROOT/openllm-python/src/openllm/models/{model_name}/modeling_{runtime}_{model_name}.py`
 - [ ] Create module's `__init__.py`:
-      `src/openllm/models/{model_name}/__init__.py`
- [ ] Adjust the entrypoints for files at `src/openllm/models/auto/*` If it is a
-      new runtime, then add it a `src/openllm/models/auto/modeling_{runtime}_auto.py`.
+      `$GIT_ROOT/openllm-python/src/openllm/models/{model_name}/__init__.py`
+- [ ] Adjust the entrypoints for files at `$GIT_ROOT/openllm-python/src/openllm/models/auto/*`. If it is a
+      new runtime, then add it at `$GIT_ROOT/openllm-python/src/openllm/models/auto/modeling_{runtime}_auto.py`.
      See the other auto runtime for example.
- [ ] Modify the main `__init__.py`: `src/openllm/models/__init__.py`
+- [ ] Run the following script: `$GIT_ROOT/tools/update-models-import.py`
 - [ ] Run the following to update stubs: `hatch run check-stubs`

+> [!NOTE]
+> `$GIT_ROOT` refers to `$(git rev-parse --show-toplevel)`
+
 For a working example, check out any existing model.

 ### Model Configuration
--- a/openllm-python/pyproject.toml
+++ b/openllm-python/pyproject.toml
@@ -5,7 +5,6 @@ requires = [
  "hatchling==1.18.0",
  "hatch-vcs==0.3.0",
  "hatch-fancy-pypi-readme==23.1.0",
-  "hatch-mypyc==0.16.0",
 ]

 [project]
@@ -39,20 +38,14 @@ classifiers = [
  "Programming Language :: Python :: Implementation :: PyPy",
 ]
 dependencies = [
-  "bentoml[grpc,io]>=1.0.25",
+  "bentoml[grpc,io]>=1.1.2",
  "transformers[torch,tokenizers,accelerate]>=4.29.0",
+  "openllm-client",
  "safetensors",
  "optimum",
-  "attrs>=23.1.0",
-  "cattrs>=23.1.0",
-  "orjson",
-  "inflection",
-  "tabulate[widechars]>=0.9.0",
-  "httpx",
-  "click>=8.1.3",
-  "typing_extensions",
-  "mypy_extensions",
  "ghapi",
+  "tabulate[widechars]>=0.9.0",
+  "click>=8.1.3",
  "cuda-python;platform_system!=\"Darwin\"",
  "bitsandbytes<0.42",
 ]
@@ -102,34 +95,21 @@ Twitter = "https://twitter.com/bentomlai"

 [project.optional-dependencies]
 agents = ["transformers[agents]>=4.30", "diffusers", "soundfile"]
-all = [
-  "openllm[agents]",
-  "openllm[baichuan]",
-  "openllm[chatglm]",
-  "openllm[falcon]",
-  "openllm[fine-tune]",
-  "openllm[flan-t5]",
-  "openllm[ggml]",
-  "openllm[gptq]",
-  "openllm[llama]",
-  "openllm[mpt]",
-  "openllm[openai]",
-  "openllm[opt]",
-  "openllm[playground]",
-  "openllm[starcoder]",
-  "openllm[vllm]",
-]
+all = ["openllm[full]"]
 baichuan = ["cpm-kernels", "sentencepiece"]
 chatglm = ["cpm-kernels", "sentencepiece"]
 falcon = ["einops", "xformers"]
 fine-tune = ["peft>=0.4.0", "bitsandbytes", "datasets", "accelerate", "trl"]
-flan-t5 = ["flax", "jax", "jaxlib", "tensorflow", "keras"]
+flan-t5 = ["flax>=0.7", "jax", "jaxlib", "tensorflow", "keras"]
+full = [
+  "openllm[agents,baichuan,chatglm,falcon,fine-tune,flan-t5,ggml,gptq,llama,mpt,openai,opt,playground,starcoder,vllm]",
+]
 ggml = ["ctransformers"]
 gptq = ["auto-gptq[triton]"]
 llama = ["fairscale", "sentencepiece"]
 mpt = ["triton", "einops"]
 openai = ["openai", "tiktoken"]
-opt = ["flax", "jax", "jaxlib", "tensorflow", "keras"]
+opt = ["flax>=0.7", "jax", "jaxlib", "tensorflow", "keras"]
 playground = ["jupyter", "notebook", "ipython", "jupytext", "nbformat"]
 starcoder = ["bitsandbytes"]
 vllm = ["vllm", "ray"]
@@ -165,10 +145,10 @@ exclude = [
 [tool.hatch.build.targets.wheel.hooks.mypyc]
 dependencies = [
  "hatch-mypyc==0.16.0",
-  "mypy==1.4.1",
+  "mypy==1.5.1",
  # avoid https://github.com/pallets/click/issues/2558
  "click==8.1.3",
-  "bentoml==1.1.1",
+  "bentoml==1.1.2",
  "transformers>=4.31.0",
  "pandas-stubs",
  "types-psutil",
@@ -177,19 +157,14 @@ dependencies = [
  "types-protobuf",
 ]
 enable-by-default = false
-exclude = ["src/openllm/_service.py", "src/openllm/_typing_compat.py"]
+exclude = ["src/openllm/_service.py", "src/openllm/utils/__init__.py"]
 include = [
-  "src/openllm/models/__init__.py",
-  "src/openllm/models/auto/__init__.py",
-  "src/openllm/utils/__init__.py",
  "src/openllm/__init__.py",
-  "src/openllm/_prompt.py",
-  "src/openllm/_schema.py",
  "src/openllm/_quantisation.py",
  "src/openllm/_generation.py",
-  "src/openllm/_strategies.py",
  "src/openllm/exceptions.py",
  "src/openllm/testing.py",
+  "src/openllm/utils",
 ]
 # NOTE: This is consistent with pyproject.toml
 mypy-args = [
@@ -204,7 +179,6 @@ mypy-args = [
  "--no-warn-no-return",
  "--no-warn-unused-ignores",
  "--exclude='/src\\/openllm\\/playground\\/**'",
-  "--exclude='/src\\/openllm\\/_typing_compat\\.py$'",
 ]
 options = { verbose = true, strip_asserts = true, debug_level = "2", opt_level = "3", include_runtime_files = true }
 require-runtime-dependencies = true
--- a/openllm-python/src/openllm/__init__.py
+++ b/openllm-python/src/openllm/__init__.py
@@ -9,13 +9,18 @@ deploy, and monitor any LLMs with ease.
 * Native integration with BentoML and LangChain for custom LLM apps
 """
 from __future__ import annotations
-import logging as _logging, os as _os, typing as _t, warnings as _warnings
+import logging as _logging, os as _os, typing as _t, warnings as _warnings, openllm_core
 from pathlib import Path as _Path
 from . import exceptions as exceptions, utils as utils

-if utils.DEBUG:
-  utils.set_debug_mode(True)
-  utils.set_quiet_mode(False)
+from openllm_core._configuration import GenerationConfig as GenerationConfig, LLMConfig as LLMConfig, SamplingParams as SamplingParams
+from openllm_core._strategies import CascadingResourceStrategy as CascadingResourceStrategy, get_resource as get_resource
+from openllm_core._schema import EmbeddingsOutput as EmbeddingsOutput, GenerationInput as GenerationInput, GenerationOutput as GenerationOutput, HfAgentInput as HfAgentInput, MetadataOutput as MetadataOutput, unmarshal_vllm_outputs as unmarshal_vllm_outputs
+from openllm_core.config import AutoConfig as AutoConfig, CONFIG_MAPPING as CONFIG_MAPPING, CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES, BaichuanConfig as BaichuanConfig, ChatGLMConfig as ChatGLMConfig, DollyV2Config as DollyV2Config, FalconConfig as FalconConfig, FlanT5Config as FlanT5Config, GPTNeoXConfig as GPTNeoXConfig, LlamaConfig as LlamaConfig, MPTConfig as MPTConfig, OPTConfig as OPTConfig, StableLMConfig as StableLMConfig, StarCoderConfig as StarCoderConfig
+
+if openllm_core.utils.DEBUG:
+  openllm_core.utils.set_debug_mode(True)
+  openllm_core.utils.set_quiet_mode(False)
  _logging.basicConfig(level=_logging.NOTSET)
 else:
  # configuration for bitsandbytes before import
@@ -28,40 +33,26 @@ else:
  _warnings.filterwarnings("ignore", message="Neither GITHUB_TOKEN nor GITHUB_JWT_TOKEN found: running as unauthenticated")

 _import_structure: dict[str, list[str]] = {
-    "exceptions": [], "models": [], "client": [], "bundle": [], "playground": [], "testing": [], "utils": ["infer_auto_class"], "serialisation": ["ggml", "transformers"], "cli._sdk": ["start", "start_grpc", "build", "import_model", "list_models"],
-    "_llm": ["LLM", "Runner", "LLMRunner", "LLMRunnable", "LLMEmbeddings"], "_configuration": ["LLMConfig", "GenerationConfig", "SamplingParams"], "_generation": ["StopSequenceCriteria", "StopOnTokens", "LogitsProcessorList", "StoppingCriteriaList", "prepare_logits_processor"],
-    "_quantisation": ["infer_quantisation_config"], "_schema": ["GenerationInput", "GenerationOutput", "MetadataOutput", "EmbeddingsOutput", "unmarshal_vllm_outputs", "HfAgentInput"], "_embeddings": ["GenericEmbeddingRunnable"], "_strategies": ["CascadingResourceStrategy", "get_resource"],
-    "models.auto": ["AutoConfig", "CONFIG_MAPPING", "MODEL_MAPPING_NAMES", "MODEL_FLAX_MAPPING_NAMES", "MODEL_TF_MAPPING_NAMES", "MODEL_VLLM_MAPPING_NAMES"], "models.chatglm": ["ChatGLMConfig"], "models.baichuan": ["BaichuanConfig"], "models.dolly_v2": ["DollyV2Config"], "models.falcon": ["FalconConfig"], "models.flan_t5": ["FlanT5Config"], "models.gpt_neox": ["GPTNeoXConfig"], "models.llama": ["LlamaConfig"], "models.mpt": ["MPTConfig"], "models.opt": ["OPTConfig"], "models.stablelm": ["StableLMConfig"], "models.starcoder": ["StarCoderConfig"]
+    "exceptions": [], "models": [], "client": [], "bundle": [], "playground": [], "testing": [],
+    "utils": ["infer_auto_class"], "serialisation": ["ggml", "transformers"], "cli._sdk": ["start", "start_grpc", "build", "import_model", "list_models"], "_quantisation": ["infer_quantisation_config"], "_embeddings": ["GenericEmbeddingRunnable"],
+    "_llm": ["LLM", "Runner", "LLMRunner", "LLMRunnable", "LLMEmbeddings"], "_generation": ["StopSequenceCriteria", "StopOnTokens", "LogitsProcessorList", "StoppingCriteriaList", "prepare_logits_processor"],
+    "models.auto": ["MODEL_MAPPING_NAMES", "MODEL_FLAX_MAPPING_NAMES", "MODEL_TF_MAPPING_NAMES", "MODEL_VLLM_MAPPING_NAMES"], "models.chatglm": [], "models.baichuan": [], "models.dolly_v2": [], "models.falcon": [], "models.flan_t5": [], "models.gpt_neox": [], "models.llama": [], "models.mpt": [], "models.opt": [], "models.stablelm": [], "models.starcoder": []
 }
 COMPILED = _Path(__file__).suffix in (".pyd", ".so")

 if _t.TYPE_CHECKING:
  from . import bundle as bundle, cli as cli, client as client, models as models, playground as playground, serialisation as serialisation, testing as testing
-  from ._configuration import GenerationConfig as GenerationConfig, LLMConfig as LLMConfig, SamplingParams as SamplingParams
  from ._generation import LogitsProcessorList as LogitsProcessorList, StopOnTokens as StopOnTokens, StoppingCriteriaList as StoppingCriteriaList, StopSequenceCriteria as StopSequenceCriteria, prepare_logits_processor as prepare_logits_processor
  from ._llm import LLM as LLM, LLMEmbeddings as LLMEmbeddings, LLMRunnable as LLMRunnable, LLMRunner as LLMRunner, Runner as Runner
  from ._quantisation import infer_quantisation_config as infer_quantisation_config
-  from ._schema import EmbeddingsOutput as EmbeddingsOutput, GenerationInput as GenerationInput, GenerationOutput as GenerationOutput, HfAgentInput as HfAgentInput, MetadataOutput as MetadataOutput, unmarshal_vllm_outputs as unmarshal_vllm_outputs
  from ._embeddings import GenericEmbeddingRunnable as GenericEmbeddingRunnable
-  from ._strategies import CascadingResourceStrategy as CascadingResourceStrategy, get_resource as get_resource
  from .cli._sdk import build as build, import_model as import_model, list_models as list_models, start as start, start_grpc as start_grpc
-  from .models.auto import CONFIG_MAPPING as CONFIG_MAPPING, MODEL_FLAX_MAPPING_NAMES as MODEL_FLAX_MAPPING_NAMES, MODEL_MAPPING_NAMES as MODEL_MAPPING_NAMES, MODEL_TF_MAPPING_NAMES as MODEL_TF_MAPPING_NAMES, MODEL_VLLM_MAPPING_NAMES as MODEL_VLLM_MAPPING_NAMES, AutoConfig as AutoConfig
-  from .models.baichuan import BaichuanConfig as BaichuanConfig
-  from .models.chatglm import ChatGLMConfig as ChatGLMConfig
-  from .models.dolly_v2 import DollyV2Config as DollyV2Config
-  from .models.falcon import FalconConfig as FalconConfig
-  from .models.flan_t5 import FlanT5Config as FlanT5Config
-  from .models.gpt_neox import GPTNeoXConfig as GPTNeoXConfig
-  from .models.llama import LlamaConfig as LlamaConfig
-  from .models.mpt import MPTConfig as MPTConfig
-  from .models.opt import OPTConfig as OPTConfig
-  from .models.stablelm import StableLMConfig as StableLMConfig
-  from .models.starcoder import StarCoderConfig as StarCoderConfig
+  from .models.auto import MODEL_FLAX_MAPPING_NAMES as MODEL_FLAX_MAPPING_NAMES, MODEL_MAPPING_NAMES as MODEL_MAPPING_NAMES, MODEL_TF_MAPPING_NAMES as MODEL_TF_MAPPING_NAMES, MODEL_VLLM_MAPPING_NAMES as MODEL_VLLM_MAPPING_NAMES
  from .serialisation import ggml as ggml, transformers as transformers
  from .utils import infer_auto_class as infer_auto_class

 try:
-  if not (utils.is_torch_available() and utils.is_cpm_kernels_available()): raise exceptions.MissingDependencyError
+  if not (openllm_core.utils.is_torch_available() and openllm_core.utils.is_cpm_kernels_available()): raise exceptions.MissingDependencyError
 except exceptions.MissingDependencyError:
  _import_structure["utils.dummy_pt_objects"] = ["ChatGLM", "Baichuan"]
 else:
@@ -71,7 +62,7 @@ else:
    from .models.baichuan import Baichuan as Baichuan
    from .models.chatglm import ChatGLM as ChatGLM
 try:
-  if not (utils.is_torch_available() and utils.is_triton_available()): raise exceptions.MissingDependencyError
+  if not (openllm_core.utils.is_torch_available() and openllm_core.utils.is_triton_available()): raise exceptions.MissingDependencyError
 except exceptions.MissingDependencyError:
  if "utils.dummy_pt_objects" in _import_structure: _import_structure["utils.dummy_pt_objects"].extend(["MPT"])
  else: _import_structure["utils.dummy_pt_objects"] = ["MPT"]
@@ -79,7 +70,7 @@ else:
  _import_structure["models.mpt"].extend(["MPT"])
  if _t.TYPE_CHECKING: from .models.mpt import MPT as MPT
 try:
-  if not (utils.is_torch_available() and utils.is_einops_available()): raise exceptions.MissingDependencyError
+  if not (openllm_core.utils.is_torch_available() and openllm_core.utils.is_einops_available()): raise exceptions.MissingDependencyError
 except exceptions.MissingDependencyError:
  if "utils.dummy_pt_objects" in _import_structure: _import_structure["utils.dummy_pt_objects"].extend(["Falcon"])
  else: _import_structure["utils.dummy_pt_objects"] = ["Falcon"]
@@ -88,7 +79,7 @@ else:
  if _t.TYPE_CHECKING: from .models.falcon import Falcon as Falcon

 try:
-  if not utils.is_torch_available(): raise exceptions.MissingDependencyError
+  if not openllm_core.utils.is_torch_available(): raise exceptions.MissingDependencyError
 except exceptions.MissingDependencyError:
  _import_structure["utils.dummy_pt_objects"] = [name for name in dir(utils.dummy_pt_objects) if not name.startswith("_") and name not in ("ChatGLM", "Baichuan", "MPT", "Falcon", "annotations")]
 else:
@@ -110,7 +101,7 @@ else:
    from .models.stablelm import StableLM as StableLM
    from .models.starcoder import StarCoder as StarCoder
 try:
-  if not utils.is_vllm_available(): raise exceptions.MissingDependencyError
+  if not openllm_core.utils.is_vllm_available(): raise exceptions.MissingDependencyError
 except exceptions.MissingDependencyError:
  _import_structure["utils.dummy_vllm_objects"] = [name for name in dir(utils.dummy_vllm_objects) if not name.startswith("_") and name not in ("annotations",)]
 else:
@@ -136,7 +127,7 @@ else:
    from .models.stablelm import VLLMStableLM as VLLMStableLM
    from .models.starcoder import VLLMStarCoder as VLLMStarCoder
 try:
-  if not utils.is_flax_available(): raise exceptions.MissingDependencyError
+  if not openllm_core.utils.is_flax_available(): raise exceptions.MissingDependencyError
 except exceptions.MissingDependencyError:
  _import_structure["utils.dummy_flax_objects"] = [name for name in dir(utils.dummy_flax_objects) if not name.startswith("_") and name not in ("annotations",)]
 else:
@@ -148,7 +139,7 @@ else:
    from .models.flan_t5 import FlaxFlanT5 as FlaxFlanT5
    from .models.opt import FlaxOPT as FlaxOPT
 try:
-  if not utils.is_tf_available(): raise exceptions.MissingDependencyError
+  if not openllm_core.utils.is_tf_available(): raise exceptions.MissingDependencyError
 except exceptions.MissingDependencyError:
  _import_structure["utils.dummy_tf_objects"] = [name for name in dir(utils.dummy_tf_objects) if not name.startswith("_") and name not in ("annotations",)]
 else:
@@ -161,7 +152,7 @@ else:
    from .models.opt import TFOPT as TFOPT

 # NOTE: update this to sys.modules[__name__] once mypy_extensions can recognize __spec__
-__lazy = utils.LazyModule(__name__, _os.path.abspath("__file__"), _import_structure, extra_objects={"COMPILED": COMPILED})
+__lazy = openllm_core.utils.LazyModule(__name__, globals()["__file__"], _import_structure, extra_objects={"COMPILED": COMPILED})
 __all__ = __lazy.__all__
 __dir__ = __lazy.__dir__
 __getattr__ = __lazy.__getattr__
--- a/openllm-python/src/openllm/_llm.py
+++ b/openllm-python/src/openllm/_llm.py
@@ -1,22 +1,21 @@
 from __future__ import annotations
-import functools, inspect, logging, os, re, traceback, types, typing as t, uuid
+import functools, inspect, logging, os, re, traceback, types, typing as t, uuid, attr, fs.path, inflection, orjson, bentoml, openllm, openllm_core, gc
 from abc import ABC, abstractmethod
 from pathlib import Path
-import attr, fs.path, inflection, orjson, bentoml, openllm, gc
 from huggingface_hub import hf_hub_download
 from bentoml._internal.models.model import ModelSignature

-from ._configuration import (
+from openllm_core._configuration import (
  FineTuneConfig,
  LLMConfig,
  _object_getattribute,
  _setattr_class,
 )
 from ._quantisation import infer_quantisation_config
-from ._schema import unmarshal_vllm_outputs
+from openllm_core._schema import unmarshal_vllm_outputs
 from .exceptions import ForbiddenAttributeError, GpuNotAvailableError, OpenLLMException
 from .models.auto import AutoConfig
-from .utils import (
+from openllm_core.utils import (
  DEBUG,
  ENV_VARS_TRUE_VALUES,
  MYPY,
@@ -29,7 +28,6 @@ from .utils import (
  device_count,
  first_not_none,
  generate_hash_from_file,
-  infer_auto_class,
  is_peft_available,
  is_torch_available,
  non_intrusive_setattr,
@@ -37,8 +35,8 @@ from .utils import (
  resolve_filepath,
  validate_is_path,
 )
-
-from ._typing_compat import (
+from .utils import infer_auto_class
+from openllm_core._typing_compat import (
  AdaptersMapping,
  AdaptersTuple,
  AnyCallable,
@@ -57,8 +55,8 @@ from ._typing_compat import (

 if t.TYPE_CHECKING:
  import auto_gptq as autogptq, peft, torch, transformers, vllm
-  from ._configuration import PeftType
-  from .utils.representation import ReprArgs
+  from openllm_core._configuration import PeftType
+  from openllm_core.utils.representation import ReprArgs
 else:
  autogptq = LazyLoader("autogptq", globals(), "auto_gptq")
  vllm = LazyLoader("vllm", globals(), "vllm")
@@ -156,27 +154,6 @@ class LLMInterface(ABC, t.Generic[M, T]):
    """The iterator version of `generate` function."""
    raise NotImplementedError("Currently generate_iterator requires SSE (Server-side events) support, which is not yet implemented.")

-  def sanitize_parameters(self, prompt: str, **attrs: t.Any) -> tuple[str, DictStrAny, DictStrAny]:
-    """This handler will sanitize all attrs and setup prompt text.
-
-    It takes a prompt that is given by the user, attrs that can be parsed with the prompt.
-
-    Returns a tuple of three items:
-    - The attributes dictionary that can be passed into LLMConfig to generate a GenerationConfig
-    - The attributes dictionary that will be passed into `self.postprocess_generate`.
-    """
-    return prompt, attrs, attrs
-
-  def postprocess_generate(self, prompt: str, generation_result: t.Any, **attrs: t.Any) -> t.Any:
-    """This handler will postprocess generation results from LLM.generate and then output nicely formatted results (if the LLM decide to do so.).
-
-    You can customize how the output of the LLM looks with this hook. By default, it is a simple echo.
-
-    > [!NOTE]
-    > This will be used from the client side.
-    """
-    return generation_result
-
  def llm_post_init(self) -> None:
    """This function can be implemented if you need to initialized any additional variables that doesn't concern OpenLLM internals."""
    pass
@@ -380,9 +357,7 @@ def _make_assignment_script(cls: type[LLM[M, T]]) -> t.Callable[[type[LLM[M, T]]
    anns[key] = interface_anns.get(key)
  return codegen.generate_function(cls, "__assign_llm_attr", lines, args=("cls", *args), globs=globs, annotations=anns)

-def vllm_postprocess_generate(self: LLM["vllm.LLMEngine", T], prompt: str, generation_result: list[dict[str, t.Any]], **_: t.Any) -> str:
-  return generation_result[0]["outputs"][0]["text"]
-
+def vllm_postprocess_generate(self: LLM["vllm.LLMEngine", T], prompt: str, generation_result: list[dict[str, t.Any]], **_: t.Any) -> str: return generation_result[0]["outputs"][0]["text"]
 def vllm_generate(self: LLM["vllm.LLMEngine", T], prompt: str, **attrs: t.Any) -> list[dict[str, t.Any]]:
  outputs: list[vllm.RequestOutput] = []
  # TODO: support prompt_token_ids
@@ -430,8 +405,6 @@ class LLM(LLMInterface[M, T], ReprMixin):
    elif "config_class" not in cd: raise RuntimeError("Missing required key 'config_class'. Make sure to define it within the LLM subclass.")
    _make_assignment_script(cls)(cls)
    if "tokenizer_id" not in cd and cls.__llm_implementation__ == "vllm": cls.tokenizer_id = _DEFAULT_TOKENIZER
-
-  # fmt: off
  @overload
  def __getitem__(self, item: t.Literal["trust_remote_code"]) -> bool: ...
  @overload
@@ -459,24 +432,14 @@ class LLM(LLMInterface[M, T], ReprMixin):
    if hasattr(self, internal_attributes): return getattr(self, internal_attributes)
    elif hasattr(self, item): return getattr(self, item)
    else: raise KeyError(item)
-  @classmethod
  @overload
-  def from_pretrained(
-      cls, model_id: str | None = ..., model_version: str | None = ..., llm_config: LLMConfig | None = ..., *args: t.Any, runtime: t.Literal["ggml", "transformers"] | None = ..., quantize: t.Literal["int8", "int4"] = ..., bettertransformer: str | bool | None = ..., adapter_id: str | None = ..., adapter_name: str | None = ..., adapter_map: dict[str, str | None] | None = ...,
-      quantization_config: transformers.BitsAndBytesConfig | None = ..., serialisation: t.Literal["safetensors", "legacy"] = ..., **attrs: t.Any
-  ) -> LLM[M, T]:    ...
  @classmethod
+  def from_pretrained(cls, model_id: str | None = ..., model_version: str | None = ..., llm_config: LLMConfig | None = ..., *args: t.Any, runtime: t.Literal["ggml", "transformers"] | None = ..., quantize: t.Literal["int8", "int4"] = ..., bettertransformer: str | bool | None = ..., adapter_id: str | None = ..., adapter_name: str | None = ..., adapter_map: dict[str, str | None] | None = ..., quantization_config: transformers.BitsAndBytesConfig | None = ..., serialisation: t.Literal["safetensors", "legacy"] = ..., **attrs: t.Any) -> LLM[M, T]: ...
  @overload
-  def from_pretrained(
-      cls, model_id: str | None = ..., model_version: str | None = ..., llm_config: LLMConfig | None = ..., *args: t.Any, runtime: t.Literal["ggml", "transformers"] | None = ..., quantize: t.Literal["gptq"] = ..., bettertransformer: str | bool | None = ..., adapter_id: str | None = ..., adapter_name: str | None = ..., adapter_map: dict[str, str | None] | None = ...,
-      quantization_config: autogptq.BaseQuantizeConfig | None = ..., serialisation: t.Literal["safetensors", "legacy"] = ..., **attrs: t.Any
-  ) -> LLM[M, T]:    ...
-  # fmt: on
  @classmethod
-  def from_pretrained(
-      cls, model_id: str | None = None, model_version: str | None = None, llm_config: LLMConfig | None = None, *args: t.Any, runtime: t.Literal["ggml", "transformers"] | None = None, quantize: t.Literal["int8", "int4", "gptq"] | None = None, bettertransformer: str | bool | None = None, adapter_id: str | None = None, adapter_name: str | None = None,
-      adapter_map: dict[str, str | None] | None = None, quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None = None, serialisation: t.Literal["safetensors", "legacy"] = "safetensors", **attrs: t.Any,
-  ) -> LLM[M, T]:
+  def from_pretrained(cls, model_id: str | None = ..., model_version: str | None = ..., llm_config: LLMConfig | None = ..., *args: t.Any, runtime: t.Literal["ggml", "transformers"] | None = ..., quantize: t.Literal["gptq"] = ..., bettertransformer: str | bool | None = ..., adapter_id: str | None = ..., adapter_name: str | None = ..., adapter_map: dict[str, str | None] | None = ..., quantization_config: autogptq.BaseQuantizeConfig | None = ..., serialisation: t.Literal["safetensors", "legacy"] = ..., **attrs: t.Any) -> LLM[M, T]: ...
+  @classmethod
+  def from_pretrained(cls, model_id: str | None = None, model_version: str | None = None, llm_config: LLMConfig | None = None, *args: t.Any, runtime: t.Literal["ggml", "transformers"] | None = None, quantize: t.Literal["int8", "int4", "gptq"] | None = None, bettertransformer: str | bool | None = None, adapter_id: str | None = None, adapter_name: str | None = None, adapter_map: dict[str, str | None] | None = None, quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None = None, serialisation: t.Literal["safetensors", "legacy"] = "safetensors", **attrs: t.Any) -> LLM[M, T]:
    """Instantiate a pretrained LLM.

    ``LLM.from_pretrained`` follows the same design principle as HuggingFace's `from_pretrained` method, plus the following:
@@ -708,7 +671,6 @@ class LLM(LLMInterface[M, T], ReprMixin):
    normalized_model_kwds, normalized_tokenizer_kwds = normalize_attrs_to_model_tokenizer_pair(**attrs)
    # NOTE: Save the args and kwargs for latter load
    self.__attrs_init__(llm_config, quantization_config, model_id, _runtime, args, {**model_kwds, **normalized_model_kwds}, {**tokenizer_kwds, **normalized_tokenizer_kwds}, _tag, _adapters_mapping, _model_version, _quantize_method, _serialisation_format, _local)
-
    # handle trust_remote_code
    _from_env = os.getenv("TRUST_REMOTE_CODE", None)
    self.__llm_trust_remote_code__ = first_not_none(str(_from_env).upper() in ENV_VARS_TRUE_VALUES if _from_env else None, default=self._model_attrs.pop("trust_remote_code", self.config["trust_remote_code"]))
@@ -723,7 +685,6 @@ class LLM(LLMInterface[M, T], ReprMixin):
  def __setattr__(self, attr: str, value: t.Any) -> None:
    if attr in _reserved_namespace: raise ForbiddenAttributeError(f"{attr} should not be set during runtime as these value will be reflected during runtime. Instead, you can create a custom LLM subclass {self.__class__.__name__}.")
    super().__setattr__(attr, value)
-
  @property
  def adapters_mapping(self) -> AdaptersMapping | None: return self._adapters_mapping
  @adapters_mapping.setter
@@ -740,6 +701,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
  def runtime(self) -> t.Literal["ggml", "transformers"]: return self._runtime
  @property
  def runner_name(self) -> str: return f"llm-{self.config['start_name']}-runner"
+  # NOTE: The section below defines a loose contract with langchain's LLM interface.
  @property
  def llm_type(self) -> str: return normalise_model_name(self._model_id)
  @property
@@ -755,6 +717,27 @@ class LLM(LLMInterface[M, T], ReprMixin):
    if self.__llm_bentomodel__ is None: self.__llm_bentomodel__ = openllm.serialisation.get(self)
    return self.__llm_bentomodel__

+  def sanitize_parameters(self, prompt: str, **attrs: t.Any) -> tuple[str, DictStrAny, DictStrAny]:
+    """This handler will sanitize all attrs and setup prompt text.
+
+    It takes a prompt that is given by the user, attrs that can be parsed with the prompt.
+
+    Returns a tuple of three items, the prompt text followed by:
+    - The attributes dictionary that can be passed into LLMConfig to generate a GenerationConfig
+    - The attributes dictionary that will be passed into `self.postprocess_generate`.
+    """
+    return self.config.sanitize_parameters(prompt, **attrs)
+  def postprocess_generate(self, prompt: str, generation_result: t.Any, **attrs: t.Any) -> t.Any:
+    """This handler will postprocess generation results from LLM.generate and then output nicely formatted results (if the LLM decides to do so).
+
+    You can customize how the output of the LLM looks with this hook. By default, it is a simple echo.
+
+    > [!NOTE]
+    > This will be used from the client side.
+    """
+    if isinstance(generation_result, dict): return generation_result["text"]
+    return self.config.postprocess_generate(prompt, generation_result, **attrs)
+
  @property
  def model(self) -> M:
    # Run check for GPU
@@ -868,7 +851,7 @@ class LLM(LLMInterface[M, T], ReprMixin):

  # order of these fields matter here, make sure to sync it with
  # openllm.models.auto.factory.BaseAutoLLMClass.for_model
-  def to_runner(self, models: list[bentoml.Model] | None = None, max_batch_size: int | None = None, max_latency_ms: int | None = None, scheduling_strategy: type[bentoml.Strategy] | None = None) -> LLMRunner[M, T]:
+  def to_runner(self, models: list[bentoml.Model] | None = None, max_batch_size: int | None = None, max_latency_ms: int | None = None, scheduling_strategy: type[bentoml.Strategy] = openllm_core.CascadingResourceStrategy) -> LLMRunner[M, T]:
    """Convert this LLM into a Runner.

    Args:
@@ -894,10 +877,6 @@ class LLM(LLMInterface[M, T], ReprMixin):
    try: models.append(self._bentomodel)
    except bentoml.exceptions.NotFound as err: raise RuntimeError(f"Failed to locate {self._bentomodel}:{err}") from None

-    if scheduling_strategy is None:
-      from ._strategies import CascadingResourceStrategy
-      scheduling_strategy = CascadingResourceStrategy
-
    generate_sig = ModelSignature.from_dict(t.cast("_ModelSignatureDict", ModelSignatureDict(batchable=False)))
    embeddings_sig = ModelSignature.from_dict(t.cast("_ModelSignatureDict", ModelSignatureDict(batchable=True, batch_dim=0)))
    generate_iterator_sig = ModelSignature.from_dict(t.cast("_ModelSignatureDict", ModelSignatureDict(batchable=False)))
@@ -932,10 +911,6 @@ class LLM(LLMInterface[M, T], ReprMixin):
    for it in self.generate_iterator(prompt, **attrs): pass
    return [it]

-  def postprocess_generate(self, prompt: str, generation_result: t.Any, **attrs: t.Any) -> str:
-    if isinstance(generation_result, dict): return generation_result["text"]
-    return generation_result
-
  def generate_iterator(self, prompt: str, /,
                        *, context_length: int | None = None, echo: bool = True, stream_interval: int = 2, stop: str | t.Iterable[str] | None = None, stop_token_ids: list[int] | None = None, **attrs: t.Any) -> t.Iterator[t.Any]:
    # NOTE: encoder-decoder models will need to implement their own generate_iterator for now
--- a/openllm-python/src/openllm/_quantisation.py
+++ b/openllm-python/src/openllm/_quantisation.py
@@ -1,12 +1,11 @@
-# mypy: disable-error-code="name-defined"
+# mypy: disable-error-code="name-defined,no-redef"
 from __future__ import annotations
-import logging, sys, typing as t
-from .utils import LazyLoader, is_autogptq_available, is_bitsandbytes_available, is_transformers_supports_kbit, pkg
-if sys.version_info[:2] >= (3, 11): from typing import overload
-else: from typing_extensions import overload
+import logging, typing as t
+from openllm_core.utils import LazyLoader, is_autogptq_available, is_bitsandbytes_available, is_transformers_supports_kbit, pkg
+from openllm_core._typing_compat import overload
 if t.TYPE_CHECKING:
  from ._llm import LLM
-  from ._typing_compat import DictStrAny
+  from openllm_core._typing_compat import DictStrAny

 autogptq, torch, transformers = LazyLoader("autogptq", globals(), "auto_gptq"), LazyLoader("torch", globals(), "torch"), LazyLoader("transformers", globals(), "transformers")

--- a/openllm-python/src/openllm/bundle/init.py
+++ b/openllm-python/src/openllm/bundle/init.py
@@ -4,15 +4,12 @@ These utilities will stay internal, and its API can be changed or updated withou
 """
 from __future__ import annotations
 import os, typing as t
-from openllm.utils import LazyModule
+from openllm_core.utils import LazyModule

 _import_structure: dict[str, list[str]] = {"_package": ["create_bento", "build_editable", "construct_python_options", "construct_docker_options"], "oci": ["CONTAINER_NAMES", "get_base_container_tag", "build_container", "get_base_container_name", "supported_registries", "RefResolver"]}

 if t.TYPE_CHECKING:
-  from . import (
-    _package as _package,
-    oci as oci,
-  )
+  from . import _package as _package, oci as oci
  from ._package import (
    build_editable as build_editable,
    construct_docker_options as construct_docker_options,
@@ -28,7 +25,7 @@ if t.TYPE_CHECKING:
    supported_registries as supported_registries,
  )

-__lazy=LazyModule(__name__, os.path.abspath("__file__"), _import_structure)
-__all__=__lazy.__all__
-__dir__=__lazy.__dir__
-__getattr__=__lazy.__getattr__
+__lazy = LazyModule(__name__, os.path.abspath("__file__"), _import_structure)
+__all__ = __lazy.__all__
+__dir__ = __lazy.__dir__
+__getattr__ = __lazy.__getattr__
--- a/openllm-python/src/openllm/bundle/_package.py
+++ b/openllm-python/src/openllm/bundle/_package.py
@@ -1,35 +1,34 @@
 # mypy: disable-error-code="misc"
 from __future__ import annotations
-import importlib.metadata, inspect, logging, os, typing as t
+import fs, fs.copy, fs.errors, orjson, bentoml, openllm_core, importlib.metadata, inspect, logging, os, typing as t, string
 from pathlib import Path
-import fs, fs.copy, fs.errors, orjson, bentoml, openllm
 from simple_di import Provide, inject
 from bentoml._internal.bento.build_config import BentoBuildConfig, DockerOptions, ModelSpec, PythonOptions
 from bentoml._internal.configuration.containers import BentoMLContainer
 from . import oci

 if t.TYPE_CHECKING:
+  import openllm
  from fs.base import FS
-  from openllm._typing_compat import LiteralString
+  from openllm_core._typing_compat import LiteralString, LiteralContainerRegistry, LiteralContainerVersionStrategy
  from bentoml._internal.bento import BentoStore
  from bentoml._internal.models.model import ModelStore
-  from .oci import LiteralContainerRegistry, LiteralContainerVersionStrategy

 logger = logging.getLogger(__name__)

 OPENLLM_DEV_BUILD = "OPENLLM_DEV_BUILD"

-def build_editable(path: str) -> str | None:
+def build_editable(path: str, package: t.Literal["openllm", "openllm_core", "openllm_client"] = "openllm") -> str | None:
  """Build OpenLLM if the OPENLLM_DEV_BUILD environment variable is set."""
  if str(os.environ.get(OPENLLM_DEV_BUILD, False)).lower() != "true": return None
  # We need to build the package in editable mode, so that we can import it
  from build import ProjectBuilder
  from build.env import IsolatedEnvBuilder
-  module_location = openllm.utils.pkg.source_locations("openllm")
+  module_location = openllm_core.utils.pkg.source_locations(package)
  if not module_location: raise RuntimeError("Could not find the source location of OpenLLM. Make sure to unset OPENLLM_DEV_BUILD if you are developing OpenLLM.")
  pyproject_path = Path(module_location).parent.parent/"pyproject.toml"
  if os.path.isfile(pyproject_path.__fspath__()):
-    logger.info("OpenLLM is installed in editable mode. Generating built wheels...")
+    logger.info("Generating built wheels for package %s...", package)
    with IsolatedEnvBuilder() as env:
      builder = ProjectBuilder(pyproject_path.parent)
      builder.python_executable = env.executable
@@ -49,15 +48,15 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any], llm_fs: FS, extra_d

  req = llm.config["requirements"]
  if req is not None: packages.extend(req)
-  if str(os.environ.get("BENTOML_BUNDLE_LOCAL_BUILD", False)).lower() == "false": packages.append(f"bentoml>={'.'.join([str(i) for i in openllm.utils.pkg.pkg_version_info('bentoml')])}")
+  if str(os.environ.get("BENTOML_BUNDLE_LOCAL_BUILD", False)).lower() == "false": packages.append(f"bentoml>={'.'.join([str(i) for i in openllm_core.utils.pkg.pkg_version_info('bentoml')])}")

  env = llm.config["env"]
  framework_envvar = env["framework_value"]
  if framework_envvar == "flax":
-    if not openllm.utils.is_flax_available(): raise ValueError(f"Flax is not available, while {env.framework} is set to 'flax'")
+    if not openllm_core.utils.is_flax_available(): raise ValueError(f"Flax is not available, while {env.framework} is set to 'flax'")
    packages.extend([importlib.metadata.version("flax"), importlib.metadata.version("jax"), importlib.metadata.version("jaxlib")])
  elif framework_envvar == "tf":
-    if not openllm.utils.is_tf_available(): raise ValueError(f"TensorFlow is not available, while {env.framework} is set to 'tf'")
+    if not openllm_core.utils.is_tf_available(): raise ValueError(f"TensorFlow is not available, while {env.framework} is set to 'tf'")
    candidates = ("tensorflow", "tensorflow-cpu", "tensorflow-gpu", "tf-nightly", "tf-nightly-cpu", "tf-nightly-gpu", "intel-tensorflow", "intel-tensorflow-avx512", "tensorflow-rocm", "tensorflow-macos",)
    # For the metadata, we have to look for both tensorflow and tensorflow-cpu
    for candidate in candidates:
@@ -68,19 +67,19 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any], llm_fs: FS, extra_d
          _tf_version = importlib.metadata.version(candidate)
          packages.extend([f"tensorflow>={_tf_version}"])
        break
-      except importlib.metadata.PackageNotFoundError: pass  # noqa: PERF203 # Ok to ignore here since we actually need to check for all possible tensorflow distribution.
+      except importlib.metadata.PackageNotFoundError: pass  # Ok to ignore here since we actually need to check for all possible tensorflow distributions.
  else:
-    if not openllm.utils.is_torch_available(): raise ValueError("PyTorch is not available. Make sure to have it locally installed.")
+    if not openllm_core.utils.is_torch_available(): raise ValueError("PyTorch is not available. Make sure to have it locally installed.")
    packages.extend([f'torch>={importlib.metadata.version("torch")}'])
  wheels: list[str] = []
-  built_wheels = build_editable(llm_fs.getsyspath("/"))
-  if built_wheels is not None: wheels.append(llm_fs.getsyspath(f"/{built_wheels.split('/')[-1]}"))
+  built_wheels: list[str | None] = [build_editable(llm_fs.getsyspath("/"), t.cast(t.Literal["openllm", "openllm_core", "openllm_client"], p)) for p in ("openllm_core", "openllm_client", "openllm")]
+  if all(i for i in built_wheels): wheels.extend([llm_fs.getsyspath(f"/{i.split('/')[-1]}") for i in t.cast(t.List[str], built_wheels)])
  return PythonOptions(packages=packages, wheels=wheels, lock_packages=False, extra_index_url=["https://download.pytorch.org/whl/cu118"])

 def construct_docker_options(llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_resource: float, quantize: LiteralString | None, bettertransformer: bool | None, adapter_map: dict[str, str | None] | None, dockerfile_template: str | None, runtime: t.Literal["ggml", "transformers"], serialisation_format: t.Literal["safetensors", "legacy"], container_registry: LiteralContainerRegistry, container_version_strategy: LiteralContainerVersionStrategy) -> DockerOptions:
  from openllm.cli._factory import parse_config_options
  environ = parse_config_options(llm.config, llm.config["timeout"], workers_per_resource, None, True, os.environ.copy())
-  env: openllm.utils.EnvVarMixin = llm.config["env"]
+  env: openllm_core.utils.EnvVarMixin = llm.config["env"]
  if env["framework_value"] == "vllm": serialisation_format = "legacy"
  env_dict = {
      env.framework: env["framework_value"], env.config: f"'{llm.config.model_dump_json().decode()}'",
@@ -91,13 +90,45 @@ def construct_docker_options(llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_
  if adapter_map: env_dict["BITSANDBYTES_NOWELCOME"] = os.environ.get("BITSANDBYTES_NOWELCOME", "1")

  # We need to handle None separately here, as env from subprocess doesn't accept None value.
-  _env = openllm.utils.EnvVarMixin(llm.config["model_name"], bettertransformer=bettertransformer, quantize=quantize, runtime=runtime)
+  _env = openllm_core.utils.EnvVarMixin(llm.config["model_name"], bettertransformer=bettertransformer, quantize=quantize, runtime=runtime)

  env_dict[_env.bettertransformer] = str(_env["bettertransformer_value"])
  if _env["quantize_value"] is not None: env_dict[_env.quantize] = t.cast(str, _env["quantize_value"])
  env_dict[_env.runtime] = _env["runtime_value"]
  return DockerOptions(base_image=f"{oci.CONTAINER_NAMES[container_registry]}:{oci.get_base_container_tag(container_version_strategy)}", env=env_dict, dockerfile_template=dockerfile_template)

+OPENLLM_MODEL_NAME = "# openllm: model name"
+OPENLLM_MODEL_ADAPTER_MAP = "# openllm: model adapter map"
+class ModelNameFormatter(string.Formatter):
+  model_keyword: LiteralString = "__model_name__"
+  def __init__(self, model_name: str):
+    """The formatter that extends model_name to be formatted into the 'service.py'."""
+    super().__init__()
+    self.model_name = model_name
+  def vformat(self, format_string: str, *args: t.Any, **attrs: t.Any) -> t.Any: return super().vformat(format_string, (), {self.model_keyword: self.model_name})
+  def can_format(self, value: str) -> bool:
+    try:
+      self.parse(value)
+      return True
+    except ValueError: return False
+class ModelIdFormatter(ModelNameFormatter):
+  model_keyword: LiteralString = "__model_id__"
+class ModelAdapterMapFormatter(ModelNameFormatter):
+  model_keyword: LiteralString = "__model_adapter_map__"
+
+_service_file = Path(os.path.abspath(__file__)).parent.parent/"_service.py"
+def write_service(llm: openllm.LLM[t.Any, t.Any], adapter_map: dict[str, str | None] | None, llm_fs: FS) -> None:
+  from openllm_core.utils import DEBUG
+  model_name = llm.config["model_name"]
+  logger.debug("Generating service file for %s at %s (dir=%s)", model_name, llm.config["service_name"], llm_fs.getsyspath("/"))
+  with open(_service_file.__fspath__(), "r") as f: src_contents = f.readlines()
+  for it in src_contents:
+    if OPENLLM_MODEL_NAME in it: src_contents[src_contents.index(it)] = (ModelNameFormatter(model_name).vformat(it)[:-(len(OPENLLM_MODEL_NAME) + 3)] + "\n")
+    elif OPENLLM_MODEL_ADAPTER_MAP in it: src_contents[src_contents.index(it)] = (ModelAdapterMapFormatter(orjson.dumps(adapter_map).decode()).vformat(it)[:-(len(OPENLLM_MODEL_ADAPTER_MAP) + 3)] + "\n")
+  script = f"# GENERATED BY 'openllm build {model_name}'. DO NOT EDIT\n\n" + "".join(src_contents)
+  if DEBUG: logger.info("Generated script:\n%s", script)
+  llm_fs.writetext(llm.config["service_name"], script)
+
 @inject
 def create_bento(bento_tag: bentoml.Tag, llm_fs: FS, llm: openllm.LLM[t.Any, t.Any], workers_per_resource: str | float, quantize: LiteralString | None, bettertransformer: bool | None, dockerfile_template: str | None, adapter_map: dict[str, str | None] | None = None, extra_dependencies: tuple[str, ...] | None = None,
                runtime: t.Literal[ "ggml", "transformers"] = "transformers", serialisation_format: t.Literal["safetensors", "legacy"] = "safetensors", container_registry: LiteralContainerRegistry = "ecr", container_version_strategy: LiteralContainerVersionStrategy = "release",
@@ -108,14 +139,14 @@ def create_bento(bento_tag: bentoml.Tag, llm_fs: FS, llm: openllm.LLM[t.Any, t.A
  if adapter_map: labels.update(adapter_map)
  if isinstance(workers_per_resource, str):
    if workers_per_resource == "round_robin": workers_per_resource = 1.0
-    elif workers_per_resource == "conserved": workers_per_resource = 1.0 if openllm.utils.device_count() == 0 else float(1 / openllm.utils.device_count())
+    elif workers_per_resource == "conserved": workers_per_resource = 1.0 if openllm_core.utils.device_count() == 0 else float(1 / openllm_core.utils.device_count())
    else:
      try: workers_per_resource = float(workers_per_resource)
      except ValueError: raise ValueError("'workers_per_resource' only accept ['round_robin', 'conserved'] as possible strategies.") from None
  elif isinstance(workers_per_resource, int): workers_per_resource = float(workers_per_resource)
  logger.info("Building Bento for '%s'", llm.config["start_name"])
  # add service.py definition to this temporary folder
-  openllm.utils.codegen.write_service(llm, adapter_map, llm_fs)
+  write_service(llm, adapter_map, llm_fs)

  llm_spec = ModelSpec.from_item({"tag": str(llm.tag), "alias": llm.tag.name})
  build_config = BentoBuildConfig(
@@ -134,7 +165,7 @@ def create_bento(bento_tag: bentoml.Tag, llm_fs: FS, llm: openllm.LLM[t.Any, t.A
    if "__bento_name__" in it: service_contents[service_contents.index(it)] = it.format(__bento_name__=str(bento.tag))

  script = "".join(service_contents)
-  if openllm.utils.DEBUG: logger.info("Generated script:\n%s", script)
+  if openllm_core.utils.DEBUG: logger.info("Generated script:\n%s", script)

  bento._fs.writetext(service_fs_path, script)
  if "model_store" in inspect.signature(bento.save).parameters: return bento.save(bento_store=_bento_store, model_store=_model_store)
--- a/openllm-python/src/openllm/bundle/oci/init.py
+++ b/openllm-python/src/openllm/bundle/oci/init.py
@@ -1,26 +1,23 @@
 # mypy: disable-error-code="misc"
 """OCI-related utilities for OpenLLM. This module is considered to be internal and API are subjected to change."""
 from __future__ import annotations
-import functools, importlib, logging, os, pathlib, shutil, subprocess, typing as t
+import functools, importlib, logging, os, pathlib, shutil, subprocess, typing as t, openllm_core
 from datetime import datetime, timedelta, timezone
 import attr, orjson, bentoml, openllm
-from openllm.utils.lazy import VersionInfo
+from openllm_core.utils.lazy import VersionInfo

 if t.TYPE_CHECKING:
+  from openllm_core._typing_compat import LiteralContainerRegistry, LiteralContainerVersionStrategy
  from ghapi import all
-  from openllm._typing_compat import RefTuple, LiteralString
+  from openllm_core._typing_compat import RefTuple, LiteralString

-all = openllm.utils.LazyLoader("all", globals(), "ghapi.all")  # noqa: F811
+all = openllm_core.utils.LazyLoader("all", globals(), "ghapi.all")  # noqa: F811

 logger = logging.getLogger(__name__)

 _BUILDER = bentoml.container.get_backend("buildx")
 ROOT_DIR = pathlib.Path(os.path.abspath("__file__")).parent.parent.parent

-# TODO: support quay
-LiteralContainerRegistry = t.Literal["docker", "gh", "ecr"]
-LiteralContainerVersionStrategy = t.Literal["release", "nightly", "latest", "custom"]
-
 # XXX: This registry will be hard code for now for easier to maintain
 # but in the future, we can infer based on git repo and everything to make it more options for users
 # to build the base image. For now, all of the base image will be <registry>/bentoml/openllm:...
@@ -31,10 +28,10 @@ _CONTAINER_REGISTRY: dict[LiteralContainerRegistry, str] = {"docker": "docker.io
 _OWNER = "bentoml"
 _REPO = "openllm"

-_module_location = openllm.utils.pkg.source_locations("openllm")
+_module_location = openllm_core.utils.pkg.source_locations("openllm")

 @functools.lru_cache
-@openllm.utils.apply(str.lower)
+@openllm_core.utils.apply(str.lower)
 def get_base_container_name(reg: LiteralContainerRegistry) -> str: return _CONTAINER_REGISTRY[reg]

 def _convert_version_from_string(s: str) -> VersionInfo: return VersionInfo.from_version_string(s)
@@ -43,7 +40,7 @@ def _commit_time_range(r: int = 5) -> str: return (datetime.now(timezone.utc) -
 class VersionNotSupported(openllm.exceptions.OpenLLMException):
  """Raised when the stable release is too low that it doesn't include OpenLLM base container."""

-_RefTuple: type[RefTuple] = openllm.utils.codegen.make_attr_tuple_class("_RefTuple", ["git_hash", "version", "strategy"])
+_RefTuple: type[RefTuple] = openllm_core.utils.codegen.make_attr_tuple_class("_RefTuple", ["git_hash", "version", "strategy"])

 def nightly_resolver(cls: type[RefResolver]) -> str:
  # NOTE: all openllm container will have sha-<git_hash[:7]>
@@ -60,7 +57,7 @@ def nightly_resolver(cls: type[RefResolver]) -> str:
 @attr.attrs(eq=False, order=False, slots=True, frozen=True)
 class RefResolver:
  git_hash: str = attr.field()
-  version: openllm.utils.VersionInfo = attr.field(converter=_convert_version_from_string)
+  version: openllm_core.utils.VersionInfo = attr.field(converter=_convert_version_from_string)
  strategy: LiteralContainerVersionStrategy = attr.field()
  _ghapi: t.ClassVar[all.GhApi] = all.GhApi(owner=_OWNER, repo=_REPO)
  @classmethod
@@ -74,7 +71,7 @@ class RefResolver:
      version_str = meta["name"].lstrip("v")
      version: tuple[str, str | None] = (cls._ghapi.git.get_ref(ref=f"tags/{meta['name']}")["object"]["sha"], version_str)
    else: version = ("", version_str)
-    if openllm.utils.VersionInfo.from_version_string(t.cast(str, version_str)) < (0, 2, 12): raise VersionNotSupported(f"Version {version_str} doesn't support OpenLLM base container. Consider using 'nightly' or upgrade 'openllm>=0.2.12'")
+    if openllm_core.utils.VersionInfo.from_version_string(t.cast(str, version_str)) < (0, 2, 12): raise VersionNotSupported(f"Version {version_str} doesn't support OpenLLM base container. Consider using 'nightly' or upgrade 'openllm>=0.2.12'")
    return _RefTuple((*version, "release" if _use_base_strategy else "custom"))
  @classmethod
  @functools.lru_cache(maxsize=64)
@@ -101,7 +98,7 @@ def build_container(registries: LiteralContainerRegistry | t.Sequence[LiteralCon
  try:
    if not _BUILDER.health(): raise openllm.exceptions.Error
  except (openllm.exceptions.Error, subprocess.CalledProcessError): raise RuntimeError("Building base container requires BuildKit (via Buildx) to be installed. See https://docs.docker.com/build/buildx/install/ for instalation instruction.") from None
-  if openllm.utils.device_count() == 0: raise RuntimeError("Building base container requires GPUs (None available)")
+  if openllm_core.utils.device_count() == 0: raise RuntimeError("Building base container requires GPUs (None available)")
  if not shutil.which("nvidia-container-runtime"): raise RuntimeError("NVIDIA Container Toolkit is required to compile CUDA kernel in container.")
  if not _module_location: raise RuntimeError("Failed to determine source location of 'openllm'. (Possible broken installation)")
  pyproject_path = pathlib.Path(_module_location).parent.parent / "pyproject.toml"
@@ -111,7 +108,7 @@ def build_container(registries: LiteralContainerRegistry | t.Sequence[LiteralCon
    registries = [registries] if isinstance(registries, str) else list(registries)
    tags = {name: f"{_CONTAINER_REGISTRY[name]}:{get_base_container_tag(version_strategy)}" for name in registries}
  try:
-    outputs = _BUILDER.build(file=pathlib.Path(__file__).parent.joinpath("Dockerfile").resolve().__fspath__(), context_path=pyproject_path.parent.__fspath__(), tag=tuple(tags.values()), push=push, progress="plain" if openllm.utils.get_debug_mode() else "auto", quiet=machine)
+    outputs = _BUILDER.build(file=pathlib.Path(__file__).parent.joinpath("Dockerfile").resolve().__fspath__(), context_path=pyproject_path.parent.__fspath__(), tag=tuple(tags.values()), push=push, progress="plain" if openllm_core.utils.get_debug_mode() else "auto", quiet=machine)
    if machine and outputs is not None: tags["image_sha"] = outputs.decode("utf-8").strip()
  except Exception as err: raise openllm.exceptions.OpenLLMException(f"Failed to containerize base container images (Scroll up to see error above, or set OPENLLMDEVDEBUG=True for more traceback):\n{err}") from err
  return tags
--- a/openllm-python/src/openllm/cli/_factory.py
+++ b/openllm-python/src/openllm/cli/_factory.py
@@ -1,16 +1,16 @@
 from __future__ import annotations
-import functools, importlib.util, os, typing as t, logging
-import click, click_option_group as cog, inflection, orjson, bentoml, openllm
+import functools, importlib.util, os, typing as t, logging, click, click_option_group as cog, inflection, orjson, bentoml, openllm
+from click import shell_completion as sc
 from bentoml_cli.utils import BentoMLCommandGroup
 from click.shell_completion import CompletionItem
-from openllm.utils import DEBUG
+from openllm_core.utils import DEBUG
 from bentoml._internal.configuration.containers import BentoMLContainer
-from openllm._typing_compat import LiteralString, DictStrAny, ParamSpec, Concatenate
+from openllm_core._typing_compat import LiteralString, DictStrAny, ParamSpec, Concatenate
 from . import termui

 if t.TYPE_CHECKING:
  import subprocess
-  from openllm._configuration import LLMConfig
+  from openllm_core._configuration import LLMConfig

 logger = logging.getLogger(__name__)

@@ -20,6 +20,12 @@ LiteralOutput = t.Literal["json", "pretty", "porcelain"]
 _AnyCallable = t.Callable[..., t.Any]
 FC = t.TypeVar("FC", bound=t.Union[_AnyCallable, click.Command])

+def bento_complete_envvar(ctx: click.Context, param: click.Parameter, incomplete: str) -> list[sc.CompletionItem]:
+  return [sc.CompletionItem(tag, help="Bento") for bento in bentoml.list() if (tag := str(bento.tag)).startswith(incomplete) and all(label in bento.info.labels for label in ("start_name", "bundler"))]  # shell completion: offer only Bentos whose tag starts with the typed text and whose labels contain both 'start_name' and 'bundler'
+
+def model_complete_envvar(ctx: click.Context, param: click.Parameter, incomplete: str) -> list[sc.CompletionItem]:
+  return [sc.CompletionItem(name, help="Model") for name in map(inflection.dasherize, openllm.CONFIG_MAPPING) if name.startswith(inflection.dasherize(incomplete))]  # shell completion for model names: compare dasherized forms on both sides, because the CLI accepts the dasherized name and a typed prefix containing '-' would never match the raw (underscore) CONFIG_MAPPING key
+
 def parse_config_options(config: LLMConfig, server_timeout: int, workers_per_resource: float, device: t.Tuple[str, ...] | None, cors: bool, environ: DictStrAny) -> DictStrAny:
  # TODO: Support amd.com/gpu on k8s
  _bentoml_config_options_env = environ.pop("BENTOML_CONFIG_OPTIONS", "")
@@ -316,7 +322,7 @@ def cors_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC
 def machine_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]: return cli_option("--machine", is_flag=True, default=False, hidden=True, **attrs)(f)
 def model_id_option(f: _AnyCallable | None = None, *, model_env: openllm.utils.EnvVarMixin | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]: return cli_option("--model-id", type=click.STRING, default=None, envvar=model_env.model_id if model_env is not None else None, show_envvar=model_env is not None, help="Optional model_id name or path for (fine-tune) weight.", **attrs)(f)
 def model_version_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]: return cli_option("--model-version", type=click.STRING, default=None, help="Optional model version to save for this model. It will be inferred automatically from model-id.", **attrs)(f)
-def model_name_argument(f: _AnyCallable | None = None, required: bool = True) -> t.Callable[[FC], FC]: return cli_argument("model_name", type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING]), required=required)(f)
+def model_name_argument(f: _AnyCallable | None = None, required: bool = True, **attrs: t.Any) -> t.Callable[[FC], FC]: return cli_argument("model_name", type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING]), required=required, **attrs)(f)  # 'model_name' argument restricted to the dasherized CONFIG_MAPPING names; extra keyword arguments (e.g. shell_complete) are forwarded to cli_argument
 def quantize_option(f: _AnyCallable | None = None, *, build: bool = False, model_env: openllm.utils.EnvVarMixin | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
  return cli_option(
      "--quantise", "--quantize", "quantize", type=click.Choice(["int8", "int4", "gptq"]), default=None, envvar=model_env.quantize if model_env is not None else None, show_envvar=model_env is not None, help="""Dynamic quantization for running this LLM.
@@ -382,7 +388,7 @@ def serialisation_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Cal
  )(f)
 def container_registry_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
  return cli_option(
-      "--container-registry", "container_registry", type=str, default="ecr", show_default=True, show_envvar=True, envvar="OPENLLM_CONTAINER_REGISTRY", callback=container_registry_callback, help="""The default container registry to get the base image for building BentoLLM.
+      "--container-registry", "container_registry", type=click.Choice(list(openllm.bundle.CONTAINER_NAMES)), default="ecr", show_default=True, show_envvar=True, envvar="OPENLLM_CONTAINER_REGISTRY", callback=container_registry_callback, help="""The default container registry to get the base image for building BentoLLM.

                                                                                                                        Currently, it supports 'ecr', 'ghcr.io', 'docker.io'

--- a/openllm-python/src/openllm/cli/_sdk.py
+++ b/openllm-python/src/openllm/cli/_sdk.py
@@ -1,6 +1,5 @@
 from __future__ import annotations
-import itertools, logging, os, re, subprocess, sys, typing as t
-import bentoml, openllm
+import itertools, logging, os, re, subprocess, sys, typing as t, bentoml, openllm, openllm_core
 from simple_di import Provide, inject
 from bentoml._internal.configuration.containers import BentoMLContainer
 from openllm.exceptions import OpenLLMException
@@ -8,10 +7,9 @@ from . import termui
 from ._factory import start_command_factory

 if t.TYPE_CHECKING:
-  from openllm._typing_compat import LiteralString, LiteralRuntime
+  from openllm_core._configuration import LLMConfig
+  from openllm_core._typing_compat import LiteralString, LiteralRuntime, LiteralContainerRegistry, LiteralContainerVersionStrategy
  from bentoml._internal.bento import BentoStore
-  from openllm._configuration import LLMConfig
-  from openllm.bundle.oci import LiteralContainerRegistry, LiteralContainerVersionStrategy

 logger = logging.getLogger(__name__)

@@ -58,7 +56,7 @@ def _start(model_name: str, /, *, model_id: str | None = None, timeout: int = 30
  """
  from .entrypoint import start_command, start_grpc_command
  llm_config = openllm.AutoConfig.for_model(model_name)
-  _ModelEnv = openllm.utils.EnvVarMixin(model_name, openllm.utils.first_not_none(framework, default=llm_config.default_implementation()), model_id=model_id, bettertransformer=bettertransformer, quantize=quantize, runtime=runtime)
+  _ModelEnv = openllm_core.utils.EnvVarMixin(model_name, openllm_core.utils.first_not_none(framework, default=llm_config.default_implementation()), model_id=model_id, bettertransformer=bettertransformer, quantize=quantize, runtime=runtime)
  os.environ[_ModelEnv.framework] = _ModelEnv["framework_value"]

  args: list[str] = ["--runtime", runtime]
@@ -203,5 +201,5 @@ def _list_models() -> dict[str, t.Any]:
  return models_command.main(args=["-o", "json", "--show-available", "--machine"], standalone_mode=False)


-start, start_grpc, build, import_model, list_models = openllm.utils.codegen.gen_sdk(_start, _serve_grpc=False), openllm.utils.codegen.gen_sdk(_start, _serve_grpc=True), openllm.utils.codegen.gen_sdk(_build), openllm.utils.codegen.gen_sdk(_import_model), openllm.utils.codegen.gen_sdk(_list_models)
+start, start_grpc, build, import_model, list_models = openllm_core.utils.codegen.gen_sdk(_start, _serve_grpc=False), openllm_core.utils.codegen.gen_sdk(_start, _serve_grpc=True), openllm_core.utils.codegen.gen_sdk(_build), openllm_core.utils.codegen.gen_sdk(_import_model), openllm_core.utils.codegen.gen_sdk(_list_models)
 __all__ = ["start", "start_grpc", "build", "import_model", "list_models"]
--- a/openllm-python/src/openllm/cli/entrypoint.py
+++ b/openllm-python/src/openllm/cli/entrypoint.py
@@ -20,10 +20,9 @@ bentomodel = openllm.import_model("falcon", model_id='tiiuae/falcon-7b-instruct'
 ```
 """
 from __future__ import annotations
-import functools, http.client, inspect, itertools, logging, os, platform, re, subprocess, sys, time, traceback, typing as t
-import attr, click, click_option_group as cog, fs, fs.copy, fs.errors, inflection, orjson, bentoml, openllm
-from bentoml_cli.utils import BentoMLCommandGroup, opt_callback
+import functools, http.client, inspect, itertools, logging, os, platform, re, subprocess, sys, time, traceback, typing as t, attr, click, click_option_group as cog, fs, fs.copy, fs.errors, inflection, orjson, bentoml, openllm
 from simple_di import Provide, inject
+from bentoml_cli.utils import BentoMLCommandGroup, opt_callback
 from bentoml._internal.configuration.containers import BentoMLContainer
 from bentoml._internal.models.model import ModelStore
 from . import termui
@@ -56,8 +55,8 @@ from openllm.models.auto import (
  AutoConfig,
  AutoLLM,
 )
-from openllm._typing_compat import DictStrAny, ParamSpec, Concatenate, LiteralString, Self, LiteralRuntime
-from openllm.utils import (
+from openllm_core._typing_compat import DictStrAny, ParamSpec, Concatenate, LiteralString, Self, LiteralRuntime
+from openllm_core.utils import (
  DEBUG,
  DEBUG_ENV_VAR,
  OPTIONAL_DEPENDENCIES,
@@ -72,21 +71,20 @@ from openllm.utils import (
  first_not_none,
  get_debug_mode,
  get_quiet_mode,
-  infer_auto_class,
  is_torch_available,
  is_transformers_supports_agent,
  resolve_user_filepath,
  set_debug_mode,
  set_quiet_mode,
 )
+from openllm.utils import infer_auto_class

 if t.TYPE_CHECKING:
  import torch
  from bentoml._internal.bento import BentoStore
  from bentoml._internal.container import DefaultBuilder
-  from openllm.client import BaseClient
-  from openllm._schema import EmbeddingsOutput
-  from openllm.bundle.oci import LiteralContainerRegistry, LiteralContainerVersionStrategy
+  from openllm_core._schema import EmbeddingsOutput
+  from openllm_core._typing_compat import LiteralContainerRegistry, LiteralContainerVersionStrategy
 else: torch = LazyLoader("torch", globals(), "torch")

 P = ParamSpec("P")
@@ -271,7 +269,7 @@ def cli() -> None:
  \b
  An open platform for operating large language models in production.
  Fine-tune, serve, deploy, and monitor any LLMs with ease.
-  """  # noqa: D205
+  """

@cli.group(cls=OpenLLMCommandGroup, context_settings=termui.CONTEXT_SETTINGS, name="start", aliases=["start-http"])
 def start_command() -> None:
@@ -670,10 +668,8 @@ def instruct_command(endpoint: str, timeout: int, agent: LiteralString, output:
  """
  client = openllm.client.HTTPClient(endpoint, timeout=timeout)

-  try:
-    client.call("metadata")
-  except http.client.BadStatusLine:
-    raise click.ClickException(f"{endpoint} is neither a HTTP server nor reachable.") from None
+  try: client.call("metadata")
+  except http.client.BadStatusLine: raise click.ClickException(f"{endpoint} is neither a HTTP server nor reachable.") from None
  if agent == "hf":
    if not is_transformers_supports_agent(): raise click.UsageError("Transformers version should be at least 4.29 to support HfAgent. Upgrade with 'pip install -U transformers'")
    _memoized = {k: v[0] for k, v in _memoized.items() if v}
@@ -700,7 +696,7 @@ def embed_command(ctx: click.Context, text: tuple[str, ...], endpoint: str, time
  $ openllm embed --endpoint http://12.323.2.1:3000 "What is the meaning of life?" "How many stars are there in the sky?"
  ```
  """
-  client = t.cast("BaseClient[t.Any]", openllm.client.HTTPClient(endpoint, timeout=timeout) if server_type == "http" else openllm.client.GrpcClient(endpoint, timeout=timeout))
+  client = openllm.client.HTTPClient(endpoint, timeout=timeout) if server_type == "http" else openllm.client.GrpcClient(endpoint, timeout=timeout)
  try:
    gen_embed = client.embed(text)
  except ValueError:
@@ -733,14 +729,14 @@ def query_command(ctx: click.Context, /, prompt: str, endpoint: str, timeout: in
  """
  _memoized = {k: orjson.loads(v[0]) for k, v in _memoized.items() if v}
  if server_type == "grpc": endpoint = re.sub(r"http://", "", endpoint)
-  client = t.cast("BaseClient[t.Any]", openllm.client.HTTPClient(endpoint, timeout=timeout) if server_type == "http" else openllm.client.GrpcClient(endpoint, timeout=timeout))
+  client = openllm.client.HTTPClient(endpoint, timeout=timeout) if server_type == "http" else openllm.client.GrpcClient(endpoint, timeout=timeout)
  input_fg, generated_fg = "magenta", "cyan"
  if output != "porcelain":
    termui.echo("==Input==\n", fg="white")
    termui.echo(f"{prompt}", fg=input_fg)
  res = client.query(prompt, return_response="raw", **{**client.configuration, **_memoized})
  if output == "pretty":
-    response = client.llm.postprocess_generate(prompt, res["responses"])
+    response = client.config.postprocess_generate(prompt, res["responses"])
    termui.echo("\n\n==Responses==\n", fg="white")
    termui.echo(response, fg=generated_fg)
  elif output == "json":
--- a/openllm-python/src/openllm/cli/extension/build_base_container.py
+++ b/openllm-python/src/openllm/cli/extension/build_base_container.py
@@ -1,37 +1,26 @@
-
 from __future__ import annotations
-import typing as t
-
-import click
-import orjson
-
-import openllm
-
-from .. import termui
-from .._factory import machine_option
-
-if t.TYPE_CHECKING:
-  from openllm.bundle.oci import LiteralContainerRegistry, LiteralContainerVersionStrategy
-
+import typing as t, click, orjson, openllm
+from openllm.cli import termui
+from openllm.cli._factory import machine_option, container_registry_option
+if t.TYPE_CHECKING: from openllm_core._typing_compat import LiteralContainerRegistry, LiteralContainerVersionStrategy
@click.command(
    "build_base_container", context_settings=termui.CONTEXT_SETTINGS, help="""Base image builder for BentoLLM.

                By default, the base image will include custom kernels (PagedAttention via vllm, FlashAttention-v2, etc.) built with CUDA 11.8, Python 3.9 on Ubuntu22.04.
-
                Optionally, this can also be pushed directly to remote registry. Currently support ``docker.io``, ``ghcr.io`` and ``quay.io``.

+                \b
                If '--machine' is passed, then it will run the process quietly, and output a JSON to the current running terminal.
-
                This command is only useful for debugging and for building custom base image for extending BentoML with custom base images and custom kernels.

                Note that we already release images on our CI to ECR and GHCR, so you don't need to build it yourself.
                """
 )
-@click.option("--registry", multiple=True, type=click.Choice(list(openllm.bundle.CONTAINER_NAMES)), help="Target registry to create image tag on.", default=None)
+@container_registry_option
@click.option("--version-strategy", type=click.Choice(["release", "latest", "nightly"]), default="nightly", help="Version strategy to use for tagging the image.")
@click.option("--push/--no-push", help="Whether to push to remote repository", is_flag=True, default=False)
@machine_option
-def cli(registry: tuple[LiteralContainerRegistry, ...] | None, version_strategy: LiteralContainerVersionStrategy, push: bool, machine: bool) -> dict[str, str]:
-  mapping = openllm.bundle.build_container(registry, version_strategy, push, machine)
+def cli(container_registry: tuple[LiteralContainerRegistry, ...] | None, version_strategy: LiteralContainerVersionStrategy, push: bool, machine: bool) -> dict[str, str]:
+  mapping = openllm.bundle.build_container(container_registry, version_strategy, push, machine)
  if machine: termui.echo(orjson.dumps(mapping, option=orjson.OPT_INDENT_2).decode(), fg="white")
  return mapping
--- a/openllm-python/src/openllm/cli/extension/dive_bentos.py
+++ b/openllm-python/src/openllm/cli/extension/dive_bentos.py
@@ -1,24 +1,16 @@
-
 from __future__ import annotations
-import shutil
-import subprocess
-import typing as t
-
-import click
-import psutil
+import shutil, subprocess, typing as t, click, psutil, bentoml
 from simple_di import Provide, inject
-
-import bentoml
 from bentoml._internal.configuration.containers import BentoMLContainer

-from .. import termui
+from openllm.cli import termui
+from openllm.cli._factory import bento_complete_envvar, machine_option

-if t.TYPE_CHECKING:
-  from bentoml._internal.bento import BentoStore
+if t.TYPE_CHECKING: from bentoml._internal.bento import BentoStore

@click.command("dive_bentos", context_settings=termui.CONTEXT_SETTINGS)
-@click.argument("bento", type=str)
-@click.option("--machine", is_flag=True, default=False, hidden=True)
+@click.argument("bento", type=str, shell_complete=bento_complete_envvar)
+@machine_option
@click.pass_context
@inject
 def cli(ctx: click.Context, bento: str, machine: bool, _bento_store: BentoStore = Provide[BentoMLContainer.bento_store]) -> str | None:
@@ -32,5 +24,5 @@ def cli(ctx: click.Context, bento: str, machine: bool, _bento_store: BentoStore
  if machine: return bentomodel.path
  # copy and paste this into a new shell
  if psutil.WINDOWS: subprocess.check_call([shutil.which("dir") or "dir"], cwd=bentomodel.path)
-  else: subprocess.check_call([shutil.which("tree") or "tree"], cwd=bentomodel.path)
+  else: subprocess.check_call([shutil.which("ls") or "ls", "-Rrthla"], cwd=bentomodel.path)
  ctx.exit(0)
--- a/openllm-python/src/openllm/cli/extension/get_containerfile.py
+++ b/openllm-python/src/openllm/cli/extension/get_containerfile.py
@@ -1,24 +1,18 @@
-
 from __future__ import annotations
-import typing as t
-
-import click
+import typing as t, click, bentoml
 from simple_di import Provide, inject
-
-import bentoml
 from bentoml._internal.bento.bento import BentoInfo
 from bentoml._internal.bento.build_config import DockerOptions
 from bentoml._internal.configuration.containers import BentoMLContainer
 from bentoml._internal.container.generate import generate_containerfile
+from openllm.cli import termui
+from openllm.cli._factory import bento_complete_envvar
+from openllm_core.utils import bentoml_cattr

-from .. import termui
-from ...utils import bentoml_cattr
-
-if t.TYPE_CHECKING:
-  from bentoml._internal.bento import BentoStore
+if t.TYPE_CHECKING: from bentoml._internal.bento import BentoStore

@click.command("get_containerfile", context_settings=termui.CONTEXT_SETTINGS, help="Return Containerfile of any given Bento.")
-@click.argument("bento", type=str)
+@click.argument("bento", type=str, shell_complete=bento_complete_envvar)
@click.pass_context
@inject
 def cli(ctx: click.Context, bento: str, _bento_store: BentoStore = Provide[BentoMLContainer.bento_store]) -> str:
--- a/openllm-python/src/openllm/cli/extension/get_prompt.py
+++ b/openllm-python/src/openllm/cli/extension/get_prompt.py
@@ -1,25 +1,18 @@
-
 from __future__ import annotations
-import typing as t
-
-import click
-import inflection
-import orjson
+import typing as t, click, inflection, orjson, openllm
 from bentoml_cli.utils import opt_callback
-
-import openllm
-
-from .. import termui
-from ..._prompt import process_prompt
+from openllm.cli import termui
+from openllm.cli._factory import model_complete_envvar, output_option, machine_option
+from openllm_core._prompt import process_prompt

 LiteralOutput = t.Literal["json", "pretty", "porcelain"]

@click.command("get_prompt", context_settings=termui.CONTEXT_SETTINGS)
-@click.argument("model_name", type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING.keys()]))
+@click.argument("model_name", type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING.keys()]), shell_complete=model_complete_envvar)
@click.argument("prompt", type=click.STRING)
-@click.option("-o", "--output", "output", type=click.Choice(["json", "pretty", "porcelain"]), default="pretty", help="Showing output type.", show_default=True, envvar="OPENLLM_OUTPUT", show_envvar=True)
+@output_option
@click.option("--format", type=click.STRING, default=None)
-@click.option("--machine", is_flag=True, default=False, hidden=True)
+@machine_option
@click.option("--opt", help="Define additional prompt variables. (format: ``--opt system_prompt='You are a useful assistant'``)", required=False, multiple=True, callback=opt_callback, metavar="ARG=VALUE[,ARG=VALUE]")
@click.pass_context
 def cli(ctx: click.Context, /, model_name: str, prompt: str, format: str | None, output: LiteralOutput, machine: bool, _memoized: dict[str, t.Any], **_: t.Any) -> str | None:
--- a/openllm-python/src/openllm/cli/extension/list_bentos.py
+++ b/openllm-python/src/openllm/cli/extension/list_bentos.py
@@ -1,16 +1,8 @@
-
 from __future__ import annotations
-
-import click
-import inflection
-import orjson
-
-import bentoml
-import openllm
+import click, inflection, orjson, bentoml, openllm
 from bentoml._internal.utils import human_readable_size
-
-from .. import termui
-from .._factory import LiteralOutput, output_option
+from openllm.cli import termui
+from openllm.cli._factory import LiteralOutput, output_option

@click.command("list_bentos", context_settings=termui.CONTEXT_SETTINGS)
@output_option(default_value="json")
--- a/openllm-python/src/openllm/cli/extension/list_models.py
+++ b/openllm-python/src/openllm/cli/extension/list_models.py
@@ -1,14 +1,13 @@
 from __future__ import annotations
 import typing as t, bentoml, openllm, orjson, inflection ,click
-from bentoml._internal.utils import human_readable_size
-
 from openllm.cli import termui
-from openllm.cli._factory import LiteralOutput, model_name_argument, output_option
+from bentoml._internal.utils import human_readable_size
+from openllm.cli._factory import LiteralOutput, model_name_argument, output_option, model_complete_envvar

-if t.TYPE_CHECKING: from openllm._typing_compat import DictStrAny
+if t.TYPE_CHECKING: from openllm_core._typing_compat import DictStrAny

@click.command("list_models", context_settings=termui.CONTEXT_SETTINGS)
-@model_name_argument(required=False)
+@model_name_argument(required=False, shell_complete=model_complete_envvar)
@output_option(default_value="json")
 def cli(model_name: str | None, output: LiteralOutput) -> DictStrAny:
  """This is equivalent to openllm models --show-available less the nice table."""
--- a/openllm-python/src/openllm/cli/extension/playground.py
+++ b/openllm-python/src/openllm/cli/extension/playground.py
@@ -1,13 +1,12 @@
 from __future__ import annotations
-import importlib.machinery, logging, os, pkgutil, subprocess, sys, tempfile, typing as t
-import click, yaml
+import importlib.machinery, logging, os, pkgutil, subprocess, sys, tempfile, typing as t, click, yaml
 from openllm.cli import termui
 from openllm import playground
-from openllm.utils import is_jupyter_available, is_jupytext_available, is_notebook_available
+from openllm_core.utils import is_jupyter_available, is_jupytext_available, is_notebook_available

 if t.TYPE_CHECKING:
  import jupytext, nbformat
-  from openllm._typing_compat import DictStrAny
+  from openllm_core._typing_compat import DictStrAny

 logger = logging.getLogger(__name__)

@@ -38,7 +37,7 @@ def cli(ctx: click.Context, output_dir: str | None, port: int) -> None:
  \b
  > [!NOTE]
  > This command requires Jupyter to be installed. Install it with 'pip install "openllm[playground]"'
-  """  # noqa: D301
+  """
  if not is_jupyter_available() or not is_jupytext_available() or not is_notebook_available():
    raise RuntimeError("Playground requires 'jupyter', 'jupytext', and 'notebook'. Install it with 'pip install \"openllm[playground]\"'")
  metadata = load_notebook_metadata()
--- a/openllm-python/src/openllm/cli/termui.py
+++ b/openllm-python/src/openllm/cli/termui.py
@@ -1,6 +1,6 @@
 from __future__ import annotations
 import os, typing as t, click, inflection, openllm
-if t.TYPE_CHECKING: from openllm._typing_compat import DictStrAny
+if t.TYPE_CHECKING: from openllm_core._typing_compat import DictStrAny

 def echo(text: t.Any, fg: str = "green", _with_style: bool = True, **attrs: t.Any) -> None:
  attrs["fg"] = fg if not openllm.utils.get_debug_mode() else None
--- a/openllm-python/src/openllm/client.py
+++ b/openllm-python/src/openllm/client.py
@@ -0,0 +1,17 @@
+"""OpenLLM Python client.
+
+```python
+client = openllm.client.HTTPClient("http://localhost:8080")
+client.query("What is the difference between gather and scatter?")
+```
+
+If the server supports embeddings, use them via `client.embed`:
+```python
+client.embed("What is the difference between gather and scatter?")
+```
+"""
+from __future__ import annotations
+import openllm_client, typing as t
+if t.TYPE_CHECKING: from openllm_client import AsyncHTTPClient as AsyncHTTPClient, BaseAsyncClient as BaseAsyncClient, BaseClient as BaseClient, HTTPClient as HTTPClient, GrpcClient as GrpcClient, AsyncGrpcClient as AsyncGrpcClient
+def __dir__() -> t.Sequence[str]: return sorted(dir(openllm_client))  # module-level hook: dir(openllm.client) lists the names of openllm_client, sorted
+def __getattr__(it: str) -> t.Any: return getattr(openllm_client, it)  # module-level hook: attributes not defined in this module are looked up on openllm_client, so openllm.client.HTTPClient etc. resolve there
--- a/openllm-python/src/openllm/client/init.py
+++ b/openllm-python/src/openllm/client/init.py
@@ -1,22 +0,0 @@
-"""OpenLLM Python client.
-
-```python
-client = openllm.client.HTTPClient("http://localhost:8080")
-client.query("What is the difference between gather and scatter?")
-```
-
-If the server has embedding supports, use it via `client.embed`:
-```python
-client.embed("What is the difference between gather and scatter?")
-```
-"""
-from __future__ import annotations
-
-from openllm.client.runtimes import (
-    AsyncGrpcClient as AsyncGrpcClient,
-    AsyncHTTPClient as AsyncHTTPClient,
-    BaseAsyncClient as BaseAsyncClient,
-    BaseClient as BaseClient,
-    GrpcClient as GrpcClient,
-    HTTPClient as HTTPClient,
-)
--- a/openllm-python/src/openllm/client/runtimes/init.py
+++ b/openllm-python/src/openllm/client/runtimes/init.py
@@ -1,15 +0,0 @@
-"""Client that supports REST/gRPC protocol to interact with a LLMServer."""
-from __future__ import annotations
-
-from openllm.client.runtimes.base import (
-    BaseAsyncClient as BaseAsyncClient,
-    BaseClient as BaseClient,
-)
-from openllm.client.runtimes.grpc import (
-    AsyncGrpcClient as AsyncGrpcClient,
-    GrpcClient as GrpcClient,
-)
-from openllm.client.runtimes.http import (
-    AsyncHTTPClient as AsyncHTTPClient,
-    HTTPClient as HTTPClient,
-)
--- a/openllm-python/src/openllm/client/runtimes/base.py
+++ b/openllm-python/src/openllm/client/runtimes/base.py
@@ -1,238 +0,0 @@
-# mypy: disable-error-code="name-defined"
-from __future__ import annotations
-import asyncio, logging, typing as t
-import bentoml, bentoml.client, openllm, httpx
-from abc import abstractmethod
-from http import HTTPStatus
-from urllib.parse import urljoin
-from openllm._typing_compat import overload, LiteralString
-
-T = t.TypeVar("T")
-T_co = t.TypeVar("T_co", covariant=True)
-
-if t.TYPE_CHECKING:
-  import transformers
-  from openllm._typing_compat import DictStrAny, LiteralRuntime
-else: transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
-
-class AnnotatedClient(t.Protocol[T_co]):
-  server_url: str
-  _svc: bentoml.Service
-  endpoints: list[str]
-  def health(self, *args: t.Any, **attrs: t.Any) -> t.Any: ...
-  async def async_health(self) -> t.Any: ...
-  def generate_v1(self, qa: openllm.GenerationInput) -> T_co: ...
-  def metadata_v1(self) -> T_co: ...
-  def embeddings_v1(self) -> t.Sequence[float]: ...
-  def call(self, name: str, *args: t.Any, **attrs: t.Any) -> T_co: ...
-  async def async_call(self, name: str, *args: t.Any, **attrs: t.Any) -> T_co: ...
-  @staticmethod
-  def wait_until_server_ready(host: str, port: int, timeout: float = 30, **kwargs: t.Any) -> None: ...
-  @staticmethod
-  def from_url(server_url: str) -> AnnotatedClient[t.Any]: ...
-
-logger = logging.getLogger(__name__)
-
-def in_async_context() -> bool:
-  try:
-    _ = asyncio.get_running_loop()
-    return True
-  except RuntimeError: return False
-
-class ClientMeta(t.Generic[T]):
-  _api_version: str
-  _client_type: t.Literal["GrpcClient", "HTTPClient"]
-  _host: str
-  _port: str
-
-  __client__: AnnotatedClient[T] | None = None
-  __agent__: transformers.HfAgent | None = None
-  __llm__: openllm.LLM[t.Any, t.Any] | None = None
-
-  def __init__(self, address: str, timeout: int = 30): self._address,self._timeout = address,timeout
-  def __init_subclass__(cls, *, client_type: t.Literal["http", "grpc"] = "http", api_version: str = "v1"): cls._client_type, cls._api_version = "HTTPClient" if client_type == "http" else "GrpcClient", api_version
-  @property
-  def _hf_agent(self) -> transformers.HfAgent:
-    if not self.supports_hf_agent: raise openllm.exceptions.OpenLLMException(f"{self.model_name} ({self.framework}) does not support running HF agent.")
-    if self.__agent__ is None:
-      if not openllm.utils.is_transformers_supports_agent(): raise RuntimeError("Current 'transformers' does not support Agent. Make sure to upgrade to at least 4.29: 'pip install -U \"transformers>=4.29\"'")
-      self.__agent__ = transformers.HfAgent(urljoin(self._address, "/hf/agent"))
-    return self.__agent__
-  @property
-  def _metadata(self) -> T: return httpx.post(urljoin(self._address, f"/{self._api_version}/metadata")).json() if in_async_context() else self.call("metadata")
-  @property
-  @abstractmethod
-  def model_name(self) -> str: raise NotImplementedError
-  @property
-  @abstractmethod
-  def framework(self) -> LiteralRuntime: raise NotImplementedError
-  @property
-  @abstractmethod
-  def timeout(self) -> int: raise NotImplementedError
-  @property
-  @abstractmethod
-  def model_id(self) -> str: raise NotImplementedError
-  @property
-  @abstractmethod
-  def configuration(self) -> dict[str, t.Any]: raise NotImplementedError
-  @property
-  @abstractmethod
-  def supports_embeddings(self) -> bool: raise NotImplementedError
-  @property
-  @abstractmethod
-  def supports_hf_agent(self) -> bool: raise NotImplementedError
-  @abstractmethod
-  def postprocess(self, result: t.Any) -> openllm.GenerationOutput: ...
-  @abstractmethod
-  def _run_hf_agent(self, *args: t.Any, **kwargs: t.Any) -> t.Any: ...
-
-  @property
-  def config(self) -> openllm.LLMConfig: return self.llm.config
-  @property
-  def llm(self) -> openllm.LLM[t.Any, t.Any]:
-    # XXX: if the server runs vllm or any framework that is not available from the user client, client will fail.
-    if self.__llm__ is None: self.__llm__ = openllm.infer_auto_class(self.framework).for_model(self.model_name)
-    return self.__llm__
-
-  def call(self, name: str, *args: t.Any, **attrs: t.Any) -> T: return self._cached.call(f"{name}_{self._api_version}", *args, **attrs)
-  async def acall(self, name: str, *args: t.Any, **attrs: t.Any) -> T: return await self._cached.async_call(f"{name}_{self._api_version}", *args, **attrs)
-  @property
-  def _cached(self) -> AnnotatedClient[T]:
-    client_class = t.cast(AnnotatedClient[T], getattr(bentoml.client, self._client_type))
-    if self.__client__ is None:
-      client_class.wait_until_server_ready(self._host, int(self._port), timeout=self._timeout)
-      self.__client__ = client_class.from_url(self._address)
-    return self.__client__
-
-class BaseClient(ClientMeta[T]):
-  def health(self) -> t.Any: raise NotImplementedError
-  def chat(self, prompt: str, history: list[str], **attrs: t.Any) -> str: raise NotImplementedError
-  def embed(self, prompt: t.Sequence[str] | str) -> openllm.EmbeddingsOutput: raise NotImplementedError
-  @overload
-  def query(self, prompt: str, *, return_response: t.Literal["processed"], **attrs: t.Any) -> str: ...
-  @overload
-  def query(self, prompt: str, *, return_response: t.Literal["raw"], **attrs: t.Any) -> DictStrAny: ...
-  @overload
-  def query(self, prompt: str, *, return_response: t.Literal["attrs"], **attrs: t.Any) -> openllm.GenerationOutput: ...
-  def query(self, prompt: str, return_response: t.Literal["attrs", "raw", "processed"] = "processed", **attrs: t.Any) -> openllm.GenerationOutput | DictStrAny | str:
-    return_raw_response = attrs.pop("return_raw_response", None)
-    if return_raw_response is not None:
-      logger.warning("'return_raw_response' is now deprecated. Please use 'return_response=\"raw\"' instead.")
-      if return_raw_response is True: return_response = "raw"
-    return_attrs = attrs.pop("return_attrs", None)
-    if return_attrs is not None:
-      logger.warning("'return_attrs' is now deprecated. Please use 'return_response=\"attrs\"' instead.")
-      if return_attrs is True: return_response = "attrs"
-    use_default_prompt_template = attrs.pop("use_default_prompt_template", False)
-    prompt, generate_kwargs, postprocess_kwargs = self.llm.sanitize_parameters(prompt, use_default_prompt_template=use_default_prompt_template, **attrs)
-
-    inputs = openllm.GenerationInput(prompt=prompt, llm_config=self.config.model_construct_env(**generate_kwargs))
-    if in_async_context(): result = httpx.post(urljoin(self._address, f"/{self._api_version}/generate"), json=inputs.model_dump(), timeout=self.timeout).json()
-    else: result = self.call("generate", inputs.model_dump())
-    r = self.postprocess(result)
-    if return_response == "attrs": return r
-    elif return_response == "raw": return openllm.utils.bentoml_cattr.unstructure(r)
-    else: return self.llm.postprocess_generate(prompt, r.responses, **postprocess_kwargs)
-
-  # NOTE: Scikit interface
-  @overload
-  def predict(self, prompt: str, *, return_response: t.Literal["processed"], **attrs: t.Any) -> str: ...
-  @overload
-  def predict(self, prompt: str, *, return_response: t.Literal["raw"], **attrs: t.Any) -> DictStrAny: ...
-  @overload
-  def predict(self, prompt: str, *, return_response: t.Literal["attrs"], **attrs: t.Any) -> openllm.GenerationOutput: ...
-  def predict(self, prompt: str, **attrs: t.Any) -> openllm.GenerationOutput | DictStrAny | str: return t.cast(t.Union[openllm.GenerationOutput, DictStrAny, str], self.query(prompt, **attrs))
-
-  def ask_agent(self, task: str, *, return_code: bool = False, remote: bool = False, agent_type: LiteralString = "hf", **attrs: t.Any) -> t.Any:
-    if agent_type == "hf": return self._run_hf_agent(task, return_code=return_code, remote=remote, **attrs)
-    else: raise RuntimeError(f"Unknown 'agent_type={agent_type}'")
-
-  def _run_hf_agent(self, *args: t.Any, **kwargs: t.Any) -> t.Any:
-    if len(args) > 1: raise ValueError("'args' should only take one positional argument.")
-    task = kwargs.pop("task", args[0])
-    return_code = kwargs.pop("return_code", False)
-    remote = kwargs.pop("remote", False)
-    try: return self._hf_agent.run(task, return_code=return_code, remote=remote, **kwargs)
-    except Exception as err:
-      logger.error("Exception caught while sending instruction to HF agent: %s", err, exc_info=err)
-      logger.info("Tip: LLMServer at '%s' might not support 'generate_one'.", self._address)
-
-class BaseAsyncClient(ClientMeta[T]):
-  async def health(self) -> t.Any: raise NotImplementedError
-  async def chat(self, prompt: str, history: list[str], **attrs: t.Any) -> str: raise NotImplementedError
-  async def embed(self, prompt: t.Sequence[str] | str) -> openllm.EmbeddingsOutput: raise NotImplementedError
-  @overload
-  async def query(self, prompt: str, *, return_response: t.Literal["processed"], **attrs: t.Any) -> str: ...
-  @overload
-  async def query(self, prompt: str, *, return_response: t.Literal["raw"], **attrs: t.Any) -> DictStrAny: ...
-  @overload
-  async def query(self, prompt: str, *, return_response: t.Literal["attrs"], **attrs: t.Any) -> openllm.GenerationOutput: ...
-  async def query(self, prompt: str, return_response: t.Literal["attrs", "raw", "processed"] = "processed", **attrs: t.Any) -> openllm.GenerationOutput | DictStrAny | str:
-    return_raw_response = attrs.pop("return_raw_response", None)
-    if return_raw_response is not None:
-      logger.warning("'return_raw_response' is now deprecated. Please use 'return_response=\"raw\"' instead.")
-      if return_raw_response is True: return_response = "raw"
-    return_attrs = attrs.pop("return_attrs", None)
-    if return_attrs is not None:
-      logger.warning("'return_attrs' is now deprecated. Please use 'return_response=\"attrs\"' instead.")
-      if return_attrs is True: return_response = "attrs"
-    use_default_prompt_template = attrs.pop("use_default_prompt_template", False)
-    prompt, generate_kwargs, postprocess_kwargs = self.llm.sanitize_parameters(prompt, use_default_prompt_template=use_default_prompt_template, **attrs)
-
-    inputs = openllm.GenerationInput(prompt=prompt, llm_config=self.config.model_construct_env(**generate_kwargs))
-    res = await self.acall("generate", inputs.model_dump())
-    r = self.postprocess(res)
-
-    if return_response == "attrs": return r
-    elif return_response == "raw": return openllm.utils.bentoml_cattr.unstructure(r)
-    else: return self.llm.postprocess_generate(prompt, r.responses, **postprocess_kwargs)
-
-  # NOTE: Scikit interface
-  @overload
-  async def predict(self, prompt: str, *, return_response: t.Literal["processed"], **attrs: t.Any) -> str: ...
-  @overload
-  async def predict(self, prompt: str, *, return_response: t.Literal["raw"], **attrs: t.Any) -> DictStrAny: ...
-  @overload
-  async def predict(self, prompt: str, *, return_response: t.Literal["attrs"], **attrs: t.Any) -> openllm.GenerationOutput: ...
-  async def predict(self, prompt: str, **attrs: t.Any) -> openllm.GenerationOutput | DictStrAny | str: return t.cast(t.Union[openllm.GenerationOutput, DictStrAny, str], await self.query(prompt, **attrs))
-  async def ask_agent(self, task: str, *, return_code: bool = False, remote: bool = False, agent_type: LiteralString = "hf", **attrs: t.Any) -> t.Any:
-    """Async version of agent.run."""
-    if agent_type == "hf": return await self._run_hf_agent(task, return_code=return_code, remote=remote, **attrs)
-    else: raise RuntimeError(f"Unknown 'agent_type={agent_type}'")
-  async def _run_hf_agent(self, *args: t.Any, **kwargs: t.Any) -> t.Any:
-    if not openllm.utils.is_transformers_supports_agent(): raise RuntimeError("This version of transformers does not support agent.run. Make sure to upgrade to transformers>4.30.0")
-    if len(args) > 1: raise ValueError("'args' should only take one positional argument.")
-    task = kwargs.pop("task", args[0])
-    return_code = kwargs.pop("return_code", False)
-    remote = kwargs.pop("remote", False)
-
-    from transformers.tools.agents import clean_code_for_run, get_tool_creation_code, resolve_tools
-    from transformers.tools.python_interpreter import evaluate
-
-    _hf_agent = self._hf_agent
-
-    prompt = t.cast(str, _hf_agent.format_prompt(task))
-    stop = ["Task:"]
-    async with httpx.AsyncClient(timeout=httpx.Timeout(self.timeout)) as client:
-      response = await client.post(_hf_agent.url_endpoint, json={"inputs": prompt, "parameters": {"max_new_tokens": 200, "return_full_text": False, "stop": stop},},)
-      if response.status_code != HTTPStatus.OK:
-        raise ValueError(f"Error {response.status_code}: {response.json()}")
-
-    result = response.json()[0]["generated_text"]
-    # Inference API returns the stop sequence
-    for stop_seq in stop:
-      if result.endswith(stop_seq):
-        result = result[:-len(stop_seq)]
-        break
-
-    # the below have the same logic as agent.run API
-    explanation, code = clean_code_for_run(result)
-    _hf_agent.log(f"==Explanation from the agent==\n{explanation}")
-    _hf_agent.log(f"\n\n==Code generated by the agent==\n{code}")
-    if not return_code:
-      _hf_agent.log("\n\n==Result==")
-      _hf_agent.cached_tools = resolve_tools(code, _hf_agent.toolbox, remote=remote, cached_tools=_hf_agent.cached_tools)
-      return evaluate(code, _hf_agent.cached_tools, state=kwargs.copy())
-    else:
-      tool_code = get_tool_creation_code(code, _hf_agent.toolbox, remote=remote)
-      return f"{tool_code}\n{code}"
--- a/openllm-python/src/openllm/client/runtimes/grpc.py
+++ b/openllm-python/src/openllm/client/runtimes/grpc.py
@@ -1,93 +0,0 @@
-from __future__ import annotations
-import asyncio, logging, typing as t
-import orjson, openllm
-from openllm._typing_compat import LiteralRuntime
-from .base import BaseAsyncClient, BaseClient
-
-if t.TYPE_CHECKING:
-  from grpc_health.v1 import health_pb2
-  from bentoml.grpc.v1.service_pb2 import Response
-
-logger = logging.getLogger(__name__)
-
-class GrpcClient(BaseClient["Response"], client_type="grpc"):
-  def __init__(self, address: str, timeout: int = 30):
-    self._host, self._port = address.split(":")
-    super().__init__(address, timeout)
-  def health(self) -> health_pb2.HealthCheckResponse: return asyncio.run(self._cached.health("bentoml.grpc.v1.BentoService"))
-  @property
-  def model_name(self) -> str:
-    try: return self._metadata.json.struct_value.fields["model_name"].string_value
-    except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None
-  @property
-  def framework(self) -> LiteralRuntime:
-    try:
-      value = t.cast(LiteralRuntime, self._metadata.json.struct_value.fields["framework"].string_value)
-      if value not in ("pt", "flax", "tf", "vllm"): raise KeyError
-      return value
-    except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None
-  @property
-  def timeout(self) -> int:
-    try: return int(self._metadata.json.struct_value.fields["timeout"].number_value)
-    except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None
-  @property
-  def model_id(self) -> str:
-    try: return self._metadata.json.struct_value.fields["model_id"].string_value
-    except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None
-  @property
-  def configuration(self) -> dict[str, t.Any]:
-    try: return orjson.loads(self._metadata.json.struct_value.fields["configuration"].string_value)
-    except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None
-  @property
-  def supports_embeddings(self) -> bool:
-    try: return self._metadata.json.struct_value.fields["supports_embeddings"].bool_value
-    except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None
-  @property
-  def supports_hf_agent(self) -> bool:
-    try: return self._metadata.json.struct_value.fields["supports_hf_agent"].bool_value
-    except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None
-  def postprocess(self, result: Response | dict[str, t.Any]) -> openllm.GenerationOutput:
-    from google.protobuf.json_format import MessageToDict
-    if isinstance(result, dict): return openllm.GenerationOutput(**result)
-    return openllm.GenerationOutput(**MessageToDict(result.json, preserving_proto_field_name=True))
-
-class AsyncGrpcClient(BaseAsyncClient["Response"], client_type="grpc"):
-  def __init__(self, address: str, timeout: int = 30):
-    self._host, self._port = address.split(":")
-    super().__init__(address, timeout)
-  async def health(self) -> health_pb2.HealthCheckResponse: return await self._cached.health("bentoml.grpc.v1.BentoService")
-  @property
-  def model_name(self) -> str:
-    try: return self._metadata.json.struct_value.fields["model_name"].string_value
-    except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None
-  @property
-  def framework(self) -> LiteralRuntime:
-    try:
-      value = t.cast(LiteralRuntime, self._metadata.json.struct_value.fields["framework"].string_value)
-      if value not in ("pt", "flax", "tf", "vllm"): raise KeyError
-      return value
-    except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None
-  @property
-  def timeout(self) -> int:
-    try: return int(self._metadata.json.struct_value.fields["timeout"].number_value)
-    except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None
-  @property
-  def model_id(self) -> str:
-    try: return self._metadata.json.struct_value.fields["model_id"].string_value
-    except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None
-  @property
-  def configuration(self) -> dict[str, t.Any]:
-    try: return orjson.loads(self._metadata.json.struct_value.fields["configuration"].string_value)
-    except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None
-  @property
-  def supports_embeddings(self) -> bool:
-    try: return self._metadata.json.struct_value.fields["supports_embeddings"].bool_value
-    except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None
-  @property
-  def supports_hf_agent(self) -> bool:
-    try: return self._metadata.json.struct_value.fields["supports_hf_agent"].bool_value
-    except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None
-  def postprocess(self, result: Response | dict[str, t.Any]) -> openllm.GenerationOutput:
-    from google.protobuf.json_format import MessageToDict
-    if isinstance(result, dict): return openllm.GenerationOutput(**result)
-    return openllm.GenerationOutput(**MessageToDict(result.json, preserving_proto_field_name=True))
--- a/openllm-python/src/openllm/client/runtimes/http.py
+++ b/openllm-python/src/openllm/client/runtimes/http.py
@@ -1,96 +0,0 @@
-from __future__ import annotations
-import logging, typing as t
-from urllib.parse import urljoin, urlparse
-import httpx, orjson, openllm
-from .base import BaseAsyncClient, BaseClient, in_async_context
-from openllm._typing_compat import DictStrAny, LiteralRuntime
-
-logger = logging.getLogger(__name__)
-def process_address(self: AsyncHTTPClient | HTTPClient, address: str) -> None:
-  address = address if "://" in address else "http://" + address
-  parsed = urlparse(address)
-  self._host, *_port = parsed.netloc.split(":")
-  if len(_port) == 0: self._port = "80" if parsed.scheme == "http" else "443"
-  else: self._port = next(iter(_port))
-
-class HTTPClient(BaseClient[DictStrAny]):
-  def __init__(self, address: str, timeout: int = 30):
-    process_address(self, address)
-    super().__init__(address, timeout)
-
-  def health(self) -> t.Any: return self._cached.health()
-  def embed(self, prompt: t.Sequence[str] | str) -> openllm.EmbeddingsOutput:
-    if isinstance(prompt, str): prompt = [prompt]
-    result = httpx.post(urljoin(self._address, f"/{self._api_version}/embeddings"), json=list(prompt), timeout=self.timeout).json() if in_async_context() else self.call("embeddings", list(prompt))
-    return openllm.EmbeddingsOutput(**result)
-
-  @property
-  def model_name(self) -> str:
-    try: return self._metadata["model_name"]
-    except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None
-  @property
-  def model_id(self) -> str:
-    try: return self._metadata["model_name"]
-    except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None
-  @property
-  def framework(self) -> LiteralRuntime:
-    try: return self._metadata["framework"]
-    except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None
-  @property
-  def timeout(self) -> int:
-    try: return self._metadata["timeout"]
-    except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None
-  @property
-  def configuration(self) -> dict[str, t.Any]:
-    try: return orjson.loads(self._metadata["configuration"])
-    except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None
-  @property
-  def supports_embeddings(self) -> bool:
-    try: return self._metadata.get("supports_embeddings", False)
-    except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None
-  @property
-  def supports_hf_agent(self) -> bool:
-    try: return self._metadata.get("supports_hf_agent", False)
-    except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None
-  def postprocess(self, result: dict[str, t.Any]) -> openllm.GenerationOutput: return openllm.GenerationOutput(**result)
-
-class AsyncHTTPClient(BaseAsyncClient[DictStrAny]):
-  def __init__(self, address: str, timeout: int = 30):
-    process_address(self, address)
-    super().__init__(address, timeout)
-
-  async def health(self) -> t.Any: return await self._cached.async_health()
-  async def embed(self, prompt: t.Sequence[str] | str) -> openllm.EmbeddingsOutput:
-    if isinstance(prompt, str): prompt = [prompt]
-    res = await self.acall("embeddings", list(prompt))
-    return openllm.EmbeddingsOutput(**res)
-
-  @property
-  def model_name(self) -> str:
-    try: return self._metadata["model_name"]
-    except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None
-  @property
-  def model_id(self) -> str:
-    try: return self._metadata["model_name"]
-    except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None
-  @property
-  def framework(self) -> LiteralRuntime:
-    try: return self._metadata["framework"]
-    except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None
-  @property
-  def timeout(self) -> int:
-    try: return self._metadata["timeout"]
-    except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None
-  @property
-  def configuration(self) -> dict[str, t.Any]:
-    try: return orjson.loads(self._metadata["configuration"])
-    except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None
-  @property
-  def supports_embeddings(self) -> bool:
-    try: return self._metadata.get("supports_embeddings", False)
-    except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None
-  @property
-  def supports_hf_agent(self) -> bool:
-    try: return self._metadata.get("supports_hf_agent", False)
-    except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None
-  def postprocess(self, result: dict[str, t.Any]) -> openllm.GenerationOutput: return openllm.GenerationOutput(**result)
--- a/openllm-python/src/openllm/exceptions.py
+++ b/openllm-python/src/openllm/exceptions.py
@@ -1,19 +1,3 @@
 """Base exceptions for OpenLLM. This extends BentoML exceptions."""
 from __future__ import annotations
-import bentoml
-class OpenLLMException(bentoml.exceptions.BentoMLException):
-  """Base class for all OpenLLM exceptions. This extends BentoMLException."""
-class GpuNotAvailableError(OpenLLMException):
-  """Raised when there is no GPU available in given system."""
-class ValidationError(OpenLLMException):
-  """Raised when a validation fails."""
-class ForbiddenAttributeError(OpenLLMException):
-  """Raised when using an _internal field."""
-class MissingAnnotationAttributeError(OpenLLMException):
-  """Raised when a field under openllm.LLMConfig is missing annotations."""
-class MissingDependencyError(BaseException):
-  """Raised when a dependency is missing."""
-class Error(BaseException):
-  """To be used instead of naked raise."""
-class FineTuneStrategyNotSupportedError(OpenLLMException):
-  """Raised when a fine-tune strategy is not supported for given LLM."""
+from openllm_core.exceptions import OpenLLMException as OpenLLMException, GpuNotAvailableError as GpuNotAvailableError, ValidationError as ValidationError, ForbiddenAttributeError as ForbiddenAttributeError, MissingAnnotationAttributeError as MissingAnnotationAttributeError, MissingDependencyError as MissingDependencyError, Error as Error, FineTuneStrategyNotSupportedError as FineTuneStrategyNotSupportedError
--- a/openllm-python/src/openllm/models/__init__.py
+++ b/openllm-python/src/openllm/models/__init__.py
@@ -1,11 +1,11 @@
 # This file is generated by tools/update-models-import.py. DO NOT EDIT MANUALLY!
 # To update this, run ./tools/update-models-import.py
 from __future__ import annotations
-import typing as t, os
-from openllm.utils import LazyModule
-_MODELS: set[str] = {"auto", "baichuan", "chatglm", "dolly_v2", "falcon", "flan_t5", "gpt_neox", "llama", "mpt", "opt", "stablelm", "starcoder"}
-if t.TYPE_CHECKING: from . import auto as auto,baichuan as baichuan,chatglm as chatglm,dolly_v2 as dolly_v2,falcon as falcon,flan_t5 as flan_t5,gpt_neox as gpt_neox,llama as llama,mpt as mpt,opt as opt,stablelm as stablelm,starcoder as starcoder
-__lazy=LazyModule(__name__, os.path.abspath("__file__"), {k: [] for k in _MODELS})
+import typing as t
+from openllm_core.utils import LazyModule
+_MODELS:set[str]={"auto", "baichuan", "chatglm", "dolly_v2", "falcon", "flan_t5", "gpt_neox", "llama", "mpt", "opt", "stablelm", "starcoder"}
+if t.TYPE_CHECKING:from . import auto as auto,baichuan as baichuan,chatglm as chatglm,dolly_v2 as dolly_v2,falcon as falcon,flan_t5 as flan_t5,gpt_neox as gpt_neox,llama as llama,mpt as mpt,opt as opt,stablelm as stablelm,starcoder as starcoder
+__lazy=LazyModule(__name__, globals()["__file__"], {k: [] for k in _MODELS})
 __all__=__lazy.__all__
 __dir__=__lazy.__dir__
 __getattr__=__lazy.__getattr__
--- a/openllm-python/src/openllm/models/auto/__init__.py
+++ b/openllm-python/src/openllm/models/auto/__init__.py
@@ -1,15 +1,11 @@
 from __future__ import annotations
 import typing as t, os
 import openllm
-from openllm.utils import LazyModule, is_flax_available, is_tf_available, is_torch_available, is_vllm_available
+from openllm_core.utils import LazyModule, is_flax_available, is_tf_available, is_torch_available, is_vllm_available
+from openllm_core.config import AutoConfig as AutoConfig, CONFIG_MAPPING as CONFIG_MAPPING, CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES

-_import_structure: dict[str, list[str]] = {"configuration_auto": ["AutoConfig", "CONFIG_MAPPING", "CONFIG_MAPPING_NAMES"], "modeling_auto": ["MODEL_MAPPING_NAMES"], "modeling_flax_auto": ["MODEL_FLAX_MAPPING_NAMES"], "modeling_tf_auto": ["MODEL_TF_MAPPING_NAMES"], "modeling_vllm_auto": ["MODEL_VLLM_MAPPING_NAMES"]}
+_import_structure: dict[str, list[str]] = {"modeling_auto": ["MODEL_MAPPING_NAMES"], "modeling_flax_auto": ["MODEL_FLAX_MAPPING_NAMES"], "modeling_tf_auto": ["MODEL_TF_MAPPING_NAMES"], "modeling_vllm_auto": ["MODEL_VLLM_MAPPING_NAMES"]}
 if t.TYPE_CHECKING:
-  from .configuration_auto import (
-    CONFIG_MAPPING as CONFIG_MAPPING,
-    CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES,
-    AutoConfig as AutoConfig,
-  )
  from .modeling_auto import MODEL_MAPPING_NAMES as MODEL_MAPPING_NAMES
  from .modeling_flax_auto import MODEL_FLAX_MAPPING_NAMES as MODEL_FLAX_MAPPING_NAMES
  from .modeling_tf_auto import MODEL_TF_MAPPING_NAMES as MODEL_TF_MAPPING_NAMES
--- a/openllm-python/src/openllm/models/auto/factory.py
+++ b/openllm-python/src/openllm/models/auto/factory.py
@@ -3,10 +3,10 @@ from __future__ import annotations
 import importlib, inspect, logging, typing as t
 from collections import OrderedDict
 import inflection, openllm
-from openllm.utils import ReprMixin
+from openllm_core.utils import ReprMixin

 if t.TYPE_CHECKING:
-  from openllm._typing_compat import LiteralString, LLMRunner
+  from openllm_core._typing_compat import LiteralString, LLMRunner
  import types
  from collections import _odict_items, _odict_keys, _odict_values

--- a/openllm-python/src/openllm/models/auto/modeling_auto.py
+++ b/openllm-python/src/openllm/models/auto/modeling_auto.py
@@ -1,8 +1,8 @@
 from __future__ import annotations
 import typing as t
 from collections import OrderedDict
-from .configuration_auto import CONFIG_MAPPING_NAMES
 from .factory import BaseAutoLLMClass, _LazyAutoMapping
+from openllm_core.config import CONFIG_MAPPING_NAMES

 MODEL_MAPPING_NAMES = OrderedDict([("chatglm", "ChatGLM"), ("dolly_v2", "DollyV2"), ("falcon", "Falcon"), ("flan_t5", "FlanT5"), ("gpt_neox", "GPTNeoX"), ("llama", "Llama"), ("mpt", "MPT"), ("opt", "OPT"), ("stablelm", "StableLM"), ("starcoder", "StarCoder"), ("baichuan", "Baichuan")])
 MODEL_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_MAPPING_NAMES)
--- a/openllm-python/src/openllm/models/auto/modeling_flax_auto.py
+++ b/openllm-python/src/openllm/models/auto/modeling_flax_auto.py
@@ -1,8 +1,8 @@
 from __future__ import annotations
 import typing as t
 from collections import OrderedDict
-from .configuration_auto import CONFIG_MAPPING_NAMES
 from .factory import BaseAutoLLMClass, _LazyAutoMapping
+from openllm_core.config import CONFIG_MAPPING_NAMES

 MODEL_FLAX_MAPPING_NAMES = OrderedDict([("flan_t5", "FlaxFlanT5"), ("opt", "FlaxOPT")])
 MODEL_FLAX_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FLAX_MAPPING_NAMES)
--- a/openllm-python/src/openllm/models/auto/modeling_tf_auto.py
+++ b/openllm-python/src/openllm/models/auto/modeling_tf_auto.py
@@ -1,8 +1,8 @@
 from __future__ import annotations
 import typing as t
 from collections import OrderedDict
-from .configuration_auto import CONFIG_MAPPING_NAMES
 from .factory import BaseAutoLLMClass, _LazyAutoMapping
+from openllm_core.config import CONFIG_MAPPING_NAMES

 MODEL_TF_MAPPING_NAMES = OrderedDict([("flan_t5", "TFFlanT5"), ("opt", "TFOPT")])
 MODEL_TF_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_TF_MAPPING_NAMES)
--- a/openllm-python/src/openllm/models/auto/modeling_vllm_auto.py
+++ b/openllm-python/src/openllm/models/auto/modeling_vllm_auto.py
@@ -1,8 +1,8 @@
 from __future__ import annotations
 import typing as t
 from collections import OrderedDict
-from .configuration_auto import CONFIG_MAPPING_NAMES
 from .factory import BaseAutoLLMClass, _LazyAutoMapping
+from openllm_core.config import CONFIG_MAPPING_NAMES

 MODEL_VLLM_MAPPING_NAMES = OrderedDict([("baichuan", "VLLMBaichuan"), ("dolly_v2", "VLLMDollyV2"), ("falcon", "VLLMFalcon"), ("gpt_neox", "VLLMGPTNeoX"), ("mpt", "VLLMMPT"), ("opt", "VLLMOPT"), ("stablelm", "VLLMStableLM"), ("starcoder", "VLLMStarCoder"), ("llama", "VLLMLlama")])
 MODEL_VLLM_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_VLLM_MAPPING_NAMES)
--- a/openllm-python/src/openllm/models/baichuan/__init__.py
+++ b/openllm-python/src/openllm/models/baichuan/__init__.py
@@ -2,14 +2,13 @@ from __future__ import annotations
 import sys, typing as t
 from openllm.exceptions import MissingDependencyError
 from openllm.utils import LazyModule, is_cpm_kernels_available, is_torch_available, is_vllm_available
+from openllm_core.config.configuration_baichuan import (
+  DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE,
+  START_BAICHUAN_COMMAND_DOCSTRING as START_BAICHUAN_COMMAND_DOCSTRING,
+  BaichuanConfig as BaichuanConfig,
+)

-_import_structure: dict[str, list[str]] = {"configuration_baichuan": ["BaichuanConfig", "START_BAICHUAN_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"]}
-if t.TYPE_CHECKING:
-  from .configuration_baichuan import (
-    DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE,
-    START_BAICHUAN_COMMAND_DOCSTRING as START_BAICHUAN_COMMAND_DOCSTRING,
-    BaichuanConfig as BaichuanConfig,
-  )
+_import_structure: dict[str, list[str]] = {}
 try:
  if not is_torch_available() or not is_cpm_kernels_available(): raise MissingDependencyError
 except MissingDependencyError: pass
--- a/openllm-python/src/openllm/models/baichuan/modeling_baichuan.py
+++ b/openllm-python/src/openllm/models/baichuan/modeling_baichuan.py
@@ -1,16 +1,11 @@
 from __future__ import annotations
 import typing as t, openllm
-from openllm._prompt import process_prompt
-from .configuration_baichuan import DEFAULT_PROMPT_TEMPLATE
-
-if t.TYPE_CHECKING: import torch, transformers
-else: torch, transformers = openllm.utils.LazyLoader("torch", globals(), "torch"), openllm.utils.LazyLoader("transformers", globals(), "transformers")
+if t.TYPE_CHECKING: import transformers

 class Baichuan(openllm.LLM["transformers.PreTrainedModel", "transformers.PreTrainedTokenizerBase"]):
  __openllm_internal__ = True
-  def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, top_p: float | None = None, temperature: float | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "top_p": top_p, "temperature": temperature, **attrs}, {}
-  def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str: return generation_result[0]
  def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
+    import torch
    inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
    with torch.inference_mode(), torch.autocast("cuda", dtype=torch.float16):  # type: ignore[attr-defined]
      outputs = self.model.generate(**inputs, generation_config=self.config.model_construct_env(**attrs).to_generation_config())
--- a/openllm-python/src/openllm/models/baichuan/modeling_vllm_baichuan.py
+++ b/openllm-python/src/openllm/models/baichuan/modeling_vllm_baichuan.py
@@ -1,10 +1,7 @@
 from __future__ import annotations
 import typing as t, openllm
-from openllm._prompt import process_prompt
-from .configuration_baichuan import DEFAULT_PROMPT_TEMPLATE
 if t.TYPE_CHECKING: import vllm, transformers

 class VLLMBaichuan(openllm.LLM["vllm.LLMEngine", "transformers.PreTrainedTokenizerBase"]):
  __openllm_internal__ = True
  tokenizer_id = "local"
-  def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, top_p: float | None = None, temperature: float | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "top_p": top_p, "temperature": temperature, **attrs}, {}
--- a/openllm-python/src/openllm/models/chatglm/__init__.py
+++ b/openllm-python/src/openllm/models/chatglm/__init__.py
@@ -2,14 +2,13 @@ from __future__ import annotations
 import sys, typing as t
 from openllm.exceptions import MissingDependencyError
 from openllm.utils import LazyModule, is_cpm_kernels_available, is_torch_available
+from openllm_core.config.configuration_chatglm import (
+  DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE,
+  START_CHATGLM_COMMAND_DOCSTRING as START_CHATGLM_COMMAND_DOCSTRING,
+  ChatGLMConfig as ChatGLMConfig,
+)

-_import_structure: dict[str, list[str]] = {"configuration_chatglm": ["ChatGLMConfig", "START_CHATGLM_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"]}
-if t.TYPE_CHECKING:
-  from .configuration_chatglm import (
-    DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE,
-    START_CHATGLM_COMMAND_DOCSTRING as START_CHATGLM_COMMAND_DOCSTRING,
-    ChatGLMConfig as ChatGLMConfig,
-  )
+_import_structure: dict[str, list[str]] = {}
 try:
  if not is_torch_available() or not is_cpm_kernels_available(): raise MissingDependencyError
 except MissingDependencyError: pass
--- a/openllm-python/src/openllm/models/chatglm/modeling_chatglm.py
+++ b/openllm-python/src/openllm/models/chatglm/modeling_chatglm.py
@@ -1,32 +1,17 @@
 from __future__ import annotations
 import typing as t, openllm
-if t.TYPE_CHECKING: import torch, transformers, torch.nn.functional as F
-else: torch, transformers, F = openllm.utils.LazyLoader("torch", globals(), "torch"), openllm.utils.LazyLoader("transformers", globals(), "transformers"), openllm.utils.LazyLoader("F", globals(), "torch.nn.functional")
-
+if t.TYPE_CHECKING: import transformers
 class ChatGLM(openllm.LLM["transformers.PreTrainedModel", "transformers.PreTrainedTokenizerFast"]):
  __openllm_internal__ = True
-
-  def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, num_beams: int | None = None, top_p: float | None = None, temperature: float | None = None, chat_history: list[tuple[str, str]] | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
-    prompt_text = ""
-    if use_default_prompt_template and chat_history is not None:
-      for i, (old_query, response) in enumerate(chat_history): prompt_text += f"[Round {i}]\n问:{old_query}\n答:{response}\n"
-      prompt_text += f"[Round {len(chat_history)}]\n问:{prompt}\n答:"
-    else: prompt_text = prompt
-    postprocess_generate_kwargs = {"chat_history": chat_history if chat_history is not None else None}
-    return prompt_text, {"max_new_tokens": max_new_tokens, "num_beams": num_beams, "top_p": top_p, "temperature": temperature, **attrs}, postprocess_generate_kwargs
-  def postprocess_generate(self, prompt: str, generation_result: tuple[str, list[tuple[str, str]]], *, chat_history: list[tuple[str, str]] | None = None, **attrs: t.Any) -> str:
-    generated, history = generation_result
-    if self.config.retain_history:
-      if chat_history is None: raise ValueError("'retain_history' is True while there is no history provided.")
-      chat_history.extend(history)
-    return generated
  def generate(self, prompt: str, **attrs: t.Any) -> tuple[str, list[tuple[str, str]]]:
+    import torch
    with torch.inference_mode():
      self.model.eval()
      # Only use half precision if the model is not yet quantized
      if self.config.use_half_precision: self.model.half()
      return self.model.chat(self.tokenizer, prompt, generation_config=self.config.model_construct_env(**attrs).to_generation_config())
  def embeddings(self, prompts: list[str]) -> openllm.LLMEmbeddings:
+    import torch, torch.nn.functional as F
    embeddings: list[list[float]] = []
    num_tokens = 0
    for prompt in prompts:
--- a/openllm-python/src/openllm/models/dolly_v2/init.py
+++ b/openllm-python/src/openllm/models/dolly_v2/init.py
@@ -2,14 +2,13 @@ from __future__ import annotations
 import sys, typing as t
 from openllm.exceptions import MissingDependencyError
 from openllm.utils import LazyModule, is_torch_available, is_vllm_available
+from openllm_core.config.configuration_dolly_v2 import (
+  DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE,
+  START_DOLLY_V2_COMMAND_DOCSTRING as START_DOLLY_V2_COMMAND_DOCSTRING,
+  DollyV2Config as DollyV2Config,
+)

-_import_structure: dict[str, list[str]] = {"configuration_dolly_v2": ["DollyV2Config", "START_DOLLY_V2_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"]}
-if t.TYPE_CHECKING:
-  from .configuration_dolly_v2 import (
-    DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE,
-    START_DOLLY_V2_COMMAND_DOCSTRING as START_DOLLY_V2_COMMAND_DOCSTRING,
-    DollyV2Config as DollyV2Config,
-  )
+_import_structure: dict[str, list[str]] = {}
 try:
  if not is_torch_available(): raise MissingDependencyError
 except MissingDependencyError: pass
--- a/openllm-python/src/openllm/models/dolly_v2/modeling_dolly_v2.py
+++ b/openllm-python/src/openllm/models/dolly_v2/modeling_dolly_v2.py
@@ -1,8 +1,7 @@
 from __future__ import annotations
 import logging, re, typing as t, openllm
-from openllm._prompt import process_prompt
-from openllm._typing_compat import overload
-from .configuration_dolly_v2 import DEFAULT_PROMPT_TEMPLATE, END_KEY, RESPONSE_KEY, get_special_token_id
+from openllm_core._typing_compat import overload
+from openllm_core.config.configuration_dolly_v2 import DEFAULT_PROMPT_TEMPLATE, END_KEY, RESPONSE_KEY, get_special_token_id

 if t.TYPE_CHECKING: import torch, transformers, tensorflow as tf
 else: torch, transformers, tf = openllm.utils.LazyLoader("torch", globals(), "torch"), openllm.utils.LazyLoader("transformers", globals(), "transformers"), openllm.utils.LazyLoader("tf", globals(), "tensorflow")
@@ -102,8 +101,6 @@ class DollyV2(openllm.LLM["transformers.Pipeline", "transformers.PreTrainedToken
  @property
  def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: return {"device_map": "auto" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None, "torch_dtype": torch.bfloat16}, {}
  def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.Pipeline: return get_pipeline(transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, *args, **attrs), self.tokenizer, _init=True, return_full_text=self.config.return_full_text)
-  def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_k: int | None = None, top_p: float | None = None, use_default_prompt_template: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "top_k": top_k, "top_p": top_p, "temperature": temperature, **attrs}, {}
-  def postprocess_generate(self, prompt: str, generation_result: list[dict[t.Literal["generated_text"], str]], **_: t.Any) -> str: return generation_result[0]["generated_text"]
  def generate(self, prompt: str, **attrs: t.Any) -> list[dict[t.Literal["generated_text"], str]]:
    llm_config = self.config.model_construct_env(**attrs)
    with torch.inference_mode(): return self.model(prompt, return_full_text=llm_config.return_full_text, generation_config=llm_config.to_generation_config())
--- a/openllm-python/src/openllm/models/dolly_v2/modeling_vllm_dolly_v2.py
+++ b/openllm-python/src/openllm/models/dolly_v2/modeling_vllm_dolly_v2.py
@@ -1,11 +1,8 @@
 from __future__ import annotations
 import logging, typing as t, openllm
-from openllm._prompt import process_prompt
-from .configuration_dolly_v2 import DEFAULT_PROMPT_TEMPLATE
 if t.TYPE_CHECKING: import vllm, transformers

 logger = logging.getLogger(__name__)
 class VLLMDollyV2(openllm.LLM["vllm.LLMEngine", "transformers.PreTrainedTokenizer"]):
  __openllm_internal__ = True
  tokenizer_id = "local"
-  def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_k: int | None = None, top_p: float | None = None, use_default_prompt_template: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "top_k": top_k, "top_p": top_p, "temperature": temperature, **attrs}, {}
--- a/openllm-python/src/openllm/models/falcon/init.py
+++ b/openllm-python/src/openllm/models/falcon/init.py
@@ -2,14 +2,13 @@ from __future__ import annotations
 import sys, typing as t
 from openllm.exceptions import MissingDependencyError
 from openllm.utils import LazyModule, is_torch_available, is_vllm_available
+from openllm_core.config.configuration_falcon import (
+  DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE,
+  START_FALCON_COMMAND_DOCSTRING as START_FALCON_COMMAND_DOCSTRING,
+  FalconConfig as FalconConfig,
+)

-_import_structure: dict[str, list[str]] = {"configuration_falcon": ["FalconConfig", "START_FALCON_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"]}
-if t.TYPE_CHECKING:
-  from .configuration_falcon import (
-    DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE,
-    START_FALCON_COMMAND_DOCSTRING as START_FALCON_COMMAND_DOCSTRING,
-    FalconConfig as FalconConfig,
-  )
+_import_structure: dict[str, list[str]] = {}
 try:
  if not is_torch_available(): raise MissingDependencyError
 except MissingDependencyError: pass
--- a/openllm-python/src/openllm/models/falcon/modeling_falcon.py
+++ b/openllm-python/src/openllm/models/falcon/modeling_falcon.py
@@ -1,7 +1,5 @@
 from __future__ import annotations
 import typing as t, openllm
-from openllm._prompt import process_prompt
-from .configuration_falcon import DEFAULT_PROMPT_TEMPLATE
 if t.TYPE_CHECKING: import torch, transformers
 else: torch, transformers = openllm.utils.LazyLoader("torch", globals(), "torch"), openllm.utils.LazyLoader("transformers", globals(), "transformers")

@@ -9,8 +7,6 @@ class Falcon(openllm.LLM["transformers.PreTrainedModel", "transformers.PreTraine
  __openllm_internal__ = True
  @property
  def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: return {"torch_dtype": torch.bfloat16, "device_map": "auto" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None}, {}
-  def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, top_k: int | None = None, num_return_sequences: int | None = None, eos_token_id: int | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "top_k": top_k, "num_return_sequences": num_return_sequences, "eos_token_id": eos_token_id, **attrs}, {}
-  def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str: return generation_result[0]
  def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
    eos_token_id, inputs = attrs.pop("eos_token_id", self.tokenizer.eos_token_id), self.tokenizer(prompt, return_tensors="pt").to(self.device)
    with torch.inference_mode(), torch.autocast("cuda", dtype=torch.float16):  # type: ignore[attr-defined]
--- a/openllm-python/src/openllm/models/falcon/modeling_vllm_falcon.py
+++ b/openllm-python/src/openllm/models/falcon/modeling_vllm_falcon.py
@@ -1,11 +1,8 @@
 from __future__ import annotations
 import logging, typing as t, openllm
-from openllm._prompt import process_prompt
-from .configuration_falcon import DEFAULT_PROMPT_TEMPLATE
 if t.TYPE_CHECKING: import vllm, transformers

 logger = logging.getLogger(__name__)
 class VLLMFalcon(openllm.LLM["vllm.LLMEngine", "transformers.PreTrainedTokenizerBase"]):
  __openllm_internal__ = True
  tokenizer_id = "local"
-  def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, top_k: int | None = None, num_return_sequences: int | None = None, eos_token_id: int | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "top_k": top_k, "num_return_sequences": num_return_sequences, "eos_token_id": eos_token_id, **attrs}, {}
--- a/Show More
+++ b/Show More