refactor: monorepo (#203)

2026-05-19 14:16:22 -04:00 · 2023-08-15 02:11:14 -04:00
parent 2d33100d72
commit cd872ef631
178 changed files with 1703 additions and 586 deletions
--- a/openllm-python/.git_archival.txt
+++ b/openllm-python/.git_archival.txt
@@ -0,0 +1 @@
+../.git_archival.txt
--- a/openllm-python/LICENSE.md
+++ b/openllm-python/LICENSE.md
@@ -0,0 +1,194 @@
+Apache License
+==============
+
+_Version 2.0, January 2004_
+_&lt;<http://www.apache.org/licenses/>&gt;_
+
+### Terms and Conditions for use, reproduction, and distribution
+
+#### 1. Definitions
+
+“License” shall mean the terms and conditions for use, reproduction, and
+distribution as defined by Sections 1 through 9 of this document.
+
+“Licensor” shall mean the copyright owner or entity authorized by the copyright
+owner that is granting the License.
+
+“Legal Entity” shall mean the union of the acting entity and all other entities
+that control, are controlled by, or are under common control with that entity.
+For the purposes of this definition, “control” means **(i)** the power, direct or
+indirect, to cause the direction or management of such entity, whether by
+contract or otherwise, or **(ii)** ownership of fifty percent (50%) or more of the
+outstanding shares, or **(iii)** beneficial ownership of such entity.
+
+“You” (or “Your”) shall mean an individual or Legal Entity exercising
+permissions granted by this License.
+
+“Source” form shall mean the preferred form for making modifications, including
+but not limited to software source code, documentation source, and configuration
+files.
+
+“Object” form shall mean any form resulting from mechanical transformation or
+translation of a Source form, including but not limited to compiled object code,
+generated documentation, and conversions to other media types.
+
+“Work” shall mean the work of authorship, whether in Source or Object form, made
+available under the License, as indicated by a copyright notice that is included
+in or attached to the work (an example is provided in the Appendix below).
+
+“Derivative Works” shall mean any work, whether in Source or Object form, that
+is based on (or derived from) the Work and for which the editorial revisions,
+annotations, elaborations, or other modifications represent, as a whole, an
+original work of authorship. For the purposes of this License, Derivative Works
+shall not include works that remain separable from, or merely link (or bind by
+name) to the interfaces of, the Work and Derivative Works thereof.
+
+“Contribution” shall mean any work of authorship, including the original version
+of the Work and any modifications or additions to that Work or Derivative Works
+thereof, that is intentionally submitted to Licensor for inclusion in the Work
+by the copyright owner or by an individual or Legal Entity authorized to submit
+on behalf of the copyright owner. For the purposes of this definition,
+“submitted” means any form of electronic, verbal, or written communication sent
+to the Licensor or its representatives, including but not limited to
+communication on electronic mailing lists, source code control systems, and
+issue tracking systems that are managed by, or on behalf of, the Licensor for
+the purpose of discussing and improving the Work, but excluding communication
+that is conspicuously marked or otherwise designated in writing by the copyright
+owner as “Not a Contribution.”
+
+“Contributor” shall mean Licensor and any individual or Legal Entity on behalf
+of whom a Contribution has been received by Licensor and subsequently
+incorporated within the Work.
+
+#### 2. Grant of Copyright License
+
+Subject to the terms and conditions of this License, each Contributor hereby
+grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free,
+irrevocable copyright license to reproduce, prepare Derivative Works of,
+publicly display, publicly perform, sublicense, and distribute the Work and such
+Derivative Works in Source or Object form.
+
+#### 3. Grant of Patent License
+
+Subject to the terms and conditions of this License, each Contributor hereby
+grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free,
+irrevocable (except as stated in this section) patent license to make, have
+made, use, offer to sell, sell, import, and otherwise transfer the Work, where
+such license applies only to those patent claims licensable by such Contributor
+that are necessarily infringed by their Contribution(s) alone or by combination
+of their Contribution(s) with the Work to which such Contribution(s) was
+submitted. If You institute patent litigation against any entity (including a
+cross-claim or counterclaim in a lawsuit) alleging that the Work or a
+Contribution incorporated within the Work constitutes direct or contributory
+patent infringement, then any patent licenses granted to You under this License
+for that Work shall terminate as of the date such litigation is filed.
+
+#### 4. Redistribution
+
+You may reproduce and distribute copies of the Work or Derivative Works thereof
+in any medium, with or without modifications, and in Source or Object form,
+provided that You meet the following conditions:
+
+* **(a)** You must give any other recipients of the Work or Derivative Works a copy of
+this License; and
+* **(b)** You must cause any modified files to carry prominent notices stating that You
+changed the files; and
+* **(c)** You must retain, in the Source form of any Derivative Works that You distribute,
+all copyright, patent, trademark, and attribution notices from the Source form
+of the Work, excluding those notices that do not pertain to any part of the
+Derivative Works; and
+* **(d)** If the Work includes a “NOTICE” text file as part of its distribution, then any
+Derivative Works that You distribute must include a readable copy of the
+attribution notices contained within such NOTICE file, excluding those notices
+that do not pertain to any part of the Derivative Works, in at least one of the
+following places: within a NOTICE text file distributed as part of the
+Derivative Works; within the Source form or documentation, if provided along
+with the Derivative Works; or, within a display generated by the Derivative
+Works, if and wherever such third-party notices normally appear. The contents of
+the NOTICE file are for informational purposes only and do not modify the
+License. You may add Your own attribution notices within Derivative Works that
+You distribute, alongside or as an addendum to the NOTICE text from the Work,
+provided that such additional attribution notices cannot be construed as
+modifying the License.
+
+You may add Your own copyright statement to Your modifications and may provide
+additional or different license terms and conditions for use, reproduction, or
+distribution of Your modifications, or for any such Derivative Works as a whole,
+provided Your use, reproduction, and distribution of the Work otherwise complies
+with the conditions stated in this License.
+
+#### 5. Submission of Contributions
+
+Unless You explicitly state otherwise, any Contribution intentionally submitted
+for inclusion in the Work by You to the Licensor shall be under the terms and
+conditions of this License, without any additional terms or conditions.
+Notwithstanding the above, nothing herein shall supersede or modify the terms of
+any separate license agreement you may have executed with Licensor regarding
+such Contributions.
+
+#### 6. Trademarks
+
+This License does not grant permission to use the trade names, trademarks,
+service marks, or product names of the Licensor, except as required for
+reasonable and customary use in describing the origin of the Work and
+reproducing the content of the NOTICE file.
+
+#### 7. Disclaimer of Warranty
+
+Unless required by applicable law or agreed to in writing, Licensor provides the
+Work (and each Contributor provides its Contributions) on an “AS IS” BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied,
+including, without limitation, any warranties or conditions of TITLE,
+NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are
+solely responsible for determining the appropriateness of using or
+redistributing the Work and assume any risks associated with Your exercise of
+permissions under this License.
+
+#### 8. Limitation of Liability
+
+In no event and under no legal theory, whether in tort (including negligence),
+contract, or otherwise, unless required by applicable law (such as deliberate
+and grossly negligent acts) or agreed to in writing, shall any Contributor be
+liable to You for damages, including any direct, indirect, special, incidental,
+or consequential damages of any character arising as a result of this License or
+out of the use or inability to use the Work (including but not limited to
+damages for loss of goodwill, work stoppage, computer failure or malfunction, or
+any and all other commercial damages or losses), even if such Contributor has
+been advised of the possibility of such damages.
+
+#### 9. Accepting Warranty or Additional Liability
+
+While redistributing the Work or Derivative Works thereof, You may choose to
+offer, and charge a fee for, acceptance of support, warranty, indemnity, or
+other liability obligations and/or rights consistent with this License. However,
+in accepting such obligations, You may act only on Your own behalf and on Your
+sole responsibility, not on behalf of any other Contributor, and only if You
+agree to indemnify, defend, and hold each Contributor harmless for any liability
+incurred by, or claims asserted against, such Contributor by reason of your
+accepting any such warranty or additional liability.
+
+_END OF TERMS AND CONDITIONS_
+
+### APPENDIX: How to apply the Apache License to your work
+
+To apply the Apache License to your work, attach the following boilerplate
+notice, with the fields enclosed by brackets `[]` replaced with your own
+identifying information. (Don't include the brackets!) The text should be
+enclosed in the appropriate comment syntax for the file format. We also
+recommend that a file or class name and description of purpose be included on
+the same “printed page” as the copyright notice for easier identification within
+third-party archives.
+
+    Copyright 2023 Atalaya Tech Inc.
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
--- a/openllm-python/README.md
+++ b/openllm-python/README.md
@@ -0,0 +1,786 @@
+![Banner for OpenLLM](/.github/assets/main-banner.png)
+
+<!-- hatch-fancy-pypi-readme intro start -->
+
+<div align="center">
+    <h1 align="center">🦾 OpenLLM</h1>
+    <a href="https://pypi.org/project/openllm">
+        <img src="https://img.shields.io/pypi/v/openllm.svg?logo=pypi&label=PyPI&logoColor=gold" alt="pypi_status" />
+    </a><a href="https://twitter.com/bentomlai">
+        <img src="https://badgen.net/badge/icon/@bentomlai/1DA1F2?icon=twitter&label=Follow%20Us" alt="Twitter" />
+    </a><a href="https://l.bentoml.com/join-openllm-discord">
+        <img src="https://badgen.net/badge/icon/OpenLLM/7289da?icon=discord&label=Join%20Us" alt="Discord" />
+    </a><a href="https://github.com/bentoml/OpenLLM/actions/workflows/ci.yml">
+        <img src="https://github.com/bentoml/OpenLLM/actions/workflows/ci.yml/badge.svg?branch=main" alt="ci" />
+    </a><a href="https://results.pre-commit.ci/latest/github/bentoml/OpenLLM/main">
+        <img src="https://results.pre-commit.ci/badge/github/bentoml/OpenLLM/main.svg" alt="pre-commit.ci status" />
+    </a><br>
+    <a href="https://pypi.org/project/openllm">
+        <img src="https://img.shields.io/pypi/pyversions/openllm.svg?logo=python&label=Python&logoColor=gold" alt="python_version" />
+    </a><a href="https://github.com/pypa/hatch">
+        <img src="https://img.shields.io/badge/%F0%9F%A5%9A-Hatch-4051b5.svg" alt="Hatch" />
+    </a><a href="https://github.com/bentoml/OpenLLM/blob/main/STYLE.md">
+        <img src="https://img.shields.io/badge/code%20style-experimental-000000.svg" alt="code style" />
+    </a><a href="https://github.com/astral-sh/ruff">
+        <img src="https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/charliermarsh/ruff/main/assets/badge/v2.json" alt="Ruff" />
+    </a><a href="https://github.com/python/mypy">
+        <img src="https://img.shields.io/badge/types-mypy-blue.svg" alt="types - mypy" />
+    </a><a href="https://github.com/microsoft/pyright">
+        <img src="https://img.shields.io/badge/types-pyright-yellow.svg" alt="types - pyright" />
+    </a><br>
+    <p>An open platform for operating large language models (LLMs) in production.<br>
+    Fine-tune, serve, deploy, and monitor any LLMs with ease.</p>
+    <i></i>
+</div>
+
+## 📖 Introduction
+
+With OpenLLM, you can run inference with any open-source large-language models,
+deploy to the cloud or on-premises, and build powerful AI apps.
+
+🚂 **State-of-the-art LLMs**: built-in support for a wide range of open-source
+LLMs and model runtimes, including Llama 2, StableLM, Falcon, Dolly, Flan-T5,
+ChatGLM, StarCoder and more.
+
+🔥 **Flexible APIs**: serve LLMs over RESTful API or gRPC with one command,
+query via WebUI, CLI, our Python/JavaScript client, or any HTTP client.
+
+⛓️ **Freedom To Build**: First-class support for LangChain, BentoML and Hugging
+Face that allows you to easily create your own AI apps by composing LLMs with
+other models and services.
+
+🎯 **Streamline Deployment**: Automatically generate your LLM server Docker
+images or deploy as a serverless endpoint via
+[☁️ BentoCloud](https://l.bentoml.com/bento-cloud).
+
+🤖️ **Bring your own LLM**: Fine-tune any LLM to suit your needs with
+`LLM.tuning()`. (Coming soon)
+
+<!-- hatch-fancy-pypi-readme intro stop -->
+
+![Gif showing OpenLLM Intro](/.github/assets/output.gif)
+
+<br/>
+
+<!-- hatch-fancy-pypi-readme interim start -->
+
+## 🏃 Getting Started
+
+To use OpenLLM, you need to have Python 3.8 (or newer) and `pip` installed on
+your system. We highly recommend using a Virtual Environment to prevent package
+conflicts.
+
+You can install OpenLLM using pip as follows:
+
+```bash
+pip install openllm
+```
+
+To verify if it's installed correctly, run:
+
+```
+$ openllm -h
+
+Usage: openllm [OPTIONS] COMMAND [ARGS]...
+
+   ██████╗ ██████╗ ███████╗███╗   ██╗██╗     ██╗     ███╗   ███╗
+  ██╔═══██╗██╔══██╗██╔════╝████╗  ██║██║     ██║     ████╗ ████║
+  ██║   ██║██████╔╝█████╗  ██╔██╗ ██║██║     ██║     ██╔████╔██║
+  ██║   ██║██╔═══╝ ██╔══╝  ██║╚██╗██║██║     ██║     ██║╚██╔╝██║
+  ╚██████╔╝██║     ███████╗██║ ╚████║███████╗███████╗██║ ╚═╝ ██║
+   ╚═════╝ ╚═╝     ╚══════╝╚═╝  ╚═══╝╚══════╝╚══════╝╚═╝     ╚═╝
+
+  An open platform for operating large language models in production.
+  Fine-tune, serve, deploy, and monitor any LLMs with ease.
+```
+
+### Starting an LLM Server
+
+To start an LLM server, use `openllm start`. For example, to start an
+[`OPT`](https://huggingface.co/docs/transformers/model_doc/opt) server, do the
+following:
+
+```bash
+openllm start opt
+```
+
+Following this, a Web UI will be accessible at http://localhost:3000 where you
+can experiment with the endpoints and sample input prompts.
+
+OpenLLM provides a built-in Python client, allowing you to interact with the
+model. In a different terminal window or a Jupyter Notebook, create a client to
+start interacting with the model:
+
+```python
+import openllm
+client = openllm.client.HTTPClient('http://localhost:3000')
+client.query('Explain to me the difference between "further" and "farther"')
+```
+
+You can also use the `openllm query` command to query the model from the
+terminal:
+
+```bash
+export OPENLLM_ENDPOINT=http://localhost:3000
+openllm query 'Explain to me the difference between "further" and "farther"'
+```
+
+Visit `http://localhost:3000/docs.json` for OpenLLM's API specification.
+
+OpenLLM seamlessly supports many models and their variants. Users can also
+specify different variants of the model to be served, by providing the
+`--model-id` argument, e.g.:
+
+```bash
+openllm start flan-t5 --model-id google/flan-t5-large
+```
+
+> [!NOTE]
+> `openllm` also supports all variants of fine-tuned weights, custom
+> model paths, and quantized weights for any of the supported models, as
+> long as they can be loaded with the model architecture. Refer to the
+> [supported models](https://github.com/bentoml/OpenLLM/tree/main#-supported-models)
+> section for each model's architecture.
+
+Use the `openllm models` command to see the list of models and their variants
+supported in OpenLLM.
+
+## 🧩 Supported Models
+
+The following models are currently supported in OpenLLM. By default, OpenLLM
+doesn't include dependencies to run all models. The extra model-specific
+dependencies can be installed with the instructions below:
+
+<!-- update-readme.py: start -->
+
+<table align='center'>
+<tr>
+<th>Model</th>
+<th>Architecture</th>
+<th>Model Ids</th>
+<th>Installation</th>
+</tr>
+<tr>
+
+<td><a href=https://github.com/THUDM/ChatGLM-6B>chatglm</a></td>
+<td><a href=https://github.com/THUDM/ChatGLM-6B><code>ChatGLMForConditionalGeneration</code></a></td>
+<td>
+
+<ul><li><a href=https://huggingface.co/thudm/chatglm-6b><code>thudm/chatglm-6b</code></a></li>
+<li><a href=https://huggingface.co/thudm/chatglm-6b-int8><code>thudm/chatglm-6b-int8</code></a></li>
+<li><a href=https://huggingface.co/thudm/chatglm-6b-int4><code>thudm/chatglm-6b-int4</code></a></li>
+<li><a href=https://huggingface.co/thudm/chatglm2-6b><code>thudm/chatglm2-6b</code></a></li>
+<li><a href=https://huggingface.co/thudm/chatglm2-6b-int4><code>thudm/chatglm2-6b-int4</code></a></li></ul>
+
+</td>
+<td>
+
+```bash
+pip install "openllm[chatglm]"
+```
+
+</td>
+</tr>
+<tr>
+
+<td><a href=https://github.com/databrickslabs/dolly>dolly-v2</a></td>
+<td><a href=https://huggingface.co/docs/transformers/main/model_doc/gpt_neox#transformers.GPTNeoXForCausalLM><code>GPTNeoXForCausalLM</code></a></td>
+<td>
+
+<ul><li><a href=https://huggingface.co/databricks/dolly-v2-3b><code>databricks/dolly-v2-3b</code></a></li>
+<li><a href=https://huggingface.co/databricks/dolly-v2-7b><code>databricks/dolly-v2-7b</code></a></li>
+<li><a href=https://huggingface.co/databricks/dolly-v2-12b><code>databricks/dolly-v2-12b</code></a></li></ul>
+
+</td>
+<td>
+
+```bash
+pip install openllm
+```
+
+</td>
+</tr>
+<tr>
+
+<td><a href=https://falconllm.tii.ae/>falcon</a></td>
+<td><a href=https://falconllm.tii.ae/><code>FalconForCausalLM</code></a></td>
+<td>
+
+<ul><li><a href=https://huggingface.co/tiiuae/falcon-7b><code>tiiuae/falcon-7b</code></a></li>
+<li><a href=https://huggingface.co/tiiuae/falcon-40b><code>tiiuae/falcon-40b</code></a></li>
+<li><a href=https://huggingface.co/tiiuae/falcon-7b-instruct><code>tiiuae/falcon-7b-instruct</code></a></li>
+<li><a href=https://huggingface.co/tiiuae/falcon-40b-instruct><code>tiiuae/falcon-40b-instruct</code></a></li></ul>
+
+</td>
+<td>
+
+```bash
+pip install "openllm[falcon]"
+```
+
+</td>
+</tr>
+<tr>
+
+<td><a href=https://huggingface.co/docs/transformers/model_doc/flan-t5>flan-t5</a></td>
+<td><a href=https://huggingface.co/docs/transformers/main/model_doc/t5#transformers.T5ForConditionalGeneration><code>T5ForConditionalGeneration</code></a></td>
+<td>
+
+<ul><li><a href=https://huggingface.co/google/flan-t5-small><code>google/flan-t5-small</code></a></li>
+<li><a href=https://huggingface.co/google/flan-t5-base><code>google/flan-t5-base</code></a></li>
+<li><a href=https://huggingface.co/google/flan-t5-large><code>google/flan-t5-large</code></a></li>
+<li><a href=https://huggingface.co/google/flan-t5-xl><code>google/flan-t5-xl</code></a></li>
+<li><a href=https://huggingface.co/google/flan-t5-xxl><code>google/flan-t5-xxl</code></a></li></ul>
+
+</td>
+<td>
+
+```bash
+pip install "openllm[flan-t5]"
+```
+
+</td>
+</tr>
+<tr>
+
+<td><a href=https://github.com/EleutherAI/gpt-neox>gpt-neox</a></td>
+<td><a href=https://huggingface.co/docs/transformers/main/model_doc/gpt_neox#transformers.GPTNeoXForCausalLM><code>GPTNeoXForCausalLM</code></a></td>
+<td>
+
+<ul><li><a href=https://huggingface.co/eleutherai/gpt-neox-20b><code>eleutherai/gpt-neox-20b</code></a></li></ul>
+
+</td>
+<td>
+
+```bash
+pip install openllm
+```
+
+</td>
+</tr>
+<tr>
+
+<td><a href=https://github.com/facebookresearch/llama>llama</a></td>
+<td><a href=https://huggingface.co/docs/transformers/main/model_doc/llama#transformers.LlamaForCausalLM><code>LlamaForCausalLM</code></a></td>
+<td>
+
+<ul><li><a href=https://huggingface.co/meta-llama/Llama-2-70b-chat-hf><code>meta-llama/Llama-2-70b-chat-hf</code></a></li>
+<li><a href=https://huggingface.co/meta-llama/Llama-2-13b-chat-hf><code>meta-llama/Llama-2-13b-chat-hf</code></a></li>
+<li><a href=https://huggingface.co/meta-llama/Llama-2-7b-chat-hf><code>meta-llama/Llama-2-7b-chat-hf</code></a></li>
+<li><a href=https://huggingface.co/meta-llama/Llama-2-70b-hf><code>meta-llama/Llama-2-70b-hf</code></a></li>
+<li><a href=https://huggingface.co/meta-llama/Llama-2-13b-hf><code>meta-llama/Llama-2-13b-hf</code></a></li>
+<li><a href=https://huggingface.co/meta-llama/Llama-2-7b-hf><code>meta-llama/Llama-2-7b-hf</code></a></li>
+<li><a href=https://huggingface.co/NousResearch/llama-2-70b-chat-hf><code>NousResearch/llama-2-70b-chat-hf</code></a></li>
+<li><a href=https://huggingface.co/NousResearch/llama-2-13b-chat-hf><code>NousResearch/llama-2-13b-chat-hf</code></a></li>
+<li><a href=https://huggingface.co/NousResearch/llama-2-7b-chat-hf><code>NousResearch/llama-2-7b-chat-hf</code></a></li>
+<li><a href=https://huggingface.co/NousResearch/llama-2-70b-hf><code>NousResearch/llama-2-70b-hf</code></a></li>
+<li><a href=https://huggingface.co/NousResearch/llama-2-13b-hf><code>NousResearch/llama-2-13b-hf</code></a></li>
+<li><a href=https://huggingface.co/NousResearch/llama-2-7b-hf><code>NousResearch/llama-2-7b-hf</code></a></li>
+<li><a href=https://huggingface.co/openlm-research/open_llama_7b_v2><code>openlm-research/open_llama_7b_v2</code></a></li>
+<li><a href=https://huggingface.co/openlm-research/open_llama_3b_v2><code>openlm-research/open_llama_3b_v2</code></a></li>
+<li><a href=https://huggingface.co/openlm-research/open_llama_13b><code>openlm-research/open_llama_13b</code></a></li>
+<li><a href=https://huggingface.co/huggyllama/llama-65b><code>huggyllama/llama-65b</code></a></li>
+<li><a href=https://huggingface.co/huggyllama/llama-30b><code>huggyllama/llama-30b</code></a></li>
+<li><a href=https://huggingface.co/huggyllama/llama-13b><code>huggyllama/llama-13b</code></a></li>
+<li><a href=https://huggingface.co/huggyllama/llama-7b><code>huggyllama/llama-7b</code></a></li></ul>
+
+</td>
+<td>
+
+```bash
+pip install "openllm[llama]"
+```
+
+</td>
+</tr>
+<tr>
+
+<td><a href=https://huggingface.co/mosaicml>mpt</a></td>
+<td><a href=https://huggingface.co/mosaicml><code>MPTForCausalLM</code></a></td>
+<td>
+
+<ul><li><a href=https://huggingface.co/mosaicml/mpt-7b><code>mosaicml/mpt-7b</code></a></li>
+<li><a href=https://huggingface.co/mosaicml/mpt-7b-instruct><code>mosaicml/mpt-7b-instruct</code></a></li>
+<li><a href=https://huggingface.co/mosaicml/mpt-7b-chat><code>mosaicml/mpt-7b-chat</code></a></li>
+<li><a href=https://huggingface.co/mosaicml/mpt-7b-storywriter><code>mosaicml/mpt-7b-storywriter</code></a></li>
+<li><a href=https://huggingface.co/mosaicml/mpt-30b><code>mosaicml/mpt-30b</code></a></li>
+<li><a href=https://huggingface.co/mosaicml/mpt-30b-instruct><code>mosaicml/mpt-30b-instruct</code></a></li>
+<li><a href=https://huggingface.co/mosaicml/mpt-30b-chat><code>mosaicml/mpt-30b-chat</code></a></li></ul>
+
+</td>
+<td>
+
+```bash
+pip install "openllm[mpt]"
+```
+
+</td>
+</tr>
+<tr>
+
+<td><a href=https://huggingface.co/docs/transformers/model_doc/opt>opt</a></td>
+<td><a href=https://huggingface.co/docs/transformers/main/model_doc/opt#transformers.OPTForCausalLM><code>OPTForCausalLM</code></a></td>
+<td>
+
+<ul><li><a href=https://huggingface.co/facebook/opt-125m><code>facebook/opt-125m</code></a></li>
+<li><a href=https://huggingface.co/facebook/opt-350m><code>facebook/opt-350m</code></a></li>
+<li><a href=https://huggingface.co/facebook/opt-1.3b><code>facebook/opt-1.3b</code></a></li>
+<li><a href=https://huggingface.co/facebook/opt-2.7b><code>facebook/opt-2.7b</code></a></li>
+<li><a href=https://huggingface.co/facebook/opt-6.7b><code>facebook/opt-6.7b</code></a></li>
+<li><a href=https://huggingface.co/facebook/opt-66b><code>facebook/opt-66b</code></a></li></ul>
+
+</td>
+<td>
+
+```bash
+pip install "openllm[opt]"
+```
+
+</td>
+</tr>
+<tr>
+
+<td><a href=https://github.com/Stability-AI/StableLM>stablelm</a></td>
+<td><a href=https://huggingface.co/docs/transformers/main/model_doc/gpt_neox#transformers.GPTNeoXForCausalLM><code>GPTNeoXForCausalLM</code></a></td>
+<td>
+
+<ul><li><a href=https://huggingface.co/stabilityai/stablelm-tuned-alpha-3b><code>stabilityai/stablelm-tuned-alpha-3b</code></a></li>
+<li><a href=https://huggingface.co/stabilityai/stablelm-tuned-alpha-7b><code>stabilityai/stablelm-tuned-alpha-7b</code></a></li>
+<li><a href=https://huggingface.co/stabilityai/stablelm-base-alpha-3b><code>stabilityai/stablelm-base-alpha-3b</code></a></li>
+<li><a href=https://huggingface.co/stabilityai/stablelm-base-alpha-7b><code>stabilityai/stablelm-base-alpha-7b</code></a></li></ul>
+
+</td>
+<td>
+
+```bash
+pip install openllm
+```
+
+</td>
+</tr>
+<tr>
+
+<td><a href=https://github.com/bigcode-project/starcoder>starcoder</a></td>
+<td><a href=https://huggingface.co/docs/transformers/main/model_doc/gpt_bigcode#transformers.GPTBigCodeForCausalLM><code>GPTBigCodeForCausalLM</code></a></td>
+<td>
+
+<ul><li><a href=https://huggingface.co/bigcode/starcoder><code>bigcode/starcoder</code></a></li>
+<li><a href=https://huggingface.co/bigcode/starcoderbase><code>bigcode/starcoderbase</code></a></li></ul>
+
+</td>
+<td>
+
+```bash
+pip install "openllm[starcoder]"
+```
+
+</td>
+</tr>
+<tr>
+
+<td><a href=https://github.com/baichuan-inc/Baichuan-7B>baichuan</a></td>
+<td><a href=https://github.com/baichuan-inc/Baichuan-7B><code>BaiChuanForCausalLM</code></a></td>
+<td>
+
+<ul><li><a href=https://huggingface.co/baichuan-inc/baichuan-7b><code>baichuan-inc/baichuan-7b</code></a></li>
+<li><a href=https://huggingface.co/baichuan-inc/baichuan-13b-base><code>baichuan-inc/baichuan-13b-base</code></a></li>
+<li><a href=https://huggingface.co/baichuan-inc/baichuan-13b-chat><code>baichuan-inc/baichuan-13b-chat</code></a></li>
+<li><a href=https://huggingface.co/fireballoon/baichuan-vicuna-chinese-7b><code>fireballoon/baichuan-vicuna-chinese-7b</code></a></li>
+<li><a href=https://huggingface.co/fireballoon/baichuan-vicuna-7b><code>fireballoon/baichuan-vicuna-7b</code></a></li>
+<li><a href=https://huggingface.co/hiyouga/baichuan-7b-sft><code>hiyouga/baichuan-7b-sft</code></a></li></ul>
+
+</td>
+<td>
+
+```bash
+pip install "openllm[baichuan]"
+```
+
+</td>
+</tr>
+</table>
+
+<!-- update-readme.py: stop -->
+
+### Runtime Implementations (Experimental)
+
+Different LLMs may have multiple runtime implementations. For instance, they
+might use PyTorch (`pt`), TensorFlow (`tf`), or Flax (`flax`).
+
+If you wish to specify a particular runtime for a model, you can do so by
+setting the `OPENLLM_{MODEL_NAME}_FRAMEWORK={runtime}` environment variable
+before running `openllm start`.
+
+For example, if you want to use the TensorFlow (`tf`) implementation for the
+`flan-t5` model, you can use the following command:
+
+```bash
+OPENLLM_FLAN_T5_FRAMEWORK=tf openllm start flan-t5
+```
+
+> [!NOTE]
+> For GPU support on Flax, refer to
+> [JAX's installation](https://github.com/google/jax#pip-installation-gpu-cuda-installed-via-pip-easier)
+> to make sure that you have JAX support for the corresponding CUDA version.
+
+### Quantisation
+
+OpenLLM supports quantisation with
+[bitsandbytes](https://github.com/TimDettmers/bitsandbytes) and
+[GPTQ](https://arxiv.org/abs/2210.17323).
+
+```bash
+openllm start mpt --quantize int8
+```
+
+To run inference with `gptq`, simply pass `--quantize gptq`:
+
+```bash
+openllm start falcon --model-id TheBloke/falcon-40b-instruct-GPTQ --quantize gptq --device 0
+```
+
+> [!NOTE]
+> In order to run GPTQ, make sure to install with
+> `pip install "openllm[gptq]"`. The weights of all supported models should be
+> quantized before serving. See
+> [GPTQ-for-LLaMa](https://github.com/qwopqwop200/GPTQ-for-LLaMa) for more
+> information on GPTQ quantisation.
+
+### Fine-tuning support (Experimental)
+
+One can serve OpenLLM models with any PEFT-compatible layers with
+`--adapter-id`:
+
+```bash
+openllm start opt --model-id facebook/opt-6.7b --adapter-id aarnphm/opt-6-7b-quotes
+```
+
+It also supports adapters from custom paths:
+
+```bash
+openllm start opt --model-id facebook/opt-6.7b --adapter-id /path/to/adapters
+```
+
+To use multiple adapters, use the following format:
+
+```bash
+openllm start opt --model-id facebook/opt-6.7b --adapter-id aarnphm/opt-6.7b-lora --adapter-id aarnphm/opt-6.7b-lora:french_lora
+```
+
+By default, the first adapter-id will be the default Lora layer, but optionally
+users can change what Lora layer to use for inference via `/v1/adapters`:
+
+```bash
+curl -X POST http://localhost:3000/v1/adapters --json '{"adapter_name": "vn_lora"}'
+```
+
+Note that when multiple adapter names and adapter IDs are in use, it is
+recommended to update the server to use the default adapter before sending
+inference requests, to avoid any performance degradation.
+
+To include the adapters in the Bento, you can also pass `--adapter-id` to
+`openllm build`:
+
+```bash
+openllm build opt --model-id facebook/opt-6.7b --adapter-id ...
+```
+
+> [!NOTE]
+> We will gradually roll out support for fine-tuning all models. The
+> following models contain fine-tuning support: OPT, Falcon, LLaMA.
+
+### Integrating a New Model
+
+OpenLLM encourages contributions by welcoming users to incorporate their custom
+LLMs into the ecosystem. Check out
+[Adding a New Model Guide](https://github.com/bentoml/OpenLLM/blob/main/ADDING_NEW_MODEL.md)
+to see how you can do it yourself.
+
+### Embeddings
+
+OpenLLM tentatively provides an embeddings endpoint for supported models. This
+can be accessed via `/v1/embeddings`.
+
+To use via CLI, simply call `openllm embed`:
+
+```bash
+openllm embed --endpoint http://localhost:3000 "I like to eat apples" -o json
+{
+  "embeddings": [
+    0.006569798570126295,
+    -0.031249752268195152,
+    -0.008072729222476482,
+    0.00847396720200777,
+    -0.005293501541018486,
+    ...<many embeddings>...
+    -0.002078012563288212,
+    -0.00676426338031888,
+    -0.002022686880081892
+  ],
+  "num_tokens": 9
+}
+```
+
+To invoke this endpoint, use `client.embed` from the Python SDK:
+
+```python
+import openllm
+
+client = openllm.client.HTTPClient("http://localhost:3000")
+
+client.embed("I like to eat apples")
+```
+
+> [!NOTE]
+> Currently, the following model families support embeddings: Llama, T5
+> (Flan-T5, FastChat, etc.), ChatGLM
+
+## ⚙️ Integrations
+
+OpenLLM is not just a standalone product; it's a building block designed to
+integrate with other powerful tools easily. We currently offer integration with
+[BentoML](https://github.com/bentoml/BentoML),
+[LangChain](https://github.com/hwchase17/langchain), and
+[Transformers Agents](https://huggingface.co/docs/transformers/transformers_agents).
+
+### BentoML
+
+OpenLLM models can be integrated as a
+[Runner](https://docs.bentoml.com/en/latest/concepts/runner.html) in your
+BentoML service. These runners have a `generate` method that takes a string as a
+prompt and returns a corresponding output string. This will allow you to plug
+and play any OpenLLM models with your existing ML workflow.
+
+```python
+import bentoml
+import openllm
+
+model = "opt"
+
+llm_config = openllm.AutoConfig.for_model(model)
+llm_runner = openllm.Runner(model, llm_config=llm_config)
+
+svc = bentoml.Service(
+    name=f"llm-opt-service", runners=[llm_runner]
+)
+
+@svc.api(input=Text(), output=Text())
+async def prompt(input_text: str) -> str:
+    answer = await llm_runner.generate(input_text)
+    return answer
+```
+
+### [LangChain](https://python.langchain.com/docs/ecosystem/integrations/openllm)
+
+To quickly start a local LLM with `langchain`, simply do the following:
+
+```python
+from langchain.llms import OpenLLM
+
+llm = OpenLLM(model_name="llama", model_id='meta-llama/Llama-2-7b-hf')
+
+llm("What is the difference between a duck and a goose? And why there are so many Goose in Canada?")
+```
+
+> [!IMPORTANT]
+> By default, OpenLLM uses the `safetensors` format for saving models.
+> If the model doesn't support safetensors, make sure to pass
+> `serialisation="legacy"` to use the legacy PyTorch bin format.
+
+`langchain.llms.OpenLLM` has the capability to interact with a remote OpenLLM
+server. Given there is an OpenLLM server deployed elsewhere, you can connect to
+it by specifying its URL:
+
+```python
+from langchain.llms import OpenLLM
+
+llm = OpenLLM(server_url='http://44.23.123.1:3000', server_type='grpc')
+llm("What is the difference between a duck and a goose? And why there are so many Goose in Canada?")
+```
+
+To integrate a LangChain agent with BentoML, you can do the following:
+
+```python
+llm = OpenLLM(
+    model_name='flan-t5',
+    model_id='google/flan-t5-large',
+    embedded=False,
+    serialisation="legacy"
+)
+tools = load_tools(["serpapi", "llm-math"], llm=llm)
+agent = initialize_agent(
+    tools, llm, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION
+)
+svc = bentoml.Service("langchain-openllm", runners=[llm.runner])
+@svc.api(input=Text(), output=Text())
+def chat(input_text: str):
+    return agent.run(input_text)
+```
+
+> [!NOTE]
+> You can find more examples under the
+> [examples](https://github.com/bentoml/OpenLLM/tree/main/examples) folder.
+
+### Transformers Agents
+
+OpenLLM seamlessly integrates with
+[Transformers Agents](https://huggingface.co/docs/transformers/transformers_agents).
+
+> [!WARNING]
+> The Transformers Agent is still at an experimental stage. It is
+> recommended to install OpenLLM with `pip install -r nightly-requirements.txt`
+> to get the latest API update for HuggingFace agent.
+
+```python
+import transformers
+
+agent = transformers.HfAgent("http://localhost:3000/hf/agent")  # URL that runs the OpenLLM server
+
+agent.run("Is the following `text` positive or negative?", text="I don't like how this models is generate inputs")
+```
+
+> [!IMPORTANT]
+> Only `starcoder` is currently supported with Agent integration.
+> The example above was also run with four T4s on EC2 `g4dn.12xlarge`
+
+If you want to use OpenLLM client to ask questions to the running agent, you can
+also do so:
+
+```python
+import openllm
+
+client = openllm.client.HTTPClient("http://localhost:3000")
+
+client.ask_agent(
+    task="Is the following `text` positive or negative?",
+    text="What are you thinking about?",
+)
+```
+
+<!-- hatch-fancy-pypi-readme interim stop -->
+
+![Gif showing Agent integration](/.github/assets/agent.gif)
+
+<br/>
+
+<!-- hatch-fancy-pypi-readme meta start -->
+
+## 🚀 Deploying to Production
+
+There are several ways to deploy your LLMs:
+
+### 🐳 Docker container
+
+1. **Building a Bento**: With OpenLLM, you can easily build a Bento for a
+   specific model, like `dolly-v2`, using the `build` command:
+
+   ```bash
+   openllm build dolly-v2
+   ```
+
+   A
+   [Bento](https://docs.bentoml.com/en/latest/concepts/bento.html#what-is-a-bento),
+   in BentoML, is the unit of distribution. It packages your program's source
+   code, models, files, artefacts, and dependencies.
+
+2. **Containerize your Bento**
+
+   ```bash
+   bentoml containerize <name:version>
+   ```
+   This generates an OCI-compatible Docker image that can be deployed anywhere
+   Docker runs. For best scalability and reliability of your LLM service in
+   production, we recommend deploying with BentoCloud.
+
+### ☁️ BentoCloud
+
+Deploy OpenLLM with [BentoCloud](https://www.bentoml.com/bento-cloud/), the
+serverless cloud for shipping and scaling AI applications.
+
+1. **Create a BentoCloud account:** [sign up here](https://bentoml.com/cloud)
+   for early access
+
+2. **Log into your BentoCloud account:**
+
+   ```bash
+   bentoml cloud login --api-token <your-api-token> --endpoint <bento-cloud-endpoint>
+   ```
+
+> [!NOTE]
+> Replace `<your-api-token>` and `<bento-cloud-endpoint>` with your
+> specific API token and the BentoCloud endpoint respectively.
+
+3. **Building a Bento**: With OpenLLM, you can easily build a Bento for a
+   specific model, such as `dolly-v2`:
+
+   ```bash
+   openllm build dolly-v2
+   ```
+
+4. **Pushing a Bento**: Push your freshly-built Bento service to BentoCloud via
+   the `push` command:
+
+   ```bash
+   bentoml push <name:version>
+   ```
+
+5. **Deploying a Bento**: Deploy your LLMs to BentoCloud with a single
+   `bentoml deployment create` command following the
+   [deployment instructions](https://docs.bentoml.com/en/latest/reference/cli.html#bentoml-deployment-create).
+
+## 👥 Community
+
+Engage with like-minded individuals passionate about LLMs, AI, and more on our
+[Discord](https://l.bentoml.com/join-openllm-discord)!
+
+OpenLLM is actively maintained by the BentoML team. Feel free to reach out and
+join us in our pursuit to make LLMs more accessible and easy to use 👉
+[Join our Slack community!](https://l.bentoml.com/join-slack)
+
+## 🎁 Contributing
+
+We welcome contributions! If you're interested in enhancing OpenLLM's
+capabilities or have any questions, don't hesitate to reach out in our
+[discord channel](https://l.bentoml.com/join-openllm-discord).
+
+Check out our
+[Developer Guide](https://github.com/bentoml/OpenLLM/blob/main/DEVELOPMENT.md)
+if you wish to contribute to OpenLLM's codebase.
+
+## 🍇 Telemetry
+
+OpenLLM collects usage data to enhance user experience and improve the product.
+We only report OpenLLM's internal API calls and ensure maximum privacy by
+excluding sensitive information. We will never collect user code, model data, or
+stack traces. For usage tracking, check out the
+[code](https://github.com/bentoml/OpenLLM/blob/main/openllm-python/src/openllm/utils/analytics.py).
+
+You can opt out of usage tracking by using the `--do-not-track` CLI option:
+
+```bash
+openllm [command] --do-not-track
+```
+
+Or by setting the environment variable `OPENLLM_DO_NOT_TRACK=True`:
+
+```bash
+export OPENLLM_DO_NOT_TRACK=True
+```
+
+## 📔 Citation
+
+If you use OpenLLM in your research, we provide a [citation](./CITATION.cff) to
+use:
+
+```bibtex
+@software{Pham_OpenLLM_Operating_LLMs_2023,
+author = {Pham, Aaron and Yang, Chaoyu and Sheng, Sean and  Zhao, Shenyang and Lee, Sauyon and Jiang, Bo and Dong, Fog and Guan, Xipeng and Ming, Frost},
+license = {Apache-2.0},
+month = jun,
+title = {{OpenLLM: Operating LLMs in production}},
+url = {https://github.com/bentoml/OpenLLM},
+year = {2023}
+}
+```
+
+<!-- hatch-fancy-pypi-readme meta stop -->
--- a/openllm-python/pyproject.toml
+++ b/openllm-python/pyproject.toml
@@ -0,0 +1,207 @@
+# NOTE: PEP517 is managed via ./tools/dependencies.py
+[build-system]
+build-backend = "hatchling.build"
+requires = [
+    "hatchling==1.18.0",
+    "hatch-vcs==0.3.0",
+    "hatch-fancy-pypi-readme==23.1.0",
+    "hatch-mypyc==0.16.0",
+]
+
+[project]
+authors = [
+    {name = "Aaron Pham",email = "aarnphm@bentoml.com"},
+    {name = "BentoML Team",email = "contact@bentoml.com"},
+]
+classifiers = [
+    "Development Status :: 5 - Production/Stable",
+    "Environment :: GPU :: NVIDIA CUDA",
+    "Environment :: GPU :: NVIDIA CUDA :: 12",
+    "Environment :: GPU :: NVIDIA CUDA :: 11.8",
+    "Environment :: GPU :: NVIDIA CUDA :: 11.7",
+    "License :: OSI Approved :: Apache Software License",
+    "Topic :: Scientific/Engineering :: Artificial Intelligence",
+    "Topic :: Software Development :: Libraries",
+    "Operating System :: OS Independent",
+    "Intended Audience :: Developers",
+    "Intended Audience :: Science/Research",
+    "Intended Audience :: System Administrators",
+    "Typing :: Typed",
+    "Programming Language :: Python",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3 :: Only",
+    "Programming Language :: Python :: 3.8",
+    "Programming Language :: Python :: 3.9",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Programming Language :: Python :: Implementation :: CPython",
+    "Programming Language :: Python :: Implementation :: PyPy",
+]
+dependencies = [
+    "bentoml[grpc,io]>=1.0.25",
+    "transformers[torch,tokenizers,accelerate]>=4.29.0",
+    "safetensors",
+    "optimum",
+    "attrs>=23.1.0",
+    "cattrs>=23.1.0",
+    "orjson",
+    "inflection",
+    "tabulate[widechars]>=0.9.0",
+    "httpx",
+    "click>=8.1.3",
+    "typing_extensions",
+    "mypy_extensions",
+    "ghapi",
+    "cuda-python;platform_system!=\"Darwin\"",
+    "bitsandbytes<0.42",
+]
+description = "OpenLLM: Operating LLMs in production"
+dynamic = ["version", "readme"]
+keywords = [
+    "MLOps",
+    "AI",
+    "BentoML",
+    "Model Serving",
+    "Model Deployment",
+    "LLMOps",
+    "Falcon",
+    "Vicuna",
+    "Llama 2",
+    "Fine tuning",
+    "Serverless",
+    "Large Language Model",
+    "Generative AI",
+    "StableLM",
+    "Alpaca",
+    "PyTorch",
+    "Transformers",
+]
+license = "Apache-2.0"
+name = "openllm"
+requires-python = ">=3.8"
+[project.scripts]
+openllm = "openllm.cli.entrypoint:cli"
+openllm-build-base-container = "openllm.cli.extension.build_base_container:cli"
+openllm-dive-bentos = "openllm.cli.extension.dive_bentos:cli"
+openllm-get-containerfile = "openllm.cli.extension.get_containerfile:cli"
+openllm-get-prompt = "openllm.cli.extension.get_prompt:cli"
+openllm-list-bentos = "openllm.cli.extension.list_bentos:cli"
+openllm-list-models = "openllm.cli.extension.list_models:cli"
+openllm-playground = "openllm.cli.extension.playground:cli"
+
+[project.urls]
+Blog = "https://modelserving.com"
+Chat = "https://discord.gg/openllm"
+Documentation = "https://github.com/bentoml/openllm#readme"
+GitHub = "https://github.com/bentoml/OpenLLM"
+History = "https://github.com/bentoml/OpenLLM/blob/main/CHANGELOG.md"
+Homepage = "https://bentoml.com"
+Tracker = "https://github.com/bentoml/OpenLLM/issues"
+Twitter = "https://twitter.com/bentomlai"
+
+[project.optional-dependencies]
+agents = ["transformers[agents]>=4.30", "diffusers", "soundfile"]
+all = [
+    "openllm[agents]",
+    "openllm[baichuan]",
+    "openllm[chatglm]",
+    "openllm[falcon]",
+    "openllm[fine-tune]",
+    "openllm[flan-t5]",
+    "openllm[ggml]",
+    "openllm[gptq]",
+    "openllm[llama]",
+    "openllm[mpt]",
+    "openllm[openai]",
+    "openllm[opt]",
+    "openllm[playground]",
+    "openllm[starcoder]",
+    "openllm[vllm]",
+]
+baichuan = ["cpm-kernels", "sentencepiece"]
+chatglm = ["cpm-kernels", "sentencepiece"]
+falcon = ["einops", "xformers"]
+fine-tune = ["peft>=0.4.0", "bitsandbytes", "datasets", "accelerate", "trl"]
+flan-t5 = ["flax", "jax", "jaxlib", "tensorflow", "keras"]
+ggml = ["ctransformers"]
+gptq = ["auto-gptq[triton]"]
+llama = ["fairscale", "sentencepiece"]
+mpt = ["triton", "einops"]
+openai = ["openai", "tiktoken"]
+opt = ["flax", "jax", "jaxlib", "tensorflow", "keras"]
+playground = ["jupyter", "notebook", "ipython", "jupytext", "nbformat"]
+starcoder = ["bitsandbytes"]
+vllm = ["vllm", "ray"]
+
+[tool.hatch.version]
+fallback-version = "0.0.0"
+source = "vcs"
+[tool.hatch.build.hooks.vcs]
+version-file = "src/openllm/_version.py"
+[tool.hatch.version.raw-options]
+git_describe_command = [
+    "git",
+    "describe",
+    "--dirty",
+    "--tags",
+    "--long",
+    "--first-parent",
+]
+local_scheme = "no-local-version"
+root = ".."
+[tool.hatch.metadata]
+allow-direct-references = true
+[tool.hatch.build.targets.wheel]
+only-include = ["src/openllm"]
+sources = ["src"]
+dev-mode-dirs = ["."]
+[tool.hatch.build.targets.sdist]
+exclude = ["/.git_archival.txt"]
+[tool.hatch.build.targets.wheel.hooks.mypyc]
+dependencies = [
+    "hatch-mypyc==0.16.0",
+    "mypy==1.4.1",
+    # avoid https://github.com/pallets/click/issues/2558
+    "click==8.1.3",
+    "bentoml==1.1.1",
+    "transformers>=4.31.0",
+    "pandas-stubs",
+    "types-psutil",
+    "types-tabulate",
+    "types-PyYAML",
+    "types-protobuf",
+]
+enable-by-default = false
+include = [
+    "src/openllm/bundle",
+    "src/openllm/models/__init__.py",
+    "src/openllm/models/auto/__init__.py",
+    "src/openllm/utils/__init__.py",
+    "src/openllm/utils/codegen.py",
+    "src/openllm/__init__.py",
+    "src/openllm/_prompt.py",
+    "src/openllm/_schema.py",
+    "src/openllm/_quantisation.py",
+    "src/openllm/_generation.py",
+    "src/openllm/_strategies.py",
+    "src/openllm/exceptions.py",
+    "src/openllm/testing.py",
+]
+# NOTE: This is consistent with pyproject.toml
+mypy-args = [
+    "--strict",
+    # this is because the transitive libraries don't have types
+    "--allow-subclassing-any",
+    "--follow-imports=skip",
+    "--check-untyped-defs",
+    "--ignore-missing-imports",
+    "--no-warn-return-any",
+    "--warn-unreachable",
+    "--no-warn-no-return",
+    "--no-warn-unused-ignores",
+    "--exclude='/src\\/openllm\\/playground\\/**'",
+    "--exclude='/src\\/openllm\\/_typing_compat\\.py$'",
+]
+options = { verbose = true, strip_asserts = true, debug_level = "2", opt_level = "3", include_runtime_files = true }
+require-runtime-dependencies = true
--- a/openllm-python/src/openllm/__init__.py
+++ b/openllm-python/src/openllm/__init__.py
@@ -0,0 +1,161 @@
+"""OpenLLM.
+
+An open platform for operating large language models in production. Fine-tune, serve,
+deploy, and monitor any LLMs with ease.
+
+* Built-in support for StableLM, Llama 2, Dolly, Flan-T5, Vicuna
+* Option to bring your own fine-tuned LLMs
+* Online Serving with HTTP, gRPC, SSE(coming soon) or custom API
+* Native integration with BentoML and LangChain for custom LLM apps
+"""
+from __future__ import annotations
+import logging as _logging, os as _os, typing as _t, warnings as _warnings
+from pathlib import Path as _Path
+from . import exceptions as exceptions, utils as utils
+
+if utils.DEBUG:
+  utils.set_debug_mode(True)
+  utils.set_quiet_mode(False)
+  _logging.basicConfig(level=_logging.NOTSET)
+else:
+  # configuration for bitsandbytes before import
+  _os.environ["BITSANDBYTES_NOWELCOME"] = _os.environ.get("BITSANDBYTES_NOWELCOME", "1")
+  # NOTE: The following warnings from bitsandbytes, and probably not that important for users to see when DEBUG is False
+  _warnings.filterwarnings("ignore", message="MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization")
+  _warnings.filterwarnings("ignore", message="MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization")
+  _warnings.filterwarnings("ignore", message="The installed version of bitsandbytes was compiled without GPU support.")
+  # NOTE: ignore the following warning from ghapi as it is not important for users
+  _warnings.filterwarnings("ignore", message="Neither GITHUB_TOKEN nor GITHUB_JWT_TOKEN found: running as unauthenticated")
+
+_import_structure: dict[str, list[str]] = {
+    "exceptions": [], "models": [], "client": [], "bundle": [], "playground": [], "testing": [], "utils": ["infer_auto_class"], "serialisation": ["ggml", "transformers"], "cli._sdk": ["start", "start_grpc", "build", "import_model", "list_models"], "_llm": ["LLM", "Runner", "LLMRunner", "LLMRunnable", "LLMEmbeddings"], "_configuration": ["LLMConfig", "GenerationConfig", "SamplingParams"], "_generation": ["StopSequenceCriteria", "StopOnTokens", "LogitsProcessorList", "StoppingCriteriaList", "prepare_logits_processor"], "_quantisation": ["infer_quantisation_config"], "_schema": ["GenerationInput", "GenerationOutput", "MetadataOutput", "EmbeddingsOutput", "unmarshal_vllm_outputs", "HfAgentInput"],
+    "models.auto": ["AutoConfig", "CONFIG_MAPPING", "MODEL_MAPPING_NAMES", "MODEL_FLAX_MAPPING_NAMES", "MODEL_TF_MAPPING_NAMES", "MODEL_VLLM_MAPPING_NAMES"], "models.chatglm": ["ChatGLMConfig"], "models.baichuan": ["BaichuanConfig"], "models.dolly_v2": ["DollyV2Config"], "models.falcon": ["FalconConfig"], "models.flan_t5": ["FlanT5Config"], "models.gpt_neox": ["GPTNeoXConfig"], "models.llama": ["LlamaConfig"], "models.mpt": ["MPTConfig"], "models.opt": ["OPTConfig"], "models.stablelm": ["StableLMConfig"], "models.starcoder": ["StarCoderConfig"]
+}
+COMPILED = _Path(__file__).suffix in (".pyd", ".so")
+
+if _t.TYPE_CHECKING:
+  from . import bundle as bundle, cli as cli, client as client, models as models, playground as playground, serialisation as serialisation, testing as testing
+  from ._configuration import GenerationConfig as GenerationConfig, LLMConfig as LLMConfig, SamplingParams as SamplingParams
+  from ._generation import LogitsProcessorList as LogitsProcessorList, StopOnTokens as StopOnTokens, StoppingCriteriaList as StoppingCriteriaList, StopSequenceCriteria as StopSequenceCriteria, prepare_logits_processor as prepare_logits_processor
+  from ._llm import LLM as LLM, LLMEmbeddings as LLMEmbeddings, LLMRunnable as LLMRunnable, LLMRunner as LLMRunner, Runner as Runner
+  from ._quantisation import infer_quantisation_config as infer_quantisation_config
+  from ._schema import EmbeddingsOutput as EmbeddingsOutput, GenerationInput as GenerationInput, GenerationOutput as GenerationOutput, HfAgentInput as HfAgentInput, MetadataOutput as MetadataOutput, unmarshal_vllm_outputs as unmarshal_vllm_outputs
+  from .cli._sdk import build as build, import_model as import_model, list_models as list_models, start as start, start_grpc as start_grpc
+  from .models.auto import CONFIG_MAPPING as CONFIG_MAPPING, MODEL_FLAX_MAPPING_NAMES as MODEL_FLAX_MAPPING_NAMES, MODEL_MAPPING_NAMES as MODEL_MAPPING_NAMES, MODEL_TF_MAPPING_NAMES as MODEL_TF_MAPPING_NAMES, MODEL_VLLM_MAPPING_NAMES as MODEL_VLLM_MAPPING_NAMES, AutoConfig as AutoConfig
+  from .models.baichuan import BaichuanConfig as BaichuanConfig
+  from .models.chatglm import ChatGLMConfig as ChatGLMConfig
+  from .models.dolly_v2 import DollyV2Config as DollyV2Config
+  from .models.falcon import FalconConfig as FalconConfig
+  from .models.flan_t5 import FlanT5Config as FlanT5Config
+  from .models.gpt_neox import GPTNeoXConfig as GPTNeoXConfig
+  from .models.llama import LlamaConfig as LlamaConfig
+  from .models.mpt import MPTConfig as MPTConfig
+  from .models.opt import OPTConfig as OPTConfig
+  from .models.stablelm import StableLMConfig as StableLMConfig
+  from .models.starcoder import StarCoderConfig as StarCoderConfig
+  from .serialisation import ggml as ggml, transformers as transformers
+  from openllm.utils import infer_auto_class as infer_auto_class
+
+try:
+  if not (utils.is_torch_available() and utils.is_cpm_kernels_available()): raise exceptions.MissingDependencyError
+except exceptions.MissingDependencyError:
+  _import_structure["utils.dummy_pt_objects"] = ["ChatGLM", "Baichuan"]
+else:
+  _import_structure["models.chatglm"].extend(["ChatGLM"])
+  _import_structure["models.baichuan"].extend(["Baichuan"])
+  if _t.TYPE_CHECKING:
+    from .models.baichuan import Baichuan as Baichuan
+    from .models.chatglm import ChatGLM as ChatGLM
+try:
+  if not (utils.is_torch_available() and utils.is_triton_available()): raise exceptions.MissingDependencyError
+except exceptions.MissingDependencyError:
+  if "utils.dummy_pt_objects" in _import_structure: _import_structure["utils.dummy_pt_objects"].extend(["MPT"])
+  else: _import_structure["utils.dummy_pt_objects"] = ["MPT"]
+else:
+  _import_structure["models.mpt"].extend(["MPT"])
+  if _t.TYPE_CHECKING: from .models.mpt import MPT as MPT
+try:
+  if not (utils.is_torch_available() and utils.is_einops_available()): raise exceptions.MissingDependencyError
+except exceptions.MissingDependencyError:
+  if "utils.dummy_pt_objects" in _import_structure: _import_structure["utils.dummy_pt_objects"].extend(["Falcon"])
+  else: _import_structure["utils.dummy_pt_objects"] = ["Falcon"]
+else:
+  _import_structure["models.falcon"].extend(["Falcon"])
+  if _t.TYPE_CHECKING: from .models.falcon import Falcon as Falcon
+
+try:
+  if not utils.is_torch_available(): raise exceptions.MissingDependencyError
+except exceptions.MissingDependencyError:
+  _import_structure["utils.dummy_pt_objects"] = [name for name in dir(utils.dummy_pt_objects) if not name.startswith("_") and name not in ("ChatGLM", "Baichuan", "MPT", "Falcon", "annotations")]
+else:
+  _import_structure["models.flan_t5"].extend(["FlanT5"])
+  _import_structure["models.dolly_v2"].extend(["DollyV2"])
+  _import_structure["models.starcoder"].extend(["StarCoder"])
+  _import_structure["models.stablelm"].extend(["StableLM"])
+  _import_structure["models.opt"].extend(["OPT"])
+  _import_structure["models.gpt_neox"].extend(["GPTNeoX"])
+  _import_structure["models.llama"].extend(["Llama"])
+  _import_structure["models.auto"].extend(["AutoLLM", "MODEL_MAPPING"])
+  if _t.TYPE_CHECKING:
+    from .models.auto import MODEL_MAPPING as MODEL_MAPPING, AutoLLM as AutoLLM
+    from .models.dolly_v2 import DollyV2 as DollyV2
+    from .models.flan_t5 import FlanT5 as FlanT5
+    from .models.gpt_neox import GPTNeoX as GPTNeoX
+    from .models.llama import Llama as Llama
+    from .models.opt import OPT as OPT
+    from .models.stablelm import StableLM as StableLM
+    from .models.starcoder import StarCoder as StarCoder
+try:
+  if not utils.is_vllm_available(): raise exceptions.MissingDependencyError
+except exceptions.MissingDependencyError:
+  _import_structure["utils.dummy_vllm_objects"] = [name for name in dir(utils.dummy_vllm_objects) if not name.startswith("_") and name not in ("annotations",)]
+else:
+  _import_structure["models.baichuan"].extend(["VLLMBaichuan"])
+  _import_structure["models.llama"].extend(["VLLMLlama"])
+  _import_structure["models.opt"].extend(["VLLMOPT"])
+  _import_structure["models.dolly_v2"].extend(["VLLMDollyV2"])
+  _import_structure["models.gpt_neox"].extend(["VLLMGPTNeoX"])
+  _import_structure["models.mpt"].extend(["VLLMMPT"])
+  _import_structure["models.stablelm"].extend(["VLLMStableLM"])
+  _import_structure["models.starcoder"].extend(["VLLMStarCoder"])
+  _import_structure["models.auto"].extend(["AutoVLLM", "MODEL_VLLM_MAPPING"])
+  if _t.TYPE_CHECKING:
+    from .models.auto import MODEL_VLLM_MAPPING as MODEL_VLLM_MAPPING, AutoVLLM as AutoVLLM
+    from .models.baichuan import VLLMBaichuan as VLLMBaichuan
+    from .models.dolly_v2 import VLLMDollyV2 as VLLMDollyV2
+    from .models.gpt_neox import VLLMGPTNeoX as VLLMGPTNeoX
+    from .models.llama import VLLMLlama as VLLMLlama
+    from .models.mpt import VLLMMPT as VLLMMPT
+    from .models.opt import VLLMOPT as VLLMOPT
+    from .models.stablelm import VLLMStableLM as VLLMStableLM
+    from .models.starcoder import VLLMStarCoder as VLLMStarCoder
+try:
+  if not utils.is_flax_available(): raise exceptions.MissingDependencyError
+except exceptions.MissingDependencyError:
+  _import_structure["utils.dummy_flax_objects"] = [name for name in dir(utils.dummy_flax_objects) if not name.startswith("_") and name not in ("annotations",)]
+else:
+  _import_structure["models.flan_t5"].extend(["FlaxFlanT5"])
+  _import_structure["models.opt"].extend(["FlaxOPT"])
+  _import_structure["models.auto"].extend(["AutoFlaxLLM", "MODEL_FLAX_MAPPING"])
+  if _t.TYPE_CHECKING:
+    from .models.auto import MODEL_FLAX_MAPPING as MODEL_FLAX_MAPPING, AutoFlaxLLM as AutoFlaxLLM
+    from .models.flan_t5 import FlaxFlanT5 as FlaxFlanT5
+    from .models.opt import FlaxOPT as FlaxOPT
+try:
+  if not utils.is_tf_available(): raise exceptions.MissingDependencyError
+except exceptions.MissingDependencyError:
+  _import_structure["utils.dummy_tf_objects"] = [name for name in dir(utils.dummy_tf_objects) if not name.startswith("_") and name not in ("annotations",)]
+else:
+  _import_structure["models.flan_t5"].extend(["TFFlanT5"])
+  _import_structure["models.opt"].extend(["TFOPT"])
+  _import_structure["models.auto"].extend(["AutoTFLLM", "MODEL_TF_MAPPING"])
+  if _t.TYPE_CHECKING:
+    from .models.auto import MODEL_TF_MAPPING as MODEL_TF_MAPPING, AutoTFLLM as AutoTFLLM
+    from .models.flan_t5 import TFFlanT5 as TFFlanT5
+    from .models.opt import TFOPT as TFOPT
+
+# NOTE: update this to sys.modules[__name__] once mypy_extensions can recognize __spec__
+__lazy = utils.LazyModule(__name__, _os.path.abspath("__file__"), _import_structure, extra_objects={"COMPILED": COMPILED})
+__all__ = __lazy.__all__
+__dir__ = __lazy.__dir__
+__getattr__ = __lazy.__getattr__
--- a/openllm-python/src/openllm/__main__.py
+++ b/openllm-python/src/openllm/__main__.py
@@ -0,0 +1,13 @@
+"""CLI entrypoint for OpenLLM.
+
+Usage:
+    openllm --help
+
+To start any OpenLLM model:
+    openllm start <model_name> --options ...
+"""
+from __future__ import annotations
+
+if __name__ == "__main__":
+  # The CLI is imported inside the guard, so it is only loaded when this module is executed as a script.
+  from openllm.cli.entrypoint import cli
+  cli()
--- a/openllm-python/src/openllm/_configuration.py
+++ b/openllm-python/src/openllm/_configuration.py
--- a/openllm-python/src/openllm/_generation.py
+++ b/openllm-python/src/openllm/_generation.py
@@ -0,0 +1,22 @@
+# mypy: disable-error-code="misc"
+from __future__ import annotations
+import typing as t, transformers
+if t.TYPE_CHECKING: import torch, openllm
+
+# Aliases of the transformers container classes, so they can be referenced from this module.
+LogitsProcessorList = transformers.LogitsProcessorList
+StoppingCriteriaList = transformers.StoppingCriteriaList
+class StopSequenceCriteria(transformers.StoppingCriteria):
+  def __init__(self, stop_sequences: str | list[str], tokenizer: transformers.PreTrainedTokenizer | transformers.PreTrainedTokenizerBase | transformers.PreTrainedTokenizerFast):
+    if isinstance(stop_sequences, str): stop_sequences = [stop_sequences]
+    self.stop_sequences, self.tokenizer = stop_sequences, tokenizer
+  def __call__(self, input_ids: torch.Tensor, scores: t.Any, **_: t.Any) -> bool: return any(self.tokenizer.decode(input_ids.tolist()[0]).endswith(stop_sequence) for stop_sequence in self.stop_sequences)
+class StopOnTokens(transformers.StoppingCriteria):
+  def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **_: t.Any) -> bool: return t.cast(int, input_ids[0][-1]) in {50278, 50279, 50277, 1, 0}
+def prepare_logits_processor(config: openllm.LLMConfig) -> transformers.LogitsProcessorList:
+  generation_config = config.generation_config
+  logits_processor = transformers.LogitsProcessorList()
+  if generation_config["temperature"] >= 1e-5 and generation_config["temperature"] != 1.0: logits_processor.append(transformers.TemperatureLogitsWarper(generation_config["temperature"]))
+  if generation_config["repetition_penalty"] > 1.0: logits_processor.append(transformers.RepetitionPenaltyLogitsProcessor(generation_config["repetition_penalty"]))
+  if 1e-8 <= generation_config["top_p"]: logits_processor.append(transformers.TopPLogitsWarper(generation_config["top_p"]))
+  if generation_config["top_k"] > 0: logits_processor.append(transformers.TopKLogitsWarper(generation_config["top_k"]))
+  return logits_processor
--- a/openllm-python/src/openllm/_llm.py
+++ b/openllm-python/src/openllm/_llm.py
--- a/openllm-python/src/openllm/_prompt.py
+++ b/openllm-python/src/openllm/_prompt.py
@@ -0,0 +1,23 @@
+from __future__ import annotations
+import string, typing as t
+class PromptFormatter(string.Formatter):
+  """This PromptFormatter is largely based on langchain's implementation."""
+  def vformat(self, format_string: str, args: t.Sequence[t.Any], kwargs: t.Mapping[str, t.Any]) -> t.Any:
+    if len(args) > 0: raise ValueError("Positional arguments are not supported")
+    return super().vformat(format_string, args, kwargs)
+  def check_unused_args(self, used_args: set[int | str], args: t.Sequence[t.Any], kwargs: t.Mapping[str, t.Any]) -> None:
+    extras = set(kwargs).difference(used_args)
+    if extras: raise KeyError(f"Extra params passed: {extras}")
+  def extract_template_variables(self, template: str) -> t.Sequence[str]:
+    return [field[1] for field in self.parse(template) if field[1] is not None]
+
+default_formatter = PromptFormatter()
+def process_prompt(prompt: str, template: str | None = None, use_prompt_template: bool = True, **attrs: t.Any) -> str:
+  # Currently, all default prompt will always have `instruction` key.
+  if not use_prompt_template: return prompt
+  elif template is None: raise ValueError("'template' can't be None while 'use_prompt_template=False'")
+  template_variables = default_formatter.extract_template_variables(template)
+  prompt_variables = {k: v for k, v in attrs.items() if k in template_variables}
+  if "instruction" in prompt_variables: raise RuntimeError("'instruction' should be passed as the first argument instead of kwargs when 'use_prompt_template=True'")
+  try: return template.format(instruction=prompt, **prompt_variables)
+  except KeyError as e: raise RuntimeError(f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. Use 'use_prompt_template=False' to disable the default prompt template.") from None
--- a/openllm-python/src/openllm/_quantisation.py
+++ b/openllm-python/src/openllm/_quantisation.py
@@ -0,0 +1,58 @@
+# mypy: disable-error-code="name-defined"
+from __future__ import annotations
+import logging, sys, typing as t
+from .utils import LazyLoader, is_autogptq_available, is_bitsandbytes_available, is_transformers_supports_kbit, pkg
+if sys.version_info[:2] >= (3, 11): from typing import overload
+else: from typing_extensions import overload
+if t.TYPE_CHECKING:
+  from ._llm import LLM
+  from ._typing_compat import DictStrAny
+
+autogptq, torch, transformers = LazyLoader("autogptq", globals(), "auto_gptq"), LazyLoader("torch", globals(), "torch"), LazyLoader("transformers", globals(), "transformers")
+
+logger = logging.getLogger(__name__)
+
+QuantiseMode = t.Literal["int8", "int4", "gptq"]
+
+@overload
+def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: t.Literal["int8", "int4"], **attrs: t.Any) -> tuple[transformers.BitsAndBytesConfig, DictStrAny]: ...
+@overload
+def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: t.Literal["gptq"], **attrs: t.Any) -> tuple[autogptq.BaseQuantizeConfig, DictStrAny]: ...
+def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: QuantiseMode, **attrs: t.Any) -> tuple[transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig, DictStrAny]:
+  # 8 bit configuration
+  int8_threshold = attrs.pop("llm_int8_threshhold", 6.0)
+  int8_enable_fp32_cpu_offload = attrs.pop("llm_int8_enable_fp32_cpu_offload", False)
+  int8_skip_modules: list[str] | None = attrs.pop("llm_int8_skip_modules", None)
+  int8_has_fp16_weight = attrs.pop("llm_int8_has_fp16_weight", False)
+
+  autogptq_attrs: DictStrAny = {"bits": attrs.pop("gptq_bits", 4), "group_size": attrs.pop("gptq_group_size", -1), "damp_percent": attrs.pop("gptq_damp_percent", 0.01), "desc_act": attrs.pop("gptq_desc_act", True), "sym": attrs.pop("gptq_sym", True), "true_sequential": attrs.pop("gptq_true_sequential", True),}
+
+  def create_int8_config(int8_skip_modules: list[str] | None) -> transformers.BitsAndBytesConfig:
+    if int8_skip_modules is None: int8_skip_modules = []
+    if "lm_head" not in int8_skip_modules and cls.config_class.__openllm_model_type__ == "causal_lm":
+      logger.debug("Skipping 'lm_head' for quantization for %s", cls.__name__)
+      int8_skip_modules.append("lm_head")
+    return transformers.BitsAndBytesConfig(load_in_8bit=True, llm_int8_enable_fp32_cpu_offload=int8_enable_fp32_cpu_offload, llm_int8_threshhold=int8_threshold, llm_int8_skip_modules=int8_skip_modules, llm_int8_has_fp16_weight=int8_has_fp16_weight,)
+
+  # 4 bit configuration
+  int4_compute_dtype = attrs.pop("bnb_4bit_compute_dtype", torch.bfloat16)
+  int4_quant_type = attrs.pop("bnb_4bit_quant_type", "nf4")
+  int4_use_double_quant = attrs.pop("bnb_4bit_use_double_quant", True)
+
+  # NOTE: Quantization setup
+  # quantize is a openllm.LLM feature, where we can quantize the model
+  # with bitsandbytes or quantization aware training.
+  if not is_bitsandbytes_available(): raise RuntimeError("Quantization requires bitsandbytes to be installed. Make sure to install OpenLLM with 'pip install \"openllm[fine-tune]\"'")
+  if quantise == "int8": quantisation_config = create_int8_config(int8_skip_modules)
+  elif quantise == "int4":
+    if is_transformers_supports_kbit(): quantisation_config = transformers.BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=int4_compute_dtype, bnb_4bit_quant_type=int4_quant_type, bnb_4bit_use_double_quant=int4_use_double_quant)
+    else:
+      logger.warning("'quantize' is set to int4, while the current transformers version %s does not support k-bit quantization. k-bit quantization is supported since transformers 4.30, therefore make sure to install the latest version of transformers either via PyPI or from git source: 'pip install git+https://github.com/huggingface/transformers'. Fallback to int8 quantisation.", pkg.pkg_version_info("transformers"))
+      quantisation_config = create_int8_config(int8_skip_modules)
+  elif quantise == "gptq":
+    if not is_autogptq_available():
+      logger.warning("'quantize=\"gptq\"' requires 'auto-gptq' to be installed (not available with local environment). Make sure to have 'auto-gptq' available locally: 'pip install \"openllm[gptq]\"'. OpenLLM will fallback to int8 with bitsandbytes.")
+      quantisation_config = create_int8_config(int8_skip_modules)
+    else: quantisation_config = autogptq.BaseQuantizeConfig(**autogptq_attrs)
+  else: raise ValueError(f"'quantize' must be one of ['int8', 'int4', 'gptq'], got {quantise} instead.")
+  return quantisation_config, attrs
--- a/openllm-python/src/openllm/_schema.py
+++ b/openllm-python/src/openllm/_schema.py
@@ -0,0 +1,54 @@
+"""Schema definition for OpenLLM. This can be used for client interaction."""
+from __future__ import annotations
+import functools, typing as t
+import attr, inflection, openllm
+from ._configuration import GenerationConfig, LLMConfig
+from .utils import bentoml_cattr
+if t.TYPE_CHECKING: import vllm
+
+@attr.frozen(slots=True)
+class GenerationInput:
+  """Input payload of a generation request: a prompt, an LLM configuration and an optional adapter name."""
+  prompt: str
+  llm_config: LLMConfig
+  adapter_name: str | None = attr.field(default=None)
+
+  def model_dump(self) -> dict[str, t.Any]:
+    """Return this input as a plain dict, with ``llm_config`` dumped through ``model_dump(flatten=True)``."""
+    flattened_config = self.llm_config.model_dump(flatten=True)
+    return {"prompt": self.prompt, "llm_config": flattened_config, "adapter_name": self.adapter_name}
+
+  @staticmethod
+  def convert_llm_config(data: dict[str, t.Any] | LLMConfig, cls: type[LLMConfig] | None = None) -> LLMConfig:
+    """Return ``data`` unchanged when it is already an ``LLMConfig``, otherwise build ``cls(**data)``.
+
+    Raises:
+      ValueError: If ``data`` is not an ``LLMConfig`` and no ``cls`` was given.
+    """
+    if isinstance(data, LLMConfig):
+      return data
+    if cls is None:
+      raise ValueError("'cls' must pass if given data is a dictionary.")
+    return cls(**data)
+
+  @classmethod
+  def for_model(cls, model_name: str, **attrs: t.Any) -> type[GenerationInput]:
+    """Create the input class for ``model_name`` from the config returned by ``openllm.AutoConfig.for_model``."""
+    model_config = openllm.AutoConfig.for_model(model_name, **attrs)
+    return cls.from_llm_config(model_config)
+
+  @classmethod
+  def from_llm_config(cls, llm_config: openllm.LLMConfig) -> type[GenerationInput]:
+    """Generate an attrs class named ``<CamelizedModelName>GenerationInput`` whose ``llm_config`` defaults to ``llm_config``.
+
+    The generated ``llm_config`` field converts incoming values with ``convert_llm_config``,
+    bound to the class of the given ``llm_config``.
+    """
+    config_cls = llm_config.__class__
+    class_name = inflection.camelize(llm_config["model_name"]) + "GenerationInput"
+    fields = {
+      "prompt": attr.field(type=str),
+      "llm_config": attr.field(type=config_cls, default=llm_config, converter=functools.partial(cls.convert_llm_config, cls=config_cls)),
+      "adapter_name": attr.field(default=None, type=str),
+    }
+    return attr.make_class(class_name, attrs=fields)
+@attr.frozen(slots=True)
+class GenerationOutput:
+  """Result of a generation request: the responses plus the configuration dict that was used."""
+  responses: t.List[t.Any]
+  configuration: t.Dict[str, t.Any]
+
+  @property
+  def marshaled_config(self) -> GenerationConfig:
+    """``configuration`` structured into a ``GenerationConfig`` via ``bentoml_cattr``."""
+    return bentoml_cattr.structure(self.configuration, GenerationConfig)
+
+  @property
+  def unmarshaled(self) -> dict[str, t.Any]:
+    """This object unstructured via ``bentoml_cattr``."""
+    return bentoml_cattr.unstructure(self)
+
+  def __getitem__(self, key: str) -> t.Any:
+    """Look ``key`` up as an attribute first, then as a key of ``configuration``.
+
+    Raises:
+      KeyError: If ``key`` is neither an attribute nor a configuration key.
+    """
+    if hasattr(self, key):
+      return getattr(self, key)
+    if key not in self.configuration:
+      raise KeyError(key)
+    return self.configuration[key]
+@attr.frozen(slots=True)
+class MetadataOutput:
+  """Immutable record describing a served model, as returned by the service's metadata endpoint."""
+  # Identifier of the served model.
+  model_id: str
+  # Timeout value taken from the LLM configuration (unit not visible here — TODO confirm).
+  timeout: int
+  # Model name taken from the LLM configuration.
+  model_name: str
+  # Framework/runtime identifier (the endpoint sample uses "pt").
+  framework: str
+  # The LLM configuration serialised as a JSON string.
+  configuration: str
+  # Whether the runner exposes an embeddings method.
+  supports_embeddings: bool
+  # Whether the runner can serve HuggingFace agent requests.
+  supports_hf_agent: bool
+@attr.frozen(slots=True)
+class EmbeddingsOutput:
+  """Immutable result of an embeddings request."""
+  # One vector of floats per embedded input — presumably in input order; TODO confirm.
+  embeddings: t.List[t.List[float]]
+  # Token count reported together with the embeddings.
+  num_tokens: int
+def unmarshal_vllm_outputs(request_output: vllm.RequestOutput) -> dict[str, t.Any]:
+  """Convert a vLLM request output into a plain dictionary of its fields, including one dict per completion."""
+  completions = []
+  for it in request_output.outputs:
+    completions.append({
+      "index": it.index,
+      "text": it.text,
+      "token_ids": it.token_ids,
+      "cumulative_logprob": it.cumulative_logprob,
+      "logprobs": it.logprobs,
+      "finish_reason": it.finish_reason,
+    })
+  return {
+    "request_id": request_output.request_id,
+    "prompt": request_output.prompt,
+    "finished": request_output.finished,
+    "prompt_token_ids": request_output.prompt_token_ids,
+    "outputs": completions,
+  }
+@attr.define
+class HfAgentInput:
+  """Request body accepted by the HuggingFace agent endpoint."""
+  # The text passed to the runner as the prompt.
+  inputs: str
+  # Extra generation parameters; the service pops "stop" from this dict before forwarding the rest.
+  parameters: t.Dict[str, t.Any]
--- a/openllm-python/src/openllm/_service.py
+++ b/openllm-python/src/openllm/_service.py
@@ -0,0 +1,61 @@
+# mypy: disable-error-code="arg-type,misc"
+"""The service definition for running any LLMService.
+
+Lines carrying a '# openllm: ...' comment must not be modified, as they are managed internally by OpenLLM.
+Codegen can be found under 'openllm.utils.codegen'
+"""
+from __future__ import annotations
+import os, warnings, orjson, bentoml, openllm, typing as t
+from starlette.applications import Starlette
+from starlette.responses import JSONResponse
+from starlette.routing import Route
+if t.TYPE_CHECKING:
+  from starlette.requests import Request
+  from starlette.responses import Response
+# The following warnings come from bitsandbytes and are probably not important for users to see, so they are silenced.
+warnings.filterwarnings("ignore", message="MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization")
+warnings.filterwarnings("ignore", message="MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization")
+warnings.filterwarnings("ignore", message="The installed version of bitsandbytes was compiled without GPU support.")
+# Both values are read from the environment first; the '{__...__}' defaults are placeholders on
+# the managed '# openllm:' lines (presumably substituted by OpenLLM's codegen — see module docstring).
+model = os.environ.get("OPENLLM_MODEL", "{__model_name__}")  # openllm: model name
+adapter_map = os.environ.get("OPENLLM_ADAPTER_MAP", """{__model_adapter_map__}""")  # openllm: model adapter map
+llm_config = openllm.AutoConfig.for_model(model)
+# The adapter map is a JSON string, hence the orjson.loads before it is handed to the runner.
+runner = openllm.Runner(model, llm_config=llm_config, ensure_available=False, adapter_map=orjson.loads(adapter_map))
+svc = bentoml.Service(name=f"llm-{llm_config['start_name']}-service", runners=[runner])
+
+@svc.api(route="/v1/generate", input=bentoml.io.JSON.from_sample({"prompt": "", "llm_config": llm_config.model_dump(flatten=True)}), output=bentoml.io.JSON.from_sample({"responses": [], "configuration": llm_config.model_dump(flatten=True)}))
+async def generate_v1(input_dict: dict[str, t.Any]) -> openllm.GenerationOutput:
+  """Run generation for a JSON request and return the responses with the configuration that was used.
+
+  The request dict is validated through the ``GenerationInput`` class generated for the service's
+  ``llm_config``; the dumped configuration is forwarded to the runner as keyword arguments.
+  """
+  request = openllm.GenerationInput.from_llm_config(llm_config)(**input_dict)
+  config = request.llm_config.model_dump()
+  # 'adapter_name' goes first so that a same-named key in the dumped config takes precedence.
+  run_kwargs: dict[str, t.Any] = {"adapter_name": request.adapter_name}
+  run_kwargs.update(config)
+  responses = await runner.generate.async_run(request.prompt, **run_kwargs)
+  return openllm.GenerationOutput(responses=responses, configuration=config)
+
+@svc.api(route="/v1/metadata", input=bentoml.io.Text(), output=bentoml.io.JSON.from_sample({"model_id": runner.llm.model_id, "timeout": 3600, "model_name": llm_config["model_name"], "framework": "pt", "configuration": "", "supports_embeddings": runner.supports_embeddings, "supports_hf_agent": runner.supports_hf_agent}))
+def metadata_v1(_: str) -> openllm.MetadataOutput:
+  """Return metadata about the served model; the text input is ignored."""
+  return openllm.MetadataOutput(
+    timeout=llm_config["timeout"],
+    model_name=llm_config["model_name"],
+    framework=llm_config["env"]["framework_value"],
+    model_id=runner.llm.model_id,
+    configuration=llm_config.model_dump_json().decode(),
+    supports_embeddings=runner.supports_embeddings,
+    supports_hf_agent=runner.supports_hf_agent,
+  )
+
+# The embeddings endpoint is only registered when the runner reports embeddings support.
+if runner.supports_embeddings:
+  # NOTE(review): the output sample shows "embeddings" as a flat list of floats, while
+  # EmbeddingsOutput.embeddings is annotated as a list of float lists — confirm which shape is intended.
+  @svc.api(route="/v1/embeddings", input=bentoml.io.JSON.from_sample(["Hey Jude, welcome to the jungle!", "What is the meaning of life?"]), output=bentoml.io.JSON.from_sample({"embeddings": [0.007917795330286026, -0.014421648345887661, 0.00481307040899992, 0.007331526838243008, -0.0066398633643984795, 0.00945580005645752, 0.0087016262114048, -0.010709521360695362, 0.012635177001357079, 0.010541186667978764, -0.00730888033285737, -0.001783102168701589, 0.02339819073677063, -0.010825827717781067, -0.015888236463069916, 0.01876218430697918, 0.0076906150206923485, 0.0009032754460349679, -0.010024012066423893, 0.01090280432254076, -0.008668390102684498, 0.02070549875497818, 0.0014594447566196322, -0.018775740638375282, -0.014814382418990135, 0.01796768605709076], "num_tokens": 20}))
+  async def embeddings_v1(phrases: list[str]) -> openllm.EmbeddingsOutput:
+    """Embed the given phrases with the runner and wrap the result in an ``EmbeddingsOutput``."""
+    responses = await runner.embeddings.async_run(phrases)
+    return openllm.EmbeddingsOutput(embeddings=responses["embeddings"], num_tokens=responses["num_tokens"])
+
+# The HuggingFace agent endpoint is only mounted when both the runner and the installed transformers support it.
+if runner.supports_hf_agent and openllm.utils.is_transformers_supports_agent():
+  async def hf_agent(request: Request) -> Response:
+    """Handle a HuggingFace agent request.
+
+    The JSON body is structured into ``HfAgentInput``; ``stop`` is popped from its parameters
+    (defaulting to a newline) and the remaining parameters are forwarded to ``runner.generate_one``.
+
+    Raises:
+      openllm.exceptions.OpenLLMException: If the request body is not valid JSON.
+    """
+    raw_body = await request.body()
+    try:
+      payload = openllm.utils.bentoml_cattr.structure(orjson.loads(raw_body), openllm.HfAgentInput)
+    except orjson.JSONDecodeError as err:
+      raise openllm.exceptions.OpenLLMException(f"Invalid JSON input received: {err}") from None
+    stop = payload.parameters.pop("stop", ["\n"])
+    try:
+      generated = await runner.generate_one.async_run(payload.inputs, stop, **payload.parameters)
+      return JSONResponse(generated, status_code=200)
+    except NotImplementedError:
+      return JSONResponse(f"'{model}' is currently not supported with HuggingFace agents.", status_code=500)
+
+  # NOTE(review): debug=True is passed to Starlette here — confirm this is intended for served deployments.
+  hf_app = Starlette(debug=True, routes=[Route("/agent", hf_agent, methods=["POST"])])
+  svc.mount_asgi_app(hf_app, path="/hf")
+
+async def list_adapter_v1(_: Request) -> Response:
+  """Report the runner's PEFT adapters as JSON.
+
+  The payload always carries ``success`` and ``error_msg``; ``result`` (adapter name -> config dict)
+  is only included when the adapter lookup succeeded.
+  """
+  payload: dict[str, t.Any] = {}
+  if runner.peft_adapters["success"] is True:
+    payload["result"] = {name: config.to_dict() for name, config in runner.peft_adapters["result"].items()}
+  payload["success"] = runner.peft_adapters["success"]
+  payload["error_msg"] = runner.peft_adapters["error_msg"]
+  return JSONResponse(payload, status_code=200)
+
+# Expose the adapter listing as GET /v1/adapters through a small Starlette sub-application.
+adapters_app_v1 = Starlette(debug=True, routes=[Route("/adapters", list_adapter_v1, methods=["GET"])])
+svc.mount_asgi_app(adapters_app_v1, path="/v1")
--- a/openllm-python/src/openllm/_strategies.py
+++ b/openllm-python/src/openllm/_strategies.py
@@ -0,0 +1,332 @@
+from __future__ import annotations
+import functools, inspect, logging, math, os, sys, types, typing as t, warnings, psutil, bentoml
+from bentoml._internal.resource import get_resource, system_resources
+from bentoml._internal.runner.strategy import THREAD_ENVS
+from .utils import DEBUG, ReprMixin
+if sys.version_info[:2] >= (3, 11): from typing import overload
+else: from typing_extensions import overload
+
+class DynResource(t.Protocol):
+  """Structural type of the resource classes handled by the helpers in this module."""
+  # Resource identifier string, e.g. "nvidia.com/gpu" or "amd.com/gpu".
+  resource_id: t.ClassVar[str]
+  @classmethod
+  def from_system(cls) -> t.Sequence[t.Any]: ...
+
+logger = logging.getLogger(__name__)
+def _strtoul(s: str) -> int:
+  """Parse the integer that ``s`` starts with, loosely mimicking C's ``strtoul``.
+
+  An optional leading sign followed by ASCII digits is consumed; anything after the digits is
+  ignored, so ``"1gpu2"`` parses as ``1``.
+
+  Args:
+    s: The string to parse. Callers are expected to strip surrounding whitespace first.
+
+  Returns:
+    The parsed integer (negative when the number carries a ``-`` sign), or ``-1`` when ``s`` is
+    empty or does not start with a number. A sign without digits (``"+"``, ``"-gpu"``) also
+    yields ``-1``.
+  """
+  if not s: return -1
+  start = 1 if s[0] in "+-" else 0
+  end = start
+  while end < len(s) and s[end] in "0123456789": end += 1
+  # No digit consumed: either a non-numeric string or a bare sign, which int() would reject.
+  if end == start: return -1
+  return int(s[:end])
+
+def _parse_list_with_prefix(lst: str, prefix: str) -> list[str]:
+  """Collect the leading comma-separated entries of ``lst`` that start with ``prefix``.
+
+  Parsing stops at the first entry without the prefix; a repeated entry makes the whole result empty.
+  """
+  collected: list[str] = []
+  for entry in lst.split(","):
+    # A duplicated id invalidates the whole list.
+    if entry in collected:
+      return []
+    # The first entry without the prefix ends the sequence.
+    if entry.startswith(prefix):
+      collected.append(entry)
+    else:
+      break
+  return collected
+
+# ``stacklevel`` passed to ``warnings.warn`` by the helpers in this module.
+_STACK_LEVEL = 3
+
+
+@overload  # variant: default callback
+def _parse_visible_devices() -> list[str] | None: ...
+@overload  # variant: specify None, and respect_env
+def _parse_visible_devices(default_var: None, *, respect_env: t.Literal[True]) -> list[str] | None: ...
+@overload  # variant: default var is something other than None
+def _parse_visible_devices(default_var: str = ..., *, respect_env: t.Literal[False]) -> list[str]: ...
+def _parse_visible_devices(default_var: str | None = None, respect_env: bool = True) -> list[str] | None:
+  """Parse a device spec the way CUDA_VISIBLE_DEVICES is interpreted.
+
+  Args:
+    default_var: Spec to parse. With ``respect_env=True`` it is only the fallback used when
+      CUDA_VISIBLE_DEVICES is not set.
+    respect_env: Whether to read the spec from the CUDA_VISIBLE_DEVICES environment variable.
+
+  Returns:
+    The device ids as strings, or ``None`` when ``respect_env`` is set and the resulting spec is empty.
+
+  Raises:
+    ValueError: If ``respect_env`` is False and ``default_var`` is None.
+  """
+  if not respect_env:
+    if default_var is None: raise ValueError("spec is required to be not None when parsing spec.")
+    spec = default_var
+  else:
+    spec = os.environ.get("CUDA_VISIBLE_DEVICES", default_var)
+    if not spec: return None
+
+  # UUID-style specs are parsed as a list of entries sharing the prefix of the first one.
+  for uuid_prefix in ("GPU-", "MIG-"):
+    if spec.startswith(uuid_prefix): return _parse_list_with_prefix(spec, uuid_prefix)
+  # XXX: We need to somehow handle cases such as '100m'
+  # CUDA_VISIBLE_DEVICES uses something like strtoul
+  # which makes `1gpu2,2ampere` is equivalent to `1,2`
+  ordinals: list[int] = []
+  for token in spec.split(","):
+    ordinal = _strtoul(token.strip())
+    # A repeated ordinal results in an empty set.
+    if ordinal in ordinals: return []
+    # A negative value aborts the sequence.
+    if ordinal < 0: break
+    ordinals.append(ordinal)
+  return list(map(str, ordinals))
+
+def _from_system(cls: type[DynResource]) -> list[str]:
+  """Detect the device ids available to this process for the given resource class.
+
+  The result of parsing CUDA_VISIBLE_DEVICES wins whenever it is not ``None``. Otherwise AMD devices
+  are counted through the ROCm SMI ctypes bindings, and every other resource kind is counted through
+  the ``cuda`` Python bindings.
+
+  Returns:
+    Device ids as strings; an empty list when no device can be detected or the bindings are unavailable.
+  """
+  visible_devices = _parse_visible_devices()
+  if visible_devices is None:
+    if cls.resource_id == "amd.com/gpu":
+      if not psutil.LINUX:
+        if DEBUG: warnings.warn("AMD GPUs is currently only supported on Linux.", stacklevel=_STACK_LEVEL)
+        return []
+      # ROCm does not currently have the rocm_smi wheel.
+      # So we need to use the ctypes bindings directly.
+      # we don't want to use CLI because parsing is a pain.
+      sys.path.append("/opt/rocm/libexec/rocm_smi")
+      try:
+        from ctypes import byref, c_uint32
+
+        # refers to https://github.com/RadeonOpenCompute/rocm_smi_lib/blob/master/python_smi_tools/rsmiBindings.py
+        from rsmiBindings import rocmsmi, rsmi_status_t
+
+        device_count = c_uint32(0)
+        ret = rocmsmi.rsmi_num_monitor_devices(byref(device_count))
+        if ret == rsmi_status_t.RSMI_STATUS_SUCCESS: return [str(i) for i in range(device_count.value)]
+        return []
+      # In this case the binary is not found, returning empty list
+      except (ModuleNotFoundError, ImportError):
+        return []
+      finally:
+        # Always undo the temporary sys.path entry added above, whatever the outcome.
+        sys.path.remove("/opt/rocm/libexec/rocm_smi")
+    else:
+      try:
+        from cuda import cuda
+        cuda.cuInit(0)
+        # The first element of the returned tuple is discarded; only the device count is used.
+        _, dev = cuda.cuDeviceGetCount()
+        return [str(i) for i in range(dev)]
+      except (ImportError, RuntimeError, AttributeError):
+        return []
+  return visible_devices
+
+@overload
+def _from_spec(cls: type[DynResource], spec: int) -> list[str]: ...
+@overload
+def _from_spec(cls: type[DynResource], spec: list[int | str]) -> list[str]: ...
+@overload
+def _from_spec(cls: type[DynResource], spec: str) -> list[str]: ...
+def _from_spec(cls: type[DynResource], spec: t.Any) -> list[str]:
+  """Turn a user-provided resource spec into a list of device id strings.
+
+  Args:
+    cls: The resource class; only used to name the class in the ``TypeError`` message.
+    spec: An int (number of devices, ``-1``/``0`` meaning none), a string (a digit-only count or a
+      CUDA_VISIBLE_DEVICES-style list), or a list of ids.
+
+  Raises:
+    ValueError: If ``spec`` is an int smaller than -1.
+    TypeError: If ``spec`` is not an int, str or list.
+  """
+  if isinstance(spec, int):
+    if spec in (-1, 0):
+      return []
+    if spec < -1:
+      raise ValueError("Spec cannot be < -1.")
+    return list(map(str, range(spec)))
+  if isinstance(spec, str):
+    if not spec:
+      return []
+    if spec.isdigit():
+      # A digit-only string is a device count: expand it to "0,1,...,n-1" before parsing.
+      spec = ",".join(map(str, range(_strtoul(spec))))
+    return _parse_visible_devices(spec, respect_env=False)
+  if isinstance(spec, list):
+    return list(map(str, spec))
+  raise TypeError(f"'{cls.__name__}.from_spec' only supports parsing spec of type int, str, or list, got '{type(spec)}' instead.")
+
+def _raw_device_uuid_nvml() -> list[str] | None:
+  """Return the UUIDs of all NVIDIA devices by calling NVML through ctypes.
+
+  Returns:
+    The device UUIDs in device-index order, or ``None`` (after emitting a warning) when the NVML
+    library cannot be loaded, initialised or queried.
+  """
+  from ctypes import CDLL, byref, c_int, c_void_p, create_string_buffer
+
+  try: nvml_h = CDLL("libnvidia-ml.so.1")
+  except Exception:
+    warnings.warn("Failed to find nvidia binding", stacklevel=_STACK_LEVEL)
+    return None
+
+  rc = nvml_h.nvmlInit()
+  if rc != 0:
+    warnings.warn("Can't initialize NVML", stacklevel=_STACK_LEVEL)
+    return None
+  try:
+    dev_count = c_int(-1)
+    rc = nvml_h.nvmlDeviceGetCount_v2(byref(dev_count))
+    if rc != 0:
+      warnings.warn("Failed to get available device from system.", stacklevel=_STACK_LEVEL)
+      return None
+    uuids: list[str] = []
+    for idx in range(dev_count.value):
+      dev_id = c_void_p()
+      rc = nvml_h.nvmlDeviceGetHandleByIndex_v2(idx, byref(dev_id))
+      if rc != 0:
+        warnings.warn(f"Failed to get device handle for {idx}", stacklevel=_STACK_LEVEL)
+        return None
+      buf_len = 96
+      buf = create_string_buffer(buf_len)
+      rc = nvml_h.nvmlDeviceGetUUID(dev_id, buf, buf_len)
+      if rc != 0:
+        warnings.warn(f"Failed to get device UUID for {idx}", stacklevel=_STACK_LEVEL)
+        return None
+      # The buffer is NUL-padded; keep only the UUID text.
+      uuids.append(buf.raw.decode("ascii").strip("\0"))
+    return uuids
+  finally:
+    # Balance the successful nvmlInit() above so NVML is released on every exit path.
+    nvml_h.nvmlShutdown()
+
+def _validate(cls: type[DynResource], val: list[t.Any]) -> None:
+  """Validate a list of device ids against the devices visible through CUDA/NVML.
+
+  Validation is best effort: when the ``cuda`` bindings cannot be imported, or CUDA fails to
+  initialise, the checks are skipped silently.
+
+  Raises:
+    RuntimeError: If the resource is an AMD GPU resource (validation is not supported).
+    ValueError: If an entry is not a string, a UUID is not among the detected UUIDs, or a
+      numeric device id cannot be resolved.
+  """
+  if cls.resource_id == "amd.com/gpu":
+    raise RuntimeError("AMD GPU validation is not yet supported. Make sure to call 'get_resource(..., validate=False)'")
+  if not all(isinstance(i, str) for i in val): raise ValueError("Input list should be all string type.")
+
+  try:
+    from cuda import cuda
+
+    err, *_ = cuda.cuInit(0)
+    if err != cuda.CUresult.CUDA_SUCCESS:
+      # NOTE: this RuntimeError is caught by the handler below, so a failed init only skips validation.
+      raise RuntimeError("Failed to initialise CUDA runtime binding.")
+    # correctly parse handle
+    for el in val:
+      if el.startswith("GPU-") or el.startswith("MIG-"):
+        # UUID-style ids are checked against the UUIDs reported by NVML.
+        uuids = _raw_device_uuid_nvml()
+        if uuids is None: raise ValueError("Failed to parse available GPUs UUID")
+        if el not in uuids: raise ValueError(f"Given UUID {el} is not found with available UUID (available: {uuids})")
+      elif el.isdigit():
+        # Ordinal ids are checked by asking CUDA for the device handle.
+        err, _ = cuda.cuDeviceGet(int(el))
+        if err != cuda.CUresult.CUDA_SUCCESS: raise ValueError(f"Failed to get device {el}")
+  except (ImportError, RuntimeError):
+    pass
+
+def _make_resource_class(name: str, resource_kind: str, docstring: str) -> type[DynResource]:
+  """Dynamically create a ``bentoml.Resource[List[str]]`` subclass for ``resource_kind``.
+
+  The generated class gets ``_from_spec``, ``_from_system`` and ``_validate`` as classmethods,
+  the cleaned ``docstring`` as ``__doc__`` and ``resource_kind`` as its ``resource_id``.
+  """
+  bases = (bentoml.Resource[t.List[str]], ReprMixin)
+  def _populate(ns: dict[str, t.Any]) -> None:
+    ns.update({"resource_id": resource_kind, "from_spec": classmethod(_from_spec), "from_system": classmethod(_from_system), "validate": classmethod(_validate), "__repr_keys__": property(lambda _: {"resource_id"}), "__doc__": inspect.cleandoc(docstring), "__module__": "openllm._strategies"})
+  return types.new_class(name, bases, {"resource_id": resource_kind}, _populate)
+
+# NOTE: we need to hint these as t.Literal since mypy is too dumb to infer them as literals :facepalm:
+_TPU_RESOURCE: t.Literal["cloud-tpus.google.com/v2"] = "cloud-tpus.google.com/v2"
+_AMD_GPU_RESOURCE: t.Literal["amd.com/gpu"] = "amd.com/gpu"
+_NVIDIA_GPU_RESOURCE: t.Literal["nvidia.com/gpu"] = "nvidia.com/gpu"
+_CPU_RESOURCE: t.Literal["cpu"] = "cpu"
+
+# Resource classes generated by ``_make_resource_class``; the trailing string becomes each class's docstring.
+NvidiaGpuResource = _make_resource_class("NvidiaGpuResource", _NVIDIA_GPU_RESOURCE, """NVIDIA GPU resource.
+
+    This is a modified version of internal's BentoML's NvidiaGpuResource
+    where it respects and parse CUDA_VISIBLE_DEVICES correctly.""")
+AmdGpuResource = _make_resource_class("AmdGpuResource", _AMD_GPU_RESOURCE, """AMD GPU resource.
+
+    Since ROCm will respect CUDA_VISIBLE_DEVICES, the behaviour of from_spec, from_system are similar to
+    ``NvidiaGpuResource``. Currently ``validate`` is not yet supported.""")
+
+# Every resource identifier string known to this module.
+LiteralResourceSpec = t.Literal["cloud-tpus.google.com/v2", "amd.com/gpu", "nvidia.com/gpu", "cpu"]
+
+# convenient mapping
+def resource_spec(name: t.Literal["tpu", "amd", "nvidia", "cpu"]) -> LiteralResourceSpec:
+  """Translate a short alias (``tpu``, ``amd``, ``nvidia`` or ``cpu``) into its resource identifier.
+
+  Raises:
+    ValueError: If ``name`` is not one of the accepted aliases.
+  """
+  known_aliases = (("tpu", _TPU_RESOURCE), ("amd", _AMD_GPU_RESOURCE), ("nvidia", _NVIDIA_GPU_RESOURCE), ("cpu", _CPU_RESOURCE))
+  for alias, identifier in known_aliases:
+    if name == alias: return identifier
+  raise ValueError("Unknown alias. Accepted: ['tpu', 'amd', 'nvidia', 'cpu']")
+
+@functools.lru_cache
+def available_resource_spec() -> tuple[LiteralResourceSpec, ...]:
+  """Determine which resource kinds are available on the running system.
+
+  AMD GPUs are probed first, then NVIDIA GPUs; the CPU resource is always included last.
+  The result is cached after the first call.
+
+  TODO: Supports TPUs
+  """
+  detected: list[LiteralResourceSpec] = []
+  gpu_probes = ((AmdGpuResource, _AMD_GPU_RESOURCE), (NvidiaGpuResource, _NVIDIA_GPU_RESOURCE))
+  for resource_cls, identifier in gpu_probes:
+    if len(resource_cls.from_system()) > 0:
+      detected.append(identifier)
+  detected.append(_CPU_RESOURCE)
+  return tuple(detected)
+
+class CascadingResourceStrategy(bentoml.Strategy, ReprMixin):
+  """Scheduling strategy that cascades through NVIDIA GPU -> AMD GPU -> CPU resources.
+
+  This extends the default BentoML strategy. It also respects CUDA_VISIBLE_DEVICES for both AMD and NVIDIA GPUs.
+  See https://rocm.docs.amd.com/en/develop/understand/gpu_isolation.html#cuda-visible-devices
+  for ROCm's support for CUDA_VISIBLE_DEVICES.
+
+  TODO: Support CloudTPUResource
+  """
+  @classmethod
+  def get_worker_count(cls, runnable_class: type[bentoml.Runnable], resource_request: dict[str, t.Any] | None, workers_per_resource: float) -> int:
+    """Return the number of workers to be used for the given runnable class.
+
+    Note that whenever a GPU resource (NVIDIA or AMD) is requested, the number of workers is always 1.
+
+    Args:
+      runnable_class: The runnable class to be run.
+      resource_request: The resource request of the runnable. ``None`` falls back to the system resources.
+      workers_per_resource: # of workers per resource.
+
+    Returns:
+      The worker count, always as an ``int``. On CPU without multi-threading support this is
+      ``ceil(cpus) * workers_per_resource``, truncated towards zero when the product is fractional.
+
+    Raises:
+      ValueError: If a fractional ``workers_per_resource`` below 1 is used with a runnable that supports
+        CPU multi-threading, or if no supported resource is available.
+    """
+    if resource_request is None: resource_request = system_resources()
+    # use NVIDIA
+    kind = "nvidia.com/gpu"
+    nvidia_req = get_resource(resource_request, kind)
+    if nvidia_req is not None: return 1
+    # use AMD
+    kind = "amd.com/gpu"
+    amd_req = get_resource(resource_request, kind, validate=False)
+    if amd_req is not None: return 1
+    # use CPU
+    cpus = get_resource(resource_request, "cpu")
+    if cpus is not None and cpus > 0:
+      if "cpu" not in runnable_class.SUPPORTED_RESOURCES: logger.warning("No known supported resource available for %s, falling back to using CPU.", runnable_class)
+
+      if runnable_class.SUPPORTS_CPU_MULTI_THREADING:
+        if isinstance(workers_per_resource, float) and workers_per_resource < 1.0: raise ValueError("Fractional CPU multi threading support is not yet supported.")
+        return int(workers_per_resource)
+      # A float 'workers_per_resource' makes the product a float; convert it so the declared
+      # 'int' return type always holds (same truncation as the multi-threading branch above).
+      return int(math.ceil(cpus) * workers_per_resource)
+
+    # this should not be reached by user since we always read system resource as default
+    raise ValueError(f"No known supported resource available for {runnable_class}. Please check your resource request. Leaving it blank will allow BentoML to use system resources.")
+
+  @classmethod
+  def get_worker_env(cls, runnable_class: type[bentoml.Runnable], resource_request: dict[str, t.Any] | None, workers_per_resource: int | float, worker_index: int) -> dict[str, t.Any]:
+    """Get worker env for this given worker_index.
+
+    Args:
+      runnable_class: The runnable class to be run.
+      resource_request: The resource request of the runnable.
+      workers_per_resource: # of workers per resource.
+      worker_index: The index of the worker, start from 0.
+
+    Returns:
+      The environment variables to set for the worker: ``CUDA_VISIBLE_DEVICES`` for GPU workers, or
+      ``CUDA_VISIBLE_DEVICES=-1`` plus the thread-count variables for CPU workers. The dict is empty
+      when no supported resource is found.
+    """
+    cuda_env = os.environ.get("CUDA_VISIBLE_DEVICES", None)
+    # An empty string or "-1" means the user explicitly disabled GPUs.
+    disabled = cuda_env in ("", "-1")
+    environ: dict[str, t.Any] = {}
+
+    if resource_request is None: resource_request = system_resources()
+    # use NVIDIA
+    kind = "nvidia.com/gpu"
+    typ = get_resource(resource_request, kind)
+    if typ is not None and len(typ) > 0 and kind in runnable_class.SUPPORTED_RESOURCES:
+      if disabled:
+        logger.debug("CUDA_VISIBLE_DEVICES is disabled, %s will not be using GPU.", worker_index)
+        environ["CUDA_VISIBLE_DEVICES"] = cuda_env
+        return environ
+      environ["CUDA_VISIBLE_DEVICES"] = cls.transpile_workers_to_cuda_envvar(workers_per_resource, typ, worker_index)
+      logger.debug("Environ for worker %s: %s", worker_index, environ)
+      return environ
+    # use AMD
+    kind = "amd.com/gpu"
+    typ = get_resource(resource_request, kind, validate=False)
+    if typ is not None and len(typ) > 0 and kind in runnable_class.SUPPORTED_RESOURCES:
+      if disabled:
+        logger.debug("CUDA_VISIBLE_DEVICES is disabled, %s will not be using GPU.", worker_index)
+        environ["CUDA_VISIBLE_DEVICES"] = cuda_env
+        return environ
+      environ["CUDA_VISIBLE_DEVICES"] = cls.transpile_workers_to_cuda_envvar(workers_per_resource, typ, worker_index)
+      logger.debug("Environ for worker %s: %s", worker_index, environ)
+      return environ
+    # use CPU
+    cpus = get_resource(resource_request, "cpu")
+    if cpus is not None and cpus > 0:
+      environ["CUDA_VISIBLE_DEVICES"] = "-1"  # disable gpu
+      if runnable_class.SUPPORTS_CPU_MULTI_THREADING:
+        # Multi-threaded runnables get one thread per (rounded-up) CPU unless the user already set the variable.
+        thread_count = math.ceil(cpus)
+        for thread_env in THREAD_ENVS:
+          environ[thread_env] = os.environ.get(thread_env, str(thread_count))
+        logger.debug("Environ for worker %s: %s", worker_index, environ)
+        return environ
+      # Single-threaded runnables default every thread variable to "1".
+      for thread_env in THREAD_ENVS:
+        environ[thread_env] = os.environ.get(thread_env, "1")
+      return environ
+    return environ
+
+  @staticmethod
+  def transpile_workers_to_cuda_envvar(workers_per_resource: float | int, gpus: list[str], worker_index: int) -> str:
+    """Convert the given workers_per_resource to the matching CUDA_VISIBLE_DEVICES string.
+
+    Args:
+      workers_per_resource: A float <= 1 assigns ``round(1 / workers_per_resource)`` GPUs to each worker;
+        an int assigns one GPU shared by that many consecutive workers.
+      gpus: The available GPU ids.
+      worker_index: The index of the worker, start from 0.
+
+    Returns:
+      A comma-separated list of GPU ids (float case) or a single GPU id (int case).
+
+    Raises:
+      ValueError: If a float ``workers_per_resource`` is greater than 1, or (int case) the worker index maps past the last GPU.
+      IndexError: If (float case) fewer GPUs are available than one worker needs.
+    """
+    if isinstance(workers_per_resource, float):
+      # NOTE: We hit this branch when workers_per_resource is set to
+      # float, for example 0.5 or 0.25
+      if workers_per_resource > 1:
+        raise ValueError("Currently, the default strategy doesn't support workers_per_resource > 1. It is recommended that one should implement a custom strategy in this case.")
+      # We round the assigned resource here. This means workers_per_resource=.4 (1/.4 = 2.5)
+      # rounds down to 2, and workers_per_resource=.6 (1/.6 ~ 1.67) rounds up to 2 as well.
+      assigned_resource_per_worker = round(1 / workers_per_resource)
+      if len(gpus) < assigned_resource_per_worker:
+        logger.warning("Failed to allocate %s GPUs for %s (number of available GPUs < assigned workers per resource [%s])", gpus, worker_index, assigned_resource_per_worker)
+        raise IndexError(f"There aren't enough assigned GPU(s) for given worker id '{worker_index}' [required: {assigned_resource_per_worker}].")
+      # Each worker takes the next consecutive slice of GPUs; the last slice may be shorter.
+      assigned_gpu = gpus[assigned_resource_per_worker * worker_index:assigned_resource_per_worker * (worker_index+1)]
+      dev = ",".join(assigned_gpu)
+    else:
+      idx = worker_index // workers_per_resource
+      if idx >= len(gpus): raise ValueError(f"Number of available GPU ({gpus}) preceeds the given workers_per_resource {workers_per_resource}")
+      dev = str(gpus[idx])
+    return dev
--- a/openllm-python/src/openllm/_typing_compat.py
+++ b/openllm-python/src/openllm/_typing_compat.py
@@ -0,0 +1,102 @@
+from __future__ import annotations
+import sys, typing as t, bentoml, attr, abc
+from bentoml._internal.types import ModelSignatureDict as ModelSignatureDict
+if t.TYPE_CHECKING:
+  import openllm, peft, transformers, auto_gptq as autogptq, vllm
+  from bentoml._internal.runner.runnable import RunnableMethod
+  from bentoml._internal.runner.runner import RunnerMethod
+  from bentoml._internal.runner.strategy import Strategy
+
+  from .bundle.oci import LiteralContainerVersionStrategy
+  from .utils.lazy import VersionInfo
+
+# Type variable for the supported model objects (transformers, vLLM, PEFT and AutoGPTQ types).
+M = t.TypeVar("M", bound="t.Union[transformers.PreTrainedModel, transformers.Pipeline, transformers.TFPreTrainedModel, transformers.FlaxPreTrainedModel, vllm.LLMEngine, vllm.AsyncLLMEngine, peft.PeftModel, autogptq.modeling.BaseGPTQForCausalLM]")
+# Type variable for the supported tokenizer objects.
+T = t.TypeVar("T", bound="t.Union[transformers.PreTrainedTokenizerFast, transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerBase]")
+
+# Short aliases for commonly used generic types.
+AnyCallable = t.Callable[..., t.Any]
+DictStrAny = t.Dict[str, t.Any]
+ListAny = t.List[t.Any]
+ListStr = t.List[str]
+TupleAny = t.Tuple[t.Any, ...]
+# Type variable bound to attrs instances.
+At = t.TypeVar("At", bound=attr.AttrsInstance)
+
+# Names of the supported runtime implementations.
+LiteralRuntime = t.Literal["pt", "tf", "flax", "vllm"]
+# Names of the supported adapter types — presumably mirroring PEFT's adapter kinds; TODO confirm.
+AdapterType = t.Literal["lora", "adalora", "adaption_prompt", "prefix_tuning", "p_tuning", "prompt_tuning", "ia3"]
+
+if sys.version_info[:2] >= (3,11):
+  from typing import LiteralString as LiteralString, Self as Self, overload as overload
+  from typing import NotRequired as NotRequired, Required as Required, dataclass_transform as dataclass_transform
+else:
+  from typing_extensions import LiteralString as LiteralString, Self as Self, overload as overload
+  from typing_extensions import NotRequired as NotRequired, Required as Required, dataclass_transform as dataclass_transform
+
+if sys.version_info[:2] >= (3,10):
+  from typing import TypeAlias as TypeAlias, ParamSpec as ParamSpec, Concatenate as Concatenate
+else:
+  from typing_extensions import TypeAlias as TypeAlias, ParamSpec as ParamSpec, Concatenate as Concatenate
+
+if sys.version_info[:2] >= (3,9):
+  from typing import TypedDict as TypedDict
+else:
+  from typing_extensions import TypedDict as TypedDict
+
+class PeftAdapterOutput(TypedDict):
+  """Typed dict describing the outcome of a PEFT adapter lookup."""
+  # Whether the lookup succeeded.
+  success: bool
+  # Mapping of name -> PEFT config (presumably keyed by adapter name — TODO confirm).
+  result: t.Dict[str, peft.PeftConfig]
+  # Error description accompanying the lookup result.
+  error_msg: str
+
+class LLMEmbeddings(TypedDict):
+  """Typed dict holding embedding vectors together with a ``num_tokens`` count.
+
+  Uses the version-gated ``TypedDict`` imported above, consistent with ``PeftAdapterOutput``.
+  """
+  # One list of floats per embedded input.
+  embeddings: t.List[t.List[float]]
+  # Token count reported with the embeddings.
+  num_tokens: int
+
+class AdaptersTuple(TupleAny):
+  """Tuple type annotated with the named fields describing one adapter."""
+  # Identifier of the adapter.
+  adapter_id: str
+  # Optional name given to the adapter.
+  name: t.Optional[str]
+  # Configuration dict of the adapter.
+  config: DictStrAny
+
+# Maps each adapter type to the adapters registered for it.
+AdaptersMapping = t.Dict[AdapterType, t.Tuple[AdaptersTuple, ...]]
+
+class RefTuple(TupleAny):
+  """Tuple type annotated with the named fields of a resolved reference (git hash, version, strategy)."""
+  # Git commit hash of the reference.
+  git_hash: str
+  # Version information of the reference.
+  version: VersionInfo
+  # Container version strategy associated with the reference.
+  strategy: LiteralContainerVersionStrategy
+
+class LLMRunnable(bentoml.Runnable, t.Generic[M, T]):
+  """Typing view of the runnable behind an LLM runner, generic over the model (M) and tokenizer (T) types.
+
+  Only the resource declarations carry values; the method attributes below are annotations for type checkers.
+  """
+  # Resource kinds this runnable declares support for.
+  SUPPORTED_RESOURCES = ("amd.com/gpu", "nvidia.com/gpu", "cpu")
+  SUPPORTS_CPU_MULTI_THREADING = True
+  # Runnable methods, annotated with their positional argument types and return type.
+  __call__: RunnableMethod[LLMRunnable[M, T], [str], list[t.Any]]
+  set_adapter: RunnableMethod[LLMRunnable[M, T], [str], dict[t.Literal["success", "error_msg"], bool | str]]
+  embeddings: RunnableMethod[LLMRunnable[M, T], [list[str]], LLMEmbeddings]
+  generate: RunnableMethod[LLMRunnable[M, T], [str], list[t.Any]]
+  generate_one: RunnableMethod[LLMRunnable[M, T], [str, list[str]], t.Sequence[dict[t.Literal["generated_text"], str]]]
+  generate_iterator: RunnableMethod[LLMRunnable[M, T], [str], t.Generator[t.Any, None, None]]
+
+class LLMRunner(bentoml.Runner, t.Generic[M, T]):
+  """Typing view of an LLM runner, generic over the model (M) and tokenizer (T) types.
+
+  All method bodies are ``...`` stubs: this class only declares the attributes and signatures
+  that type checkers should see on a runner.
+  """
+  __doc__: str
+  __module__: str
+  llm_type: str
+  identifying_params: dict[str, t.Any]
+  # The LLM instance served by this runner.
+  llm: openllm.LLM[M, T]
+  config: openllm.LLMConfig
+  implementation: LiteralRuntime
+  # Capability flags; the service uses them to decide which endpoints to register.
+  supports_embeddings: bool
+  supports_hf_agent: bool
+  has_adapters: bool
+  # Runner-side counterparts of the runnable methods declared on LLMRunnable.
+  embeddings: RunnerMethod[LLMRunnable[M, T], [list[str]], LLMEmbeddings]
+  generate: RunnerMethod[LLMRunnable[M, T], [str], list[t.Any]]
+  generate_one: RunnerMethod[LLMRunnable[M, T], [str, list[str]], t.Sequence[dict[t.Literal["generated_text"], str]]]
+  generate_iterator: RunnerMethod[LLMRunnable[M, T], [str], t.Generator[t.Any, None, None]]
+  def __init__(self, runnable_class: type[LLMRunnable[M, T]], *, runnable_init_params: dict[str, t.Any] | None = ..., name: str | None = ..., scheduling_strategy: type[Strategy] = ..., models: list[bentoml.Model] | None = ..., max_batch_size: int | None = ..., max_latency_ms: int | None = ..., method_configs: dict[str, dict[str, int]] | None = ..., embedded: bool = False,) -> None: ...
+  def __call__(self, prompt: str, **attrs: t.Any) -> t.Any: ...
+  # NOTE(review): the abstractmethod markers only take effect if the class uses an ABC metaclass — confirm for bentoml.Runner.
+  @abc.abstractmethod
+  def embed(self, prompt: str | list[str]) -> LLMEmbeddings: ...
+  def run(self, prompt: str, **attrs: t.Any) -> t.Any: ...
+  async def async_run(self, prompt: str, **attrs: t.Any) -> t.Any: ...
+  @abc.abstractmethod
+  def download_model(self) -> bentoml.Model: ...
+  # Result of the PEFT adapter lookup (success flag, adapter configs and error message).
+  @property
+  @abc.abstractmethod
+  def peft_adapters(self) -> PeftAdapterOutput: ...
+  @property
+  @abc.abstractmethod
+  def __repr_keys__(self) -> set[str]: ...
--- a/openllm-python/src/openllm/bundle/__init__.py
+++ b/openllm-python/src/openllm/bundle/__init__.py
@@ -0,0 +1,34 @@
+"""Build-related utilities. Some of these utilities are mainly used for 'openllm.build'.
+
+These utilities will stay internal, and their API can be changed or updated without backward-compatibility.
+"""
+from __future__ import annotations
+import os, typing as t
+from openllm.utils import LazyModule
+
+# Maps each submodule to the public names it provides; handed to LazyModule at the bottom of this file.
+_import_structure: dict[str, list[str]] = {"_package": ["create_bento", "build_editable", "construct_python_options", "construct_docker_options"], "oci": ["CONTAINER_NAMES", "get_base_container_tag", "build_container", "get_base_container_name", "supported_registries", "RefResolver"]}
+
+if t.TYPE_CHECKING:
+  from . import (
+    _package as _package,
+    oci as oci,
+  )
+  from ._package import (
+    build_editable as build_editable,
+    construct_docker_options as construct_docker_options,
+    construct_python_options as construct_python_options,
+    create_bento as create_bento,
+  )
+  from .oci import (
+    CONTAINER_NAMES as CONTAINER_NAMES,
+    RefResolver as RefResolver,
+    build_container as build_container,
+    get_base_container_name as get_base_container_name,
+    get_base_container_tag as get_base_container_tag,
+    supported_registries as supported_registries,
+  )
+
+# Pass the module's real path via the ``__file__`` variable: the string literal "__file__"
+# would make os.path.abspath resolve to '<cwd>/__file__' instead of this file.
+__lazy=LazyModule(__name__, os.path.abspath(__file__), _import_structure)
+# Module-level attribute access, dir() and __all__ are all delegated to the lazy module.
+__all__=__lazy.__all__
+__dir__=__lazy.__dir__
+__getattr__=__lazy.__getattr__
--- a/openllm-python/src/openllm/bundle/_package.py
+++ b/openllm-python/src/openllm/bundle/_package.py
@@ -0,0 +1,144 @@
+# mypy: disable-error-code="misc"
+from __future__ import annotations
+import importlib.metadata, inspect, logging, os, typing as t
+from pathlib import Path
+import fs, fs.copy, fs.errors, orjson, bentoml, openllm
+from simple_di import Provide, inject
+from bentoml._internal.bento.build_config import BentoBuildConfig, DockerOptions, ModelSpec, PythonOptions
+from bentoml._internal.configuration.containers import BentoMLContainer
+from . import oci
+
+if t.TYPE_CHECKING:
+  from fs.base import FS
+  from openllm._typing_compat import LiteralString
+  from bentoml._internal.bento import BentoStore
+  from bentoml._internal.models.model import ModelStore
+  from .oci import LiteralContainerRegistry, LiteralContainerVersionStrategy
+
+logger = logging.getLogger(__name__)
+
+OPENLLM_DEV_BUILD = "OPENLLM_DEV_BUILD"
+
+def build_editable(path: str) -> str | None:
+  """Build an OpenLLM wheel into ``path`` when the OPENLLM_DEV_BUILD environment variable is "true".
+
+  Returns:
+    Whatever ``ProjectBuilder.build`` returns for the wheel, or None when the variable is not set to "true"
+    (compared case-insensitively).
+
+  Raises:
+    RuntimeError: If the OpenLLM source location cannot be found, or no ``pyproject.toml`` sits two levels above it.
+  """
+  dev_build_flag = str(os.environ.get(OPENLLM_DEV_BUILD, False)).lower()
+  if dev_build_flag != "true": return None
+  # The build tooling is imported lazily: it is only needed for development builds.
+  from build import ProjectBuilder
+  from build.env import IsolatedEnvBuilder
+  source_root = openllm.utils.pkg.source_locations("openllm")
+  if not source_root: raise RuntimeError("Could not find the source location of OpenLLM. Make sure to unset OPENLLM_DEV_BUILD if you are developing OpenLLM.")
+  pyproject_path = Path(source_root).parent.parent/"pyproject.toml"
+  if not os.path.isfile(os.fspath(pyproject_path)):
+    raise RuntimeError("Custom OpenLLM build is currently not supported. Please install OpenLLM from PyPI or built it from Git source.")
+  logger.info("OpenLLM is installed in editable mode. Generating built wheels...")
+  with IsolatedEnvBuilder() as isolated_env:
+    wheel_builder = ProjectBuilder(pyproject_path.parent)
+    # Point the builder at the isolated environment before installing the build requirements into it.
+    wheel_builder.python_executable = isolated_env.executable
+    wheel_builder.scripts_dir = isolated_env.scripts_dir
+    isolated_env.install(wheel_builder.build_system_requires)
+    return wheel_builder.build("wheel", path, config_settings={"--global-option": "--quiet"})
+
+def construct_python_options(llm: openllm.LLM[t.Any, t.Any], llm_fs: FS, extra_dependencies: tuple[str, ...] | None = None, adapter_map: dict[str, str | None] | None = None,) -> PythonOptions:
+  """Collect the pip requirements and local wheels for the Bento that packages ``llm``.
+
+  Args:
+    llm: LLM whose config supplies ``requirements`` and the framework (``env``) in use.
+    llm_fs: Build-context filesystem; a wheel produced by ``build_editable`` is written to its root.
+    extra_dependencies: Optional extras, each added as ``openllm[<extra>]``.
+    adapter_map: When not None, ``openllm[fine-tune]`` is added to the requirements.
+
+  Returns:
+    ``PythonOptions`` holding the package list, any built wheel, ``lock_packages=False`` and the PyTorch cu118 index.
+
+  Raises:
+    ValueError: If the framework selected by the config is reported unavailable by ``openllm.utils``.
+  """
+  packages = ["openllm", "scipy"]  # apparently bnb misses this one
+  if adapter_map is not None: packages.append("openllm[fine-tune]")
+  # NOTE: add openllm to the default dependencies
+  # if users has openllm custom built wheels, it will still respect
+  # that since bentoml will always install dependencies from requirements.txt
+  # first, then proceed to install everything inside the wheels/ folder.
+  if extra_dependencies is not None: packages.extend(f"openllm[{k}]" for k in extra_dependencies)
+
+  req = llm.config["requirements"]
+  if req is not None: packages.extend(req)
+  if str(os.environ.get("BENTOML_BUNDLE_LOCAL_BUILD", False)).lower() == "false": packages.append(f"bentoml>={'.'.join([str(i) for i in openllm.utils.pkg.pkg_version_info('bentoml')])}")
+
+  env = llm.config["env"]
+  framework_envvar = env["framework_value"]
+  if framework_envvar == "flax":
+    if not openllm.utils.is_flax_available(): raise ValueError(f"Flax is not available, while {env.framework} is set to 'flax'")
+    # Emit real requirement specifiers; a bare version number such as "0.7.0" is not an installable requirement.
+    packages.extend(f"{name}>={importlib.metadata.version(name)}" for name in ("flax", "jax", "jaxlib"))
+  elif framework_envvar == "tf":
+    if not openllm.utils.is_tf_available(): raise ValueError(f"TensorFlow is not available, while {env.framework} is set to 'tf'")
+    candidates = ("tensorflow", "tensorflow-cpu", "tensorflow-gpu", "tf-nightly", "tf-nightly-cpu", "tf-nightly-gpu", "intel-tensorflow", "intel-tensorflow-avx512", "tensorflow-rocm", "tensorflow-macos",)
+    # TensorFlow ships under several distribution names: take the version of the first one installed
+    # and require plain ``tensorflow`` at that version or newer.
+    for candidate in candidates:
+      try: _tf_version = importlib.metadata.version(candidate)
+      except importlib.metadata.PackageNotFoundError: continue  # noqa: PERF203 # Ok to ignore here since we actually need to check for all possible tensorflow distribution.
+      packages.append(f"tensorflow>={_tf_version}")
+      break
+  else:
+    if not openllm.utils.is_torch_available(): raise ValueError("PyTorch is not available. Make sure to have it locally installed.")
+    packages.append(f'torch>={importlib.metadata.version("torch")}')
+  wheels: list[str] = []
+  built_wheels = build_editable(llm_fs.getsyspath("/"))
+  # ``os.path.basename`` handles the platform's path separator, unlike splitting on "/".
+  if built_wheels is not None: wheels.append(llm_fs.getsyspath(f"/{os.path.basename(built_wheels)}"))
+  return PythonOptions(packages=packages, wheels=wheels, lock_packages=False, extra_index_url=["https://download.pytorch.org/whl/cu118"])
+
+def construct_docker_options(llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_resource: float, quantize: LiteralString | None, bettertransformer: bool | None, adapter_map: dict[str, str | None] | None, dockerfile_template: str | None, runtime: t.Literal["ggml", "transformers"], serialisation_format: t.Literal["safetensors", "legacy"], container_registry: LiteralContainerRegistry, container_version_strategy: LiteralContainerVersionStrategy) -> DockerOptions:
+  """Build the ``DockerOptions`` (base image, environment variables, Dockerfile template) for the Bento of ``llm``.
+
+  Args:
+    llm: LLM whose config and tag feed the container environment.
+    _: Unused; kept for signature compatibility.
+    workers_per_resource: Value written to the runner's ``workers_per_resource`` option.
+    quantize: Quantisation setting; exported only when the resolved value is not None.
+    bettertransformer: BetterTransformer setting; always exported, as a string.
+    adapter_map: JSON-encoded into ``OPENLLM_ADAPTER_MAP``; when truthy, ``BITSANDBYTES_NOWELCOME`` is exported too.
+    dockerfile_template: Forwarded unchanged to ``DockerOptions``.
+    runtime: Runtime implementation exported to the container.
+    serialisation_format: Exported as ``OPENLLM_SERIALIZATION``; forced to "legacy" when the framework is "vllm".
+    container_registry: Key into ``oci.CONTAINER_NAMES`` selecting the base image repository.
+    container_version_strategy: Strategy used to resolve the base image tag.
+
+  Returns:
+    The ``DockerOptions`` for the build.
+  """
+  # NOTE: ``pop`` also removes BENTOML_CONFIG_OPTIONS from this process' environment.
+  _bentoml_config_options = os.environ.pop("BENTOML_CONFIG_OPTIONS", "")
+  _runner_key = f'runners."llm-{llm.config["start_name"]}-runner"'
+  _bentoml_config_options_opts = ["tracing.sample_rate=1.0", f'{_runner_key}.traffic.timeout={llm.config["timeout"]}', f'api_server.traffic.timeout={llm.config["timeout"]}', f'{_runner_key}.workers_per_resource={workers_per_resource}']
+  # The separator must be parenthesised: unparenthesised, the conditional expression swallows the join,
+  # so user-provided options would only gain a trailing space and every generated option would be lost.
+  _bentoml_config_options += (" " if _bentoml_config_options else "") + " ".join(_bentoml_config_options_opts)
+  env: openllm.utils.EnvVarMixin = llm.config["env"]
+  if env["framework_value"] == "vllm": serialisation_format = "legacy"
+  env_dict = {
+      env.framework: env["framework_value"], env.config: f"'{llm.config.model_dump_json().decode()}'",
+      env.model_id: f"/home/bentoml/bento/models/{llm.tag.path()}",
+      "OPENLLM_MODEL": llm.config["model_name"], "OPENLLM_SERIALIZATION": serialisation_format,
+      "OPENLLM_ADAPTER_MAP": f"'{orjson.dumps(adapter_map).decode()}'", "BENTOML_DEBUG": str(True), "BENTOML_QUIET": str(False), "BENTOML_CONFIG_OPTIONS": f"'{_bentoml_config_options}'",
+  }
+  if adapter_map: env_dict["BITSANDBYTES_NOWELCOME"] = os.environ.get("BITSANDBYTES_NOWELCOME", "1")
+
+  # We need to handle None separately here, as env from subprocess doesn't accept None value.
+  _env = openllm.utils.EnvVarMixin(llm.config["model_name"], bettertransformer=bettertransformer, quantize=quantize, runtime=runtime)
+
+  env_dict[_env.bettertransformer] = str(_env["bettertransformer_value"])
+  if _env["quantize_value"] is not None: env_dict[_env.quantize] = t.cast(str, _env["quantize_value"])
+  env_dict[_env.runtime] = _env["runtime_value"]
+  return DockerOptions(base_image=f"{oci.CONTAINER_NAMES[container_registry]}:{oci.get_base_container_tag(container_version_strategy)}", env=env_dict, dockerfile_template=dockerfile_template)
+
+@inject
+def create_bento(bento_tag: bentoml.Tag, llm_fs: FS, llm: openllm.LLM[t.Any, t.Any], workers_per_resource: str | float, quantize: LiteralString | None, bettertransformer: bool | None, dockerfile_template: str | None, adapter_map: dict[str, str | None] | None = None, extra_dependencies: tuple[str, ...] | None = None,
+                runtime: t.Literal[ "ggml", "transformers"] = "transformers", serialisation_format: t.Literal["safetensors", "legacy"] = "safetensors", container_registry: LiteralContainerRegistry = "ecr", container_version_strategy: LiteralContainerVersionStrategy = "release",
+                _bento_store: BentoStore = Provide[BentoMLContainer.bento_store], _model_store: ModelStore = Provide[BentoMLContainer.model_store]) -> bentoml.Bento:
+  """Create and save the Bento that serves ``llm``.
+
+  The generated service file is written into ``llm_fs``, the Bento is created from that build context,
+  and the ``__bento_name__`` placeholder in the copied service file is then filled with the final Bento tag.
+
+  Args:
+    bento_tag: Name and version for the Bento.
+    llm_fs: Temporary build-context filesystem.
+    llm: LLM to package.
+    workers_per_resource: A number, or one of the strategies "round_robin" (1.0) and "conserved"
+      (1 / device count, or 1.0 without devices); any other string must parse as a float.
+    quantize, bettertransformer, dockerfile_template, runtime, serialisation_format, container_registry,
+      container_version_strategy: Forwarded to ``construct_docker_options``.
+    adapter_map: Optional adapter mapping; merged into the labels and forwarded to the option builders.
+    extra_dependencies: Forwarded to ``construct_python_options``.
+
+  Returns:
+    The saved ``bentoml.Bento``.
+
+  Raises:
+    ValueError: If ``workers_per_resource`` is a string that is neither a known strategy nor a float.
+  """
+  framework_envvar = llm.config["env"]["framework_value"]
+  labels = dict(llm.identifying_params)
+  labels.update({"_type": llm.llm_type, "_framework": framework_envvar, "start_name": llm.config["start_name"], "base_name_or_path": llm.model_id, "bundler": "openllm.bundle"})
+  if adapter_map: labels.update(adapter_map)
+  if isinstance(workers_per_resource, str):
+    if workers_per_resource == "round_robin": workers_per_resource = 1.0
+    elif workers_per_resource == "conserved": workers_per_resource = 1.0 if openllm.utils.device_count() == 0 else float(1 / openllm.utils.device_count())
+    else:
+      try: workers_per_resource = float(workers_per_resource)
+      except ValueError: raise ValueError("'workers_per_resource' only accept ['round_robin', 'conserved'] as possible strategies.") from None
+  elif isinstance(workers_per_resource, int): workers_per_resource = float(workers_per_resource)
+  logger.info("Building Bento for '%s'", llm.config["start_name"])
+  # add service.py definition to this temporary folder
+  openllm.utils.codegen.write_service(llm, adapter_map, llm_fs)
+
+  llm_spec = ModelSpec.from_item({"tag": str(llm.tag), "alias": llm.tag.name})
+  build_config = BentoBuildConfig(
+      service=f"{llm.config['service_name']}:svc", name=bento_tag.name, labels=labels, description=f"OpenLLM service for {llm.config['start_name']}", include=list(llm_fs.walk.files()), exclude=["/venv", "/.venv", "__pycache__/", "*.py[cod]", "*$py.class"], python=construct_python_options(llm, llm_fs, extra_dependencies, adapter_map), models=[llm_spec],
+      docker=construct_docker_options(llm, llm_fs, workers_per_resource, quantize, bettertransformer, adapter_map, dockerfile_template, runtime, serialisation_format, container_registry, container_version_strategy)
+  )
+
+  bento = bentoml.Bento.create(build_config=build_config, version=bento_tag.version, build_ctx=llm_fs.getsyspath("/"))
+  # NOTE: the model_id_path here are only used for setting this environment variable within the container
+  # built with for BentoLLM.
+  service_fs_path = fs.path.join("src", llm.config["service_name"])
+  service_path = bento._fs.getsyspath(service_fs_path)
+  # Read as UTF-8 so the round trip matches ``writetext`` below (UTF-8 by default) instead of the locale encoding.
+  with open(service_path, "r", encoding="utf-8") as f:
+    service_contents = f.readlines()
+
+  # Fill the ``__bento_name__`` placeholder on every line that carries it; all other lines pass through untouched.
+  bento_name = str(bento.tag)
+  script = "".join(it.format(__bento_name__=bento_name) if "__bento_name__" in it else it for it in service_contents)
+  if openllm.utils.DEBUG: logger.info("Generated script:\n%s", script)
+
+  bento._fs.writetext(service_fs_path, script)
+  if "model_store" in inspect.signature(bento.save).parameters: return bento.save(bento_store=_bento_store, model_store=_model_store)
+  # backward arguments. `model_store` is added recently
+  return bento.save(bento_store=_bento_store)
--- a/openllm-python/src/openllm/bundle/oci/Dockerfile
+++ b/openllm-python/src/openllm/bundle/oci/Dockerfile
@@ -0,0 +1,165 @@
+# syntax=docker/dockerfile-upstream:master
+# Copyright 2023 BentoML Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Adapted from: https://github.com/pytorch/pytorch/blob/master/Dockerfile
+# Stage: install Mambaforge under /opt/conda, then the CUDA build of PyTorch into it.
+FROM debian:bullseye-slim as pytorch-install
+
+ARG PYTORCH_VERSION=2.0.0
+ARG PYTHON_VERSION=3.9
+ARG CUDA_VERSION=11.8
+ARG MAMBA_VERSION=23.1.0-1
+ARG CUDA_CHANNEL=nvidia
+ARG INSTALL_CHANNEL=pytorch
+
+# Automatically set by buildx
+ARG TARGETPLATFORM
+
+ENV PATH=/opt/conda/bin:$PATH
+
+ENV DEBIAN_FRONTEND=noninteractive
+
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+        build-essential \
+        ca-certificates \
+        ccache \
+        curl \
+        git && \
+        rm -rf /var/lib/apt/lists/*
+
+# Install conda
+# translating Docker's TARGETPLATFORM into mamba arches
+RUN <<EOT
+set -e
+case ${TARGETPLATFORM} in
+    "linux/arm64")  MAMBA_ARCH=aarch64  ;;
+    *)              MAMBA_ARCH=x86_64   ;;
+esac
+curl -fsSL -v -o ~/mambaforge.sh "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh"
+EOT
+
+# `set -e`: without it a failing installer would be masked by the exit status of the trailing `rm`.
+RUN <<EOT
+set -e
+chmod +x ~/mambaforge.sh
+bash ~/mambaforge.sh -b -p /opt/conda
+rm ~/mambaforge.sh
+EOT
+
+
+# Install pytorch
+# On arm64 we exit with an error code
+# `set -e` plus one command per line: a failed conda update/install must fail the build
+# instead of being hidden by the exit status of `conda clean`.
+RUN <<EOT
+set -e
+case ${TARGETPLATFORM} in
+    "linux/arm64")  exit 1 ;;
+    *)              /opt/conda/bin/conda update -y conda
+                    /opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -c "${CUDA_CHANNEL}" -y "python=${PYTHON_VERSION}" pytorch==$PYTORCH_VERSION "pytorch-cuda=$(echo $CUDA_VERSION | cut -d'.' -f 1-2)"  ;;
+esac
+/opt/conda/bin/conda clean -ya
+EOT
+
+# CUDA kernels builder image
+# Adds ninja-build and the conda `cuda==11.8` package on top of pytorch-install;
+# the kernel build stages derive FROM this stage.
+FROM pytorch-install as kernel-builder
+
+RUN apt-get update && apt-get install -y --no-install-recommends ninja-build && \
+    rm -rf /var/lib/apt/lists/*
+
+RUN /opt/conda/bin/conda install -c "nvidia/label/cuda-11.8.0"  cuda==11.8 && \
+    /opt/conda/bin/conda clean -ya
+
+# NOTE: Build vllm CUDA kernels
+FROM kernel-builder as vllm-builder
+
+# vllm commit that gets checked out and built.
+ENV COMMIT_HASH=e8ddc08ec85495e5faca31bdf9129e0bf59a4fac
+ARG COMMIT_HASH=${COMMIT_HASH}
+
+WORKDIR /usr/src
+
+# `set -e` with one command per line: a failed clone or checkout must abort the build
+# rather than let `setup.py build` run against the wrong tree.
+RUN <<EOT
+set -e
+git clone https://github.com/vllm-project/vllm.git
+cd vllm
+git fetch
+git checkout ${COMMIT_HASH}
+python setup.py build
+EOT
+
+# NOTE: Build flash-attention-2 CUDA kernels
+FROM kernel-builder as flash-attn-v2-builder
+
+# flash-attention commit that gets checked out and built.
+ENV COMMIT_HASH=4c98d0b41f38ee638a979064856ae06fc1aec8b6
+ARG COMMIT_HASH=${COMMIT_HASH}
+
+WORKDIR /usr/src
+
+# `set -e` with one command per line: a failed pip install, clone or checkout must abort the build
+# rather than let `setup.py build` run against the wrong tree.
+RUN <<EOT
+set -e
+pip install packaging
+git clone https://github.com/Dao-AILab/flash-attention.git flash-attention-v2
+cd flash-attention-v2
+git fetch
+git checkout ${COMMIT_HASH}
+python setup.py build
+EOT
+
+# NOTE: Build auto-gptq CUDA kernels
+FROM kernel-builder as auto-gptq-builder
+
+# AutoGPTQ commit that gets checked out and built.
+ENV COMMIT_HASH=18326851213568df3c5bbbb1169fe51c7f7d6c60
+ARG COMMIT_HASH=${COMMIT_HASH}
+
+WORKDIR /usr/src
+
+# `set -e` with one command per line: a failed pip install, clone or checkout must abort the build
+# rather than let `setup.py build` run against the wrong tree.
+RUN <<EOT
+set -e
+pip install packaging
+git clone https://github.com/PanQiWei/AutoGPTQ.git
+cd AutoGPTQ
+git fetch
+git checkout ${COMMIT_HASH}
+TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6+PTX;8.9;9.0" python setup.py build
+EOT
+
+# base image
+# Runtime stage: starts from the CUDA 11.8 runtime image, receives /opt/conda and the compiled
+# kernels from the builder stages, then pip-installs OpenLLM from the copied sources.
+FROM nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu22.04 as base-container
+
+# Conda env
+ENV PATH=/opt/conda/bin:$PATH \
+    CONDA_PREFIX=/opt/conda
+
+WORKDIR /usr/src
+
+ENV DEBIAN_FRONTEND=noninteractive
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        libssl-dev ca-certificates make && \
+    rm -rf /var/lib/apt/lists/*
+
+# Copy conda with PyTorch installed
+COPY --from=pytorch-install /opt/conda /opt/conda
+
+# NOTE(review): the `lib.linux-x86_64-cpython-39` sources and `python3.9` destinations below are hard-coded;
+# they presumably have to track PYTHON_VERSION and the build platform of the earlier stages -- confirm when bumping either.
+# Copy build artefacts for vllm
+COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
+
+# Copy build artefacts for flash-attention-v2
+COPY --from=flash-attn-v2-builder /usr/src/flash-attention-v2/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
+
+# Copy build artefacts for auto-gptq
+COPY --from=auto-gptq-builder /usr/src/AutoGPTQ/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
+
+# Install required dependencies
+COPY openllm-python/src src
+COPY hatch.toml README.md CHANGELOG.md openllm-python/pyproject.toml ./
+
+# NOTE(review): ca-certificates is already installed by the earlier apt-get step in this stage.
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+        build-essential \
+        ca-certificates \
+        ccache \
+        curl \
+        git && \
+        rm -rf /var/lib/apt/lists/*
+
+# Install all required dependencies
+# NOTE(review): a pip cache mount and `--no-cache-dir` are both set; the flag presumably leaves the
+# cache mount unused -- confirm and drop one of the two.
+RUN --mount=type=cache,target=/root/.cache/pip pip install "ray==2.6.0" "einops" "jax[cuda11_local]" "torch>=2.0.1" xformers -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html ".[opt,mpt,fine-tune,llama,chatglm]" -v --no-cache-dir
+
+# Final image: base-container as-is; the container entrypoint runs `python3 -m openllm`.
+FROM base-container
+
+ENTRYPOINT ["python3", "-m", "openllm"]
--- a/openllm-python/src/openllm/bundle/oci/init.py
+++ b/openllm-python/src/openllm/bundle/oci/init.py
@@ -0,0 +1,129 @@
+# mypy: disable-error-code="misc"
+"""OCI-related utilities for OpenLLM. This module is considered internal and its API is subject to change."""
+from __future__ import annotations
+import functools, importlib, logging, os, pathlib, shutil, subprocess, typing as t
+from datetime import datetime, timedelta, timezone
+import attr, orjson, bentoml, openllm
+from openllm.utils.lazy import VersionInfo
+
+if t.TYPE_CHECKING:
+  from ghapi import all
+  from openllm._typing_compat import RefTuple, LiteralString
+
+# Bind ``ghapi.all`` through ``LazyLoader`` under the name ``all`` (this shadows the builtin ``all`` in this module).
+all = openllm.utils.LazyLoader("all", globals(), "ghapi.all")  # noqa: F811
+
+logger = logging.getLogger(__name__)
+
+_BUILDER = bentoml.container.get_backend("buildx")
+# NOTE: use the ``__file__`` variable. The string literal "__file__" would be resolved against the
+# current working directory, making ROOT_DIR depend on where the process was started.
+ROOT_DIR = pathlib.Path(os.path.abspath(__file__)).parent.parent.parent
+
+# TODO: support quay
+LiteralContainerRegistry = t.Literal["docker", "gh", "ecr"]
+LiteralContainerVersionStrategy = t.Literal["release", "nightly", "latest", "custom"]
+
+# XXX: The registries are hard-coded for now to keep them easy to maintain.
+# In the future they could be inferred (e.g. from the git repository) to give users more options
+# for building the base image. For now, every base image is <registry>/bentoml/openllm:...
+# NOTE: The ECR registry is the public one and currently only @bentoml team has access to push it.
+_CONTAINER_REGISTRY: dict[LiteralContainerRegistry, str] = {"docker": "docker.io/bentoml/openllm", "gh": "ghcr.io/bentoml/openllm", "ecr": "public.ecr.aws/y5w8i4y6/bentoml/openllm"}
+
+# TODO: support custom forks. Currently only the upstream openllm repository is supported.
+_OWNER = "bentoml"
+_REPO = "openllm"
+
+_module_location = openllm.utils.pkg.source_locations("openllm")
+
+@functools.lru_cache
+@openllm.utils.apply(str.lower)
+def get_base_container_name(reg: LiteralContainerRegistry) -> str:
+  """Look up the image repository registered under the alias ``reg`` (KeyError for an unknown alias)."""
+  container_name = _CONTAINER_REGISTRY[reg]
+  return container_name
+
+def _convert_version_from_string(s: str) -> VersionInfo:
+  """Parse ``s`` into a ``VersionInfo``; used as the attrs converter of ``RefResolver.version``."""
+  parsed = VersionInfo.from_version_string(s)
+  return parsed
+def _commit_time_range(r: int = 5) -> str:
+  """Return the UTC timestamp from ``r`` days ago, formatted as ``%Y-%m-%dT%H:%M:%SZ``."""
+  cutoff = datetime.now(timezone.utc) - timedelta(days=r)
+  return cutoff.strftime("%Y-%m-%dT%H:%M:%SZ")
+
+class VersionNotSupported(openllm.exceptions.OpenLLMException):
+  """Raised when the requested release is too old to ship an OpenLLM base container (versions below 0.2.12 are rejected)."""
+
+# Tuple class with the fields (git_hash, version, strategy), generated by ``make_attr_tuple_class``;
+# ``RefResolver`` builds instances of it and unpacks them into its own constructor.
+_RefTuple: type[RefTuple] = openllm.utils.codegen.make_attr_tuple_class("_RefTuple", ["git_hash", "version", "strategy"])
+
+def nightly_resolver(cls: type[RefResolver]) -> str:
+  """Resolve the image tag to use for the ``nightly`` strategy.
+
+  With docker available, the tags published at ``ghcr.io/bentoml/openllm`` are listed through a skopeo
+  container and the second-to-last entry is returned. Without docker, the tag is derived from the most recent
+  commit (within ``_commit_time_range()``) whose message does not contain "[skip ci]"; that image may not exist.
+
+  Args:
+    cls: Resolver class providing the ``_ghapi`` GitHub client.
+
+  Returns:
+    The tag string; the fallback path yields ``sha-<first 7 characters of the commit hash>``.
+  """
+  # NOTE: all openllm container will have sha-<git_hash[:7]>
+  # This will use docker to run skopeo to determine the correct latest tag that is available
+  # If docker is not found, then fallback to previous behaviour. Which the container might not exists.
+  docker_bin = shutil.which("docker")
+  if docker_bin is None:
+    logger.warning("To get the correct available nightly container, make sure to have docker available. Fallback to previous behaviour for determine nightly hash (container might not exists due to the lack of GPU machine at a time. See https://github.com/bentoml/OpenLLM/pkgs/container/openllm for available image.)")
+    commits = t.cast("list[dict[str, t.Any]]", cls._ghapi.repos.list_commits(since=_commit_time_range()))
+    return next(f'sha-{it["sha"][:7]}' for it in commits if "[skip ci]" not in it["commit"]["message"])
+  # now is the correct behaviour
+  # No ``-it``: the output is captured and nothing is read from stdin, while ``-t`` makes docker fail with
+  # "the input device is not a TTY" whenever stdin is not a terminal (CI, pipes, services).
+  raw_tags = subprocess.check_output([docker_bin, "run", "--rm", "quay.io/skopeo/stable:latest", "list-tags", "docker://ghcr.io/bentoml/openllm"])
+  # NOTE(review): [-2] presumably skips a trailing "latest" tag -- confirm against the registry's ordering.
+  return orjson.loads(raw_tags.decode().strip())["Tags"][-2]
+
+@attr.attrs(eq=False, order=False, slots=True, frozen=True)
+class RefResolver:
+  """Resolve a container version strategy (or an explicit version) into a git hash, a version and an image tag."""
+  git_hash: str = attr.field()
+  version: openllm.utils.VersionInfo = attr.field(converter=_convert_version_from_string)
+  strategy: LiteralContainerVersionStrategy = attr.field()
+  # GitHub API client for the upstream repository, shared by every resolver.
+  _ghapi: t.ClassVar[all.GhApi] = all.GhApi(owner=_OWNER, repo=_REPO)
+  @classmethod
+  def _nightly_ref(cls) -> RefTuple:
+    """Return the (tag, "refs/heads/main", "nightly") triple for the nightly strategy."""
+    nightly_tag = nightly_resolver(cls)
+    return _RefTuple((nightly_tag, "refs/heads/main", "nightly"))
+  @classmethod
+  def _release_ref(cls, version_str: str | None = None) -> RefTuple:
+    """Return (git_hash, version, strategy) for the latest GitHub release, or for ``version_str`` when given.
+
+    An explicit version yields an empty git hash and the "custom" strategy. Versions below 0.2.12 raise
+    ``VersionNotSupported``.
+    """
+    if version_str is None:
+      # NOTE: This strategy will only support openllm>0.2.12
+      meta: dict[str, t.Any] = cls._ghapi.repos.get_latest_release()
+      version_str = meta["name"].lstrip("v")
+      git_hash = cls._ghapi.git.get_ref(ref=f"tags/{meta['name']}")["object"]["sha"]
+      resolved_strategy = "release"
+    else:
+      git_hash, resolved_strategy = "", "custom"
+    if openllm.utils.VersionInfo.from_version_string(t.cast(str, version_str)) < (0, 2, 12): raise VersionNotSupported(f"Version {version_str} doesn't support OpenLLM base container. Consider using 'nightly' or upgrade 'openllm>=0.2.12'")
+    return _RefTuple((git_hash, version_str, resolved_strategy))
+  @classmethod
+  @functools.lru_cache(maxsize=64)
+  def from_strategy(cls, strategy_or_version: t.Literal["release", "nightly"] | LiteralString | None = None) -> RefResolver:
+    """Build a resolver from "release" (the default), "latest", "nightly", or an explicit version string."""
+    # using default strategy
+    if strategy_or_version is None or strategy_or_version == "release": return cls(*cls._release_ref())
+    if strategy_or_version == "latest": return cls("latest", "0.0.0", "latest")
+    if strategy_or_version == "nightly":
+      nightly_ref = cls._nightly_ref()
+      return cls(nightly_ref[0], "0.0.0", nightly_ref[-1])
+    # Anything else is treated as an explicit version.
+    logger.warning("Using custom %s. Make sure that it is at lease 0.2.12 for base container support.", strategy_or_version)
+    return cls(*cls._release_ref(version_str=strategy_or_version))
+  @property
+  def tag(self) -> str:
+    """Image tag for this reference: "latest", the git hash for nightly, otherwise ``repr`` of the version."""
+    # NOTE: latest tag can also be nightly, but discouraged to use it. For nightly refer to use sha-<git_hash_short>
+    if self.strategy == "latest": return "latest"
+    if self.strategy == "nightly": return self.git_hash
+    return repr(self.version)
+
+@functools.lru_cache(maxsize=256)
+def get_base_container_tag(strategy: LiteralContainerVersionStrategy | None = None) -> str:
+  """Return the base image tag that ``RefResolver.from_strategy`` resolves for ``strategy``."""
+  resolver = RefResolver.from_strategy(strategy)
+  return resolver.tag
+def build_container(registries: LiteralContainerRegistry | t.Sequence[LiteralContainerRegistry] | None = None, version_strategy: LiteralContainerVersionStrategy = "release", push: bool = False, machine: bool = False) -> dict[str | LiteralContainerRegistry, str]:
+  """Build the OpenLLM base container image with Buildx and return the image references that were tagged.
+
+  Args:
+    registries: One registry alias, a sequence of aliases, or None/empty to tag for every known registry.
+    version_strategy: Strategy used to resolve the image tag.
+    push: Forwarded to the builder's ``push`` option.
+    machine: Forwarded as ``quiet``; when the builder returns output, it is stored under the "image_sha" key.
+
+  Returns:
+    Mapping of registry alias to ``<repository>:<tag>`` (plus "image_sha" in machine mode).
+
+  Raises:
+    RuntimeError: If Buildx is unhealthy, no GPU is available, ``nvidia-container-runtime`` is missing,
+      or the ``openllm`` source location cannot be determined.
+    ValueError: If no ``pyproject.toml`` is found two levels above the ``openllm`` source location.
+    openllm.exceptions.OpenLLMException: If the build itself fails.
+  """
+  try:
+    if not _BUILDER.health(): raise openllm.exceptions.Error
+  except (openllm.exceptions.Error, subprocess.CalledProcessError): raise RuntimeError("Building base container requires BuildKit (via Buildx) to be installed. See https://docs.docker.com/build/buildx/install/ for instalation instruction.") from None
+  if openllm.utils.device_count() == 0:
+    raise RuntimeError("Building base container requires GPUs (None available)")
+  if not shutil.which("nvidia-container-runtime"):
+    raise RuntimeError("NVIDIA Container Toolkit is required to compile CUDA kernel in container.")
+  if not _module_location:
+    raise RuntimeError("Failed to determine source location of 'openllm'. (Possible broken installation)")
+  pyproject_path = pathlib.Path(_module_location).parent.parent / "pyproject.toml"
+  if not pyproject_path.exists():
+    raise ValueError("This utility can only be run within OpenLLM git repository. Clone it first with 'git clone https://github.com/bentoml/OpenLLM.git'")
+  tags: dict[str | LiteralContainerRegistry, str]
+  if registries:
+    registries = [registries] if isinstance(registries, str) else list(registries)
+    tags = {name: f"{_CONTAINER_REGISTRY[name]}:{get_base_container_tag(version_strategy)}" for name in registries}
+  else:
+    # No registry requested: tag the image for every known registry.
+    tags = {alias: f"{value}:{get_base_container_tag(version_strategy)}" for alias, value in _CONTAINER_REGISTRY.items()}
+  try:
+    dockerfile = pathlib.Path(__file__).parent.joinpath("Dockerfile").resolve().__fspath__()
+    build_context = pyproject_path.parent.__fspath__()
+    outputs = _BUILDER.build(file=dockerfile, context_path=build_context, tag=tuple(tags.values()), push=push, progress="plain" if openllm.utils.get_debug_mode() else "auto", quiet=machine)
+    if machine and outputs is not None: tags["image_sha"] = outputs.decode("utf-8").strip()
+  except Exception as err:
+    raise openllm.exceptions.OpenLLMException(f"Failed to containerize base container images (Scroll up to see error above, or set OPENLLMDEVDEBUG=True for more traceback):\n{err}") from err
+  return tags
+
+if t.TYPE_CHECKING:
+  CONTAINER_NAMES: dict[LiteralContainerRegistry, str]
+  supported_registries: list[str]
+
+__all__ = ["CONTAINER_NAMES", "get_base_container_tag", "build_container", "get_base_container_name", "supported_registries", "RefResolver"]
+def __dir__() -> list[str]:
+  """Return the public names of this module (``__all__``) in sorted order."""
+  public_names = list(__all__)
+  public_names.sort()
+  return public_names
+def __getattr__(name: str) -> t.Any:
+  """Module-level attribute hook for names that are not defined as module globals.
+
+  ``supported_registries`` yields a fresh list of the registry aliases and ``CONTAINER_NAMES`` the registry
+  mapping itself. Any other name listed in ``__all__`` is imported as a submodule of this package.
+
+  Raises:
+    AttributeError: If ``name`` is not a public name of this module.
+  """
+  # A throwaway ``functools.lru_cache`` wrapper rebuilt on each access caches nothing, so build the list directly.
+  if name == "supported_registries": return list(_CONTAINER_REGISTRY)
+  if name == "CONTAINER_NAMES": return _CONTAINER_REGISTRY
+  if name in __all__: return importlib.import_module("." + name, __name__)
+  raise AttributeError(f"{name} does not exists under {__name__}")
--- a/openllm-python/src/openllm/cli/init.py
+++ b/openllm-python/src/openllm/cli/init.py
@@ -0,0 +1,4 @@
+"""OpenLLM CLI.
+
+For more information see ``openllm -h``.
+"""
--- a/openllm-python/src/openllm/cli/_factory.py
+++ b/openllm-python/src/openllm/cli/_factory.py
@@ -0,0 +1,404 @@
+from __future__ import annotations
+import functools, importlib.util, os, typing as t
+import click, click_option_group as cog, inflection, orjson, bentoml, openllm
+from bentoml_cli.utils import BentoMLCommandGroup
+from click.shell_completion import CompletionItem
+from bentoml._internal.configuration.containers import BentoMLContainer
+from openllm._typing_compat import LiteralString, DictStrAny, ParamSpec, Concatenate
+from . import termui
+
+if t.TYPE_CHECKING:
+  import subprocess
+  from openllm._configuration import LLMConfig
+
+P = ParamSpec("P")
+LiteralOutput = t.Literal["json", "pretty", "porcelain"]
+
+_AnyCallable = t.Callable[..., t.Any]
+FC = t.TypeVar("FC", bound=t.Union[_AnyCallable, click.Command])
+
+def parse_config_options(config: LLMConfig, server_timeout: int, workers_per_resource: float, device: t.Tuple[str, ...] | None, environ: DictStrAny,) -> DictStrAny:
+  """Merge the server and runner settings into ``BENTOML_CONFIG_OPTIONS``.
+
+  Args:
+    config: LLM configuration; ``config["start_name"]`` and ``config["timeout"]`` are read.
+    server_timeout: value written to ``api_server.traffic.timeout``.
+    workers_per_resource: value written to the runner's ``workers_per_resource``.
+    device: optional GPU identifiers written to the runner's ``"nvidia.com/gpu"`` resource.
+    environ: environment mapping; it is modified in place and also returned.
+
+  Returns:
+    ``environ`` with ``BENTOML_CONFIG_OPTIONS`` set to the pre-existing value (if any) followed by the
+    generated options, separated by single spaces.
+  """
+  # TODO: Support amd.com/gpu on k8s
+  existing_options = environ.pop("BENTOML_CONFIG_OPTIONS", "")
+  runner_key = f'runners."llm-{config["start_name"]}-runner"'
+  generated = ["tracing.sample_rate=1.0", f"api_server.traffic.timeout={server_timeout}", f'{runner_key}.traffic.timeout={config["timeout"]}', f"{runner_key}.workers_per_resource={workers_per_resource}"]
+  if device:
+    if len(device) > 1: generated.extend(f'{runner_key}.resources."nvidia.com/gpu"[{idx}]={dev}' for idx, dev in enumerate(device))
+    else: generated.append(f'{runner_key}.resources."nvidia.com/gpu"=[{device[0]}]')
+  # NOTE: the separator must be chosen *before* concatenating. ``a if c else b + d`` parses as
+  # ``a if c else (b + d)``, which would discard the generated options whenever a value already exists.
+  separator = " " if existing_options else ""
+  environ["BENTOML_CONFIG_OPTIONS"] = existing_options + separator + " ".join(generated)
+  return environ
+
+_adapter_mapping_key = "adapter_map"
+
+def _id_callback(ctx: click.Context, _: click.Parameter, value: t.Tuple[str, ...] | None) -> None:
+  """Click callback that records each ``--adapter-id`` value in ``ctx.params``.
+
+  Every value is split on its last ``:`` into an adapter id and an optional adapter name, and stored as
+  ``ctx.params[_adapter_mapping_key][adapter_id] = adapter_name`` (``None`` when no name was given).
+  The callback itself always returns ``None``.
+  """
+  if value:
+    mapping = ctx.params.setdefault(_adapter_mapping_key, {})
+    for spec in value:
+      pieces = spec.rsplit(":", maxsplit=1)
+      location = pieces[0]
+      alias = pieces[1] if len(pieces) > 1 else None
+      # Try to turn a relative path into a full one (resolved against the current directory only);
+      # anything that is not found on disk is kept as given.
+      try: location = openllm.utils.resolve_user_filepath(location, os.getcwd())
+      except FileNotFoundError: pass
+      mapping[location] = alias
+  return None
+
+def start_command_factory(group: click.Group, model: str, _context_settings: DictStrAny | None = None, _serve_grpc: bool = False) -> click.Command:
+  """Generate a 'click.Command' for any given LLM.
+
+  Args:
+    group: the target ``click.Group`` to register this LLM command under.
+    model: the model name, resolved through ``openllm.AutoConfig.for_model``.
+    _context_settings: optional click context settings; falls back to ``termui.CONTEXT_SETTINGS``.
+    _serve_grpc: when true, a ``bentoml.GrpcServer`` is started instead of a ``bentoml.HTTPServer``.
+
+  Returns:
+    The click.Command for starting the model server. If the model requires a GPU and
+    ``openllm.utils.device_count()`` reports none, the placeholder from ``noop_command`` is returned instead.
+
+  The generated command returns the validated ``LLMConfig``, or the server process when
+  ``--return-process`` is set.
+  """
+  llm_config = openllm.AutoConfig.for_model(model)
+
+  # NOTE: the help text below is a runtime f-string; its layout (including the ``\b`` markers) is user-facing.
+  command_attrs: DictStrAny = dict(
+      name=llm_config["model_name"], context_settings=_context_settings or termui.CONTEXT_SETTINGS, short_help=f"Start a LLMServer for '{model}'", aliases=[llm_config["start_name"]] if llm_config["name_type"] == "dasherize" else None, help=f"""\
+{llm_config['env'].start_docstring}
+
+\b
+Note: ``{llm_config['start_name']}`` can also be run with any other models available on HuggingFace
+or fine-tuned variants as long as it belongs to the architecture generation ``{llm_config['architecture']}`` (trust_remote_code={llm_config['trust_remote_code']}).
+
+\b
+For example: One can start [Fastchat-T5](https://huggingface.co/lmsys/fastchat-t5-3b-v1.0) with ``openllm start flan-t5``:
+
+\b
+$ openllm start flan-t5 --model-id lmsys/fastchat-t5-3b-v1.0
+
+\b
+Available official model_id(s): [default: {llm_config['default_id']}]
+
+\b
+{orjson.dumps(llm_config['model_ids'], option=orjson.OPT_INDENT_2).decode()}
+""",
+  )
+
+  if llm_config["requires_gpu"] and openllm.utils.device_count() < 1:
+    # NOTE: The model requires GPU, therefore we will return a dummy command
+    command_attrs.update({"short_help": "(Disabled because there is no GPU available)", "help": f"""{model} is currently not available to run on your local machine because it requires GPU for inference."""})
+    return noop_command(group, llm_config, _serve_grpc, **command_attrs)
+
+  @group.command(**command_attrs)
+  @start_decorator(llm_config, serve_grpc=_serve_grpc)
+  @click.pass_context
+  def start_cmd(
+      ctx: click.Context, /, server_timeout: int, model_id: str | None, model_version: str | None, workers_per_resource: t.Literal["conserved", "round_robin"] | LiteralString, device: t.Tuple[str, ...], quantize: t.Literal["int8", "int4", "gptq"] | None, bettertransformer: bool | None, runtime: t.Literal["ggml", "transformers"], fast: bool,
+      serialisation_format: t.Literal["safetensors", "legacy"], adapter_id: str | None, return_process: bool, **attrs: t.Any,
+  ) -> LLMConfig | subprocess.Popen[bytes]:
+    # Re-derive ``fast`` as a bool: true only when its upper-cased string form is in ENV_VARS_TRUE_VALUES.
+    fast = str(fast).upper() in openllm.utils.ENV_VARS_TRUE_VALUES
+    # Warn (unless OPENLLM_SERIALIZATION_WARNING is set to a non-true value) when quantisation is combined with safetensors.
+    if serialisation_format == "safetensors" and quantize is not None and os.environ.get("OPENLLM_SERIALIZATION_WARNING", str(True)).upper() in openllm.utils.ENV_VARS_TRUE_VALUES:
+      termui.echo(f"'--quantize={quantize}' might not work with 'safetensors' serialisation format. Use with caution!. To silence this warning, set \"OPENLLM_SERIALIZATION_WARNING=False\"\nNote: You can always fallback to '--serialisation legacy' when running quantisation.", fg="yellow")
+    # The ``--adapter-id`` callback (``_id_callback``) stores its mapping under ``_adapter_mapping_key``; absent when the flag was not used.
+    adapter_map: dict[str, str | None] | None = attrs.pop(_adapter_mapping_key, None)
+    # The remaining keyword arguments are split into the validated config and the server arguments.
+    config, server_attrs = llm_config.model_validate_click(**attrs)
+    server_timeout = openllm.utils.first_not_none(server_timeout, default=config["timeout"])
+    server_attrs.update({"working_dir": os.path.dirname(os.path.dirname(__file__)), "timeout": server_timeout})
+    if _serve_grpc: server_attrs["grpc_protocol_version"] = "v1"
+    # NOTE: currently, there's no development args in bentoml.Server. To be fixed upstream.
+    development = server_attrs.pop("development")
+    server_attrs.setdefault("production", not development)
+    wpr = openllm.utils.first_not_none(workers_per_resource, default=config["workers_per_resource"])
+
+    # Turn the workers-per-resource setting into a float:
+    #   - "round_robin" -> 1.0
+    #   - "conserved"   -> 1 / <number of GPUs> (``--device`` entries if given, else ``device_count()``), or 1.0 without GPUs
+    #   - any other string or an int is converted with ``float``.
+    if isinstance(wpr, str):
+      if wpr == "round_robin": wpr = 1.0
+      elif wpr == "conserved":
+        if device and openllm.utils.device_count() == 0:
+          termui.echo("--device will have no effect as there is no GPUs available", fg="yellow")
+          wpr = 1.0
+        else:
+          available_gpu = len(device) if device else openllm.utils.device_count()
+          wpr = 1.0 if available_gpu == 0 else float(1 / available_gpu)
+      else:
+        wpr = float(wpr)
+    elif isinstance(wpr, int):
+      wpr = float(wpr)
+
+    # Create a new model env to work with the envvar during CLI invocation
+    env = openllm.utils.EnvVarMixin(config["model_name"], config.default_implementation(), model_id=model_id or config["default_id"], bettertransformer=bettertransformer, quantize=quantize, runtime=runtime)
+    # NOTE(review): ``1 / wpr`` raises ZeroDivisionError when the resolved value is 0 -- confirm whether 0 can reach this point.
+    prerequisite_check(ctx, config, quantize, adapter_map, int(1 / wpr))
+
+    # NOTE: This is to set current configuration
+    start_env = os.environ.copy()
+    start_env = parse_config_options(config, server_timeout, wpr, device, start_env)
+    if fast: termui.echo(f"Fast mode is enabled. Make sure the model is available in local store before 'start': 'openllm import {model}{' --model-id ' + model_id if model_id else ''}'", fg="yellow")
+
+    # Everything the server needs is handed over through ``start_env``, which is passed to ``server.start`` below.
+    start_env.update({"OPENLLM_MODEL": model, "BENTOML_DEBUG": str(openllm.utils.get_debug_mode()), "BENTOML_HOME": os.environ.get("BENTOML_HOME", BentoMLContainer.bentoml_home.get()), "OPENLLM_ADAPTER_MAP": orjson.dumps(adapter_map).decode(), "OPENLLM_SERIALIZATION": serialisation_format, env.runtime: env["runtime_value"], env.framework: env["framework_value"]})
+    if env["model_id_value"]: start_env[env.model_id] = str(env["model_id_value"])
+    # NOTE: quantize and bettertransformer value is already assigned within env
+    if bettertransformer is not None: start_env[env.bettertransformer] = str(env["bettertransformer_value"])
+    if quantize is not None: start_env[env.quantize] = str(t.cast(str, env["quantize_value"]))
+
+    # ``ensure_available=not fast``: fast mode skips the availability step (see the fast-mode message above).
+    llm = openllm.utils.infer_auto_class(env["framework_value"]).for_model(model, model_id=start_env[env.model_id], model_version=model_version, llm_config=config, ensure_available=not fast, adapter_map=adapter_map, serialisation=serialisation_format)
+    start_env.update({env.config: llm.config.model_dump_json().decode()})
+
+    server = bentoml.GrpcServer("_service:svc", **server_attrs) if _serve_grpc else bentoml.HTTPServer("_service:svc", **server_attrs)
+    openllm.utils.analytics.track_start_init(llm.config)
+
+    def next_step(model_name: str, adapter_map: DictStrAny | None) -> None:
+      # Print the suggested follow-up ``openllm build`` command (suppressed in quiet mode), repeating any
+      # adapters; the adapter name is omitted when it is ``None`` or "default".
+      cmd_name = f"openllm build {model_name}"
+      if adapter_map is not None: cmd_name += " " + " ".join([f"--adapter-id {s}" for s in [f"{p}:{name}" if name not in (None, "default") else p for p, name in adapter_map.items()]])
+      if not openllm.utils.get_quiet_mode(): termui.echo(f"\n🚀 Next step: run '{cmd_name}' to create a Bento for {model_name}", fg="blue")
+
+    if return_process:
+      # Hand the started server's process back to the caller.
+      server.start(env=start_env, text=True)
+      if server.process is None: raise click.ClickException("Failed to start the server.")
+      return server.process
+    else:
+      # Blocking start: Ctrl-C and a clean exit both print the next-step hint; any other
+      # exception is reported on the terminal and not re-raised.
+      try:
+        server.start(env=start_env, text=True, blocking=True)
+      except KeyboardInterrupt:
+        next_step(model, adapter_map)
+      except Exception as err:
+        termui.echo(f"Error caught while running LLM Server:\n{err}", fg="red")
+      else:
+        next_step(model, adapter_map)
+
+    # NOTE: Return the configuration for telemetry purposes.
+    return config
+
+  return start_cmd
+
+def noop_command(group: click.Group, llm_config: LLMConfig, _serve_grpc: bool, **command_attrs: t.Any) -> click.Command:
+  """Register a placeholder command for a model that cannot run because no GPU is available.
+
+  Args:
+    group: the ``click.Group`` the placeholder is registered on.
+    llm_config: configuration returned by the placeholder and passed to the start-init analytics call.
+    _serve_grpc: accepted for signature parity with the real start command; not used here.
+    **command_attrs: keyword arguments forwarded to ``group.command``. ``context_settings`` is extended with
+      ``ignore_unknown_options`` and ``allow_extra_args`` so any start flags are accepted and ignored.
+
+  Returns:
+    The registered command; invoking it prints a notice and returns ``llm_config``.
+  """
+  # Build a *new* dict instead of updating the caller's: the settings passed in may be a shared module-level
+  # default, and mutating it would leak these two flags into every other command that reuses it.
+  command_attrs["context_settings"] = {**(command_attrs.pop("context_settings", None) or {}), "ignore_unknown_options": True, "allow_extra_args": True}
+  # NOTE: The model requires GPU, therefore we will return a dummy command
+  @group.command(**command_attrs)
+  def noop(**_: t.Any) -> LLMConfig:
+    termui.echo("No GPU available, therefore this command is disabled", fg="red")
+    openllm.utils.analytics.track_start_init(llm_config)
+    return llm_config
+
+  return noop
+
+def prerequisite_check(ctx: click.Context, llm_config: LLMConfig, quantize: LiteralString | None, adapter_map: dict[str, str | None] | None, num_workers: int) -> None:
+  """Validate the start options before launching the server.
+
+  Fails the click context when adapters are requested without ``peft`` being available, or when quantization
+  is requested while the default implementation is ``vllm``. Requirements of the model that cannot be found
+  as importable modules only produce a warning. ``num_workers`` is currently not inspected.
+  """
+  if adapter_map:
+    if not openllm.utils.is_peft_available(): ctx.fail("Using adapter requires 'peft' to be available. Make sure to install with 'pip install \"openllm[fine-tune]\"'")
+  if quantize:
+    if llm_config.default_implementation() == "vllm": ctx.fail(f"Quantization is not yet supported with vLLM. Set '{llm_config['env']['framework']}=\"pt\"' to run with quantization.")
+  requirements = llm_config["requirements"]
+  if requirements is None or len(requirements) == 0: return
+  # A requirement counts as missing when its underscored name has no import spec.
+  not_installed = [pkg for pkg in requirements if importlib.util.find_spec(inflection.underscore(pkg)) is None]
+  if not_installed: termui.echo(f"Make sure to have the following dependencies available: {not_installed}", fg="yellow")
+
+def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callable[[FC], FC]:
+  """Build the decorator that attaches every ``openllm start`` option to a command function.
+
+  Args:
+    llm_config: configuration of the model; supplies its own click options (``to_click_options``), its
+      environment variable names (``llm_config["env"]``) and the names shown in help texts.
+    serve_grpc: attach the gRPC server options instead of the HTTP ones.
+
+  Returns:
+    A decorator that applies the composed option decorators to the given function and returns the result.
+  """
+  def wrapper(fn: FC) -> FC:
+    composed = openllm.utils.compose(
+          llm_config.to_click_options, _http_server_args if not serve_grpc else _grpc_server_args,
+          cog.optgroup.group("General LLM Options", help=f"The following options are related to running '{llm_config['start_name']}' LLM Server."),
+          model_id_option(factory=cog.optgroup, model_env=llm_config["env"]),
+          model_version_option(factory=cog.optgroup),
+          cog.optgroup.option("--server-timeout", type=int, default=None, help="Server timeout in seconds"),
+          workers_per_resource_option(factory=cog.optgroup),
+          fast_option(factory=cog.optgroup),
+          cog.optgroup.group(
+              "LLM Optimization Options", help="""Optimization related options.
+
+            OpenLLM supports running model with [BetterTransformer](https://pytorch.org/blog/a-better-transformer-for-fast-transformer-encoder-inference/),
+            k-bit quantization (8-bit, 4-bit), GPTQ quantization, PagedAttention via vLLM.
+
+            The following are either in our roadmap or currently being worked on:
+
+            - DeepSpeed Inference: [link](https://www.deepspeed.ai/inference/)
+            - GGML: Fast inference on [bare metal](https://github.com/ggerganov/ggml)
+            """,
+          ),
+          cog.optgroup.option("--device", type=openllm.utils.dantic.CUDA, multiple=True, envvar="CUDA_VISIBLE_DEVICES", callback=parse_device_callback, help=f"Assign GPU devices (if available) for {llm_config['model_name']}.", show_envvar=True),
+          cog.optgroup.option("--runtime", type=click.Choice(["ggml", "transformers"]), default="transformers", help="The runtime to use for the given model. Default is transformers."),
+          quantize_option(factory=cog.optgroup, model_env=llm_config["env"]),
+          bettertransformer_option(factory=cog.optgroup, model_env=llm_config["env"]),
+          serialisation_option(factory=cog.optgroup),
+          cog.optgroup.group(
+              "Fine-tuning related options", help="""\
+    Note that the argument `--adapter-id` can accept the following format:
+
+    - `--adapter-id /path/to/adapter` (local adapter)
+
+    - `--adapter-id remote/adapter` (remote adapter from HuggingFace Hub)
+
+    - `--adapter-id remote/adapter:eng_lora` (two previous adapter options with the given adapter_name)
+
+    ```bash
+
+    $ openllm start opt --adapter-id /path/to/adapter_dir --adapter-id remote/adapter:eng_lora
+
+    ```
+    """,
+          ),
+          # NOTE: the metavar mirrors what ``_id_callback`` parses: the adapter id first, then an optional ``:adapter_name``.
+          cog.optgroup.option("--adapter-id", default=None, help="Optional name or path for given LoRA adapter" + f" to wrap '{llm_config['model_name']}'", multiple=True, callback=_id_callback, metavar="[PATH | [remote/]adapter_id[:adapter_name]][, ...]"),
+          click.option("--return-process", is_flag=True, default=False, help="Internal use only.", hidden=True),
+    )
+    return composed(fn)
+  return wrapper
+
+def parse_device_callback(ctx: click.Context, param: click.Parameter, value: tuple[tuple[str], ...] | None) -> t.Tuple[str, ...] | None:
+  """Click callback that flattens the ``--device`` values into a single tuple of device ids.
+
+  ``None`` is passed through unchanged. A non-tuple value fails the click context. When the flattened
+  result is exactly ``("all",)``, the ids reported by ``openllm.utils.available_devices()`` are returned
+  as strings instead.
+  """
+  if value is None: return None
+  if not isinstance(value, tuple): ctx.fail(f"{param} only accept multiple values, not {type(value)} (value: {value})")
+  flattened: list[str] = []
+  for chunk in value: flattened.extend(chunk)
+  # NOTE: --device all is a special case
+  if len(flattened) == 1 and flattened[0] == "all": return tuple(str(dev) for dev in openllm.utils.available_devices())
+  return tuple(flattened)
+
+# NOTE: BentoML serve options that are filtered out and not re-exposed through `openllm start`.
+# NOTE: Users shouldn't set '--working-dir', as OpenLLM sets this up itself.
+# NOTE: '--production' is also deprecated.
+_IGNORED_OPTIONS = {"working_dir", "production", "protocol_version"}
+
+def parse_serve_args(serve_grpc: bool) -> t.Callable[[t.Callable[..., LLMConfig]], t.Callable[[FC], FC]]:
+  """Mirror the options of ``bentoml serve`` / ``bentoml serve-grpc`` so ``openllm start`` accepts them too."""
+  from bentoml_cli.cli import cli
+
+  if serve_grpc: command, protocol, synonym = "serve-grpc", "gRPC", "serve-grpc"
+  else: command, protocol, synonym = "serve", "HTTP", "serve-http"
+  group = cog.optgroup.group(f"Start a {protocol} server options", help=f"Related to serving the model [synonymous to `bentoml {synonym}`]")
+
+  def decorator(f: t.Callable[Concatenate[int, t.Optional[str], P], LLMConfig]) -> t.Callable[[FC], FC]:
+    # Drop the leading bento argument and the trailing BentoMLCommandGroup.NUMBER_OF_COMMON_PARAMS common parameters.
+    candidates = cli.commands[command].params[1:-BentoMLCommandGroup.NUMBER_OF_COMMON_PARAMS]
+    decorated = f
+    for option in reversed(candidates):
+      if option.name in _IGNORED_OPTIONS: continue
+      info = option.to_info_dict()
+      # These keys are not valid keyword arguments for an option: everything here is an option already,
+      # the name is derived from the declarations, and the type can be determined from the default value.
+      for unused_key in ("param_type_name", "name", "type"): info.pop(unused_key)
+      primary = info.pop("opts")
+      secondary = info.pop("secondary_opts")
+      decorated = cog.optgroup.option(*primary, *secondary, **info)(decorated)
+    return group(decorated)
+  return decorator
+
+_http_server_args, _grpc_server_args = parse_serve_args(False), parse_serve_args(True)
+
+def _click_factory_type(*param_decls: t.Any, **attrs: t.Any) -> t.Callable[[FC | None], FC]:
+  """Shared builder behind ``cli_option`` and ``cli_argument``.
+
+  Two extra keywords are consumed here: ``factory`` (the object providing the decorator, ``click`` by
+  default) and ``attr`` (the attribute to look up on it, ``"option"`` by default). Unless ``attr`` is
+  ``"argument"``, a generic ``help`` text is filled in when none is given. The returned callable applies
+  the decorator to a function, or returns the bare decorator when called with ``None``.
+  """
+  factory = attrs.pop("factory", click)
+  factory_attr = attrs.pop("attr", "option")
+  if factory_attr != "argument": attrs.setdefault("help", "General option for OpenLLM CLI.")
+  def decorator(f: FC | None) -> FC:
+    builder = getattr(factory, factory_attr, None)
+    if builder is None: raise ValueError(f"Factory {factory} has no attribute {factory_attr}.")
+    applied = builder(*param_decls, **attrs)
+    if f is None: return t.cast(FC, applied)
+    return t.cast(FC, applied(f))
+  return decorator
+
+cli_option = functools.partial(_click_factory_type, attr="option")
+cli_argument = functools.partial(_click_factory_type, attr="argument")
+
+def output_option(f: _AnyCallable | None = None, *, default_value: LiteralOutput = "pretty", **attrs: t.Any) -> t.Callable[[FC], FC]:
+  """Attach ``-o/--output`` (``json``, ``pretty`` or ``porcelain``), with shell completion of the choices."""
+  choices = ["json", "pretty", "porcelain"]
+  def complete_output_var(ctx: click.Context, param: click.Parameter, incomplete: str) -> list[CompletionItem]:
+    # Every choice is offered regardless of what has been typed so far.
+    return list(map(CompletionItem, choices))
+  decorate = cli_option(
+      "-o", "--output", "output",
+      type=click.Choice(choices), default=default_value, help="Showing output type.", show_default=True,
+      envvar="OPENLLM_OUTPUT", show_envvar=True, shell_complete=complete_output_var, **attrs
+  )
+  return decorate(f)
+def fast_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
+  """Attach the ``--fast/--no-fast`` switch (default off, env var ``OPENLLM_USE_LOCAL_LATEST``)."""
+  help_text = """Whether to skip checking if models is already in store.
+
+                                                                                                          This is useful if you already downloaded or setup the model beforehand.
+                                                                                                          """
+  decorate = cli_option("--fast/--no-fast", show_default=True, default=False, envvar="OPENLLM_USE_LOCAL_LATEST", show_envvar=True, help=help_text, **attrs)
+  return decorate(f)
+def machine_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
+  """Attach the hidden ``--machine`` flag (off by default)."""
+  decorate = cli_option("--machine", is_flag=True, default=False, hidden=True, **attrs)
+  return decorate(f)
+def model_id_option(f: _AnyCallable | None = None, *, model_env: openllm.utils.EnvVarMixin | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
+  """Attach ``--model-id``; when ``model_env`` is given, its ``model_id`` env var backs the option and is shown in help."""
+  has_env = model_env is not None
+  envvar = model_env.model_id if has_env else None
+  decorate = cli_option("--model-id", type=click.STRING, default=None, envvar=envvar, show_envvar=has_env, help="Optional model_id name or path for (fine-tune) weight.", **attrs)
+  return decorate(f)
+def model_version_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
+  """Attach the optional ``--model-version`` string option (default ``None``)."""
+  decorate = cli_option("--model-version", type=click.STRING, default=None, help="Optional model version to save for this model. It will be inferred automatically from model-id.", **attrs)
+  return decorate(f)
+def model_name_argument(f: _AnyCallable | None = None, required: bool = True) -> t.Callable[[FC], FC]:
+  """Attach the ``model_name`` argument, restricted to the dasherized keys of ``openllm.CONFIG_MAPPING``."""
+  known_models = [inflection.dasherize(name) for name in openllm.CONFIG_MAPPING]
+  decorate = cli_argument("model_name", type=click.Choice(known_models), required=required)
+  return decorate(f)
+
+def quantize_option(f: _AnyCallable | None = None, *, build: bool = False, model_env: openllm.utils.EnvVarMixin | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
+  """Attach ``--quantise`` / ``--quantize`` (stored as ``quantize``; one of ``int8``, ``int4``, ``gptq``).
+
+  When ``model_env`` is given, its ``quantize`` env var backs the option and is shown in help. With
+  ``build=True`` an extra note about deployment is inserted into the help text.
+  """
+  # NOTE: the help text is assembled from three literals; the middle one is only non-empty when ``build`` is true.
+  return cli_option(
+      "--quantise", "--quantize", "quantize", type=click.Choice(["int8", "int4", "gptq"]), default=None, envvar=model_env.quantize if model_env is not None else None, show_envvar=model_env is not None, help="""Dynamic quantization for running this LLM.
+
+                                                                                                                                                                            The following quantization strategies are supported:
+
+                                                                                                                                                                            - ``int8``: ``LLM.int8`` for [8-bit](https://arxiv.org/abs/2208.07339) quantization.
+
+                                                                                                                                                                            - ``int4``: ``SpQR`` for [4-bit](https://arxiv.org/abs/2306.03078) quantization.
+
+                                                                                                                                                                            - ``gptq``: ``GPTQ`` [quantization](https://arxiv.org/abs/2210.17323)
+
+                                                                                                                                                                            > [!NOTE] that the model can also be served with quantized weights.
+                                                                                                                                                                            """ + (
+          """
+                                                                                                                                                                            > [!NOTE] that this will set the mode for serving within deployment.""" if build else ""
+      ) + """
+                                                                                                                                                                            > [!NOTE] that quantization are currently only available in *PyTorch* models.""", **attrs
+  )(f)
+
+def workers_per_resource_option(f: _AnyCallable | None = None, *, build: bool = False, **attrs: t.Any) -> t.Callable[[FC], FC]:
+  """Attach ``--workers-per-resource`` (a string, validated by ``workers_per_resource_callback``).
+
+  With ``build=True`` an extra note about container/Kubernetes provisioning is appended to the help text.
+  """
+  # NOTE: the value stays a string here; strategy names and numeric strings are both allowed through by the callback.
+  return cli_option(
+      "--workers-per-resource", default=None, callback=workers_per_resource_callback, type=str, required=False, help="""Number of workers per resource assigned.
+
+                                                                                                                                                  See https://docs.bentoml.org/en/latest/guides/scheduling.html#resource-scheduling-strategy
+                                                                                                                                                  for more information. By default, this is set to 1.
+
+                                                                                                                                                  > [!NOTE] ``--workers-per-resource`` will also accept the following strategies:
+
+                                                                                                                                                  - ``round_robin``: Similar behaviour when setting ``--workers-per-resource 1``. This is useful for smaller models.
+
+                                                                                                                                                  - ``conserved``: This will determine the number of available GPU resources, and only assign one worker for the LLMRunner. For example, if ther are 4 GPUs available, then ``conserved`` is equivalent to ``--workers-per-resource 0.25``.
+                                                                                                                                                  """ + (
+          """\n
+                                                                                                                                                  > [!NOTE] The workers value passed into 'build' will determine how the LLM can
+                                                                                                                                                  > be provisioned in Kubernetes as well as in standalone container. This will
+                                                                                                                                                  > ensure it has the same effect with 'openllm start --workers ...'""" if build else ""
+      ), **attrs
+  )(f)
+
+def bettertransformer_option(f: _AnyCallable | None = None, *, build: bool = False, model_env: openllm.utils.EnvVarMixin | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
+  """Attach the ``--bettertransformer`` flag (default ``None``, i.e. unset).
+
+  When ``model_env`` is given, its ``bettertransformer`` env var backs the flag and is shown in help.
+  ``build`` selects the build-time wording of the help text instead of the serving-time one.
+  """
+  # NOTE: the help names BetterTransformer, matching the flag itself and the optimization group help.
+  if build: help_text = "Set default environment variable whether to serve this model with BetterTransformer in build time."
+  else: help_text = "Apply BetterTransformer wrapper to serve model. This will applies during serving time."
+  return cli_option(
+      "--bettertransformer", is_flag=True, default=None, envvar=model_env.bettertransformer if model_env is not None else None, show_envvar=model_env is not None, help=help_text, **attrs
+  )(f)
+
+def serialisation_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
+  """Attach ``--serialisation`` / ``--serialization`` (stored as ``serialisation_format``).
+
+  Accepts ``safetensors`` (the default) or ``legacy``; the value can also come from ``OPENLLM_SERIALIZATION``.
+  """
+  return cli_option(
+      "--serialisation", "--serialization", "serialisation_format", type=click.Choice(["safetensors", "legacy"]), default="safetensors", show_default=True, show_envvar=True, envvar="OPENLLM_SERIALIZATION", help="""Serialisation format for save/load LLM.
+
+                                                                                                                  Currently the following strategies are supported:
+
+                                                                                                                  - ``safetensors``: This will use safetensors format, which is synonymous to
+
+                                                                                                                              \b
+                                                                                                                              ``safe_serialization=True``.
+
+                                                                                                                              \b
+                                                                                                                              > [!NOTE] that this format might not work for every cases, and
+                                                                                                                              you can always fallback to ``legacy`` if needed.
+
+                                                                                                                  - ``legacy``: This will use PyTorch serialisation format, often as ``.bin`` files.
+                                                                                                                                  This should be used if the model doesn't yet support safetensors.
+
+                                                                                                                  > [!NOTE] that GGML format is working in progress.
+                                                                                                                  """, **attrs
+  )(f)
+
+def container_registry_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
+  """Attach ``--container-registry`` (default ``ecr``, env var ``OPENLLM_CONTAINER_REGISTRY``).
+
+  The value is validated by ``container_registry_callback``. Extra keyword arguments (for example
+  ``factory=``) are forwarded to ``cli_option``, as in the other option helpers.
+  """
+  # NOTE: ``**attrs`` must be passed on, otherwise caller overrides such as ``factory`` are silently dropped.
+  return cli_option(
+      "--container-registry", "container_registry", type=str, default="ecr", show_default=True, show_envvar=True, envvar="OPENLLM_CONTAINER_REGISTRY", callback=container_registry_callback, help="""The default container registry to get the base image for building BentoLLM.
+
+                                                                                                                        Currently, it supports 'ecr', 'ghcr.io', 'docker.io'
+
+                                                                                                                        \b
+                                                                                                                        > [!NOTE] that in order to build the base image, you will need a GPUs to compile custom kernel. See ``openllm ext build-base-container`` for more information.
+                                                                                                                        """, **attrs
+  )(f)
+
+_wpr_strategies = {"round_robin", "conserved"}
+
+def workers_per_resource_callback(ctx: click.Context, param: click.Parameter, value: str | None) -> str | None:
+  """Validate ``--workers-per-resource``.
+
+  ``None`` passes through. Otherwise the value is underscored and returned as a string when it is one of
+  the known strategies or parses as a float; anything else raises ``click.BadParameter``.
+  """
+  if value is None: return None
+  normalized = inflection.underscore(value)
+  if normalized not in _wpr_strategies:
+    # Only checking that the text is numeric; the conversion result itself is discarded.
+    try: float(normalized)  # type: ignore[arg-type]
+    except ValueError: raise click.BadParameter(f"'workers_per_resource' only accept '{_wpr_strategies}' as possible strategies, otherwise pass in float.", ctx, param) from None
+  return normalized
+
+def container_registry_callback(ctx: click.Context, param: click.Parameter, value: str | None) -> str | None:
+  """Validate ``--container-registry`` against ``openllm.bundle.supported_registries`` (``None`` passes through)."""
+  if value is None or value in openllm.bundle.supported_registries: return value
+  raise click.BadParameter(f"Value must be one of {openllm.bundle.supported_registries}", ctx, param)
--- a/openllm-python/src/openllm/cli/_sdk.py
+++ b/openllm-python/src/openllm/cli/_sdk.py
@@ -0,0 +1,205 @@
+from __future__ import annotations
+import itertools, logging, os, re, subprocess, sys, typing as t
+import bentoml, openllm
+from simple_di import Provide, inject
+from bentoml._internal.configuration.containers import BentoMLContainer
+from openllm.exceptions import OpenLLMException
+from . import termui
+from ._factory import start_command_factory
+
+if t.TYPE_CHECKING:
+  from openllm._typing_compat import LiteralString, LiteralRuntime
+  from bentoml._internal.bento import BentoStore
+  from openllm._configuration import LLMConfig
+  from openllm.bundle.oci import LiteralContainerRegistry, LiteralContainerVersionStrategy
+
+logger = logging.getLogger(__name__)
+
+def _start(model_name: str, /, *, model_id: str | None = None, timeout: int = 30, workers_per_resource: t.Literal["conserved", "round_robin"] | float | None = None, device: tuple[str, ...] | t.Literal["all"] | None = None, quantize: t.Literal["int8", "int4", "gptq"] | None = None, bettertransformer: bool | None = None, runtime: t.Literal["ggml", "transformers"] = "transformers", fast: bool = False, adapter_map: dict[LiteralString, str | None] | None = None, framework: LiteralRuntime | None = None, additional_args: list[str] | None = None, _serve_grpc: bool = False, __test__: bool = False, **_: t.Any) -> LLMConfig | subprocess.Popen[bytes]:
+  """Python API to start a LLM server. It provides a one-to-one mapping to the CLI arguments.
+
+  For all additional arguments, pass them as strings via ``additional_args``. For example, to pass ``--port 5001``, use ``additional_args=["--port", "5001"]``.
+
+  > [!NOTE] This will create a blocking process, so if you use this API, you can create a running sub thread to start the server instead of blocking the main thread.
+
+  ``openllm.start`` will invoke ``click.Command`` under the hood, so it behaves exactly the same as the CLI interaction.
+
+  > [!NOTE] ``quantize`` and ``bettertransformer`` are mutually exclusive.
+
+  Args:
+      model_name: The model name to start this LLM
+      model_id: Optional model id for this given LLM
+      timeout: The server timeout
+      workers_per_resource: Number of workers per resource assigned. See [resource scheduling](https://docs.bentoml.org/en/latest/guides/scheduling.html#resource-scheduling-strategy) for more information. By default, this is set to 1.
+
+                            > [!NOTE] ``--workers-per-resource`` will also accept the following strategies:
+                            > - ``round_robin``: Similar behaviour when setting ``--workers-per-resource 1``. This is useful for smaller models.
+                            > - ``conserved``: This will determine the number of available GPU resources, and only assign
+                            >                  one worker for the LLMRunner. For example, if there are 4 GPUs available, then ``conserved`` is
+                            >                  equivalent to ``--workers-per-resource 0.25``.
+      device: Assign GPU devices (if available) to this LLM, either as a tuple of device ids or the string 'all' for every available GPU. Defaults to ``None``. Ignored when ``CUDA_VISIBLE_DEVICES`` is already set.
+      quantize: Quantize the model weights. This is only applicable for PyTorch models.
+                Possible quantisation strategies:
+                - int8: Quantize the model with 8bit (bitsandbytes required)
+                - int4: Quantize the model with 4bit (bitsandbytes required)
+                - gptq: Quantize the model with GPTQ (auto-gptq required)
+      bettertransformer: Convert given model to FastTransformer with PyTorch.
+      runtime: The runtime to use for this LLM. By default, this is set to ``transformers``. In the future, this will include supports for GGML.
+      fast: Enable fast mode. This will skip downloading models, and will raise errors if given model_id does not exists under local store.
+      adapter_map: The adapter mapping of LoRA to use for this LLM. It accepts a dictionary of ``{adapter_id: adapter_name}``.
+      framework: The framework to use for this LLM. By default, this is set to ``pt``.
+      additional_args: Additional arguments to pass to ``openllm start``.
+
+  Returns:
+      Whatever the generated click command returns when run with ``standalone_mode=False`` (annotated as ``LLMConfig`` or a ``subprocess.Popen``).
+
+  Raises:
+      OpenLLMException: If both ``quantize`` and ``bettertransformer`` are set.
+  """
+  from .entrypoint import start_command, start_grpc_command
+  fast = os.environ.get("OPENLLM_FAST", str(fast)).upper() in openllm.utils.ENV_VARS_TRUE_VALUES
+  llm_config = openllm.AutoConfig.for_model(model_name)
+  _ModelEnv = openllm.utils.EnvVarMixin(model_name, openllm.utils.first_not_none(framework, default=llm_config.default_implementation()), model_id=model_id, bettertransformer=bettertransformer, quantize=quantize, runtime=runtime)
+  os.environ[_ModelEnv.framework] = _ModelEnv["framework_value"]
+  args: list[str] = ["--runtime", runtime]
+  if model_id: args.extend(["--model-id", model_id])
+  if timeout: args.extend(["--server-timeout", str(timeout)])
+  if workers_per_resource: args.extend(["--workers-per-resource", str(workers_per_resource) if not isinstance(workers_per_resource, str) else workers_per_resource])
+  # ``device`` may be the plain string "all"; joining a str would split it into characters ("a,l,l"), so only join real sequences.
+  if device and not os.environ.get("CUDA_VISIBLE_DEVICES"): args.extend(["--device", device if isinstance(device, str) else ",".join(device)])
+  if quantize and bettertransformer: raise OpenLLMException("'quantize' and 'bettertransformer' are currently mutually exclusive.")
+  if quantize: args.extend(["--quantize", str(quantize)])
+  elif bettertransformer: args.append("--bettertransformer")
+  if fast: args.append("--fast")
+  if adapter_map: args.extend(list(itertools.chain.from_iterable([["--adapter-id", f"{k}{':'+v if v else ''}"] for k, v in adapter_map.items()])))
+  if additional_args: args.extend(additional_args)
+  if __test__: args.append("--return-process")
+  return start_command_factory(start_command if not _serve_grpc else start_grpc_command, model_name, _context_settings=termui.CONTEXT_SETTINGS, _serve_grpc=_serve_grpc).main(args=args if len(args) > 0 else None, standalone_mode=False)
+
+@inject
+def _build(model_name: str, /, *, model_id: str | None = None, model_version: str | None = None, quantize: t.Literal["int8", "int4", "gptq"] | None = None, bettertransformer: bool | None = None, adapter_map: dict[str, str | None] | None = None, build_ctx: str | None = None, enable_features: tuple[str, ...] | None = None, workers_per_resource: float | None = None, runtime: t.Literal["ggml", "transformers"] = "transformers", dockerfile_template: str | None = None, overwrite: bool = False, container_registry: LiteralContainerRegistry | None = None, container_version_strategy: LiteralContainerVersionStrategy | None = None, push: bool = False, containerize: bool = False, serialisation_format: t.Literal["safetensors", "legacy"] = "safetensors", additional_args: list[str] | None = None, bento_store: BentoStore = Provide[BentoMLContainer.bento_store]) -> bentoml.Bento:
+  """Package a LLM into a Bento.
+
+  The LLM will be built into a BentoService with the following structure:
+  if ``quantize`` is passed, it will instruct the model to be quantized dynamically during serving time.
+  if ``bettertransformer`` is passed, it will instruct the model to apply FasterTransformer during serving time.
+
+  ``openllm.build`` runs ``python -m openllm build`` in a subprocess, so it behaves exactly the same as the ``openllm build`` CLI.
+
+  > [!NOTE] ``quantize`` and ``bettertransformer`` are mutually exclusive.
+
+  Args:
+      model_name: The model name to start this LLM
+      model_id: Optional model id for this given LLM
+      model_version: Optional model version for this given LLM
+      quantize: Quantize the model weights. This is only applicable for PyTorch models. Possible quantisation strategies:
+                - int8: Quantize the model with 8bit (bitsandbytes required)
+                - int4: Quantize the model with 4bit (bitsandbytes required)
+                - gptq: Quantize the model with GPTQ (auto-gptq required)
+      bettertransformer: Convert given model to FastTransformer with PyTorch.
+      adapter_map: The adapter mapping of LoRA to use for this LLM. It accepts a dictionary of ``{adapter_id: adapter_name}``.
+      build_ctx: The build context to use for building BentoLLM. By default, it sets to current directory.
+      enable_features: Additional OpenLLM features to be included with this BentoLLM.
+      workers_per_resource: Number of workers per resource assigned. See [resource scheduling](https://docs.bentoml.org/en/latest/guides/scheduling.html#resource-scheduling-strategy) for more information. By default, this is set to 1.
+
+                            > [!NOTE] ``--workers-per-resource`` will also accept the following strategies:
+                            > - ``round_robin``: Similar behaviour when setting ``--workers-per-resource 1``. This is useful for smaller models.
+                            > - ``conserved``: This will determine the number of available GPU resources, and only assign
+                            >                  one worker for the LLMRunner. For example, if there are 4 GPUs available, then ``conserved`` is
+                            >                  equivalent to ``--workers-per-resource 0.25``.
+      runtime: The runtime to use for this LLM. By default, this is set to ``transformers``. In the future, this will include supports for GGML.
+      dockerfile_template: The dockerfile template to use for building BentoLLM. See https://docs.bentoml.com/en/latest/guides/containerization.html#dockerfile-template.
+      overwrite: Whether to overwrite the existing BentoLLM. By default, this is set to ``False``.
+      push: Whether to push the result bento to BentoCloud. Make sure to login with 'bentoml cloud login' first.
+      containerize: Whether to containerize the Bento after building. '--containerize' is the shortcut of 'openllm build && bentoml containerize'.
+                    Note that 'containerize' and 'push' are mutually exclusive.
+      container_registry: Container registry to choose the base OpenLLM container image to build from. Default to ECR.
+      container_version_strategy: The container version strategy. Default to the latest release of OpenLLM.
+      serialisation_format: Serialisation for saving models. Default to 'safetensors', which is equivalent to `safe_serialization=True`
+      additional_args: Additional arguments to pass to ``openllm build``.
+      bento_store: Optional BentoStore for saving this BentoLLM. Default to the default BentoML local store.
+
+  Returns:
+      ``bentoml.Bento``: The built BentoLLM, looked up in ``bento_store`` by the ``__tag__`` printed by ``openllm build``.
+
+  Raises:
+      OpenLLMException: If mutually exclusive options are combined, or if the ``openllm build`` subprocess exits with a non-zero status.
+      ValueError: If no ``__tag__:<name>:<version>`` line can be parsed from the subprocess output.
+  """
+  args: list[str] = [sys.executable, "-m", "openllm", "build", model_name, "--machine", "--runtime", runtime, "--serialisation", serialisation_format,]
+  if quantize and bettertransformer: raise OpenLLMException("'quantize' and 'bettertransformer' are currently mutually exclusive.")
+  if quantize: args.extend(["--quantize", quantize])
+  if bettertransformer: args.append("--bettertransformer")
+  if containerize and push: raise OpenLLMException("'containerize' and 'push' are currently mutually exclusive.")
+  if push: args.extend(["--push"])
+  if containerize: args.extend(["--containerize"])
+  if model_id: args.extend(["--model-id", model_id])
+  if build_ctx: args.extend(["--build-ctx", build_ctx])
+  if enable_features: args.extend([f"--enable-features={f}" for f in enable_features])
+  if workers_per_resource: args.extend(["--workers-per-resource", str(workers_per_resource)])
+  if overwrite: args.append("--overwrite")
+  if adapter_map: args.extend([f"--adapter-id={k}{':'+v if v is not None else ''}" for k, v in adapter_map.items()])
+  if model_version: args.extend(["--model-version", model_version])
+  if dockerfile_template: args.extend(["--dockerfile-template", dockerfile_template])
+  if container_registry is None: container_registry = "ecr"
+  if container_version_strategy is None: container_version_strategy = "release"
+  args.extend(["--container-registry", container_registry, "--container-version-strategy", container_version_strategy])
+  if additional_args: args.extend(additional_args)
+  # NOTE(review): ``check_output`` is called without capturing stderr, so ``e.stderr`` below looks like it is always None and the first ``raise`` never fires - confirm.
+  try:
+    output = subprocess.check_output(args, env=os.environ.copy(), cwd=build_ctx or os.getcwd())
+  except subprocess.CalledProcessError as e:
+    logger.error("Exception caught while building %s", model_name, exc_info=e)
+    if e.stderr: raise OpenLLMException(e.stderr.decode("utf-8")) from None
+    raise OpenLLMException(str(e)) from None
+  matched = re.match(r"__tag__:([^:\n]+:[^:\n]+)$", output.decode("utf-8").strip())
+  if matched is None: raise ValueError(f"Failed to find tag from output: {output.decode('utf-8').strip()}\nNote: Output from 'openllm build' might not be correct. Please open an issue on GitHub.")
+  return bentoml.get(matched.group(1), _bento_store=bento_store)
+
+def _import_model(model_name: str, /, *, model_id: str | None = None, model_version: str | None = None, runtime: t.Literal["ggml", "transformers"] = "transformers", implementation: LiteralRuntime = "pt", quantize: t.Literal["int8", "int4", "gptq"] | None = None, serialisation_format: t.Literal["legacy", "safetensors"] = "safetensors", additional_args: t.Sequence[str] | None = None) -> bentoml.Model:
+  """Import a LLM into the local store.
+
+  > [!NOTE]
+  > When ``quantize`` is given, the weights are stored already quantized. Only do this if you
+  > want quantized weights by default; OpenLLM can also quantize on demand
+  > during initial startup.
+
+  This function builds an argument list and hands it to the ``import_command`` click command, so it behaves like the ``openllm import`` CLI.
+
+  > [!NOTE]
+  > The arguments are passed in this order: fixed options, ``model_id``, ``--model-version``, ``additional_args``, ``--quantize``.
+
+  Args:
+      model_name: Name of the model to import.
+      model_id: Optional model id, passed as the second positional argument.
+      model_version: Optional model version, passed as ``--model-version``.
+      runtime: Runtime for this LLM, passed as ``--runtime``. Defaults to ``transformers``.
+      implementation: Implementation for this LLM, passed as ``--implementation``. Defaults to ``pt``.
+      quantize: Quantize the model weights, passed as ``--quantize``.
+                Possible quantisation strategies:
+                - int8: Quantize the model with 8bit (bitsandbytes required)
+                - int4: Quantize the model with 4bit (bitsandbytes required)
+                - gptq: Quantize the model with GPTQ (auto-gptq required)
+      serialisation_format: Format used to save the model to the local store, passed as ``--serialisation``.
+                            Either 'safetensors' (the default) or 'legacy'.
+      additional_args: Extra arguments appended verbatim to the ``openllm import`` argument list.
+
+  Returns:
+      ``bentoml.Model``: The value returned by ``import_command`` when run with ``standalone_mode=False``.
+  """
+  from .entrypoint import import_command
+  argv = [model_name, "--runtime", runtime, "--implementation", implementation, "--machine", "--serialisation", serialisation_format]
+  if model_id is not None: argv += [model_id]
+  if model_version is not None: argv += ["--model-version", str(model_version)]
+  if additional_args is not None: argv.extend(additional_args)
+  if quantize is not None: argv += ["--quantize", quantize]
+  return import_command.main(args=argv, standalone_mode=False)
+
+def _list_models() -> dict[str, t.Any]:
+  """Run ``models_command`` with ``-o json --show-available --machine`` and return its result."""
+  from .entrypoint import models_command as _models_cmd
+  return _models_cmd.main(standalone_mode=False, args=["-o", "json", "--show-available", "--machine"])
+
+# Public SDK entry points: each private implementation above is passed through ``openllm.utils.codegen.gen_sdk``; ``start`` and ``start_grpc`` share ``_start`` and differ only in ``_serve_grpc``.
+start, start_grpc, build, import_model, list_models = openllm.utils.codegen.gen_sdk(_start, _serve_grpc=False), openllm.utils.codegen.gen_sdk(_start, _serve_grpc=True), openllm.utils.codegen.gen_sdk(_build), openllm.utils.codegen.gen_sdk(_import_model), openllm.utils.codegen.gen_sdk(_list_models)
+__all__ = ["start", "start_grpc", "build", "import_model", "list_models"]
--- a/openllm-python/src/openllm/cli/entrypoint.py
+++ b/openllm-python/src/openllm/cli/entrypoint.py
@@ -0,0 +1,756 @@
+"""OpenLLM CLI interface.
+
+The same commands can also be called from Python through the SDK:
+
+Start any LLM:
+
+```python
+openllm.start("falcon", model_id='tiiuae/falcon-7b-instruct')
+```
+
+Build a BentoLLM
+
+```python
+bento = openllm.build("falcon")
+```
+
+Import any LLM into local store
+```python
+bentomodel = openllm.import_model("falcon", model_id='tiiuae/falcon-7b-instruct')
+```
+"""
+from __future__ import annotations
+import functools, http.client, inspect, itertools, logging, os, platform, re, subprocess, sys, time, traceback, typing as t
+import attr, click, click_option_group as cog, fs, fs.copy, fs.errors, inflection, orjson, bentoml, openllm
+from bentoml_cli.utils import BentoMLCommandGroup, opt_callback
+from simple_di import Provide, inject
+from bentoml._internal.configuration.containers import BentoMLContainer
+from bentoml._internal.models.model import ModelStore
+from . import termui
+from ._factory import (
+  FC,
+  LiteralOutput,
+  _AnyCallable,
+  bettertransformer_option,
+  container_registry_option,
+  fast_option,
+  machine_option,
+  model_id_option,
+  model_name_argument,
+  model_version_option,
+  output_option,
+  parse_device_callback,
+  quantize_option,
+  serialisation_option,
+  start_command_factory,
+  workers_per_resource_option,
+)
+from openllm import bundle, serialisation
+from openllm.exceptions import OpenLLMException
+from openllm.models.auto import (
+  CONFIG_MAPPING,
+  MODEL_FLAX_MAPPING_NAMES,
+  MODEL_MAPPING_NAMES,
+  MODEL_TF_MAPPING_NAMES,
+  MODEL_VLLM_MAPPING_NAMES,
+  AutoConfig,
+  AutoLLM,
+)
+from openllm._typing_compat import DictStrAny, ParamSpec, Concatenate, LiteralString, Self, LiteralRuntime
+from openllm.utils import (
+  DEBUG,
+  DEBUG_ENV_VAR,
+  OPTIONAL_DEPENDENCIES,
+  QUIET_ENV_VAR,
+  EnvVarMixin,
+  LazyLoader,
+  analytics,
+  bentoml_cattr,
+  compose,
+  configure_logging,
+  dantic,
+  first_not_none,
+  get_debug_mode,
+  get_quiet_mode,
+  infer_auto_class,
+  is_torch_available,
+  is_transformers_supports_agent,
+  resolve_user_filepath,
+  set_debug_mode,
+  set_quiet_mode,
+)
+
+if t.TYPE_CHECKING:
+  import torch
+  from bentoml._internal.bento import BentoStore
+  from bentoml._internal.container import DefaultBuilder
+  from openllm.client import BaseClient
+  from openllm._schema import EmbeddingsOutput
+  from openllm.bundle.oci import LiteralContainerRegistry, LiteralContainerVersionStrategy
+else: torch = LazyLoader("torch", globals(), "torch")
+
+P = ParamSpec("P")
+logger = logging.getLogger(__name__)
+OPENLLM_FIGLET = """\
+ ██████╗ ██████╗ ███████╗███╗   ██╗██╗     ██╗     ███╗   ███╗
+██╔═══██╗██╔══██╗██╔════╝████╗  ██║██║     ██║     ████╗ ████║
+██║   ██║██████╔╝█████╗  ██╔██╗ ██║██║     ██║     ██╔████╔██║
+██║   ██║██╔═══╝ ██╔══╝  ██║╚██╗██║██║     ██║     ██║╚██╔╝██║
+╚██████╔╝██║     ███████╗██║ ╚████║███████╗███████╗██║ ╚═╝ ██║
+ ╚═════╝ ╚═╝     ╚══════╝╚═╝  ╚═══╝╚══════╝╚══════╝╚═╝     ╚═╝
+"""
+
+ServeCommand = t.Literal["serve", "serve-grpc"]
+
+@attr.define
+class GlobalOptions:  # global CLI state; ``common_params`` stores an instance on ``ctx.obj``
+  cloud_context: str | None = attr.field(default=None)  # value of ``--context`` / ``BENTOCLOUD_CONTEXT``
+  def with_options(self, **attrs: t.Any) -> Self: return attr.evolve(self, **attrs)  # returns a copy with ``attrs`` applied
+
+GrpType = t.TypeVar("GrpType", bound=click.Group)
+# Direct handle on ``object.__setattr__``; used below to overwrite ``__click_params__`` on a wrapped callback.
+_object_setattr = object.__setattr__
+# Absolute path of the ``extension`` directory next to this module; ``Extensions.list_commands`` scans it.
+_EXT_FOLDER = os.path.abspath(os.path.join(os.path.dirname(__file__), "extension"))
+
+class Extensions(click.MultiCommand):  # one subcommand per ``.py`` module found in ``_EXT_FOLDER``
+  def list_commands(self, ctx: click.Context) -> list[str]: return sorted(entry[:-len(".py")] for entry in os.listdir(_EXT_FOLDER) if not entry.startswith("__") and entry.endswith(".py"))
+  def get_command(self, ctx: click.Context, cmd_name: str) -> click.Command | None:
+    try: module = __import__(f"openllm.cli.extension.{cmd_name}", None, None, ["cli"])
+    except ImportError: return None  # the extension module could not be imported
+    else: return module.cli
+
+class OpenLLMCommandGroup(BentoMLCommandGroup):
+  NUMBER_OF_COMMON_PARAMS = 5  # parameters in common_params + 1 faked group option header
+
+  @staticmethod
+  def common_params(f: t.Callable[P, t.Any]) -> t.Callable[[FC], FC]:
+    # Attach the shared "Global options" group to ``f`` (same approach as BentoMLCommandGroup). ``--do-not-track`` is declared here but is not a parameter of ``wrapper``; ``usage_tracking`` takes it as its first argument.
+    @cog.optgroup.group(name="Global options", help="Shared globals options for all OpenLLM CLI.")
+    @cog.optgroup.option("-q", "--quiet", envvar=QUIET_ENV_VAR, is_flag=True, default=False, help="Suppress all output.", show_envvar=True)
+    @cog.optgroup.option("--debug", "--verbose", "debug", envvar=DEBUG_ENV_VAR, is_flag=True, default=False, help="Print out debug logs.", show_envvar=True)
+    @cog.optgroup.option("--do-not-track", is_flag=True, default=False, envvar=analytics.OPENLLM_DO_NOT_TRACK, help="Do not send usage info", show_envvar=True)
+    @cog.optgroup.option("--context", "cloud_context", envvar="BENTOCLOUD_CONTEXT", type=click.STRING, default=None, help="BentoCloud context name.", show_envvar=True)
+    @click.pass_context
+    @functools.wraps(f)
+    def wrapper(ctx: click.Context, quiet: bool, debug: bool, cloud_context: str | None, *args: P.args, **attrs: P.kwargs) -> t.Any:
+      ctx.obj = GlobalOptions(cloud_context=cloud_context)  # make the BentoCloud context available through click's context object
+      if quiet:  # quiet takes precedence over debug
+        set_quiet_mode(True)
+        if debug: logger.warning("'--quiet' passed; ignoring '--verbose/--debug'")
+      elif debug: set_debug_mode(True)
+      configure_logging()
+      return f(*args, **attrs)  # the global options are consumed here and not passed on to ``f``
+    return wrapper
+
+  @staticmethod
+  def usage_tracking(func: t.Callable[P, t.Any], group: click.Group, **attrs: t.Any) -> t.Callable[Concatenate[bool, P], t.Any]:
+    """Wrap ``func`` so each call is reported as an ``OpenllmCliEvent`` (duration, error type, return code) unless ``do_not_track`` is set; failures are re-raised after tracking."""
+    command_name = attrs.get("name", func.__name__)
+
+    @functools.wraps(func)
+    def wrapper(do_not_track: bool, *args: P.args, **attrs: P.kwargs) -> t.Any:
+      if do_not_track:
+        with analytics.set_bentoml_tracking(): return func(*args, **attrs)
+      start_time = time.time_ns()
+      with analytics.set_bentoml_tracking():
+        if group.name is None: raise ValueError("group.name should not be None")
+        event = analytics.OpenllmCliEvent(cmd_group=group.name, cmd_name=command_name)
+        try:
+          return_value = func(*args, **attrs)
+          event.duration_in_ms = (time.time_ns() - start_time) / 1e6
+          analytics.track(event)
+          return return_value
+        # KeyboardInterrupt derives from BaseException, not Exception, so it must be listed explicitly or the Ctrl-C return code below can never be recorded.
+        except (Exception, KeyboardInterrupt) as e:
+          event.duration_in_ms = (time.time_ns() - start_time) / 1e6
+          event.error_type = type(e).__name__
+          event.return_code = 2 if isinstance(e, KeyboardInterrupt) else 1
+          analytics.track(event)
+          raise
+    return t.cast(t.Callable[Concatenate[bool, P], t.Any], wrapper)
+
+  @staticmethod
+  def exception_handling(func: t.Callable[P, t.Any], group: click.Group, **attrs: t.Any) -> t.Callable[P, t.Any]:
+    command_name = attrs.get("name", func.__name__)  # label used in the error message below
+    @functools.wraps(func)
+    def wrapper(*args: P.args, **attrs: P.kwargs) -> t.Any:
+      try: return func(*args, **attrs)
+      except KeyboardInterrupt: return None  # Ctrl-C is swallowed; the wrapper then returns None
+      except OpenLLMException as exc: raise click.ClickException(click.style(f"[{group.name}] '{command_name}' failed: " + exc.message, fg="red")) from exc
+    return wrapper
+
+  def get_command(self, ctx: click.Context, cmd_name: str) -> click.Command | None:
+    """Resolve ``cmd_name``: extensions first, then (after alias resolution) the per-model start commands, otherwise the parent lookup."""
+    ext_group = t.cast("Extensions", extension_command)
+    if cmd_name in ext_group.list_commands(ctx): return ext_group.get_command(ctx, cmd_name)
+    cmd_name = self.resolve_alias(cmd_name)
+    # Only groups named in ``_start_mapping`` map names to per-model commands; everything else uses the parent lookup.
+    if ctx.command.name not in _start_mapping: return super().get_command(ctx, cmd_name)
+    try: return _start_mapping[ctx.command.name][cmd_name]
+    except KeyError:
+      # TODO: support start from a bento
+      try: bentoml.get(cmd_name)
+      except bentoml.exceptions.NotFound: raise click.BadArgumentUsage(f"{cmd_name} is not a valid model identifier supported by OpenLLM.") from None
+      # The name matches a stored Bento, which cannot be started yet.
+      raise click.ClickException(f"'openllm start {cmd_name}' is currently disabled for the time being. Please let us know if you need this feature by opening an issue on GitHub.")
+
+  def list_commands(self, ctx: click.Context) -> list[str]:
+    if ctx.command.name not in {"start", "start-grpc"}: return super().list_commands(ctx) + t.cast("Extensions", extension_command).list_commands(ctx)
+    return [*CONFIG_MAPPING.keys()]  # the start groups list every key of CONFIG_MAPPING
+
+  def command(self, *args: t.Any, **kwargs: t.Any) -> t.Callable[[t.Callable[..., t.Any]], click.Command]:  # type: ignore[override] # XXX: fix decorator on BentoMLCommandGroup
+    """Override the default 'cli.command' to support aliases for a given command and to wrap the implementation with the common parameters."""
+    if "context_settings" not in kwargs: kwargs["context_settings"] = {}
+    if "max_content_width" not in kwargs["context_settings"]: kwargs["context_settings"]["max_content_width"] = 120
+    aliases = kwargs.pop("aliases", None)
+    # NOTE: when the caller supplies its own ``context_settings`` dict, the assignment above mutates that dict in place.
+    def decorator(f: _AnyCallable) -> click.Command:
+      name = f.__name__.lower()
+      if name.endswith("_command"): name = name[:-8]
+      name = name.replace("_", "-")
+      kwargs.setdefault("help", inspect.getdoc(f))
+      kwargs.setdefault("name", name)
+      wrapped = self.exception_handling(self.usage_tracking(self.common_params(f), self, **kwargs), self, **kwargs)
+      # Wrapper order, outermost first: exception_handling -> usage_tracking -> common_params -> f.
+      # rotate the last NUMBER_OF_COMMON_PARAMS entries of ``__click_params__`` to the front (presumably so the common options end up last once click builds the command - confirm)
+      _memo = getattr(wrapped, "__click_params__", None)
+      if _memo is None: raise RuntimeError("Click command not register correctly.")
+      _object_setattr(wrapped, "__click_params__", _memo[-self.NUMBER_OF_COMMON_PARAMS:] + _memo[:-self.NUMBER_OF_COMMON_PARAMS])
+      # NOTE: we need to call super of super to avoid conflict with BentoMLCommandGroup command setup
+      cmd = super(BentoMLCommandGroup, self).command(*args, **kwargs)(wrapped)
+      # NOTE: add aliases to a given commands if it is specified.
+      if aliases is not None:
+        if not cmd.name: raise ValueError("name is required when aliases are available.")
+        self._commands[cmd.name] = aliases
+        self._aliases.update({alias: cmd.name for alias in aliases})
+      return cmd
+
+    return decorator
+
+  def format_commands(self, ctx: click.Context, formatter: click.HelpFormatter) -> None:
+    """Write the subcommand listing in two sections: "Commands" first, then "Extensions".
+
+    Subcommands that cannot be resolved or are hidden are left out. A subcommand counts as
+    an extension when its name is reported by the extension group; extension names are
+    displayed dasherized.
+    """
+    from gettext import gettext as _
+    extension_names: list[str] = t.cast("Extensions", extension_command).list_commands(ctx)
+    builtin: list[tuple[str, click.Command]] = []
+    external: list[tuple[str, click.Command]] = []
+    for name in self.list_commands(ctx):
+      resolved = self.get_command(ctx, name)
+      if resolved is None or resolved.hidden: continue
+      bucket = external if name in extension_names else builtin
+      bucket.append((name, resolved))
+    if builtin:
+      # allow for 3 times the default spacing
+      width = formatter.width - 6 - max(len(entry[0]) for entry in builtin)
+      rows: list[tuple[str, str]] = [(name, resolved.get_short_help_str(width)) for name, resolved in builtin]
+      if rows:
+        with formatter.section(_("Commands")): formatter.write_dl(rows)
+    if external:
+      # same spacing rule, measured on the raw (not yet dasherized) names
+      width = formatter.width - 6 - max(len(entry[0]) for entry in external)
+      rows = [(inflection.dasherize(name), resolved.get_short_help_str(width)) for name, resolved in external]
+      if rows:
+        with formatter.section(_("Extensions")): formatter.write_dl(rows)
+
+@click.group(cls=OpenLLMCommandGroup, context_settings=termui.CONTEXT_SETTINGS, name="openllm")
+@click.version_option(None, "--version", "-v", message=f"%(prog)s, %(version)s (compiled: {'yes' if openllm.COMPILED else 'no'})\nPython ({platform.python_implementation()}) {platform.python_version()}")
+def cli() -> None:  # root ``openllm`` group; the body is only the docstring below, so edit that text with care
+  """\b
+   ██████╗ ██████╗ ███████╗███╗   ██╗██╗     ██╗     ███╗   ███╗
+  ██╔═══██╗██╔══██╗██╔════╝████╗  ██║██║     ██║     ████╗ ████║
+  ██║   ██║██████╔╝█████╗  ██╔██╗ ██║██║     ██║     ██╔████╔██║
+  ██║   ██║██╔═══╝ ██╔══╝  ██║╚██╗██║██║     ██║     ██║╚██╔╝██║
+  ╚██████╔╝██║     ███████╗██║ ╚████║███████╗███████╗██║ ╚═╝ ██║
+   ╚═════╝ ╚═╝     ╚══════╝╚═╝  ╚═══╝╚══════╝╚══════╝╚═╝     ╚═╝.
+
+  \b
+  An open platform for operating large language models in production.
+  Fine-tune, serve, deploy, and monitor any LLMs with ease.
+  """  # noqa: D205
+
+@cli.group(cls=OpenLLMCommandGroup, context_settings=termui.CONTEXT_SETTINGS, name="start", aliases=["start-http"])
+def start_command() -> None:  # body is intentionally empty: per-model subcommands are looked up in ``_start_mapping`` by ``OpenLLMCommandGroup.get_command``
+  """Start any LLM as a REST server.
+
+  \b
+  ```bash
+  $ openllm <start|start-http> <model_name> --<options> ...
+  ```
+  """
+
+@cli.group(cls=OpenLLMCommandGroup, context_settings=termui.CONTEXT_SETTINGS, name="start-grpc")
+def start_grpc_command() -> None:  # body is intentionally empty: per-model subcommands are looked up in ``_start_mapping`` by ``OpenLLMCommandGroup.get_command``
+  """Start any LLM as a gRPC server.
+
+  \b
+  ```bash
+  $ openllm start-grpc <model_name> --<options> ...
+  ```
+  """
+
+_start_mapping = {"start": {key: start_command_factory(start_command, key, _context_settings=termui.CONTEXT_SETTINGS) for key in CONFIG_MAPPING}, "start-grpc": {key: start_command_factory(start_grpc_command, key, _context_settings=termui.CONTEXT_SETTINGS, _serve_grpc=True) for key in CONFIG_MAPPING}}  # group name -> {model name -> generated start command}; built eagerly at import time for every key in CONFIG_MAPPING
+
+@cli.command(name="import", aliases=["download"])
+@model_name_argument
+@click.argument("model_id", type=click.STRING, default=None, metavar="Optional[REMOTE_REPO/MODEL_ID | /path/to/local/model]", required=False)
+@click.argument("converter", envvar="CONVERTER", type=click.STRING, default=None, required=False, metavar=None)
+@model_version_option
+@click.option("--runtime", type=click.Choice(["ggml", "transformers"]), default="transformers", help="The runtime to use for the given model. Default is transformers.")
+@output_option
+@quantize_option
+@machine_option
+@click.option("--implementation", type=click.Choice(["pt", "tf", "flax", "vllm"]), default=None, help="The implementation for saving this LLM.")
+@serialisation_option
+def import_command(model_name: str, model_id: str | None, converter: str | None, model_version: str | None, output: LiteralOutput, runtime: t.Literal["ggml", "transformers"], machine: bool, implementation: LiteralRuntime | None, quantize: t.Literal["int8", "int4", "gptq"] | None, serialisation_format: t.Literal["safetensors", "legacy"],) -> bentoml.Model:
+  """Setup LLM interactively.
+
+  It accepts two positional arguments: `model_name` and `model_id`. The first name determine
+  the model type to download, and the second one is the optional model id to download.
+
+  \b
+  This `model_id` can be either pretrained model id that you can get from HuggingFace Hub, or
+  a custom model path from your custom pretrained model. Note that the custom model path should
+  contain all files required to construct `transformers.PretrainedConfig`, `transformers.PreTrainedModel`
+  and `transformers.PreTrainedTokenizer` objects.
+
+  \b
+  Note: This is useful for development and setup for fine-tune.
+  This will be automatically called when `ensure_available=True` in `openllm.LLM.for_model`
+
+  \b
+  ``--model-version`` is an optional option to save the model. Note that
+  this is recommended when the model_id is a custom path. Usually, if you are only using pretrained
+  model from HuggingFace Hub, you don't need to specify this. If this is not specified, we will calculate
+  the hash from the last modified time from this custom path
+
+  \b
+  ```bash
+  $ openllm download opt facebook/opt-2.7b
+  ```
+
+  \b
+  > If ``quantize`` is passed, the model weights will be saved as quantized weights. You should
+  > only use this option if you want the weight to be quantized by default. Note that OpenLLM also
+  > support on-demand quantisation during initial startup.
+
+  \b
+  ## Conversion strategies [EXPERIMENTAL]
+
+  \b
+  Some models will include built-in conversion strategies for specific weights format.
+  It will be determined via the `CONVERTER` environment variable. Note that this envvar should only be use provisionally as it is not RECOMMENDED to export this
+  and save to a ``.env`` file.
+
+  The conversion strategies will have the following format and will be determined per architecture implementation:
+  <base_format>-<target_format>
+
+  \b
+  For example: the below convert LlaMA-2 model format to hf:
+
+  \b
+  ```bash
+  $ CONVERTER=llama2-hf openllm import llama /path/to/llama-2
+  ```
+
+  > [!WARNING] This behaviour will override ``--runtime``. Therefore make sure that the LLM contains correct conversion strategies to both GGML and HF.
+  """
+  llm_config = AutoConfig.for_model(model_name)
+  env = EnvVarMixin(model_name, llm_config.default_implementation(), model_id=model_id, runtime=runtime, quantize=quantize)
+  impl: LiteralRuntime = first_not_none(implementation, default=env["framework_value"])
+  llm = infer_auto_class(impl).for_model(model_name, model_id=env["model_id_value"], llm_config=llm_config, model_version=model_version, ensure_available=False, serialisation=serialisation_format)
+  _previously_saved = False
+  try:
+    _ref = serialisation.get(llm)
+    _previously_saved = True
+  except bentoml.exceptions.NotFound:
+    if not machine and output == "pretty":
+      msg = f"'{model_name}' {'with model_id='+ model_id if model_id is not None else ''} does not exists in local store for implementation {llm.__llm_implementation__}. Saving to BENTOML_HOME{' (path=' + os.environ.get('BENTOML_HOME', BentoMLContainer.bentoml_home.get()) + ')' if get_debug_mode() else ''}..."
+      termui.echo(msg, fg="yellow", nl=True)
+    _ref = serialisation.get(llm, auto_import=True)
+    if impl == "pt" and is_torch_available() and torch.cuda.is_available(): torch.cuda.empty_cache()
+  if machine: return _ref
+  elif output == "pretty":
+    if _previously_saved: termui.echo(f"{model_name} with 'model_id={model_id}' is already setup for framework '{impl}': {_ref.tag!s}", nl=True, fg="yellow")
+    else: termui.echo(f"Saved model: {_ref.tag}")
+  elif output == "json": termui.echo(orjson.dumps({"previously_setup": _previously_saved, "framework": impl, "tag": str(_ref.tag)}, option=orjson.OPT_INDENT_2).decode())
+  else: termui.echo(_ref.tag)
+  return _ref
+
+@cli.command(context_settings={"token_normalize_func": inflection.underscore})
+@model_name_argument
+@model_id_option
+@output_option
+@machine_option
+@click.option("--bento-version", type=str, default=None, help="Optional bento version for this BentoLLM. Default is the model revision.")
+@click.option("--overwrite", is_flag=True, help="Overwrite existing Bento for given LLM if it already exists.")
+@workers_per_resource_option(factory=click, build=True)
+@click.option("--device", type=dantic.CUDA, multiple=True, envvar="CUDA_VISIBLE_DEVICES", callback=parse_device_callback, help="Set the device", show_envvar=True)
+@cog.optgroup.group(cls=cog.MutuallyExclusiveOptionGroup, name="Optimisation options")
+@quantize_option(factory=cog.optgroup, build=True)
+@bettertransformer_option(factory=cog.optgroup)
+@click.option("--runtime", type=click.Choice(["ggml", "transformers"]), default="transformers", help="The runtime to use for the given model. Default is transformers.")
+@click.option("--enable-features", multiple=True, nargs=1, metavar="FEATURE[,FEATURE]", help="Enable additional features for building this LLM Bento. Available: {}".format(", ".join(OPTIONAL_DEPENDENCIES)))
+@click.option("--adapter-id", default=None, multiple=True, metavar="[PATH | [remote/][adapter_name:]adapter_id][, ...]", help="Optional adapters id to be included within the Bento. Note that if you are using relative path, '--build-ctx' must be passed.")
+@click.option("--build-ctx", help="Build context. This is required if --adapter-id uses relative path", default=None)
+@model_version_option
+@click.option("--dockerfile-template", default=None, type=click.File(), help="Optional custom dockerfile template to be used with this BentoLLM.")
+@serialisation_option
+@container_registry_option
+@click.option("--container-version-strategy", type=click.Choice(["release", "latest", "nightly"]), default="release", help="Default container version strategy for the image from '--container-registry'")
+@fast_option
+@cog.optgroup.group(cls=cog.MutuallyExclusiveOptionGroup, name="Utilities options")
+@cog.optgroup.option("--containerize", default=False, is_flag=True, type=click.BOOL, help="Whether to containerize the Bento after building. '--containerize' is the shortcut of 'openllm build && bentoml containerize'.")
+@cog.optgroup.option("--push", default=False, is_flag=True, type=click.BOOL, help="Whether to push the result bento to BentoCloud. Make sure to login with 'bentoml cloud login' first.")
+@click.option("--force-push", default=False, is_flag=True, type=click.BOOL, help="Whether to force push.")
+@click.pass_context
+def build_command(
+    ctx: click.Context, /, model_name: str, model_id: str | None, bento_version: str | None, overwrite: bool, output: LiteralOutput, runtime: t.Literal["ggml", "transformers"], quantize: t.Literal["int8", "int4", "gptq"] | None, enable_features: tuple[str, ...] | None, bettertransformer: bool | None, workers_per_resource: float | None, adapter_id: tuple[str, ...],
+    build_ctx: str | None, machine: bool, device: tuple[str, ...], model_version: str | None, dockerfile_template: t.TextIO | None, containerize: bool, push: bool, serialisation_format: t.Literal["safetensors", "legacy"], fast: bool, container_registry: LiteralContainerRegistry, container_version_strategy: LiteralContainerVersionStrategy, force_push: bool, **attrs: t.Any,
+) -> bentoml.Bento:
+  """Package a given model into a Bento.
+
+  \b
+  ```bash
+  $ openllm build flan-t5 --model-id google/flan-t5-large
+  ```
+
+  \b
+  > [!NOTE]
+  > To run a container built from this Bento with GPU support, make sure
+  > to have https://github.com/NVIDIA/nvidia-container-toolkit installed locally.
+
+  \b
+  > [!IMPORTANT]
+  > To build the bento with compiled OpenLLM, make sure to prepend HATCH_BUILD_HOOKS_ENABLE=1. Make sure that the deployment
+  > target also use the same Python version and architecture as build machine.
+  """
+  # '--machine' forces the plain (porcelain) output mode regardless of '-o/--output'.
+  if machine: output = "porcelain"
+  # Every '--enable-features' occurrence may itself be a comma-separated list; flatten them into a single tuple.
+  if enable_features: enable_features = tuple(itertools.chain.from_iterable((s.split(",") for s in enable_features)))
+
+  _previously_built = False
+
+  llm_config = AutoConfig.for_model(model_name)
+  env = EnvVarMixin(model_name, llm_config.default_implementation(), model_id=model_id, quantize=quantize, bettertransformer=bettertransformer, runtime=runtime)
+
+  # NOTE: We set this environment variable so that our service.py logic won't raise RuntimeError
+  # during build. This is a current limitation of bentoml build where we actually import the service.py into sys.path
+  try:
+    os.environ.update({"OPENLLM_MODEL": inflection.underscore(model_name), env.runtime: str(env["runtime_value"]), "OPENLLM_SERIALIZATION": serialisation_format})
+    if env["model_id_value"]: os.environ[env.model_id] = str(env["model_id_value"])
+    if env["quantize_value"]: os.environ[env.quantize] = str(env["quantize_value"])
+    os.environ[env.bettertransformer] = str(env["bettertransformer_value"])
+
+    # With '--fast' the model is not ensured to be available locally before building.
+    llm = infer_auto_class(env["framework_value"]).for_model(model_name, model_id=env["model_id_value"], llm_config=llm_config, ensure_available=not fast, model_version=model_version, serialisation=serialisation_format, **attrs)
+
+    labels = dict(llm.identifying_params)
+    labels.update({"_type": llm.llm_type, "_framework": env["framework_value"]})
+    workers_per_resource = first_not_none(workers_per_resource, default=llm_config["workers_per_resource"])
+
+    # Everything that goes into the Bento is staged in a temporary filesystem first.
+    with fs.open_fs(f"temp://llm_{llm_config['model_name']}") as llm_fs:
+      dockerfile_template_path = None
+      if dockerfile_template:
+        # Copy the user-provided template into the staging filesystem, closing the input handle afterwards.
+        with dockerfile_template:
+          llm_fs.writetext("Dockerfile.template", dockerfile_template.read())
+        dockerfile_template_path = llm_fs.getsyspath("/Dockerfile.template")
+
+      adapter_map: dict[str, str | None] | None = None
+      if adapter_id:
+        if not build_ctx: ctx.fail("'build_ctx' is required when '--adapter-id' is passed.")
+        adapter_map = {}
+        for v in adapter_id:
+          # Each value is '<adapter_id>[:<adapter_name>]'; split on the last ':' only.
+          _adapter_id, *adapter_name = v.rsplit(":", maxsplit=1)
+          name = adapter_name[0] if len(adapter_name) > 0 else None
+          try:
+            resolve_user_filepath(_adapter_id, build_ctx)
+            src_folder_name = os.path.basename(_adapter_id)
+            # Open the build context only for the duration of the copy so the handle is always released.
+            with fs.open_fs(build_ctx) as src_fs:
+              llm_fs.makedir(src_folder_name, recreate=True)
+              fs.copy.copy_dir(src_fs, _adapter_id, llm_fs, src_folder_name)
+            adapter_map[src_folder_name] = name
+          # this is the remote adapter, then just added back
+          # note that there is a drawback here. If the path of the local adapter
+          # path have the same name as the remote, then we currently don't support
+          # that edge case.
+          except FileNotFoundError:
+            adapter_map[_adapter_id] = name
+        os.environ["OPENLLM_ADAPTER_MAP"] = orjson.dumps(adapter_map).decode()
+
+      # The Bento version defaults to the version of the model tag.
+      _bento_version = first_not_none(bento_version, default=llm.tag.version)
+      bento_tag = bentoml.Tag.from_taglike(f"{llm.llm_type}-service:{_bento_version}".lower().strip())
+      try:
+        bento = bentoml.get(bento_tag)
+        if overwrite:
+          if output == "pretty": termui.echo(f"Overwriting existing Bento {bento_tag}", fg="yellow")
+          bentoml.delete(bento_tag)
+          # Deliberately jump into the 'NotFound' handler below so the Bento gets rebuilt.
+          raise bentoml.exceptions.NotFound(f"Rebuilding existing Bento {bento_tag}") from None
+        _previously_built = True
+      except bentoml.exceptions.NotFound:
+        bento = bundle.create_bento(
+            bento_tag, llm_fs, llm, workers_per_resource=workers_per_resource, adapter_map=adapter_map,
+            quantize=quantize, bettertransformer=bettertransformer, extra_dependencies=enable_features, dockerfile_template=dockerfile_template_path, runtime=runtime,
+            container_registry=container_registry, container_version_strategy=container_version_strategy
+        )
+  # Re-raise the same exception with its chained context suppressed.
+  except Exception as err: raise err from None
+
+  if machine: termui.echo(f"__tag__:{bento.tag}", fg="white")
+  elif output == "pretty":
+    # NOTE(review): '--push' and '--containerize' are declared in a mutually exclusive option group, so
+    # 'not push or not containerize' holds whenever that exclusivity is enforced - confirm whether 'and' was intended.
+    if not get_quiet_mode() and (not push or not containerize):
+      termui.echo("\n" + OPENLLM_FIGLET, fg="white")
+      if not _previously_built: termui.echo(f"Successfully built {bento}.", fg="green")
+      elif not overwrite: termui.echo(f"'{model_name}' already has a Bento built [{bento}]. To overwrite it pass '--overwrite'.", fg="yellow")
+      termui.echo("📖 Next steps:\n\n" + f"* Push to BentoCloud with 'bentoml push':\n\t$ bentoml push {bento.tag}\n\n" + f"* Containerize your Bento with 'bentoml containerize':\n\t$ bentoml containerize {bento.tag} --opt progress=plain\n\n" + "\tTip: To enable additional BentoML features for 'containerize', use '--enable-features=FEATURE[,FEATURE]' [see 'bentoml containerize -h' for more advanced usage]\n", fg="blue",)
+  elif output == "json": termui.echo(orjson.dumps(bento.info.to_dict(), option=orjson.OPT_INDENT_2).decode(), fg="white")
+  else: termui.echo(bento.tag)
+
+  if push: BentoMLContainer.bentocloud_client.get().push_bento(bento, context=t.cast(GlobalOptions, ctx.obj).cloud_context, force=force_push)
+  elif containerize:
+    # The containerize backend is taken from BENTOML_CONTAINERIZE_BACKEND and defaults to docker.
+    backend = t.cast("DefaultBuilder", os.environ.get("BENTOML_CONTAINERIZE_BACKEND", "docker"))
+    try: bentoml.container.health(backend)
+    except subprocess.CalledProcessError: raise OpenLLMException(f"Failed to use backend {backend}") from None
+    try: bentoml.container.build(bento.tag, backend=backend, features=("grpc", "io"))
+    except Exception as err: raise OpenLLMException(f"Exception caught while containerizing '{bento.tag!s}':\n{err}") from err
+  return bento
+
+@cli.command()
+@output_option
+@click.option("--show-available", is_flag=True, default=False, help="Show available models in local store (mutually exclusive with '-o porcelain').")
+@machine_option
+@click.pass_context
+def models_command(ctx: click.Context, output: LiteralOutput, show_available: bool, machine: bool) -> DictStrAny | None:
+  """List all supported models.
+
+  \b
+  > NOTE: '--show-available' and '-o porcelain' are mutually exclusive.
+
+  \b
+  ```bash
+  openllm models --show-available
+  ```
+  """
+  models = tuple(inflection.dasherize(key) for key in CONFIG_MAPPING.keys())
+  if output == "porcelain":
+    if show_available: raise click.BadOptionUsage("--show-available", "Cannot use '--show-available' with '-o porcelain' (mutually exclusive).")
+    # Porcelain output is just one model name per line.
+    termui.echo("\n".join(models), fg="white")
+  else:
+    # (model name, exception) pairs collected only when DEBUG is set.
+    failed_initialized: list[tuple[str, Exception]] = []
+
+    json_data: dict[str, dict[t.Literal["architecture", "model_id", "url", "installation", "cpu", "gpu", "runtime_impl"], t.Any] | t.Any] = {}
+    for m in models:
+      config = AutoConfig.for_model(m)
+      # Collect every implementation whose mapping knows this model.
+      runtime_impl: tuple[str, ...] = ()
+      if config["model_name"] in MODEL_MAPPING_NAMES: runtime_impl += ("pt",)
+      if config["model_name"] in MODEL_FLAX_MAPPING_NAMES: runtime_impl += ("flax",)
+      if config["model_name"] in MODEL_TF_MAPPING_NAMES: runtime_impl += ("tf",)
+      if config["model_name"] in MODEL_VLLM_MAPPING_NAMES: runtime_impl += ("vllm",)
+      json_data[m] = {"architecture": config["architecture"], "model_id": config["model_ids"], "cpu": not config["requires_gpu"], "gpu": True, "runtime_impl": runtime_impl, "installation": f'"openllm[{m}]"' if m in OPTIONAL_DEPENDENCIES or config["requirements"] else "openllm",}
+      if DEBUG:
+        try:
+          AutoLLM.for_model(m, llm_config=config)
+        except Exception as e:
+          failed_initialized.append((m, e))
+
+    # Query the local model store once, keeping only OpenLLM models that carry a 'model_name' label,
+    # then group them per supported model (models without any local entry are dropped).
+    stored = [i for i in bentoml.models.list() if "framework" in i.info.labels and i.info.labels["framework"] == "openllm" and "model_name" in i.info.labels]
+    ids_in_local_store = {k: [i for i in stored if i.info.labels["model_name"] == k] for k in json_data}
+    ids_in_local_store = {k: v for k, v in ids_in_local_store.items() if v}
+    local_models: DictStrAny | None = None
+    if show_available:
+      local_models = {k: [str(i.tag) for i in val] for k, val in ids_in_local_store.items()}
+
+    if machine:
+      # Machine mode returns the data to the caller instead of printing it.
+      if show_available: json_data["local"] = local_models
+      return json_data
+    elif output == "pretty":
+      import tabulate
+
+      tabulate.PRESERVE_WHITESPACE = True
+      # llm, architecture, url, model_id, installation, cpu, gpu, runtime_impl
+      data: list[str | tuple[str, str, list[str], str, LiteralString, LiteralString, tuple[LiteralRuntime, ...]]] = []
+      for m, v in json_data.items():
+        data.append((m, v["architecture"], v["model_id"], v["installation"], "❌" if not v["cpu"] else "✅", "✅", v["runtime_impl"],))
+      column_widths = [int(termui.COLUMNS / 12), int(termui.COLUMNS / 6), int(termui.COLUMNS / 4), int(termui.COLUMNS / 12), int(termui.COLUMNS / 12), int(termui.COLUMNS / 12), int(termui.COLUMNS / 4),]
+
+      # NOTE(review): 'data' receives one row per entry of 'json_data', so it is empty only when no
+      # model was iterated, in which case 'failed_initialized' is empty too - confirm this branch is still needed.
+      if not data and failed_initialized:
+        termui.echo("Exception found while parsing models:\n", fg="yellow")
+        for m, err in failed_initialized:
+          termui.echo(f"- {m}: ", fg="yellow", nl=False)
+          # format_exception returns the text (print_exception returns None), so the traceback itself is echoed.
+          termui.echo("".join(traceback.format_exception(None, err, err.__traceback__, limit=5)), fg="red")
+        sys.exit(1)
+
+      table = tabulate.tabulate(data, tablefmt="fancy_grid", headers=["LLM", "Architecture", "Models Id", "pip install", "CPU", "GPU", "Runtime"], maxcolwidths=column_widths)
+      termui.echo(table, fg="white")
+
+      if DEBUG and failed_initialized:
+        termui.echo("\nThe following models are supported but failed to initialize:\n")
+        for m, err in failed_initialized:
+          termui.echo(f"- {m}: ", fg="blue", nl=False)
+          termui.echo(err, fg="red")
+
+      if show_available:
+        if not ids_in_local_store:
+          termui.echo("No models available locally.")
+          ctx.exit(0)
+        termui.echo("The following are available in local store:", fg="magenta")
+        termui.echo(orjson.dumps(local_models, option=orjson.OPT_INDENT_2).decode(), fg="white")
+    else:
+      if show_available: json_data["local"] = local_models
+      termui.echo(orjson.dumps(json_data, option=orjson.OPT_INDENT_2,).decode(), fg="white")
+  ctx.exit(0)
+
+@cli.command()
+@model_name_argument(required=False)
+@click.option("-y", "--yes", "--assume-yes", is_flag=True, help="Skip confirmation when deleting a specific model")
+@click.option("--include-bentos/--no-include-bentos", is_flag=True, default=False, help="Whether to also include pruning bentos.")
+@inject
+def prune_command(model_name: str | None, yes: bool, include_bentos: bool, model_store: ModelStore = Provide[BentoMLContainer.model_store], bento_store: BentoStore = Provide[BentoMLContainer.bento_store]) -> None:
+  """Remove all saved models, (and optionally bentos) built with OpenLLM locally.
+
+  \b
+  If a model type is passed, then only prune models for that given model type.
+  """
+  # Labels store the underscored model name, so normalise the CLI argument once.
+  target = None if model_name is None else inflection.underscore(model_name)
+
+  available: list[tuple[bentoml.Model | bentoml.Bento, ModelStore | BentoStore]] = []
+  for model in bentoml.models.list():
+    model_labels = model.info.labels
+    # Only models labelled as saved by OpenLLM are candidates.
+    if "framework" not in model_labels or model_labels["framework"] != "openllm": continue
+    if target is not None and ("model_name" not in model_labels or model_labels["model_name"] != target): continue
+    available.append((model, model_store))
+
+  if include_bentos:
+    for bento in bentoml.bentos.list():
+      bento_labels = bento.info.labels
+      # With a model name, match on 'start_name'; otherwise take every Bento that has both '_type' and '_framework' labels.
+      if target is not None: selected = "start_name" in bento_labels and bento_labels["start_name"] == target
+      else: selected = "_type" in bento_labels and "_framework" in bento_labels
+      if selected: available.append((bento, bento_store))
+
+  for store_item, store in available:
+    # Ask for confirmation per item unless '--yes' was given.
+    if not yes and not click.confirm(f"delete {'model' if isinstance(store, ModelStore) else 'bento'} {store_item.tag}?"): continue
+    store.delete(store_item.tag)
+    termui.echo(f"{store_item} deleted from {'model' if isinstance(store, ModelStore) else 'bento'} store.", fg="yellow")
+
+def parsing_instruction_callback(ctx: click.Context, param: click.Parameter, value: list[str] | str | None) -> tuple[str, bool | str] | list[str] | str | None:
+  """Parse a ``--key`` or ``--key=value`` token into a ``(key, value)`` pair.
+
+  Returns ``None`` unchanged, ``(key, True)`` for a bare ``--key`` and
+  ``(key, value)`` for ``--key=value``. Raises ``click.BadParameter`` when the
+  token does not start with ``--`` or contains more than one ``=``.
+  """
+  if value is None: return value
+
+  # we only parse --text foo bar -> --text foo and omit bar
+  if isinstance(value, list): value = value[-1]
+
+  pieces = value.split("=")
+  # 'split' always yields at least one piece; more than two means several '=' signs.
+  if len(pieces) > 2 or not pieces[0].startswith("--"):
+    raise click.BadParameter(f"Invalid option format: {value}")
+  key = pieces[0][2:]
+  return (key, True) if len(pieces) == 1 else (key, pieces[1])
+
+def shared_client_options(f: _AnyCallable | None = None, output_value: t.Literal["json", "porcelain", "pretty"] = "pretty") -> t.Callable[[FC], FC]:
+  """Attach the ``--endpoint``, ``--timeout`` and output options shared by client commands.
+
+  Usable both as ``@shared_client_options`` (``f`` is the command) and as
+  ``@shared_client_options(output_value=...)`` (returns the decorator).
+  """
+  endpoint_option = click.option("--endpoint", type=click.STRING, help="OpenLLM Server endpoint, i.e: http://localhost:3000", envvar="OPENLLM_ENDPOINT", default="http://localhost:3000",)
+  timeout_option = click.option("--timeout", type=click.INT, default=30, help="Default server timeout", show_default=True)
+  decorator = compose(endpoint_option, timeout_option, output_option(default_value=output_value))
+  if f is None: return decorator
+  return decorator(f)
+
+@cli.command()
+@click.argument("task", type=click.STRING, metavar="TASK")
+@shared_client_options
+@click.option("--agent", type=click.Choice(["hf"]), default="hf", help="Whether to interact with Agents from given Server endpoint.", show_default=True)
+@click.option("--remote", is_flag=True, default=False, help="Whether or not to use remote tools (inference endpoints) instead of local ones.", show_default=True)
+@click.option("--opt", help="Define prompt options. "
+              "(format: ``--opt text='I love this' --opt audio:./path/to/audio  --opt image:/path/to/file``)", required=False, multiple=True, callback=opt_callback, metavar="ARG=VALUE[,ARG=VALUE]")
+def instruct_command(endpoint: str, timeout: int, agent: LiteralString, output: LiteralOutput, remote: bool, task: str, _memoized: DictStrAny, **attrs: t.Any) -> str:
+  """Instruct agents interactively for given tasks, from a terminal.
+
+  \b
+  ```bash
+  $ openllm instruct --endpoint http://12.323.2.1:3000 \\
+        "Is the following `text` (in Spanish) positive or negative?" \\
+        --text "¡Este es un API muy agradable!"
+  ```
+  """
+  client = openllm.client.HTTPClient(endpoint, timeout=timeout)
+
+  # Probe the server first so an unreachable endpoint fails with a readable message.
+  try:
+    client.call("metadata")
+  except http.client.BadStatusLine:
+    raise click.ClickException(f"{endpoint} is neither a HTTP server nor reachable.") from None
+
+  # Only the 'hf' agent is handled; anything else is a usage error.
+  if agent != "hf": raise click.BadOptionUsage("agent", f"Unknown agent type {agent}")
+  if not is_transformers_supports_agent(): raise click.UsageError("Transformers version should be at least 4.29 to support HfAgent. Upgrade with 'pip install -U transformers'")
+
+  # Keep only the first value of every non-empty '--opt' entry.
+  _memoized = {k: v[0] for k, v in _memoized.items() if v}
+  client._hf_agent.set_stream(logger.info)
+  if output != "porcelain": termui.echo(f"Sending the following prompt ('{task}') with the following vars: {_memoized}", fg="magenta")
+  result = client.ask_agent(task, agent_type=agent, return_code=False, remote=remote, **_memoized)
+  rendered = orjson.dumps(result, option=orjson.OPT_INDENT_2).decode() if output == "json" else result
+  termui.echo(rendered, fg="white")
+  return result
+
+@cli.command()
+@shared_client_options(output_value="json")
+@click.option("--server-type", type=click.Choice(["grpc", "http"]), help="Server type", default="http", show_default=True)
+@click.argument("text", type=click.STRING, nargs=-1)
+@machine_option
+@click.pass_context
+def embed_command(ctx: click.Context, text: tuple[str, ...], endpoint: str, timeout: int, server_type: t.Literal["http", "grpc"], output: LiteralOutput, machine: bool) -> EmbeddingsOutput | None:
+  """Get embeddings interactively, from a terminal.
+
+  \b
+  ```bash
+  $ openllm embed --endpoint http://12.323.2.1:3000 "What is the meaning of life?" "How many stars are there in the sky?"
+  ```
+  """
+  # Match 'query': the gRPC client is given the endpoint without the 'http://' scheme
+  # (the default '--endpoint' value carries one). The original endpoint is kept for error messages.
+  target = re.sub(r"http://", "", endpoint) if server_type == "grpc" else endpoint
+  client = t.cast("BaseClient[t.Any]", openllm.client.HTTPClient(target, timeout=timeout) if server_type == "http" else openllm.client.GrpcClient(target, timeout=timeout))
+  try:
+    gen_embed = client.embed(text)
+  except ValueError:
+    raise click.ClickException(f"Endpoint {endpoint} does not support embeddings.") from None
+  # Machine mode hands the result back to the caller without printing.
+  if machine: return gen_embed
+  elif output == "pretty":
+    termui.echo("Generated embeddings: ", fg="magenta", nl=False)
+    termui.echo(gen_embed.embeddings, fg="white")
+    termui.echo("\nNumber of tokens: ", fg="magenta", nl=False)
+    termui.echo(gen_embed.num_tokens, fg="white")
+  elif output == "json":
+    termui.echo(orjson.dumps(bentoml_cattr.unstructure(gen_embed), option=orjson.OPT_INDENT_2).decode(), fg="white")
+  else:
+    termui.echo(gen_embed.embeddings, fg="white")
+  ctx.exit(0)
+
+@cli.command()
+@shared_client_options
+@click.option("--server-type", type=click.Choice(["grpc", "http"]), help="Server type", default="http", show_default=True)
+@click.argument("prompt", type=click.STRING)
+@click.option("--sampling-params", help="Define query options. (format: ``--sampling-params temperature=0.8 --sampling-params top_k:12``)", required=False, multiple=True, callback=opt_callback, metavar="ARG=VALUE[,ARG=VALUE]")
+@click.pass_context
+def query_command(ctx: click.Context, /, prompt: str, endpoint: str, timeout: int, server_type: t.Literal["http", "grpc"], output: LiteralOutput, _memoized: DictStrAny, **attrs: t.Any) -> None:
+  """Ask a LLM interactively, from a terminal.
+
+  \b
+  ```bash
+  $ openllm query --endpoint http://12.323.2.1:3000 "What is the meaning of life?"
+  ```
+  """
+  # Each sampling parameter arrives as text; decode the first value of every non-empty entry as JSON.
+  _memoized = {k: orjson.loads(v[0]) for k, v in _memoized.items() if v}
+  # The gRPC client is given the endpoint without the 'http://' scheme.
+  if server_type == "grpc": endpoint = re.sub(r"http://", "", endpoint)
+  client = t.cast("BaseClient[t.Any]", openllm.client.HTTPClient(endpoint, timeout=timeout) if server_type == "http" else openllm.client.GrpcClient(endpoint, timeout=timeout))
+  input_fg, generated_fg = "magenta", "cyan"
+  if output != "porcelain":
+    termui.echo("==Input==\n", fg="white")
+    termui.echo(f"{prompt}", fg=input_fg)
+  # User-supplied sampling parameters override the client's own configuration.
+  res = client.query(prompt, return_response="raw", **{**client.configuration, **_memoized})
+  if output == "pretty":
+    response = client.llm.postprocess_generate(prompt, res["responses"])
+    termui.echo("\n\n==Responses==\n", fg="white")
+    termui.echo(response, fg=generated_fg)
+  elif output == "json":
+    termui.echo(orjson.dumps(res, option=orjson.OPT_INDENT_2).decode(), fg="white")
+  else:
+    termui.echo(res["responses"], fg="white")
+  ctx.exit(0)
+
+# Hidden `extension` command group with an empty body; its behaviour comes from the
+# `Extensions` group class (presumably it resolves the modules under cli/extension - confirm).
+@cli.group(cls=Extensions, hidden=True, name="extension")
+def extension_command() -> None:
+  """Extension for OpenLLM CLI."""
+
+# Invoke the `cli` group when this module is executed as a script.
+if __name__ == "__main__": cli()
--- a/openllm-python/src/openllm/cli/extension/init.py
+++ b/openllm-python/src/openllm/cli/extension/init.py
@@ -0,0 +1,16 @@
+"""OpenLLM CLI Extension.
+
+This directory contains all available extensions for the OpenLLM CLI.
+To add a new extension, simply name the module `<name_ext>.py` and define
+a ``click.command()`` with the following format:
+
+```python
+import click
+
+@click.command(<name_ext>)
+...
+def cli(...): # <- this is important: the function must always be named `cli` so that the extension resolver knows how to import this extension.
+```
+
+NOTE: Make sure to keep this file blank such that it won't mess with the import order.
+"""
--- a/openllm-python/src/openllm/cli/extension/build_base_container.py
+++ b/openllm-python/src/openllm/cli/extension/build_base_container.py
@@ -0,0 +1,37 @@
+
+from __future__ import annotations
+import typing as t
+
+import click
+import orjson
+
+import openllm
+
+from .. import termui
+from .._factory import machine_option
+
+if t.TYPE_CHECKING:
+  from openllm.bundle.oci import LiteralContainerRegistry, LiteralContainerVersionStrategy
+
+@click.command(
+    "build_base_container", context_settings=termui.CONTEXT_SETTINGS, help="""Base image builder for BentoLLM.
+
+                By default, the base image will include custom kernels (PagedAttention via vllm, FlashAttention-v2, etc.) built with CUDA 11.8, Python 3.9 on Ubuntu22.04.
+
+                Optionally, this can also be pushed directly to remote registry. Currently support ``docker.io``, ``ghcr.io`` and ``quay.io``.
+
+                If '--machine' is passed, then it will run the process quietly, and output a JSON to the current running terminal.
+
+                This command is only useful for debugging and for building custom base image for extending BentoML with custom base images and custom kernels.
+
+                Note that we already release images on our CI to ECR and GHCR, so you don't need to build it yourself.
+                """
+)
+@click.option("--registry", multiple=True, type=click.Choice(list(openllm.bundle.CONTAINER_NAMES)), help="Target registry to create image tag on.", default=None)
+@click.option("--version-strategy", type=click.Choice(["release", "latest", "nightly"]), default="nightly", help="Version strategy to use for tagging the image.")
+@click.option("--push/--no-push", help="Whether to push to remote repository", is_flag=True, default=False)
+@machine_option
+def cli(registry: tuple[LiteralContainerRegistry, ...] | None, version_strategy: LiteralContainerVersionStrategy, push: bool, machine: bool) -> dict[str, str]:
+  # All the work is delegated to `openllm.bundle.build_container`; this command only forwards its options.
+  result = openllm.bundle.build_container(registry, version_strategy, push, machine)
+  if not machine: return result
+  # In machine mode the returned mapping is also printed as indented JSON.
+  payload = orjson.dumps(result, option=orjson.OPT_INDENT_2).decode()
+  termui.echo(payload, fg="white")
+  return result
--- a/openllm-python/src/openllm/cli/extension/dive_bentos.py
+++ b/openllm-python/src/openllm/cli/extension/dive_bentos.py
@@ -0,0 +1,36 @@
+
+from __future__ import annotations
+import shutil
+import subprocess
+import typing as t
+
+import click
+import psutil
+from simple_di import Provide, inject
+
+import bentoml
+from bentoml._internal.configuration.containers import BentoMLContainer
+
+from .. import termui
+
+if t.TYPE_CHECKING:
+  from bentoml._internal.bento import BentoStore
+
+@click.command("dive_bentos", context_settings=termui.CONTEXT_SETTINGS)
+@click.argument("bento", type=str)
+@click.option("--machine", is_flag=True, default=False, hidden=True)
+@click.pass_context
+@inject
+def cli(ctx: click.Context, bento: str, machine: bool, _bento_store: BentoStore = Provide[BentoMLContainer.bento_store]) -> str | None:
+  """Dive into a BentoLLM. This is synonymous to cd $(b get <bento>:<tag> -o path)."""
+  try:
+    bentomodel = _bento_store.get(bento)
+  except bentoml.exceptions.NotFound:
+    ctx.fail(f"Bento {bento} not found. Make sure to call `openllm build` first.")
+  labels = bentomodel.info.labels
+  if "bundler" not in labels or labels["bundler"] != "openllm.bundle":
+    # A Bento that was not built with OpenLLM may lack the 'start_name' label as well; use a
+    # placeholder so the user sees this message instead of a KeyError.
+    start_name = labels["start_name"] if "start_name" in labels else "<model_name>"
+    ctx.fail(f"Bento is either too old or not built with OpenLLM. Make sure to use ``openllm build {start_name}`` for correctness.")
+  # Machine mode returns the Bento path instead of listing it.
+  if machine: return bentomodel.path
+  # copy and paste this into a new shell
+  # 'dir' is a cmd.exe builtin rather than an executable, so it has to be run through the shell on Windows.
+  if psutil.WINDOWS: subprocess.check_call(["cmd", "/c", "dir"], cwd=bentomodel.path)
+  else: subprocess.check_call([shutil.which("tree") or "tree"], cwd=bentomodel.path)
+  ctx.exit(0)
--- a/openllm-python/src/openllm/cli/extension/get_containerfile.py
+++ b/openllm-python/src/openllm/cli/extension/get_containerfile.py
@@ -0,0 +1,42 @@
+
+from __future__ import annotations
+import typing as t
+
+import click
+from simple_di import Provide, inject
+
+import bentoml
+from bentoml._internal.bento.bento import BentoInfo
+from bentoml._internal.bento.build_config import DockerOptions
+from bentoml._internal.configuration.containers import BentoMLContainer
+from bentoml._internal.container.generate import generate_containerfile
+
+from .. import termui
+from ...utils import bentoml_cattr
+
+if t.TYPE_CHECKING:
+  from bentoml._internal.bento import BentoStore
+
+@click.command("get_containerfile", context_settings=termui.CONTEXT_SETTINGS, help="Return Containerfile of any given Bento.")
+@click.argument("bento", type=str)
+@click.pass_context
+@inject
+def cli(ctx: click.Context, bento: str, _bento_store: BentoStore = Provide[BentoMLContainer.bento_store]) -> str:
+  # Prints the Containerfile generated for the given Bento and returns the Bento's path.
+  # Fails through the click context when the Bento is not in the store.
+  try:
+    bentomodel = _bento_store.get(bento)
+  except bentoml.exceptions.NotFound:
+    ctx.fail(f"Bento {bento} not found. Make sure to call `openllm build` first.")
+
+  # The logic below are similar to bentoml._internal.container.construct_containerfile
+  with open(bentomodel.path_of("bento.yaml"), "r") as f:
+    options = BentoInfo.from_yaml_file(f)
+
+  docker_attrs = bentoml_cattr.unstructure(options.docker)
+  # NOTE: if users specify a dockerfile_template, we will
+  # save it to /env/docker/Dockerfile.template. This is necessary
+  # for the reconstruction of the Dockerfile.
+  # See https://github.com/bentoml/BentoML/issues/3399.
+  template = docker_attrs["dockerfile_template"] if "dockerfile_template" in docker_attrs else None
+  if template is not None:
+    docker_attrs["dockerfile_template"] = "env/docker/Dockerfile.template"
+
+  containerfile = generate_containerfile(docker=DockerOptions(**docker_attrs), build_ctx=bentomodel.path, conda=options.conda, bento_fs=bentomodel._fs, enable_buildkit=True, add_header=True,)
+  termui.echo(containerfile, fg="white")
+  return bentomodel.path
--- a/openllm-python/src/openllm/cli/extension/get_prompt.py
+++ b/openllm-python/src/openllm/cli/extension/get_prompt.py
@@ -0,0 +1,51 @@
+
+from __future__ import annotations
+import typing as t
+
+import click
+import inflection
+import orjson
+from bentoml_cli.utils import opt_callback
+
+import openllm
+
+from .. import termui
+from ..._prompt import process_prompt
+
+LiteralOutput = t.Literal["json", "pretty", "porcelain"]
+
+@click.command("get_prompt", context_settings=termui.CONTEXT_SETTINGS)
+@click.argument("model_name", type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING.keys()]))
+@click.argument("prompt", type=click.STRING)
+@click.option("-o", "--output", "output", type=click.Choice(["json", "pretty", "porcelain"]), default="pretty", help="Showing output type.", show_default=True, envvar="OPENLLM_OUTPUT", show_envvar=True)
+@click.option("--format", type=click.STRING, default=None)
+@click.option("--machine", is_flag=True, default=False, hidden=True)
+@click.option("--opt", help="Define additional prompt variables. (format: ``--opt system_prompt='You are a useful assistant'``)", required=False, multiple=True, callback=opt_callback, metavar="ARG=VALUE[,ARG=VALUE]")
+@click.pass_context
+def cli(ctx: click.Context, /, model_name: str, prompt: str, format: str | None, output: LiteralOutput, machine: bool, _memoized: dict[str, t.Any], **_: t.Any) -> str | None:
+  """Get the default prompt used by OpenLLM."""
+  # NOTE: the docstring above is rendered by click as the command help, hence it stays a one-liner.
+  module = openllm.utils.EnvVarMixin(model_name).module
+  # Keep the first element of every non-empty '--opt' value; empty values are dropped.
+  _memoized = {k: v[0] for k, v in _memoized.items() if v}
+  try:
+    template = getattr(module, "DEFAULT_PROMPT_TEMPLATE", None)
+    prompt_mapping = getattr(module, "PROMPT_MAPPING", None)
+    if template is None: raise click.BadArgumentUsage(f"model {model_name} does not have a default prompt template") from None
+    if callable(template):
+      # A callable template is selected through '--format', which must be a key of PROMPT_MAPPING.
+      if format is None:
+        if prompt_mapping is None: raise RuntimeError("Failed to find prompt mapping while DEFAULT_PROMPT_TEMPLATE is a function.")
+        raise click.BadOptionUsage("format", f"{model_name} prompt requires passing '--format' (available format: {list(prompt_mapping)})")
+      if prompt_mapping is None: raise click.BadArgumentUsage(f"Failed to find prompt mapping while the default prompt for {model_name} is a callable.") from None
+      if format not in prompt_mapping: raise click.BadOptionUsage("format", f"Given format {format} is not valid for {model_name} (available format: {list(prompt_mapping)})")
+      _prompt_template = template(format)
+    else:
+      _prompt_template = template
+    fully_formatted = process_prompt(prompt, _prompt_template, True, **_memoized)
+    # '--machine' (hidden flag) returns the repr instead of printing it.
+    if machine: return repr(fully_formatted)
+    elif output == "porcelain": termui.echo(repr(fully_formatted), fg="white")
+    elif output == "json": termui.echo(orjson.dumps({"prompt": fully_formatted}, option=orjson.OPT_INDENT_2).decode(), fg="white")
+    else:
+      termui.echo(f"== Prompt for {model_name} ==\n", fg="magenta")
+      termui.echo(fully_formatted, fg="white")
+  except AttributeError:
+    raise click.ClickException(f"Failed to determine a default prompt template for {model_name}.") from None
+  ctx.exit(0)
--- a/openllm-python/src/openllm/cli/extension/list_bentos.py
+++ b/openllm-python/src/openllm/cli/extension/list_bentos.py
@@ -0,0 +1,31 @@
+
+from __future__ import annotations
+
+import click
+import inflection
+import orjson
+
+import bentoml
+import openllm
+from bentoml._internal.utils import human_readable_size
+
+from .. import termui
+from .._factory import LiteralOutput, output_option
+
+@click.command("list_bentos", context_settings=termui.CONTEXT_SETTINGS)
+@output_option(default_value="json")
+@click.pass_context
+def cli(ctx: click.Context, output: LiteralOutput) -> None:
+  """List available bentos built by OpenLLM."""
+  known_models = tuple(inflection.dasherize(key) for key in openllm.CONFIG_MAPPING.keys())
+  known_lookup = set(known_models)
+  # Scan the bento store once and bucket entries by their 'start_name' label, instead of
+  # re-listing and re-filtering the whole store for every known model.
+  grouped = {}
+  for bento in bentoml.list():
+    labels = bento.info.labels
+    # Only bentos carrying both labels are considered.
+    if "start_name" not in labels or "bundler" not in labels: continue
+    start_name = labels["start_name"]
+    if start_name not in known_lookup: continue
+    models = [{"tag": str(m.tag), "size": human_readable_size(openllm.utils.calc_dir_size(m.path))} for m in (bentoml.models.get(_.tag) for _ in bento.info.models)]
+    grouped.setdefault(start_name, []).append({"tag": str(bento.tag), "size": human_readable_size(openllm.utils.calc_dir_size(bento.path)), "models": models})
+  # Emit keys in CONFIG_MAPPING order and skip models without any bento.
+  mapping = {name: grouped[name] for name in known_models if name in grouped}
+  if output == "pretty":
+    import tabulate
+    tabulate.PRESERVE_WHITESPACE = True
+    rows = [(k, i["tag"], i["size"], [_["tag"] for _ in i["models"]]) for k, v in mapping.items() for i in v]
+    termui.echo(tabulate.tabulate(rows, tablefmt="fancy_grid", headers=["LLM", "Tag", "Size", "Models"]), fg="white")
+  else:
+    termui.echo(orjson.dumps(mapping, option=orjson.OPT_INDENT_2).decode(), fg="white")
+  ctx.exit(0)
--- a/openllm-python/src/openllm/cli/extension/list_models.py
+++ b/openllm-python/src/openllm/cli/extension/list_models.py
@@ -0,0 +1,26 @@
+from __future__ import annotations
+import typing as t, bentoml, openllm, orjson, inflection ,click
+from bentoml._internal.utils import human_readable_size
+
+from openllm.cli import termui
+from openllm.cli._factory import LiteralOutput, model_name_argument, output_option
+
+if t.TYPE_CHECKING: from openllm._typing_compat import DictStrAny
+
+@click.command("list_models", context_settings=termui.CONTEXT_SETTINGS)
+@model_name_argument(required=False)
+@output_option(default_value="json")
+def cli(model_name: str | None, output: LiteralOutput) -> DictStrAny:
+  """This is equivalent to openllm models --show-available less the nice table."""
+  models = tuple(inflection.dasherize(key) for key in openllm.CONFIG_MAPPING.keys())
+  known_lookup = set(models)
+  wanted = None if model_name is None else inflection.dasherize(model_name)
+  # Scan the local model store once and bucket entries by their 'model_name' label, instead of
+  # re-listing the store for every known model and filtering a second time for 'model_name'.
+  grouped = {}
+  for item in bentoml.models.list():
+    labels = item.info.labels
+    if "framework" not in labels or labels["framework"] != "openllm": continue
+    if "model_name" not in labels: continue
+    name = labels["model_name"]
+    if name not in known_lookup: continue
+    # When a model name was given on the command line, keep only entries for that model.
+    if wanted is not None and name != wanted: continue
+    grouped.setdefault(name, []).append({"tag": str(item.tag), "size": human_readable_size(openllm.utils.calc_dir_size(item.path))})
+  # Emit keys in CONFIG_MAPPING order and skip models that have nothing stored locally.
+  local_models = {name: grouped[name] for name in models if name in grouped}
+  if output == "pretty":
+    import tabulate
+    tabulate.PRESERVE_WHITESPACE = True
+    rows = [(k, i["tag"], i["size"]) for k, v in local_models.items() for i in v]
+    termui.echo(tabulate.tabulate(rows, tablefmt="fancy_grid", headers=["LLM", "Tag", "Size"]), fg="white")
+  else:
+    termui.echo(orjson.dumps(local_models, option=orjson.OPT_INDENT_2).decode(), fg="white")
+  return local_models
--- a/openllm-python/src/openllm/cli/extension/playground.py
+++ b/openllm-python/src/openllm/cli/extension/playground.py
@@ -0,0 +1,71 @@
+from __future__ import annotations
+import importlib.machinery, logging, os, pkgutil, subprocess, sys, tempfile, typing as t
+import click, yaml
+from openllm.cli import termui
+from openllm import playground
+from openllm.utils import is_jupyter_available, is_jupytext_available, is_notebook_available
+
+if t.TYPE_CHECKING:
+  import jupytext, nbformat
+  from openllm._typing_compat import DictStrAny
+
+logger = logging.getLogger(__name__)
+
+def load_notebook_metadata() -> DictStrAny:
+  """Load ``_meta.yml`` located next to the playground package and validate it.
+
+  Returns:
+    The parsed YAML mapping.
+
+  Raises:
+    ValueError: if any entry of the mapping lacks a 'description' key.
+  """
+  meta_path = os.path.join(os.path.dirname(playground.__file__), "_meta.yml")
+  with open(meta_path, "r") as handle:
+    parsed = yaml.safe_load(handle)
+  for entry in parsed.values():
+    if "description" not in entry:
+      raise ValueError("Invalid metadata file. All entries must have a 'description' key.")
+  return parsed
+
+@click.command("playground", context_settings=termui.CONTEXT_SETTINGS)
+@click.argument("output-dir", default=None, required=False)
+@click.option("--port", envvar="JUPYTER_PORT", show_envvar=True, show_default=True, default=8888, help="Default port for Jupyter server")
+@click.pass_context
+def cli(ctx: click.Context, output_dir: str | None, port: int) -> None:
+  """OpenLLM Playground.
+
+  A collections of notebooks to explore the capabilities of OpenLLM.
+  This includes notebooks for fine-tuning, inference, and more.
+
+  All of the script available in the playground can also be run directly as a Python script:
+  For example:
+
+  \b
+  ```bash
+  python -m openllm.playground.falcon_tuned --help
+  ```
+
+  \b
+  > [!NOTE]
+  > This command requires Jupyter to be installed. Install it with 'pip install "openllm[playground]"'
+  """  # noqa: D301
+  if not is_jupyter_available() or not is_jupytext_available() or not is_notebook_available():
+    raise RuntimeError("Playground requires 'jupyter', 'jupytext', and 'notebook'. Install it with 'pip install \"openllm[playground]\"'")
+  # 'jupytext' and 'nbformat' are only imported under TYPE_CHECKING at module level, so they must be
+  # bound here (after the availability check) to exist at runtime.
+  import jupytext
+  import nbformat
+
+  metadata = load_notebook_metadata()
+  _temp_dir = output_dir is None
+  if output_dir is None:
+    output_dir = tempfile.mkdtemp(prefix="openllm-playground-")
+  else:
+    # Use the expanded path everywhere below, not only for directory creation, so that notebooks are
+    # written to (and Jupyter is started in) the directory that was actually created.
+    output_dir = os.path.abspath(os.path.expandvars(os.path.expanduser(output_dir)))
+    os.makedirs(output_dir, exist_ok=True)
+
+  termui.echo("The playground notebooks will be saved to: " + os.path.abspath(output_dir), fg="blue")
+  for module in pkgutil.iter_modules(playground.__path__):
+    notebook_path = os.path.join(output_dir, module.name + ".ipynb")
+    # Sub-packages are never converted, and existing notebooks are not overwritten.
+    if module.ispkg or os.path.exists(notebook_path):
+      logger.debug("Skipping: %s (%s)", module.name, "File already exists" if not module.ispkg else f"{module.name} is a module")
+      continue
+    # Only modules found through a FileFinder expose the directory path used below.
+    if not isinstance(module.module_finder, importlib.machinery.FileFinder): continue
+    termui.echo("Generating notebook for: " + module.name, fg="magenta")
+    markdown_cell = nbformat.v4.new_markdown_cell(metadata[module.name]["description"])
+    f = jupytext.read(os.path.join(module.module_finder.path, module.name + ".py"))
+    # The description from the metadata file becomes the first cell of the notebook.
+    f.cells.insert(0, markdown_cell)
+    jupytext.write(f, notebook_path, fmt="notebook")
+  try:
+    subprocess.check_output([sys.executable, "-m", "jupyter", "notebook", "--notebook-dir", output_dir, "--port", str(port), "--no-browser", "--debug"])
+  except subprocess.CalledProcessError as e:
+    termui.echo(e.output, fg="red")
+    raise click.ClickException(f"Failed to start a jupyter server:\n{e}") from None
+  except KeyboardInterrupt:
+    termui.echo("\nShutting down Jupyter server...", fg="yellow")
+    if _temp_dir: termui.echo("Note: You can access the generated notebooks in: " + output_dir, fg="blue")
+  ctx.exit(0)
--- a/openllm-python/src/openllm/cli/termui.py
+++ b/openllm-python/src/openllm/cli/termui.py
@@ -0,0 +1,11 @@
+from __future__ import annotations
+import os, typing as t, click, inflection, openllm
+if t.TYPE_CHECKING: from openllm._typing_compat import DictStrAny
+
+def echo(text: t.Any, fg: str = "green", _with_style: bool = True, **attrs: t.Any) -> None:
+  """Print ``text`` unless quiet mode is enabled.
+
+  Args:
+    text: the message to print.
+    fg: foreground colour handed to ``click.secho``; dropped (set to None) in debug mode.
+    _with_style: when True print through ``click.secho``, otherwise through ``click.echo``.
+    **attrs: extra keyword arguments forwarded to the chosen click function.
+  """
+  if openllm.utils.get_quiet_mode(): return
+  if _with_style:
+    attrs["fg"] = fg if not openllm.utils.get_debug_mode() else None
+    click.secho(text, **attrs)
+  else:
+    # 'click.echo' has no 'fg' parameter, so the colour must not be forwarded to it.
+    click.echo(text, **attrs)
+
+# Width taken from the COLUMNS environment variable, falling back to 120 when it is unset.
+COLUMNS: int = int(os.environ.get("COLUMNS", str(120)))
+# Context settings shared by the CLI commands: '-h' and '--help' as help option names,
+# 'max_content_width' set to COLUMNS, and tokens normalized through inflection.underscore.
+CONTEXT_SETTINGS: DictStrAny = {"help_option_names": ["-h", "--help"], "max_content_width": COLUMNS, "token_normalize_func": inflection.underscore}
+# Names exported by this module.
+__all__ = ["echo", "COLUMNS", "CONTEXT_SETTINGS"]
--- a/openllm-python/src/openllm/client/init.py
+++ b/openllm-python/src/openllm/client/init.py
@@ -0,0 +1,22 @@
+"""OpenLLM Python client.
+
+```python
+client = openllm.client.HTTPClient("http://localhost:8080")
+client.query("What is the difference between gather and scatter?")
+```
+
+If the server supports embeddings, use them via `client.embed`:
+```python
+client.embed("What is the difference between gather and scatter?")
+```
+"""
+from __future__ import annotations
+
+from openllm.client.runtimes import (
+    AsyncGrpcClient as AsyncGrpcClient,
+    AsyncHTTPClient as AsyncHTTPClient,
+    BaseAsyncClient as BaseAsyncClient,
+    BaseClient as BaseClient,
+    GrpcClient as GrpcClient,
+    HTTPClient as HTTPClient,
+)
--- a/openllm-python/src/openllm/client/runtimes/init.py
+++ b/openllm-python/src/openllm/client/runtimes/init.py
@@ -0,0 +1,15 @@
+"""Client that supports REST/gRPC protocol to interact with a LLMServer."""
+from __future__ import annotations
+
+from openllm.client.runtimes.base import (
+    BaseAsyncClient as BaseAsyncClient,
+    BaseClient as BaseClient,
+)
+from openllm.client.runtimes.grpc import (
+    AsyncGrpcClient as AsyncGrpcClient,
+    GrpcClient as GrpcClient,
+)
+from openllm.client.runtimes.http import (
+    AsyncHTTPClient as AsyncHTTPClient,
+    HTTPClient as HTTPClient,
+)
--- a/openllm-python/src/openllm/client/runtimes/base.py
+++ b/openllm-python/src/openllm/client/runtimes/base.py
@@ -0,0 +1,238 @@
+# mypy: disable-error-code="name-defined"
+from __future__ import annotations
+import asyncio, logging, typing as t
+import bentoml, bentoml.client, openllm, httpx
+from abc import abstractmethod
+from http import HTTPStatus
+from urllib.parse import urljoin
+from openllm._typing_compat import overload, LiteralString
+
+T = t.TypeVar("T")
+T_co = t.TypeVar("T_co", covariant=True)
+
+if t.TYPE_CHECKING:
+  import transformers
+  from openllm._typing_compat import DictStrAny, LiteralRuntime
+transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")  # noqa: F811
+
+class AnnotatedClient(t.Protocol[T_co]):
+  """Structural type for the client object that `ClientMeta._cached` obtains from `bentoml.client`.
+
+  Only declares the attributes and methods this module relies on; all bodies are stubs.
+  `T_co` is the result type of the service calls.
+  """
+  server_url: str
+  _svc: bentoml.Service
+  endpoints: list[str]
+  # Health checks (sync and async variants).
+  def health(self, *args: t.Any, **attrs: t.Any) -> t.Any: ...
+  async def async_health(self) -> t.Any: ...
+  # Versioned ('_v1') service endpoints.
+  def generate_v1(self, qa: openllm.GenerationInput) -> T_co: ...
+  def metadata_v1(self) -> T_co: ...
+  def embeddings_v1(self) -> t.Sequence[float]: ...
+  # Generic dispatch by endpoint name (sync and async variants).
+  def call(self, name: str, *args: t.Any, **attrs: t.Any) -> T_co: ...
+  async def async_call(self, name: str, *args: t.Any, **attrs: t.Any) -> T_co: ...
+  @staticmethod
+  def wait_until_server_ready(host: str, port: int, timeout: float = 30, **kwargs: t.Any) -> None: ...
+  @staticmethod
+  def from_url(server_url: str) -> AnnotatedClient[t.Any]: ...
+
+logger = logging.getLogger(__name__)
+
+def in_async_context() -> bool:
+  """Return True when called while an asyncio event loop is running in this thread, else False."""
+  try:
+    asyncio.get_running_loop()
+  except RuntimeError:
+    # 'get_running_loop' raises RuntimeError when there is no running loop.
+    return False
+  else:
+    return True
+
+class ClientMeta(t.Generic[T]):
+  """Plumbing shared by the synchronous and asynchronous clients.
+
+  Stores the server address and timeout, lazily creates the underlying BentoML client, the local
+  ``openllm.LLM`` and the HF agent, and declares the metadata properties subclasses must provide.
+  Subclasses pick the transport through the ``client_type`` and ``api_version`` class keywords.
+  """
+  _api_version: str
+  _client_type: t.Literal["GrpcClient", "HTTPClient"]
+  _host: str
+  _port: str
+
+  # Lazily populated by '_cached', '_hf_agent' and 'llm' respectively.
+  __client__: AnnotatedClient[T] | None = None
+  __agent__: transformers.HfAgent | None = None
+  __llm__: openllm.LLM[t.Any, t.Any] | None = None
+
+  def __init__(self, address: str, timeout: int = 30):
+    self._address = address
+    self._timeout = timeout
+
+  def __init_subclass__(cls, *, client_type: t.Literal["http", "grpc"] = "http", api_version: str = "v1"):
+    # Anything other than "http" selects the gRPC client class.
+    cls._client_type = "HTTPClient" if client_type == "http" else "GrpcClient"
+    cls._api_version = api_version
+
+  @property
+  def _hf_agent(self) -> transformers.HfAgent:
+    if not self.supports_hf_agent:
+      raise openllm.exceptions.OpenLLMException(f"{self.model_name} ({self.framework}) does not support running HF agent.")
+    if self.__agent__ is not None:
+      return self.__agent__
+    if not openllm.utils.is_transformers_supports_agent():
+      raise RuntimeError("Current 'transformers' does not support Agent. Make sure to upgrade to at least 4.29: 'pip install -U \"transformers>=4.29\"'")
+    self.__agent__ = transformers.HfAgent(urljoin(self._address, "/hf/agent"))
+    return self.__agent__
+
+  @property
+  def _metadata(self) -> T:
+    # Inside a running event loop the metadata is fetched with a plain HTTP POST instead of 'call'.
+    if in_async_context():
+      return httpx.post(urljoin(self._address, f"/{self._api_version}/metadata")).json()
+    return self.call("metadata")
+
+  @property
+  @abstractmethod
+  def model_name(self) -> str:
+    raise NotImplementedError
+
+  @property
+  @abstractmethod
+  def framework(self) -> LiteralRuntime:
+    raise NotImplementedError
+
+  @property
+  @abstractmethod
+  def timeout(self) -> int:
+    raise NotImplementedError
+
+  @property
+  @abstractmethod
+  def model_id(self) -> str:
+    raise NotImplementedError
+
+  @property
+  @abstractmethod
+  def configuration(self) -> dict[str, t.Any]:
+    raise NotImplementedError
+
+  @property
+  @abstractmethod
+  def supports_embeddings(self) -> bool:
+    raise NotImplementedError
+
+  @property
+  @abstractmethod
+  def supports_hf_agent(self) -> bool:
+    raise NotImplementedError
+
+  @abstractmethod
+  def postprocess(self, result: t.Any) -> openllm.GenerationOutput: ...
+
+  @abstractmethod
+  def _run_hf_agent(self, *args: t.Any, **kwargs: t.Any) -> t.Any: ...
+
+  @property
+  def config(self) -> openllm.LLMConfig:
+    return self.llm.config
+
+  @property
+  def llm(self) -> openllm.LLM[t.Any, t.Any]:
+    # XXX: if the server runs vllm or any framework that is not available from the user client, client will fail.
+    if self.__llm__ is None:
+      self.__llm__ = openllm.infer_auto_class(self.framework).for_model(self.model_name)
+    return self.__llm__
+
+  def call(self, name: str, *args: t.Any, **attrs: t.Any) -> T:
+    # Endpoint names on the server carry the API version as a suffix, e.g. 'generate_v1'.
+    return self._cached.call(f"{name}_{self._api_version}", *args, **attrs)
+
+  async def acall(self, name: str, *args: t.Any, **attrs: t.Any) -> T:
+    return await self._cached.async_call(f"{name}_{self._api_version}", *args, **attrs)
+
+  @property
+  def _cached(self) -> AnnotatedClient[T]:
+    client_class = t.cast(AnnotatedClient[T], getattr(bentoml.client, self._client_type))
+    if self.__client__ is not None:
+      return self.__client__
+    # First use: wait for the server, then build and remember the client.
+    client_class.wait_until_server_ready(self._host, int(self._port), timeout=self._timeout)
+    self.__client__ = client_class.from_url(self._address)
+    return self.__client__
+
+class BaseClient(ClientMeta[T]):
+  """Synchronous client base class.
+
+  ``health``, ``chat`` and ``embed`` raise NotImplementedError here and are left to subclasses.
+  """
+  def health(self) -> t.Any: raise NotImplementedError
+  def chat(self, prompt: str, history: list[str], **attrs: t.Any) -> str: raise NotImplementedError
+  def embed(self, prompt: t.Sequence[str] | str) -> openllm.EmbeddingsOutput: raise NotImplementedError
+  @overload
+  def query(self, prompt: str, *, return_response: t.Literal["processed"], **attrs: t.Any) -> str: ...
+  @overload
+  def query(self, prompt: str, *, return_response: t.Literal["raw"], **attrs: t.Any) -> DictStrAny: ...
+  @overload
+  def query(self, prompt: str, *, return_response: t.Literal["attrs"], **attrs: t.Any) -> openllm.GenerationOutput: ...
+  def query(self, prompt: str, return_response: t.Literal["attrs", "raw", "processed"] = "processed", **attrs: t.Any) -> openllm.GenerationOutput | DictStrAny | str:
+    """Send ``prompt`` to the server's generate endpoint.
+
+    Args:
+      prompt: the prompt text.
+      return_response: "attrs" returns the result of ``postprocess``; "raw" returns that result
+        unstructured through ``bentoml_cattr``; anything else returns the output of the local LLM's
+        ``postprocess_generate``.
+      **attrs: forwarded to the local LLM's ``sanitize_parameters``. The deprecated flags
+        ``return_raw_response`` and ``return_attrs`` are still honoured, with a warning.
+    """
+    return_raw_response = attrs.pop("return_raw_response", None)
+    if return_raw_response is not None:
+      logger.warning("'return_raw_response' is now deprecated. Please use 'return_response=\"raw\"' instead.")
+      if return_raw_response is True: return_response = "raw"
+    return_attrs = attrs.pop("return_attrs", None)
+    if return_attrs is not None:
+      logger.warning("'return_attrs' is now deprecated. Please use 'return_response=\"attrs\"' instead.")
+      if return_attrs is True: return_response = "attrs"
+    use_default_prompt_template = attrs.pop("use_default_prompt_template", False)
+    prompt, generate_kwargs, postprocess_kwargs = self.llm.sanitize_parameters(prompt, use_default_prompt_template=use_default_prompt_template, **attrs)
+
+    inputs = openllm.GenerationInput(prompt=prompt, llm_config=self.config.model_construct_env(**generate_kwargs))
+    # Inside a running event loop the request is sent with a plain HTTP POST instead of 'call'.
+    if in_async_context(): result = httpx.post(urljoin(self._address, f"/{self._api_version}/generate"), json=inputs.model_dump(), timeout=self.timeout).json()
+    else: result = self.call("generate", inputs.model_dump())
+    r = self.postprocess(result)
+    if return_response == "attrs": return r
+    elif return_response == "raw": return openllm.utils.bentoml_cattr.unstructure(r)
+    else: return self.llm.postprocess_generate(prompt, r.responses, **postprocess_kwargs)
+
+  # NOTE: Scikit interface
+  @overload
+  def predict(self, prompt: str, *, return_response: t.Literal["processed"], **attrs: t.Any) -> str: ...
+  @overload
+  def predict(self, prompt: str, *, return_response: t.Literal["raw"], **attrs: t.Any) -> DictStrAny: ...
+  @overload
+  def predict(self, prompt: str, *, return_response: t.Literal["attrs"], **attrs: t.Any) -> openllm.GenerationOutput: ...
+  def predict(self, prompt: str, **attrs: t.Any) -> openllm.GenerationOutput | DictStrAny | str:
+    """Alias of ``query``."""
+    return t.cast(t.Union[openllm.GenerationOutput, DictStrAny, str], self.query(prompt, **attrs))
+
+  def ask_agent(self, task: str, *, return_code: bool = False, remote: bool = False, agent_type: LiteralString = "hf", **attrs: t.Any) -> t.Any:
+    """Run ``task`` through an agent; only ``agent_type="hf"`` is supported, others raise RuntimeError."""
+    if agent_type == "hf": return self._run_hf_agent(task, return_code=return_code, remote=remote, **attrs)
+    else: raise RuntimeError(f"Unknown 'agent_type={agent_type}'")
+
+  def _run_hf_agent(self, *args: t.Any, **kwargs: t.Any) -> t.Any:
+    """Run a task through the HF agent.
+
+    The task is taken from the ``task`` keyword when present, otherwise from the single positional
+    argument. Errors raised by the agent are logged and ``None`` is returned.
+
+    Raises:
+      ValueError: if more than one positional argument is given, or if no task is given at all.
+    """
+    if len(args) > 1: raise ValueError("'args' should only take one positional argument.")
+    # Resolve the task without touching 'args[0]' when 'task' is passed by keyword only.
+    if "task" in kwargs: task = kwargs.pop("task")
+    elif args: task = args[0]
+    else: raise ValueError("'task' must be given either positionally or as a keyword argument.")
+    return_code = kwargs.pop("return_code", False)
+    remote = kwargs.pop("remote", False)
+    try: return self._hf_agent.run(task, return_code=return_code, remote=remote, **kwargs)
+    except Exception as err:
+      # Best effort: report the failure and fall through, returning None.
+      logger.error("Exception caught while sending instruction to HF agent: %s", err, exc_info=err)
+      logger.info("Tip: LLMServer at '%s' might not support 'generate_one'.", self._address)
+
+class BaseAsyncClient(ClientMeta[T]):
+  """Asynchronous client base class.
+
+  ``health``, ``chat`` and ``embed`` raise NotImplementedError here and are left to subclasses.
+  """
+  async def health(self) -> t.Any: raise NotImplementedError
+  async def chat(self, prompt: str, history: list[str], **attrs: t.Any) -> str: raise NotImplementedError
+  async def embed(self, prompt: t.Sequence[str] | str) -> openllm.EmbeddingsOutput: raise NotImplementedError
+  @overload
+  async def query(self, prompt: str, *, return_response: t.Literal["processed"], **attrs: t.Any) -> str: ...
+  @overload
+  async def query(self, prompt: str, *, return_response: t.Literal["raw"], **attrs: t.Any) -> DictStrAny: ...
+  @overload
+  async def query(self, prompt: str, *, return_response: t.Literal["attrs"], **attrs: t.Any) -> openllm.GenerationOutput: ...
+  async def query(self, prompt: str, return_response: t.Literal["attrs", "raw", "processed"] = "processed", **attrs: t.Any) -> openllm.GenerationOutput | DictStrAny | str:
+    """Send ``prompt`` to the server's generate endpoint.
+
+    Args:
+      prompt: the prompt text.
+      return_response: "attrs" returns the result of ``postprocess``; "raw" returns that result
+        unstructured through ``bentoml_cattr``; anything else returns the output of the local LLM's
+        ``postprocess_generate``.
+      **attrs: forwarded to the local LLM's ``sanitize_parameters``. The deprecated flags
+        ``return_raw_response`` and ``return_attrs`` are still honoured, with a warning.
+    """
+    return_raw_response = attrs.pop("return_raw_response", None)
+    if return_raw_response is not None:
+      logger.warning("'return_raw_response' is now deprecated. Please use 'return_response=\"raw\"' instead.")
+      if return_raw_response is True: return_response = "raw"
+    return_attrs = attrs.pop("return_attrs", None)
+    if return_attrs is not None:
+      logger.warning("'return_attrs' is now deprecated. Please use 'return_response=\"attrs\"' instead.")
+      if return_attrs is True: return_response = "attrs"
+    use_default_prompt_template = attrs.pop("use_default_prompt_template", False)
+    prompt, generate_kwargs, postprocess_kwargs = self.llm.sanitize_parameters(prompt, use_default_prompt_template=use_default_prompt_template, **attrs)
+
+    inputs = openllm.GenerationInput(prompt=prompt, llm_config=self.config.model_construct_env(**generate_kwargs))
+    res = await self.acall("generate", inputs.model_dump())
+    r = self.postprocess(res)
+
+    if return_response == "attrs": return r
+    elif return_response == "raw": return openllm.utils.bentoml_cattr.unstructure(r)
+    else: return self.llm.postprocess_generate(prompt, r.responses, **postprocess_kwargs)
+
+  # NOTE: Scikit interface
+  @overload
+  async def predict(self, prompt: str, *, return_response: t.Literal["processed"], **attrs: t.Any) -> str: ...
+  @overload
+  async def predict(self, prompt: str, *, return_response: t.Literal["raw"], **attrs: t.Any) -> DictStrAny: ...
+  @overload
+  async def predict(self, prompt: str, *, return_response: t.Literal["attrs"], **attrs: t.Any) -> openllm.GenerationOutput: ...
+  async def predict(self, prompt: str, **attrs: t.Any) -> openllm.GenerationOutput | DictStrAny | str:
+    """Alias of ``query``."""
+    return t.cast(t.Union[openllm.GenerationOutput, DictStrAny, str], await self.query(prompt, **attrs))
+  async def ask_agent(self, task: str, *, return_code: bool = False, remote: bool = False, agent_type: LiteralString = "hf", **attrs: t.Any) -> t.Any:
+    """Async version of agent.run."""
+    if agent_type == "hf": return await self._run_hf_agent(task, return_code=return_code, remote=remote, **attrs)
+    else: raise RuntimeError(f"Unknown 'agent_type={agent_type}'")
+  async def _run_hf_agent(self, *args: t.Any, **kwargs: t.Any) -> t.Any:
+    """Run a task through the HF agent endpoint using an async HTTP request.
+
+    The task is taken from the ``task`` keyword when present, otherwise from the single positional
+    argument. With ``return_code`` the generated code (prefixed by the tool creation code) is
+    returned; otherwise the code is evaluated and its result returned.
+
+    Raises:
+      RuntimeError: if the installed transformers has no agent support.
+      ValueError: if more than one positional argument is given, if no task is given at all, or if
+        the endpoint answers with a status other than 200.
+    """
+    if not openllm.utils.is_transformers_supports_agent(): raise RuntimeError("This version of transformers does not support agent.run. Make sure to upgrade to transformers>4.30.0")
+    if len(args) > 1: raise ValueError("'args' should only take one positional argument.")
+    # Resolve the task without touching 'args[0]' when 'task' is passed by keyword only.
+    if "task" in kwargs: task = kwargs.pop("task")
+    elif args: task = args[0]
+    else: raise ValueError("'task' must be given either positionally or as a keyword argument.")
+    return_code = kwargs.pop("return_code", False)
+    remote = kwargs.pop("remote", False)
+
+    from transformers.tools.agents import clean_code_for_run, get_tool_creation_code, resolve_tools
+    from transformers.tools.python_interpreter import evaluate
+
+    _hf_agent = self._hf_agent
+
+    prompt = t.cast(str, _hf_agent.format_prompt(task))
+    stop = ["Task:"]
+    async with httpx.AsyncClient(timeout=httpx.Timeout(self.timeout)) as client:
+      response = await client.post(_hf_agent.url_endpoint, json={"inputs": prompt, "parameters": {"max_new_tokens": 200, "return_full_text": False, "stop": stop},},)
+      if response.status_code != HTTPStatus.OK:
+        raise ValueError(f"Error {response.status_code}: {response.json()}")
+
+    result = response.json()[0]["generated_text"]
+    # Inference API returns the stop sequence
+    for stop_seq in stop:
+      if result.endswith(stop_seq):
+        result = result[:-len(stop_seq)]
+        break
+
+    # the below have the same logic as agent.run API
+    explanation, code = clean_code_for_run(result)
+    _hf_agent.log(f"==Explanation from the agent==\n{explanation}")
+    _hf_agent.log(f"\n\n==Code generated by the agent==\n{code}")
+    if not return_code:
+      _hf_agent.log("\n\n==Result==")
+      _hf_agent.cached_tools = resolve_tools(code, _hf_agent.toolbox, remote=remote, cached_tools=_hf_agent.cached_tools)
+      return evaluate(code, _hf_agent.cached_tools, state=kwargs.copy())
+    else:
+      tool_code = get_tool_creation_code(code, _hf_agent.toolbox, remote=remote)
+      return f"{tool_code}\n{code}"
--- a/openllm-python/src/openllm/client/runtimes/grpc.py
+++ b/openllm-python/src/openllm/client/runtimes/grpc.py
@@ -0,0 +1,93 @@
+from __future__ import annotations
+import asyncio, logging, typing as t
+import orjson, openllm
+from openllm._typing_compat import LiteralRuntime
+from .base import BaseAsyncClient, BaseClient
+
+if t.TYPE_CHECKING:
+  from grpc_health.v1 import health_pb2
+  from bentoml.grpc.v1.service_pb2 import Response
+
+logger = logging.getLogger(__name__)
+
+class GrpcClient(BaseClient["Response"], client_type="grpc"):
+  """Synchronous gRPC client; its metadata properties are read from the server's metadata response."""
+  def __init__(self, address: str, timeout: int = 30):
+    # 'address' must be exactly 'host:port'; anything else fails the unpacking below.
+    host, port = address.split(":")
+    self._host, self._port = host, port
+    super().__init__(address, timeout)
+
+  def health(self) -> health_pb2.HealthCheckResponse:
+    return asyncio.run(self._cached.health("bentoml.grpc.v1.BentoService"))
+
+  def _field(self, name: str) -> t.Any:
+    # Fetch one entry of the metadata struct; a KeyError is reported as a malformed endpoint.
+    try:
+      return self._metadata.json.struct_value.fields[name]
+    except KeyError:
+      raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None
+
+  @property
+  def model_name(self) -> str:
+    return self._field("model_name").string_value
+
+  @property
+  def framework(self) -> LiteralRuntime:
+    value = t.cast(LiteralRuntime, self._field("framework").string_value)
+    # An unknown runtime is treated the same way as a missing field.
+    if value not in ("pt", "flax", "tf", "vllm"):
+      raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None
+    return value
+
+  @property
+  def timeout(self) -> int:
+    return int(self._field("timeout").number_value)
+
+  @property
+  def model_id(self) -> str:
+    return self._field("model_id").string_value
+
+  @property
+  def configuration(self) -> dict[str, t.Any]:
+    return orjson.loads(self._field("configuration").string_value)
+
+  @property
+  def supports_embeddings(self) -> bool:
+    return self._field("supports_embeddings").bool_value
+
+  @property
+  def supports_hf_agent(self) -> bool:
+    return self._field("supports_hf_agent").bool_value
+
+  def postprocess(self, result: Response | dict[str, t.Any]) -> openllm.GenerationOutput:
+    """Build a ``GenerationOutput`` from either a plain dict or a protobuf response."""
+    from google.protobuf.json_format import MessageToDict
+    if isinstance(result, dict):
+      return openllm.GenerationOutput(**result)
+    payload = MessageToDict(result.json, preserving_proto_field_name=True)
+    return openllm.GenerationOutput(**payload)
+
+class AsyncGrpcClient(BaseAsyncClient["Response"], client_type="grpc"):
+  """Asynchronous gRPC client; its metadata properties are read from the server's metadata response."""
+  def __init__(self, address: str, timeout: int = 30):
+    # 'address' must be exactly 'host:port'; anything else fails the unpacking below.
+    host, port = address.split(":")
+    self._host, self._port = host, port
+    super().__init__(address, timeout)
+
+  async def health(self) -> health_pb2.HealthCheckResponse:
+    return await self._cached.health("bentoml.grpc.v1.BentoService")
+
+  def _field(self, name: str) -> t.Any:
+    # Fetch one entry of the metadata struct; a KeyError is reported as a malformed endpoint.
+    try:
+      return self._metadata.json.struct_value.fields[name]
+    except KeyError:
+      raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None
+
+  @property
+  def model_name(self) -> str:
+    return self._field("model_name").string_value
+
+  @property
+  def framework(self) -> LiteralRuntime:
+    value = t.cast(LiteralRuntime, self._field("framework").string_value)
+    # An unknown runtime is treated the same way as a missing field.
+    if value not in ("pt", "flax", "tf", "vllm"):
+      raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None
+    return value
+
+  @property
+  def timeout(self) -> int:
+    return int(self._field("timeout").number_value)
+
+  @property
+  def model_id(self) -> str:
+    return self._field("model_id").string_value
+
+  @property
+  def configuration(self) -> dict[str, t.Any]:
+    return orjson.loads(self._field("configuration").string_value)
+
+  @property
+  def supports_embeddings(self) -> bool:
+    return self._field("supports_embeddings").bool_value
+
+  @property
+  def supports_hf_agent(self) -> bool:
+    return self._field("supports_hf_agent").bool_value
+
+  def postprocess(self, result: Response | dict[str, t.Any]) -> openllm.GenerationOutput:
+    """Build a ``GenerationOutput`` from either a plain dict or a protobuf response."""
+    from google.protobuf.json_format import MessageToDict
+    if isinstance(result, dict):
+      return openllm.GenerationOutput(**result)
+    payload = MessageToDict(result.json, preserving_proto_field_name=True)
+    return openllm.GenerationOutput(**payload)
--- a/openllm-python/src/openllm/client/runtimes/http.py
+++ b/openllm-python/src/openllm/client/runtimes/http.py
@@ -0,0 +1,98 @@
+from __future__ import annotations
+import logging, typing as t
+from urllib.parse import urljoin, urlparse
+import httpx, orjson, openllm
+from .base import BaseAsyncClient, BaseClient, in_async_context
+from openllm._typing_compat import DictStrAny, LiteralRuntime
+
+logger = logging.getLogger(__name__)
+def process_address(self: AsyncHTTPClient | HTTPClient, address: str) -> None:
+  """Split ``address`` into host and port and store them on ``self``.
+
+  Args:
+    self: The client instance that receives ``_host`` and ``_port``.
+    address: Server address. ``http://`` is assumed when no ``scheme://`` prefix is present.
+
+  Raises:
+    ValueError: If the address contains a port that is not a valid port number
+      (raised by ``urllib.parse`` when reading ``.port``).
+
+  ``_host`` is stored without IPv6 brackets or ``user:password@`` userinfo, and ``_port`` is always a
+  string. When no port is given it defaults to ``"80"`` for ``http`` and ``"443"`` for any other scheme.
+  """
+  address = address if "://" in address else "http://" + address
+  parsed = urlparse(address)
+  # ``hostname``/``port`` are used instead of splitting ``netloc`` on ":" so that IPv6 literals
+  # ("[::1]:3000"), userinfo ("user:pw@host:3000") and an empty port ("host:") are handled correctly.
+  self._host = parsed.hostname or ""
+  port = parsed.port
+  if port is None: self._port = "80" if parsed.scheme == "http" else "443"
+  else: self._port = str(port)
+
+class HTTPClient(BaseClient[DictStrAny]):
+  """Synchronous OpenLLM client that talks to a server over HTTP.
+
+  The metadata properties below read from ``self._metadata``. Required entries raise
+  ``RuntimeError`` when absent; the ``supports_*`` flags default to ``False``.
+  """
+  def __init__(self, address: str, timeout: int = 30):
+    # Host and port are derived from the address before the base initialiser runs.
+    process_address(self, address)
+    super().__init__(address, timeout)
+
+  def health(self) -> t.Any:
+    """Return the result of the cached client's ``health()`` call."""
+    return self._cached.health()
+  def embed(self, prompt: t.Sequence[str] | str) -> openllm.EmbeddingsOutput:
+    """Request embeddings for one prompt or a sequence of prompts.
+
+    Raises:
+      ValueError: If the server metadata says embeddings are not supported.
+    """
+    if not self.supports_embeddings: raise ValueError("This model does not support embeddings.")
+    if isinstance(prompt, str): prompt = [prompt]
+    # Inside an async context the request is posted directly with httpx instead of going through ``self.call``.
+    if in_async_context(): result = httpx.post(urljoin(self._address, f"/{self._api_version}/embeddings"), json=list(prompt), timeout=self.timeout).json()
+    else: result = self.call("embeddings", list(prompt))
+    return openllm.EmbeddingsOutput(**result)
+
+  @property
+  def model_name(self) -> str:
+    """The ``model_name`` metadata entry."""
+    try: return self._metadata["model_name"]
+    except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None
+  @property
+  def model_id(self) -> str:
+    """The ``model_id`` metadata entry."""
+    # Fixed: this previously returned the "model_name" entry (copy-paste from ``model_name``).
+    try: return self._metadata["model_id"]
+    except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None
+  @property
+  def framework(self) -> LiteralRuntime:
+    """The ``framework`` metadata entry."""
+    try: return self._metadata["framework"]
+    except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None
+  @property
+  def timeout(self) -> int:
+    """The ``timeout`` metadata entry."""
+    try: return self._metadata["timeout"]
+    except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None
+  @property
+  def configuration(self) -> dict[str, t.Any]:
+    """The ``configuration`` metadata entry, decoded from JSON with ``orjson``."""
+    try: return orjson.loads(self._metadata["configuration"])
+    except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None
+  @property
+  def supports_embeddings(self) -> bool:
+    """The ``supports_embeddings`` metadata flag; ``False`` when the entry is absent."""
+    # ``.get`` with a default never raises ``KeyError``, so no error translation is needed here.
+    return self._metadata.get("supports_embeddings", False)
+  @property
+  def supports_hf_agent(self) -> bool:
+    """The ``supports_hf_agent`` metadata flag; ``False`` when the entry is absent."""
+    return self._metadata.get("supports_hf_agent", False)
+  def postprocess(self, result: dict[str, t.Any]) -> openllm.GenerationOutput:
+    """Build an ``openllm.GenerationOutput`` from the fields in ``result``."""
+    return openllm.GenerationOutput(**result)
+
+class AsyncHTTPClient(BaseAsyncClient[DictStrAny]):
+  """Asynchronous OpenLLM client that talks to a server over HTTP.
+
+  The metadata properties below read from ``self._metadata``. Required entries raise
+  ``RuntimeError`` when absent; the ``supports_*`` flags default to ``False``.
+  """
+  def __init__(self, address: str, timeout: int = 30):
+    # Host and port are derived from the address before the base initialiser runs.
+    process_address(self, address)
+    super().__init__(address, timeout)
+
+  async def health(self) -> t.Any:
+    """Await and return the cached client's ``async_health()`` call."""
+    return await self._cached.async_health()
+  async def embed(self, prompt: t.Sequence[str] | str) -> openllm.EmbeddingsOutput:
+    """Request embeddings for one prompt or a sequence of prompts.
+
+    Raises:
+      ValueError: If the server metadata says embeddings are not supported.
+    """
+    if not self.supports_embeddings: raise ValueError("This model does not support embeddings.")
+    if isinstance(prompt, str): prompt = [prompt]
+    res = await self.acall("embeddings", list(prompt))
+    return openllm.EmbeddingsOutput(**res)
+
+  @property
+  def model_name(self) -> str:
+    """The ``model_name`` metadata entry."""
+    try: return self._metadata["model_name"]
+    except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None
+  @property
+  def model_id(self) -> str:
+    """The ``model_id`` metadata entry."""
+    # Fixed: this previously returned the "model_name" entry (copy-paste from ``model_name``).
+    try: return self._metadata["model_id"]
+    except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None
+  @property
+  def framework(self) -> LiteralRuntime:
+    """The ``framework`` metadata entry."""
+    try: return self._metadata["framework"]
+    except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None
+  @property
+  def timeout(self) -> int:
+    """The ``timeout`` metadata entry."""
+    try: return self._metadata["timeout"]
+    except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None
+  @property
+  def configuration(self) -> dict[str, t.Any]:
+    """The ``configuration`` metadata entry, decoded from JSON with ``orjson``."""
+    try: return orjson.loads(self._metadata["configuration"])
+    except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None
+  @property
+  def supports_embeddings(self) -> bool:
+    """The ``supports_embeddings`` metadata flag; ``False`` when the entry is absent."""
+    # ``.get`` with a default never raises ``KeyError``, so no error translation is needed here.
+    return self._metadata.get("supports_embeddings", False)
+  @property
+  def supports_hf_agent(self) -> bool:
+    """The ``supports_hf_agent`` metadata flag; ``False`` when the entry is absent."""
+    return self._metadata.get("supports_hf_agent", False)
+  def postprocess(self, result: dict[str, t.Any]) -> openllm.GenerationOutput:
+    """Build an ``openllm.GenerationOutput`` from the fields in ``result``."""
+    return openllm.GenerationOutput(**result)
--- a/openllm-python/src/openllm/exceptions.py
+++ b/openllm-python/src/openllm/exceptions.py
@@ -0,0 +1,19 @@
+"""Base exceptions for OpenLLM. This extends BentoML exceptions."""
+from __future__ import annotations
+import bentoml
+class OpenLLMException(bentoml.exceptions.BentoMLException):
+  """Base class for all OpenLLM exceptions.
+
+  Subclasses ``bentoml.exceptions.BentoMLException``, so handlers for BentoML errors also catch these.
+  """
+class GpuNotAvailableError(OpenLLMException):
+  """Raised when no GPU is available on the given system."""
+class ValidationError(OpenLLMException):
+  """Raised when a validation check fails."""
+class ForbiddenAttributeError(OpenLLMException):
+  """Raised when an ``_internal`` field is used."""
+class MissingAnnotationAttributeError(OpenLLMException):
+  """Raised when a field declared under ``openllm.LLMConfig`` is missing its annotation."""
+class MissingDependencyError(BaseException):
+  """Raised when a dependency is missing.
+
+  Derives from ``BaseException`` rather than ``OpenLLMException``, so it is not caught by
+  ``except Exception`` or ``except OpenLLMException`` handlers.
+  """
+class Error(BaseException):
+  """To be used instead of a naked ``raise``.
+
+  Derives from ``BaseException``, so it is not caught by ``except Exception`` handlers.
+  """
+class FineTuneStrategyNotSupportedError(OpenLLMException):
+  """Raised when a fine-tune strategy is not supported for the given LLM."""
--- a/openllm-python/src/openllm/models/init.py
+++ b/openllm-python/src/openllm/models/init.py
@@ -0,0 +1,11 @@
+# This file is generated by tools/update-models-import.py. DO NOT EDIT MANUALLY!
+# To update this, run ./tools/update-models-import.py
+from __future__ import annotations
+import typing as t, os
+from openllm.utils import LazyModule
+# Names of the model subpackages registered with the lazy module below.
+_MODELS: set[str] = {"auto", "baichuan", "chatglm", "dolly_v2", "falcon", "flan_t5", "gpt_neox", "llama", "mpt", "opt", "stablelm", "starcoder"}
+# Imported only for static type checkers; never executed at runtime.
+if t.TYPE_CHECKING: from . import auto as auto,baichuan as baichuan,chatglm as chatglm,dolly_v2 as dolly_v2,falcon as falcon,flan_t5 as flan_t5,gpt_neox as gpt_neox,llama as llama,mpt as mpt,opt as opt,stablelm as stablelm,starcoder as starcoder
+# NOTE(review): ``os.path.abspath("__file__")`` resolves the literal string "__file__", not this module's
+# ``__file__`` -- confirm whether that is intended. This file is generated, so any fix belongs in
+# tools/update-models-import.py.
+__lazy=LazyModule(__name__, os.path.abspath("__file__"), {k: [] for k in _MODELS})
+# Module-level ``__all__``, ``__dir__`` and ``__getattr__`` are taken from the lazy module object.
+__all__=__lazy.__all__
+__dir__=__lazy.__dir__
+__getattr__=__lazy.__getattr__
--- a/openllm-python/src/openllm/models/auto/init.py
+++ b/openllm-python/src/openllm/models/auto/init.py
@@ -0,0 +1,45 @@
+from __future__ import annotations
+import typing as t, os
+import openllm
+from openllm.utils import LazyModule, is_flax_available, is_tf_available, is_torch_available, is_vllm_available
+
+# Maps each submodule name to the public names it provides; extended below per available backend.
+_import_structure: dict[str, list[str]] = {"configuration_auto": ["AutoConfig", "CONFIG_MAPPING", "CONFIG_MAPPING_NAMES"], "modeling_auto": ["MODEL_MAPPING_NAMES"], "modeling_flax_auto": ["MODEL_FLAX_MAPPING_NAMES"], "modeling_tf_auto": ["MODEL_TF_MAPPING_NAMES"], "modeling_vllm_auto": ["MODEL_VLLM_MAPPING_NAMES"]}
+# Imported only for static type checkers; never executed at runtime.
+if t.TYPE_CHECKING:
+  from .configuration_auto import (
+    CONFIG_MAPPING as CONFIG_MAPPING,
+    CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES,
+    AutoConfig as AutoConfig,
+  )
+  from .modeling_auto import MODEL_MAPPING_NAMES as MODEL_MAPPING_NAMES
+  from .modeling_flax_auto import MODEL_FLAX_MAPPING_NAMES as MODEL_FLAX_MAPPING_NAMES
+  from .modeling_tf_auto import MODEL_TF_MAPPING_NAMES as MODEL_TF_MAPPING_NAMES
+  from .modeling_vllm_auto import MODEL_VLLM_MAPPING_NAMES as MODEL_VLLM_MAPPING_NAMES
+# Each block below registers a backend's Auto class and mapping only when that backend is available:
+# the raise/except pair skips the ``else`` branch when the availability check returns False.
+# PyTorch backend.
+try:
+  if not is_torch_available(): raise openllm.exceptions.MissingDependencyError
+except openllm.exceptions.MissingDependencyError: pass
+else:
+  _import_structure["modeling_auto"].extend(["AutoLLM", "MODEL_MAPPING"])
+  if t.TYPE_CHECKING: from .modeling_auto import MODEL_MAPPING as MODEL_MAPPING, AutoLLM as AutoLLM
+# vLLM backend.
+try:
+  if not is_vllm_available(): raise openllm.exceptions.MissingDependencyError
+except openllm.exceptions.MissingDependencyError: pass
+else:
+  _import_structure["modeling_vllm_auto"].extend(["AutoVLLM", "MODEL_VLLM_MAPPING"])
+  if t.TYPE_CHECKING: from .modeling_vllm_auto import MODEL_VLLM_MAPPING as MODEL_VLLM_MAPPING, AutoVLLM as AutoVLLM
+# Flax backend.
+try:
+  if not is_flax_available(): raise openllm.exceptions.MissingDependencyError
+except openllm.exceptions.MissingDependencyError: pass
+else:
+  _import_structure["modeling_flax_auto"].extend(["AutoFlaxLLM", "MODEL_FLAX_MAPPING"])
+  if t.TYPE_CHECKING: from .modeling_flax_auto import MODEL_FLAX_MAPPING as MODEL_FLAX_MAPPING, AutoFlaxLLM as AutoFlaxLLM
+# TensorFlow backend.
+try:
+  if not is_tf_available(): raise openllm.exceptions.MissingDependencyError
+except openllm.exceptions.MissingDependencyError: pass
+else:
+  _import_structure["modeling_tf_auto"].extend(["AutoTFLLM", "MODEL_TF_MAPPING"])
+  if t.TYPE_CHECKING: from .modeling_tf_auto import MODEL_TF_MAPPING as MODEL_TF_MAPPING, AutoTFLLM as AutoTFLLM
+
+# Pass this module's real file path. The previous ``os.path.abspath("__file__")`` resolved the literal
+# string "__file__" relative to the current working directory instead of pointing at this file.
+__lazy=LazyModule(__name__, os.path.abspath(globals()["__file__"]), _import_structure)
+# Module-level ``__all__``, ``__dir__`` and ``__getattr__`` are taken from the lazy module object.
+__all__=__lazy.__all__
+__dir__=__lazy.__dir__
+__getattr__=__lazy.__getattr__
--- a/openllm-python/src/openllm/models/auto/configuration_auto.py
+++ b/openllm-python/src/openllm/models/auto/configuration_auto.py
@@ -0,0 +1,64 @@
+# mypy: disable-error-code="type-arg"
+from __future__ import annotations
+import typing as t
+from collections import OrderedDict
+
+import inflection, openllm
+from openllm.utils import ReprMixin
+
+if t.TYPE_CHECKING:
+  import types
+  from openllm._typing_compat import LiteralString
+  from collections import _odict_items, _odict_keys, _odict_values
+  ConfigKeysView = _odict_keys[str, type[openllm.LLMConfig]]
+  ConfigValuesView = _odict_values[str, type[openllm.LLMConfig]]
+  ConfigItemsView = _odict_items[str, type[openllm.LLMConfig]]
+
+# NOTE: This is the entrypoint when adding new model config
+# Each pair is (model name, name of its config class); insertion order is preserved.
+CONFIG_MAPPING_NAMES = OrderedDict([
+    ("chatglm", "ChatGLMConfig"),
+    ("dolly_v2", "DollyV2Config"),
+    ("falcon", "FalconConfig"),
+    ("flan_t5", "FlanT5Config"),
+    ("gpt_neox", "GPTNeoXConfig"),
+    ("llama", "LlamaConfig"),
+    ("mpt", "MPTConfig"),
+    ("opt", "OPTConfig"),
+    ("stablelm", "StableLMConfig"),
+    ("starcoder", "StarCoderConfig"),
+    ("baichuan", "BaichuanConfig"),
+])
+
+class _LazyConfigMapping(OrderedDict, ReprMixin):
+  """Mapping from model name to config class that resolves the class on first access.
+
+  ``mapping`` holds ``model name -> config class name`` pairs. Classes registered at runtime through
+  ``register`` live in ``_extra_content`` and take precedence on lookup.
+  """
+  def __init__(self, mapping: OrderedDict[LiteralString, LiteralString]):
+    self._mapping = mapping
+    self._extra_content: dict[str, t.Any] = {}
+    # Cache of already-resolved modules, keyed by underscored model name.
+    self._modules: dict[str, types.ModuleType] = {}
+  def __getitem__(self, key: str) -> t.Any:
+    """Return the config class for ``key``.
+
+    Raises:
+      KeyError: If neither ``key`` nor its underscored form is known.
+    """
+    if key in self._extra_content: return self._extra_content[key]
+    if key not in self._mapping:
+      # Retry with the underscored spelling of the key before giving up.
+      if inflection.underscore(key) in self._mapping: return self.__getitem__(inflection.underscore(key))
+      raise KeyError(key)
+    value, module_name = self._mapping[key], inflection.underscore(key)
+    if module_name not in self._modules: self._modules[module_name] = openllm.utils.EnvVarMixin(module_name).module
+    if hasattr(self._modules[module_name], value): return getattr(self._modules[module_name], value)
+    # Some of the mappings have entries model_type -> config of another model type. In that case we try to grab the object at the top level.
+    return getattr(openllm, value)
+  # Added: the underlying OrderedDict storage is never populated, so without this override ``len()``
+  # was always 0 and the mapping was falsy even though iteration and ``in`` reported entries.
+  # ``register`` rejects keys already in ``_mapping``, so the two key sets do not overlap.
+  def __len__(self) -> int: return len(self._mapping) + len(self._extra_content)
+  @property
+  def __repr_keys__(self) -> set[str]: return set(self._mapping.keys())
+  def __repr__(self) -> str: return ReprMixin.__repr__(self)
+  def __repr_args__(self) -> t.Generator[tuple[str, t.Any], t.Any, t.Any]: yield from self._mapping.items()
+  # keys(), values() and items() return plain lists (cast to the view types for type checkers),
+  # listing built-in entries first and registered ones after.
+  def keys(self) -> ConfigKeysView: return t.cast("ConfigKeysView", list(self._mapping.keys()) + list(self._extra_content.keys()))
+  def values(self) -> ConfigValuesView: return t.cast("ConfigValuesView", [self[k] for k in self._mapping.keys()] + list(self._extra_content.values()))
+  def items(self) -> ConfigItemsView: return t.cast("ConfigItemsView", [(k, self[k]) for k in self._mapping.keys()] + list(self._extra_content.items()))
+  def __iter__(self) -> t.Iterator[str]: return iter(list(self._mapping.keys()) + list(self._extra_content.keys()))
+  def __contains__(self, item: t.Any) -> bool: return item in self._mapping or item in self._extra_content
+  def register(self, key: str, value: t.Any) -> None:
+    """Register ``value`` under ``key``.
+
+    Raises:
+      ValueError: If ``key`` is already one of the built-in names.
+    """
+    if key in self._mapping.keys(): raise ValueError(f"'{key}' is already used by a OpenLLM config, pick another name.")
+    self._extra_content[key] = value
+
+# Lazy mapping of model name -> config class, built from CONFIG_MAPPING_NAMES.
+CONFIG_MAPPING: dict[str, type[openllm.LLMConfig]] = _LazyConfigMapping(CONFIG_MAPPING_NAMES)
+# The below handle special alias when we call underscore to the name directly without processing camelcase first.
+# Keys are the underscored spellings; values are the corresponding CONFIG_MAPPING_NAMES keys.
+CONFIG_NAME_ALIASES: dict[str, str] = {"chat_glm": "chatglm", "stable_lm": "stablelm", "star_coder": "starcoder", "gpt_neo_x": "gpt_neox",}
+
+class AutoConfig:
+  """Factory for looking up OpenLLM config classes by model name. Not meant to be instantiated."""
+  def __init__(self, *_: t.Any, **__: t.Any): raise EnvironmentError("Cannot instantiate AutoConfig directly. Please use `AutoConfig.for_model(model_name)` instead.")
+  @classmethod
+  def for_model(cls, model_name: str, **attrs: t.Any) -> openllm.LLMConfig:
+    """Construct the config for ``model_name`` via its class's ``model_construct_env(**attrs)``.
+
+    Args:
+      model_name: Model name; it is underscored before lookup.
+      **attrs: Passed through to ``model_construct_env``.
+
+    Raises:
+      ValueError: If no config class is known for the name.
+    """
+    model_name = inflection.underscore(model_name)
+    # Fall back to the alias table only when the underscored name is unknown, so names that resolved
+    # before keep resolving to the same class. This makes e.g. "ChatGLM" (underscored to "chat_glm")
+    # work here the same way it does in ``infer_class_from_name``.
+    if model_name not in CONFIG_MAPPING: model_name = CONFIG_NAME_ALIASES.get(model_name, model_name)
+    if model_name in CONFIG_MAPPING: return CONFIG_MAPPING[model_name].model_construct_env(**attrs)
+    raise ValueError(f"Unrecognized configuration class for {model_name}. Model name should be one of {', '.join(CONFIG_MAPPING.keys())}.")
+  @classmethod
+  def infer_class_from_name(cls, name: str) -> type[openllm.LLMConfig]:
+    """Return the config class for ``name`` after underscoring it and applying ``CONFIG_NAME_ALIASES``.
+
+    Raises:
+      ValueError: If no config class is known for the name.
+    """
+    model_name = inflection.underscore(name)
+    if model_name in CONFIG_NAME_ALIASES: model_name = CONFIG_NAME_ALIASES[model_name]
+    if model_name in CONFIG_MAPPING: return CONFIG_MAPPING[model_name]
+    raise ValueError(f"Unrecognized configuration class for {model_name}. Model name should be one of {', '.join(CONFIG_MAPPING.keys())}.")
--- a/openllm-python/src/openllm/models/auto/factory.py
+++ b/openllm-python/src/openllm/models/auto/factory.py
@@ -0,0 +1,122 @@
+# mypy: disable-error-code="type-arg"
+from __future__ import annotations
+import importlib, inspect, logging, typing as t
+from collections import OrderedDict
+import inflection, openllm
+from openllm.utils import ReprMixin
+
+if t.TYPE_CHECKING:
+  from openllm._typing_compat import LiteralString, LLMRunner
+  import types
+  from collections import _odict_items, _odict_keys, _odict_values
+
+  from _typeshed import SupportsIter
+  ConfigModelKeysView = _odict_keys[type[openllm.LLMConfig], type[openllm.LLM[t.Any, t.Any]]]
+  ConfigModelValuesView = _odict_values[type[openllm.LLMConfig], type[openllm.LLM[t.Any, t.Any]]]
+  ConfigModelItemsView = _odict_items[type[openllm.LLMConfig], type[openllm.LLM[t.Any, t.Any]]]
+
+logger = logging.getLogger(__name__)
+
+class BaseAutoLLMClass:
+  """Base for the ``Auto*`` factories. Subclasses set ``_model_mapping``; instances are never created."""
+  _model_mapping: t.ClassVar[_LazyAutoMapping]
+  # The message previously pointed at a non-existent ``.Runner(model_name)``; it now names the
+  # classmethods this class actually provides.
+  def __init__(self, *args: t.Any, **attrs: t.Any): raise EnvironmentError(f"Cannot instantiate {self.__class__.__name__} directly. Please use '{self.__class__.__name__}.for_model(model_name)' or '{self.__class__.__name__}.create_runner(model_name)' instead.")
+  @classmethod
+  def for_model(cls, model: str, /, model_id: str | None = None, model_version: str | None = None, llm_config: openllm.LLMConfig | None = None, ensure_available: bool = False, **attrs: t.Any) -> openllm.LLM[t.Any, t.Any]:
+    """The lower level API for creating a LLM instance.
+
+    ```python
+    >>> import openllm
+    >>> llm = openllm.AutoLLM.for_model("flan-t5")
+    ```
+
+    Args:
+    model: The model name used to pick the LLM class.
+    model_id: Passed to the class's ``from_pretrained``.
+    model_version: Passed to the class's ``from_pretrained``.
+    llm_config: Passed to the class's ``from_pretrained``.
+    ensure_available: When true, ``ensure_model_id_exists()`` is called on the new instance.
+    **attrs: Additional keyword arguments passed to ``from_pretrained``.
+
+    Returns:
+    The LLM instance.
+    """
+    llm = cls.infer_class_from_name(model).from_pretrained(model_id=model_id, model_version=model_version, llm_config=llm_config, **attrs)
+    if ensure_available: llm.ensure_model_id_exists()
+    return llm
+  @classmethod
+  def create_runner(cls, model: str, model_id: str | None = None, **attrs: t.Any) -> LLMRunner[t.Any, t.Any]:
+    """Create a LLM Runner for the given model name.
+
+    Args:
+    model: The model name to instantiate.
+    model_id: The pretrained model name to instantiate.
+    **attrs: Additional keyword arguments. Those matching a parameter of ``LLM.to_runner`` are passed
+    to ``to_runner``; the rest are passed to ``for_model``.
+
+    Returns:
+    The runner returned by the LLM's ``to_runner``.
+    """
+    runner_kwargs_name = set(inspect.signature(openllm.LLM[t.Any, t.Any].to_runner).parameters)
+    runner_attrs = {k: v for k, v in attrs.items() if k in runner_kwargs_name}
+    # Remove the runner-only keys so they are not passed to ``for_model`` as well.
+    for k in runner_attrs: del attrs[k]
+    return cls.for_model(model, model_id=model_id, **attrs).to_runner(**runner_attrs)
+  @classmethod
+  def register(cls, config_class: type[openllm.LLMConfig], llm_class: type[openllm.LLM[t.Any, t.Any]]) -> None:
+    """Register a new model for this class.
+
+    Args:
+    config_class: The configuration corresponding to the model to register.
+    llm_class: The runnable to register.
+
+    Raises:
+    ValueError: If ``llm_class.config_class`` exists and is not ``config_class``.
+    """
+    if hasattr(llm_class, "config_class") and llm_class.config_class is not config_class:
+      raise ValueError(f"The model class you are passing has a `config_class` attribute that is not consistent with the config class you passed (model has {llm_class.config_class} and you passed {config_class}. Fix one of those so they match!")
+    cls._model_mapping.register(config_class, llm_class)
+  @classmethod
+  def infer_class_from_name(cls, name: str) -> type[openllm.LLM[t.Any, t.Any]]:
+    """Return the LLM class registered for the config class that ``name`` resolves to.
+
+    Raises:
+    ValueError: If the resolved config class is not in this class's model mapping.
+    """
+    config_class = openllm.AutoConfig.infer_class_from_name(name)
+    if config_class in cls._model_mapping: return cls._model_mapping[config_class]
+    raise ValueError(f"Unrecognized configuration class ({config_class}) for {name}. Model name should be one of {', '.join(openllm.CONFIG_MAPPING.keys())} (Registered configuration class: {', '.join([i.__name__ for i in cls._model_mapping.keys()])}).")
+
+def getattribute_from_module(module: types.ModuleType, attr: t.Any) -> t.Any:
+  """Look up ``attr`` on ``module``, falling back to the top-level ``openllm`` package.
+
+  Args:
+    module: Module searched first.
+    attr: ``None`` (returns ``None``), a tuple of names (each resolved, returned as a tuple),
+      or a single attribute name.
+
+  Raises:
+    ValueError: If the name is found neither on ``module`` nor on ``openllm``.
+  """
+  if attr is None:
+    return None
+  if isinstance(attr, tuple):
+    resolved = []
+    for name in attr:
+      resolved.append(getattribute_from_module(module, name))
+    return tuple(resolved)
+  if hasattr(module, attr):
+    return getattr(module, attr)
+  # Some of the mappings have entries model_type -> object of another model type. In that case we try to grab the object at the top level.
+  fallback = importlib.import_module("openllm")
+  if module != fallback:
+    try:
+      return getattribute_from_module(fallback, attr)
+    except ValueError:
+      raise ValueError(f"Could not find {attr} neither in {module} nor in {fallback}!") from None
+  raise ValueError(f"Could not find {attr} in {fallback}!")
+
+class _LazyAutoMapping(OrderedDict, ReprMixin):
+  """Based on transformers.models.auto.configuration_auto._LazyAutoMapping.
+
+  This OrderedDict values() and keys() returns the list instead, so you don't
+  have to do list(mapping.values()) to get the list of values.
+
+  Keys are config classes and values are LLM classes. Both are stored by name and imported from
+  ``openllm.models.<model type>`` on access; entries added through ``register`` are kept as objects.
+  """
+  def __init__(self, config_mapping: OrderedDict[LiteralString, LiteralString], model_mapping: OrderedDict[LiteralString, LiteralString]):
+    self._config_mapping = config_mapping
+    # Config class name -> model type, used to go from a config class back to its model type.
+    self._reverse_config_mapping = {v: k for k, v in config_mapping.items()}
+    self._model_mapping = model_mapping
+    self._extra_content: dict[t.Any, t.Any] = {}
+    # Cache of imported ``openllm.models`` submodules, keyed by underscored model type.
+    self._modules: dict[str, types.ModuleType] = {}
+  def __getitem__(self, key: type[openllm.LLMConfig]) -> type[openllm.LLM[t.Any, t.Any]]:
+    # Registered entries win over built-in ones.
+    if key in self._extra_content: return self._extra_content[key]
+    # Raises KeyError when the config class name is not a known built-in.
+    model_type = self._reverse_config_mapping[key.__name__]
+    if model_type in self._model_mapping: return self._load_attr_from_module(model_type, self._model_mapping[model_type])
+    # Maybe there was several model types associated with this config.
+    model_types = [k for k, v in self._config_mapping.items() if v == key.__name__]
+    for mtype in model_types:
+      if mtype in self._model_mapping: return self._load_attr_from_module(mtype, self._model_mapping[mtype])
+    raise KeyError(key)
+  def _load_attr_from_module(self, model_type: str, attr: str) -> t.Any:
+    # Import ``openllm.models.<model_type>`` once, then resolve ``attr`` on it.
+    module_name = inflection.underscore(model_type)
+    if module_name not in self._modules: self._modules[module_name] = importlib.import_module(f".{module_name}", "openllm.models")
+    return getattribute_from_module(self._modules[module_name], attr)
+  # Counts model types present in both mappings, plus registered entries.
+  def __len__(self) -> int: return len(set(self._config_mapping.keys()).intersection(self._model_mapping.keys())) + len(self._extra_content)
+  @property
+  def __repr_keys__(self) -> set[str]: return set(self._config_mapping.keys())
+  def __repr__(self) -> str: return ReprMixin.__repr__(self)
+  # Yields (model type, (config class name, model class name)) for model types present in both mappings.
+  def __repr_args__(self) -> t.Generator[tuple[str, tuple[str, str]], t.Any, t.Any]: yield from ((key, (value, self._model_mapping[key])) for key, value in self._config_mapping.items() if key in self._model_mapping)
+  # NOTE: truthiness goes through keys(), which resolves (imports) every built-in entry.
+  def __bool__(self) -> bool: return bool(self.keys())
+  # keys(), values() and items() return plain lists (cast to the view types for type checkers);
+  # built-in entries are resolved from their modules first, registered entries are appended after.
+  def keys(self) -> ConfigModelKeysView: return t.cast("ConfigModelKeysView", [self._load_attr_from_module(key, name) for key, name in self._config_mapping.items() if key in self._model_mapping.keys()] + list(self._extra_content.keys()))
+  def values(self) -> ConfigModelValuesView: return t.cast("ConfigModelValuesView", [self._load_attr_from_module(key, name) for key, name in self._model_mapping.items() if key in self._config_mapping.keys()] + list(self._extra_content.values()))
+  def items(self) -> ConfigModelItemsView: return t.cast("ConfigModelItemsView", [(self._load_attr_from_module(key, self._config_mapping[key]), self._load_attr_from_module(key, self._model_mapping[key])) for key in self._model_mapping.keys() if key in self._config_mapping.keys()] + list(self._extra_content.items()))
+  def __iter__(self) -> t.Iterator[type[openllm.LLMConfig]]: return iter(t.cast("SupportsIter[t.Iterator[type[openllm.LLMConfig]]]", self.keys()))
+  def __contains__(self, item: t.Any) -> bool:
+    # Membership is decided by name, without importing any model module.
+    if item in self._extra_content: return True
+    if not hasattr(item, "__name__") or item.__name__ not in self._reverse_config_mapping: return False
+    return self._reverse_config_mapping[item.__name__] in self._model_mapping
+  def register(self, key: t.Any, value: t.Any) -> None:
+    # Refuse to shadow a built-in config class that already has a model in this mapping.
+    if hasattr(key, "__name__") and key.__name__ in self._reverse_config_mapping:
+      if self._reverse_config_mapping[key.__name__] in self._model_mapping.keys(): raise ValueError(f"'{key}' is already used by a OpenLLM model.")
+    self._extra_content[key] = value
+__all__ = ["BaseAutoLLMClass", "_LazyAutoMapping"]
--- a/openllm-python/src/openllm/models/auto/modeling_auto.py
+++ b/openllm-python/src/openllm/models/auto/modeling_auto.py
@@ -0,0 +1,10 @@
+from __future__ import annotations
+import typing as t
+from collections import OrderedDict
+from .configuration_auto import CONFIG_MAPPING_NAMES
+from .factory import BaseAutoLLMClass, _LazyAutoMapping
+
+# Each pair is (model type, name of its LLM class).
+MODEL_MAPPING_NAMES = OrderedDict([
+    ("chatglm", "ChatGLM"),
+    ("dolly_v2", "DollyV2"),
+    ("falcon", "Falcon"),
+    ("flan_t5", "FlanT5"),
+    ("gpt_neox", "GPTNeoX"),
+    ("llama", "Llama"),
+    ("mpt", "MPT"),
+    ("opt", "OPT"),
+    ("stablelm", "StableLM"),
+    ("starcoder", "StarCoder"),
+    ("baichuan", "Baichuan"),
+])
+# Lazy mapping from config class to the LLM classes named above.
+MODEL_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_MAPPING_NAMES)
+class AutoLLM(BaseAutoLLMClass):
+  """Auto factory backed by ``MODEL_MAPPING``."""
+  _model_mapping: t.ClassVar = MODEL_MAPPING
--- a/openllm-python/src/openllm/models/auto/modeling_flax_auto.py
+++ b/openllm-python/src/openllm/models/auto/modeling_flax_auto.py
@@ -0,0 +1,10 @@
+from __future__ import annotations
+import typing as t
+from collections import OrderedDict
+from .configuration_auto import CONFIG_MAPPING_NAMES
+from .factory import BaseAutoLLMClass, _LazyAutoMapping
+
+# Each pair is (model type, name of its Flax LLM class).
+MODEL_FLAX_MAPPING_NAMES = OrderedDict([
+    ("flan_t5", "FlaxFlanT5"),
+    ("opt", "FlaxOPT"),
+])
+# Lazy mapping from config class to the LLM classes named above.
+MODEL_FLAX_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FLAX_MAPPING_NAMES)
+class AutoFlaxLLM(BaseAutoLLMClass):
+  """Auto factory backed by ``MODEL_FLAX_MAPPING``."""
+  _model_mapping: t.ClassVar = MODEL_FLAX_MAPPING
--- a/openllm-python/src/openllm/models/auto/modeling_tf_auto.py
+++ b/openllm-python/src/openllm/models/auto/modeling_tf_auto.py
@@ -0,0 +1,10 @@
+from __future__ import annotations
+import typing as t
+from collections import OrderedDict
+from .configuration_auto import CONFIG_MAPPING_NAMES
+from .factory import BaseAutoLLMClass, _LazyAutoMapping
+
+# Each pair is (model type, name of its TensorFlow LLM class).
+MODEL_TF_MAPPING_NAMES = OrderedDict([
+    ("flan_t5", "TFFlanT5"),
+    ("opt", "TFOPT"),
+])
+# Lazy mapping from config class to the LLM classes named above.
+MODEL_TF_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_TF_MAPPING_NAMES)
+class AutoTFLLM(BaseAutoLLMClass):
+  """Auto factory backed by ``MODEL_TF_MAPPING``."""
+  _model_mapping: t.ClassVar = MODEL_TF_MAPPING
--- a/openllm-python/src/openllm/models/auto/modeling_vllm_auto.py
+++ b/openllm-python/src/openllm/models/auto/modeling_vllm_auto.py
@@ -0,0 +1,10 @@
+from __future__ import annotations
+import typing as t
+from collections import OrderedDict
+from .configuration_auto import CONFIG_MAPPING_NAMES
+from .factory import BaseAutoLLMClass, _LazyAutoMapping
+
+# Each pair is (model type, name of its vLLM LLM class).
+MODEL_VLLM_MAPPING_NAMES = OrderedDict([
+    ("baichuan", "VLLMBaichuan"),
+    ("dolly_v2", "VLLMDollyV2"),
+    ("gpt_neox", "VLLMGPTNeoX"),
+    ("mpt", "VLLMMPT"),
+    ("opt", "VLLMOPT"),
+    ("stablelm", "VLLMStableLM"),
+    ("starcoder", "VLLMStarCoder"),
+    ("llama", "VLLMLlama"),
+])
+# Lazy mapping from config class to the LLM classes named above.
+MODEL_VLLM_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_VLLM_MAPPING_NAMES)
+class AutoVLLM(BaseAutoLLMClass):
+  """Auto factory backed by ``MODEL_VLLM_MAPPING``."""
+  _model_mapping: t.ClassVar = MODEL_VLLM_MAPPING
--- a/openllm-python/src/openllm/models/baichuan/init.py
+++ b/openllm-python/src/openllm/models/baichuan/init.py
@@ -0,0 +1,26 @@
+from __future__ import annotations
+import sys, typing as t
+from openllm.exceptions import MissingDependencyError
+from openllm.utils import LazyModule, is_cpm_kernels_available, is_torch_available, is_vllm_available
+
+# Maps each submodule name to the public names it provides; extended below per available backend.
+_import_structure: dict[str, list[str]] = {"configuration_baichuan": ["BaichuanConfig", "START_BAICHUAN_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"]}
+# Imported only for static type checkers; never executed at runtime.
+if t.TYPE_CHECKING:
+  from .configuration_baichuan import (
+    DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE,
+    START_BAICHUAN_COMMAND_DOCSTRING as START_BAICHUAN_COMMAND_DOCSTRING,
+    BaichuanConfig as BaichuanConfig,
+  )
+# The PyTorch implementation is registered only when both torch and cpm_kernels are available.
+try:
+  if not is_torch_available() or not is_cpm_kernels_available(): raise MissingDependencyError
+except MissingDependencyError: pass
+else:
+  _import_structure["modeling_baichuan"] = ["Baichuan"]
+  if t.TYPE_CHECKING: from .modeling_baichuan import Baichuan as Baichuan
+# The vLLM implementation is registered only when vllm is available.
+try:
+  if not is_vllm_available(): raise MissingDependencyError
+except MissingDependencyError: pass
+else:
+  _import_structure["modeling_vllm_baichuan"] = ["VLLMBaichuan"]
+  if t.TYPE_CHECKING: from .modeling_vllm_baichuan import VLLMBaichuan as VLLMBaichuan
+
+# Replace this module in ``sys.modules`` with a LazyModule built from the structure above.
+sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure)
--- a/openllm-python/src/openllm/models/baichuan/configuration_baichuan.py
+++ b/openllm-python/src/openllm/models/baichuan/configuration_baichuan.py
@@ -0,0 +1,40 @@
+from __future__ import annotations
+import openllm
+
+class BaichuanConfig(openllm.LLMConfig):
+  """Baichuan-7B is an open-source, large-scale pre-trained language model developed by Baichuan Intelligent Technology.
+
+  Baichuan-7B is based on Transformer architecture,
+  which contains 7 billion parameters and trained on approximately 1.2 trillion tokens.
+  It supports both Chinese and English languages with a context window length of 4096.
+  It has achieved the best performance among models of the same size on standard Chinese
+  and English benchmarks (C-Eval, MMLU, etc).
+  Refer to [Baichuan-7B's GitHub page](https://github.com/baichuan-inc/Baichuan-7B) for more information.
+  """
+  # Model-level settings, one entry per line.
+  __config__ = {
+      "name_type": "lowercase",
+      "trust_remote_code": True,
+      "timeout": 3600000,
+      "requires_gpu": True,
+      "url": "https://github.com/baichuan-inc/Baichuan-7B",
+      "requirements": ["cpm-kernels", "sentencepiece"],
+      "architecture": "BaiChuanForCausalLM",
+      "default_id": "baichuan-inc/baichuan-7b",
+      "model_ids": [
+          "baichuan-inc/baichuan-7b",
+          "baichuan-inc/baichuan-13b-base",
+          "baichuan-inc/baichuan-13b-chat",
+          "fireballoon/baichuan-vicuna-chinese-7b",
+          "fireballoon/baichuan-vicuna-7b",
+          "hiyouga/baichuan-7b-sft",
+      ],
+  }
+  class GenerationConfig:
+    # Default generation parameters for this model.
+    max_new_tokens: int = 2048
+    top_p: float = 0.7
+    temperature: float = 0.95
+
+# Help text for the Baichuan start command.
+# NOTE(review): the ``\b`` lines are presumably click's marker for "do not rewrap the next paragraph" -- confirm.
+START_BAICHUAN_COMMAND_DOCSTRING = """\
+Run a LLMServer for Baichuan model.
+
+\b
+> See more information about Baichuan at [baichuan-inc/Baichuan-7B](https://github.com/baichuan-inc/Baichuan-7B)
+
+\b
+## Usage
+
+Currently, Baichuan only supports PyTorch. Make sure ``torch`` is available in your system.
+
+\b
+Baichuan Runner will use baichuan-inc/Baichuan-7B as the default model. To change to any other
+saved pretrained Baichuan, provide ``OPENLLM_Baichuan_MODEL_ID='fireballoon/baichuan-vicuna-chinese-7b'``
+or provide `--model-id` flag when running ``openllm start baichuan``:
+
+\b
+$ openllm start baichuan --model-id='fireballoon/baichuan-vicuna-chinese-7b'
+"""
+# Prompt template containing only the ``{instruction}`` placeholder.
+DEFAULT_PROMPT_TEMPLATE = """{instruction}"""
--- a/openllm-python/src/openllm/models/baichuan/modeling_baichuan.py
+++ b/openllm-python/src/openllm/models/baichuan/modeling_baichuan.py
@@ -0,0 +1,17 @@
+from __future__ import annotations
+import typing as t, openllm
+from openllm._prompt import process_prompt
+from .configuration_baichuan import DEFAULT_PROMPT_TEMPLATE
+
+if t.TYPE_CHECKING: import torch, transformers
+else: torch, transformers = openllm.utils.LazyLoader("torch", globals(), "torch"), openllm.utils.LazyLoader("transformers", globals(), "transformers")
+
+class Baichuan(openllm.LLM["transformers.PreTrainedModel", "transformers.PreTrainedTokenizerBase"]):
+  """Baichuan LLM implementation that generates through ``self.model`` and ``self.tokenizer``."""
+  __openllm_internal__ = True
+  def sanitize_parameters(
+      self, prompt: str, max_new_tokens: int | None = None, top_p: float | None = None, temperature: float | None = None, use_default_prompt_template: bool = False, **attrs: t.Any
+  ) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
+    """Return ``(processed prompt, generation kwargs, postprocess kwargs)``.
+
+    The prompt goes through ``process_prompt`` with ``DEFAULT_PROMPT_TEMPLATE``; the generation kwargs
+    hold the three sampling parameters plus ``attrs``; the postprocess kwargs are always empty.
+    """
+    processed_prompt = process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs)
+    generation_kwargs = {"max_new_tokens": max_new_tokens, "top_p": top_p, "temperature": temperature, **attrs}
+    return processed_prompt, generation_kwargs, {}
+  def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str:
+    """Return the first entry of ``generation_result``."""
+    return generation_result[0]
+  def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
+    """Tokenize ``prompt``, run ``self.model.generate`` and return the decoded outputs.
+
+    ``attrs`` are turned into a generation config via ``self.config.model_construct_env``.
+    """
+    encoded = self.tokenizer(prompt, return_tensors="pt").to(self.device)
+    # Generation runs in inference mode under CUDA float16 autocast.
+    with torch.inference_mode(), torch.autocast("cuda", dtype=torch.float16):  # type: ignore[attr-defined]
+      generated = self.model.generate(**encoded, generation_config=self.config.model_construct_env(**attrs).to_generation_config())
+      return self.tokenizer.batch_decode(generated, skip_special_tokens=True)
--- a/openllm-python/src/openllm/models/baichuan/modeling_vllm_baichuan.py
+++ b/openllm-python/src/openllm/models/baichuan/modeling_vllm_baichuan.py
@@ -0,0 +1,10 @@
+from __future__ import annotations
+import typing as t, openllm
+from openllm._prompt import process_prompt
+from .configuration_baichuan import DEFAULT_PROMPT_TEMPLATE
+if t.TYPE_CHECKING: import vllm, transformers
+
+class VLLMBaichuan(openllm.LLM["vllm.LLMEngine", "transformers.PreTrainedTokenizerBase"]):
+  """Baichuan LLM implementation typed against ``vllm.LLMEngine``."""
+  __openllm_internal__ = True
+  tokenizer_id = "local"
+  def sanitize_parameters(
+      self, prompt: str, max_new_tokens: int | None = None, top_p: float | None = None, temperature: float | None = None, use_default_prompt_template: bool = False, **attrs: t.Any
+  ) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
+    """Return ``(processed prompt, generation kwargs, postprocess kwargs)``.
+
+    The prompt goes through ``process_prompt`` with ``DEFAULT_PROMPT_TEMPLATE``; the generation kwargs
+    hold the three sampling parameters plus ``attrs``; the postprocess kwargs are always empty.
+    """
+    processed_prompt = process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs)
+    generation_kwargs = {"max_new_tokens": max_new_tokens, "top_p": top_p, "temperature": temperature, **attrs}
+    return processed_prompt, generation_kwargs, {}
--- a/openllm-python/src/openllm/models/chatglm/init.py
+++ b/openllm-python/src/openllm/models/chatglm/init.py
@@ -0,0 +1,20 @@
+from __future__ import annotations
+import sys, typing as t
+from openllm.exceptions import MissingDependencyError
+from openllm.utils import LazyModule, is_cpm_kernels_available, is_torch_available
+
+# Maps each submodule name to the public names it exposes; handed to LazyModule at the bottom of this file.
+_import_structure: dict[str, list[str]] = {"configuration_chatglm": ["ChatGLMConfig", "START_CHATGLM_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"]}
+if t.TYPE_CHECKING:
+  # Imports visible to type checkers only; this branch never runs at runtime.
+  from .configuration_chatglm import (
+    DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE,
+    START_CHATGLM_COMMAND_DOCSTRING as START_CHATGLM_COMMAND_DOCSTRING,
+    ChatGLMConfig as ChatGLMConfig,
+  )
+try:
+  # The modeling submodule is registered only when both torch and cpm_kernels are reported available.
+  if not is_torch_available() or not is_cpm_kernels_available(): raise MissingDependencyError
+except MissingDependencyError: pass
+else:
+  _import_structure["modeling_chatglm"] = ["ChatGLM"]
+  if t.TYPE_CHECKING: from .modeling_chatglm import ChatGLM as ChatGLM
+
+# Swap this module's sys.modules entry for a LazyModule built from the structure above
+# (presumably deferring the real imports until attribute access -- confirm in openllm.utils).
+sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure)
--- a/openllm-python/src/openllm/models/chatglm/configuration_chatglm.py
+++ b/openllm-python/src/openllm/models/chatglm/configuration_chatglm.py
@@ -0,0 +1,47 @@
+from __future__ import annotations
+import openllm
+
+class ChatGLMConfig(openllm.LLMConfig):
+  """ChatGLM is an open bilingual language model based on [General Language Model (GLM)](https://github.com/THUDM/GLM) framework.
+
+  With the quantization technique, users can deploy locally on consumer-grade graphics cards
+  (only 6GB of GPU memory is required at the INT4 quantization level).
+
+  ChatGLM-6B uses technology similar to ChatGPT, optimized for Chinese QA and dialogue.
+  The model is trained for about 1T tokens of Chinese and English corpus, supplemented by supervised fine-tuning,
+  feedback bootstrap, and reinforcement learning with human feedback.
+  With only about 6.2 billion parameters, the model is able to generate answers that are in line
+  with human preference.
+
+  Refer to [ChatGLM's GitHub page](https://github.com/THUDM/ChatGLM-6B) for more information.
+  """
+  # Static model metadata: naming, remote-code trust, timeout, GPU requirement, extra requirements and the known model ids.
+  __config__ = {"name_type": "lowercase", "trust_remote_code": True, "timeout": 3600000, "requires_gpu": True, "url": "https://github.com/THUDM/ChatGLM-6B", "requirements": ["cpm-kernels", "sentencepiece"], "architecture": "ChatGLMForConditionalGeneration",
+                "default_id": "thudm/chatglm-6b", "model_ids": ["thudm/chatglm-6b", "thudm/chatglm-6b-int8", "thudm/chatglm-6b-int4", "thudm/chatglm2-6b", "thudm/chatglm2-6b-int4"]}
+  # Model-specific options on top of the generation parameters below.
+  retain_history: bool = openllm.LLMConfig.Field(False, description="Whether to retain history given to the model. If set to True, then the model will retain given history.")
+  use_half_precision: bool = openllm.LLMConfig.Field(True, description="Whether to use half precision for model.")
+  # Generation parameter defaults declared for this model.
+  class GenerationConfig:
+    max_new_tokens: int = 2048
+    num_beams: int = 1
+    top_p: float = 0.7
+    temperature: float = 0.95
+
+START_CHATGLM_COMMAND_DOCSTRING = """\
+Run a LLMServer for ChatGLM model.
+
+\b
+> See more information about ChatGLM at [THUDM/ChatGLM-6b](https://huggingface.co/thudm/chatglm-6b)
+
+\b
+## Usage
+
+Currently, ChatGLM only supports PyTorch. Make sure ``torch`` is available in your system.
+
+\b
+ChatGLM Runner will use THUDM/ChatGLM-6b as the default model. To change to any other ChatGLM
+saved pretrained, or a fine-tune ChatGLM, provide ``OPENLLM_CHATGLM_MODEL_ID='thudm/chatglm-6b-int8'``
+or provide `--model-id` flag when running ``openllm start chatglm``:
+
+\b
+$ openllm start chatglm --model-id='thudm/chatglm-6b-int8'
+"""
+DEFAULT_PROMPT_TEMPLATE = """{instruction}"""
--- a/openllm-python/src/openllm/models/chatglm/modeling_chatglm.py
+++ b/openllm-python/src/openllm/models/chatglm/modeling_chatglm.py
@@ -0,0 +1,39 @@
+from __future__ import annotations
+import typing as t, openllm
+if t.TYPE_CHECKING: import torch, transformers, torch.nn.functional as F
+else: torch, transformers, F = openllm.utils.LazyLoader("torch", globals(), "torch"), openllm.utils.LazyLoader("transformers", globals(), "transformers"), openllm.utils.LazyLoader("F", globals(), "torch.nn.functional")
+
+class ChatGLM(openllm.LLM["transformers.PreTrainedModel", "transformers.PreTrainedTokenizerFast"]):
+  # PyTorch ChatGLM class: prompt/history handling, chat generation and embeddings.
+  __openllm_internal__ = True
+
+  def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, num_beams: int | None = None, top_p: float | None = None, temperature: float | None = None, chat_history: list[tuple[str, str]] | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
+    """Build the prompt text together with the generation and postprocess kwargs.
+
+    When ``use_default_prompt_template`` is set and ``chat_history`` is given, the earlier
+    (query, response) pairs are rendered as numbered rounds followed by the new prompt;
+    otherwise ``prompt`` is used unchanged.
+
+    Returns:
+      A ``(prompt_text, generate_kwargs, postprocess_kwargs)`` tuple; ``postprocess_kwargs``
+      carries ``chat_history`` (possibly None) through to ``postprocess_generate``.
+    """
+    if use_default_prompt_template and chat_history is not None:
+      # Collect the rounds and join once rather than growing a string with ``+=`` in the loop.
+      rounds = [f"[Round {i}]\n问:{old_query}\n答:{response}\n" for i, (old_query, response) in enumerate(chat_history)]
+      rounds.append(f"[Round {len(chat_history)}]\n问:{prompt}\n答:")
+      prompt_text = "".join(rounds)
+    else: prompt_text = prompt
+    return prompt_text, {"max_new_tokens": max_new_tokens, "num_beams": num_beams, "top_p": top_p, "temperature": temperature, **attrs}, {"chat_history": chat_history}
+  def postprocess_generate(self, prompt: str, generation_result: tuple[str, list[tuple[str, str]]], *, chat_history: list[tuple[str, str]] | None = None, **attrs: t.Any) -> str:
+    """Return the generated text; when ``config.retain_history`` is set, extend ``chat_history`` in place with the returned history.
+
+    Raises:
+      ValueError: if ``config.retain_history`` is set but no ``chat_history`` list was provided.
+    """
+    generated, history = generation_result
+    if self.config.retain_history:
+      if chat_history is None: raise ValueError("'retain_history' is True while there is no history provided.")
+      chat_history.extend(history)
+    return generated
+  def generate(self, prompt: str, **attrs: t.Any) -> tuple[str, list[tuple[str, str]]]:
+    """Run ``self.model.chat`` on ``prompt`` under inference mode and return its result."""
+    with torch.inference_mode():
+      self.model.eval()
+      # Only use half precision if the model is not yet quantized
+      if self.config.use_half_precision: self.model.half()
+      return self.model.chat(self.tokenizer, prompt, generation_config=self.config.model_construct_env(**attrs).to_generation_config())
+  def embeddings(self, prompts: list[str]) -> openllm.LLMEmbeddings:
+    """Compute one L2-normalized embedding per prompt from the last hidden state and count the input tokens."""
+    embeddings: list[list[float]] = []
+    num_tokens = 0
+    for prompt in prompts:
+      input_ids = self.tokenizer.encode(prompt, return_tensors="pt").to(self.device)
+      with torch.inference_mode():
+        outputs = self.model(input_ids, output_hidden_states=True)
+        # NOTE(review): the transpose + mean over dim 0 presumably relies on the layout of hidden_states[-1] for this model -- confirm.
+        data = F.normalize(torch.mean(outputs.hidden_states[-1].transpose(0, 1), dim=0), p=2, dim=0)
+        embeddings.append(data.tolist())
+        num_tokens += len(input_ids[0])
+    return openllm.LLMEmbeddings(embeddings=embeddings, num_tokens=num_tokens)
--- a/openllm-python/src/openllm/models/dolly_v2/__init__.py
+++ b/openllm-python/src/openllm/models/dolly_v2/__init__.py
@@ -0,0 +1,26 @@
+from __future__ import annotations
+import sys, typing as t
+from openllm.exceptions import MissingDependencyError
+from openllm.utils import LazyModule, is_torch_available, is_vllm_available
+
+# Maps each submodule name to the public names it exposes; handed to LazyModule at the bottom of this file.
+_import_structure: dict[str, list[str]] = {"configuration_dolly_v2": ["DollyV2Config", "START_DOLLY_V2_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"]}
+if t.TYPE_CHECKING:
+  # Imports visible to type checkers only; this branch never runs at runtime.
+  from .configuration_dolly_v2 import (
+    DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE,
+    START_DOLLY_V2_COMMAND_DOCSTRING as START_DOLLY_V2_COMMAND_DOCSTRING,
+    DollyV2Config as DollyV2Config,
+  )
+try:
+  # Register the PyTorch implementation only when torch is reported available.
+  if not is_torch_available(): raise MissingDependencyError
+except MissingDependencyError: pass
+else:
+  _import_structure["modeling_dolly_v2"] = ["DollyV2"]
+  if t.TYPE_CHECKING: from .modeling_dolly_v2 import DollyV2 as DollyV2
+try:
+  # Register the vLLM implementation only when vllm is reported available.
+  if not is_vllm_available(): raise MissingDependencyError
+except MissingDependencyError: pass
+else:
+  _import_structure["modeling_vllm_dolly_v2"] = ["VLLMDollyV2"]
+  if t.TYPE_CHECKING: from .modeling_vllm_dolly_v2 import VLLMDollyV2 as VLLMDollyV2
+
+# Swap this module's sys.modules entry for a LazyModule built from the structure above
+# (presumably deferring the real imports until attribute access -- confirm in openllm.utils).
+sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure)
--- a/openllm-python/src/openllm/models/dolly_v2/configuration_dolly_v2.py
+++ b/openllm-python/src/openllm/models/dolly_v2/configuration_dolly_v2.py
@@ -0,0 +1,76 @@
+from __future__ import annotations
+import typing as t, openllm
+if t.TYPE_CHECKING: import transformers
+
+class DollyV2Config(openllm.LLMConfig):
+  """Databricks' Dolly is an instruction-following large language model trained on the Databricks machine learning platform that is licensed for commercial use.
+
+  Based on pythia-12b, Dolly is trained on ~15k instruction/response fine tuning records databricks-dolly-15k
+  generated by Databricks employees in capability domains from the InstructGPT paper, including brainstorming,
+  classification, closed QA, generation, information extraction, open QA and summarization.
+
+  dolly-v2-12b is not a state-of-the-art model, but does exhibit surprisingly high quality instruction
+  following behavior not characteristic of the foundation model on which it is based.
+
+  Refer to [Databricks's Dolly page](https://github.com/databrickslabs/dolly) for more information.
+  """
+  # Static model metadata: timeout, project url, architecture name and the known model ids.
+  __config__ = {"timeout": 3600000, "url": "https://github.com/databrickslabs/dolly", "architecture": "GPTNeoXForCausalLM",
+                "default_id": "databricks/dolly-v2-3b", "model_ids": ["databricks/dolly-v2-3b", "databricks/dolly-v2-7b", "databricks/dolly-v2-12b"]}
+  return_full_text: bool = openllm.LLMConfig.Field(False, description="Whether to return the full prompt to the users.")
+  # Generation parameter defaults declared for this model.
+  class GenerationConfig:
+    temperature: float = 0.9
+    top_p: float = 0.92
+    top_k: int = 5
+    max_new_tokens: int = 256
+    eos_token_id: int = 50277  # NOTE: from get_special_token_id(self.tokenizer, END_KEY)
+
+START_DOLLY_V2_COMMAND_DOCSTRING = """\
+Run a LLMServer for dolly-v2 model.
+
+\b
+> See more information about dolly-v2 at [databricks/dolly-v2-3b](https://huggingface.co/databricks/dolly-v2-3b)
+
+\b
+## Usage
+
+Currently, dolly-v2 only supports PyTorch. Make sure ``torch`` is available in your system.
+
+\b
+Dolly-v2 Runner will use databricks/dolly-v2-3b as the default model. To change to any other dolly-v2
+saved pretrained, or a fine-tune dolly-v2, provide ``OPENLLM_DOLLY_V2_MODEL_ID='databricks/dolly-v2-7b'``
+or provide `--model-id` flag when running ``openllm start dolly-v2``:
+
+\b
+$ openllm start dolly-v2 --model-id databricks/dolly-v2-7b
+"""
+INSTRUCTION_KEY = "### Instruction:"
+RESPONSE_KEY = "### Response:"
+END_KEY = "### End"
+INTRO_BLURB = "Below is an instruction that describes a task. Write a response that appropriately completes the request."
+# NOTE: This is the prompt that is used for generating responses using an already
+# trained model.  It ends with the response key, where the job of the model is to provide
+# the completion that follows it (i.e. the response itself).
+DEFAULT_PROMPT_TEMPLATE = """{intro}
+{instruction_key}
+{instruction}
+{response_key}
+""".format(intro=INTRO_BLURB, instruction_key=INSTRUCTION_KEY, instruction="{instruction}", response_key=RESPONSE_KEY)
+def get_special_token_id(tokenizer: transformers.PreTrainedTokenizer, key: str) -> int:
+  """Gets the token ID for a given string that has been added to the tokenizer as a special token.
+
+  When training, we configure the tokenizer so that the sequences like "### Instruction:" and "### End" are
+  treated specially and converted to a single, new token.  This retrieves the token ID each of these keys map to.
+
+  Args:
+    tokenizer: the tokenizer
+    key: the key to convert to a single token
+
+  Raises:
+    ValueError: if ``key`` does not encode to exactly one token ID
+
+  Returns:
+    int: the token ID for the given key.
+  """
+  token_ids = tokenizer.encode(key)
+  # Reject an empty encoding too; it would otherwise surface as an IndexError on the return below.
+  if len(token_ids) != 1: raise ValueError(f"Expected only a single token for '{key}' but found {token_ids}")
+  return token_ids[0]
--- a/openllm-python/src/openllm/models/dolly_v2/modeling_dolly_v2.py
+++ b/openllm-python/src/openllm/models/dolly_v2/modeling_dolly_v2.py
@@ -0,0 +1,109 @@
+from __future__ import annotations
+import logging, re, typing as t, openllm
+from openllm._prompt import process_prompt
+from openllm._typing_compat import overload
+from .configuration_dolly_v2 import DEFAULT_PROMPT_TEMPLATE, END_KEY, RESPONSE_KEY, get_special_token_id
+
+if t.TYPE_CHECKING: import torch, transformers, tensorflow as tf
+else: torch, transformers, tf = openllm.utils.LazyLoader("torch", globals(), "torch"), openllm.utils.LazyLoader("transformers", globals(), "transformers"), openllm.utils.LazyLoader("tf", globals(), "tensorflow")
+logger = logging.getLogger(__name__)
+
+@overload
+def get_pipeline(model: transformers.PreTrainedModel, tokenizer: transformers.PreTrainedTokenizer, _init: t.Literal[True], **attrs: t.Any) -> transformers.Pipeline: ...
+@overload
+def get_pipeline(model: transformers.PreTrainedModel, tokenizer: transformers.PreTrainedTokenizer, _init: t.Literal[False] = ..., **attrs: t.Any) -> type[transformers.Pipeline]: ...
+def get_pipeline(model: transformers.PreTrainedModel, tokenizer: transformers.PreTrainedTokenizer, _init: bool = False, **attrs: t.Any) -> type[transformers.Pipeline] | transformers.Pipeline:
+  """Build the Dolly instruction-following text generation pipeline around ``model`` and ``tokenizer``.
+
+  Args:
+    model: the causal LM the pipeline generates with
+    tokenizer: the tokenizer paired with ``model``
+    _init: when True return a constructed pipeline instance, otherwise (the default) return the pipeline class
+    **attrs: extra keyword arguments forwarded to the pipeline constructor when ``_init`` is True
+
+  Returns:
+    The pipeline instance or the pipeline class, depending on ``_init``.
+  """
+  # Lazy loading the pipeline. See databricks' implementation on HuggingFace for more information.
+  class InstructionTextGenerationPipeline(transformers.Pipeline):
+    def __init__(self, *args: t.Any, do_sample: bool = True, max_new_tokens: int = 256, top_p: float = 0.92, top_k: int = 0, **kwargs: t.Any): super().__init__(*args, model=model, tokenizer=tokenizer, do_sample=do_sample, max_new_tokens=max_new_tokens, top_p=top_p, top_k=top_k, **kwargs)
+    def _sanitize_parameters(self, return_full_text: bool | None = None, **generate_kwargs: t.Any) -> tuple[dict[str, t.Any], dict[str, t.Any], dict[str, t.Any]]:
+      """Split keyword arguments into preprocess, forward and postprocess parameter dicts."""
+      if t.TYPE_CHECKING: assert self.tokenizer is not None
+      preprocess_params: dict[str, t.Any] = {}
+      # newer versions of the tokenizer configure the response key as a special token.  newer versions still may
+      # append a newline to yield a single token.  find whatever token is configured for the response key.
+      tokenizer_response_key = next((token for token in self.tokenizer.additional_special_tokens if token.startswith(RESPONSE_KEY)), None)
+      response_key_token_id = None
+      end_key_token_id = None
+      if tokenizer_response_key:
+        try:
+          response_key_token_id = get_special_token_id(self.tokenizer, tokenizer_response_key)
+          end_key_token_id = get_special_token_id(self.tokenizer, END_KEY)
+          # Ensure generation stops once it generates "### End"
+          generate_kwargs["eos_token_id"] = end_key_token_id
+        except ValueError: pass
+      forward_params = generate_kwargs
+      postprocess_params = {"response_key_token_id": response_key_token_id, "end_key_token_id": end_key_token_id}
+      if return_full_text is not None: postprocess_params["return_full_text"] = return_full_text
+      return preprocess_params, forward_params, postprocess_params
+    def preprocess(self, input_: str, **generate_kwargs: t.Any) -> t.Dict[str, t.Any]:
+      """Format the instruction with the default prompt template and tokenize it."""
+      if t.TYPE_CHECKING: assert self.tokenizer is not None
+      prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=input_)
+      inputs = self.tokenizer(prompt_text, return_tensors="pt")
+      inputs["prompt_text"] = prompt_text
+      inputs["instruction_text"] = input_
+      return t.cast(t.Dict[str, t.Any], inputs)
+    def _forward(self, input_tensors: dict[str, t.Any], **generate_kwargs: t.Any) -> transformers.utils.generic.ModelOutput:
+      """Run ``model.generate`` and reshape the output to (input batch, sequences per input, ...)."""
+      if t.TYPE_CHECKING: assert self.tokenizer is not None
+      input_ids, attention_mask = input_tensors["input_ids"], input_tensors.get("attention_mask", None)
+      if input_ids.shape[1] == 0: input_ids, attention_mask, in_b = None, None, 1
+      else: in_b = input_ids.shape[0]
+      generated_sequence = self.model.generate(input_ids=input_ids.to(self.model.device) if input_ids is not None else None, attention_mask=attention_mask.to(self.model.device) if attention_mask is not None else None, pad_token_id=self.tokenizer.pad_token_id, **generate_kwargs)
+      out_b = generated_sequence.shape[0]
+      if self.framework == "pt": generated_sequence = generated_sequence.reshape(in_b, out_b // in_b, *generated_sequence.shape[1:])
+      elif self.framework == "tf": generated_sequence = tf.reshape(generated_sequence, (in_b, out_b // in_b, *generated_sequence.shape[1:]))
+      instruction_text = input_tensors.pop("instruction_text")
+      return {"generated_sequence": generated_sequence, "input_ids": input_ids, "instruction_text": instruction_text}
+    def postprocess(self, model_outputs: dict[str, t.Any], response_key_token_id: int | None, end_key_token_id: int | None, return_full_text: bool = False) -> list[dict[t.Literal["generated_text"], str]]:
+      """Decode each generated sequence down to the response text; both token ids may be None when the tokenizer has no response key."""
+      if t.TYPE_CHECKING: assert self.tokenizer is not None
+      _generated_sequence, instruction_text = model_outputs["generated_sequence"][0], model_outputs["instruction_text"]
+      generated_sequence: list[list[int]] = _generated_sequence.numpy().tolist()
+      records: list[dict[t.Literal["generated_text"], str]] = []
+      for sequence in generated_sequence:
+        # The response will be set to this variable if we can identify it.
+        decoded = None
+        # If we have token IDs for the response and end, then we can find the tokens and only decode between them.
+        # Compare against None explicitly so that a token id of 0 is not mistaken for "missing".
+        if response_key_token_id is not None and end_key_token_id is not None:
+          # Find where "### Response:" is first found in the generated tokens.  Considering this is part of the
+          # prompt, we should definitely find it.  We will return the tokens found after this token.
+          try: response_pos = sequence.index(response_key_token_id)
+          except ValueError: response_pos = None
+          if response_pos is None: logger.warning("Could not find response key %s in: %s", response_key_token_id, sequence)
+          else:
+            # Position 0 is a valid match, hence the explicit None test above rather than truthiness.
+            # Next find where "### End" is located.  The model has been trained to end its responses with this
+            # sequence (or actually, the token ID it maps to, since it is a special token).  We may not find
+            # this token, as the response could be truncated.  If we don't find it then just return everything
+            # to the end.  Note that even though we set eos_token_id, we still see the this token at the end.
+            try: end_pos = sequence.index(end_key_token_id)
+            except ValueError: end_pos = None
+            decoded = self.tokenizer.decode(sequence[response_pos + 1:end_pos]).strip()
+        if not decoded:
+          # Otherwise we'll decode everything and use a regex to find the response and end.
+          fully_decoded = self.tokenizer.decode(sequence)
+          # The response appears after "### Response:".  The model has been trained to append "### End" at the
+          # end.
+          m = re.search(r"#+\s*Response:\s*(.+?)#+\s*End", fully_decoded, flags=re.DOTALL)
+          if m: decoded = m.group(1).strip()
+          else:
+            # The model might not generate the "### End" sequence before reaching the max tokens.  In this case,
+            # return everything after "### Response:".
+            m = re.search(r"#+\s*Response:\s*(.+)", fully_decoded, flags=re.DOTALL)
+            if m: decoded = m.group(1).strip()
+            else: logger.warning("Failed to find response in:\n%s", fully_decoded)
+        # If the full text is requested, then append the decoded text to the original instruction.
+        # This technically isn't the full text, as we format the instruction in the prompt the model has been
+        # trained on, but to the client it will appear to be the full text.
+        if return_full_text: decoded = f"{instruction_text}\n{decoded}"
+        records.append({"generated_text": t.cast(str, decoded)})
+      return records
+  # Forward the caller's keyword arguments (e.g. ``return_full_text``) instead of silently dropping them.
+  return InstructionTextGenerationPipeline(**attrs) if _init else InstructionTextGenerationPipeline
+
+class DollyV2(openllm.LLM["transformers.Pipeline", "transformers.PreTrainedTokenizer"]):
+  # PyTorch Dolly-v2 class whose ``model`` is the instruction pipeline returned by ``get_pipeline``.
+  __openllm_internal__ = True
+  @property
+  def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]:
+    """Return the model kwargs (device map, bfloat16 dtype) paired with an empty second dict."""
+    multi_gpu = torch.cuda.is_available() and torch.cuda.device_count() > 1
+    model_kwargs = {"device_map": "auto" if multi_gpu else None, "torch_dtype": torch.bfloat16}
+    return model_kwargs, {}
+  def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.Pipeline:
+    """Load the causal LM from the bento model path and wrap it in an initialized instruction pipeline."""
+    causal_lm = transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, *args, **attrs)
+    return get_pipeline(causal_lm, self.tokenizer, _init=True, return_full_text=self.config.return_full_text)
+  def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_k: int | None = None, top_p: float | None = None, use_default_prompt_template: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
+    """Return the processed prompt, the generation kwargs, and an empty postprocess-kwargs dict."""
+    rendered_prompt = process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs)
+    generation_kwargs = {"max_new_tokens": max_new_tokens, "top_k": top_k, "top_p": top_p, "temperature": temperature, **attrs}
+    return rendered_prompt, generation_kwargs, {}
+  def postprocess_generate(self, prompt: str, generation_result: list[dict[t.Literal["generated_text"], str]], **_: t.Any) -> str:
+    """Return the ``generated_text`` of the first record."""
+    first_record = generation_result[0]
+    return first_record["generated_text"]
+  def generate(self, prompt: str, **attrs: t.Any) -> list[dict[t.Literal["generated_text"], str]]:
+    """Call the pipeline on ``prompt`` under inference mode using the config built from ``attrs``."""
+    llm_config = self.config.model_construct_env(**attrs)
+    with torch.inference_mode():
+      return self.model(prompt, return_full_text=llm_config.return_full_text, generation_config=llm_config.to_generation_config())
--- a/openllm-python/src/openllm/models/dolly_v2/modeling_vllm_dolly_v2.py
+++ b/openllm-python/src/openllm/models/dolly_v2/modeling_vllm_dolly_v2.py
@@ -0,0 +1,11 @@
+from __future__ import annotations
+import logging, typing as t, openllm
+from openllm._prompt import process_prompt
+from .configuration_dolly_v2 import DEFAULT_PROMPT_TEMPLATE
+if t.TYPE_CHECKING: import vllm, transformers
+
+logger = logging.getLogger(__name__)
+class VLLMDollyV2(openllm.LLM["vllm.LLMEngine", "transformers.PreTrainedTokenizer"]):
+  # vLLM-flavoured Dolly-v2 class; the only behaviour defined here is prompt/parameter sanitization.
+  __openllm_internal__ = True
+  tokenizer_id = "local"
+  def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_k: int | None = None, top_p: float | None = None, use_default_prompt_template: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
+    """Return the processed prompt, the generation kwargs, and an empty postprocess-kwargs dict."""
+    rendered_prompt = process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs)
+    generation_kwargs = {"max_new_tokens": max_new_tokens, "top_k": top_k, "top_p": top_p, "temperature": temperature, **attrs}
+    return rendered_prompt, generation_kwargs, {}
--- a/openllm-python/src/openllm/models/falcon/__init__.py
+++ b/openllm-python/src/openllm/models/falcon/__init__.py
@@ -0,0 +1,20 @@
+from __future__ import annotations
+import sys, typing as t
+from openllm.exceptions import MissingDependencyError
+from openllm.utils import LazyModule, is_torch_available
+
+# Maps each submodule name to the public names it exposes; handed to LazyModule at the bottom of this file.
+_import_structure: dict[str, list[str]] = {"configuration_falcon": ["FalconConfig", "START_FALCON_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"]}
+if t.TYPE_CHECKING:
+  # Imports visible to type checkers only; this branch never runs at runtime.
+  from .configuration_falcon import (
+    DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE,
+    START_FALCON_COMMAND_DOCSTRING as START_FALCON_COMMAND_DOCSTRING,
+    FalconConfig as FalconConfig,
+  )
+try:
+  # Register the PyTorch implementation only when torch is reported available.
+  if not is_torch_available(): raise MissingDependencyError
+except MissingDependencyError: pass
+else:
+  _import_structure["modeling_falcon"] = ["Falcon"]
+  if t.TYPE_CHECKING: from .modeling_falcon import Falcon as Falcon
+
+# Swap this module's sys.modules entry for a LazyModule built from the structure above
+# (presumably deferring the real imports until attribute access -- confirm in openllm.utils).
+sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure)
--- a/openllm-python/src/openllm/models/falcon/configuration_falcon.py
+++ b/openllm-python/src/openllm/models/falcon/configuration_falcon.py
@@ -0,0 +1,43 @@
+from __future__ import annotations
+import openllm
+
+class FalconConfig(openllm.LLMConfig):
+  """Falcon-7B is a 7B parameters causal decoder-only model built by TII and trained on 1,500B tokens of [RefinedWeb](https://huggingface.co/datasets/tiiuae/falcon-refinedweb) enhanced with curated corpora.
+
+  It is made available under the TII Falcon LLM License.
+
+  Refer to [Falcon's HuggingFace page](https://huggingface.co/tiiuae/falcon-7b) for more information.
+  """
+  # Static model metadata: naming, remote-code trust, GPU requirement, timeout, extra requirements,
+  # the known model ids and a LoRA fine-tuning strategy.
+  # NOTE(review): int(36e6) is 36000000, ten times the 3600000 used by the ChatGLM and Dolly configs -- confirm it is intentional.
+  __config__ = {"name_type": "lowercase", "trust_remote_code": True, "requires_gpu": True, "timeout": int(36e6), "url": "https://falconllm.tii.ae/", "requirements": ["einops", "xformers"], "architecture": "FalconForCausalLM",
+                "default_id": "tiiuae/falcon-7b", "model_ids": ["tiiuae/falcon-7b", "tiiuae/falcon-40b", "tiiuae/falcon-7b-instruct", "tiiuae/falcon-40b-instruct"],
+                "fine_tune_strategies": ({"adapter_type": "lora", "r": 64, "lora_alpha": 16, "lora_dropout": 0.1, "bias": "none", "target_modules": ["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"]},)}
+  # Generation parameter defaults declared for this model.
+  class GenerationConfig:
+    max_new_tokens: int = 200
+    top_k: int = 10
+    num_return_sequences: int = 1
+    num_beams: int = 4
+    early_stopping: bool = True
+
+START_FALCON_COMMAND_DOCSTRING = """\
+Run a LLMServer for FalconLM model.
+
+\b
+> See more information about falcon at [tiiuae/falcon-7b](https://huggingface.co/tiiuae/falcon-7b)
+
+\b
+## Usage
+
+Currently, FalconLM only supports PyTorch. Make sure ``torch`` is available in your system.
+
+\b
+FalconLM Runner will use tiiuae/falcon-7b as the default model. To change to any other FalconLM
+saved pretrained, or a fine-tune FalconLM, provide ``OPENLLM_FALCON_MODEL_ID='tiiuae/falcon-7b-instruct'``
+or provide `--model-id` flag when running ``openllm start falcon``:
+
+\b
+$ openllm start falcon --model-id tiiuae/falcon-7b-instruct
+"""
+DEFAULT_PROMPT_TEMPLATE = """{context}
+{user_name}: {instruction}
+{agent}:
+"""
--- a/openllm-python/src/openllm/models/falcon/modeling_falcon.py
+++ b/openllm-python/src/openllm/models/falcon/modeling_falcon.py
@@ -0,0 +1,26 @@
+from __future__ import annotations
+import typing as t, openllm
+from openllm._prompt import process_prompt
+from .configuration_falcon import DEFAULT_PROMPT_TEMPLATE
+if t.TYPE_CHECKING: import torch, transformers
+else: torch, transformers = openllm.utils.LazyLoader("torch", globals(), "torch"), openllm.utils.LazyLoader("transformers", globals(), "transformers")
+
+class Falcon(openllm.LLM["transformers.PreTrainedModel", "transformers.PreTrainedTokenizerBase"]):
+  # PyTorch Falcon class: parameter sanitization, batch generation and stop-sequence generation.
+  __openllm_internal__ = True
+  @property
+  def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]:
+    """Return the model kwargs (bfloat16 dtype, device map) paired with an empty second dict."""
+    return {"torch_dtype": torch.bfloat16, "device_map": "auto" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None}, {}
+  def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, top_k: int | None = None, num_return_sequences: int | None = None, eos_token_id: int | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
+    """Return the processed prompt, the generation kwargs, and an empty postprocess-kwargs dict."""
+    return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "top_k": top_k, "num_return_sequences": num_return_sequences, "eos_token_id": eos_token_id, **attrs}, {}
+  def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str:
+    """Return the first decoded sequence."""
+    return generation_result[0]
+  def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
+    """Generate from ``prompt`` under inference mode with CUDA float16 autocast and decode every returned sequence."""
+    # NOTE(review): ``sanitize_parameters`` emits the ``eos_token_id`` key even when it is None; if that dict reaches
+    # this method unchanged, ``pop`` returns None and the tokenizer default is never used -- confirm the intent.
+    eos_token_id, inputs = attrs.pop("eos_token_id", self.tokenizer.eos_token_id), self.tokenizer(prompt, return_tensors="pt").to(self.device)
+    with torch.inference_mode(), torch.autocast("cuda", dtype=torch.float16):  # type: ignore[attr-defined]
+      return self.tokenizer.batch_decode(self.model.generate(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], generation_config=self.config.model_construct_env(eos_token_id=eos_token_id, **attrs).to_generation_config()), skip_special_tokens=True)
+  def generate_one(self, prompt: str, stop: list[str], **preprocess_generate_kwds: t.Any) -> list[dict[t.Literal["generated_text"], str]]:
+    """Generate a single completion for ``prompt`` that halts on any of the ``stop`` sequences.
+
+    Only the newly generated tokens are decoded, and a trailing stop sequence is removed from the text.
+    """
+    max_new_tokens, encoded_inputs = preprocess_generate_kwds.pop("max_new_tokens", 200), self.tokenizer(prompt, return_tensors="pt").to(self.device)
+    src_len, stopping_criteria = encoded_inputs["input_ids"].shape[1], preprocess_generate_kwds.pop("stopping_criteria", openllm.StoppingCriteriaList([]))
+    stopping_criteria.append(openllm.StopSequenceCriteria(stop, self.tokenizer))
+    result = self.tokenizer.decode(self.model.generate(encoded_inputs["input_ids"], max_new_tokens=max_new_tokens, stopping_criteria=stopping_criteria)[0].tolist()[src_len:])
+    # Inference API returns the stop sequence
+    for stop_seq in stop:
+      # Skip empty stop strings: every string ends with "" and ``result[:-0]`` would erase the whole output.
+      if stop_seq and result.endswith(stop_seq): result = result[:-len(stop_seq)]
+    return [{"generated_text": result}]
--- a/openllm-python/src/openllm/models/flan_t5/__init__.py
+++ b/openllm-python/src/openllm/models/flan_t5/__init__.py
@@ -0,0 +1,32 @@
+from __future__ import annotations
+import sys, typing as t
+from openllm.exceptions import MissingDependencyError
+from openllm.utils import LazyModule, is_flax_available, is_tf_available, is_torch_available
+
+# Maps each submodule name to the public names it exposes; handed to LazyModule at the bottom of this file.
+_import_structure: dict[str, list[str]] = {"configuration_flan_t5": ["FlanT5Config", "START_FLAN_T5_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"]}
+if t.TYPE_CHECKING:
+  # Imports visible to type checkers only; this branch never runs at runtime.
+  from .configuration_flan_t5 import (
+    DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE,
+    START_FLAN_T5_COMMAND_DOCSTRING as START_FLAN_T5_COMMAND_DOCSTRING,
+    FlanT5Config as FlanT5Config,
+  )
+try:
+  # Register the PyTorch implementation only when torch is reported available.
+  if not is_torch_available(): raise MissingDependencyError
+except MissingDependencyError: pass
+else:
+  _import_structure["modeling_flan_t5"] = ["FlanT5"]
+  if t.TYPE_CHECKING: from .modeling_flan_t5 import FlanT5 as FlanT5
+try:
+  # Register the Flax implementation only when flax is reported available.
+  if not is_flax_available(): raise MissingDependencyError
+except MissingDependencyError: pass
+else:
+  _import_structure["modeling_flax_flan_t5"] = ["FlaxFlanT5"]
+  if t.TYPE_CHECKING: from .modeling_flax_flan_t5 import FlaxFlanT5 as FlaxFlanT5
+try:
+  # Register the TensorFlow implementation only when tensorflow is reported available.
+  if not is_tf_available(): raise MissingDependencyError
+except MissingDependencyError: pass
+else:
+  _import_structure["modeling_tf_flan_t5"] = ["TFFlanT5"]
+  if t.TYPE_CHECKING: from .modeling_tf_flan_t5 import TFFlanT5 as TFFlanT5
+
+# Swap this module's sys.modules entry for a LazyModule built from the structure above
+# (presumably deferring the real imports until attribute access -- confirm in openllm.utils).
+sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure)
--- a/openllm-python/src/openllm/models/flan_t5/configuration_flan_t5.py
+++ b/openllm-python/src/openllm/models/flan_t5/configuration_flan_t5.py
@@ -0,0 +1,45 @@
+from __future__ import annotations
+import openllm
+
+class FlanT5Config(openllm.LLMConfig):
+  """FLAN-T5 was released in the paper [Scaling Instruction-Finetuned Language Models](https://arxiv.org/pdf/2210.11416.pdf).
+
+  It is an enhanced version of T5 that has been finetuned in a mixture of tasks.
+
+  Refer to [FLAN-T5's page](https://huggingface.co/docs/transformers/model_doc/flan-t5) for more information.
+  """
+  # Static model metadata: documentation url, architecture name, model type and the known model ids.
+  __config__ = {"url": "https://huggingface.co/docs/transformers/model_doc/flan-t5", "architecture": "T5ForConditionalGeneration", "model_type": "seq2seq_lm",
+                "default_id": "google/flan-t5-large", "model_ids": ["google/flan-t5-small", "google/flan-t5-base", "google/flan-t5-large", "google/flan-t5-xl", "google/flan-t5-xxl",]}
+  # Generation parameter defaults declared for this model.
+  class GenerationConfig:
+    temperature: float = 0.9
+    max_new_tokens: int = 2048
+    top_k: int = 50
+    top_p: float = 0.4
+    # NOTE(review): unlike the fields above, this one carries no type annotation -- confirm whether that is intentional.
+    repetition_penalty = 1.0
+
+START_FLAN_T5_COMMAND_DOCSTRING = """\
+Run a LLMServer for FLAN-T5 model.
+
+\b
+> See more information about FLAN-T5 at [huggingface/transformers](https://huggingface.co/docs/transformers/model_doc/flan-t5)
+
+\b
+## Usage
+
+By default, this model will use the PyTorch model for inference. However, this model supports both Flax and Tensorflow.
+
+\b
+- To use Flax, set the environment variable ``OPENLLM_FLAN_T5_FRAMEWORK="flax"``
+
+\b
+- To use Tensorflow, set the environment variable ``OPENLLM_FLAN_T5_FRAMEWORK="tf"``
+
+\b
+FLAN-T5 Runner will use google/flan-t5-large as the default model. To change to any other FLAN-T5
+saved pretrained, or a fine-tune FLAN-T5, provide ``OPENLLM_FLAN_T5_MODEL_ID='google/flan-t5-xxl'``
+or provide `--model-id` flag when running ``openllm start flan-t5``:
+
+\b
+$ openllm start flan-t5 --model-id google/flan-t5-xxl
+"""
+DEFAULT_PROMPT_TEMPLATE = """Answer the following question:\nQuestion: {instruction}\nAnswer:"""
--- a/openllm-python/src/openllm/models/flan_t5/modeling_flan_t5.py
+++ b/openllm-python/src/openllm/models/flan_t5/modeling_flan_t5.py
@@ -0,0 +1,24 @@
+from __future__ import annotations
+import typing as t, openllm
+from openllm._prompt import process_prompt
+from .configuration_flan_t5 import DEFAULT_PROMPT_TEMPLATE
+if t.TYPE_CHECKING: import torch, transformers, torch.nn.functional as F
+else: torch, transformers, F = openllm.utils.LazyLoader("torch", globals(), "torch"), openllm.utils.LazyLoader("transformers", globals(), "transformers"), openllm.utils.LazyLoader("F", globals(), "torch.nn.functional")
+
+class FlanT5(openllm.LLM["transformers.T5ForConditionalGeneration", "transformers.T5TokenizerFast"]):
+  __openllm_internal__ = True
+  def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_k: int | None = None, top_p: float | None = None, repetition_penalty: float | None = None, use_default_prompt_template: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_k": top_k, "top_p": top_p, "repetition_penalty": repetition_penalty}, {}
+  def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str: return generation_result[0]
+  def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
+    with torch.inference_mode(): return self.tokenizer.batch_decode(self.model.generate(**self.tokenizer(prompt, return_tensors="pt").to(self.device), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()), skip_special_tokens=True)
+  def embeddings(self, prompts: list[str]) -> openllm.LLMEmbeddings:
+    embeddings: list[list[float]] = []
+    num_tokens = 0
+    for prompt in prompts:
+      input_ids = self.tokenizer.encode(prompt, return_tensors="pt").to(self.device)
+      with torch.inference_mode():
+        outputs = self.model(input_ids, decoder_input_ids=input_ids)
+        data = F.normalize(torch.mean(outputs.encoder_last_hidden_state[0], dim=0), p=2, dim=0)
+        embeddings.append(data.tolist())
+        num_tokens += len(input_ids[0])
+    return openllm.LLMEmbeddings(embeddings=embeddings, num_tokens=num_tokens)
--- a/openllm-python/src/openllm/models/flan_t5/modeling_flax_flan_t5.py
+++ b/openllm-python/src/openllm/models/flan_t5/modeling_flax_flan_t5.py
@@ -0,0 +1,16 @@
+from __future__ import annotations
+import typing as t, openllm
+from openllm._prompt import process_prompt
+from .configuration_flan_t5 import DEFAULT_PROMPT_TEMPLATE
+if t.TYPE_CHECKING: import transformers
+
+class FlaxFlanT5(openllm.LLM["transformers.FlaxT5ForConditionalGeneration", "transformers.T5TokenizerFast"]):
+  __openllm_internal__ = True
+  def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_k: int | None = None, top_p: float | None = None, repetition_penalty: float | None = None, decoder_start_token_id: int | None = None, use_default_prompt_template: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
+    if decoder_start_token_id is None: decoder_start_token_id = 0
+    return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_k": top_k, "top_p": top_p, "repetition_penalty": repetition_penalty, "decoder_start_token_id": decoder_start_token_id}, {}
+  def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str: return generation_result[0]
+  def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
+    # NOTE: decoder_start_token_id is extracted from https://huggingface.co/google/flan-t5-small/tree/main as it is required for encoder-decoder generation.
+    decoder_start_token_id = attrs.pop("decoder_start_token_id", 0)
+    return self.tokenizer.batch_decode(self.model.generate(self.tokenizer(prompt, return_tensors="np")["input_ids"], do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config(), decoder_start_token_id=decoder_start_token_id).sequences, skip_special_tokens=True, clean_up_tokenization_spaces=True)
--- a/openllm-python/src/openllm/models/flan_t5/modeling_tf_flan_t5.py
+++ b/openllm-python/src/openllm/models/flan_t5/modeling_tf_flan_t5.py
@@ -0,0 +1,11 @@
+from __future__ import annotations
+import typing as t, openllm
+from openllm._prompt import process_prompt
+from .configuration_flan_t5 import DEFAULT_PROMPT_TEMPLATE
+if t.TYPE_CHECKING: import transformers
+
+class TFFlanT5(openllm.LLM["transformers.TFT5ForConditionalGeneration", "transformers.T5TokenizerFast"]):
+  __openllm_internal__ = True
+  def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_k: int | None = None, top_p: float | None = None, repetition_penalty: float | None = None, use_default_prompt_template: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_k": top_k, "top_p": top_p, "repetition_penalty": repetition_penalty}, {}
+  def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str: return generation_result[0]
+  def generate(self, prompt: str, **attrs: t.Any) -> list[str]: return self.tokenizer.batch_decode(self.model.generate(self.tokenizer(prompt, return_tensors="tf").input_ids, do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()), skip_special_tokens=True)
--- a/openllm-python/src/openllm/models/gpt_neox/__init__.py
+++ b/openllm-python/src/openllm/models/gpt_neox/__init__.py
@@ -0,0 +1,26 @@
+from __future__ import annotations
+import sys, typing as t
+from openllm.exceptions import MissingDependencyError
+from openllm.utils import LazyModule, is_torch_available, is_vllm_available
+
+# Maps each submodule to the public names it provides; handed to LazyModule at the bottom of this file.
+_import_structure: dict[str, list[str]] = {"configuration_gpt_neox": ["GPTNeoXConfig", "START_GPT_NEOX_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"]}
+# These imports only execute for static type checkers, not at runtime.
+if t.TYPE_CHECKING:
+  from .configuration_gpt_neox import (
+    DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE,
+    START_GPT_NEOX_COMMAND_DOCSTRING as START_GPT_NEOX_COMMAND_DOCSTRING,
+    GPTNeoXConfig as GPTNeoXConfig,
+  )
+# Register the PyTorch implementation only when torch is available; otherwise the entry is silently omitted.
+try:
+  if not is_torch_available(): raise MissingDependencyError
+except MissingDependencyError: pass
+else:
+  _import_structure["modeling_gpt_neox"] = ["GPTNeoX"]
+  if t.TYPE_CHECKING: from .modeling_gpt_neox import GPTNeoX as GPTNeoX
+# Register the vLLM implementation only when vllm is available; otherwise the entry is silently omitted.
+try:
+  if not is_vllm_available(): raise MissingDependencyError
+except MissingDependencyError: pass
+else:
+  _import_structure["modeling_vllm_gpt_neox"] = ["VLLMGPTNeoX"]
+  if t.TYPE_CHECKING: from .modeling_vllm_gpt_neox import VLLMGPTNeoX as VLLMGPTNeoX
+
+# Replace this module's entry in sys.modules with a LazyModule built from the structure above.
+sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure)
--- a/openllm-python/src/openllm/models/gpt_neox/configuration_gpt_neox.py
+++ b/openllm-python/src/openllm/models/gpt_neox/configuration_gpt_neox.py
@@ -0,0 +1,46 @@
+from __future__ import annotations
+import openllm
+
+class GPTNeoXConfig(openllm.LLMConfig):
+  """GPTNeoX is an autoregressive language model trained on the Pile, whose weights will be made freely and openly available to the public through a permissive license.
+
+  It is, to the best of our knowledge, the largest dense autoregressive model
+  that has publicly available weights at the time of submission. The training and evaluation code, as well as the model weights,
+  can be found at https://github.com/EleutherAI/gpt-neox.
+
+  GPTNeoX has been used to fine-tune on various models, such as Dolly, StableLM, and Pythia.
+
+  Note that OpenLLM provides first-class support for all of the aforementioned model. Users can
+  also use `openllm start gpt-neox` to run all of the GPTNeoX variant's model
+
+  Refer to [GPTNeoX's model card](https://huggingface.co/docs/transformers/model_doc/gpt_neox)
+  for more information.
+  """
+  # Static metadata: explicit model/start names, GPU requirement flag, architecture name,
+  # project URL, the default model id and the list of known model ids.
+  __config__ = {"model_name": "gpt_neox", "start_name": "gpt-neox", "requires_gpu": True, "architecture": "GPTNeoXForCausalLM", "url": "https://github.com/EleutherAI/gpt-neox",
+                "default_id": "eleutherai/gpt-neox-20b", "model_ids": ["eleutherai/gpt-neox-20b"]}
+  # Extra configuration field specific to GPTNeoX; defaults to True.
+  use_half_precision: bool = openllm.LLMConfig.Field(True, description="Whether to use half precision for model.")
+
+  # Default generation parameters for GPTNeoX.
+  class GenerationConfig:
+    temperature: float = 0.9
+    max_new_tokens: int = 100
+
+# Help text for the GPTNeoX start command. The trailing backslash after the opening quotes
+# suppresses the leading newline. Each "\b" escape is a literal backspace character in the
+# resulting string (presumably a no-rewrap marker for the CLI help formatter -- confirm).
+START_GPT_NEOX_COMMAND_DOCSTRING = """\
+Run a LLMServer for GPTNeoX model.
+
+\b
+> See more information about GPTNeoX at [HuggingFace's model card](https://huggingface.co/docs/transformers/model_doc/gpt_neox)
+
+\b
+## Usage
+
+Currently, GPTNeoX only supports PyTorch. Make sure ``torch`` is available in your system.
+
+\b
+GPTNeoX Runner will use EleutherAI/gpt-neox-20b as the default model. To change to any other GPTNeoX
+saved pretrained, or a fine-tune GPTNeoX, provide ``OPENLLM_GPT_NEOX_MODEL_ID='stabilityai/stablelm-tuned-alpha-3b'``
+or provide `--model-id` flag when running ``openllm start gpt-neox``:
+
+\b
+$ openllm start gpt-neox --model-id 'stabilityai/stablelm-tuned-alpha-3b'
+"""
+# Pass-through template: the whole prompt is just the "{instruction}" placeholder.
+DEFAULT_PROMPT_TEMPLATE = """{instruction}"""
--- a/openllm-python/src/openllm/models/gpt_neox/modeling_gpt_neox.py
+++ b/openllm-python/src/openllm/models/gpt_neox/modeling_gpt_neox.py
@@ -0,0 +1,20 @@
+from __future__ import annotations
+import logging, typing as t, openllm
+from openllm._prompt import process_prompt
+from .configuration_gpt_neox import DEFAULT_PROMPT_TEMPLATE
+if t.TYPE_CHECKING: import torch, transformers
+else: torch, transformers = openllm.utils.LazyLoader("torch", globals(), "torch"), openllm.utils.LazyLoader("transformers", globals(), "transformers")
+
+logger = logging.getLogger(__name__)
+class GPTNeoX(openllm.LLM["transformers.GPTNeoXForCausalLM", "transformers.GPTNeoXTokenizerFast"]):
+  __openllm_internal__ = True
+  def sanitize_parameters(self, prompt: str, temperature: float | None = None, max_new_tokens: int | None = None, use_default_prompt_template: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "temperature": temperature}, {}
+  @property
+  def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: return {"device_map": "auto" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None}, {}
+  def postprocess_generate(self, prompt: str, generation_result: list[str], **_: t.Any) -> str: return generation_result[0]
+  def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.GPTNeoXForCausalLM:
+    model = transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, *args, **attrs)
+    if self.config.use_half_precision: model.half()
+    return model
+  def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
+    with torch.inference_mode(): return self.tokenizer.batch_decode(self.model.generate(self.tokenizer(prompt, return_tensors="pt").to(self.device).input_ids, do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config(), pad_token_id=self.tokenizer.eos_token_id, stopping_criteria=openllm.StoppingCriteriaList([openllm.StopOnTokens()])))
--- a/openllm-python/src/openllm/models/gpt_neox/modeling_vllm_gpt_neox.py
+++ b/openllm-python/src/openllm/models/gpt_neox/modeling_vllm_gpt_neox.py
@@ -0,0 +1,11 @@
+from __future__ import annotations
+import typing as t, openllm, logging
+from openllm._prompt import process_prompt
+from .configuration_gpt_neox import DEFAULT_PROMPT_TEMPLATE
+if t.TYPE_CHECKING: import vllm, transformers
+
+logger = logging.getLogger(__name__)
+class VLLMGPTNeoX(openllm.LLM["vllm.LLMEngine", "transformers.GPTNeoXTokenizerFast"]):
+  __openllm_internal__ = True
+  tokenizer_id = "local"
+  def sanitize_parameters(self, prompt: str, temperature: float | None = None, max_new_tokens: int | None = None, use_default_prompt_template: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "temperature": temperature}, {}
--- a/openllm-python/src/openllm/models/llama/__init__.py
+++ b/openllm-python/src/openllm/models/llama/__init__.py
@@ -0,0 +1,27 @@
+from __future__ import annotations
+import sys, typing as t
+from openllm.exceptions import MissingDependencyError
+from openllm.utils import LazyModule, is_torch_available, is_vllm_available
+
+# Maps each submodule to the public names it provides; handed to LazyModule at the bottom of this file.
+_import_structure: dict[str, list[str]] = {"configuration_llama": ["LlamaConfig", "START_LLAMA_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE", "PROMPT_MAPPING"]}
+# These imports only execute for static type checkers, not at runtime.
+if t.TYPE_CHECKING:
+  from .configuration_llama import (
+    DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE,
+    PROMPT_MAPPING as PROMPT_MAPPING,
+    START_LLAMA_COMMAND_DOCSTRING as START_LLAMA_COMMAND_DOCSTRING,
+    LlamaConfig as LlamaConfig,
+  )
+# Register the vLLM implementation only when vllm is available; otherwise the entry is silently omitted.
+try:
+  if not is_vllm_available(): raise MissingDependencyError
+except MissingDependencyError: pass
+else:
+  _import_structure["modeling_vllm_llama"] = ["VLLMLlama"]
+  if t.TYPE_CHECKING: from .modeling_vllm_llama import VLLMLlama as VLLMLlama
+# Register the PyTorch implementation only when torch is available; otherwise the entry is silently omitted.
+try:
+  if not is_torch_available(): raise MissingDependencyError
+except MissingDependencyError: pass
+else:
+  _import_structure["modeling_llama"] = ["Llama"]
+  if t.TYPE_CHECKING: from .modeling_llama import Llama as Llama
+
+# Replace this module's entry in sys.modules with a LazyModule built from the structure above.
+sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure)
--- a/openllm-python/src/openllm/models/llama/configuration_llama.py
+++ b/openllm-python/src/openllm/models/llama/configuration_llama.py
@@ -0,0 +1,70 @@
+from __future__ import annotations
+import typing as t, openllm
+
+class LlamaConfig(openllm.LLMConfig):
+  """LLaMA model was proposed in [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) by Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample.
+
+  It is a collection of foundation language models ranging from 7B to 65B parameters.
+
+  Llama also includes support for the recently proposed [Llama-2](https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/)
+
+  Note that all variants of Llama including fine-tuning, quantisation format are all supported with ``openllm.Llama``.
+
+  Refer to [Llama's model card](https://huggingface.co/docs/transformers/main/model_doc/llama)
+  for more information.
+  """
+  # Extra configuration field specific to Llama; defaults to False.
+  use_llama2_prompt: bool = openllm.LLMConfig.Field(False, description="Whether to use the prompt format for Llama 2. Disable this when working with Llama 1.")
+  # Static metadata: naming scheme, project URL, per-resource default implementation ("pt" for both
+  # CPU and NVIDIA GPU), architecture name, extra requirements, tokenizer class, the default model id,
+  # the list of known model ids and a LoRA fine-tuning preset.
+  __config__ = {"name_type": "lowercase", "url": "https://github.com/facebookresearch/llama", "default_implementation": {"cpu": "pt", "nvidia.com/gpu": "pt"}, "architecture": "LlamaForCausalLM", "requirements": ["fairscale", "sentencepiece"], "tokenizer_class": "LlamaTokenizerFast",
+                "default_id": "NousResearch/llama-2-7b-hf", "model_ids": ["meta-llama/Llama-2-70b-chat-hf", "meta-llama/Llama-2-13b-chat-hf", "meta-llama/Llama-2-7b-chat-hf", "meta-llama/Llama-2-70b-hf", "meta-llama/Llama-2-13b-hf",
+                                                                          "meta-llama/Llama-2-7b-hf", "NousResearch/llama-2-70b-chat-hf", "NousResearch/llama-2-13b-chat-hf", "NousResearch/llama-2-7b-chat-hf", "NousResearch/llama-2-70b-hf", "NousResearch/llama-2-13b-hf", "NousResearch/llama-2-7b-hf",
+                                                                          "openlm-research/open_llama_7b_v2", "openlm-research/open_llama_3b_v2", "openlm-research/open_llama_13b", "huggyllama/llama-65b", "huggyllama/llama-30b", "huggyllama/llama-13b", "huggyllama/llama-7b"],
+                "fine_tune_strategies": ({"adapter_type": "lora", "r": 64, "lora_alpha": 16, "lora_dropout": 0.1, "bias": "none"},)}
+  # Default generation parameters for Llama.
+  class GenerationConfig:
+    max_new_tokens: int = 128
+    temperature: float = 0.6
+    top_p: float = 0.9
+    top_k: int = 12
+  # Default sampling parameters (presumably consumed by the vLLM backend -- confirm).
+  class SamplingParams:
+    best_of: int = 1
+    presence_penalty: float = 0.5
+
+START_LLAMA_COMMAND_DOCSTRING = """\
+Run a LLMServer for Llama model.
+
+\b
+> See more information about Llama at [Llama's model card](https://huggingface.co/docs/transformers/main/model_doc/llama
+
+\b
+## Usage
+
+By default, this model will use [vLLM](https://github.com/vllm-project/vllm) for inference.
+This model will also supports PyTorch.
+
+\b
+- To use PyTorch, set the environment variable ``OPENLLM_LLAMA_FRAMEWORK="pt"``
+
+\b
+Llama Runner will use decapoda-research/llama-7b-hf as the default model. To change to any other Llama
+saved pretrained, or a fine-tune Llama, provide ``OPENLLM_LLAMA_MODEL_ID='openlm-research/open_llama_7b_v2'``
+or provide `--model-id` flag when running ``openllm start llama``:
+
+\b
+$ openllm start llama --model-id 'openlm-research/open_llama_7b_v2'
+
+\b
+OpenLLM also supports running Llama-2 and its fine-tune and variants. To import the Llama weights, one can use the following:
+
+\b
+$ CONVERTER=hf-llama2 openllm import llama /path/to/llama-2
+"""
+SYSTEM_MESSAGE = """
+You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
+
+If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.
+"""
+SINST_KEY, EINST_KEY, SYS_KEY, EOS_TOKEN, BOS_TOKEN = "[INST]", "[/INST]", "<<SYS>>", "</s>", "<s>"
+# TODO: support history and v1 prompt implementation
+_v1_prompt, _v2_prompt = """{instruction}""", """{start_key} {sys_key}\n{system_message}\n{sys_key}\n\n{instruction}\n{end_key} """.format(start_key=SINST_KEY, sys_key=SYS_KEY, system_message=SYSTEM_MESSAGE, instruction="{instruction}", end_key=EINST_KEY)
+PROMPT_MAPPING = {"v1": _v1_prompt, "v2": _v2_prompt}
+def _get_prompt(model_type: t.Literal["v1", "v2"]) -> str: return PROMPT_MAPPING[model_type]
+DEFAULT_PROMPT_TEMPLATE = _get_prompt
--- a/openllm-python/src/openllm/models/llama/modeling_llama.py
+++ b/openllm-python/src/openllm/models/llama/modeling_llama.py
@@ -0,0 +1,25 @@
+from __future__ import annotations
+import logging, typing as t, openllm
+from openllm._prompt import process_prompt
+from .configuration_llama import DEFAULT_PROMPT_TEMPLATE
+if t.TYPE_CHECKING: import torch, transformers, torch.nn.functional as F
+else: torch, transformers, F = openllm.utils.LazyLoader("torch", globals(), "torch"), openllm.utils.LazyLoader("transformers", globals(), "transformers"), openllm.utils.LazyLoader("F", globals(), "torch.nn.functional")
+
+logger = logging.getLogger(__name__)
+class Llama(openllm.LLM["transformers.LlamaForCausalLM", "transformers.LlamaTokenizerFast"]):
+  __openllm_internal__ = True
+  @property
+  def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: return {"torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32}, {}
+  def sanitize_parameters(self, prompt: str, top_k: int | None = None, top_p: float | None = None, temperature: float | None = None, max_new_tokens: int | None = None, use_default_prompt_template: bool = False, use_llama2_prompt: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE("v2" if use_llama2_prompt else "v1") if use_default_prompt_template else None, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_p": top_p, "top_k": top_k}, {}
+  def postprocess_generate(self, prompt: str, generation_result: list[str], **_: t.Any) -> str: return generation_result[0]
+  def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
+    with torch.inference_mode(): return self.tokenizer.batch_decode(self.model.generate(**self.tokenizer(prompt, return_tensors="pt").to(self.device), generation_config=self.config.model_construct_env(**attrs).to_generation_config(), do_sample=True, stopping_criteria=openllm.StoppingCriteriaList([openllm.StopOnTokens()])), skip_special_tokens=True, clean_up_tokenization_spaces=True)
+  def embeddings(self, prompts: list[str]) -> openllm.LLMEmbeddings:
+    encoding = self.tokenizer(prompts, padding=True, return_tensors="pt").to(self.device)
+    input_ids, attention_mask = encoding["input_ids"], encoding["attention_mask"]
+    with torch.inference_mode():
+      data = self.model(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True).hidden_states[-1]
+      mask = attention_mask.unsqueeze(-1).expand(data.size()).float()
+      masked_embeddings = data * mask
+      sum_embeddings, seq_length = torch.sum(masked_embeddings, dim=1), torch.sum(mask, dim=1)
+    return openllm.LLMEmbeddings(embeddings=F.normalize(sum_embeddings / seq_length, p=2, dim=1).tolist(), num_tokens=int(torch.sum(attention_mask).item()))
--- a/openllm-python/src/openllm/models/llama/modeling_vllm_llama.py
+++ b/openllm-python/src/openllm/models/llama/modeling_vllm_llama.py
@@ -0,0 +1,10 @@
+from __future__ import annotations
+import logging, typing as t, openllm
+from openllm._prompt import process_prompt
+from .configuration_llama import DEFAULT_PROMPT_TEMPLATE
+if t.TYPE_CHECKING: import vllm, transformers
+
+logger = logging.getLogger(__name__)
+class VLLMLlama(openllm.LLM["vllm.LLMEngine", "transformers.LlamaTokenizerFast"]):
+  __openllm_internal__ = True
+  def sanitize_parameters(self, prompt: str, top_k: int | None = None, top_p: float | None = None, temperature: float | None = None, max_new_tokens: int | None = None, use_default_prompt_template: bool = False, use_llama2_prompt: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE("v2" if use_llama2_prompt else "v1") if use_default_prompt_template else None, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_p": top_p, "top_k": top_k}, {}
--- a/openllm-python/src/openllm/models/mpt/__init__.py
+++ b/openllm-python/src/openllm/models/mpt/__init__.py
@@ -0,0 +1,27 @@
+from __future__ import annotations
+import sys, typing as t
+from openllm.exceptions import MissingDependencyError
+from openllm.utils import LazyModule, is_torch_available, is_vllm_available
+
+# Maps each submodule to the public names it provides; handed to LazyModule at the bottom of this file.
+_import_structure: dict[str, list[str]] = {"configuration_mpt": ["MPTConfig", "START_MPT_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE", "PROMPT_MAPPING"]}
+# These imports only execute for static type checkers, not at runtime.
+if t.TYPE_CHECKING:
+  from .configuration_mpt import (
+    DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE,
+    PROMPT_MAPPING as PROMPT_MAPPING,
+    START_MPT_COMMAND_DOCSTRING as START_MPT_COMMAND_DOCSTRING,
+    MPTConfig as MPTConfig,
+  )
+# Register the PyTorch implementation only when torch is available; otherwise the entry is silently omitted.
+try:
+  if not is_torch_available(): raise MissingDependencyError
+except MissingDependencyError: pass
+else:
+  _import_structure["modeling_mpt"] = ["MPT"]
+  if t.TYPE_CHECKING: from .modeling_mpt import MPT as MPT
+# Register the vLLM implementation only when vllm is available; otherwise the entry is silently omitted.
+try:
+  if not is_vllm_available(): raise MissingDependencyError
+except MissingDependencyError: pass
+else:
+  _import_structure["modeling_vllm_mpt"] = ["VLLMMPT"]
+  if t.TYPE_CHECKING: from .modeling_vllm_mpt import VLLMMPT as VLLMMPT
+
+# Replace this module's entry in sys.modules with a LazyModule built from the structure above.
+sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure)
--- a/openllm-python/src/openllm/models/mpt/configuration_mpt.py
+++ b/openllm-python/src/openllm/models/mpt/configuration_mpt.py
@@ -0,0 +1,65 @@
+from __future__ import annotations
+import sys, typing as t
+
+import openllm
+
+# Type checkers see the closed set of prompt types; at runtime the alias is plain ``str``.
+if t.TYPE_CHECKING: MPTPromptType = t.Literal["default", "instruct", "chat", "storywriter"]
+else: MPTPromptType = str
+
+class MPTConfig(openllm.LLMConfig):
+  """MPT is a decoder-style transformer pretrained from scratch on English text and code.
+
+  This model was trained by [MosaicML](https://www.mosaicml.com/).
+
+  ``openllm.MPT`` encapsulate a family of MPT variants that is publicly available
+  on HuggingFace. Refers [HuggingFace's MosaicML page](https://huggingface.co/mosaicml)
+  for more details on specific models.
+  """
+  # Static metadata: naming scheme, remote-code trust flag, project URL, timeout, extra requirements,
+  # architecture name, the default model id and the list of known model ids.
+  __config__ = {"name_type": "lowercase", "trust_remote_code": True, "url": "https://huggingface.co/mosaicml", "timeout": int(36e6), "requirements": ["triton", "einops"], "architecture": "MPTForCausalLM",
+                "default_id": "mosaicml/mpt-7b-instruct", "model_ids": ["mosaicml/mpt-7b", "mosaicml/mpt-7b-instruct", "mosaicml/mpt-7b-chat", "mosaicml/mpt-7b-storywriter", "mosaicml/mpt-30b", "mosaicml/mpt-30b-instruct", "mosaicml/mpt-30b-chat"]}
+  # NOTE(review): the default is the string '"default"' including the double quotes -- presumably it is
+  # decoded by the Field converter; confirm before "simplifying" it to plain "default".
+  prompt_type: MPTPromptType = openllm.LLMConfig.Field('"default"', description="Given prompt type for running MPT. Default will be inferred from model name if pretrained.")
+  # Maximum sequence length; defaults to 2048.
+  max_sequence_length: int = openllm.LLMConfig.Field(2048, description="Max sequence length to run MPT with. Note that MPT is trained ith sequence length of 2048, but with [ALiBi](https://arxiv.org/abs/2108.12409) it can set up to 4096 (for 7b models) and 16384 (for 30b models)")
+  # Default generation parameters for MPT; a temperature of 0 is the default.
+  class GenerationConfig:
+    max_new_tokens: int = 128
+    temperature: float = 0
+    top_p: float = 0.8
+
+# Help text for the MPT start command. The trailing backslash after the opening quotes
+# suppresses the leading newline. Each "\b" escape is a literal backspace character in the
+# resulting string (presumably a no-rewrap marker for the CLI help formatter -- confirm).
+START_MPT_COMMAND_DOCSTRING = """\
+Run a LLMServer for MPT model.
+
+\b
+> See more information about MPT at [HuggingFace's MosaicML page](https://huggingface.co/mosaicml)
+
+\b
+## Usage
+
+Currently, MPT only supports PyTorch. Make sure ``torch`` is available in your system.
+
+If you want to use Flash Attention support with openai/triton, make sure to install OpenLLM with
+
+\b
+```bash
+pip install "openllm[mpt]"
+```
+
+\b
+MPT Runner will use mosaicml/mpt-7b-instruct as the default model. To change to any other MPT
+saved pretrained, or a fine-tune MPT, provide ``OPENLLM_MPT_MODEL_ID='mosaicml/mpt-30b'``
+or provide `--model-id` flag when running ``openllm start mpt``:
+
+\b
+$ openllm start mpt --model-id mosaicml/mpt-30b
+"""
+INSTRUCTION_KEY, RESPONSE_KEY, END_KEY = "### Instruction:", "### Response:", "### End"
+INTRO_BLURB = "Below is an instruction that describes a task. Write a response that appropriately completes the request."
+# NOTE: This is the prompt that is used for generating responses using an already
+# trained model.  It ends with the response key, where the job of the model is to provide
+# the completion that follows it (i.e. the response itself).
+_chat_prompt, _default_prompt, _instruct_prompt = """{instruction}""", """{instruction}""", """{intro}
+{instruction_key}
+{instruction}
+{response_key}
+""".format(intro=INTRO_BLURB, instruction_key=INSTRUCTION_KEY, instruction="{instruction}", response_key=RESPONSE_KEY)
+PROMPT_MAPPING = {"default": _default_prompt, "instruct": _instruct_prompt, "storywriter": _default_prompt, "chat": _chat_prompt}
+def _get_prompt(model_type: str) -> str: return PROMPT_MAPPING[model_type]
+DEFAULT_PROMPT_TEMPLATE = _get_prompt
--- a/openllm-python/src/openllm/models/mpt/modeling_mpt.py
+++ b/openllm-python/src/openllm/models/mpt/modeling_mpt.py
@@ -0,0 +1,63 @@
+from __future__ import annotations
+import logging, typing as t, bentoml, openllm
+from openllm._prompt import process_prompt
+from openllm.utils import generate_labels, is_triton_available
+from .configuration_mpt import DEFAULT_PROMPT_TEMPLATE, MPTPromptType
+
+if t.TYPE_CHECKING: import transformers, torch
+else: transformers, torch = openllm.utils.LazyLoader("transformers", globals(), "transformers"), openllm.utils.LazyLoader("torch", globals(), "torch")
+
+logger = logging.getLogger(__name__)
+def get_mpt_config(model_id_or_path: str, max_sequence_length: int, device: torch.device | str | int | None, device_map: str | None = None, trust_remote_code: bool = True) -> transformers.PretrainedConfig:
+  config = transformers.AutoConfig.from_pretrained(model_id_or_path, trust_remote_code=trust_remote_code)
+  if hasattr(config, "init_device") and device_map is None and isinstance(device, (str, torch.device)): config.init_device = str(device)
+  if hasattr(config, "attn_config") and is_triton_available(): config.attn_config["attn_impl"] = "triton"
+  else: logger.debug("'triton' is not available, Flash Attention will use the default Torch implementation. For faster inference, make sure to install triton with 'pip install \"git+https://github.com/openai/triton.git#egg=triton&subdirectory=python\"'")
+  # setting max_seq_len
+  config.max_seq_len = max_sequence_length
+  return config
+class MPT(openllm.LLM["transformers.PreTrainedModel", "transformers.GPTNeoXTokenizerFast"]):
+  # PyTorch implementation of the MPT runner, with custom import/load steps that patch the model config.
+  __openllm_internal__ = True
+  # Pick the working dtype once: bfloat16 when CUDA is available, float32 otherwise.
+  def llm_post_init(self) -> None: self.dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
+  # Model and tokenizer import kwargs; ``device_map`` is "auto" only when more than one CUDA device is present.
+  @property
+  def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: return {"device_map": "auto" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None, "torch_dtype": torch.bfloat16 if torch.cuda.is_available() else torch.float32}, {}
+  # Load tokenizer and model from ``self.model_id`` and save them as a bentoml model. ``*args`` is not used.
+  def import_model(self, *args: t.Any, trust_remote_code: bool = True, **attrs: t.Any) -> bentoml.Model:
+    _, tokenizer_attrs = self.llm_parameters
+    # Pull out the options handled explicitly below so they are not passed twice via **attrs.
+    torch_dtype = attrs.pop("torch_dtype", self.dtype)
+    device_map = attrs.pop("device_map", None)
+    # ``low_cpu_mem_usage`` is discarded and never forwarded to ``from_pretrained``.
+    attrs.pop("low_cpu_mem_usage", None)
+    config = get_mpt_config(self.model_id, self.config.max_sequence_length, self.device, device_map=device_map, trust_remote_code=trust_remote_code)
+    tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_id, **tokenizer_attrs)
+    # Fall back to the EOS token when the tokenizer defines no padding token.
+    if tokenizer.pad_token_id is None: tokenizer.pad_token = tokenizer.eos_token
+    model = transformers.AutoModelForCausalLM.from_pretrained(self.model_id, config=config, torch_dtype=torch_dtype, trust_remote_code=trust_remote_code, device_map=device_map, **attrs)
+    # The tokenizer is stored alongside the model as a custom object; the CUDA cache is emptied whether or not saving succeeds.
+    try: return bentoml.transformers.save_model(self.tag, model, custom_objects={"tokenizer": tokenizer}, labels=generate_labels(self))
+    finally: torch.cuda.empty_cache()
+  # Load the model from the stored bentoml model path with a freshly patched config. ``*args`` is not used.
+  def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.PreTrainedModel:
+    torch_dtype = attrs.pop("torch_dtype", self.dtype)
+    device_map = attrs.pop("device_map", None)
+    trust_remote_code = attrs.pop("trust_remote_code", True)
+    config = get_mpt_config(self._bentomodel.path, self.config.max_sequence_length, self.device, device_map=device_map, trust_remote_code=trust_remote_code,)
+    model = transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, config=config, trust_remote_code=trust_remote_code, torch_dtype=torch_dtype, device_map=device_map, **attrs)
+    model.tie_weights()
+    return model
+  # Format ``prompt`` and collect the generation arguments; returns (processed prompt, generation kwargs, empty dict).
+  # When no ``prompt_type`` is given it is inferred from substrings of ``self.model_id``.
+  # Note that ``**attrs`` is accepted but not forwarded to ``process_prompt``.
+  def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_p: float | None = None, prompt_type: MPTPromptType | None = None, use_default_prompt_template: bool = True, **attrs: t.Any,) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
+    _template = None
+    if use_default_prompt_template:
+      if prompt_type is None:
+        # The first matching marker wins: "instruct", then "storywriter", then "chat".
+        if "instruct" in self.model_id: prompt_type = "instruct"
+        elif "storywriter" in self.model_id: prompt_type = "storywriter"
+        elif "chat" in self.model_id: prompt_type = "chat"
+        else: prompt_type = "default"
+      _template = DEFAULT_PROMPT_TEMPLATE(prompt_type)
+    return process_prompt(prompt, _template, use_default_prompt_template), {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_p": top_p}, {}
+  # Return the first generated sequence.
+  def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **attrs: t.Any) -> str: return generation_result[0]
+  # Generate a completion for ``prompt`` and decode it with special tokens stripped.
+  def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
+    llm_config = self.config.model_construct_env(**attrs)
+    inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
+    # ``attrs`` is rebound here to the kwargs for ``model.generate``; sampling is disabled when temperature is 0.
+    attrs = {"do_sample": False if llm_config["temperature"] == 0 else True, "eos_token_id": self.tokenizer.eos_token_id, "pad_token_id": self.tokenizer.pad_token_id, "generation_config": llm_config.to_generation_config()}
+    with torch.inference_mode():
+      if torch.cuda.is_available():
+        # NOTE(review): autocast uses float16 while llm_post_init selects bfloat16 on CUDA -- confirm this is intended.
+        with torch.autocast("cuda", torch.float16):  # type: ignore[attr-defined]
+          generated_tensors = self.model.generate(**inputs, **attrs)
+      else: generated_tensors = self.model.generate(**inputs, **attrs)
+    return self.tokenizer.batch_decode(generated_tensors, skip_special_tokens=True)
--- a/openllm-python/src/openllm/models/mpt/modeling_vllm_mpt.py
+++ b/openllm-python/src/openllm/models/mpt/modeling_vllm_mpt.py
@@ -0,0 +1,20 @@
+from __future__ import annotations
+import logging, typing as t, openllm
+from openllm._prompt import process_prompt
+from .configuration_mpt import DEFAULT_PROMPT_TEMPLATE, MPTPromptType
+if t.TYPE_CHECKING: import transformers, vllm
+
+logger = logging.getLogger(__name__)
+class VLLMMPT(openllm.LLM["vllm.LLMEngine", "transformers.GPTNeoXTokenizerFast"]):
+  __openllm_internal__ = True
+  tokenizer_id = "local"
+  def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_p: float | None = None, prompt_type: MPTPromptType | None = None, use_default_prompt_template: bool = True, **attrs: t.Any,) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
+    _template = None
+    if use_default_prompt_template:
+      if prompt_type is None:
+        if "instruct" in self.model_id: prompt_type = "instruct"
+        elif "storywriter" in self.model_id: prompt_type = "storywriter"
+        elif "chat" in self.model_id: prompt_type = "chat"
+        else: prompt_type = "default"
+      _template = DEFAULT_PROMPT_TEMPLATE(prompt_type)
+    return process_prompt(prompt, _template, use_default_prompt_template), {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_p": top_p}, {}
--- a/openllm-python/src/openllm/models/opt/init.py
+++ b/openllm-python/src/openllm/models/opt/init.py
@@ -0,0 +1,38 @@
+from __future__ import annotations
+import sys, typing as t
+from openllm.exceptions import MissingDependencyError
+from openllm.utils import LazyModule, is_flax_available, is_tf_available, is_torch_available, is_vllm_available
+
+_import_structure: dict[str, list[str]] = {"configuration_opt": ["OPTConfig", "START_OPT_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"]}
+if t.TYPE_CHECKING:
+  from .configuration_opt import (
+    DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE,
+    START_OPT_COMMAND_DOCSTRING as START_OPT_COMMAND_DOCSTRING,
+    OPTConfig as OPTConfig,
+  )
+try:
+  if not is_torch_available(): raise MissingDependencyError
+except MissingDependencyError: pass
+else:
+  _import_structure["modeling_opt"] = ["OPT"]
+  if t.TYPE_CHECKING: from .modeling_opt import OPT as OPT
+try:
+  if not is_flax_available(): raise MissingDependencyError
+except MissingDependencyError: pass
+else:
+  _import_structure["modeling_flax_opt"] = ["FlaxOPT"]
+  if t.TYPE_CHECKING: from .modeling_flax_opt import FlaxOPT as FlaxOPT
+try:
+  if not is_vllm_available(): raise MissingDependencyError
+except MissingDependencyError: pass
+else:
+  _import_structure["modeling_vllm_opt"] = ["VLLMOPT"]
+  if t.TYPE_CHECKING: from .modeling_vllm_opt import VLLMOPT as VLLMOPT
+try:
+  if not is_tf_available(): raise MissingDependencyError
+except MissingDependencyError: pass
+else:
+  _import_structure["modeling_tf_opt"] = ["TFOPT"]
+  if t.TYPE_CHECKING: from .modeling_tf_opt import TFOPT as TFOPT
+
+sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure)
--- a/openllm-python/src/openllm/models/opt/configuration_opt.py
+++ b/openllm-python/src/openllm/models/opt/configuration_opt.py
@@ -0,0 +1,51 @@
+from __future__ import annotations
+import openllm
+
+class OPTConfig(openllm.LLMConfig):
+  """OPT was first introduced in [Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) and first released in [metaseq's repository](https://github.com/facebookresearch/metaseq) on May 3rd 2022 by Meta AI.
+
+  OPT was predominantly pretrained with English text, but a small amount of non-English data is still present
+  within the training corpus via CommonCrawl. The model was pretrained using a causal language modeling (CLM)
+  objective. OPT belongs to the same family of decoder-only models like GPT-3. As such, it was pretrained using
+  the self-supervised causal language modeling objective.
+
+  Refer to [OPT's HuggingFace page](https://huggingface.co/docs/transformers/model_doc/opt) for more information.
+  """
+  __config__ = {
+      "name_type": "lowercase", "trust_remote_code": False, "url": "https://huggingface.co/docs/transformers/model_doc/opt",
+      "default_id": "facebook/opt-1.3b", "architecture": "OPTForCausalLM", "model_ids": ["facebook/opt-125m", "facebook/opt-350m", "facebook/opt-1.3b", "facebook/opt-2.7b", "facebook/opt-6.7b", "facebook/opt-66b"],
+      "fine_tune_strategies": ({"adapter_type": "lora", "r": 16, "lora_alpha": 32, "target_modules": ["q_proj", "v_proj"], "lora_dropout": 0.05, "bias": "none"},)
+  }
+  format_outputs: bool = openllm.LLMConfig.Field(False, description="""Whether to format the outputs. This can be used when num_return_sequences > 1.""")
+  class GenerationConfig:
+    top_k: int = 15
+    temperature: float = 0.75
+    max_new_tokens: int = 1024
+    num_return_sequences: int = 1
+
+START_OPT_COMMAND_DOCSTRING = """\
+Run a LLMServer for OPT model.
+
+\b
+> See more information about OPT at [facebook/opt-66b](https://huggingface.co/facebook/opt-66b)
+
+\b
+## Usage
+
+By default, this model will use the PyTorch model for inference. However, this model supports both Flax and Tensorflow.
+
+\b
+- To use Flax, set the environment variable ``OPENLLM_OPT_FRAMEWORK="flax"``
+
+\b
+- To use Tensorflow, set the environment variable ``OPENLLM_OPT_FRAMEWORK="tf"``
+
+\b
+OPT Runner will use facebook/opt-1.3b as the default model. To change to any other OPT
+saved pretrained, or a fine-tune OPT, provide ``OPENLLM_OPT_MODEL_ID='facebook/opt-6.7b'``
+or provide `--model-id` flag when running ``openllm start opt``:
+
+\b
+$ openllm start opt --model-id facebook/opt-6.7b
+"""
+DEFAULT_PROMPT_TEMPLATE = """{instruction}"""
--- a/openllm-python/src/openllm/models/opt/modeling_flax_opt.py
+++ b/openllm-python/src/openllm/models/opt/modeling_flax_opt.py
@@ -0,0 +1,21 @@
+from __future__ import annotations
+import logging, typing as t, bentoml, openllm
+from openllm._prompt import process_prompt
+from openllm.utils import generate_labels
+from .configuration_opt import DEFAULT_PROMPT_TEMPLATE
+if t.TYPE_CHECKING: import transformers
+else: transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
+
+logger = logging.getLogger(__name__)
+class FlaxOPT(openllm.LLM["transformers.FlaxOPTForCausalLM", "transformers.GPT2Tokenizer"]):
+  """OPT running on the Flax implementation from transformers (loaded through ``FlaxAutoModelForCausalLM``)."""
+  __openllm_internal__ = True
+  def import_model(self, *args: t.Any, trust_remote_code: bool = False, **attrs: t.Any) -> bentoml.Model:
+    config, tokenizer = transformers.AutoConfig.from_pretrained(self.model_id), transformers.AutoTokenizer.from_pretrained(self.model_id, **self.llm_parameters[-1])
+    tokenizer.pad_token_id = config.pad_token_id  # reuse the pad token id declared by the model config
+    return bentoml.transformers.save_model(self.tag, transformers.FlaxAutoModelForCausalLM.from_pretrained(self.model_id, trust_remote_code=trust_remote_code, **attrs), custom_objects={"tokenizer": tokenizer}, labels=generate_labels(self))
+  def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_k: int | None = None, num_return_sequences: int | None = None, repetition_penalty: float | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_k": top_k, "num_return_sequences": num_return_sequences, "repetition_penalty": repetition_penalty}, {}
+  def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **attrs: t.Any) -> str:
+    if len(generation_result) == 1: return generation_result[0]  # single sequence: return it verbatim
+    return ("Generated result:\n" + "\n -".join(generation_result)) if self.config.format_outputs else "\n".join(generation_result)
+  def generate(self, prompt: str, **attrs: t.Any) -> list[str]: return self.tokenizer.batch_decode(self.model.generate(**self.tokenizer(prompt, return_tensors="np"), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()).sequences, skip_special_tokens=True)
--- a/openllm-python/src/openllm/models/opt/modeling_opt.py
+++ b/openllm-python/src/openllm/models/opt/modeling_opt.py
@@ -0,0 +1,19 @@
+from __future__ import annotations
+import logging, typing as t, openllm
+from openllm._prompt import process_prompt
+from .configuration_opt import DEFAULT_PROMPT_TEMPLATE
+if t.TYPE_CHECKING: import torch, transformers
+else: torch, transformers = openllm.utils.LazyLoader("torch", globals(), "torch"), openllm.utils.LazyLoader("transformers", globals(), "transformers")
+
+logger = logging.getLogger(__name__)
+class OPT(openllm.LLM["transformers.OPTForCausalLM", "transformers.GPT2Tokenizer"]):
+  __openllm_internal__ = True
+  @property
+  def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: return {"torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32}, {}
+  def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_k: int | None = None, num_return_sequences: int | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_k": top_k, "num_return_sequences": num_return_sequences}, {}
+  def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **attrs: t.Any) -> str:
+    if len(generation_result) == 1: return generation_result[0]
+    if self.config.format_outputs: return "Generated result:\n" + "\n -".join(generation_result)
+    else: return "\n".join(generation_result)
+  def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
+    with torch.inference_mode(): return self.tokenizer.batch_decode(self.model.generate(**self.tokenizer(prompt, return_tensors="pt").to(self.device), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()), skip_special_tokens=True)
--- a/openllm-python/src/openllm/models/opt/modeling_tf_opt.py
+++ b/openllm-python/src/openllm/models/opt/modeling_tf_opt.py
@@ -0,0 +1,21 @@
+from __future__ import annotations
+import logging, typing as t, bentoml, openllm
+from openllm._prompt import process_prompt
+from openllm.utils import generate_labels
+from .configuration_opt import DEFAULT_PROMPT_TEMPLATE
+if t.TYPE_CHECKING: import transformers
+else: transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
+
+logger = logging.getLogger(__name__)
+class TFOPT(openllm.LLM["transformers.TFOPTForCausalLM", "transformers.GPT2Tokenizer"]):
+  __openllm_internal__ = True
+  def import_model(self, *args: t.Any, trust_remote_code: bool = False, **attrs: t.Any) -> bentoml.Model:
+    config, tokenizer = transformers.AutoConfig.from_pretrained(self.model_id), transformers.AutoTokenizer.from_pretrained(self.model_id, **self.llm_parameters[-1])
+    tokenizer.pad_token_id = config.pad_token_id
+    return bentoml.transformers.save_model(self.tag, transformers.TFOPTForCausalLM.from_pretrained(self.model_id, trust_remote_code=trust_remote_code, **attrs), custom_objects={"tokenizer": tokenizer}, labels=generate_labels(self))
+  def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_k: int | None = None, num_return_sequences: int | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_k": top_k, "num_return_sequences": num_return_sequences}, {}
+  def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **attrs: t.Any) -> str:
+    if len(generation_result) == 1: return generation_result[0]
+    if self.config.format_outputs: return "Generated result:\n" + "\n -".join(generation_result)
+    else: return "\n".join(generation_result)
+  def generate(self, prompt: str, **attrs: t.Any) -> list[str]: return self.tokenizer.batch_decode(self.model.generate(**self.tokenizer(prompt, return_tensors="tf"), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()), skip_special_tokens=True)
--- a/openllm-python/src/openllm/models/opt/modeling_vllm_opt.py
+++ b/openllm-python/src/openllm/models/opt/modeling_vllm_opt.py
@@ -0,0 +1,11 @@
+from __future__ import annotations
+import logging, typing as t, openllm
+from openllm._prompt import process_prompt
+from .configuration_opt import DEFAULT_PROMPT_TEMPLATE
+if t.TYPE_CHECKING: import vllm, transformers
+
+logger = logging.getLogger(__name__)
+class VLLMOPT(openllm.LLM["vllm.LLMEngine", "transformers.GPT2Tokenizer"]):
+  __openllm_internal__ = True
+  tokenizer_id = "local"
+  def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_k: int | None = None, num_return_sequences: int | None = None, use_default_prompt_template: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_k": top_k, "num_return_sequences": num_return_sequences}, {}
--- a/openllm-python/src/openllm/models/stablelm/init.py
+++ b/openllm-python/src/openllm/models/stablelm/init.py
@@ -0,0 +1,26 @@
+from __future__ import annotations
+import sys, typing as t
+from openllm.exceptions import MissingDependencyError
+from openllm.utils import LazyModule, is_torch_available, is_vllm_available
+
+_import_structure: dict[str, list[str]] = {"configuration_stablelm": ["StableLMConfig", "START_STABLELM_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"]}
+if t.TYPE_CHECKING:
+  from .configuration_stablelm import (
+    DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE,
+    START_STABLELM_COMMAND_DOCSTRING as START_STABLELM_COMMAND_DOCSTRING,
+    StableLMConfig as StableLMConfig,
+  )
+try:
+  if not is_torch_available(): raise MissingDependencyError
+except MissingDependencyError: pass
+else:
+  _import_structure["modeling_stablelm"] = ["StableLM"]
+  if t.TYPE_CHECKING: from .modeling_stablelm import StableLM as StableLM
+try:
+  if not is_vllm_available(): raise MissingDependencyError
+except MissingDependencyError: pass
+else:
+  _import_structure["modeling_vllm_stablelm"] = ["VLLMStableLM"]
+  if t.TYPE_CHECKING: from .modeling_vllm_stablelm import VLLMStableLM as VLLMStableLM
+
+sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure)
--- a/openllm-python/src/openllm/models/stablelm/configuration_stablelm.py
+++ b/openllm-python/src/openllm/models/stablelm/configuration_stablelm.py
@@ -0,0 +1,51 @@
+from __future__ import annotations
+import openllm
+
+class StableLMConfig(openllm.LLMConfig):
+  """StableLM-Base-Alpha is a suite of 3B and 7B parameter decoder-only language models.
+
+  It is pre-trained on a diverse collection of English datasets with a sequence
+  length of 4096 to push beyond the context window limitations of existing open-source language models.
+
+  StableLM-Tuned-Alpha is a suite of 3B and 7B parameter decoder-only language models
+  built on top of the StableLM-Base-Alpha models and further fine-tuned on various chat and
+  instruction-following datasets.
+
+  Refer to [StableLM-tuned's model card](https://huggingface.co/stabilityai/stablelm-tuned-alpha-7b)
+  and [StableLM-base's model card](https://huggingface.co/stabilityai/stablelm-base-alpha-7b)
+  for more information.
+  """
+  __config__ = {"name_type": "lowercase", "url": "https://github.com/Stability-AI/StableLM", "architecture": "GPTNeoXForCausalLM",
+                "default_id": "stabilityai/stablelm-tuned-alpha-3b", "model_ids": ["stabilityai/stablelm-tuned-alpha-3b", "stabilityai/stablelm-tuned-alpha-7b", "stabilityai/stablelm-base-alpha-3b", "stabilityai/stablelm-base-alpha-7b"]}
+  class GenerationConfig:
+    temperature: float = 0.9
+    max_new_tokens: int = 128
+    top_k: int = 0
+    top_p: float = 0.9
+
+START_STABLELM_COMMAND_DOCSTRING = """\
+Run a LLMServer for StableLM model.
+
+\b
+> See more information about StableLM at [stabilityai/stablelm-base-alpha-3b](https://huggingface.co/stabilityai/stablelm-base-alpha-3b)
+
+\b
+## Usage
+
+Currently, StableLM only supports PyTorch. Make sure ``torch`` is available in your system.
+
+\b
+StableLM Runner will use stabilityai/stablelm-tuned-alpha-3b as the default model. To change to any other StableLM
+saved pretrained, or a fine-tune StableLM, provide ``OPENLLM_STABLELM_MODEL_ID='stabilityai/stablelm-tuned-alpha-3b'``
+or provide `--model-id` flag when running ``openllm start stablelm``:
+
+\b
+$ openllm start stablelm --model-id 'stabilityai/stablelm-tuned-alpha-3b'
+"""
+SYSTEM_PROMPT = """<|SYSTEM|># StableLM Tuned (Alpha version)
+- StableLM is a helpful and harmless open-source AI language model developed by StabilityAI.
+- StableLM is excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user.
+- StableLM is more than just an information source, StableLM is also able to write poetry, short stories, and make jokes.
+- StableLM will refuse to participate in anything that could harm a human.
+"""
+DEFAULT_PROMPT_TEMPLATE = """{system_prompt}<|USER|>{instruction}<|ASSISTANT|>"""
--- a/openllm-python/src/openllm/models/stablelm/modeling_stablelm.py
+++ b/openllm-python/src/openllm/models/stablelm/modeling_stablelm.py
@@ -0,0 +1,23 @@
+from __future__ import annotations
+import logging, typing as t, openllm
+from openllm._prompt import process_prompt
+from .configuration_stablelm import DEFAULT_PROMPT_TEMPLATE, SYSTEM_PROMPT
+
+if t.TYPE_CHECKING: import transformers, torch
+else: transformers, torch = openllm.utils.LazyLoader("transformers", globals(), "transformers"), openllm.utils.LazyLoader("torch", globals(), "torch")
+
+logger = logging.getLogger(__name__)
+class StableLM(openllm.LLM["transformers.GPTNeoXForCausalLM", "transformers.GPTNeoXTokenizerFast"]):
+  __openllm_internal__ = True
+  def llm_post_init(self) -> None: self.bettertransformer = True if not torch.cuda.is_available() else False
+  @property
+  def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: return {"torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32}, {}
+  def sanitize_parameters(self, prompt: str, temperature: float | None = None, max_new_tokens: int | None = None, top_k: int | None = None, top_p: float | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
+    if "tuned" in self._model_id and use_default_prompt_template:
+      system_prompt = attrs.pop("system_prompt", SYSTEM_PROMPT)
+      prompt_text = process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, system_prompt=system_prompt, **attrs)
+    else: prompt_text = prompt
+    return prompt_text, {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_k": top_k, "top_p": top_p}, {}
+  def postprocess_generate(self, prompt: str, generation_result: list[str], **_: t.Any) -> str: return generation_result[0]
+  def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
+    with torch.inference_mode(): return [self.tokenizer.decode(self.model.generate(**self.tokenizer(prompt, return_tensors="pt").to(self.device), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config(), pad_token_id=self.tokenizer.eos_token_id, stopping_criteria=openllm.StoppingCriteriaList([openllm.StopOnTokens()]))[0], skip_special_tokens=True)]
--- a/openllm-python/src/openllm/models/stablelm/modeling_vllm_stablelm.py
+++ b/openllm-python/src/openllm/models/stablelm/modeling_vllm_stablelm.py
@@ -0,0 +1,16 @@
+from __future__ import annotations
+import logging, typing as t, openllm
+from openllm._prompt import process_prompt
+from .configuration_stablelm import DEFAULT_PROMPT_TEMPLATE, SYSTEM_PROMPT
+if t.TYPE_CHECKING: import vllm, transformers
+
+logger = logging.getLogger(__name__)
+class VLLMStableLM(openllm.LLM["vllm.LLMEngine", "transformers.GPTNeoXTokenizerFast"]):
+  __openllm_internal__ = True
+  tokenizer_id = "local"
+  def sanitize_parameters(self, prompt: str, temperature: float | None = None, max_new_tokens: int | None = None, top_k: int | None = None, top_p: float | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
+    if "tuned" in self._model_id and use_default_prompt_template:
+      system_prompt = attrs.pop("system_prompt", SYSTEM_PROMPT)
+      prompt_text = process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, system_prompt=system_prompt, **attrs)
+    else: prompt_text = prompt
+    return prompt_text, {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_k": top_k, "top_p": top_p}, {}
--- a/openllm-python/src/openllm/models/starcoder/init.py
+++ b/openllm-python/src/openllm/models/starcoder/init.py
@@ -0,0 +1,26 @@
+from __future__ import annotations
+import sys, typing as t
+from openllm.exceptions import MissingDependencyError
+from openllm.utils import LazyModule, is_torch_available, is_vllm_available
+
+_import_structure: dict[str, list[str]] = {"configuration_starcoder": ["StarCoderConfig", "START_STARCODER_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"]}
+if t.TYPE_CHECKING:
+  from .configuration_starcoder import (
+    DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE,
+    START_STARCODER_COMMAND_DOCSTRING as START_STARCODER_COMMAND_DOCSTRING,
+    StarCoderConfig as StarCoderConfig,
+  )
+try:
+  if not is_torch_available(): raise MissingDependencyError
+except MissingDependencyError: pass
+else:
+  _import_structure["modeling_starcoder"] = ["StarCoder"]
+  if t.TYPE_CHECKING: from .modeling_starcoder import StarCoder as StarCoder
+try:
+  if not is_vllm_available(): raise MissingDependencyError
+except MissingDependencyError: pass
+else:
+  _import_structure["modeling_vllm_starcoder"] = ["VLLMStarCoder"]
+  if t.TYPE_CHECKING: from .modeling_vllm_starcoder import VLLMStarCoder as VLLMStarCoder
+
+sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure)
--- a/openllm-python/src/openllm/models/starcoder/configuration_starcoder.py
+++ b/openllm-python/src/openllm/models/starcoder/configuration_starcoder.py
@@ -0,0 +1,45 @@
+from __future__ import annotations
+
+import openllm
+
+class StarCoderConfig(openllm.LLMConfig):
+  """The StarCoder models are 15.5B parameter models trained on 80+ programming languages from [The Stack (v1.2)](https://huggingface.co/datasets/bigcode/the-stack), with opt-out requests excluded.
+
+  The model uses [Multi Query Attention](https://arxiv.org/abs/1911.02150),
+  [a context window of 8192 tokens](https://arxiv.org/abs/2205.14135), and was trained using the
+  [Fill-in-the-Middle](https://arxiv.org/abs/2207.14255) objective on 1 trillion tokens.
+
+  Refer to [StarCoder's model card](https://huggingface.co/bigcode/starcoder) for more information.
+  """
+  __config__ = {"name_type": "lowercase", "requires_gpu": True, "url": "https://github.com/bigcode-project/starcoder", "architecture": "GPTBigCodeForCausalLM", "requirements": ["bitsandbytes"], "workers_per_resource": 0.5,
+                "default_id": "bigcode/starcoder", "model_ids": ["bigcode/starcoder", "bigcode/starcoderbase"]}
+  class GenerationConfig:
+    temperature: float = 0.2
+    max_new_tokens: int = 256
+    min_new_tokens: int = 32
+    top_k: int = 50  # a token count, so an integer like top_k in the other model configs
+    top_p: float = 0.95
+    pad_token_id: int = 49152
+    repetition_penalty: float = 1.2
+
+START_STARCODER_COMMAND_DOCSTRING = """\
+Run a LLMServer for StarCoder model.
+
+\b
+> See more information about StarCoder at [bigcode/starcoder](https://huggingface.co/bigcode/starcoder)
+
+\b
+## Usage
+
+Currently, StarCoder only supports PyTorch. Make sure ``torch`` is available in your system.
+
+\b
+StarCoder Runner will use bigcode/starcoder as the default model. To change to any other StarCoder
+saved pretrained, or a fine-tune StarCoder, provide ``OPENLLM_STARCODER_MODEL_ID='bigcode/starcoder'``
+or provide `--model-id` flag when running ``openllm start starcoder``:
+
+\b
+$ openllm start starcoder --model-id 'bigcode/starcoder'
+"""
+DEFAULT_PROMPT_TEMPLATE = """{instruction}"""
+FIM_PREFIX, FIM_MIDDLE, FIM_SUFFIX, FIM_PAD, EOD, FIM_INDICATOR = "<fim-prefix>", "<fim-middle>", "<fim-suffix>", "<fim-pad>", "<|endoftext|>", "<FILL_HERE>"
--- a/openllm-python/src/openllm/models/starcoder/modeling_starcoder.py
+++ b/openllm-python/src/openllm/models/starcoder/modeling_starcoder.py
@@ -0,0 +1,47 @@
+from __future__ import annotations
+import logging, typing as t, bentoml, openllm
+from openllm.utils import generate_labels
+from .configuration_starcoder import EOD, FIM_INDICATOR, FIM_MIDDLE, FIM_PAD, FIM_PREFIX, FIM_SUFFIX
+if t.TYPE_CHECKING: import torch, transformers
+else: torch, transformers = openllm.utils.LazyLoader("torch", globals(), "torch"), openllm.utils.LazyLoader("transformers", globals(), "transformers")
+
+logger = logging.getLogger(__name__)
+class StarCoder(openllm.LLM["transformers.GPTBigCodeForCausalLM", "transformers.GPT2TokenizerFast"]):
+  """StarCoder on the PyTorch backend, with fill-in-the-middle (FIM) prompt rewriting."""
+  __openllm_internal__ = True
+  @property
+  def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: return {"device_map": "auto" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None, "torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32}, {}
+  def import_model(self, *args: t.Any, trust_remote_code: bool = False, **attrs: t.Any) -> bentoml.Model:
+    torch_dtype, device_map = attrs.pop("torch_dtype", torch.float16), attrs.pop("device_map", "auto")
+    tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_id, **self.llm_parameters[-1])
+    tokenizer.add_special_tokens({"additional_special_tokens": [EOD, FIM_PREFIX, FIM_MIDDLE, FIM_SUFFIX, FIM_PAD], "pad_token": EOD})
+    model = transformers.AutoModelForCausalLM.from_pretrained(self.model_id, torch_dtype=torch_dtype, device_map=device_map, **attrs)
+    try: return bentoml.transformers.save_model(self.tag, model, custom_objects={"tokenizer": tokenizer}, labels=generate_labels(self))
+    finally: torch.cuda.empty_cache()  # runs whether or not saving succeeded
+  def sanitize_parameters(self, prompt: str, temperature: float | None = None, top_p: float | None = None, max_new_tokens: int | None = None, repetition_penalty: float | None = None, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
+    """Rewrite a prompt containing FIM_INDICATOR into prefix/suffix/middle form and return (prompt_text, generation kwargs, {}); raises ValueError on more than one indicator."""
+    fim_mode, prefix, suffix = FIM_INDICATOR in prompt, None, None
+    if fim_mode:
+      try: prefix, suffix = prompt.split(FIM_INDICATOR)
+      except ValueError as err: raise ValueError(f"Only one {FIM_INDICATOR} allowed in prompt") from err  # split produced more than two parts
+      prompt_text = f"{FIM_PREFIX}{prefix}{FIM_SUFFIX}{suffix}{FIM_MIDDLE}"
+    else: prompt_text = prompt
+    # XXX: This value for pad_token_id is currently a hack; need to investigate why the default starcoder doesn't include the same value as santacoder EOD
+    return prompt_text, {"temperature": temperature, "top_p": top_p, "max_new_tokens": max_new_tokens, "repetition_penalty": repetition_penalty, "pad_token_id": 49152, **attrs}, {}
+  def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str: return generation_result[0]
+  def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
+    """Sample (do_sample=True) a completion for ``prompt`` and return one decoded string per generated sequence."""
+    with torch.inference_mode():
+      # NOTE: to support fine-tuned starcoder, pass eos_token_id=self.tokenizer.convert_tokens_to_ids("<|end|>")
+      result_tensor = self.model.generate(self.tokenizer.encode(prompt, return_tensors="pt").to(self.device), do_sample=True, pad_token_id=self.tokenizer.eos_token_id, generation_config=self.config.model_construct_env(**attrs).to_generation_config())
+      # TODO: We will probably want to return the tokenizer here so that we can manually process this (skip_special_tokens=False, clean_up_tokenization_spaces=False)
+      return self.tokenizer.batch_decode(result_tensor, skip_special_tokens=True, clean_up_tokenization_spaces=True)  # decode whole sequences; passing result_tensor[0] would decode token by token
+  def generate_one(self, prompt: str, stop: list[str], **preprocess_generate_kwds: t.Any) -> list[dict[t.Literal["generated_text"], str]]:
+    """Generate with a stop-sequence criterion appended and return only the text produced after the prompt, with a trailing stop sequence trimmed."""
+    max_new_tokens, encoded_inputs = preprocess_generate_kwds.pop("max_new_tokens", 200), self.tokenizer(prompt, return_tensors="pt").to(self.device)
+    src_len, stopping_criteria = encoded_inputs["input_ids"].shape[1], preprocess_generate_kwds.pop("stopping_criteria", openllm.StoppingCriteriaList([]))
+    stopping_criteria.append(openllm.StopSequenceCriteria(stop, self.tokenizer))
+    result = self.tokenizer.decode(self.model.generate(encoded_inputs["input_ids"], max_new_tokens=max_new_tokens, stopping_criteria=stopping_criteria)[0].tolist()[src_len:])
+    for stop_seq in stop:  # Inference API returns the stop sequence, so trim it; an empty stop_seq is skipped because result[:-0] would erase the whole result
+      if stop_seq and result.endswith(stop_seq): result = result[:-len(stop_seq)]
+    return [{"generated_text": result}]
--- a/openllm-python/src/openllm/models/starcoder/modeling_vllm_starcoder.py
+++ b/openllm-python/src/openllm/models/starcoder/modeling_vllm_starcoder.py
@@ -0,0 +1,19 @@
+from __future__ import annotations
+import logging, typing as t, openllm
+from .configuration_starcoder import EOD, FIM_INDICATOR, FIM_MIDDLE, FIM_PAD, FIM_PREFIX, FIM_SUFFIX
+if t.TYPE_CHECKING: import vllm, transformers
+
+logger = logging.getLogger(__name__)
+class VLLMStarCoder(openllm.LLM["vllm.LLMEngine", "transformers.GPT2TokenizerFast"]):
+  __openllm_internal__ = True
+  tokenizer_id = "local"
+  def sanitize_parameters(self, prompt: str, temperature: float | None = None, top_p: float | None = None, max_new_tokens: int | None = None, repetition_penalty: float | None = None, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
+    """Rewrite a prompt containing FIM_INDICATOR into prefix/suffix/middle form and return (prompt_text, generation kwargs, {}); raises ValueError on more than one indicator."""
+    fim_mode, prefix, suffix = FIM_INDICATOR in prompt, None, None
+    if fim_mode:
+      try: prefix, suffix = prompt.split(FIM_INDICATOR)
+      except ValueError as err: raise ValueError(f"Only one {FIM_INDICATOR} allowed in prompt") from err  # split produced more than two parts
+      prompt_text = f"{FIM_PREFIX}{prefix}{FIM_SUFFIX}{suffix}{FIM_MIDDLE}"
+    else: prompt_text = prompt
+    # XXX: This value for pad_token_id is currently a hack; need to investigate why the default starcoder doesn't include the same value as santacoder EOD
+    return prompt_text, {"temperature": temperature, "top_p": top_p, "max_new_tokens": max_new_tokens, "repetition_penalty": repetition_penalty, "pad_token_id": 49152, **attrs}, {}
--- a/openllm-python/src/openllm/playground/README.md
+++ b/openllm-python/src/openllm/playground/README.md
@@ -0,0 +1,14 @@
+This folder is a playground and the source of truth for feature
+development around OpenLLM.
+
+```bash
+openllm playground
+```
+
+## Usage for developing this module
+
+Write a Python script that `jupytext` can convert to a Jupyter
+notebook via `openllm playground`.
+
+Make sure to add the new file and its documentation into
+[`_meta.yml`](./_meta.yml).
--- a/openllm-python/src/openllm/playground/init.py
+++ b/openllm-python/src/openllm/playground/init.py
--- a/openllm-python/src/openllm/playground/_meta.yml
+++ b/openllm-python/src/openllm/playground/_meta.yml
@@ -0,0 +1,50 @@
+# Copyright 2023 BentoML Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+features:
+  description: |
+    ## General introduction to OpenLLM.
+
+    This script will demo a few features from OpenLLM:
+
+    - Usage of Auto class abstraction and run prediction with `generate`
+    - Ability to send per-requests parameters
+    - Runner integration with BentoML
+opt_tuned:
+  description: |
+    ## Fine tuning OPT
+
+    This script demonstrates how one can easily fine-tune OPT
+    with [LoRa](https://arxiv.org/abs/2106.09685) and in int8 with bitsandbytes.
+
+    It is based on one of the PEFT example fine-tuning scripts.
+    It requires at least one GPU to be available, so make sure to have it.
+falcon_tuned:
+  description: |
+    ## Fine tuning Falcon
+
+    This script demonstrates how one can fine-tune Falcon using [QLoRa](https://arxiv.org/pdf/2305.14314.pdf)
+    and [trl](https://github.com/lvwerra/trl).
+
+    It is trained using OpenAssistant's Guanaco [dataset](https://huggingface.co/datasets/timdettmers/openassistant-guanaco)
+
+    It requires at least one GPU to be available, so make sure to have it.
+llama2_qlora:
+  description: |
+    ## Fine tuning LlaMA 2
+    This script demonstrates how one can fine-tune LlaMA 2 using LoRA with [trl](https://github.com/lvwerra/trl)
+
+    It is trained using the [Dolly datasets](https://huggingface.co/datasets/databricks/databricks-dolly-15k)
+
+    It requires at least one GPU to be available.
--- a/openllm-python/src/openllm/playground/falcon_tuned.py
+++ b/openllm-python/src/openllm/playground/falcon_tuned.py
@@ -0,0 +1,74 @@
+from __future__ import annotations
+import dataclasses
+import logging
+import os
+import sys
+import typing as t
+
+import torch
+import transformers
+
+# import openllm here for OPENLLMDEVDEBUG
+import openllm
+
+# Make sure to have at least one GPU to run this script
+
+openllm.utils.configure_logging()
+
+logger = logging.getLogger(__name__)
+
+# On notebook, make sure to install the following
+# ! pip install -U openllm[fine-tune] @ git+https://github.com/bentoml/OpenLLM.git
+
+from datasets import load_dataset
+from trl import SFTTrainer
+
+DEFAULT_MODEL_ID = "ybelkada/falcon-7b-sharded-bf16"
+DATASET_NAME = "timdettmers/openassistant-guanaco"
+
+@dataclasses.dataclass
+class TrainingArguments:
+  per_device_train_batch_size: int = dataclasses.field(default=4)
+  gradient_accumulation_steps: int = dataclasses.field(default=4)
+  optim: str = dataclasses.field(default="paged_adamw_32bit")
+  save_steps: int = dataclasses.field(default=10)
+  warmup_steps: int = dataclasses.field(default=10)
+  max_steps: int = dataclasses.field(default=500)
+  logging_steps: int = dataclasses.field(default=10)
+  learning_rate: float = dataclasses.field(default=2e-4)
+  max_grad_norm: float = dataclasses.field(default=0.3)
+  warmup_ratio: float = dataclasses.field(default=0.03)
+  fp16: bool = dataclasses.field(default=True)
+  group_by_length: bool = dataclasses.field(default=True)
+  lr_scheduler_type: str = dataclasses.field(default="constant")
+  output_dir: str = dataclasses.field(default=os.path.join(os.getcwd(), "outputs", "falcon"))
+
+@dataclasses.dataclass
+class ModelArguments:
+  model_id: str = dataclasses.field(default=DEFAULT_MODEL_ID)
+  max_sequence_length: int = dataclasses.field(default=512)
+
+# Parse (ModelArguments, TrainingArguments) either from a single JSON file or from CLI flags.
+parser = transformers.HfArgumentParser((ModelArguments, TrainingArguments))
+if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
+  # If we pass only one argument to the script and it's the path to a json file,
+  # let's parse it to get our arguments.
+  model_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
+else:
+  model_args, training_args = t.cast(t.Tuple[ModelArguments, TrainingArguments], parser.parse_args_into_dataclasses())
+
+# Load Falcon quantized to int4 (nf4, float16 compute) and wrap it with a LoRA adapter
+# targeting the listed attention/MLP projection modules.
+model, tokenizer = openllm.AutoLLM.for_model("falcon", model_id=model_args.model_id, quantize="int4", bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.float16, ensure_available=True,).prepare_for_training(adapter_type="lora", lora_alpha=16, lora_dropout=0.1, r=16, bias="none", target_modules=["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h",],)
+model.config.use_cache = False
+# Reuse the EOS token as the padding token.
+tokenizer.pad_token = tokenizer.eos_token
+
+dataset = load_dataset(DATASET_NAME, split="train")
+
+# The script-level TrainingArguments dataclass is converted into a transformers.TrainingArguments
+# by overriding the fields of a default instance with `dataclasses.replace`.
+trainer = SFTTrainer(model=model, train_dataset=dataset, dataset_text_field="text", max_seq_length=model_args.max_sequence_length, tokenizer=tokenizer, args=dataclasses.replace(transformers.TrainingArguments(training_args.output_dir), **dataclasses.asdict(training_args),),)
+
+# upcast layernorm in float32 for more stable training
+for name, module in trainer.model.named_modules():
+  if "norm" in name:
+    # NOTE(review): the rebinding of `module` is not used afterwards; this relies on `.to()` converting
+    # the module in place -- confirm.
+    module = module.to(torch.float32)
+
+trainer.train()
+
+# Save the trained model under <output_dir>/lora.
+trainer.model.save_pretrained(os.path.join(training_args.output_dir, "lora"))
--- a/openllm-python/src/openllm/playground/features.py
+++ b/openllm-python/src/openllm/playground/features.py
@@ -0,0 +1,55 @@
+from __future__ import annotations
+import argparse
+import logging
+import typing as t
+
+import openllm
+
+openllm.utils.configure_logging()
+
+logger = logging.getLogger(__name__)
+
+MAX_NEW_TOKENS = 384
+
+Q = "Answer the following question, step by step:\n{q}\nA:"
+question = "What is the meaning of life?"
+
+def main() -> int:
+  parser = argparse.ArgumentParser()
+  parser.add_argument("question", default=question)
+
+  if openllm.utils.in_notebook():
+    args = parser.parse_args(args=[question])
+  else:
+    args = parser.parse_args()
+
+  model = openllm.AutoLLM.for_model("opt", model_id="facebook/opt-2.7b", ensure_available=True)
+  prompt = Q.format(q=args.question)
+
+  logger.info("-" * 50, "Running with 'generate()'", "-" * 50)
+  res = model.generate(prompt, max_new_tokens=MAX_NEW_TOKENS)
+  logger.info("=" * 10, "Response:", model.postprocess_generate(prompt, res))
+
+  logger.info("-" * 50, "Running with 'generate()' with per-requests argument", "-" * 50)
+  res = model.generate(prompt, num_return_sequences=3)
+  logger.info("=" * 10, "Response:", model.postprocess_generate(prompt, res))
+
+  logger.info("-" * 50, "Using Runner abstraction with runner.generate.run()", "-" * 50)
+  r = openllm.Runner("opt", model_id="facebook/opt-350m", init_local=True)
+  res = r.generate.run(prompt)
+  logger.info("=" * 10, "Response:", r.llm.postprocess_generate(prompt, res))
+
+  logger.info("-" * 50, "Using Runner abstraction with runner()", "-" * 50)
+  res = r(prompt)
+  logger.info("=" * 10, "Response:", r.llm.postprocess_generate(prompt, res))
+
+  return 0
+
+def _mp_fn(index: t.Any):  # noqa # type: ignore
+  # For xla_spawn (TPUs)
+  # `index` is accepted but not used; the function simply runs `main()`.
+  main()
+
+# Module-level entry point: in a notebook just run the demo, otherwise exit the
+# process with the status code returned by `main()`.
+if openllm.utils.in_notebook():
+  main()
+else:
+  raise SystemExit(main())
--- a/openllm-python/src/openllm/playground/llama2_qlora.py
+++ b/openllm-python/src/openllm/playground/llama2_qlora.py
@@ -0,0 +1,195 @@
+from __future__ import annotations
+import dataclasses
+import logging
+import os
+import sys
+import typing as t
+
+import torch
+import transformers
+
+# import openllm here for OPENLLMDEVDEBUG
+import openllm
+
+if t.TYPE_CHECKING:
+  import peft
+
+# Make sure to have at least one GPU to run this script
+
+openllm.utils.configure_logging()
+
+logger = logging.getLogger(__name__)
+
+# On notebook, make sure to install the following
+# ! pip install -U openllm[fine-tune] @ git+https://github.com/bentoml/OpenLLM.git
+
+from functools import partial
+from itertools import chain
+from random import randint, randrange
+
+import bitsandbytes as bnb
+from datasets import load_dataset
+
+# COPIED FROM https://github.com/artidoro/qlora/blob/main/qlora.py
+def find_all_linear_names(model):
+  lora_module_names = set()
+  for name, module in model.named_modules():
+    if isinstance(module, bnb.nn.Linear4bit):
+      names = name.split(".")
+      lora_module_names.add(names[0] if len(names) == 1 else names[-1])
+
+  if "lm_head" in lora_module_names:  # needed for 16-bit
+    lora_module_names.remove("lm_head")
+  return list(lora_module_names)
+
+# Change this to the local converted path if you don't have access to the meta-llama model
+DEFAULT_MODEL_ID = "meta-llama/Llama-2-7b-hf"
+# change this to 'main' if you want to use the latest llama
+DEFAULT_MODEL_VERSION = "335a02887eb6684d487240bbc28b5699298c3135"
+DATASET_NAME = "databricks/databricks-dolly-15k"
+
+def format_dolly(sample):
+  instruction = f"### Instruction\n{sample['instruction']}"
+  context = f"### Context\n{sample['context']}" if len(sample["context"]) > 0 else None
+  response = f"### Answer\n{sample['response']}"
+  # join all the parts together
+  prompt = "\n\n".join([i for i in [instruction, context, response] if i is not None])
+  return prompt
+
+# template dataset to add prompt to each sample
+def template_dataset(sample, tokenizer):
+  sample["text"] = f"{format_dolly(sample)}{tokenizer.eos_token}"
+  return sample
+
+# empty list to save remainder from batches to use in next batch
+remainder = {"input_ids": [], "attention_mask": [], "token_type_ids": []}
+
+def chunk(sample, chunk_length=2048):
+  # define global remainder variable to save remainder from batches to use in next batch
+  global remainder
+  # Concatenate all texts and add remainder from previous batch
+  concatenated_examples = {k: list(chain(*sample[k])) for k in sample.keys()}
+  concatenated_examples = {k: remainder[k] + concatenated_examples[k] for k in concatenated_examples.keys()}
+  # get total number of tokens for batch
+  batch_total_length = len(concatenated_examples[next(iter(sample.keys()))])
+
+  # get max number of chunks for batch
+  if batch_total_length >= chunk_length:
+    batch_chunk_length = (batch_total_length//chunk_length) * chunk_length
+
+  # Split by chunks of max_len.
+  result = {k: [t[i:i + chunk_length] for i in range(0, batch_chunk_length, chunk_length)] for k, t in concatenated_examples.items()}
+  # add remainder to global variable for next batch
+  remainder = {k: concatenated_examples[k][batch_chunk_length:] for k in concatenated_examples.keys()}
+  # prepare labels
+  result["labels"] = result["input_ids"].copy()
+  return result
+
+def prepare_datasets(tokenizer, dataset_name=DATASET_NAME):
+  # Load dataset from the hub
+  dataset = load_dataset(dataset_name, split="train")
+
+  print(f"dataset size: {len(dataset)}")
+  print(dataset[randrange(len(dataset))])
+
+  # apply prompt template per sample
+  dataset = dataset.map(partial(template_dataset, tokenizer=tokenizer), remove_columns=list(dataset.features))
+  # print random sample
+  print("Sample from dolly-v2 ds:", dataset[randint(0, len(dataset))]["text"])
+
+  # tokenize and chunk dataset
+  lm_dataset = dataset.map(lambda sample: tokenizer(sample["text"]), batched=True, remove_columns=list(dataset.features)).map(partial(chunk, chunk_length=2048), batched=True,)
+
+  # Print total number of samples
+  print(f"Total number of samples: {len(lm_dataset)}")
+  return lm_dataset
+
+def prepare_for_int4_training(model_id: str, model_version: str | None = None, gradient_checkpointing: bool = True, bf16: bool = True,) -> tuple[peft.PeftModel, transformers.LlamaTokenizerFast]:
+  """Load a llama model quantized to int4 and prepare it for LoRA training.
+
+  Args:
+    model_id: Model identifier forwarded to ``openllm.AutoLLM.for_model``.
+    model_version: Optional model version forwarded to ``openllm.AutoLLM.for_model``.
+    gradient_checkpointing: Forwarded as ``use_gradient_checkpointing``; the model's ``use_cache``
+      is set to its negation.
+    bf16: When true, LoRA layers and float32 ``lm_head``/``embed_tokens`` weights are cast to bfloat16.
+
+  Returns:
+    The ``(model, tokenizer)`` pair returned by ``llm.prepare_for_training``.
+  """
+  from peft.tuners.lora import LoraLayer
+
+  llm = openllm.AutoLLM.for_model("llama", model_id=model_id, model_version=model_version, ensure_available=True, quantize="int4", bnb_4bit_compute_dtype=torch.bfloat16, use_cache=not gradient_checkpointing, device_map="auto",)
+  print("Model summary:", llm.model)
+
+  # get lora target modules
+  modules = find_all_linear_names(llm.model)
+  print(f"Found {len(modules)} modules to quantize: {modules}")
+
+  model, tokenizer = llm.prepare_for_training(adapter_type="lora", use_gradient_checkpointing=gradient_checkpointing, target_modules=modules)
+
+  # pre-process the model: cast LoRA layers to bfloat16 (when bf16 is set), upcast the layer norms
+  # to float32, and cast float32 lm_head/embed_tokens weights to bfloat16 (when bf16 is set)
+  # NOTE(review): the rebinding of `module` is not used afterwards; this relies on `.to()` converting
+  # the module in place -- confirm.
+  for name, module in model.named_modules():
+    if isinstance(module, LoraLayer):
+      if bf16:
+        module = module.to(torch.bfloat16)
+    if "norm" in name:
+      module = module.to(torch.float32)
+    if "lm_head" in name or "embed_tokens" in name:
+      if hasattr(module, "weight"):
+        if bf16 and module.weight.dtype == torch.float32:
+          module = module.to(torch.bfloat16)
+  return model, tokenizer
+
+@dataclasses.dataclass
+class TrainingArguments:
+  per_device_train_batch_size: int = dataclasses.field(default=1)
+  gradient_checkpointing: bool = dataclasses.field(default=True)
+  bf16: bool = dataclasses.field(default=torch.cuda.get_device_capability()[0] == 8)
+  learning_rate: float = dataclasses.field(default=5e-5)
+  num_train_epochs: int = dataclasses.field(default=3)
+  logging_steps: int = dataclasses.field(default=1)
+  report_to: str = dataclasses.field(default="none")
+  output_dir: str = dataclasses.field(default=os.path.join(os.getcwd(), "outputs", "llama"))
+  save_strategy: str = dataclasses.field(default="no")
+
+@dataclasses.dataclass
+class ModelArguments:
+  model_id: str = dataclasses.field(default=DEFAULT_MODEL_ID)
+  model_version: str = dataclasses.field(default=DEFAULT_MODEL_VERSION)
+  seed: int = dataclasses.field(default=42)
+  merge_weights: bool = dataclasses.field(default=False)
+
+if openllm.utils.in_notebook():
+  model_args, training_rags = ModelArguments(), TrainingArguments()
+else:
+  parser = transformers.HfArgumentParser((ModelArguments, TrainingArguments))
+  if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
+    # If we pass only one argument to the script and it's the path to a json file,
+    # let's parse it to get our arguments.
+    model_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
+  else:
+    model_args, training_args = t.cast(t.Tuple[ModelArguments, TrainingArguments], parser.parse_args_into_dataclasses())
+
+# import the model first hand
+openllm.import_model("llama", model_id=model_args.model_id, model_version=model_args.model_version)
+
+def train_loop(model_args: ModelArguments, training_args: TrainingArguments):
+  import peft
+
+  transformers.set_seed(model_args.seed)
+
+  model, tokenizer = prepare_for_int4_training(model_args.model_id, gradient_checkpointing=training_args.gradient_checkpointing, bf16=training_args.bf16,)
+  datasets = prepare_datasets(tokenizer)
+
+  trainer = transformers.Trainer(model=model, args=dataclasses.replace(transformers.TrainingArguments(training_args.output_dir), **dataclasses.asdict(training_args)), train_dataset=datasets, data_collator=transformers.default_data_collator,)
+
+  trainer.train()
+
+  if model_args.merge_weights:
+    # note that this will requires larger GPU as we will load the whole model into memory
+
+    # merge adapter weights with base model and save
+    # save int4 model
+    trainer.model.save_pretrained(training_args.output_dir, safe_serialization=False)
+
+    # gc mem
+    del model, trainer
+    torch.cuda.empty_cache()
+
+    model = peft.AutoPeftModelForCausalLM.from_pretrained(training_args.output_dir, low_cpu_mem_usage=True, torch_dtype=torch.float16)
+    # merge lora with base weights and save
+    model = model.merge_and_unload()
+    model.save_pretrained(os.path.join(os.getcwd(), "outputs", "merged_llama_lora"), safe_serialization=True, max_shard_size="2GB")
+  else:
+    trainer.model.save_pretrained(os.path.join(training_args.output_dir, "lora"))
+
+train_loop(model_args, training_args)
--- a/openllm-python/src/openllm/playground/opt_tuned.py
+++ b/openllm-python/src/openllm/playground/opt_tuned.py
@@ -0,0 +1,66 @@
+from __future__ import annotations
+import dataclasses
+import logging
+import os
+import sys
+import typing as t
+
+import transformers
+
+# import openllm here for OPENLLMDEVDEBUG
+import openllm
+
+# Make sure to have at least one GPU to run this script
+
+openllm.utils.configure_logging()
+
+logger = logging.getLogger(__name__)
+
+# On notebook, make sure to install the following
+# ! pip install -U openllm[fine-tune] @ git+https://github.com/bentoml/OpenLLM.git
+
+from datasets import load_dataset
+
+if t.TYPE_CHECKING:
+  from peft import PeftModel
+
+DEFAULT_MODEL_ID = "facebook/opt-6.7b"
+
+def load_trainer(model: PeftModel, tokenizer: transformers.GPT2TokenizerFast, dataset_dict: t.Any, training_args: TrainingArguments,):
+  return transformers.Trainer(model=model, train_dataset=dataset_dict["train"], args=dataclasses.replace(transformers.TrainingArguments(training_args.output_dir), **dataclasses.asdict(training_args),), data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),)
+
+@dataclasses.dataclass
+class TrainingArguments:
+  per_device_train_batch_size: int = dataclasses.field(default=4)
+  gradient_accumulation_steps: int = dataclasses.field(default=4)
+  warmup_steps: int = dataclasses.field(default=10)
+  max_steps: int = dataclasses.field(default=50)
+  learning_rate: float = dataclasses.field(default=3e-4)
+  fp16: bool = dataclasses.field(default=True)
+  logging_steps: int = dataclasses.field(default=1)
+  output_dir: str = dataclasses.field(default=os.path.join(os.getcwd(), "outputs", "opt"))
+
+@dataclasses.dataclass
+class ModelArguments:
+  model_id: str = dataclasses.field(default=DEFAULT_MODEL_ID)
+
+# Parse (ModelArguments, TrainingArguments) either from a single JSON file or from CLI flags.
+parser = transformers.HfArgumentParser((ModelArguments, TrainingArguments))
+if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
+  # If we pass only one argument to the script and it's the path to a json file,
+  # let's parse it to get our arguments.
+  model_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
+else:
+  model_args, training_args = t.cast(t.Tuple[ModelArguments, TrainingArguments], parser.parse_args_into_dataclasses())
+
+# Load OPT quantized to int8 and wrap it with a LoRA adapter on the q_proj/v_proj modules.
+model, tokenizer = openllm.AutoLLM.for_model("opt", model_id=model_args.model_id, quantize="int8", ensure_available=True,).prepare_for_training(adapter_type="lora", r=16, lora_alpha=32, target_modules=["q_proj", "v_proj"], lora_dropout=0.05, bias="none",)
+
+# ft on english_quotes
+data = load_dataset("Abirate/english_quotes")
+# Tokenize the "quote" column in batches.
+data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True)
+
+trainer = load_trainer(model, tokenizer, data, training_args)
+model.config.use_cache = False  # silence just for warning, reenable for inference later
+
+trainer.train()
+
+# Save the trained model under <output_dir>/lora.
+trainer.model.save_pretrained(os.path.join(training_args.output_dir, "lora"))
--- a/openllm-python/src/openllm/py.typed
+++ b/openllm-python/src/openllm/py.typed
--- a/openllm-python/src/openllm/serialisation/init.py
+++ b/openllm-python/src/openllm/serialisation/init.py
@@ -0,0 +1,93 @@
+"""Serialisation utilities for OpenLLM.
+
+Currently supports transformers for PyTorch, Tensorflow and Flax.
+
+Support for the GGML format is currently a work in progress.
+
+## Usage
+
+```python
+import openllm
+
+llm = openllm.AutoLLM.for_model("dolly-v2")
+llm.save_pretrained("./path/to/local-dolly")
+```
+
+To use different runtime, specify directly in the `for_model` method:
+
+```python
+import openllm
+
+llm = openllm.AutoLLM.for_model("dolly-v2", runtime='ggml')
+llm.save_pretrained("./path/to/local-dolly")
+```
+"""
+from __future__ import annotations
+import importlib, typing as t
+import cloudpickle, fs, openllm
+from bentoml._internal.models.model import CUSTOM_OBJECTS_FILENAME
+from openllm._typing_compat import M, T, ParamSpec
+
+if t.TYPE_CHECKING:
+  import bentoml
+  from . import (
+    constants as constants,
+    ggml as ggml,
+    transformers as transformers,
+  )
+
+P = ParamSpec("P")
+def load_tokenizer(llm: openllm.LLM[t.Any, T], **tokenizer_attrs: t.Any) -> T:
+  """Load the tokenizer for ``llm`` from its model directory in the BentoML store.
+
+  If the model directory contains a custom-objects file, the tokenizer is unpickled from it
+  (stored under the ``"tokenizer"`` key). Otherwise the tokenizer class is inferred from the LLM
+  and loaded with ``from_pretrained``; ``tokenizer_attrs`` are only used on this second path.
+
+  When the loaded tokenizer has no pad token id, one is chosen in this order: the config's
+  ``pad_token_id``, the config's ``eos_token_id``, the tokenizer's ``eos_token_id``, and finally
+  a new ``"[PAD]"`` special token.
+
+  By default, it will try to find the bentomodel in the store.
+  If the model is not found, a ``bentoml.exceptions.NotFound`` is raised.
+
+  Raises:
+    openllm.exceptions.OpenLLMException: If the custom-objects file has no ``"tokenizer"`` entry.
+  """
+  from .transformers._helpers import infer_tokenizers_from_llm, process_config
+
+  config, *_ = process_config(llm._bentomodel.path, llm.__llm_trust_remote_code__)
+  bentomodel_fs = fs.open_fs(llm._bentomodel.path)
+  if bentomodel_fs.isfile(CUSTOM_OBJECTS_FILENAME):
+    # NOTE(review): unpickling executes arbitrary code from the file; this is only safe as long as
+    # the contents of the model store are trusted.
+    with bentomodel_fs.open(CUSTOM_OBJECTS_FILENAME, "rb") as cofile:
+      try: tokenizer = cloudpickle.load(t.cast("t.IO[bytes]", cofile))["tokenizer"]
+      except KeyError: raise openllm.exceptions.OpenLLMException("Bento model does not have tokenizer. Make sure to save"
+                                                                " the tokenizer within the model via 'custom_objects'."
+                                                                " For example: \"bentoml.transformers.save_model(..., custom_objects={'tokenizer': tokenizer})\"") from None
+  else: tokenizer = infer_tokenizers_from_llm(llm).from_pretrained(bentomodel_fs.getsyspath("/"), trust_remote_code=llm.__llm_trust_remote_code__, **tokenizer_attrs)
+
+  # Make sure a pad token id is set, preferring values from the model config.
+  if tokenizer.pad_token_id is None:
+    if config.pad_token_id is not None: tokenizer.pad_token_id = config.pad_token_id
+    elif config.eos_token_id is not None: tokenizer.pad_token_id = config.eos_token_id
+    elif tokenizer.eos_token_id is not None: tokenizer.pad_token_id = tokenizer.eos_token_id
+    else: tokenizer.add_special_tokens({"pad_token": "[PAD]"})
+  return tokenizer
+
+# Structural type of a dispatch function: it takes the LLM first, followed by arbitrary
+# positional and keyword arguments captured by the ParamSpec ``P``.
+class _Caller(t.Protocol[P]):
+  def __call__(self, llm: openllm.LLM[M, T], *args: P.args, **kwargs: P.kwargs) -> t.Any: ...
+
+# Names that are served by a runtime-specific dispatch function rather than defined in this module.
+_extras = ["get", "import_model", "save_pretrained", "load_model"]
+def _make_dispatch_function(fn: str) -> _Caller[P]:
+  # Build a function that forwards to the attribute named `fn` of the serialisation submodule
+  # matching `llm.runtime`. The submodule is imported at call time, relative to this package.
+  def caller(llm: openllm.LLM[M, T], *args: P.args, **kwargs: P.kwargs) -> t.Any:
+    """Generic function dispatch to correct serialisation submodules based on LLM runtime.
+
+    > [!NOTE] See 'openllm.serialisation.transformers' if 'llm.runtime="transformers"'
+
+    > [!NOTE] See 'openllm.serialisation.ggml' if 'llm.runtime="ggml"'
+    """
+    return getattr(importlib.import_module(f".{llm.runtime}", __name__), fn)(llm, *args, **kwargs)
+  return caller
+
+# Signatures for type checkers only; at runtime these names are resolved by the module-level
+# __getattr__ defined in this module.
+if t.TYPE_CHECKING:
+  def get(llm: openllm.LLM[M, T], *args: t.Any, **kwargs: t.Any) -> bentoml.Model: ...
+  def import_model(llm: openllm.LLM[M, T], *args: t.Any, **kwargs: t.Any) -> bentoml.Model: ...
+  def save_pretrained(llm: openllm.LLM[M, T], *args: t.Any, **kwargs: t.Any) -> None: ...
+  def load_model(llm: openllm.LLM[M, T], *args: t.Any, **kwargs: t.Any) -> M: ...
+
+# Submodules that are imported lazily on attribute access.
+_import_structure: dict[str, list[str]] = {"ggml": [], "transformers": [], "constants": []}
+__all__ = ["ggml", "transformers", "constants", "load_tokenizer", *_extras]
+# Report the public names in sorted order.
+def __dir__() -> list[str]: return sorted(__all__)
+def __getattr__(name: str) -> t.Any:
+  if name == "load_tokenizer": return load_tokenizer
+  elif name in _import_structure: return importlib.import_module(f".{name}", __name__)
+  elif name in _extras: return _make_dispatch_function(name)
+  else: raise AttributeError(f"{__name__} has no attribute {name}")
--- a/Show More
+++ b/Show More