Merge openllm-next as openllm 0.6

This commit is contained in:
bojiang
2024-07-09 14:21:52 +08:00
17 changed files with 2482 additions and 0 deletions

5
.gitattributes vendored Normal file
View File

@@ -0,0 +1,5 @@
**/_next/ linguist-generated=true
* text=auto eol=lf
# Needed for setuptools-scm-git-archive
.git_archival.txt export-subst

163
.gitignore vendored Normal file
View File

@@ -0,0 +1,163 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
*.whl
# Environments
venv/

107
DEVELOPMENT.md Normal file
View File

@@ -0,0 +1,107 @@
# Developer Guide
This Developer Guide is designed to help you contribute to the OpenLLM project.
Follow these steps to set up your development environment and learn the process
of contributing to our open-source project.
Join our [Discord Channel](https://l.bentoml.com/join-openllm-discord) and reach
out to us if you have any question!
## Table of Contents
- [Developer Guide](#developer-guide)
- [Table of Contents](#table-of-contents)
- [Setting Up Your Development Environment](#setting-up-your-development-environment)
- [Development Workflow](#development-workflow)
- [Adding new models](#adding-new-models)
- [Adding bentos](#adding-bentos)
- [Adding repos](#adding-repos)
## Setting Up Your Development Environment
Before you can start developing, you'll need to set up your environment:
1. Ensure you have [Git](https://git-scm.com/), and
[Python 3.9+](https://www.python.org/downloads/) installed.
2. Fork the OpenLLM repository from GitHub.
3. Clone the forked repository from GitHub:
```bash
git clone git@github.com:username/OpenLLM.git && cd openllm
```
4. Add the OpenLLM upstream remote to your local OpenLLM clone:
```bash
git remote add upstream git@github.com:bentoml/OpenLLM.git
```
5. Configure git to pull from the upstream remote:
```bash
git switch main # ensure you're on the main branch
git fetch upstream --tags
git branch --set-upstream-to=upstream/main
```
## Development Workflow
There are a few ways to contribute to the repository structure for OpenLLM:
### Adding new models
1. [recipe.yaml](./recipe.yaml) contains all related-metadata for generating new LLM-based bentos. To add a new LLM, the following structure should be adhered to:
```yaml
"<model_name>:<model_tag>":
project: vllm-chat
service_config:
name: phi3
traffic:
timeout: 300
resources:
gpu: 1
gpu_type: nvidia-tesla-l4
engine_config:
model: microsoft/Phi-3-mini-4k-instruct
max_model_len: 4096
dtype: half
chat_template: phi-3
```
- `<model_name>` represents the type of model to be supported. Currently supports `phi3`, `llama2`, `llama3`, `gemma`
- `<model_tag>` emphasizes the type of model and its related metadata. The convention would include `<model_size>-<model_type>-<precision>[-<quantization>]`
For example:
- `microsoft/Phi-3-mini-4k-instruct` should be represented as `3.8b-instruct-fp16`.
- `TheBloke/Llama-2-7B-Chat-AWQ` would be `7b-chat-awq-4bit`
- `project` would be used as the basis for the generated bento. Currently, most models should use `vllm-chat` as default.
- `service_config` entails all BentoML-related [configuration](https://docs.bentoml.com/en/latest/guides/configurations.html) to run this bento.
> [!NOTE]
>
> We recommend to include the following field for `service_config`:
>
> - `name` should be the same as `<model_name>`
> - `resources` includes the available accelerators that can run this model. See more [here](https://docs.bentoml.com/en/latest/guides/configurations.html#resources)
- `engine_config` are fields to be used for vLLM engine. See more supported arguments in [`AsyncEngineArgs`](https://github.com/vllm-project/vllm/blob/7cd2ebb0251fd1fd0eec5c93dac674603a22eddd/vllm/engine/arg_utils.py#L799). We recommend to always include `model`, `max_model_len`, `dtype` and `trust_remote_code`.
- If the model is a chat model, `chat_template` should be used. Add the appropriate `chat_template` under [chat_template directory](./vllm-chat/chat_templates/) should you decide to do so.
2. You can then run `BENTOML_HOME=$(openllm repo default)/bentoml/bentos python make.py <model_name>:<model_tag>` to generate the required bentos.
3. You can then submit a [Pull request](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request) to `openllm` with the recipe changes
### Adding bentos
OpenLLM now also manages a [generated bento repository](https://github.com/bentoml/openllm-models/tree/main). If you update or modify any generated bentos, make sure to update the recipe and add the generated bentos under `bentoml/bentos`.
### Adding repos
If you wish to create your own managed git repo, you should follow the structure of [bentoml/openllm-models](https://github.com/bentoml/openllm-models/tree/main).
To add your custom repo, do `openllm repo add <repo_alias> <git_url>`

201
LICENSE Normal file
View File

@@ -0,0 +1,201 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

22
README.md Normal file
View File

@@ -0,0 +1,22 @@
```
pip install .
openllm serve
# or openllm run
```
Use these commands to find out which LLM models are already at your disposal.
License
-------
This project is licensed under the Apache License 2.0 - see the LICENSE file for details.
Acknowledgements
----------------
This project makes use of the following open-source projects:
* [bentoml/bentoml](https://github.com/bentoml/bentoml) for production level model serving
* [blrchen/chatgpt-lite](https://github.com/blrchen/chatgpt-lite) for a fancy Web Chat UI
* [chujiezheng/chat_templates](https://github.com/chujiezheng/chat_templates)
We are grateful to the developers and contributors of these projects for their hard work and dedication.

0
openllm_next/__init__.py Normal file
View File

338
openllm_next/__main__.py Normal file
View File

@@ -0,0 +1,338 @@
import os
import random
import sys
from collections import defaultdict
from typing import Annotated, Optional
import questionary
import typer
from openllm_next.accelerator_spec import (
DeploymentTarget,
can_run,
get_local_machine_spec,
)
from openllm_next.analytic import DO_NOT_TRACK, OpenLLMTyper
from openllm_next.clean import app as clean_app
from openllm_next.cloud import deploy as cloud_deploy
from openllm_next.cloud import ensure_cloud_context, get_cloud_machine_spec
from openllm_next.common import CHECKED, INTERACTIVE, VERBOSE_LEVEL, output
from openllm_next.local import run as local_run
from openllm_next.local import serve as local_serve
from openllm_next.model import app as model_app
from openllm_next.model import ensure_bento, list_bento
from openllm_next.repo import app as repo_app
# Root CLI application; sub-command groups are attached below so the CLI
# exposes `openllm repo ...`, `openllm model ...` and `openllm clean ...`.
app = OpenLLMTyper(
    help="`openllm hello` to get started. "
    "OpenLLM is a CLI tool to manage and deploy open source LLMs and"
    " get an OpenAI API compatible chat server in seconds.",
)
app.add_typer(repo_app, name="repo")
app.add_typer(model_app, name="model")
app.add_typer(clean_app, name="clean")
def _select_bento_name(models, target):
    """Interactively pick a (model name, repo) pair from *models*.

    Renders one row per distinct (repo, name) pair, with a check mark when
    at least one version of that model scores as runnable on *target*.
    Raises typer.Exit(1) when nothing is available or the user cancels.
    """
    from tabulate import tabulate

    options = []
    model_infos = [
        [model.repo.name, model.name, can_run(model, target)] for model in models
    ]
    # Aggregate runnability scores over all versions of the same model.
    model_name_groups = defaultdict(int)
    for repo, name, score in model_infos:
        model_name_groups[(repo, name)] += score
    table_data = [
        [name, repo, CHECKED if score > 0 else ""]
        for (repo, name), score in model_name_groups.items()
    ]
    if not table_data:
        output("No model found", style="red")
        raise typer.Exit(1)
    table = tabulate(
        table_data,
        headers=["model", "repo", "locally runnable"],
    ).split("\n")
    # The tabulate header becomes a non-selectable separator row.
    headers = f"{table[0]}\n {table[1]}"
    options.append(questionary.Separator(headers))
    # Fixed: loop variable no longer shadows `table_data` while zipping it.
    for row, table_line in zip(table_data, table[2:]):
        # Choice value is the (name, repo) pair, matching the caller's unpack.
        options.append(questionary.Choice(table_line, value=row[:2]))
    selected = questionary.select("Select a model", options).ask()
    if selected is None:
        raise typer.Exit(1)
    return selected
def _select_bento_version(models, target, bento_name, repo):
    """Interactively pick a concrete version (tag) of *bento_name* in *repo*.

    Returns the selected ``(model, score)`` pair, where ``score`` is the
    can_run result against *target*. Raises typer.Exit(1) when no version
    exists or the user cancels.
    """
    from tabulate import tabulate

    model_infos = [
        [model, can_run(model, target)]
        for model in models
        if model.name == bento_name and model.repo.name == repo
    ]
    # model_infos is already filtered by name/repo; no need to re-check here
    # (the original repeated the same condition redundantly).
    table_data = [
        [model.tag, CHECKED if score > 0 else ""]
        for model, score in model_infos
    ]
    if not table_data:
        output(f"No model found for {bento_name} in {repo}", style="red")
        raise typer.Exit(1)
    table = tabulate(
        table_data,
        headers=["version", "locally runnable"],
    ).split("\n")
    options = []
    options.append(questionary.Separator(f"{table[0]}\n {table[1]}"))
    # Fixed: loop variable renamed — it used to shadow `table_data` even
    # though it actually iterates `model_infos`.
    for info, table_line in zip(model_infos, table[2:]):
        options.append(questionary.Choice(table_line, value=info))
    selected = questionary.select("Select a version", options).ask()
    if selected is None:
        raise typer.Exit(1)
    return selected
def _select_target(bento, targets):
    """Interactively pick a cloud instance type capable of running *bento*."""
    from tabulate import tabulate

    # Best-fitting instance types first (sorts the caller's list in place,
    # matching the original behavior).
    targets.sort(key=lambda x: can_run(bento, x), reverse=True)
    if not targets:
        output(
            "No available instance type, check your bentocloud account",
            style="red",
        )
        raise typer.Exit(1)

    rows = []
    for candidate in targets:
        deployable = CHECKED if can_run(bento, candidate) else "insufficient res."
        rows.append(
            [
                candidate.name,
                candidate.accelerators_repr,
                f"${candidate.price}",
                deployable,
            ]
        )
    rendered = tabulate(
        rows,
        headers=["instance type", "accelerator", "price/hr", "deployable"],
    ).split("\n")

    # Header as a non-selectable separator, then one choice per target.
    options = [questionary.Separator(f"{rendered[0]}\n {rendered[1]}")]
    for candidate, line in zip(targets, rendered[2:]):
        options.append(
            questionary.Choice(
                f"{line}",
                value=candidate,
            )
        )
    selected = questionary.select("Select an instance type", options).ask()
    if selected is None:
        raise typer.Exit(1)
    return selected
def _select_action(bento, score):
    """Ask the user what to do with *bento* and dispatch the chosen action.

    *score* is the local can_run score; when it is not positive, the local
    actions (run/serve) are shown but disabled. Deploy is always available.
    Raises typer.Exit(1) if the prompt is cancelled.
    """
    # Deduplicated: the original built two nearly-identical option lists.
    # questionary treats disabled=None as "enabled", so a conditional
    # `disabled` reproduces both branches exactly.
    local_disabled = None if score > 0 else "insufficient res."
    options = [
        questionary.Separator("Available actions"),
        questionary.Choice(
            "0. Run the model in terminal",
            value="run",
            disabled=local_disabled,
            shortcut_key="0",
        ),
        questionary.Separator(f" $ openllm run {bento}"),
        questionary.Separator(" "),
        questionary.Choice(
            "1. Serve the model locally and get a chat server",
            value="serve",
            disabled=local_disabled,
            shortcut_key="1",
        ),
        questionary.Separator(f" $ openllm serve {bento}"),
        questionary.Separator(" "),
        questionary.Choice(
            "2. Deploy the model to bentocloud and get a scalable chat server",
            value="deploy",
            shortcut_key="2",
        ),
        questionary.Separator(f" $ openllm deploy {bento}"),
    ]
    action = questionary.select("Select an action", options).ask()
    if action is None:
        raise typer.Exit(1)
    if action == "run":
        try:
            local_run(bento)
        finally:
            # Always echo the replayable command, even after Ctrl-C/errors.
            output("\nUse this command to run the action again:", style="green")
            output(f" $ openllm run {bento}", style="orange")
    elif action == "serve":
        try:
            local_serve(bento)
        finally:
            output("\nUse this command to run the action again:", style="green")
            output(f" $ openllm serve {bento}", style="orange")
    elif action == "deploy":
        ensure_cloud_context()
        targets = get_cloud_machine_spec()
        target = _select_target(bento, targets)
        try:
            cloud_deploy(bento, target)
        finally:
            output("\nUse this command to run the action again:", style="green")
            output(
                f" $ openllm deploy {bento} --instance-type {target.name}",
                style="orange",
            )
@app.command(help="get started interactively")
def hello():
INTERACTIVE.set(True)
VERBOSE_LEVEL.set(20)
target = get_local_machine_spec()
output(f" Detected Platform: {target.platform}", style="green")
if target.accelerators:
output(" Detected Accelerators: ", style="green")
for a in target.accelerators:
output(f" - {a.model} {a.memory_size}GB", style="green")
else:
output(" Detected Accelerators: None", style="yellow")
models = list_bento()
if not models:
output(
"No model found, you probably need to update the model repo:",
style="red",
)
output(
" $ openllm repo update",
style="orange",
)
raise typer.Exit(1)
bento_name, repo = _select_bento_name(models, target)
bento, score = _select_bento_version(models, target, bento_name, repo)
_select_action(bento, score)
@app.command(help="start an OpenAI API compatible chat server and chat in browser")
def serve(
model: Annotated[str, typer.Argument()] = "",
repo: Optional[str] = None,
port: int = 3000,
verbose: bool = False,
):
if verbose:
VERBOSE_LEVEL.set(20)
target = get_local_machine_spec()
bento = ensure_bento(model, target=target, repo_name=repo)
local_serve(bento, port=port)
@app.command(help="run the model and chat in terminal")
def run(
model: Annotated[str, typer.Argument()] = "",
repo: Optional[str] = None,
port: Optional[int] = None,
timeout: int = 600,
verbose: bool = False,
):
if verbose:
VERBOSE_LEVEL.set(20)
target = get_local_machine_spec()
bento = ensure_bento(model, target=target, repo_name=repo)
if port is None:
port = random.randint(30000, 40000)
local_run(bento, port=port, timeout=timeout)
@app.command(
    # Fixed help-text typo: "an production-ready" -> "a production-ready".
    help="deploy a production-ready OpenAI API compatible chat server to bentocloud ($100 free credit)",
)
def deploy(
    model: Annotated[str, typer.Argument()] = "",
    instance_type: Optional[str] = None,
    repo: Optional[str] = None,
    verbose: bool = False,
):
    """Deploy *model* to BentoCloud, auto-picking an instance type if none given."""
    if verbose:
        VERBOSE_LEVEL.set(20)
    bento = ensure_bento(model, repo_name=repo)
    # Explicit instance type: deploy straight away, no ranking needed.
    if instance_type is not None:
        cloud_deploy(bento, DeploymentTarget(name=instance_type))
        return
    # Keep only targets that can run the bento, best fit first.
    runnable = [t for t in get_cloud_machine_spec() if can_run(bento, t) > 0]
    runnable.sort(key=lambda t: can_run(bento, t), reverse=True)
    if not runnable:
        output(
            "No available instance type, check your bentocloud account",
            style="red",
        )
        raise typer.Exit(1)
    best = runnable[0]
    output(
        f"Recommended instance type: {best.name}",
        style="green",
    )
    cloud_deploy(bento, best)
@app.callback(invoke_without_command=True)
def typer_callback(
    verbose: int = 0,
    do_not_track: bool = typer.Option(
        False,
        "--do-not-track",
        help="Whether to disable usage tracking",
        envvar=DO_NOT_TRACK,
    ),
):
    """Global options applied before any sub-command runs."""
    # Opting out of tracking is signalled to downstream code via the env var.
    if do_not_track:
        os.environ[DO_NOT_TRACK] = str(True)
    if verbose:
        VERBOSE_LEVEL.set(verbose)
def main():
    """CLI entry point: enforce the minimum Python version, then run the app."""
    if sys.version_info < (3, 9):
        # Fixed: the message said "3.8 or higher" while the check rejects 3.8.
        output("Python 3.9 or higher is required", style="red")
        sys.exit(1)
    app()


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,166 @@
from __future__ import annotations
import functools
import math
import typing
from types import SimpleNamespace
import psutil
from openllm_next.common import BentoInfo, DeploymentTarget, output
class Accelerator(SimpleNamespace):
    """A single GPU/accelerator device: marketing model name plus VRAM in GB."""

    model: str
    memory_size: float

    def __gt__(self, other):
        # Ordering is by memory capacity only; the model name is ignored.
        return self.memory_size > other.memory_size

    def __lt__(self, other):
        # Added so sorted()/min() work directly, symmetric with __gt__.
        return self.memory_size < other.memory_size

    def __eq__(self, other):
        return self.memory_size == other.memory_size

    # Fixed: defining __eq__ implicitly set __hash__ to None, making
    # instances unhashable (unusable in sets / as lru_cache arguments).
    # Hash on the same field __eq__ compares, preserving the hash contract.
    def __hash__(self):
        return hash(self.memory_size)

    def __repr__(self):
        return f"{self.model}({self.memory_size}GB)"
class Resource(SimpleNamespace):
    """Resource requirements parsed from a bento's services[0].config.resources."""

    cpu: int = 0
    # Fixed: `memory` was only annotated with no class-level default, so
    # hashing a Resource built without an explicit memory value raised
    # AttributeError inside __hash__.
    memory: float = 0.0
    gpu: int = 0
    gpu_type: str = ""

    def __hash__(self):
        # Hashable so Resource instances can be passed to lru_cached helpers.
        return hash((self.cpu, self.memory, self.gpu, self.gpu_type))

    def __bool__(self):
        # Truthy only when at least one field was explicitly provided at
        # construction time — class-level defaults never enter __dict__.
        return any(value is not None for value in self.__dict__.values())
# Known accelerator SKUs keyed by the `gpu_type` strings used in bento
# recipes / instance specs; memory_size is VRAM in GB.
# NOTE(review): several keys look like aliases for the same card (e.g.
# nvidia-l4 / nvidia-tesla-l4, nvidia-a100-80g / nvidia-a100-80gb) —
# confirm both spellings actually occur upstream.
ACCELERATOR_SPEC_DICT: dict[str, dict] = {
    "nvidia-gtx-1650": {"model": "GTX 1650", "memory_size": 4.0},
    "nvidia-gtx-1060": {"model": "GTX 1060", "memory_size": 6.0},
    "nvidia-gtx-1080-ti": {"model": "GTX 1080 Ti", "memory_size": 11.0},
    "nvidia-rtx-3060": {"model": "RTX 3060", "memory_size": 12.0},
    "nvidia-rtx-3060-ti": {"model": "RTX 3060 Ti", "memory_size": 8.0},
    "nvidia-rtx-3070-ti": {"model": "RTX 3070 Ti", "memory_size": 8.0},
    "nvidia-rtx-3080": {"model": "RTX 3080", "memory_size": 10.0},
    "nvidia-rtx-3080-ti": {"model": "RTX 3080 Ti", "memory_size": 12.0},
    "nvidia-rtx-3090": {"model": "RTX 3090", "memory_size": 24.0},
    "nvidia-rtx-4070-ti": {"model": "RTX 4070 Ti", "memory_size": 12.0},
    "nvidia-tesla-p4": {"model": "P4", "memory_size": 8.0},
    "nvidia-tesla-p100": {"model": "P100", "memory_size": 16.0},
    "nvidia-tesla-k80": {"model": "K80", "memory_size": 12.0},
    "nvidia-tesla-t4": {"model": "T4", "memory_size": 16.0},
    "nvidia-tesla-v100": {"model": "V100", "memory_size": 16.0},
    "nvidia-l4": {"model": "L4", "memory_size": 24.0},
    "nvidia-tesla-l4": {"model": "L4", "memory_size": 24.0},
    "nvidia-tesla-a10g": {"model": "A10G", "memory_size": 24.0},
    "nvidia-a100-80g": {"model": "A100", "memory_size": 80.0},
    "nvidia-a100-80gb": {"model": "A100", "memory_size": 80.0},
    "nvidia-tesla-a100": {"model": "A100", "memory_size": 40.0},
}
# Same table materialized as Accelerator objects for memory comparisons.
ACCELERATOR_SPECS: dict[str, Accelerator] = {
    key: Accelerator(**value) for key, value in ACCELERATOR_SPEC_DICT.items()
}
@functools.lru_cache
def get_local_machine_spec():
    """Probe the local machine and return a DeploymentTarget describing it.

    macOS is returned immediately with no accelerators. On Linux/Windows,
    NVIDIA GPUs are enumerated via NVML; if the driver/NVML is unavailable
    the target is returned with an empty accelerator list instead of raising.
    Cached: the hardware does not change within one process lifetime.
    """
    if psutil.MACOS:
        return DeploymentTarget(accelerators=[], source="local", platform="macos")
    if psutil.WINDOWS:
        platform = "windows"
    elif psutil.LINUX:
        platform = "linux"
    else:
        # Fixed: dropped the useless f-prefix (no placeholders).
        raise NotImplementedError("Unsupported platform")

    # Imported lazily: pynvml is only relevant on platforms that may have GPUs.
    from pynvml import (
        nvmlDeviceGetCount,
        nvmlDeviceGetCudaComputeCapability,
        nvmlDeviceGetHandleByIndex,
        nvmlDeviceGetMemoryInfo,
        nvmlDeviceGetName,
        nvmlInit,
        nvmlShutdown,
    )

    try:
        nvmlInit()
        try:
            device_count = nvmlDeviceGetCount()
            accelerators: list[Accelerator] = []
            for i in range(device_count):
                handle = nvmlDeviceGetHandleByIndex(i)
                name = nvmlDeviceGetName(handle)
                memory_info = nvmlDeviceGetMemoryInfo(handle)
                accelerators.append(
                    Accelerator(
                        model=name, memory_size=math.ceil(int(memory_info.total) / 1024**3)
                    )
                )
                compute_capability = nvmlDeviceGetCudaComputeCapability(handle)
                if compute_capability < (7, 5):
                    output(
                        f"GPU {name} with compute capability {compute_capability} "
                        "may not be supported, 7.5 or higher is recommended. check "
                        "https://developer.nvidia.com/cuda-gpus for more information",
                        style="yellow",
                    )
        finally:
            # Fixed: NVML was left initialized when a query above raised;
            # always release it once nvmlInit has succeeded.
            nvmlShutdown()
        return DeploymentTarget(
            accelerators=accelerators,
            source="local",
            platform=platform,
        )
    except Exception as e:
        output(
            "Failed to get local GPU info. Ensure nvidia driver is installed to enable local GPU deployment",
            style="yellow",
        )
        output(f"Error: {e}", style="red", level=20)
        return DeploymentTarget(accelerators=[], source="local", platform=platform)
@functools.lru_cache()
def can_run(
    bento: typing.Union[Resource, BentoInfo],
    target: typing.Optional[DeploymentTarget] = None,
) -> float:
    """Score how well *bento* fits on *target*; 0.0 means "cannot run".

    Returned values, used for ranking targets:
    - 0.0  : platform mismatch, or not enough GPUs of the required size
    - 0.5  : bento declares no resource requirements (weakly assumed runnable)
    - >0.0 : GPU bentos score by the fraction of the target's total GPU
             memory they would occupy (tighter fit scores higher)

    NOTE(review): results are lru_cached, so both arguments must be hashable
    and effectively immutable — confirm BentoInfo/DeploymentTarget satisfy this.
    """
    if target is None:
        target = get_local_machine_spec()
    # NOTE(review): assumes the first service in bento.yaml carries the
    # resource config — confirm for multi-service bentos.
    resource_spec = Resource(**(bento.bento_yaml["services"][0]["config"].get("resources", {})))
    labels = bento.bento_yaml.get("labels", {})
    # Comma-separated platform restriction label; defaults to linux-only.
    platforms = labels.get("platforms", "linux").split(",")
    if target.platform not in platforms:
        return 0.0
    # No resource requirements declared: weakly assume it can run (0.5).
    # (The previous comment claimed 1.0, which did not match the code.)
    if not resource_spec:
        return 0.5
    if resource_spec.gpu > 0:
        # NOTE(review): an unrecognized gpu_type raises KeyError here —
        # verify recipes only use keys present in ACCELERATOR_SPECS.
        required_gpu = ACCELERATOR_SPECS[resource_spec.gpu_type]
        # Only accelerators with at least the required memory count.
        filtered_accelerators = [
            ac
            for ac in target.accelerators
            if ac.memory_size >= required_gpu.memory_size
        ]
        if resource_spec.gpu > len(filtered_accelerators):
            return 0.0
        # Fraction of the target's total GPU memory the bento would use.
        return (
            required_gpu.memory_size
            * resource_spec.gpu
            / sum(ac.memory_size for ac in target.accelerators)
        )
    if target.accelerators:
        # CPU-only bento on a GPU machine: runnable but a poor use of it.
        return 0.01 / sum(ac.memory_size for ac in target.accelerators)
    return 1.0

118
openllm_next/analytic.py Normal file
View File

@@ -0,0 +1,118 @@
from __future__ import annotations
import functools
import os
import re
import time
import typing
from abc import ABC
import attr
import click
import typer
import typer.core
DO_NOT_TRACK = "BENTOML_DO_NOT_TRACK"
class EventMeta(ABC):
    """Base class for analytics events; derives the event name from the class name."""

    @property
    def event_name(self):
        """CamelCase class name -> snake_case, with a trailing "_event" stripped."""
        snake = re.sub(r"(?<!^)(?=[A-Z])", "_", type(self).__name__).lower()
        suffix = "_event"
        return snake[: -len(suffix)] if snake.endswith(suffix) else snake
@attr.define
class CliEvent(EventMeta):
    """Telemetry payload describing one CLI command invocation."""

    cmd_group: str  # command group, e.g. "openllm" or a sub-app like "model"
    cmd_name: str  # the sub-command that was invoked
    duration_in_ms: float = attr.field(default=0)
    error_type: typing.Optional[str] = attr.field(default=None)  # exception class name, if any
    return_code: typing.Optional[int] = attr.field(default=None)  # 2 for Ctrl-C, 1 for other failures
@attr.define
class OpenllmCliEvent(CliEvent):
    """CliEvent whose derived event name is "openllm_cli"."""

    pass
class OrderedCommands(typer.core.TyperGroup):
    """Typer group that lists commands in declaration order instead of alphabetically."""

    def list_commands(self, _: click.Context) -> typing.Iterable[str]:
        # self.commands preserves registration order; return its keys as-is
        return list(self.commands)
class OpenLLMTyper(typer.Typer):
    """
    typer.Typer subclass with OpenLLM defaults:

    * commands are listed in declaration order (OrderedCommands)
    * ``-h`` works as an alias of ``--help``; output wraps at ``$COLUMNS``
      (default 120)
    * every command is wrapped to emit an anonymous usage-tracking event,
      unless the ``BENTOML_DO_NOT_TRACK`` env var is set to "true"
    """

    def __init__(self, *args: typing.Any, **kwargs: typing.Any):
        no_args_is_help = kwargs.pop("no_args_is_help", True)
        context_settings = kwargs.pop("context_settings", {})
        if "help_option_names" not in context_settings:
            context_settings["help_option_names"] = ("-h", "--help")
        if "max_content_width" not in context_settings:
            context_settings["max_content_width"] = int(
                os.environ.get("COLUMNS", str(120))
            )
        klass = kwargs.pop("cls", OrderedCommands)
        super().__init__(
            *args,
            cls=klass,
            no_args_is_help=no_args_is_help,
            context_settings=context_settings,
            **kwargs,
        )

    def command(self, *args: typing.Any, **kwargs: typing.Any):
        def decorator(f):
            @functools.wraps(f)
            @click.pass_context
            def wrapped(ctx: click.Context, *args, **kwargs):
                from bentoml._internal.utils.analytics import track

                do_not_track = (
                    os.environ.get(DO_NOT_TRACK, str(False)).lower() == "true"
                )
                # so we know that the root program is openllm
                command_name = ctx.info_name
                if ctx.parent.parent is not None:
                    # e.g. `openllm model list` -> group is "model"
                    command_group = ctx.parent.info_name
                elif ctx.parent.info_name == ctx.find_root().info_name:
                    # e.g. `openllm run` -> group is the root program
                    command_group = "openllm"
                else:
                    # BUG FIX: command_group used to be referenced unbound in
                    # this case, raising NameError instead of tracking
                    command_group = ctx.parent.info_name
                if do_not_track:
                    return f(*args, **kwargs)
                start_time = time.time_ns()
                try:
                    return_value = f(*args, **kwargs)
                    duration_in_ns = time.time_ns() - start_time
                    track(
                        OpenllmCliEvent(
                            cmd_group=command_group,
                            cmd_name=command_name,
                            duration_in_ms=duration_in_ns / 1e6,
                        )
                    )
                    return return_value
                except BaseException as e:
                    duration_in_ns = time.time_ns() - start_time
                    track(
                        OpenllmCliEvent(
                            cmd_group=command_group,
                            cmd_name=command_name,
                            duration_in_ms=duration_in_ns / 1e6,
                            error_type=type(e).__name__,
                            return_code=2 if isinstance(e, KeyboardInterrupt) else 1,
                        )
                    )
                    raise

            return typer.Typer.command(self, *args, **kwargs)(wrapped)

        return decorator

75
openllm_next/clean.py Normal file
View File

@@ -0,0 +1,75 @@
import pathlib
import shutil
import questionary
from openllm_next.analytic import OpenLLMTyper
from openllm_next.common import (
CONFIG_FILE,
REPO_DIR,
VENV_DIR,
VERBOSE_LEVEL,
output,
)
app = OpenLLMTyper(help="clean up and release disk space used by OpenLLM")
HUGGINGFACE_CACHE = pathlib.Path.home() / ".cache" / "huggingface" / "hub"
@app.command(help="Clean up all the cached models from huggingface")
def model_cache(verbose: bool = False):
    """Delete ~/.cache/huggingface/hub after an interactive confirmation."""
    if verbose:
        VERBOSE_LEVEL.set(20)
    # total size on disk, shown to the user in the confirmation prompt
    used_space = sum(f.stat().st_size for f in HUGGINGFACE_CACHE.rglob("*"))
    sure = questionary.confirm(
        f"This will remove all models cached by Huggingface (~{used_space / 1024 / 1024:.2f}MB), are you sure?"
    ).ask()
    if not sure:
        return
    shutil.rmtree(HUGGINGFACE_CACHE, ignore_errors=True)
    output("All models cached by Huggingface have been removed", style="green")
@app.command(help="Clean up all the virtual environments created by OpenLLM")
def venvs(verbose: bool = False):
    """Delete every cached virtualenv under VENV_DIR after confirmation."""
    if verbose:
        VERBOSE_LEVEL.set(20)
    # total size on disk, shown to the user in the confirmation prompt
    used_space = sum(f.stat().st_size for f in VENV_DIR.rglob("*"))
    sure = questionary.confirm(
        f"This will remove all virtual environments created by OpenLLM (~{used_space / 1024 / 1024:.2f}MB), are you sure?"
    ).ask()
    if not sure:
        return
    shutil.rmtree(VENV_DIR, ignore_errors=True)
    output("All virtual environments have been removed", style="green")
@app.command(help="Clean up all the repositories cloned by OpenLLM")
def repos(verbose: bool = False):
    """Delete the local clones of all model repos (no confirmation prompt)."""
    if verbose:
        VERBOSE_LEVEL.set(20)
    shutil.rmtree(REPO_DIR, ignore_errors=True)
    output("All repositories have been removed", style="green")
@app.command(help="Reset configurations to default")
def configs(verbose: bool = False):
    """Remove the OpenLLM config file so defaults are used on the next run."""
    if verbose:
        VERBOSE_LEVEL.set(20)
    # BUG FIX: CONFIG_FILE is a regular file, not a directory.
    # shutil.rmtree() raises NotADirectoryError on files (which
    # ignore_errors=True silently swallowed), so the config was never
    # actually deleted. unlink() removes the file properly.
    CONFIG_FILE.unlink(missing_ok=True)
    output("All configurations have been reset", style="green")
@app.command(
    name="all",
    help="Clean up all above and bring OpenLLM to a fresh start",
)
def all_cache(verbose: bool = False):
    """Run every cleanup command in sequence: repos, venvs, model cache, configs."""
    if verbose:
        VERBOSE_LEVEL.set(20)
    repos()
    venvs()
    model_cache()
    configs()

174
openllm_next/cloud.py Normal file
View File

@@ -0,0 +1,174 @@
import json
import os
import pathlib
import shutil
import subprocess
import typing
import typer
from openllm_next.accelerator_spec import ACCELERATOR_SPECS
from openllm_next.analytic import OpenLLMTyper
from openllm_next.common import (
INTERACTIVE,
BentoInfo,
DeploymentTarget,
output,
run_command,
)
app = OpenLLMTyper()
def _get_deploy_cmd(bento: BentoInfo, target: typing.Optional[DeploymentTarget] = None):
    """Build the (cmd, env, cwd) triple for `bentoml deploy` of *bento*.

    Environment variables declared by the bento are resolved from the
    process environment, the bento's declared default, or an interactive
    prompt; a missing required value aborts with exit code 1.
    """
    cmd = ["bentoml", "deploy", bento.bentoml_tag]
    env = {
        "BENTOML_HOME": f"{bento.repo.path}/bentoml",
    }
    required_envs = bento.bento_yaml.get("envs", [])
    required_env_names = [env["name"] for env in required_envs if "name" in env]
    if required_env_names:
        output(
            f"This model requires the following environment variables to run: {repr(required_env_names)}",
            style="yellow",
        )
    for env_info in bento.bento_yaml.get("envs", []):
        if "name" not in env_info:
            continue
        # precedence: process environment > bento-declared default > empty
        if os.environ.get(env_info["name"]):
            default = os.environ[env_info["name"]]
        elif "value" in env_info:
            default = env_info["value"]
        else:
            default = ""
        if INTERACTIVE.get():
            import questionary
            value = questionary.text(
                f"{env_info['name']}:",
                default=default,
            ).ask()
        else:
            if default == "":
                output(
                    f"Environment variable {env_info['name']} is required but not provided",
                    style="red",
                )
                raise typer.Exit(1)
            else:
                value = default
        if value is None:
            # questionary returns None when the prompt is cancelled (Ctrl-C)
            raise typer.Exit(1)
        cmd += ["--env", f"{env_info['name']}={value}"]
    if target:
        cmd += ["--instance-type", target.name]
    # NOTE(review): assumes ensure_cloud_context() already ran and wrote
    # ~/bentoml/.yatai.yaml; copy it into the repo-local BENTOML_HOME
    assert (pathlib.Path.home() / "bentoml" / ".yatai.yaml").exists()
    shutil.copy(
        pathlib.Path.home() / "bentoml" / ".yatai.yaml",
        bento.repo.path / "bentoml" / ".yatai.yaml",
    )
    return cmd, env, None
def ensure_cloud_context():
    """Ensure the bentoml CLI has a logged-in BentoCloud context.

    If already logged in, report the endpoint and return. Otherwise either
    walk the user through `bentoml cloud login` (interactive) or print the
    instructions and exit with code 1 (non-interactive).
    """
    import questionary
    cmd = ["bentoml", "cloud", "current-context"]
    try:
        result = subprocess.check_output(cmd, stderr=subprocess.DEVNULL)
        context = json.loads(result)
        output(f" bentoml already logged in: {context['endpoint']}", style="green")
    except subprocess.CalledProcessError:
        # non-zero exit from the CLI means there is no current context
        output(" bentoml not logged in", style="red")
        if not INTERACTIVE.get():
            output(
                "\n get bentoml logged in by:",
            )
            output(
                " $ bentoml cloud login",
                style="orange",
            )
            output("")
            output(
                """ * you may need to visit https://cloud.bentoml.com to get an account. you can also bring your own bentoml cluster (BYOC) to your team from https://bentoml.com/contact""",
                style="yellow",
            )
            raise typer.Exit(1)
        else:
            action = questionary.select(
                "Choose an action:",
                choices=[
                    "I have a BentoCloud account",
                    "get an account in two minutes",
                ],
            ).ask()
            if action is None:
                # prompt cancelled (Ctrl-C)
                raise typer.Exit(1)
            elif action == "get an account in two minutes":
                output(
                    "Please visit https://cloud.bentoml.com to get your token",
                    style="yellow",
                )
            endpoint = questionary.text(
                "Enter the endpoint: (similar to https://my-org.cloud.bentoml.com)"
            ).ask()
            if endpoint is None:
                raise typer.Exit(1)
            token = questionary.text(
                "Enter your token: (similar to cniluaxxxxxxxx)"
            ).ask()
            if token is None:
                raise typer.Exit(1)
            cmd = [
                "bentoml",
                "cloud",
                "login",
                "--api-token",
                token,
                "--endpoint",
                endpoint,
            ]
            try:
                # check_output raises CalledProcessError on non-zero exit
                result = subprocess.check_output(cmd)
                output(" Logged in successfully", style="green")
            except subprocess.CalledProcessError:
                output(" Failed to login", style="red")
                raise typer.Exit(1)
def get_cloud_machine_spec():
    """List available BentoCloud instance types as DeploymentTargets.

    Returns [] (after printing an error) when the CLI call fails or emits
    invalid JSON.
    """
    ensure_cloud_context()
    cmd = ["bentoml", "deployment", "list-instance-types", "-o", "json"]
    try:
        result = subprocess.check_output(cmd, stderr=subprocess.DEVNULL)
        instance_types = json.loads(result)
        return [
            DeploymentTarget(
                source="cloud",
                name=it["name"],
                price=it["price"],
                platform="linux",
                accelerators=(
                    # one Accelerator entry per GPU on the instance type
                    [ACCELERATOR_SPECS[it["gpu_type"]] for _ in range(int(it["gpu"]))]
                    if it.get("gpu") and it["gpu_type"] in ACCELERATOR_SPECS
                    else []
                ),
            )
            for it in instance_types
        ]
    except (subprocess.CalledProcessError, json.JSONDecodeError):
        output("Failed to get cloud instance types", style="red")
        return []
def deploy(bento: BentoInfo, target: DeploymentTarget):
    """Deploy *bento* to BentoCloud on the given instance type."""
    ensure_cloud_context()
    cmd, env, cwd = _get_deploy_cmd(bento, target)
    run_command(cmd, env=env, cwd=cwd)

422
openllm_next/common.py Normal file
View File

@@ -0,0 +1,422 @@
from __future__ import annotations
import asyncio
import functools
import hashlib
import io
import json
import os
import pathlib
import signal
import subprocess
import sys
import sysconfig
import typing
from contextlib import asynccontextmanager, contextmanager
from types import SimpleNamespace
import typer
import typer.core
ERROR_STYLE = "red"
SUCCESS_STYLE = "green"
CLLAMA_HOME = pathlib.Path.home() / ".openllm_next"
REPO_DIR = CLLAMA_HOME / "repos"
TEMP_DIR = CLLAMA_HOME / "temp"
VENV_DIR = CLLAMA_HOME / "venv"
REPO_DIR.mkdir(exist_ok=True, parents=True)
TEMP_DIR.mkdir(exist_ok=True, parents=True)
VENV_DIR.mkdir(exist_ok=True, parents=True)
CONFIG_FILE = CLLAMA_HOME / "config.json"
CHECKED = ""
T = typing.TypeVar("T")
class ContextVar(typing.Generic[T]):
    """A tiny stack-based context variable (not thread/async aware).

    ``set`` pushes a value permanently, ``patch`` pushes one for the
    duration of a ``with`` block, and ``get`` returns the most recently
    pushed value (or the default when nothing was pushed).
    """

    def __init__(self, default: T):
        self._stack: list[T] = []
        self._default = default

    def get(self) -> T:
        return self._stack[-1] if self._stack else self._default

    def set(self, value):
        self._stack.append(value)

    @contextmanager
    def patch(self, value):
        self._stack.append(value)
        try:
            yield
        finally:
            self._stack.pop()
VERBOSE_LEVEL = ContextVar(10)
INTERACTIVE = ContextVar(False)
FORCE = ContextVar(False)
def output(content, level=0, style=None, end=None):
    """Print *content* unless *level* exceeds the current verbosity.

    Strings are printed as-is (newline terminator by default); any other
    object is pretty-printed as YAML via pyaml (no extra terminator by
    default, since pyaml already ends with a newline).
    """
    import questionary

    if level > VERBOSE_LEVEL.get():
        return
    if isinstance(content, str):
        questionary.print(content, style=style, end="\n" if end is None else end)
    else:
        import pyaml

        buf = io.StringIO()
        pyaml.pprint(
            content,
            dst=buf,
            sort_dicts=False,
            sort_keys=False,
        )
        questionary.print(buf.getvalue(), style=style, end="" if end is None else end)
        buf.close()
class Config(SimpleNamespace):
    """User configuration: the set of model repos and which one is default."""

    # NOTE(review): these are class-level attributes, so a bare Config()
    # shares the same `repos` dict across instances; load_config() passes
    # values through __init__, which creates per-instance attributes instead.
    repos: dict[str, str] = {
        "default": "git+https://github.com/bentoml/openllm-models@main"
    }
    default_repo: str = "default"

    def tolist(self):
        """Serialize to a plain dict for writing to the JSON config file."""
        return dict(
            repos=self.repos,
            default_repo=self.default_repo,
        )
def load_config():
    """Read the Config from CONFIG_FILE.

    Falls back to a default Config when the file is absent or contains
    invalid JSON.
    """
    if not CONFIG_FILE.exists():
        return Config()
    try:
        with open(CONFIG_FILE) as f:
            data = json.load(f)
    except json.JSONDecodeError:
        return Config()
    return Config(**data)
def save_config(config):
    """Persist *config* (a Config) to CONFIG_FILE as pretty-printed JSON."""
    with open(CONFIG_FILE, "w") as f:
        json.dump(config.tolist(), f, indent=2)
class RepoInfo(SimpleNamespace):
    """A configured model repo and where it is cached locally."""

    name: str
    path: pathlib.Path  # local clone under REPO_DIR/<server>/<owner>/<repo>
    url: str  # the original git+https url from the config
    server: str
    owner: str
    repo: str
    branch: str

    def tolist(self):
        """Render for display; detail grows with the current verbosity.

        NOTE(review): implicitly returns None when VERBOSE_LEVEL > 20 --
        confirm no caller relies on the result at that verbosity.
        """
        if VERBOSE_LEVEL.get() <= 0:
            return f"{self.name} ({self.url})"
        if VERBOSE_LEVEL.get() <= 10:
            return dict(
                name=self.name,
                url=self.url,
                path=str(self.path),
            )
        if VERBOSE_LEVEL.get() <= 20:
            return dict(
                name=self.name,
                url=self.url,
                path=str(self.path),
                server=self.server,
                owner=self.owner,
                repo=self.repo,
                branch=self.branch,
            )
class BentoInfo(SimpleNamespace):
    """A bento (packaged model) inside a repo checkout.

    ``path`` points at ``<repo>/bentoml/bentos/<name>/<version>``; ``alias``
    is set when the bento was resolved through an alias file.
    """

    repo: RepoInfo
    path: pathlib.Path
    alias: str = ""

    def __str__(self):
        # bentos from the default repo are shown without a repo prefix
        if self.repo.name == "default":
            return f"{self.tag}"
        else:
            return f"{self.repo.name}/{self.tag}"

    def __hash__(self):
        # identity is the on-disk location
        return md5(str(self.path))

    @property
    def tag(self) -> str:
        """Display tag; uses the alias as the version part when set."""
        if self.alias:
            return f"{self.path.parent.name}:{self.alias}"
        return f"{self.path.parent.name}:{self.path.name}"

    @property
    def bentoml_tag(self) -> str:
        """The real tag passed to bentoml -- always the concrete version."""
        return f"{self.path.parent.name}:{self.path.name}"

    @property
    def name(self) -> str:
        return self.path.parent.name

    @property
    def version(self) -> str:
        return self.path.name

    @property
    def labels(self) -> dict[str, str]:
        return self.bento_yaml["labels"]

    @functools.cached_property
    def bento_yaml(self) -> dict:
        """Parsed bento.yaml of this bento (read once, then cached)."""
        import yaml
        bento_file = self.path / "bento.yaml"
        return yaml.safe_load(bento_file.read_text())

    @functools.cached_property
    def platforms(self) -> list[str]:
        return self.bento_yaml["labels"].get("platforms", "linux").split(",")

    @functools.cached_property
    def pretty_yaml(self) -> dict:
        """Condensed model card (apis/resources/envs/platforms) for
        single-service bentos; the raw bento.yaml otherwise."""
        def _pretty_routes(routes):
            return {
                route["route"]: {
                    "input": {
                        k: v["type"] for k, v in route["input"]["properties"].items()
                    },
                    "output": route["output"]["type"],
                }
                for route in routes
            }
        if len(self.bento_yaml["services"]) == 1:
            pretty_yaml = {
                "apis": _pretty_routes(self.bento_yaml["schema"]["routes"]),
                "resources": self.bento_yaml["services"][0]["config"]["resources"],
                "envs": self.bento_yaml["envs"],
                "platforms": self.platforms,
            }
            return pretty_yaml
        return self.bento_yaml

    @functools.cached_property
    def pretty_gpu(self) -> str:
        """GPU requirement like "80Gx2"; "" when none is declared."""
        from openllm_next.accelerator_spec import ACCELERATOR_SPECS
        try:
            resources = self.bento_yaml["services"][0]["config"]["resources"]
            if resources["gpu"] > 1:
                acc = ACCELERATOR_SPECS[resources["gpu_type"]]
                return f"{acc.memory_size:.0f}Gx{resources['gpu']}"
            elif resources["gpu"] > 0:
                acc = ACCELERATOR_SPECS[resources["gpu_type"]]
                return f"{acc.memory_size:.0f}G"
        except KeyError:
            # no gpu/gpu_type key, or an unknown accelerator name
            pass
        return ""

    def tolist(self):
        """Render for display; detail grows with verbosity.

        NOTE(review): implicitly returns None when VERBOSE_LEVEL > 20.
        """
        verbose = VERBOSE_LEVEL.get()
        if verbose <= 0:
            return str(self)
        if verbose <= 10:
            return dict(
                tag=self.tag,
                repo=self.repo.tolist(),
                path=str(self.path),
                model_card=self.pretty_yaml,
            )
        if verbose <= 20:
            return dict(
                tag=self.tag,
                repo=self.repo.tolist(),
                path=str(self.path),
                bento_yaml=self.bento_yaml,
            )
class VenvSpec(SimpleNamespace):
    """Identity of a virtualenv: python version plus its package list."""

    python_version: str
    python_packages: dict[str, str]
    name_prefix = ""

    def __hash__(self):
        # NOTE(review): python_version is deliberately (?) excluded -- venvs
        # are keyed on the sorted package set only. Confirm before changing.
        return md5(
            # self.python_version,
            *sorted(self.python_packages),
        )
class Accelerator(SimpleNamespace):
    """A single GPU: model name plus memory size (GiB).

    Ordering and equality compare memory size only.
    """

    model: str
    memory_size: float

    def __gt__(self, other):
        return self.memory_size > other.memory_size

    def __eq__(self, other):
        return self.memory_size == other.memory_size

    # BUG FIX: defining __eq__ sets __hash__ to None, making instances
    # unhashable; restore a hash consistent with __eq__ (equal objects
    # hash equal).
    def __hash__(self):
        return hash(self.memory_size)
class DeploymentTarget(SimpleNamespace):
    """A place a bento can run: the local machine or a cloud instance type."""

    source: str = "local"
    name: str = "local"
    price: str = ""
    platform = "linux"
    accelerators: list[Accelerator]

    def __hash__(self):
        # targets are deduplicated by their source ("local"/"cloud")
        return hash(self.source)

    @property
    def accelerators_repr(self) -> str:
        """Human-readable GPU summary, e.g. "A100 x2"; "null" when none."""
        distinct_models = {acc.model for acc in self.accelerators}
        if not distinct_models:
            return "null"
        if len(distinct_models) == 1:
            first = self.accelerators[0]
            return f"{first.model} x{len(self.accelerators)}"
        return ", ".join(acc.model for acc in self.accelerators)
def run_command(
    cmd,
    cwd=None,
    env=None,
    copy_env=True,
    venv=None,
    silent=False,
) -> subprocess.CompletedProcess:
    """Run *cmd*, echoing a shell-style transcript unless *silent*.

    ``bentoml``/``python`` commands are rewritten to run with the
    interpreter of *venv* (or the current interpreter). *env* entries are
    layered on top of ``os.environ`` when *copy_env* is true.

    Raises typer.Exit(1) when the command exits non-zero.
    """
    import shlex

    env = env or {}
    cmd = [str(c) for c in cmd]
    bin_dir = "Scripts" if os.name == "nt" else "bin"
    if not silent:
        output("\n")
        if cwd:
            output(f"$ cd {cwd}", style="orange")
        if env:
            for k, v in env.items():
                output(f"$ export {k}={shlex.quote(v)}", style="orange")
        if venv:
            output(f"$ source {venv / 'bin' / 'activate'}", style="orange")
        output(f"$ {' '.join(cmd)}", style="orange")
    if venv:
        py = venv / bin_dir / f"python{sysconfig.get_config_var('EXE')}"
    else:
        py = sys.executable
    if copy_env:
        env = {**os.environ, **env}
    if cmd and cmd[0] == "bentoml":
        cmd = [py, "-m", "bentoml"] + cmd[1:]
    if cmd and cmd[0] == "python":
        cmd = [py] + cmd[1:]
    try:
        # BUG FIX: check=True is required for the except clause below to ever
        # fire -- subprocess.run() does not raise CalledProcessError on its
        # own, so failures used to be silently reported as success.
        if silent:
            return subprocess.run(  # type: ignore
                cmd,
                cwd=cwd,
                env=env,
                check=True,
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
            )
        else:
            return subprocess.run(
                cmd,
                cwd=cwd,
                env=env,
                check=True,
            )
    except subprocess.CalledProcessError:
        output("Command failed", style="red")
        raise typer.Exit(1)
async def stream_command_output(stream, style="gray"):
    """Forward an asyncio subprocess stream to output(), line by line."""
    async for line in stream:
        output(line.decode(), style=style, end="")
@asynccontextmanager
async def async_run_command(
    cmd,
    cwd=None,
    env=None,
    copy_env=True,
    venv=None,
    silent=True,
):
    """Async variant of run_command: yields the running process.

    On context exit the process is interrupted (SIGINT) if still running
    and always awaited, so no child process is left behind.
    """
    import shlex

    env = env or {}
    cmd = [str(c) for c in cmd]
    if not silent:
        output("\n")
        if cwd:
            output(f"$ cd {cwd}", style="orange")
        if env:
            for k, v in env.items():
                output(f"$ export {k}={shlex.quote(v)}", style="orange")
        if venv:
            output(f"$ source {venv / 'bin' / 'activate'}", style="orange")
        output(f"$ {' '.join(cmd)}", style="orange")
    if venv:
        py = venv / "bin" / "python"
    else:
        py = sys.executable
    if copy_env:
        env = {**os.environ, **env}
    if cmd and cmd[0] == "bentoml":
        cmd = [py, "-m", "bentoml"] + cmd[1:]
    if cmd and cmd[0] == "python":
        cmd = [py] + cmd[1:]
    proc = None
    try:
        # BUG FIX: exec (not shell with " ".join(cmd)) avoids breaking on
        # arguments containing spaces or shell metacharacters.
        proc = await asyncio.create_subprocess_exec(
            *map(str, cmd),
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
            cwd=cwd,
            env=env,
        )
        yield proc
    finally:
        if proc is not None:
            # only signal a process that is still running
            if proc.returncode is None:
                proc.send_signal(signal.SIGINT)
            await proc.wait()
def md5(*strings: str) -> int:
    """Digest the given strings (concatenated) into a single integer."""
    digest = hashlib.md5()
    for text in strings:
        digest.update(text.encode())
    return int(digest.hexdigest(), 16)

117
openllm_next/local.py Normal file
View File

@@ -0,0 +1,117 @@
import asyncio
import time
import httpx
from openllm_next.common import (
BentoInfo,
async_run_command,
output,
run_command,
stream_command_output,
)
from openllm_next.venv import ensure_venv
def _get_serve_cmd(bento: BentoInfo, port: int = 3000):
cmd = ["bentoml", "serve", bento.bentoml_tag]
if port != 3000:
cmd += ["--port", str(port)]
env = {
"BENTOML_HOME": f"{bento.repo.path}/bentoml",
}
return cmd, env, None
def serve(
    bento: BentoInfo,
    port: int = 3000,
):
    """Run `bentoml serve` for *bento* inside its dedicated venv (blocking)."""
    venv = ensure_venv(bento)
    cmd, env, cwd = _get_serve_cmd(bento, port=port)
    run_command(cmd, env=env, cwd=cwd, venv=venv)
async def _run_model(
    bento: BentoInfo,
    port: int = 3000,
    timeout: int = 600,
):
    """Start a local model server for *bento*, wait until ready, then run an
    interactive chat loop against its OpenAI-compatible API.

    Server logs are mirrored to the console only when startup takes longer
    than 30 seconds; the chat loop exits on Ctrl-C.
    """
    venv = ensure_venv(bento)
    cmd, env, cwd = _get_serve_cmd(bento, port)
    async with async_run_command(
        cmd,
        env=env,
        cwd=cwd,
        venv=venv,
        silent=False,
    ) as server_proc:
        output(f"Model server started {server_proc.pid}")
        stdout_streamer = None
        stderr_streamer = None
        start_time = time.time()
        output("Model loading...", style="green")
        # poll the readiness endpoint roughly once per second, up to *timeout*
        for _ in range(timeout):
            try:
                resp = httpx.get(f"http://localhost:{port}/readyz", timeout=3)
                if resp.status_code == 200:
                    break
            except httpx.RequestError:
                # after 30s without a reachable server, start mirroring its
                # stdout/stderr so the user can see what is taking so long
                if time.time() - start_time > 30:
                    if not stdout_streamer:
                        stdout_streamer = asyncio.create_task(
                            stream_command_output(server_proc.stdout, style="gray")
                        )
                    if not stderr_streamer:
                        stderr_streamer = asyncio.create_task(
                            stream_command_output(server_proc.stderr, style="#BD2D0F")
                        )
            await asyncio.sleep(1)
        else:
            # for-else: the loop exhausted *timeout* without a 200 response
            output("Model failed to load", style="red")
            server_proc.terminate()
            return
        if stdout_streamer:
            stdout_streamer.cancel()
        if stderr_streamer:
            stderr_streamer.cancel()
        output("Model is ready", style="green")
        messages: list[dict[str, str]] = []
        from openai import AsyncOpenAI
        client = AsyncOpenAI(base_url=f"http://localhost:{port}/v1", api_key="local")
        # the server exposes exactly one model; use its id for all requests
        model_id = (await client.models.list()).data[0].id
        while True:
            try:
                message = input("user: ")
                if message == "":
                    output("empty message, please enter something", style="yellow")
                    continue
                messages.append(dict(role="user", content=message))
                output("assistant: ", end="", style="lightgreen")
                assistant_message = ""
                stream = await client.chat.completions.create(
                    model=model_id,
                    messages=messages,  # type: ignore
                    stream=True,
                )
                async for chunk in stream:
                    text = chunk.choices[0].delta.content or ""
                    assistant_message += text
                    output(text, end="", style="lightgreen")
                messages.append(dict(role="assistant", content=assistant_message))
                output("")
            except KeyboardInterrupt:
                break
        output("\nStopping model server...", style="green")
    # exiting the async context sends SIGINT to the server and awaits it
    output("Stopped model server", style="green")
def run(bento: BentoInfo, port: int = 3000, timeout: int = 600):
    """Serve *bento* locally and drop into an interactive chat REPL."""
    asyncio.run(_run_model(bento, port=port, timeout=timeout))

173
openllm_next/model.py Normal file
View File

@@ -0,0 +1,173 @@
import typing
from typing import Optional
import tabulate
import typer
from openllm_next.accelerator_spec import DeploymentTarget, can_run
from openllm_next.analytic import OpenLLMTyper
from openllm_next.common import (
FORCE,
VERBOSE_LEVEL,
BentoInfo,
load_config,
output,
)
from openllm_next.repo import ensure_repo_updated, parse_repo_url
app = OpenLLMTyper(help="manage models")
@app.command()
def get(
    tag: str,
    repo: Optional[str] = None,
    verbose: bool = False,
):
    """Show details of a single model, resolved the same way `run` would."""
    if verbose:
        VERBOSE_LEVEL.set(20)
    bento_info = ensure_bento(tag, repo_name=repo)
    if bento_info:
        output(bento_info)
@app.command(name="list")
def list_(
    tag: Optional[str] = None,
    repo: Optional[str] = None,
    verbose: bool = False,
):
    """Print a table of available models, visually grouped by model name."""
    if verbose:
        VERBOSE_LEVEL.set(20)
    bentos = list_bento(tag=tag, repo_name=repo)
    bentos.sort(key=lambda x: x.name)
    seen = set()
    def is_seen(value):
        # blank out repeated model names so the table reads as groups
        if value in seen:
            return True
        seen.add(value)
        return False
    table = tabulate.tabulate(
        [
            [
                "" if is_seen(bento.name) else bento.name,
                bento.tag,
                bento.repo.name,
                bento.pretty_gpu,
                ",".join(bento.platforms),
            ]
            for bento in bentos
        ],
        headers=["model", "version", "repo", "required VRAM", "platforms"],
    )
    output(table)
def ensure_bento(
    model: str,
    target: Optional[DeploymentTarget] = None,
    repo_name: Optional[str] = None,
) -> BentoInfo:
    """Resolve *model* to exactly one BentoInfo or exit with an error.

    A single match is returned directly. With multiple matches the user is
    asked to be more specific and the program exits with code 1; when a
    *target* is given, matches that cannot run on it are filtered first.
    """
    bentos = list_bento(model, repo_name=repo_name)
    if len(bentos) == 0:
        output(f"No model found for {model}", style="red")
        raise typer.Exit(1)
    if len(bentos) == 1:
        if FORCE.get():
            output(f"Found model {bentos[0]}", style="green")
            return bentos[0]
        if target is None:
            return bentos[0]
        if can_run(bentos[0], target) <= 0:
            return bentos[0]
        output(f"Found model {bentos[0]}", style="green")
        return bentos[0]
    # more than one match from here on
    if target is None:
        output(
            f"Multiple models match {model}, did you mean one of these?",
            style="red",
        )
        for bento in bentos:
            output(f" {bento}")
        raise typer.Exit(1)
    filtered = [bento for bento in bentos if can_run(bento, target) > 0]
    # BUG FIX: this zero-check used to appear twice verbatim; the second
    # copy was dead code and has been removed.
    if len(filtered) == 0:
        output(f"No deployment target found for {model}", style="red")
        raise typer.Exit(1)
    if len(bentos) > 1:
        output(
            f"Multiple models match {model}, did you mean one of these?",
            style="red",
        )
        for bento in bentos:
            output(f" {bento}")
        raise typer.Exit(1)
    return bentos[0]
def list_bento(
    tag: typing.Optional[str] = None,
    repo_name: typing.Optional[str] = None,
    include_alias: bool = False,
) -> typing.List[BentoInfo]:
    """Enumerate bentos in the cached repos, optionally filtered by *tag*
    (``name`` or ``name:version``) and by repo.

    Alias files (plain files whose content names the real version) are
    resolved to their target; unless *include_alias*, duplicates of the same
    name:version are dropped, keeping the first occurrence.
    """
    ensure_repo_updated()
    if repo_name is not None:
        config = load_config()
        if repo_name not in config.repos:
            output(f"Repo `{repo_name}` not found, did you mean one of these?")
            # NOTE(review): this loop rebinds the `repo_name` parameter, but
            # execution always ends at the raise below, so it is harmless
            for repo_name in config.repos:
                output(f" {repo_name}")
            raise typer.Exit(1)
    if not tag:
        glob_pattern = "bentoml/bentos/*/*"
    elif ":" in tag:
        bento_name, version = tag.split(":")
        glob_pattern = f"bentoml/bentos/{bento_name}/{version}"
    else:
        glob_pattern = f"bentoml/bentos/{tag}/*"
    model_list = []
    config = load_config()
    for _repo_name, repo_url in config.repos.items():
        if repo_name is not None and _repo_name != repo_name:
            continue
        repo = parse_repo_url(repo_url, _repo_name)
        for path in repo.path.glob(glob_pattern):
            if path.is_dir() and (path / "bento.yaml").exists():
                model = BentoInfo(repo=repo, path=path)
            elif path.is_file():
                # alias file: its content names the real version directory
                with open(path) as f:
                    origin_name = f.read().strip()
                origin_path = path.parent / origin_name
                model = BentoInfo(alias=path.name, repo=repo, path=origin_path)
            else:
                model = None
            if model:
                model_list.append(model)
    model_list.sort(key=lambda x: x.tag)
    if not include_alias:
        seen = set()
        # set.add returns None (falsy), so this keeps the first occurrence
        # of each name:version and records it in `seen` as a side effect
        model_list = [
            x
            for x in model_list
            if not (
                f"{x.bento_yaml['name']}:{x.bento_yaml['version']}" in seen
                or seen.add(f"{x.bento_yaml['name']}:{x.bento_yaml['version']}")
            )
        ]
    return model_list

203
openllm_next/repo.py Normal file
View File

@@ -0,0 +1,203 @@
import datetime
import re
import shutil
import pyaml
import questionary
import typer
from openllm_next.analytic import OpenLLMTyper
from openllm_next.common import (
INTERACTIVE,
REPO_DIR,
VERBOSE_LEVEL,
RepoInfo,
load_config,
output,
save_config,
)
UPDATE_INTERVAL = datetime.timedelta(days=3)
app = OpenLLMTyper(help="manage repos")
@app.command()
def list(verbose: bool = False):
    """Print all configured repos (name, url, cache path, ...) as YAML."""
    if verbose:
        VERBOSE_LEVEL.set(20)
    config = load_config()
    pyaml.pprint(
        [parse_repo_url(repo, name) for name, repo in config.repos.items()],
        sort_dicts=False,
        sort_keys=False,
    )
@app.command()
def remove(name: str):
    """Remove a repo from the config. Its on-disk cache is swept by the
    next `openllm repo update`."""
    config = load_config()
    if name not in config.repos:
        output(f"Repo {name} does not exist", style="red")
        return
    del config.repos[name]
    save_config(config)
    output(f"Repo {name} removed", style="green")
def _complete_alias(repo_name: str):
    """Write alias files for every bento with an `openllm_alias` label.

    Each comma-separated alias becomes a file next to the version directory
    whose content is the concrete version, so `name:alias` resolves later in
    list_bento().
    """
    from openllm_next.model import list_bento
    for bento in list_bento(repo_name=repo_name):
        alias = bento.labels.get("openllm_alias", "").strip()
        if alias:
            for a in alias.split(","):
                with open(bento.path.parent / a, "w") as f:
                    f.write(bento.version)
@app.command()
def update():
    """Refresh all configured repos and rebuild alias files.

    Each repo is re-cloned shallowly (TODO: pull instead of remove+clone),
    caches of repos no longer configured are removed, and the last-update
    timestamp is written.
    """
    import dulwich
    import dulwich.errors
    import dulwich.porcelain

    config = load_config()
    repos_in_use = set()
    for repo_name, repo in config.repos.items():
        repo = parse_repo_url(repo, repo_name)
        repos_in_use.add((repo.server, repo.owner, repo.repo))
        if repo.path.exists():  # TODO: use update instead of remove and clone
            shutil.rmtree(repo.path, ignore_errors=True)
        if not repo.path.exists():
            repo.path.parent.mkdir(parents=True, exist_ok=True)
            try:
                dulwich.porcelain.clone(
                    f"https://{repo.server}/{repo.owner}/{repo.repo}.git",
                    str(repo.path),
                    checkout=True,
                    depth=1,
                    branch=repo.branch,
                )
                output("")
                output(f"Repo `{repo.name}` updated", style="green")
            # BUG FIX: bare `except:` also swallowed KeyboardInterrupt and
            # SystemExit; narrowed to Exception
            except Exception:
                shutil.rmtree(repo.path, ignore_errors=True)
                output(f"Failed to clone repo {repo.name}", style="red")
        else:
            try:
                dulwich.porcelain.pull(
                    str(repo.path),
                    f"https://{repo.server}/{repo.owner}/{repo.repo}.git",
                    refspecs=repo.branch,
                    force=True,
                )
                dulwich.porcelain.clean(str(repo.path), str(repo.path))
                output("")
                output(f"Repo `{repo.name}` updated", style="green")
            except Exception:
                shutil.rmtree(repo.path, ignore_errors=True)
                output(f"Failed to update repo {repo.name}", style="red")
    # drop caches of repos no longer configured (REPO_DIR/server/owner/repo)
    for c in REPO_DIR.glob("*/*/*"):
        repo_spec = tuple(c.parts[-3:])
        if repo_spec not in repos_in_use:
            shutil.rmtree(c, ignore_errors=True)
            output(f"Removed unused repo cache {c}")
    with open(REPO_DIR / "last_update", "w") as f:
        f.write(datetime.datetime.now().isoformat())
    for repo_name in config.repos:
        _complete_alias(repo_name)
def ensure_repo_updated():
    """Ensure the local repo cache exists and is reasonably fresh.

    Never-updated cache: prompt to update (interactive) or exit 1 with an
    instruction (non-interactive). Cache older than UPDATE_INTERVAL: prompt
    (interactive) or just warn.
    """
    last_update_file = REPO_DIR / "last_update"
    if not last_update_file.exists():
        if INTERACTIVE.get():
            choice = questionary.confirm(
                "The repo cache is never updated, do you want to update it to fetch the latest model list?"
            ).ask()
            if choice:
                update()
            return
        else:
            output(
                "The repo cache is never updated, please run `openllm repo update` to fetch the latest model list",
                style="red",
            )
            raise typer.Exit(1)
    last_update = datetime.datetime.fromisoformat(last_update_file.read_text().strip())
    if datetime.datetime.now() - last_update > UPDATE_INTERVAL:
        if INTERACTIVE.get():
            choice = questionary.confirm(
                "The repo cache is outdated, do you want to update it to fetch the latest model list?"
            ).ask()
            if choice:
                update()
        else:
            # stale cache is not fatal in non-interactive mode: warn only
            output(
                "The repo cache is outdated, please run `openllm repo update` to fetch the latest model list",
                style="yellow",
            )
GIT_REPO_RE = re.compile(
    r"git\+https://(?P<server>.+)/(?P<owner>.+)/(?P<repo>.+?)(@(?P<branch>.+))?$"
)
def parse_repo_url(repo_url, repo_name=None) -> RepoInfo:
    """
    Parse a ``git+https://`` repo url into a RepoInfo (server, owner, repo,
    branch, local cache path under REPO_DIR). The branch defaults to "main"
    when the url has no ``@branch`` suffix, so both

        git+https://github.com/bentoml/bentovllm@main
        git+https://github.com/bentoml/bentovllm

    yield server="github.com", owner="bentoml", repo="bentovllm",
    branch="main". (The previous docstring showed doctests returning a
    tuple, which this function never did.)

    Raises ValueError when the url does not match GIT_REPO_RE.
    """
    match = GIT_REPO_RE.match(repo_url)
    if not match:
        raise ValueError(f"Invalid git repo url: {repo_url}")
    server = match.group("server")
    owner = match.group("owner")
    repo = match.group("repo")
    branch = match.group("branch") or "main"
    # repos are cached under REPO_DIR/<server>/<owner>/<repo>
    path = REPO_DIR / server / owner / repo
    return RepoInfo(
        name=repo if repo_name is None else repo_name,
        url=repo_url,
        server=server,
        owner=owner,
        repo=repo,
        branch=branch,
        path=path,
    )
@app.command()
def add(name: str, repo: str):
    """Register repo url *repo* under *name*, prompting before overwriting."""
    name = name.lower()
    # repo names are used as config keys and shown in tags; keep them simple
    if not name.isidentifier():
        output(
            f"Invalid repo name: {name}, should only contain letters, numbers and underscores",
            style="red",
        )
        return
    config = load_config()
    if name in config.repos:
        override = questionary.confirm(
            f"Repo {name} already exists({config.repos[name]}), override?"
        ).ask()
        if not override:
            return
    config.repos[name] = repo
    save_config(config)
    output(f"Repo {name} added", style="green")
if __name__ == "__main__":
app()

164
openllm_next/venv.py Normal file
View File

@@ -0,0 +1,164 @@
import functools
import os
import pathlib
import shutil
import typing
from typing import Iterable
import typer
from openllm_next.common import (
VENV_DIR,
VERBOSE_LEVEL,
BentoInfo,
VenvSpec,
output,
run_command,
)
@functools.lru_cache
def _resolve_packages(requirement: typing.Union[pathlib.Path, str]):
    """Parse a requirements file (following nested ``-r`` includes) into a
    list of requirement objects. Cached per path."""
    from pip_requirements_parser import RequirementsFile
    requirements_txt = RequirementsFile.from_file(
        str(requirement),
        include_nested=True,
    )
    return requirements_txt.requirements
def _filter_preheat_packages(requirements: Iterable) -> list[str]:
PREHEAT_PIP_PACKAGES = ["torch", "vllm"]
deps: list[str] = []
for req in requirements:
if (
req.is_editable
or req.is_local_path
or req.is_url
or req.is_wheel
or not req.name
or not req.specifier
):
continue
for sp in req.specifier:
if sp.operator == "==" and req.name in PREHEAT_PIP_PACKAGES:
assert req.line is not None
deps.append(req.line)
break
return deps
@functools.lru_cache
def _resolve_bento_env_specs(bento: BentoInfo):
    """Derive the two venv layers for *bento*:

    1. a "preheat" venv holding only the pinned heavy packages (torch/vllm),
       shareable between bentos with identical pins, and
    2. the full-requirements venv layered on top of it.
    """
    ver_file = bento.path / "env" / "python" / "version.txt"
    assert ver_file.exists(), f"cannot find version file in {bento.path}"
    # prefer the fully-resolved lock file when the bento ships one
    lock_file = bento.path / "env" / "python" / "requirements.lock.txt"
    if not lock_file.exists():
        lock_file = bento.path / "env" / "python" / "requirements.txt"
    reqs = _resolve_packages(lock_file)
    preheat_packages = _filter_preheat_packages(reqs)
    ver = ver_file.read_text().strip()
    return (
        VenvSpec(
            python_version=ver,
            python_packages=preheat_packages,
            name_prefix=f"{bento.tag.replace(':', '_')}-1-",
        ),
        VenvSpec(
            python_version=ver,
            python_packages=[v.line for v in reqs],
            name_prefix=f"{bento.tag.replace(':', '_')}-2-",
        ),
    )
def _get_lib_dir(venv: pathlib.Path) -> pathlib.Path:
if os.name == "nt":
return venv / "Lib/site-packages"
else:
return next(venv.glob("lib/python*")) / "site-packages"
def _ensure_venv(
    env_spec: VenvSpec,
    parrent_venv: typing.Optional[pathlib.Path] = None,  # [sic] typo kept: renaming would change the keyword API
) -> pathlib.Path:
    """Create (or reuse) the venv for *env_spec*, layered on *parrent_venv*.

    The venv lives under VENV_DIR keyed by hash(env_spec); a DONE marker
    file distinguishes finished installs from aborted ones, which are
    rebuilt from scratch.
    """
    venv = VENV_DIR / str(hash(env_spec))
    # a venv without the DONE marker is a leftover of a failed install
    if venv.exists() and not (venv / "DONE").exists():
        shutil.rmtree(venv, ignore_errors=True)
    if not venv.exists():
        output(f"Installing model dependencies({venv})...", style="green")
        venv_py = (
            venv / "Scripts" / "python.exe"
            if os.name == "nt"
            else venv / "bin" / "python"
        )
        try:
            run_command(
                ["python", "-m", "uv", "venv", venv],
                silent=VERBOSE_LEVEL.get() < 10,
            )
            lib_dir = _get_lib_dir(venv)
            if parrent_venv is not None:
                # a .pth file makes the parent venv's site-packages visible
                # here, so shared heavy packages are not reinstalled
                parent_lib_dir = _get_lib_dir(parrent_venv)
                with open(lib_dir / f"{parrent_venv.name}.pth", "w+") as f:
                    f.write(str(parent_lib_dir))
            with open(venv / "requirements.txt", "w") as f:
                f.write("\n".join(sorted(env_spec.python_packages)))
            run_command(
                [
                    "python",
                    "-m",
                    "uv",
                    "pip",
                    "install",
                    "-p",
                    str(venv_py),
                    "-r",
                    venv / "requirements.txt",
                ],
                silent=VERBOSE_LEVEL.get() < 10,
            )
            # write the marker only after everything above succeeded
            with open(venv / "DONE", "w") as f:
                f.write("DONE")
        except Exception:
            shutil.rmtree(venv, ignore_errors=True)
            output(
                f"Failed to install dependencies to {venv}. Cleaned up.",
                style="red",
            )
            raise typer.Exit(1)
        output(f"Successfully installed dependencies to {venv}.", style="green")
        return venv
    else:
        return venv
def _ensure_venvs(env_spec_list: Iterable[VenvSpec]) -> pathlib.Path:
    """Materialize a chain of venvs, each layered on the previous one via a
    .pth link, and return the last (outermost) venv."""
    chained = None
    for spec in env_spec_list:
        chained = _ensure_venv(spec, chained)
    assert chained is not None
    return chained
def ensure_venv(bento: BentoInfo) -> pathlib.Path:
    """Return the (chained) venv for *bento*, building it if necessary."""
    return _ensure_venvs(_resolve_bento_env_specs(bento))
def _check_venv(env_spec: VenvSpec) -> bool:
    """True when the venv for *env_spec* exists and finished installing."""
    venv = VENV_DIR / str(hash(env_spec))
    # the DONE marker is written only after a successful install
    return venv.exists() and (venv / "DONE").exists()
def check_venv(bento: BentoInfo) -> bool:
    """True when every venv required by *bento* is already built."""
    return all(_check_venv(env_spec) for env_spec in _resolve_bento_env_specs(bento))

34
pyproject.toml Normal file
View File

@@ -0,0 +1,34 @@
[build-system]
requires = ["setuptools>=42", "wheel"]
build-backend = "setuptools.build_meta"
[project]
name = "openllm-next"
version = "0.0.1"
description = "OpenLLM: run and deploy open-source large language models with BentoML."
authors = [{name = "oasiszero", email = "oasis0.com@gmail.com"}]
license = {file = "LICENSE"}
dependencies = [
"bentoml",
"typer",
"questionary",
"pyaml",
"psutil",
"pathlib",
"pip_requirements_parser",
"nvidia-ml-py",
"dulwich",
"tabulate",
"uv",
"openai==1.35.9",
]
[project.scripts]
openllm = "openllm_next.__main__:main"
[tool.typer]
src-dir = "openllm_next"
[tool.isort]
multi_line_output = 3
include_trailing_comma = true