From 58fa8a70cb39a65ca478b778fe058da9bafbe308 Mon Sep 17 00:00:00 2001 From: bojiang Date: Sat, 18 May 2024 12:41:54 +0800 Subject: [PATCH] feat: repo/model/serve --- .gitignore | 161 ++++++++++++ LICENSE | 201 +++++++++++++++ README.md | 150 +++++++++++ cllama/__init__.py | 0 cllama/__main__.py | 295 +++++++++++++++++++++ cllama/aws.py | 630 +++++++++++++++++++++++++++++++++++++++++++++ cllama/spec.py | 11 + pyproject.toml | 22 ++ req.txt | 7 + 9 files changed, 1477 insertions(+) create mode 100644 .gitignore create mode 100644 LICENSE create mode 100644 README.md create mode 100644 cllama/__init__.py create mode 100644 cllama/__main__.py create mode 100644 cllama/aws.py create mode 100644 cllama/spec.py create mode 100644 pyproject.toml create mode 100644 req.txt diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..7751b092 --- /dev/null +++ b/.gitignore @@ -0,0 +1,161 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ +*.whl diff --git a/LICENSE b/LICENSE new file mode 100644 index 00000000..261eeb9e --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/README.md b/README.md new file mode 100644 index 00000000..ceb0ae1c --- /dev/null +++ b/README.md @@ -0,0 +1,150 @@ +
+# Self-host LLMs with vLLM and BentoML
+
+This is a BentoML example project, showing you how to serve and deploy open-source Large Language Models using [vLLM](https://vllm.ai), a high-throughput and memory-efficient inference engine.
+
+See [here](https://github.com/bentoml/BentoML?tab=readme-ov-file#%EF%B8%8F-what-you-can-build-with-bentoml) for a full list of BentoML example projects.
+
+💡 This example serves as a basis for advanced code customization, such as custom models, inference logic, or vLLM options. For simple LLM hosting with an OpenAI-compatible endpoint, without writing any code, see [OpenLLM](https://github.com/bentoml/OpenLLM).
+
+## Prerequisites
+
+- You have installed Python 3.8+ and `pip`. See the [Python downloads page](https://www.python.org/downloads/) to learn more.
+- You have a basic understanding of key concepts in BentoML, such as Services. We recommend you read [Quickstart](https://docs.bentoml.com/en/1.2/get-started/quickstart.html) first.
+- If you want to test the Service locally, you need an Nvidia GPU with at least 16 GB of VRAM.
+- (Optional) We recommend you create a virtual environment for dependency isolation for this project. See the [Conda documentation](https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html) or the [Python documentation](https://docs.python.org/3/library/venv.html) for details.
+
+## Install dependencies
+
+```bash
+git clone https://github.com/bentoml/BentoVLLM.git
+cd BentoVLLM/mistral-7b-instruct
+pip install -r requirements.txt && pip install -U "pydantic>=2.0"
+```
+
+## Run the BentoML Service
+
+We have defined a BentoML Service in `service.py`. Run `bentoml serve` in your project directory to start the Service.
+
+```bash
+$ bentoml serve .
+
+2024-01-18T07:51:30+0800 [INFO] [cli] Starting production HTTP BentoServer from "service:VLLM" listening on http://localhost:3000 (Press CTRL+C to quit)
+INFO 01-18 07:51:40 model_runner.py:501] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
+INFO 01-18 07:51:40 model_runner.py:505] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode.
+INFO 01-18 07:51:46 model_runner.py:547] Graph capturing finished in 6 secs.
+```
+
+The server is now active at [http://localhost:3000](http://localhost:3000/). You can interact with it using the Swagger UI or in other ways:
+
+### CURL
+
+```bash
+curl -X 'POST' \
+  'http://localhost:3000/generate' \
+  -H 'accept: text/event-stream' \
+  -H 'Content-Type: application/json' \
+  -d '{
+    "prompt": "Explain superconductors like I'\''m five years old",
+    "tokens": null
+}'
+```
+
+### Python client
+
+```python
+import bentoml
+
+with bentoml.SyncHTTPClient("http://localhost:3000") as client:
+    response_generator = client.generate(
+        prompt="Explain superconductors like I'm five years old",
+        tokens=None
+    )
+    for response in response_generator:
+        print(response)
+```
+
+### OpenAI-compatible endpoints
+
+This Service uses the `@openai_endpoints` decorator to set up OpenAI-compatible endpoints (`chat/completions` and `completions`). This means your client can interact with the backend Service (in this case, the VLLM class) as if it were communicating directly with OpenAI's API. This [utility](mistral-7b-instruct/bentovllm_openai/) does not affect your BentoML Service code, and you can use it for other LLMs as well.
+
+```python
+from openai import OpenAI
+
+client = OpenAI(base_url='http://localhost:3000/v1', api_key='na')
+
+# List the available models
+client.models.list()
+
+chat_completion = client.chat.completions.create(
+    model="mistralai/Mistral-7B-Instruct-v0.2",
+    messages=[
+        {
+            "role": "user",
+            "content": "Explain superconductors like I'm five years old"
+        }
+    ],
+    stream=True,
+)
+for chunk in chat_completion:
+    # Extract and print the content of the model's reply
+    print(chunk.choices[0].delta.content or "", end="")
+```
+
+**Note**: If your Service is deployed with [protected endpoints on BentoCloud](https://docs.bentoml.com/en/latest/bentocloud/how-tos/manage-access-token.html#access-protected-deployments), you need to set the environment variable `OPENAI_API_KEY` to your BentoCloud API key first.
+
+```bash
+export OPENAI_API_KEY={YOUR_BENTOCLOUD_API_TOKEN}
+```
+
+You can then use the following line to replace the client in the above code snippet. Refer to [Obtain the endpoint URL](https://docs.bentoml.com/en/latest/bentocloud/how-tos/call-deployment-endpoints.html#obtain-the-endpoint-url) to retrieve the endpoint URL.
+
+```python
+client = OpenAI(base_url='your_bentocloud_deployment_endpoint_url/v1')
+```
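+
+Besides `chat/completions`, the plain `completions` endpoint is exposed as well. A minimal sketch, reusing the `client` configured above (the prompt and `max_tokens` values are illustrative):
+
+```python
+completion = client.completions.create(
+    model="mistralai/Mistral-7B-Instruct-v0.2",
+    prompt="Explain superconductors like I'm five years old",
+    max_tokens=256,
+)
+print(completion.choices[0].text)
+```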
+
+For detailed explanations of the Service code, see [vLLM inference](https://docs.bentoml.org/en/latest/use-cases/large-language-models/vllm.html).
+
+## Deploy to BentoCloud
+
+After the Service is ready, you can deploy the application to BentoCloud for better management and scalability. [Sign up](https://www.bentoml.com/) if you haven't got a BentoCloud account.
+
+Make sure you have [logged in to BentoCloud](https://docs.bentoml.com/en/latest/bentocloud/how-tos/manage-access-token.html), then run the following command to deploy it.
+
+```bash
+bentoml deploy .
+```
+
+Once the application is up and running on BentoCloud, you can access it via the exposed URL.
+
+**Note**: For custom deployment in your own infrastructure, use [BentoML to generate an OCI-compliant image](https://docs.bentoml.com/en/latest/guides/containerization.html).
+
+## Different LLM Models
+
+Besides the mistral-7b-instruct model, we have examples for other models in subdirectories of this repository. Below is a list of these models and links to the example subdirectories.
+
+- [Mistral-7B-Instruct-v0.2](mistral-7b-instruct/)
+- [Mixtral-8x7B-Instruct-v0.1 with GPTQ quantization](mistral-7b-instruct/)
+- [Llama-2-7b-chat-hf](llama2-7b-chat/)
+- [SOLAR-10.7B-v1.0](solar-10.7b-instruct/)
+
+## LLM tools integration examples
+
+- Every model directory contains code to add OpenAI-compatible endpoints to the BentoML Service.
+- [outlines-integration/](outlines-integration/) contains the code to integrate with [outlines](https://github.com/outlines-dev/outlines) for structured generation.
diff --git a/cllama/__init__.py b/cllama/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/cllama/__main__.py b/cllama/__main__.py
new file mode 100644
index 00000000..9c08f408
--- /dev/null
+++ b/cllama/__main__.py
@@ -0,0 +1,295 @@
+import typer
+import shlex
+import os
+from typing_extensions import TypedDict
+import collections
+
+import prompt_toolkit
+import shutil
+import pydantic
+import yaml
+import json
+import questionary
+import re
+import subprocess
+import pyaml
+import pathlib
+from cllama.spec import GPU_MEMORY
+
+
+ERROR_STYLE = "red"
+SUCCESS_STYLE = "green"
+
+
+CLLAMA_HOME = pathlib.Path.home() / ".openllm_next"
+REPO_DIR = CLLAMA_HOME / "repos"
+TEMP_DIR = CLLAMA_HOME / "temp"
+VENV_DIR = CLLAMA_HOME / "venv"
+
+REPO_DIR.mkdir(exist_ok=True, parents=True)
+TEMP_DIR.mkdir(exist_ok=True, parents=True)
+VENV_DIR.mkdir(exist_ok=True, parents=True)
+
+CONFIG_FILE = CLLAMA_HOME / "config.json"
+
+
+app = typer.Typer()
+repo_app = typer.Typer()
+model_app = typer.Typer()
+
+app.add_typer(repo_app, name="repo")
+app.add_typer(model_app, name="model")
+
+
+class Config(pydantic.BaseModel):
+    repos: dict[str, str] = {
+        "default": "git+https://github.com/bojiang/bentovllm@main#subdirectory=bentoml"
+    }
+    default_repo: str = "default"
+
+
+def _load_config():
+    if CONFIG_FILE.exists():
+        with open(CONFIG_FILE) as f:
+            return Config(**json.load(f))
+    return Config()
+
+
+def _save_config(config):
+    with open(CONFIG_FILE, "w") as f:
+        json.dump(config.dict(), f, indent=2)
+
+
+class RepoInfo(TypedDict):
+    name: str
+    path: str
+    url: str
+    server: str
+    owner: str
+    repo: str
+    branch: str
+
+
+class ModelInfo(TypedDict):
+    repo: RepoInfo
+    path: str
+
+
+class BentoInfo(TypedDict):
+    model: ModelInfo
+    bento_yaml: dict
+
+
+def _load_model_map() -> dict[str, dict[str, ModelInfo]]:
+    model_map = collections.defaultdict(dict)
+    config = _load_config()
+    for repo_name, repo_url in config.repos.items():
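+        # A repo checkout lives under REPO_DIR/<server>/<owner>/<repo>. Every
+        # directory under bentoml/bentos/<bento>/<version> is a packaged bento;
+        # a plain file at that path is an alias whose content names the real
+        # version directory next to it.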
+        server, owner, repo, branch = _parse_repo_url(repo_url)
+        repo_dir = REPO_DIR / server / owner / repo
+        for path in repo_dir.glob("bentoml/bentos/*/*"):
+            if path.is_dir():
+                model_map[path.parent.name][path.name] = ModelInfo(
+                    repo=RepoInfo(
+                        name=repo_name,
+                        url=repo_url,
+                        server=server,
+                        owner=owner,
+                        repo=repo,
+                        branch=branch,
+                        path=str(repo_dir),
+                    ),
+                    path=str(path),
+                )
+            elif path.is_file():
+                with open(path) as f:
+                    origin_name = f.read().strip()
+                origin_path = path.parent / origin_name
+                model_map[path.parent.name][path.name] = ModelInfo(
+                    repo=RepoInfo(
+                        name=repo_name,
+                        url=repo_url,
+                        server=server,
+                        owner=owner,
+                        repo=repo,
+                        branch=branch,
+                        path=str(repo_dir),
+                    ),
+                    path=str(origin_path),
+                )
+    return model_map
+
+
+# The optional trailing "#subdirectory=..." fragment must not leak into the
+# branch group, otherwise `git clone --branch` fails on the default repo URL.
+GIT_REPO_RE = re.compile(
+    r"git\+https://(?P<server>[^/]+)/(?P<owner>[^/]+)/(?P<repo>[^@#]+)"
+    r"(@(?P<branch>[^#]+))?(#.*)?$"
+)
+
+
+@repo_app.command(name="list")
+def repo_list():
+    config = _load_config()
+    pyaml.pprint(config.repos)
+
+
+def _parse_repo_url(repo_url):
+    """
+    Parse a git repo url into (server, owner, repo name, branch).
+    >>> _parse_repo_url("git+https://github.com/bojiang/bentovllm@main")
+    ('github.com', 'bojiang', 'bentovllm', 'main')
+
+    >>> _parse_repo_url("git+https://github.com/bojiang/bentovllm")
+    ('github.com', 'bojiang', 'bentovllm', 'main')
+
+    >>> _parse_repo_url("git+https://github.com/bojiang/bentovllm@main#subdirectory=bentoml")
+    ('github.com', 'bojiang', 'bentovllm', 'main')
+    """
+    match = GIT_REPO_RE.match(repo_url)
+    if not match:
+        raise ValueError(f"Invalid git repo url: {repo_url}")
+    return (
+        match.group("server"),
+        match.group("owner"),
+        match.group("repo"),
+        match.group("branch") or "main",
+    )
+
+
+@repo_app.command(name="add")
+def repo_add(name: str, repo: str):
+    name = name.lower()
+    if not name.isidentifier():
+        questionary.print(
+            f"Invalid repo name: {name}, should only contain letters, numbers and underscores",
+            style=ERROR_STYLE,
+        )
+        return
+
+    config = _load_config()
+    if name in config.repos:
+        override = questionary.confirm(
+            f"Repo {name} already exists ({config.repos[name]}), override?"
+ ).ask() + if not override: + return + + config.repos[name] = repo + _save_config(config) + pyaml.pprint(config.repos) + + +@repo_app.command(name="remove") +def repo_remove(name: str): + config = _load_config() + if name not in config.repos: + questionary.print(f"Repo {name} does not exist", style=ERROR_STYLE) + return + + del config.repos[name] + _save_config(config) + pyaml.pprint(config.repos) + + +def _run_command(cmd, cwd=None, env=None, copy_env=True): + questionary.print("\n") + env = env or {} + if cwd: + questionary.print(f"$ cd {cwd}", style="bold") + if env: + for k, v in env.items(): + questionary.print(f"$ export {k}={shlex.quote(v)}", style="bold") + if copy_env: + env = {**os.environ, **env} + questionary.print(f"$ {' '.join(cmd)}", style="bold") + try: + subprocess.run(cmd, cwd=cwd, env=env, check=True) + except subprocess.CalledProcessError: + questionary.print("Command failed", style=ERROR_STYLE) + return + + +@repo_app.command(name="update") +def repo_update(): + config = _load_config() + repos_in_use = set() + for name, repo in config.repos.items(): + server, owner, repo_name, branch = _parse_repo_url(repo) + repos_in_use.add((server, owner, repo_name)) + repo_dir = REPO_DIR / server / owner / repo_name + if not repo_dir.exists(): + repo_dir.parent.mkdir(parents=True, exist_ok=True) + try: + cmd = [ + "git", + "clone", + "--branch", + branch, + f"https://{server}/{owner}/{repo_name}.git", + str(repo_dir), + ] + _run_command(cmd) + except subprocess.CalledProcessError: + shutil.rmtree(repo_dir, ignore_errors=True) + questionary.print(f"Failed to clone repo {name}", style=ERROR_STYLE) + else: + try: + cmd = ["git", "fetch", "origin", branch] + _run_command(cmd, cwd=repo_dir) + cmd = ["git", "reset", "--hard", f"origin/{branch}"] + _run_command(cmd, cwd=repo_dir) + except: + shutil.rmtree(repo_dir, ignore_errors=True) + questionary.print(f"Failed to update repo {name}", style=ERROR_STYLE) + for repo_dir in REPO_DIR.glob("*/*/*"): + if tuple(repo_dir.parts[-3:]) not in repos_in_use: + shutil.rmtree(repo_dir, ignore_errors=True) + questionary.print(f"Removed unused repo {repo_dir}") + questionary.print("Repos updated", style=SUCCESS_STYLE) + + +@model_app.command(name="list") +def model_list(): + pyaml.pprint(_load_model_map()) + + +def _get_bento_info(tag): + model_map = _load_model_map() + bento, version = tag.split(":") + if bento not in model_map or version not in model_map[bento]: + questionary.print(f"Model {tag} not found", style=ERROR_STYLE) + return + model_info = model_map[bento][version] + path = pathlib.Path(model_info["path"]) + + bento_file = path / "bento.yaml" + bento_info = yaml.safe_load(bento_file.read_text()) + return BentoInfo( + model=model_info, + bento_yaml=bento_info, + ) + + +@model_app.command(name="get") +def model_get(tag: str): + bento_info = _get_bento_info(tag) + if bento_info: + pyaml.pprint(bento_info) + + +def _serve_model(model: str): + if ":" not in model: + model = f"{model}:latest" + bento_info = _get_bento_info(model) + if not bento_info: + questionary.print(f"Model {model} not found", style=ERROR_STYLE) + return + cmd = ["bentoml", "serve", model] + env = { + "CLLAMA_MODEL": model, + "BENTOML_HOME": bento_info["model"]["repo"]["path"] + "/bentoml", + } + _run_command(cmd, env=env) + + +@app.command() +def serve(model: str): + _serve_model(model) + + +if __name__ == "__main__": + app() diff --git a/cllama/aws.py b/cllama/aws.py new file mode 100644 index 00000000..27d86ba9 --- /dev/null +++ b/cllama/aws.py @@ -0,0 +1,630 @@ +import 
typer +import typing +import collections + +import prompt_toolkit +from prompt_toolkit import print_formatted_text as print +import time +import uuid +import shutil +import pydantic +from urllib.parse import urlparse +import yaml +import json +import bentoml +import questionary +import os +import re +import subprocess +import pyaml +import pathlib +from cllama.spec import GPU_MEMORY + +ERROR_STYLE = "red" +SUCCESS_STYLE = "green" + + +CLLAMA_HOME = pathlib.Path.home() / ".openllm_next" +REPO_DIR = CLLAMA_HOME / "repos" +TEMP_DIR = CLLAMA_HOME / "temp" +VENV_DIR = CLLAMA_HOME / "venv" + +REPO_DIR.mkdir(exist_ok=True, parents=True) +TEMP_DIR.mkdir(exist_ok=True, parents=True) +VENV_DIR.mkdir(exist_ok=True, parents=True) + +CONFIG_FILE = CLLAMA_HOME / "config.json" + + +app = typer.Typer() +repo_app = typer.Typer() +model_app = typer.Typer() + +app.add_typer(repo_app, name="repo") +app.add_typer(model_app, name="model") + + +class Config(pydantic.BaseModel): + repos: dict[str, str] = { + "default": "git+https://github.com/bojiang/bentovllm@main#subdirectory=bentoml" + } + default_repo: str = "default" + + +def _load_config(): + if CONFIG_FILE.exists(): + with open(CONFIG_FILE) as f: + return Config(**json.load(f)) + return Config() + + +def _save_config(config): + with open(CONFIG_FILE, "w") as f: + json.dump(config.dict(), f, indent=2) + + +class ModelInfo(typing.TypedDict): + repo: str + path: str + + +def _load_model_map() -> dict[str, dict[str, ModelInfo]]: + model_map = collections.defaultdict(dict) + config = _load_config() + for repo_name, repo_url in config.repos.items(): + server, owner, repo, _ = _parse_repo_url(repo_url) + repo_dir = REPO_DIR / server / owner / repo + for path in repo_dir.glob("bentoml/bentos/*/*"): + if path.is_dir(): + model_map[path.parent.name][path.name] = ModelInfo( + repo=repo_name, + path=str(path), + ) + elif path.is_file(): + with open(path) as f: + origin_name = f.read().strip() + origin_path = path.parent / origin_name + model_map[path.parent.name][path.name] = ModelInfo( + repo=repo_name, + path=str(origin_path), + ) + return model_map + + +GIT_REPO_RE = re.compile( + r"git\+https://(?P.+)/(?P.+)/(?P.+?)(@(?P.+))?$" +) + + +@repo_app.command(name="list") +def repo_list(): + config = _load_config() + pyaml.pprint(config.repos) + + +def _parse_repo_url(repo_url): + """ + parse the git repo url to server, owner, repo name, branch + >>> _parse_repo_url("git+https://github.com/bojiang/bentovllm@main") + ('github.com', 'bojiang', 'bentovllm', 'main') + + >>> _parse_repo_url("git+https://github.com/bojiang/bentovllm") + ('github.com', 'bojiang', 'bentovllm', 'main') + """ + match = GIT_REPO_RE.match(repo_url) + if not match: + raise ValueError(f"Invalid git repo url: {repo_url}") + return ( + match.group("server"), + match.group("owner"), + match.group("repo"), + match.group("branch") or "main", + ) + + +@repo_app.command(name="add") +def repo_add(name: str, repo: str): + name = name.lower() + if not name.isidentifier(): + questionary.print( + f"Invalid repo name: {name}, should only contain letters, numbers and underscores", + style=ERROR_STYLE, + ) + return + + config = _load_config() + if name in config.repos: + override = questionary.confirm( + f"Repo {name} already exists({config.repos[name]}), override?" 
+ ).ask() + if not override: + return + + config.repos[name] = repo + _save_config(config) + pyaml.pprint(config.repos) + + +@repo_app.command(name="remove") +def repo_remove(name: str): + config = _load_config() + if name not in config.repos: + questionary.print(f"Repo {name} does not exist", style=ERROR_STYLE) + return + + del config.repos[name] + _save_config(config) + pyaml.pprint(config.repos) + + +def _run_command(cmd, cwd=None): + questionary.print(f"\n$ {' '.join(cmd)}", style="bold") + subprocess.run(cmd, cwd=cwd, check=True) + + +@repo_app.command(name="update") +def repo_update(): + config = _load_config() + repos_in_use = set() + for name, repo in config.repos.items(): + server, owner, repo_name, branch = _parse_repo_url(repo) + repos_in_use.add((server, owner, repo_name)) + repo_dir = REPO_DIR / server / owner / repo_name + if not repo_dir.exists(): + repo_dir.parent.mkdir(parents=True, exist_ok=True) + try: + cmd = [ + "git", + "clone", + "--branch", + branch, + f"https://{server}/{owner}/{repo_name}.git", + str(repo_dir), + ] + _run_command(cmd) + except subprocess.CalledProcessError: + shutil.rmtree(repo_dir, ignore_errors=True) + questionary.print(f"Failed to clone repo {name}", style=ERROR_STYLE) + else: + try: + cmd = ["git", "fetch", "origin", branch] + _run_command(cmd, cwd=repo_dir) + cmd = ["git", "reset", "--hard", f"origin/{branch}"] + _run_command(cmd, cwd=repo_dir) + except: + shutil.rmtree(repo_dir, ignore_errors=True) + questionary.print(f"Failed to update repo {name}", style=ERROR_STYLE) + for repo_dir in REPO_DIR.glob("*/*/*"): + if tuple(repo_dir.parts[-3:]) not in repos_in_use: + shutil.rmtree(repo_dir, ignore_errors=True) + questionary.print(f"Removed unused repo {repo_dir}") + questionary.print("Repos updated", style=SUCCESS_STYLE) + + +@model_app.command(name="list") +def model_list(): + pyaml.pprint(_load_model_map()) + + +def _get_bento_info(tag): + model_map = _load_model_map() + bento, version = tag.split(":") + if bento not in model_map or version not in model_map[bento]: + questionary.print(f"Model {tag} not found", style=ERROR_STYLE) + return + model_info = model_map[bento][version] + repo_name = model_info["repo"] + path = pathlib.Path(model_info["path"]) + + bento_file = path / "bento.yaml" + bento_info = yaml.safe_load(bento_file.read_text()) + return bento_info + + +@model_app.command(name="get") +def model_get(tag: str): + bento_info = _get_bento_info(tag) + if bento_info: + pyaml.pprint(bento_info) + + +def _filter_instance_types( + instance_types, + gpu_count, + gpu_memory=None, + gpu_type=None, + level="match", +): + if gpu_memory is None: + if gpu_type is None: + raise ValueError("Either gpu_memory or gpu_type must be provided") + gpu_memory = GPU_MEMORY[gpu_type] + + def _check_instance(spec): + if gpu_count == 0 or gpu_count is None: + if "GpuInfo" in spec: + return False + else: + return True + else: + gpus = spec.get("GpuInfo", {}).get("Gpus", []) + if len(gpus) == 0: + return False + it_gpu = gpus[0] + it_gpu_mem = it_gpu["MemoryInfo"]["SizeInMiB"] / 1024 + + if it_gpu["Count"] == gpu_count and it_gpu_mem == gpu_memory: + return True + elif it_gpu["Count"] >= gpu_count and it_gpu_mem >= gpu_memory: + if level == "match": + return False + elif level == "usable": + return True + else: + assert False + else: + return False + + def _sort_key(spec): + return ( + spec["InstanceType"].split(".")[0], + spec.get("GpuInfo", {}).get("Gpus", [{}])[0].get("Count", 0), + spec.get("VCpuInfo", {}).get("DefaultVCpus", 0), + spec.get("MemoryInfo", 
{}).get("SizeInMiB", 0), + ) + + return sorted(filter(_check_instance, instance_types), key=_sort_key) + + +def _resolve_git_package(package): + match = REG_GITPACKAGE.match(package) + if not match: + raise ValueError(f"Invalid git package: {package}") + repo_url, branch, subdirectory = match.groups() + parsed = urlparse(repo_url) + + path_parts = [parsed.netloc] + parsed.path.split("/") + + return repo_url, branch, subdirectory, path_parts + + +def _get_it_card(spec): + """ + InstanceType: g4dn.2xlarge + VCpuInfo: + DefaultCores: 32 + DefaultThreadsPerCore: 2 + DefaultVCpus: 64 + + MemoryInfo: + SizeInMiB: 32768 + + GpuInfo: + Gpus: + - Count: 1 + Manufacturer: NVIDIA + MemoryInfo: + SizeInMiB: 16384 + Name: T4 + TotalGpuMemoryInMiB: 16384 + """ + return f"cpus: {spec['VCpuInfo']['DefaultVCpus']}, mem: {spec['MemoryInfo']['SizeInMiB']}, gpu: {spec['GpuInfo']['Gpus'][0]['Name']} x {spec['GpuInfo']['Gpus'][0]['Count']}, cost: $0.1/hour" + + +def _ensure_aws_security_group(group_name="cllama-http-default"): + try: + existing_groups = subprocess.check_output( + [ + "aws", + "ec2", + "describe-security-groups", + "--filters", + f"Name=group-name,Values={group_name}", + "--no-cli-pager", + ] + ) + existing_groups = json.loads(existing_groups) + if existing_groups["SecurityGroups"]: + return existing_groups["SecurityGroups"][0]["GroupId"] + + result = subprocess.check_output( + [ + "aws", + "ec2", + "create-security-group", + "--group-name", + group_name, + "--description", + "Default VPC security group for cllama services", + "--no-cli-pager", + ] + ) + result = json.loads(result) + security_group_id = result["GroupId"] + + subprocess.check_call( + [ + "aws", + "ec2", + "authorize-security-group-ingress", + "--group-id", + security_group_id, + "--protocol", + "tcp", + "--port", + "80", + "--cidr", + "0.0.0.0/0", + "--no-cli-pager", + ] + ) + subprocess.check_call( + [ + "aws", + "ec2", + "authorize-security-group-ingress", + "--group-id", + security_group_id, + "--protocol", + "tcp", + "--port", + "443", + "--cidr", + "0.0.0.0/0", + "--no-cli-pager", + ] + ) + subprocess.check_call( + [ + "aws", + "ec2", + "authorize-security-group-ingress", + "--group-id", + security_group_id, + "--protocol", + "tcp", + "--port", + "22", + "--cidr", + "0.0.0.0/0", + "--no-cli-pager", + ] + ) + return security_group_id + except subprocess.CalledProcessError as e: + raise RuntimeError(f"Failed to create security group: {e}") + + +@app.command() +def serve(model: str, tag: str = "latest", force_rebuild: bool = False): + if ":" in model: + model, tag = model.split(":") + if tag == "latest": + tag = next(iter(MODEL_INFOS[model].keys())) + + package = MODEL_INFOS[model][tag] + repo, branch, subdirectory, path_parts = _resolve_git_package(package) + repo_dir = REPO_DIR.joinpath(*path_parts) + bento_project_dir = repo_dir / subdirectory + + if force_rebuild: + shutil.rmtree(repo_dir, ignore_errors=True) + + if not repo_dir.exists(): + repo_dir.parent.mkdir(parents=True, exist_ok=True) + try: + cmd = ["git", "clone", "--branch", branch, repo, str(repo_dir)] + print(f"\n$ {' '.join(cmd)}") + subprocess.run(cmd, check=True) + except: + shutil.rmtree(repo_dir, ignore_errors=True) + raise + + bento_info = _get_bento_info(f"{model}:{tag}", bento_project_dir) + + if len(bento_info["services"]) != 1: + raise ValueError("Only support one service currently") + + envs = {} + if len(bento_info.get("envs", [])) > 0: + for env in bento_info["envs"]: + if env["name"] == "CLLAMA_MODEL": + envs[env["name"]] = f"{model}:{tag}" + 
continue + if env["name"] in os.environ: + value = os.environ.get(env["name"]) + questionary.print(f"Using environment value for {env['name']}") + elif env.get("value"): + value = questionary.text( + f"Enter value for {env['name']}", + default=env["value"], + ).ask() + else: + value = questionary.text( + f"Enter value for {env['name']}", + ).ask() + envs[env["name"]] = value + + cloud_provider = questionary.select( + "Select a cloud provider", + choices=[ + questionary.Choice(title="Local", value="aws"), + questionary.Choice(title="BentoCloud", value="cloud"), + ], + ).ask() + + if cloud_provider == "cloud": + cloud_provider = questionary.select( + "You haven't logged in to BentoCloud, select an action", + choices=[ + questionary.Choice(title="Login with Token", value="login"), + questionary.Choice(title="Sign up ($10 free credit)", value="signup"), + ], + ).ask() + if cloud_provider == "login": + token = questionary.text("Enter your token").ask() + cmd = ["bentoml", "cloud", "login", "--token", token] + # print(f"\n$ {' '.join(cmd)}") + try: + subprocess.check_call(cmd) + except subprocess.CalledProcessError: + raise RuntimeError("Failed to login") + elif cloud_provider == "signup": + token = questionary.text( + "Open https://cloud.bentoml.org/signup in your browser", + ).ask() + # cmd = ["bentoml", "cloud", "signup"] + # print(f"\n$ {' '.join(cmd)}") + # try: + # subprocess.check_call(cmd) + # except subprocess.CalledProcessError: + # raise RuntimeError("Failed to sign up") + + elif cloud_provider == "aws": + try: + cmd = ["aws", "ec2", "describe-instance-types", "--no-cli-pager"] + print(f"\n$ {' '.join(cmd)}") + _instance_types = subprocess.check_output(cmd, text=True) + except subprocess.CalledProcessError: + raise + # print(e) + # _cli_install_aws() + available_it_infos = json.loads(_instance_types)["InstanceTypes"] + # pyaml.p(available_it_infos) + + service = bento_info["services"][0] + if "config" not in service or "resources" not in service["config"]: + raise ValueError("Service config is missing") + elif "gpu" in service["config"]["resources"]: + gpu_count = service["config"]["resources"]["gpu"] + gpu_type = service["config"]["resources"].get("gpu_type") + gpu_memory = service["config"]["resources"].get("gpu_memory") + supported_its = _filter_instance_types( + available_it_infos, + gpu_count, + gpu_memory, + gpu_type, + ) + it = questionary.select( + "Select an instance type", + choices=[ + questionary.Choice( + title=_get_it_card(it_info), + value=it_info["InstanceType"], + ) + for it_info in supported_its + ], + ).ask() + security_group_id = _ensure_aws_security_group() + AMI = "ami-02623cf022763d4a1" + + init_script_file = TEMP_DIR / f"init_script_{str(uuid.uuid4())[:8]}.sh" + with open(init_script_file, "w") as f: + f.write( + INIT_SCRIPT_TEMPLATE.format( + repo=repo, + subdirectory=subdirectory, + model=model, + tag=tag, + env_args=" ".join([f"-e {k}={v}" for k, v in envs.items()]), + ) + ) + # grant permission + os.chmod(init_script_file, 0o755) + cmd = [ + "aws", + "ec2", + "run-instances", + "--image-id", + AMI, + "--instance-type", + it, + "--security-group-ids", + security_group_id, + "--user-data", + f"file://{init_script_file}", + "--key-name", + "jiang", + "--count", + "1", + "--no-cli-pager", + ] + # print(f"\n$ {' '.join(cmd)}") + try: + result = subprocess.check_output(cmd) + except subprocess.CalledProcessError: + raise RuntimeError("Failed to create instance") + result = json.loads(result) + instance_id = result["Instances"][0]["InstanceId"] + 
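+        # run-instances returns as soon as EC2 accepts the request; the user-data
+        # script still has to clone the repo, containerize the bento, and start
+        # the container, so readiness is polled over HTTP further below.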
print(f"Deployment {instance_id} is created") + + cmd = [ + "aws", + "ec2", + "describe-instances", + "--instance-ids", + instance_id, + "--no-cli-pager", + ] + # print(f"\n$ {' '.join(cmd)}") + result = subprocess.check_output(cmd) + result = json.loads(result) + public_ip = result["Reservations"][0]["Instances"][0]["PublicIpAddress"] + print(f"Public IP: {public_ip}") + + server_start_time = time.time() + print("Server is starting...") + with prompt_toolkit.shortcuts.ProgressBar() as pb: + for _ in pb(range(100)): + start_time = time.time() + try: + with bentoml.SyncHTTPClient(f"http://{public_ip}"): + break + except Exception: + time.sleep(max(0, 6 - (time.time() - start_time))) + else: + raise RuntimeError("Instance is not ready after 10 minutes") + print(f"Server started in {time.time() - server_start_time:.2f} seconds") + print(f"HTTP server is ready at http://{public_ip}") + return + else: + raise ValueError("GPU is required for now") + if cloud_provider == "bentocloud": + cmd = ["bentoml", "cloud", "current-context"] + # print(f"\n$ {' '.join(cmd)}") + try: + output = subprocess.check_output(cmd, text=True) + except subprocess.CalledProcessError: + raise RuntimeError( + "Failed to get bentocloud login context, please login first", + ) + + +@app.command() +def run(model: str, tag: str = "latest", force_rebuild: bool = False): + serve(model, tag, force_rebuild) + + +INIT_SCRIPT_TEMPLATE = """#!/bin/bash +pip3 install bentoml +rm -r /usr/local/cuda* +git clone {repo} /root/bento_repo +export BENTOML_HOME=/root/bento_repo/{subdirectory} +bentoml containerize {model}:{tag} --image-tag {model}:{tag} +docker run --restart always --gpus all -d -p 80:3000 {env_args} {model}:{tag} + +nvidia-smi -q | grep -A2 "ECC Mode" | grep "Current" | grep "Enabled" +ECC_ENABLED=$? + +if [[ $ECC_ENABLED -eq 0 ]]; then + echo "ECC is enabled. Disabling now..." + nvidia-smi -e 0 + reboot +else + echo "ECC is not enabled. No changes made." +fi +""" + + +if __name__ == "__main__": + app() diff --git a/cllama/spec.py b/cllama/spec.py new file mode 100644 index 00000000..006041fc --- /dev/null +++ b/cllama/spec.py @@ -0,0 +1,11 @@ +GPU_MEMORY = { + "nvidia-tesla-t4": 16, + "nvidia-tesla-v100": 16, + "nvidia-tesla-p100": 16, + "nvidia-tesla-p4": 8, + "nvidia-tesla-k80": 12, + "nvidia-tesla-a100": 40, + "nvidia-tesla-a100-80gb": 80, + "nvidia-tesla-a10g": 24, + "nvidia-l4": 24, +} diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..75d7a40c --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,22 @@ +[build-system] +requires = ["setuptools>=42", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "cllama" +version = "0.0.1" +description = "A description of your package." +authors = [{name = "oasiszero", email = "oasis0.com@gmail.com"}] +license = {file = "LICENSE"} +dependencies = [ + "typer", + "bentoml", + "pyaml", + "fastapi", + "questionary", + "psutil", + "pathlib" +] + +[tool.typer] +src-dir = "cllama" diff --git a/req.txt b/req.txt new file mode 100644 index 00000000..4f08fb7b --- /dev/null +++ b/req.txt @@ -0,0 +1,7 @@ +typer +bentoml +pyaml +fastapi +questionary +psutil +pathlib