Merge openllm-next as openllm 0.6

This commit is contained in:
bojiang
2024-07-09 14:21:52 +08:00
17 changed files with 2482 additions and 0 deletions

5
.gitattributes vendored Normal file
View File

@@ -0,0 +1,5 @@
**/_next/ linguist-generated=true
* text=auto eol=lf
# Needed for setuptools-scm-git-archive
.git_archival.txt export-subst

163
.gitignore vendored Normal file
View File

@@ -0,0 +1,163 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
*.whl
# Environments
venv/

107
DEVELOPMENT.md Normal file
View File

@@ -0,0 +1,107 @@
# Developer Guide
This Developer Guide is designed to help you contribute to the OpenLLM project.
Follow these steps to set up your development environment and learn the process
of contributing to our open-source project.
Join our [Discord Channel](https://l.bentoml.com/join-openllm-discord) and reach
out to us if you have any question!
## Table of Contents
- [Developer Guide](#developer-guide)
- [Table of Contents](#table-of-contents)
- [Setting Up Your Development Environment](#setting-up-your-development-environment)
- [Development Workflow](#development-workflow)
- [Adding new models](#adding-new-models)
- [Adding bentos](#adding-bentos)
- [Adding repos](#adding-repos)
## Setting Up Your Development Environment
Before you can start developing, you'll need to set up your environment:
1. Ensure you have [Git](https://git-scm.com/), and
[Python 3.9+](https://www.python.org/downloads/) installed.
2. Fork the OpenLLM repository from GitHub.
3. Clone the forked repository from GitHub:
```bash
git clone git@github.com:username/OpenLLM.git && cd openllm
```
4. Add the OpenLLM upstream remote to your local OpenLLM clone:
```bash
git remote add upstream git@github.com:bentoml/OpenLLM.git
```
5. Configure git to pull from the upstream remote:
```bash
git switch main # ensure you're on the main branch
git fetch upstream --tags
git branch --set-upstream-to=upstream/main
```
## Development Workflow
There are a few ways to contribute to the repository structure for OpenLLM:
### Adding new models
1. [recipe.yaml](./recipe.yaml) contains all related-metadata for generating new LLM-based bentos. To add a new LLM, the following structure should be adhered to:
```yaml
"<model_name>:<model_tag>":
project: vllm-chat
service_config:
name: phi3
traffic:
timeout: 300
resources:
gpu: 1
gpu_type: nvidia-tesla-l4
engine_config:
model: microsoft/Phi-3-mini-4k-instruct
max_model_len: 4096
dtype: half
chat_template: phi-3
```
- `<model_name>` represents the type of model to be supported. Currently supports `phi3`, `llama2`, `llama3`, `gemma`
- `<model_tag>` emphasizes the type of model and its related metadata. The convention would include `<model_size>-<model_type>-<precision>[-<quantization>]`
For example:
- `microsoft/Phi-3-mini-4k-instruct` should be represented as `3.8b-instruct-fp16`.
- `TheBloke/Llama-2-7B-Chat-AWQ` would be `7b-chat-awq-4bit`
- `project` would be used as the basis for the generated bento. Currently, most models should use `vllm-chat` as default.
- `service_config` entails all BentoML-related [configuration](https://docs.bentoml.com/en/latest/guides/configurations.html) to run this bento.
> [!NOTE]
>
> We recommend to include the following field for `service_config`:
>
> - `name` should be the same as `<model_name>`
> - `resources` includes the available accelerators that can run this model. See more [here](https://docs.bentoml.com/en/latest/guides/configurations.html#resources)
- `engine_config` are fields to be used for vLLM engine. See more supported arguments in [`AsyncEngineArgs`](https://github.com/vllm-project/vllm/blob/7cd2ebb0251fd1fd0eec5c93dac674603a22eddd/vllm/engine/arg_utils.py#L799). We recommend to always include `model`, `max_model_len`, `dtype` and `trust_remote_code`.
- If the model is a chat model, `chat_template` should be used. Add the appropriate `chat_template` under [chat_template directory](./vllm-chat/chat_templates/) should you decide to do so.
2. You can then run `BENTOML_HOME=$(openllm repo default)/bentoml/bentos python make.py <model_name>:<model_tag>` to generate the required bentos.
3. You can then submit a [Pull request](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request) to `openllm` with the recipe changes
### Adding bentos
OpenLLM now also manages a [generated bento repository](https://github.com/bentoml/openllm-models/tree/main). If you update or modify any generated bentos, make sure to update the recipe and add the generated bentos under `bentoml/bentos`.
### Adding repos
If you wish to create your own managed git repo, you should follow the structure of [bentoml/openllm-models](https://github.com/bentoml/openllm-models/tree/main).
To add your custom repo, do `openllm repo add <repo_alias> <git_url>`

201
LICENSE Normal file
View File

@@ -0,0 +1,201 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

22
README.md Normal file
View File

@@ -0,0 +1,22 @@
```
pip install .
openllm serve
# or openllm run
```
Use these commands to find out which LLM models are already at your disposal.
License
-------
This project is licensed under the Apache License 2.0 - see the LICENSE file for details.
Acknowledgements
----------------
This project makes use of the following open-source projects:
* [bentoml/bentoml](https://github.com/bentoml/bentoml) for production level model serving
* [blrchen/chatgpt-lite](https://github.com/blrchen/chatgpt-lite) for a fancy Web Chat UI
* [chujiezheng/chat_templates](https://github.com/chujiezheng/chat_templates)
We are grateful to the developers and contributors of these projects for their hard work and dedication.

0
openllm_next/__init__.py Normal file
View File

338
openllm_next/__main__.py Normal file
View File

@@ -0,0 +1,338 @@
import os
import random
import sys
from collections import defaultdict
from typing import Annotated, Optional
import questionary
import typer
from openllm_next.accelerator_spec import (
DeploymentTarget,
can_run,
get_local_machine_spec,
)
from openllm_next.analytic import DO_NOT_TRACK, OpenLLMTyper
from openllm_next.clean import app as clean_app
from openllm_next.cloud import deploy as cloud_deploy
from openllm_next.cloud import ensure_cloud_context, get_cloud_machine_spec
from openllm_next.common import CHECKED, INTERACTIVE, VERBOSE_LEVEL, output
from openllm_next.local import run as local_run
from openllm_next.local import serve as local_serve
from openllm_next.model import app as model_app
from openllm_next.model import ensure_bento, list_bento
from openllm_next.repo import app as repo_app
# Root CLI application; sub-command groups are attached below so the CLI
# exposes `openllm repo ...`, `openllm model ...` and `openllm clean ...`.
app = OpenLLMTyper(
    help="`openllm hello` to get started. "
    "OpenLLM is a CLI tool to manage and deploy open source LLMs and"
    " get an OpenAI API compatible chat server in seconds.",
)
app.add_typer(repo_app, name="repo")
app.add_typer(model_app, name="model")
app.add_typer(clean_app, name="clean")
def _select_bento_name(models, target):
    """Interactively pick a (model name, repo) pair from *models*.

    Renders one row per distinct (repo, name) pair, with a check mark when
    at least one version of that model scores as runnable on *target*.
    Raises typer.Exit(1) when nothing is available or the user cancels.
    """
    from tabulate import tabulate

    options = []
    model_infos = [
        [model.repo.name, model.name, can_run(model, target)] for model in models
    ]
    # Aggregate runnability scores over all versions of the same model.
    model_name_groups = defaultdict(int)
    for repo, name, score in model_infos:
        model_name_groups[(repo, name)] += score
    table_data = [
        [name, repo, CHECKED if score > 0 else ""]
        for (repo, name), score in model_name_groups.items()
    ]
    if not table_data:
        output("No model found", style="red")
        raise typer.Exit(1)
    table = tabulate(
        table_data,
        headers=["model", "repo", "locally runnable"],
    ).split("\n")
    # The tabulate header becomes a non-selectable separator row.
    headers = f"{table[0]}\n {table[1]}"
    options.append(questionary.Separator(headers))
    # Fixed: loop variable no longer shadows `table_data` while zipping it.
    for row, table_line in zip(table_data, table[2:]):
        # Choice value is the (name, repo) pair, matching the caller's unpack.
        options.append(questionary.Choice(table_line, value=row[:2]))
    selected = questionary.select("Select a model", options).ask()
    if selected is None:
        raise typer.Exit(1)
    return selected
def _select_bento_version(models, target, bento_name, repo):
    """Interactively pick a concrete version (tag) of *bento_name* in *repo*.

    Returns the selected ``(model, score)`` pair, where ``score`` is the
    can_run result against *target*. Raises typer.Exit(1) when no version
    exists or the user cancels.
    """
    from tabulate import tabulate

    model_infos = [
        [model, can_run(model, target)]
        for model in models
        if model.name == bento_name and model.repo.name == repo
    ]
    # model_infos is already filtered by name/repo; no need to re-check here
    # (the original repeated the same condition redundantly).
    table_data = [
        [model.tag, CHECKED if score > 0 else ""]
        for model, score in model_infos
    ]
    if not table_data:
        output(f"No model found for {bento_name} in {repo}", style="red")
        raise typer.Exit(1)
    table = tabulate(
        table_data,
        headers=["version", "locally runnable"],
    ).split("\n")
    options = []
    options.append(questionary.Separator(f"{table[0]}\n {table[1]}"))
    # Fixed: loop variable renamed — it used to shadow `table_data` even
    # though it actually iterates `model_infos`.
    for info, table_line in zip(model_infos, table[2:]):
        options.append(questionary.Choice(table_line, value=info))
    selected = questionary.select("Select a version", options).ask()
    if selected is None:
        raise typer.Exit(1)
    return selected
def _select_target(bento, targets):
    """Interactively pick a cloud instance type capable of running *bento*."""
    from tabulate import tabulate

    # Best-fitting instance types first (sorts the caller's list in place,
    # matching the original behavior).
    targets.sort(key=lambda x: can_run(bento, x), reverse=True)
    if not targets:
        output(
            "No available instance type, check your bentocloud account",
            style="red",
        )
        raise typer.Exit(1)

    rows = []
    for candidate in targets:
        deployable = CHECKED if can_run(bento, candidate) else "insufficient res."
        rows.append(
            [
                candidate.name,
                candidate.accelerators_repr,
                f"${candidate.price}",
                deployable,
            ]
        )
    rendered = tabulate(
        rows,
        headers=["instance type", "accelerator", "price/hr", "deployable"],
    ).split("\n")

    # Header as a non-selectable separator, then one choice per target.
    options = [questionary.Separator(f"{rendered[0]}\n {rendered[1]}")]
    for candidate, line in zip(targets, rendered[2:]):
        options.append(
            questionary.Choice(
                f"{line}",
                value=candidate,
            )
        )
    selected = questionary.select("Select an instance type", options).ask()
    if selected is None:
        raise typer.Exit(1)
    return selected
def _select_action(bento, score):
    """Ask the user what to do with *bento* and dispatch the chosen action.

    *score* is the local can_run score; when it is not positive, the local
    actions (run/serve) are shown but disabled. Deploy is always available.
    Raises typer.Exit(1) if the prompt is cancelled.
    """
    # Deduplicated: the original built two nearly-identical option lists.
    # questionary treats disabled=None as "enabled", so a conditional
    # `disabled` reproduces both branches exactly.
    local_disabled = None if score > 0 else "insufficient res."
    options = [
        questionary.Separator("Available actions"),
        questionary.Choice(
            "0. Run the model in terminal",
            value="run",
            disabled=local_disabled,
            shortcut_key="0",
        ),
        questionary.Separator(f" $ openllm run {bento}"),
        questionary.Separator(" "),
        questionary.Choice(
            "1. Serve the model locally and get a chat server",
            value="serve",
            disabled=local_disabled,
            shortcut_key="1",
        ),
        questionary.Separator(f" $ openllm serve {bento}"),
        questionary.Separator(" "),
        questionary.Choice(
            "2. Deploy the model to bentocloud and get a scalable chat server",
            value="deploy",
            shortcut_key="2",
        ),
        questionary.Separator(f" $ openllm deploy {bento}"),
    ]
    action = questionary.select("Select an action", options).ask()
    if action is None:
        raise typer.Exit(1)
    if action == "run":
        try:
            local_run(bento)
        finally:
            # Always echo the replayable command, even after Ctrl-C/errors.
            output("\nUse this command to run the action again:", style="green")
            output(f" $ openllm run {bento}", style="orange")
    elif action == "serve":
        try:
            local_serve(bento)
        finally:
            output("\nUse this command to run the action again:", style="green")
            output(f" $ openllm serve {bento}", style="orange")
    elif action == "deploy":
        ensure_cloud_context()
        targets = get_cloud_machine_spec()
        target = _select_target(bento, targets)
        try:
            cloud_deploy(bento, target)
        finally:
            output("\nUse this command to run the action again:", style="green")
            output(
                f" $ openllm deploy {bento} --instance-type {target.name}",
                style="orange",
            )
@app.command(help="get started interactively")
def hello():
INTERACTIVE.set(True)
VERBOSE_LEVEL.set(20)
target = get_local_machine_spec()
output(f" Detected Platform: {target.platform}", style="green")
if target.accelerators:
output(" Detected Accelerators: ", style="green")
for a in target.accelerators:
output(f" - {a.model} {a.memory_size}GB", style="green")
else:
output(" Detected Accelerators: None", style="yellow")
models = list_bento()
if not models:
output(
"No model found, you probably need to update the model repo:",
style="red",
)
output(
" $ openllm repo update",
style="orange",
)
raise typer.Exit(1)
bento_name, repo = _select_bento_name(models, target)
bento, score = _select_bento_version(models, target, bento_name, repo)
_select_action(bento, score)
@app.command(help="start an OpenAI API compatible chat server and chat in browser")
def serve(
model: Annotated[str, typer.Argument()] = "",
repo: Optional[str] = None,
port: int = 3000,
verbose: bool = False,
):
if verbose:
VERBOSE_LEVEL.set(20)
target = get_local_machine_spec()
bento = ensure_bento(model, target=target, repo_name=repo)
local_serve(bento, port=port)
@app.command(help="run the model and chat in terminal")
def run(
model: Annotated[str, typer.Argument()] = "",
repo: Optional[str] = None,
port: Optional[int] = None,
timeout: int = 600,
verbose: bool = False,
):
if verbose:
VERBOSE_LEVEL.set(20)
target = get_local_machine_spec()
bento = ensure_bento(model, target=target, repo_name=repo)
if port is None:
port = random.randint(30000, 40000)
local_run(bento, port=port, timeout=timeout)
@app.command(
    # Fixed help-text typo: "an production-ready" -> "a production-ready".
    help="deploy a production-ready OpenAI API compatible chat server to bentocloud ($100 free credit)",
)
def deploy(
    model: Annotated[str, typer.Argument()] = "",
    instance_type: Optional[str] = None,
    repo: Optional[str] = None,
    verbose: bool = False,
):
    """Deploy *model* to BentoCloud, auto-picking an instance type if none given."""
    if verbose:
        VERBOSE_LEVEL.set(20)
    bento = ensure_bento(model, repo_name=repo)
    # Explicit instance type: deploy straight away, no ranking needed.
    if instance_type is not None:
        cloud_deploy(bento, DeploymentTarget(name=instance_type))
        return
    # Keep only targets that can run the bento, best fit first.
    runnable = [t for t in get_cloud_machine_spec() if can_run(bento, t) > 0]
    runnable.sort(key=lambda t: can_run(bento, t), reverse=True)
    if not runnable:
        output(
            "No available instance type, check your bentocloud account",
            style="red",
        )
        raise typer.Exit(1)
    best = runnable[0]
    output(
        f"Recommended instance type: {best.name}",
        style="green",
    )
    cloud_deploy(bento, best)
@app.callback(invoke_without_command=True)
def typer_callback(
    verbose: int = 0,
    do_not_track: bool = typer.Option(
        False,
        "--do-not-track",
        help="Whether to disable usage tracking",
        envvar=DO_NOT_TRACK,
    ),
):
    """Global options applied before any sub-command runs."""
    # Opting out of tracking is signalled to downstream code via the env var.
    if do_not_track:
        os.environ[DO_NOT_TRACK] = str(True)
    if verbose:
        VERBOSE_LEVEL.set(verbose)
def main():
    """CLI entry point: enforce the minimum Python version, then run the app."""
    if sys.version_info < (3, 9):
        # Fixed: the message said "3.8 or higher" while the check rejects 3.8.
        output("Python 3.9 or higher is required", style="red")
        sys.exit(1)
    app()


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,166 @@
from __future__ import annotations
import functools
import math
import typing
from types import SimpleNamespace
import psutil
from openllm_next.common import BentoInfo, DeploymentTarget, output
class Accelerator(SimpleNamespace):
    """A single GPU/accelerator device: marketing model name plus VRAM in GB."""

    model: str
    memory_size: float

    def __gt__(self, other):
        # Ordering is by memory capacity only; the model name is ignored.
        return self.memory_size > other.memory_size

    def __lt__(self, other):
        # Added so sorted()/min() work directly, symmetric with __gt__.
        return self.memory_size < other.memory_size

    def __eq__(self, other):
        return self.memory_size == other.memory_size

    # Fixed: defining __eq__ implicitly set __hash__ to None, making
    # instances unhashable (unusable in sets / as lru_cache arguments).
    # Hash on the same field __eq__ compares, preserving the hash contract.
    def __hash__(self):
        return hash(self.memory_size)

    def __repr__(self):
        return f"{self.model}({self.memory_size}GB)"
class Resource(SimpleNamespace):
    """Resource requirements parsed from a bento's services[0].config.resources."""

    cpu: int = 0
    # Fixed: `memory` was only annotated with no class-level default, so
    # hashing a Resource built without an explicit memory value raised
    # AttributeError inside __hash__.
    memory: float = 0.0
    gpu: int = 0
    gpu_type: str = ""

    def __hash__(self):
        # Hashable so Resource instances can be passed to lru_cached helpers.
        return hash((self.cpu, self.memory, self.gpu, self.gpu_type))

    def __bool__(self):
        # Truthy only when at least one field was explicitly provided at
        # construction time — class-level defaults never enter __dict__.
        return any(value is not None for value in self.__dict__.values())
# Known accelerator SKUs keyed by the `gpu_type` strings used in bento
# recipes / instance specs; memory_size is VRAM in GB.
# NOTE(review): several keys look like aliases for the same card (e.g.
# nvidia-l4 / nvidia-tesla-l4, nvidia-a100-80g / nvidia-a100-80gb) —
# confirm both spellings actually occur upstream.
ACCELERATOR_SPEC_DICT: dict[str, dict] = {
    "nvidia-gtx-1650": {"model": "GTX 1650", "memory_size": 4.0},
    "nvidia-gtx-1060": {"model": "GTX 1060", "memory_size": 6.0},
    "nvidia-gtx-1080-ti": {"model": "GTX 1080 Ti", "memory_size": 11.0},
    "nvidia-rtx-3060": {"model": "RTX 3060", "memory_size": 12.0},
    "nvidia-rtx-3060-ti": {"model": "RTX 3060 Ti", "memory_size": 8.0},
    "nvidia-rtx-3070-ti": {"model": "RTX 3070 Ti", "memory_size": 8.0},
    "nvidia-rtx-3080": {"model": "RTX 3080", "memory_size": 10.0},
    "nvidia-rtx-3080-ti": {"model": "RTX 3080 Ti", "memory_size": 12.0},
    "nvidia-rtx-3090": {"model": "RTX 3090", "memory_size": 24.0},
    "nvidia-rtx-4070-ti": {"model": "RTX 4070 Ti", "memory_size": 12.0},
    "nvidia-tesla-p4": {"model": "P4", "memory_size": 8.0},
    "nvidia-tesla-p100": {"model": "P100", "memory_size": 16.0},
    "nvidia-tesla-k80": {"model": "K80", "memory_size": 12.0},
    "nvidia-tesla-t4": {"model": "T4", "memory_size": 16.0},
    "nvidia-tesla-v100": {"model": "V100", "memory_size": 16.0},
    "nvidia-l4": {"model": "L4", "memory_size": 24.0},
    "nvidia-tesla-l4": {"model": "L4", "memory_size": 24.0},
    "nvidia-tesla-a10g": {"model": "A10G", "memory_size": 24.0},
    "nvidia-a100-80g": {"model": "A100", "memory_size": 80.0},
    "nvidia-a100-80gb": {"model": "A100", "memory_size": 80.0},
    "nvidia-tesla-a100": {"model": "A100", "memory_size": 40.0},
}
# Same table materialized as Accelerator objects for memory comparisons.
ACCELERATOR_SPECS: dict[str, Accelerator] = {
    key: Accelerator(**value) for key, value in ACCELERATOR_SPEC_DICT.items()
}
@functools.lru_cache
def get_local_machine_spec():
    """Probe the local machine and return a DeploymentTarget describing it.

    macOS is returned immediately with no accelerators. On Linux/Windows,
    NVIDIA GPUs are enumerated via NVML; if the driver/NVML is unavailable
    the target is returned with an empty accelerator list instead of raising.
    Cached: the hardware does not change within one process lifetime.
    """
    if psutil.MACOS:
        return DeploymentTarget(accelerators=[], source="local", platform="macos")
    if psutil.WINDOWS:
        platform = "windows"
    elif psutil.LINUX:
        platform = "linux"
    else:
        # Fixed: dropped the useless f-prefix (no placeholders).
        raise NotImplementedError("Unsupported platform")

    # Imported lazily: pynvml is only relevant on platforms that may have GPUs.
    from pynvml import (
        nvmlDeviceGetCount,
        nvmlDeviceGetCudaComputeCapability,
        nvmlDeviceGetHandleByIndex,
        nvmlDeviceGetMemoryInfo,
        nvmlDeviceGetName,
        nvmlInit,
        nvmlShutdown,
    )

    try:
        nvmlInit()
        try:
            device_count = nvmlDeviceGetCount()
            accelerators: list[Accelerator] = []
            for i in range(device_count):
                handle = nvmlDeviceGetHandleByIndex(i)
                name = nvmlDeviceGetName(handle)
                memory_info = nvmlDeviceGetMemoryInfo(handle)
                accelerators.append(
                    Accelerator(
                        model=name, memory_size=math.ceil(int(memory_info.total) / 1024**3)
                    )
                )
                compute_capability = nvmlDeviceGetCudaComputeCapability(handle)
                if compute_capability < (7, 5):
                    output(
                        f"GPU {name} with compute capability {compute_capability} "
                        "may not be supported, 7.5 or higher is recommended. check "
                        "https://developer.nvidia.com/cuda-gpus for more information",
                        style="yellow",
                    )
        finally:
            # Fixed: NVML was left initialized when a query above raised;
            # always release it once nvmlInit has succeeded.
            nvmlShutdown()
        return DeploymentTarget(
            accelerators=accelerators,
            source="local",
            platform=platform,
        )
    except Exception as e:
        output(
            "Failed to get local GPU info. Ensure nvidia driver is installed to enable local GPU deployment",
            style="yellow",
        )
        output(f"Error: {e}", style="red", level=20)
        return DeploymentTarget(accelerators=[], source="local", platform=platform)
@functools.lru_cache()
def can_run(
    bento: typing.Union[Resource, BentoInfo],
    target: typing.Optional[DeploymentTarget] = None,
) -> float:
    """Score how well *bento* fits on *target*; 0.0 means "cannot run".

    Returned values, used for ranking targets:
    - 0.0  : platform mismatch, or not enough GPUs of the required size
    - 0.5  : bento declares no resource requirements (weakly assumed runnable)
    - >0.0 : GPU bentos score by the fraction of the target's total GPU
             memory they would occupy (tighter fit scores higher)

    NOTE(review): results are lru_cached, so both arguments must be hashable
    and effectively immutable — confirm BentoInfo/DeploymentTarget satisfy this.
    """
    if target is None:
        target = get_local_machine_spec()
    # NOTE(review): assumes the first service in bento.yaml carries the
    # resource config — confirm for multi-service bentos.
    resource_spec = Resource(**(bento.bento_yaml["services"][0]["config"].get("resources", {})))
    labels = bento.bento_yaml.get("labels", {})
    # Comma-separated platform restriction label; defaults to linux-only.
    platforms = labels.get("platforms", "linux").split(",")
    if target.platform not in platforms:
        return 0.0
    # No resource requirements declared: weakly assume it can run (0.5).
    # (The previous comment claimed 1.0, which did not match the code.)
    if not resource_spec:
        return 0.5
    if resource_spec.gpu > 0:
        # NOTE(review): an unrecognized gpu_type raises KeyError here —
        # verify recipes only use keys present in ACCELERATOR_SPECS.
        required_gpu = ACCELERATOR_SPECS[resource_spec.gpu_type]
        # Only accelerators with at least the required memory count.
        filtered_accelerators = [
            ac
            for ac in target.accelerators
            if ac.memory_size >= required_gpu.memory_size
        ]
        if resource_spec.gpu > len(filtered_accelerators):
            return 0.0
        # Fraction of the target's total GPU memory the bento would use.
        return (
            required_gpu.memory_size
            * resource_spec.gpu
            / sum(ac.memory_size for ac in target.accelerators)
        )
    if target.accelerators:
        # CPU-only bento on a GPU machine: runnable but a poor use of it.
        return 0.01 / sum(ac.memory_size for ac in target.accelerators)
    return 1.0

118
openllm_next/analytic.py Normal file
View File

@@ -0,0 +1,118 @@
from __future__ import annotations
import functools
import os
import re
import time
import typing
from abc import ABC
import attr
import click
import typer
import typer.core
DO_NOT_TRACK = "BENTOML_DO_NOT_TRACK"
class EventMeta(ABC):
    """Base class for analytics events; derives the event name from the class name."""

    @property
    def event_name(self):
        """CamelCase class name -> snake_case, with a trailing "_event" stripped."""
        snake = re.sub(r"(?<!^)(?=[A-Z])", "_", type(self).__name__).lower()
        suffix = "_event"
        return snake[: -len(suffix)] if snake.endswith(suffix) else snake
@attr.define
class CliEvent(EventMeta):
    """Telemetry payload describing one CLI command invocation."""

    cmd_group: str  # command group, e.g. "openllm" or a sub-app like "model"
    cmd_name: str  # the sub-command that was invoked
    duration_in_ms: float = attr.field(default=0)
    error_type: typing.Optional[str] = attr.field(default=None)  # exception class name, if any
    return_code: typing.Optional[int] = attr.field(default=None)  # 2 for Ctrl-C, 1 for other failures
@attr.define
class OpenllmCliEvent(CliEvent):
    """CliEvent whose derived event name is "openllm_cli"."""

    pass
class OrderedCommands(typer.core.TyperGroup):
    """Typer group that lists commands in declaration order instead of alphabetically."""

    def list_commands(self, _: click.Context) -> typing.Iterable[str]:
        # self.commands preserves registration order; return its keys as-is
        return list(self.commands)
class OpenLLMTyper(typer.Typer):
    """
    typer.Typer subclass with OpenLLM defaults:

    * commands are listed in declaration order (OrderedCommands)
    * ``-h`` works as an alias of ``--help``; output wraps at ``$COLUMNS``
      (default 120)
    * every command is wrapped to emit an anonymous usage-tracking event,
      unless the ``BENTOML_DO_NOT_TRACK`` env var is set to "true"
    """

    def __init__(self, *args: typing.Any, **kwargs: typing.Any):
        no_args_is_help = kwargs.pop("no_args_is_help", True)
        context_settings = kwargs.pop("context_settings", {})
        if "help_option_names" not in context_settings:
            context_settings["help_option_names"] = ("-h", "--help")
        if "max_content_width" not in context_settings:
            context_settings["max_content_width"] = int(
                os.environ.get("COLUMNS", str(120))
            )
        klass = kwargs.pop("cls", OrderedCommands)
        super().__init__(
            *args,
            cls=klass,
            no_args_is_help=no_args_is_help,
            context_settings=context_settings,
            **kwargs,
        )

    def command(self, *args: typing.Any, **kwargs: typing.Any):
        def decorator(f):
            @functools.wraps(f)
            @click.pass_context
            def wrapped(ctx: click.Context, *args, **kwargs):
                from bentoml._internal.utils.analytics import track

                do_not_track = (
                    os.environ.get(DO_NOT_TRACK, str(False)).lower() == "true"
                )
                # so we know that the root program is openllm
                command_name = ctx.info_name
                if ctx.parent.parent is not None:
                    # e.g. `openllm model list` -> group is "model"
                    command_group = ctx.parent.info_name
                elif ctx.parent.info_name == ctx.find_root().info_name:
                    # e.g. `openllm run` -> group is the root program
                    command_group = "openllm"
                else:
                    # BUG FIX: command_group used to be referenced unbound in
                    # this case, raising NameError instead of tracking
                    command_group = ctx.parent.info_name
                if do_not_track:
                    return f(*args, **kwargs)
                start_time = time.time_ns()
                try:
                    return_value = f(*args, **kwargs)
                    duration_in_ns = time.time_ns() - start_time
                    track(
                        OpenllmCliEvent(
                            cmd_group=command_group,
                            cmd_name=command_name,
                            duration_in_ms=duration_in_ns / 1e6,
                        )
                    )
                    return return_value
                except BaseException as e:
                    duration_in_ns = time.time_ns() - start_time
                    track(
                        OpenllmCliEvent(
                            cmd_group=command_group,
                            cmd_name=command_name,
                            duration_in_ms=duration_in_ns / 1e6,
                            error_type=type(e).__name__,
                            return_code=2 if isinstance(e, KeyboardInterrupt) else 1,
                        )
                    )
                    raise

            return typer.Typer.command(self, *args, **kwargs)(wrapped)

        return decorator

75
openllm_next/clean.py Normal file
View File

@@ -0,0 +1,75 @@
import pathlib
import shutil
import questionary
from openllm_next.analytic import OpenLLMTyper
from openllm_next.common import (
CONFIG_FILE,
REPO_DIR,
VENV_DIR,
VERBOSE_LEVEL,
output,
)
app = OpenLLMTyper(help="clean up and release disk space used by OpenLLM")
HUGGINGFACE_CACHE = pathlib.Path.home() / ".cache" / "huggingface" / "hub"
@app.command(help="Clean up all the cached models from huggingface")
def model_cache(verbose: bool = False):
    """Delete ~/.cache/huggingface/hub after an interactive confirmation."""
    if verbose:
        VERBOSE_LEVEL.set(20)
    # total size on disk, shown to the user in the confirmation prompt
    used_space = sum(f.stat().st_size for f in HUGGINGFACE_CACHE.rglob("*"))
    sure = questionary.confirm(
        f"This will remove all models cached by Huggingface (~{used_space / 1024 / 1024:.2f}MB), are you sure?"
    ).ask()
    if not sure:
        return
    shutil.rmtree(HUGGINGFACE_CACHE, ignore_errors=True)
    output("All models cached by Huggingface have been removed", style="green")
@app.command(help="Clean up all the virtual environments created by OpenLLM")
def venvs(verbose: bool = False):
    """Delete every cached virtualenv under VENV_DIR after confirmation."""
    if verbose:
        VERBOSE_LEVEL.set(20)
    # total size on disk, shown to the user in the confirmation prompt
    used_space = sum(f.stat().st_size for f in VENV_DIR.rglob("*"))
    sure = questionary.confirm(
        f"This will remove all virtual environments created by OpenLLM (~{used_space / 1024 / 1024:.2f}MB), are you sure?"
    ).ask()
    if not sure:
        return
    shutil.rmtree(VENV_DIR, ignore_errors=True)
    output("All virtual environments have been removed", style="green")
@app.command(help="Clean up all the repositories cloned by OpenLLM")
def repos(verbose: bool = False):
    """Delete the local clones of all model repos (no confirmation prompt)."""
    if verbose:
        VERBOSE_LEVEL.set(20)
    shutil.rmtree(REPO_DIR, ignore_errors=True)
    output("All repositories have been removed", style="green")
@app.command(help="Reset configurations to default")
def configs(verbose: bool = False):
    """Remove the OpenLLM config file so defaults are used on the next run."""
    if verbose:
        VERBOSE_LEVEL.set(20)
    # BUG FIX: CONFIG_FILE is a regular file, not a directory.
    # shutil.rmtree() raises NotADirectoryError on files (which
    # ignore_errors=True silently swallowed), so the config was never
    # actually deleted. unlink() removes the file properly.
    CONFIG_FILE.unlink(missing_ok=True)
    output("All configurations have been reset", style="green")
@app.command(
    name="all",
    help="Clean up all above and bring OpenLLM to a fresh start",
)
def all_cache(verbose: bool = False):
    """Run every cleanup command in sequence: repos, venvs, model cache, configs."""
    if verbose:
        VERBOSE_LEVEL.set(20)
    repos()
    venvs()
    model_cache()
    configs()

174
openllm_next/cloud.py Normal file
View File

@@ -0,0 +1,174 @@
import json
import os
import pathlib
import shutil
import subprocess
import typing
import typer
from openllm_next.accelerator_spec import ACCELERATOR_SPECS
from openllm_next.analytic import OpenLLMTyper
from openllm_next.common import (
INTERACTIVE,
BentoInfo,
DeploymentTarget,
output,
run_command,
)
app = OpenLLMTyper()
def _get_deploy_cmd(bento: BentoInfo, target: typing.Optional[DeploymentTarget] = None):
    """Build the (cmd, env, cwd) triple for `bentoml deploy` of *bento*.

    Environment variables declared by the bento are resolved from the
    process environment, the bento's declared default, or an interactive
    prompt; a missing required value aborts with exit code 1.
    """
    cmd = ["bentoml", "deploy", bento.bentoml_tag]
    env = {
        "BENTOML_HOME": f"{bento.repo.path}/bentoml",
    }
    required_envs = bento.bento_yaml.get("envs", [])
    required_env_names = [env["name"] for env in required_envs if "name" in env]
    if required_env_names:
        output(
            f"This model requires the following environment variables to run: {repr(required_env_names)}",
            style="yellow",
        )
    for env_info in bento.bento_yaml.get("envs", []):
        if "name" not in env_info:
            continue
        # precedence: process environment > bento-declared default > empty
        if os.environ.get(env_info["name"]):
            default = os.environ[env_info["name"]]
        elif "value" in env_info:
            default = env_info["value"]
        else:
            default = ""
        if INTERACTIVE.get():
            import questionary
            value = questionary.text(
                f"{env_info['name']}:",
                default=default,
            ).ask()
        else:
            if default == "":
                output(
                    f"Environment variable {env_info['name']} is required but not provided",
                    style="red",
                )
                raise typer.Exit(1)
            else:
                value = default
        if value is None:
            # questionary returns None when the prompt is cancelled (Ctrl-C)
            raise typer.Exit(1)
        cmd += ["--env", f"{env_info['name']}={value}"]
    if target:
        cmd += ["--instance-type", target.name]
    # NOTE(review): assumes ensure_cloud_context() already ran and wrote
    # ~/bentoml/.yatai.yaml; copy it into the repo-local BENTOML_HOME
    assert (pathlib.Path.home() / "bentoml" / ".yatai.yaml").exists()
    shutil.copy(
        pathlib.Path.home() / "bentoml" / ".yatai.yaml",
        bento.repo.path / "bentoml" / ".yatai.yaml",
    )
    return cmd, env, None
def ensure_cloud_context():
    """Ensure the bentoml CLI has a logged-in BentoCloud context.

    If already logged in, report the endpoint and return. Otherwise either
    walk the user through `bentoml cloud login` (interactive) or print the
    instructions and exit with code 1 (non-interactive).
    """
    import questionary
    cmd = ["bentoml", "cloud", "current-context"]
    try:
        result = subprocess.check_output(cmd, stderr=subprocess.DEVNULL)
        context = json.loads(result)
        output(f" bentoml already logged in: {context['endpoint']}", style="green")
    except subprocess.CalledProcessError:
        # non-zero exit from the CLI means there is no current context
        output(" bentoml not logged in", style="red")
        if not INTERACTIVE.get():
            output(
                "\n get bentoml logged in by:",
            )
            output(
                " $ bentoml cloud login",
                style="orange",
            )
            output("")
            output(
                """ * you may need to visit https://cloud.bentoml.com to get an account. you can also bring your own bentoml cluster (BYOC) to your team from https://bentoml.com/contact""",
                style="yellow",
            )
            raise typer.Exit(1)
        else:
            action = questionary.select(
                "Choose an action:",
                choices=[
                    "I have a BentoCloud account",
                    "get an account in two minutes",
                ],
            ).ask()
            if action is None:
                # prompt cancelled (Ctrl-C)
                raise typer.Exit(1)
            elif action == "get an account in two minutes":
                output(
                    "Please visit https://cloud.bentoml.com to get your token",
                    style="yellow",
                )
            endpoint = questionary.text(
                "Enter the endpoint: (similar to https://my-org.cloud.bentoml.com)"
            ).ask()
            if endpoint is None:
                raise typer.Exit(1)
            token = questionary.text(
                "Enter your token: (similar to cniluaxxxxxxxx)"
            ).ask()
            if token is None:
                raise typer.Exit(1)
            cmd = [
                "bentoml",
                "cloud",
                "login",
                "--api-token",
                token,
                "--endpoint",
                endpoint,
            ]
            try:
                # check_output raises CalledProcessError on non-zero exit
                result = subprocess.check_output(cmd)
                output(" Logged in successfully", style="green")
            except subprocess.CalledProcessError:
                output(" Failed to login", style="red")
                raise typer.Exit(1)
def get_cloud_machine_spec():
    """List available BentoCloud instance types as DeploymentTargets.

    Returns [] (after printing an error) when the CLI call fails or emits
    invalid JSON.
    """
    ensure_cloud_context()
    cmd = ["bentoml", "deployment", "list-instance-types", "-o", "json"]
    try:
        result = subprocess.check_output(cmd, stderr=subprocess.DEVNULL)
        instance_types = json.loads(result)
        return [
            DeploymentTarget(
                source="cloud",
                name=it["name"],
                price=it["price"],
                platform="linux",
                accelerators=(
                    # one Accelerator entry per GPU on the instance type
                    [ACCELERATOR_SPECS[it["gpu_type"]] for _ in range(int(it["gpu"]))]
                    if it.get("gpu") and it["gpu_type"] in ACCELERATOR_SPECS
                    else []
                ),
            )
            for it in instance_types
        ]
    except (subprocess.CalledProcessError, json.JSONDecodeError):
        output("Failed to get cloud instance types", style="red")
        return []
def deploy(bento: BentoInfo, target: DeploymentTarget):
    """Deploy *bento* to BentoCloud on the given instance type."""
    ensure_cloud_context()
    cmd, env, cwd = _get_deploy_cmd(bento, target)
    run_command(cmd, env=env, cwd=cwd)

422
openllm_next/common.py Normal file
View File

@@ -0,0 +1,422 @@
from __future__ import annotations
import asyncio
import functools
import hashlib
import io
import json
import os
import pathlib
import signal
import subprocess
import sys
import sysconfig
import typing
from contextlib import asynccontextmanager, contextmanager
from types import SimpleNamespace
import typer
import typer.core
ERROR_STYLE = "red"
SUCCESS_STYLE = "green"
CLLAMA_HOME = pathlib.Path.home() / ".openllm_next"
REPO_DIR = CLLAMA_HOME / "repos"
TEMP_DIR = CLLAMA_HOME / "temp"
VENV_DIR = CLLAMA_HOME / "venv"
REPO_DIR.mkdir(exist_ok=True, parents=True)
TEMP_DIR.mkdir(exist_ok=True, parents=True)
VENV_DIR.mkdir(exist_ok=True, parents=True)
CONFIG_FILE = CLLAMA_HOME / "config.json"
CHECKED = ""
T = typing.TypeVar("T")
class ContextVar(typing.Generic[T]):
    """A tiny stack-based context variable (not thread/async aware).

    ``set`` pushes a value permanently, ``patch`` pushes one for the
    duration of a ``with`` block, and ``get`` returns the most recently
    pushed value (or the default when nothing was pushed).
    """

    def __init__(self, default: T):
        self._stack: list[T] = []
        self._default = default

    def get(self) -> T:
        return self._stack[-1] if self._stack else self._default

    def set(self, value):
        self._stack.append(value)

    @contextmanager
    def patch(self, value):
        self._stack.append(value)
        try:
            yield
        finally:
            self._stack.pop()
VERBOSE_LEVEL = ContextVar(10)
INTERACTIVE = ContextVar(False)
FORCE = ContextVar(False)
def output(content, level=0, style=None, end=None):
    """Print *content* unless *level* exceeds the current verbosity.

    Strings are printed as-is (newline terminator by default); any other
    object is pretty-printed as YAML via pyaml (no extra terminator by
    default, since pyaml already ends with a newline).
    """
    import questionary

    if level > VERBOSE_LEVEL.get():
        return
    if isinstance(content, str):
        questionary.print(content, style=style, end="\n" if end is None else end)
    else:
        import pyaml

        buf = io.StringIO()
        pyaml.pprint(
            content,
            dst=buf,
            sort_dicts=False,
            sort_keys=False,
        )
        questionary.print(buf.getvalue(), style=style, end="" if end is None else end)
        buf.close()
class Config(SimpleNamespace):
    """User configuration: the set of model repos and which one is default."""

    # NOTE(review): these are class-level attributes, so a bare Config()
    # shares the same `repos` dict across instances; load_config() passes
    # values through __init__, which creates per-instance attributes instead.
    repos: dict[str, str] = {
        "default": "git+https://github.com/bentoml/openllm-models@main"
    }
    default_repo: str = "default"

    def tolist(self):
        """Serialize to a plain dict for writing to the JSON config file."""
        return dict(
            repos=self.repos,
            default_repo=self.default_repo,
        )
def load_config():
    """Read the Config from CONFIG_FILE.

    Falls back to a default Config when the file is absent or contains
    invalid JSON.
    """
    if not CONFIG_FILE.exists():
        return Config()
    try:
        with open(CONFIG_FILE) as f:
            data = json.load(f)
    except json.JSONDecodeError:
        return Config()
    return Config(**data)
def save_config(config):
    """Persist *config* (a Config) to CONFIG_FILE as pretty-printed JSON."""
    with open(CONFIG_FILE, "w") as f:
        json.dump(config.tolist(), f, indent=2)
class RepoInfo(SimpleNamespace):
    """A configured model repo and where it is cached locally."""

    name: str
    path: pathlib.Path  # local clone under REPO_DIR/<server>/<owner>/<repo>
    url: str  # the original git+https url from the config
    server: str
    owner: str
    repo: str
    branch: str

    def tolist(self):
        """Render for display; detail grows with the current verbosity.

        NOTE(review): implicitly returns None when VERBOSE_LEVEL > 20 --
        confirm no caller relies on the result at that verbosity.
        """
        if VERBOSE_LEVEL.get() <= 0:
            return f"{self.name} ({self.url})"
        if VERBOSE_LEVEL.get() <= 10:
            return dict(
                name=self.name,
                url=self.url,
                path=str(self.path),
            )
        if VERBOSE_LEVEL.get() <= 20:
            return dict(
                name=self.name,
                url=self.url,
                path=str(self.path),
                server=self.server,
                owner=self.owner,
                repo=self.repo,
                branch=self.branch,
            )
class BentoInfo(SimpleNamespace):
    """A bento (packaged model) inside a repo checkout.

    ``path`` points at ``<repo>/bentoml/bentos/<name>/<version>``; ``alias``
    is set when the bento was resolved through an alias file.
    """

    repo: RepoInfo
    path: pathlib.Path
    alias: str = ""

    def __str__(self):
        # bentos from the default repo are shown without a repo prefix
        if self.repo.name == "default":
            return f"{self.tag}"
        else:
            return f"{self.repo.name}/{self.tag}"

    def __hash__(self):
        # identity is the on-disk location
        return md5(str(self.path))

    @property
    def tag(self) -> str:
        """Display tag; uses the alias as the version part when set."""
        if self.alias:
            return f"{self.path.parent.name}:{self.alias}"
        return f"{self.path.parent.name}:{self.path.name}"

    @property
    def bentoml_tag(self) -> str:
        """The real tag passed to bentoml -- always the concrete version."""
        return f"{self.path.parent.name}:{self.path.name}"

    @property
    def name(self) -> str:
        return self.path.parent.name

    @property
    def version(self) -> str:
        return self.path.name

    @property
    def labels(self) -> dict[str, str]:
        return self.bento_yaml["labels"]

    @functools.cached_property
    def bento_yaml(self) -> dict:
        """Parsed bento.yaml of this bento (read once, then cached)."""
        import yaml
        bento_file = self.path / "bento.yaml"
        return yaml.safe_load(bento_file.read_text())

    @functools.cached_property
    def platforms(self) -> list[str]:
        return self.bento_yaml["labels"].get("platforms", "linux").split(",")

    @functools.cached_property
    def pretty_yaml(self) -> dict:
        """Condensed model card (apis/resources/envs/platforms) for
        single-service bentos; the raw bento.yaml otherwise."""
        def _pretty_routes(routes):
            return {
                route["route"]: {
                    "input": {
                        k: v["type"] for k, v in route["input"]["properties"].items()
                    },
                    "output": route["output"]["type"],
                }
                for route in routes
            }
        if len(self.bento_yaml["services"]) == 1:
            pretty_yaml = {
                "apis": _pretty_routes(self.bento_yaml["schema"]["routes"]),
                "resources": self.bento_yaml["services"][0]["config"]["resources"],
                "envs": self.bento_yaml["envs"],
                "platforms": self.platforms,
            }
            return pretty_yaml
        return self.bento_yaml

    @functools.cached_property
    def pretty_gpu(self) -> str:
        """GPU requirement like "80Gx2"; "" when none is declared."""
        from openllm_next.accelerator_spec import ACCELERATOR_SPECS
        try:
            resources = self.bento_yaml["services"][0]["config"]["resources"]
            if resources["gpu"] > 1:
                acc = ACCELERATOR_SPECS[resources["gpu_type"]]
                return f"{acc.memory_size:.0f}Gx{resources['gpu']}"
            elif resources["gpu"] > 0:
                acc = ACCELERATOR_SPECS[resources["gpu_type"]]
                return f"{acc.memory_size:.0f}G"
        except KeyError:
            # no gpu/gpu_type key, or an unknown accelerator name
            pass
        return ""

    def tolist(self):
        """Render for display; detail grows with verbosity.

        NOTE(review): implicitly returns None when VERBOSE_LEVEL > 20.
        """
        verbose = VERBOSE_LEVEL.get()
        if verbose <= 0:
            return str(self)
        if verbose <= 10:
            return dict(
                tag=self.tag,
                repo=self.repo.tolist(),
                path=str(self.path),
                model_card=self.pretty_yaml,
            )
        if verbose <= 20:
            return dict(
                tag=self.tag,
                repo=self.repo.tolist(),
                path=str(self.path),
                bento_yaml=self.bento_yaml,
            )
class VenvSpec(SimpleNamespace):
    """Identity of a virtualenv: python version plus its package list."""

    python_version: str
    python_packages: dict[str, str]
    name_prefix = ""

    def __hash__(self):
        # NOTE(review): python_version is deliberately (?) excluded -- venvs
        # are keyed on the sorted package set only. Confirm before changing.
        return md5(
            # self.python_version,
            *sorted(self.python_packages),
        )
class Accelerator(SimpleNamespace):
    """A single GPU: model name plus memory size (GiB).

    Ordering and equality compare memory size only.
    """

    model: str
    memory_size: float

    def __gt__(self, other):
        return self.memory_size > other.memory_size

    def __eq__(self, other):
        return self.memory_size == other.memory_size

    # BUG FIX: defining __eq__ sets __hash__ to None, making instances
    # unhashable; restore a hash consistent with __eq__ (equal objects
    # hash equal).
    def __hash__(self):
        return hash(self.memory_size)
class DeploymentTarget(SimpleNamespace):
    """A place a bento can run: the local machine or a cloud instance type."""

    source: str = "local"
    name: str = "local"
    price: str = ""
    platform = "linux"
    accelerators: list[Accelerator]

    def __hash__(self):
        # targets are deduplicated by their source ("local"/"cloud")
        return hash(self.source)

    @property
    def accelerators_repr(self) -> str:
        """Human-readable GPU summary, e.g. "A100 x2"; "null" when none."""
        distinct_models = {acc.model for acc in self.accelerators}
        if not distinct_models:
            return "null"
        if len(distinct_models) == 1:
            first = self.accelerators[0]
            return f"{first.model} x{len(self.accelerators)}"
        return ", ".join(acc.model for acc in self.accelerators)
def run_command(
    cmd,
    cwd=None,
    env=None,
    copy_env=True,
    venv=None,
    silent=False,
) -> subprocess.CompletedProcess:
    """Run *cmd*, echoing a shell-style transcript unless *silent*.

    ``bentoml``/``python`` commands are rewritten to run with the
    interpreter of *venv* (or the current interpreter). *env* entries are
    layered on top of ``os.environ`` when *copy_env* is true.

    Raises typer.Exit(1) when the command exits non-zero.
    """
    import shlex

    env = env or {}
    cmd = [str(c) for c in cmd]
    bin_dir = "Scripts" if os.name == "nt" else "bin"
    if not silent:
        output("\n")
        if cwd:
            output(f"$ cd {cwd}", style="orange")
        if env:
            for k, v in env.items():
                output(f"$ export {k}={shlex.quote(v)}", style="orange")
        if venv:
            output(f"$ source {venv / 'bin' / 'activate'}", style="orange")
        output(f"$ {' '.join(cmd)}", style="orange")
    if venv:
        py = venv / bin_dir / f"python{sysconfig.get_config_var('EXE')}"
    else:
        py = sys.executable
    if copy_env:
        env = {**os.environ, **env}
    if cmd and cmd[0] == "bentoml":
        cmd = [py, "-m", "bentoml"] + cmd[1:]
    if cmd and cmd[0] == "python":
        cmd = [py] + cmd[1:]
    try:
        # BUG FIX: check=True is required for the except clause below to ever
        # fire -- subprocess.run() does not raise CalledProcessError on its
        # own, so failures used to be silently reported as success.
        if silent:
            return subprocess.run(  # type: ignore
                cmd,
                cwd=cwd,
                env=env,
                check=True,
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
            )
        else:
            return subprocess.run(
                cmd,
                cwd=cwd,
                env=env,
                check=True,
            )
    except subprocess.CalledProcessError:
        output("Command failed", style="red")
        raise typer.Exit(1)
async def stream_command_output(stream, style="gray"):
    """Forward an asyncio subprocess stream to output(), line by line."""
    async for line in stream:
        output(line.decode(), style=style, end="")
@asynccontextmanager
async def async_run_command(
    cmd,
    cwd=None,
    env=None,
    copy_env=True,
    venv=None,
    silent=True,
):
    """Async variant of run_command: yields the running process.

    On context exit the process is interrupted (SIGINT) if still running
    and always awaited, so no child process is left behind.
    """
    import shlex

    env = env or {}
    cmd = [str(c) for c in cmd]
    if not silent:
        output("\n")
        if cwd:
            output(f"$ cd {cwd}", style="orange")
        if env:
            for k, v in env.items():
                output(f"$ export {k}={shlex.quote(v)}", style="orange")
        if venv:
            output(f"$ source {venv / 'bin' / 'activate'}", style="orange")
        output(f"$ {' '.join(cmd)}", style="orange")
    if venv:
        py = venv / "bin" / "python"
    else:
        py = sys.executable
    if copy_env:
        env = {**os.environ, **env}
    if cmd and cmd[0] == "bentoml":
        cmd = [py, "-m", "bentoml"] + cmd[1:]
    if cmd and cmd[0] == "python":
        cmd = [py] + cmd[1:]
    proc = None
    try:
        # BUG FIX: exec (not shell with " ".join(cmd)) avoids breaking on
        # arguments containing spaces or shell metacharacters.
        proc = await asyncio.create_subprocess_exec(
            *map(str, cmd),
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
            cwd=cwd,
            env=env,
        )
        yield proc
    finally:
        if proc is not None:
            # only signal a process that is still running
            if proc.returncode is None:
                proc.send_signal(signal.SIGINT)
            await proc.wait()
def md5(*strings: str) -> int:
    """Digest the given strings (concatenated) into a single integer."""
    digest = hashlib.md5()
    for text in strings:
        digest.update(text.encode())
    return int(digest.hexdigest(), 16)

117
openllm_next/local.py Normal file
View File

@@ -0,0 +1,117 @@
import asyncio
import time
import httpx
from openllm_next.common import (
BentoInfo,
async_run_command,
output,
run_command,
stream_command_output,
)
from openllm_next.venv import ensure_venv
def _get_serve_cmd(bento: BentoInfo, port: int = 3000):
cmd = ["bentoml", "serve", bento.bentoml_tag]
if port != 3000:
cmd += ["--port", str(port)]
env = {
"BENTOML_HOME": f"{bento.repo.path}/bentoml",
}
return cmd, env, None
def serve(
    bento: BentoInfo,
    port: int = 3000,
):
    """Run `bentoml serve` for *bento* inside its dedicated venv (blocking)."""
    venv = ensure_venv(bento)
    cmd, env, cwd = _get_serve_cmd(bento, port=port)
    run_command(cmd, env=env, cwd=cwd, venv=venv)
async def _run_model(
    bento: BentoInfo,
    port: int = 3000,
    timeout: int = 600,
):
    """Start a local model server for *bento*, wait until ready, then run an
    interactive chat loop against its OpenAI-compatible API.

    Server logs are mirrored to the console only when startup takes longer
    than 30 seconds; the chat loop exits on Ctrl-C.
    """
    venv = ensure_venv(bento)
    cmd, env, cwd = _get_serve_cmd(bento, port)
    async with async_run_command(
        cmd,
        env=env,
        cwd=cwd,
        venv=venv,
        silent=False,
    ) as server_proc:
        output(f"Model server started {server_proc.pid}")
        stdout_streamer = None
        stderr_streamer = None
        start_time = time.time()
        output("Model loading...", style="green")
        # poll the readiness endpoint roughly once per second, up to *timeout*
        for _ in range(timeout):
            try:
                resp = httpx.get(f"http://localhost:{port}/readyz", timeout=3)
                if resp.status_code == 200:
                    break
            except httpx.RequestError:
                # after 30s without a reachable server, start mirroring its
                # stdout/stderr so the user can see what is taking so long
                if time.time() - start_time > 30:
                    if not stdout_streamer:
                        stdout_streamer = asyncio.create_task(
                            stream_command_output(server_proc.stdout, style="gray")
                        )
                    if not stderr_streamer:
                        stderr_streamer = asyncio.create_task(
                            stream_command_output(server_proc.stderr, style="#BD2D0F")
                        )
            await asyncio.sleep(1)
        else:
            # for-else: the loop exhausted *timeout* without a 200 response
            output("Model failed to load", style="red")
            server_proc.terminate()
            return
        if stdout_streamer:
            stdout_streamer.cancel()
        if stderr_streamer:
            stderr_streamer.cancel()
        output("Model is ready", style="green")
        messages: list[dict[str, str]] = []
        from openai import AsyncOpenAI
        client = AsyncOpenAI(base_url=f"http://localhost:{port}/v1", api_key="local")
        # the server exposes exactly one model; use its id for all requests
        model_id = (await client.models.list()).data[0].id
        while True:
            try:
                message = input("user: ")
                if message == "":
                    output("empty message, please enter something", style="yellow")
                    continue
                messages.append(dict(role="user", content=message))
                output("assistant: ", end="", style="lightgreen")
                assistant_message = ""
                stream = await client.chat.completions.create(
                    model=model_id,
                    messages=messages,  # type: ignore
                    stream=True,
                )
                async for chunk in stream:
                    text = chunk.choices[0].delta.content or ""
                    assistant_message += text
                    output(text, end="", style="lightgreen")
                messages.append(dict(role="assistant", content=assistant_message))
                output("")
            except KeyboardInterrupt:
                break
        output("\nStopping model server...", style="green")
    # exiting the async context sends SIGINT to the server and awaits it
    output("Stopped model server", style="green")
def run(bento: BentoInfo, port: int = 3000, timeout: int = 600):
    """Serve *bento* locally and drop into an interactive chat REPL."""
    asyncio.run(_run_model(bento, port=port, timeout=timeout))

173
openllm_next/model.py Normal file
View File

@@ -0,0 +1,173 @@
import typing
from typing import Optional
import tabulate
import typer
from openllm_next.accelerator_spec import DeploymentTarget, can_run
from openllm_next.analytic import OpenLLMTyper
from openllm_next.common import (
FORCE,
VERBOSE_LEVEL,
BentoInfo,
load_config,
output,
)
from openllm_next.repo import ensure_repo_updated, parse_repo_url
app = OpenLLMTyper(help="manage models")
@app.command()
def get(
    tag: str,
    repo: Optional[str] = None,
    verbose: bool = False,
):
    """Show details of a single model, resolved the same way `run` would."""
    if verbose:
        VERBOSE_LEVEL.set(20)
    bento_info = ensure_bento(tag, repo_name=repo)
    if bento_info:
        output(bento_info)
@app.command(name="list")
def list_(
    tag: Optional[str] = None,
    repo: Optional[str] = None,
    verbose: bool = False,
):
    """Print a table of available models, visually grouped by model name."""
    if verbose:
        VERBOSE_LEVEL.set(20)
    bentos = list_bento(tag=tag, repo_name=repo)
    bentos.sort(key=lambda x: x.name)
    seen = set()
    def is_seen(value):
        # blank out repeated model names so the table reads as groups
        if value in seen:
            return True
        seen.add(value)
        return False
    table = tabulate.tabulate(
        [
            [
                "" if is_seen(bento.name) else bento.name,
                bento.tag,
                bento.repo.name,
                bento.pretty_gpu,
                ",".join(bento.platforms),
            ]
            for bento in bentos
        ],
        headers=["model", "version", "repo", "required VRAM", "platforms"],
    )
    output(table)
def ensure_bento(
    model: str,
    target: Optional[DeploymentTarget] = None,
    repo_name: Optional[str] = None,
) -> BentoInfo:
    """Resolve *model* to exactly one BentoInfo or exit with an error.

    A single match is returned directly. With multiple matches the user is
    asked to be more specific and the program exits with code 1; when a
    *target* is given, matches that cannot run on it are filtered first.
    """
    bentos = list_bento(model, repo_name=repo_name)
    if len(bentos) == 0:
        output(f"No model found for {model}", style="red")
        raise typer.Exit(1)
    if len(bentos) == 1:
        if FORCE.get():
            output(f"Found model {bentos[0]}", style="green")
            return bentos[0]
        if target is None:
            return bentos[0]
        if can_run(bentos[0], target) <= 0:
            return bentos[0]
        output(f"Found model {bentos[0]}", style="green")
        return bentos[0]
    # more than one match from here on
    if target is None:
        output(
            f"Multiple models match {model}, did you mean one of these?",
            style="red",
        )
        for bento in bentos:
            output(f" {bento}")
        raise typer.Exit(1)
    filtered = [bento for bento in bentos if can_run(bento, target) > 0]
    # BUG FIX: this zero-check used to appear twice verbatim; the second
    # copy was dead code and has been removed.
    if len(filtered) == 0:
        output(f"No deployment target found for {model}", style="red")
        raise typer.Exit(1)
    if len(bentos) > 1:
        output(
            f"Multiple models match {model}, did you mean one of these?",
            style="red",
        )
        for bento in bentos:
            output(f" {bento}")
        raise typer.Exit(1)
    return bentos[0]
def list_bento(
    tag: typing.Optional[str] = None,
    repo_name: typing.Optional[str] = None,
    include_alias: bool = False,
) -> typing.List[BentoInfo]:
    """Enumerate bentos in the cached repos, optionally filtered by *tag*
    (``name`` or ``name:version``) and by repo.

    Alias files (plain files whose content names the real version) are
    resolved to their target; unless *include_alias*, duplicates of the same
    name:version are dropped, keeping the first occurrence.
    """
    ensure_repo_updated()
    if repo_name is not None:
        config = load_config()
        if repo_name not in config.repos:
            output(f"Repo `{repo_name}` not found, did you mean one of these?")
            # NOTE(review): this loop rebinds the `repo_name` parameter, but
            # execution always ends at the raise below, so it is harmless
            for repo_name in config.repos:
                output(f" {repo_name}")
            raise typer.Exit(1)
    if not tag:
        glob_pattern = "bentoml/bentos/*/*"
    elif ":" in tag:
        bento_name, version = tag.split(":")
        glob_pattern = f"bentoml/bentos/{bento_name}/{version}"
    else:
        glob_pattern = f"bentoml/bentos/{tag}/*"
    model_list = []
    config = load_config()
    for _repo_name, repo_url in config.repos.items():
        if repo_name is not None and _repo_name != repo_name:
            continue
        repo = parse_repo_url(repo_url, _repo_name)
        for path in repo.path.glob(glob_pattern):
            if path.is_dir() and (path / "bento.yaml").exists():
                model = BentoInfo(repo=repo, path=path)
            elif path.is_file():
                # alias file: its content names the real version directory
                with open(path) as f:
                    origin_name = f.read().strip()
                origin_path = path.parent / origin_name
                model = BentoInfo(alias=path.name, repo=repo, path=origin_path)
            else:
                model = None
            if model:
                model_list.append(model)
    model_list.sort(key=lambda x: x.tag)
    if not include_alias:
        seen = set()
        # set.add returns None (falsy), so this keeps the first occurrence
        # of each name:version and records it in `seen` as a side effect
        model_list = [
            x
            for x in model_list
            if not (
                f"{x.bento_yaml['name']}:{x.bento_yaml['version']}" in seen
                or seen.add(f"{x.bento_yaml['name']}:{x.bento_yaml['version']}")
            )
        ]
    return model_list

203
openllm_next/repo.py Normal file
View File

@@ -0,0 +1,203 @@
import datetime
import re
import shutil
import pyaml
import questionary
import typer
from openllm_next.analytic import OpenLLMTyper
from openllm_next.common import (
INTERACTIVE,
REPO_DIR,
VERBOSE_LEVEL,
RepoInfo,
load_config,
output,
save_config,
)
UPDATE_INTERVAL = datetime.timedelta(days=3)
app = OpenLLMTyper(help="manage repos")
@app.command()
def list(verbose: bool = False):
    """Print all configured repos (name, url, cache path, ...) as YAML."""
    if verbose:
        VERBOSE_LEVEL.set(20)
    config = load_config()
    pyaml.pprint(
        [parse_repo_url(repo, name) for name, repo in config.repos.items()],
        sort_dicts=False,
        sort_keys=False,
    )
@app.command()
def remove(name: str):
    """Remove a repo from the config. Its on-disk cache is swept by the
    next `openllm repo update`."""
    config = load_config()
    if name not in config.repos:
        output(f"Repo {name} does not exist", style="red")
        return
    del config.repos[name]
    save_config(config)
    output(f"Repo {name} removed", style="green")
def _complete_alias(repo_name: str):
    """Write alias files for every bento with an `openllm_alias` label.

    Each comma-separated alias becomes a file next to the version directory
    whose content is the concrete version, so `name:alias` resolves later in
    list_bento().
    """
    from openllm_next.model import list_bento
    for bento in list_bento(repo_name=repo_name):
        alias = bento.labels.get("openllm_alias", "").strip()
        if alias:
            for a in alias.split(","):
                with open(bento.path.parent / a, "w") as f:
                    f.write(bento.version)
@app.command()
def update():
    """Refresh all configured repos and rebuild alias files.

    Each repo is re-cloned shallowly (TODO: pull instead of remove+clone),
    caches of repos no longer configured are removed, and the last-update
    timestamp is written.
    """
    import dulwich
    import dulwich.errors
    import dulwich.porcelain

    config = load_config()
    repos_in_use = set()
    for repo_name, repo in config.repos.items():
        repo = parse_repo_url(repo, repo_name)
        repos_in_use.add((repo.server, repo.owner, repo.repo))
        if repo.path.exists():  # TODO: use update instead of remove and clone
            shutil.rmtree(repo.path, ignore_errors=True)
        if not repo.path.exists():
            repo.path.parent.mkdir(parents=True, exist_ok=True)
            try:
                dulwich.porcelain.clone(
                    f"https://{repo.server}/{repo.owner}/{repo.repo}.git",
                    str(repo.path),
                    checkout=True,
                    depth=1,
                    branch=repo.branch,
                )
                output("")
                output(f"Repo `{repo.name}` updated", style="green")
            # BUG FIX: bare `except:` also swallowed KeyboardInterrupt and
            # SystemExit; narrowed to Exception
            except Exception:
                shutil.rmtree(repo.path, ignore_errors=True)
                output(f"Failed to clone repo {repo.name}", style="red")
        else:
            try:
                dulwich.porcelain.pull(
                    str(repo.path),
                    f"https://{repo.server}/{repo.owner}/{repo.repo}.git",
                    refspecs=repo.branch,
                    force=True,
                )
                dulwich.porcelain.clean(str(repo.path), str(repo.path))
                output("")
                output(f"Repo `{repo.name}` updated", style="green")
            except Exception:
                shutil.rmtree(repo.path, ignore_errors=True)
                output(f"Failed to update repo {repo.name}", style="red")
    # drop caches of repos no longer configured (REPO_DIR/server/owner/repo)
    for c in REPO_DIR.glob("*/*/*"):
        repo_spec = tuple(c.parts[-3:])
        if repo_spec not in repos_in_use:
            shutil.rmtree(c, ignore_errors=True)
            output(f"Removed unused repo cache {c}")
    with open(REPO_DIR / "last_update", "w") as f:
        f.write(datetime.datetime.now().isoformat())
    for repo_name in config.repos:
        _complete_alias(repo_name)
def ensure_repo_updated():
    """Ensure the local repo cache exists and is reasonably fresh.

    Never-updated cache: prompt to update (interactive) or exit 1 with an
    instruction (non-interactive). Cache older than UPDATE_INTERVAL: prompt
    (interactive) or just warn.
    """
    last_update_file = REPO_DIR / "last_update"
    if not last_update_file.exists():
        if INTERACTIVE.get():
            choice = questionary.confirm(
                "The repo cache is never updated, do you want to update it to fetch the latest model list?"
            ).ask()
            if choice:
                update()
            return
        else:
            output(
                "The repo cache is never updated, please run `openllm repo update` to fetch the latest model list",
                style="red",
            )
            raise typer.Exit(1)
    last_update = datetime.datetime.fromisoformat(last_update_file.read_text().strip())
    if datetime.datetime.now() - last_update > UPDATE_INTERVAL:
        if INTERACTIVE.get():
            choice = questionary.confirm(
                "The repo cache is outdated, do you want to update it to fetch the latest model list?"
            ).ask()
            if choice:
                update()
        else:
            # stale cache is not fatal in non-interactive mode: warn only
            output(
                "The repo cache is outdated, please run `openllm repo update` to fetch the latest model list",
                style="yellow",
            )
GIT_REPO_RE = re.compile(
    r"git\+https://(?P<server>.+)/(?P<owner>.+)/(?P<repo>.+?)(@(?P<branch>.+))?$"
)
def parse_repo_url(repo_url, repo_name=None) -> RepoInfo:
    """
    Parse a ``git+https://`` repo url into a RepoInfo (server, owner, repo,
    branch, local cache path under REPO_DIR). The branch defaults to "main"
    when the url has no ``@branch`` suffix, so both

        git+https://github.com/bentoml/bentovllm@main
        git+https://github.com/bentoml/bentovllm

    yield server="github.com", owner="bentoml", repo="bentovllm",
    branch="main". (The previous docstring showed doctests returning a
    tuple, which this function never did.)

    Raises ValueError when the url does not match GIT_REPO_RE.
    """
    match = GIT_REPO_RE.match(repo_url)
    if not match:
        raise ValueError(f"Invalid git repo url: {repo_url}")
    server = match.group("server")
    owner = match.group("owner")
    repo = match.group("repo")
    branch = match.group("branch") or "main"
    # repos are cached under REPO_DIR/<server>/<owner>/<repo>
    path = REPO_DIR / server / owner / repo
    return RepoInfo(
        name=repo if repo_name is None else repo_name,
        url=repo_url,
        server=server,
        owner=owner,
        repo=repo,
        branch=branch,
        path=path,
    )
@app.command()
def add(name: str, repo: str):
    """Register repo url *repo* under *name*, prompting before overwriting."""
    name = name.lower()
    # repo names are used as config keys and shown in tags; keep them simple
    if not name.isidentifier():
        output(
            f"Invalid repo name: {name}, should only contain letters, numbers and underscores",
            style="red",
        )
        return
    config = load_config()
    if name in config.repos:
        override = questionary.confirm(
            f"Repo {name} already exists({config.repos[name]}), override?"
        ).ask()
        if not override:
            return
    config.repos[name] = repo
    save_config(config)
    output(f"Repo {name} added", style="green")
if __name__ == "__main__":
app()

164
openllm_next/venv.py Normal file
View File

@@ -0,0 +1,164 @@
import functools
import os
import pathlib
import shutil
import typing
from typing import Iterable
import typer
from openllm_next.common import (
VENV_DIR,
VERBOSE_LEVEL,
BentoInfo,
VenvSpec,
output,
run_command,
)
@functools.lru_cache
def _resolve_packages(requirement: typing.Union[pathlib.Path, str]):
    """Parse a requirements file (following nested ``-r`` includes) into a
    list of requirement objects. Cached per path."""
    from pip_requirements_parser import RequirementsFile
    requirements_txt = RequirementsFile.from_file(
        str(requirement),
        include_nested=True,
    )
    return requirements_txt.requirements
def _filter_preheat_packages(requirements: Iterable) -> list[str]:
PREHEAT_PIP_PACKAGES = ["torch", "vllm"]
deps: list[str] = []
for req in requirements:
if (
req.is_editable
or req.is_local_path
or req.is_url
or req.is_wheel
or not req.name
or not req.specifier
):
continue
for sp in req.specifier:
if sp.operator == "==" and req.name in PREHEAT_PIP_PACKAGES:
assert req.line is not None
deps.append(req.line)
break
return deps
@functools.lru_cache
def _resolve_bento_env_specs(bento: BentoInfo):
    """Derive the two venv layers for *bento*:

    1. a "preheat" venv holding only the pinned heavy packages (torch/vllm),
       shareable between bentos with identical pins, and
    2. the full-requirements venv layered on top of it.
    """
    ver_file = bento.path / "env" / "python" / "version.txt"
    assert ver_file.exists(), f"cannot find version file in {bento.path}"
    # prefer the fully-resolved lock file when the bento ships one
    lock_file = bento.path / "env" / "python" / "requirements.lock.txt"
    if not lock_file.exists():
        lock_file = bento.path / "env" / "python" / "requirements.txt"
    reqs = _resolve_packages(lock_file)
    preheat_packages = _filter_preheat_packages(reqs)
    ver = ver_file.read_text().strip()
    return (
        VenvSpec(
            python_version=ver,
            python_packages=preheat_packages,
            name_prefix=f"{bento.tag.replace(':', '_')}-1-",
        ),
        VenvSpec(
            python_version=ver,
            python_packages=[v.line for v in reqs],
            name_prefix=f"{bento.tag.replace(':', '_')}-2-",
        ),
    )
def _get_lib_dir(venv: pathlib.Path) -> pathlib.Path:
if os.name == "nt":
return venv / "Lib/site-packages"
else:
return next(venv.glob("lib/python*")) / "site-packages"
def _ensure_venv(
    env_spec: VenvSpec,
    parrent_venv: typing.Optional[pathlib.Path] = None,  # [sic] typo kept: renaming would change the keyword API
) -> pathlib.Path:
    """Create (or reuse) the venv for *env_spec*, layered on *parrent_venv*.

    The venv lives under VENV_DIR keyed by hash(env_spec); a DONE marker
    file distinguishes finished installs from aborted ones, which are
    rebuilt from scratch.
    """
    venv = VENV_DIR / str(hash(env_spec))
    # a venv without the DONE marker is a leftover of a failed install
    if venv.exists() and not (venv / "DONE").exists():
        shutil.rmtree(venv, ignore_errors=True)
    if not venv.exists():
        output(f"Installing model dependencies({venv})...", style="green")
        venv_py = (
            venv / "Scripts" / "python.exe"
            if os.name == "nt"
            else venv / "bin" / "python"
        )
        try:
            run_command(
                ["python", "-m", "uv", "venv", venv],
                silent=VERBOSE_LEVEL.get() < 10,
            )
            lib_dir = _get_lib_dir(venv)
            if parrent_venv is not None:
                # a .pth file makes the parent venv's site-packages visible
                # here, so shared heavy packages are not reinstalled
                parent_lib_dir = _get_lib_dir(parrent_venv)
                with open(lib_dir / f"{parrent_venv.name}.pth", "w+") as f:
                    f.write(str(parent_lib_dir))
            with open(venv / "requirements.txt", "w") as f:
                f.write("\n".join(sorted(env_spec.python_packages)))
            run_command(
                [
                    "python",
                    "-m",
                    "uv",
                    "pip",
                    "install",
                    "-p",
                    str(venv_py),
                    "-r",
                    venv / "requirements.txt",
                ],
                silent=VERBOSE_LEVEL.get() < 10,
            )
            # write the marker only after everything above succeeded
            with open(venv / "DONE", "w") as f:
                f.write("DONE")
        except Exception:
            shutil.rmtree(venv, ignore_errors=True)
            output(
                f"Failed to install dependencies to {venv}. Cleaned up.",
                style="red",
            )
            raise typer.Exit(1)
        output(f"Successfully installed dependencies to {venv}.", style="green")
        return venv
    else:
        return venv
def _ensure_venvs(env_spec_list: Iterable[VenvSpec]) -> pathlib.Path:
    """Materialize a chain of venvs, each layered on the previous one via a
    .pth link, and return the last (outermost) venv."""
    chained = None
    for spec in env_spec_list:
        chained = _ensure_venv(spec, chained)
    assert chained is not None
    return chained
def ensure_venv(bento: BentoInfo) -> pathlib.Path:
    """Return the (chained) venv for *bento*, building it if necessary."""
    return _ensure_venvs(_resolve_bento_env_specs(bento))
def _check_venv(env_spec: VenvSpec) -> bool:
    """True when the venv for *env_spec* exists and finished installing."""
    venv = VENV_DIR / str(hash(env_spec))
    # the DONE marker is written only after a successful install
    return venv.exists() and (venv / "DONE").exists()
def check_venv(bento: BentoInfo) -> bool:
    """True when every venv required by *bento* is already built."""
    return all(_check_venv(env_spec) for env_spec in _resolve_bento_env_specs(bento))

34
pyproject.toml Normal file
View File

@@ -0,0 +1,34 @@
[build-system]
requires = ["setuptools>=42", "wheel"]
build-backend = "setuptools.build_meta"
[project]
name = "openllm-next"
version = "0.0.1"
description = "OpenLLM: run and deploy open-source large language models with BentoML."
authors = [{name = "oasiszero", email = "oasis0.com@gmail.com"}]
license = {file = "LICENSE"}
dependencies = [
"bentoml",
"typer",
"questionary",
"pyaml",
"psutil",
"pathlib",
"pip_requirements_parser",
"nvidia-ml-py",
"dulwich",
"tabulate",
"uv",
"openai==1.35.9",
]
[project.scripts]
openllm = "openllm_next.__main__:main"
[tool.typer]
src-dir = "openllm_next"
[tool.isort]
multi_line_output = 3
include_trailing_comma = true