diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 00000000..60e5a277 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,5 @@ +**/_next/ linguist-generated=true + +* text=auto eol=lf +# Needed for setuptools-scm-git-archive +.git_archival.txt export-subst diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..6d6b7cc7 --- /dev/null +++ b/.gitignore @@ -0,0 +1,163 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
+# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. 
+#.idea/ +*.whl +# Environments +venv/ diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md new file mode 100644 index 00000000..87459002 --- /dev/null +++ b/DEVELOPMENT.md @@ -0,0 +1,107 @@ +# Developer Guide + +This Developer Guide is designed to help you contribute to the OpenLLM project. +Follow these steps to set up your development environment and learn the process +of contributing to our open-source project. + +Join our [Discord Channel](https://l.bentoml.com/join-openllm-discord) and reach +out to us if you have any questions! + +## Table of Contents + +- [Developer Guide](#developer-guide) + - [Table of Contents](#table-of-contents) + - [Setting Up Your Development Environment](#setting-up-your-development-environment) + - [Development Workflow](#development-workflow) + - [Adding new models](#adding-new-models) + - [Adding bentos](#adding-bentos) + - [Adding repos](#adding-repos) + +## Setting Up Your Development Environment + +Before you can start developing, you'll need to set up your environment: + +1. Ensure you have [Git](https://git-scm.com/) and + [Python 3.9+](https://www.python.org/downloads/) installed. +2. Fork the OpenLLM repository from GitHub. +3. Clone the forked repository from GitHub: + + ```bash + git clone git@github.com:username/OpenLLM.git && cd OpenLLM + ``` + +4. Add the OpenLLM upstream remote to your local OpenLLM clone: + + ```bash + git remote add upstream git@github.com:bentoml/OpenLLM.git + ``` + +5. Configure git to pull from the upstream remote: + + ```bash + git switch main # ensure you're on the main branch + git fetch upstream --tags + git branch --set-upstream-to=upstream/main + ``` + +## Development Workflow + +There are a few ways to contribute to the repository structure for OpenLLM: + +### Adding new models + +1. [recipe.yaml](./recipe.yaml) contains all related metadata for generating new LLM-based bentos.
To add a new LLM, the following structure should be adhered to: + +```yaml +"<model_name>:<model_tag>": + project: vllm-chat + service_config: + name: phi3 + traffic: + timeout: 300 + resources: + gpu: 1 + gpu_type: nvidia-tesla-l4 + engine_config: + model: microsoft/Phi-3-mini-4k-instruct + max_model_len: 4096 + dtype: half + chat_template: phi-3 +``` + +- `<model_name>` represents the type of model to be supported. Currently supports `phi3`, `llama2`, `llama3`, `gemma` + +- `<model_tag>` emphasizes the type of model and its related metadata. The convention would include `<model_size>-<model_type>-<precision>[-<quantization>]` + For example: + + - `microsoft/Phi-3-mini-4k-instruct` should be represented as `3.8b-instruct-fp16`. + - `TheBloke/Llama-2-7B-Chat-AWQ` would be `7b-chat-awq-4bit` + +- `project` would be used as the basis for the generated bento. Currently, most models should use `vllm-chat` as default. + +- `service_config` entails all BentoML-related [configuration](https://docs.bentoml.com/en/latest/guides/configurations.html) to run this bento. + +> [!NOTE] +> +> We recommend including the following fields in `service_config`: +> +> - `name` should be the same as `<model_name>` +> - `resources` includes the available accelerator that can run this model. See more [here](https://docs.bentoml.com/en/latest/guides/configurations.html#resources) + +- `engine_config` contains the fields to be used for the vLLM engine. See more supported arguments in [`AsyncEngineArgs`](https://github.com/vllm-project/vllm/blob/7cd2ebb0251fd1fd0eec5c93dac674603a22eddd/vllm/engine/arg_utils.py#L799). We recommend always including `model`, `max_model_len`, `dtype` and `trust_remote_code`. + +- If the model is a chat model, `chat_template` should be used. Add the appropriate `chat_template` under [chat_template directory](./vllm-chat/chat_templates/) should you decide to do so. + +2. You can then run `BENTOML_HOME=$(openllm repo default)/bentoml/bentos python make.py <model_name>:<model_tag>` to generate the required bentos. + +3.
You can then submit a [Pull request](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request) to `openllm` with the recipe changes. + +### Adding bentos + +OpenLLM now also manages a [generated bento repository](https://github.com/bentoml/openllm-models/tree/main). If you update or modify any generated bentos, make sure to update the recipe and add the generated bentos under `bentoml/bentos`. + +### Adding repos + +If you wish to create your own managed git repo, you should follow the structure of [bentoml/openllm-models](https://github.com/bentoml/openllm-models/tree/main). + +To add your custom repo, do `openllm repo add <repo_name> <repo_url>` diff --git a/LICENSE b/LICENSE new file mode 100644 index 00000000..261eeb9e --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License.
+ + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/README.md b/README.md new file mode 100644 index 00000000..73805195 --- /dev/null +++ b/README.md @@ -0,0 +1,22 @@ +``` +pip install . +openllm serve +# or openllm run +``` +Run either command to find out which LLM models are already in your hands. + +License +------- + +This project is licensed under the Apache License 2.0 - see the LICENSE file for details. + +Acknowledgements +---------------- + +This project makes use of the following open-source projects: + +* [bentoml/bentoml](https://github.com/bentoml/bentoml) for production level model serving +* [blrchen/chatgpt-lite](https://github.com/blrchen/chatgpt-lite) for a fancy Web Chat UI +* [chujiezheng/chat_templates](https://github.com/chujiezheng/chat_templates) + +We are grateful to the developers and contributors of these projects for their hard work and dedication.
diff --git a/openllm_next/__init__.py b/openllm_next/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/openllm_next/__main__.py b/openllm_next/__main__.py new file mode 100644 index 00000000..51beec14 --- /dev/null +++ b/openllm_next/__main__.py @@ -0,0 +1,338 @@ +import os +import random +import sys +from collections import defaultdict +from typing import Annotated, Optional + +import questionary +import typer + +from openllm_next.accelerator_spec import ( + DeploymentTarget, + can_run, + get_local_machine_spec, +) +from openllm_next.analytic import DO_NOT_TRACK, OpenLLMTyper +from openllm_next.clean import app as clean_app +from openllm_next.cloud import deploy as cloud_deploy +from openllm_next.cloud import ensure_cloud_context, get_cloud_machine_spec +from openllm_next.common import CHECKED, INTERACTIVE, VERBOSE_LEVEL, output +from openllm_next.local import run as local_run +from openllm_next.local import serve as local_serve +from openllm_next.model import app as model_app +from openllm_next.model import ensure_bento, list_bento +from openllm_next.repo import app as repo_app + +app = OpenLLMTyper( + help="`openllm hello` to get started. 
def _select_bento_name(models, target):
    """Prompt the user to pick a model name out of ``models``.

    Each distinct ``(repo, name)`` pair is listed once. The ``can_run`` scores
    of all entries sharing a pair are summed, and the pair gets the
    ``CHECKED`` mark in the "locally runnable" column when that sum is
    positive.

    Returns:
        The selected ``[name, repo]`` pair.

    Raises:
        typer.Exit: With code 1 when there is nothing to list or the prompt
            is cancelled.
    """
    from tabulate import tabulate

    # Accumulate one score per (repo, name), in first-seen order.
    score_by_model = defaultdict(int)
    for entry in models:
        score_by_model[(entry.repo.name, entry.name)] += can_run(entry, target)

    rows = [
        [name, repo, CHECKED if total > 0 else ""]
        for (repo, name), total in score_by_model.items()
    ]
    if not rows:
        output("No model found", style="red")
        raise typer.Exit(1)

    rendered = tabulate(
        rows,
        headers=["model", "repo", "locally runnable"],
    ).split("\n")

    # The first two rendered lines become a non-selectable header; every
    # following line is paired with the row it was rendered from.
    choices = [questionary.Separator(f"{rendered[0]}\n {rendered[1]}")]
    choices.extend(
        questionary.Choice(line, value=row[:2])
        for row, line in zip(rows, rendered[2:])
    )

    answer = questionary.select("Select a model", choices).ask()
    if answer is None:
        raise typer.Exit(1)
    return answer
def _print_rerun_hint(command):
    """Print the command line that repeats the action just chosen."""
    output("\nUse this command to run the action again:", style="green")
    output(f" $ {command}", style="orange")


def _select_action(bento, score):
    """Ask the user what to do with ``bento`` and carry out the chosen action.

    Args:
        bento: The selected bento. It is interpolated into the suggested
            ``openllm ...`` command lines and handed to the run / serve /
            deploy helpers.
        score: Runnability score of ``bento`` for the local machine. When it
            is not positive, the two local actions are listed but disabled.

    Raises:
        typer.Exit: With code 1 when the prompt is cancelled (``ask()``
            returns ``None``).
    """
    # ``disabled=None`` keeps a choice selectable; a string disables it and is
    # shown as the reason. Only the local actions depend on the score, so a
    # single option list serves both cases.
    local_disabled = None if score > 0 else "insufficient res."
    options = [
        questionary.Separator("Available actions"),
        questionary.Choice(
            "0. Run the model in terminal",
            value="run",
            disabled=local_disabled,
            shortcut_key="0",
        ),
        questionary.Separator(f" $ openllm run {bento}"),
        questionary.Separator(" "),
        questionary.Choice(
            "1. Serve the model locally and get a chat server",
            value="serve",
            disabled=local_disabled,
            shortcut_key="1",
        ),
        questionary.Separator(f" $ openllm serve {bento}"),
        questionary.Separator(" "),
        questionary.Choice(
            "2. Deploy the model to bentocloud and get a scalable chat server",
            value="deploy",
            shortcut_key="2",
        ),
        questionary.Separator(f" $ openllm deploy {bento}"),
    ]
    action = questionary.select("Select an action", options).ask()
    if action is None:
        raise typer.Exit(1)

    # The hint is printed from ``finally`` so it is shown even when the action
    # is interrupted or fails.
    if action == "run":
        try:
            local_run(bento)
        finally:
            _print_rerun_hint(f"openllm run {bento}")
    elif action == "serve":
        try:
            local_serve(bento)
        finally:
            _print_rerun_hint(f"openllm serve {bento}")
    elif action == "deploy":
        ensure_cloud_context()
        targets = get_cloud_machine_spec()
        target = _select_target(bento, targets)
        try:
            cloud_deploy(bento, target)
        finally:
            _print_rerun_hint(
                f"openllm deploy {bento} --instance-type {target.name}"
            )
@app.command(help="run the model and chat in terminal")
def run(
    model: Annotated[str, typer.Argument()] = "",
    repo: Optional[str] = None,
    port: Optional[int] = None,
    timeout: int = 600,
    verbose: bool = False,
):
    """Resolve ``model`` to a bento for the local machine and run it.

    When ``port`` is not given, a random port between 30000 and 40000
    (inclusive) is used. ``verbose`` raises the verbosity level to 20.
    """
    if verbose:
        VERBOSE_LEVEL.set(20)

    bento = ensure_bento(
        model,
        target=get_local_machine_spec(),
        repo_name=repo,
    )
    chosen_port = random.randint(30000, 40000) if port is None else port
    local_run(bento, port=chosen_port, timeout=timeout)
def main():
    """Entry point: check the interpreter version, then run the Typer app.

    Exits with status 1 when running on Python older than 3.9.
    """
    if sys.version_info < (3, 9):
        # The message must name the same minimum version as the guard above.
        output("Python 3.9 or higher is required", style="red")
        sys.exit(1)
    app()
"nvidia-rtx-3080": {"model": "RTX 3080", "memory_size": 10.0}, + "nvidia-rtx-3080-ti": {"model": "RTX 3080 Ti", "memory_size": 12.0}, + "nvidia-rtx-3090": {"model": "RTX 3090", "memory_size": 24.0}, + "nvidia-rtx-4070-ti": {"model": "RTX 4070 Ti", "memory_size": 12.0}, + "nvidia-tesla-p4": {"model": "P4", "memory_size": 8.0}, + "nvidia-tesla-p100": {"model": "P100", "memory_size": 16.0}, + "nvidia-tesla-k80": {"model": "K80", "memory_size": 12.0}, + "nvidia-tesla-t4": {"model": "T4", "memory_size": 16.0}, + "nvidia-tesla-v100": {"model": "V100", "memory_size": 16.0}, + "nvidia-l4": {"model": "L4", "memory_size": 24.0}, + "nvidia-tesla-l4": {"model": "L4", "memory_size": 24.0}, + "nvidia-tesla-a10g": {"model": "A10G", "memory_size": 24.0}, + "nvidia-a100-80g": {"model": "A100", "memory_size": 80.0}, + "nvidia-a100-80gb": {"model": "A100", "memory_size": 80.0}, + "nvidia-tesla-a100": {"model": "A100", "memory_size": 40.0}, +} + + +ACCELERATOR_SPECS: dict[str, Accelerator] = { + key: Accelerator(**value) for key, value in ACCELERATOR_SPEC_DICT.items() +} + + +@functools.lru_cache +def get_local_machine_spec(): + if psutil.MACOS: + return DeploymentTarget(accelerators=[], source="local", platform="macos") + + if psutil.WINDOWS: + platform = "windows" + elif psutil.LINUX: + platform = "linux" + else: + raise NotImplementedError(f"Unsupported platform") + + from pynvml import ( + nvmlDeviceGetCount, + nvmlDeviceGetCudaComputeCapability, + nvmlDeviceGetHandleByIndex, + nvmlDeviceGetMemoryInfo, + nvmlDeviceGetName, + nvmlInit, + nvmlShutdown, + ) + + try: + nvmlInit() + device_count = nvmlDeviceGetCount() + accelerators: list[Accelerator] = [] + for i in range(device_count): + handle = nvmlDeviceGetHandleByIndex(i) + name = nvmlDeviceGetName(handle) + memory_info = nvmlDeviceGetMemoryInfo(handle) + accelerators.append( + Accelerator( + model=name, memory_size=math.ceil(int(memory_info.total) / 1024**3) + ) + ) + compute_capability = 
@functools.lru_cache()
def can_run(
    bento: typing.Union[Resource, BentoInfo],
    target: typing.Optional[DeploymentTarget] = None,
) -> float:
    """Score how well ``bento`` can be deployed on ``target``.

    Args:
        bento: Object exposing ``bento_yaml``. The resource requirements are
            read from the first service's ``config["resources"]`` and the
            supported platforms from the ``platforms`` label (default
            ``"linux"``, comma separated).
        target: Deployment target to score against. Defaults to the local
            machine spec. Both arguments must be hashable because results
            are memoised with ``lru_cache``.

    Returns:
        ``0.0`` when the bento cannot run on the target (unsupported
        platform, unknown GPU type, or not enough sufficiently large
        accelerators); otherwise a positive score.
    """
    if target is None:
        target = get_local_machine_spec()

    resource_spec = Resource(
        **(bento.bento_yaml["services"][0]["config"].get("resources", {}))
    )
    labels = bento.bento_yaml.get("labels", {})
    platforms = labels.get("platforms", "linux").split(",")

    if target.platform not in platforms:
        return 0.0

    # return 0.5 if no resource is specified
    if not resource_spec:
        return 0.5

    if resource_spec.gpu > 0:
        # An unknown (or missing) gpu_type cannot be matched against the
        # target, so treat the bento as not runnable instead of raising
        # KeyError and aborting the caller's whole listing.
        required_gpu = ACCELERATOR_SPECS.get(resource_spec.gpu_type)
        if required_gpu is None:
            return 0.0
        filtered_accelerators = [
            ac
            for ac in target.accelerators
            if ac.memory_size >= required_gpu.memory_size
        ]
        if resource_spec.gpu > len(filtered_accelerators):
            return 0.0
        # Required GPU memory as a fraction of all accelerator memory on the
        # target.
        return (
            required_gpu.memory_size
            * resource_spec.gpu
            / sum(ac.memory_size for ac in target.accelerators)
        )
    # No GPU requested: targets that carry accelerators get a small score
    # that shrinks as their total accelerator memory grows.
    if target.accelerators:
        return 0.01 / sum(ac.memory_size for ac in target.accelerators)
    return 1.0
import annotations + +import functools +import os +import re +import time +import typing +from abc import ABC + +import attr +import click +import typer +import typer.core + +DO_NOT_TRACK = "BENTOML_DO_NOT_TRACK" + + +class EventMeta(ABC): + @property + def event_name(self): + # camel case to snake case + event_name = re.sub(r"(? typing.Iterable[str]: + return list(self.commands) + + +class OpenLLMTyper(typer.Typer): + def __init__(self, *args: typing.Any, **kwargs: typing.Any): + no_args_is_help = kwargs.pop("no_args_is_help", True) + context_settings = kwargs.pop("context_settings", {}) + if "help_option_names" not in context_settings: + context_settings["help_option_names"] = ("-h", "--help") + if "max_content_width" not in context_settings: + context_settings["max_content_width"] = int( + os.environ.get("COLUMNS", str(120)) + ) + klass = kwargs.pop("cls", OrderedCommands) + + super().__init__( + *args, + cls=klass, + no_args_is_help=no_args_is_help, + context_settings=context_settings, + **kwargs, + ) + + def command(self, *args: typing.Any, **kwargs: typing.Any): + def decorator(f): + @functools.wraps(f) + @click.pass_context + def wrapped(ctx: click.Context, *args, **kwargs): + from bentoml._internal.utils.analytics import track + + do_not_track = ( + os.environ.get(DO_NOT_TRACK, str(False)).lower() == "true" + ) + + # so we know that the root program is openllm + command_name = ctx.info_name + if ctx.parent.parent is not None: + # openllm model list + command_group = ctx.parent.info_name + elif ctx.parent.info_name == ctx.find_root().info_name: + # openllm run + command_group = "openllm" + + if do_not_track: + return f(*args, **kwargs) + start_time = time.time_ns() + try: + return_value = f(*args, **kwargs) + duration_in_ns = time.time_ns() - start_time + track( + OpenllmCliEvent( + cmd_group=command_group, + cmd_name=command_name, + duration_in_ms=duration_in_ns / 1e6, + ) + ) + return return_value + except BaseException as e: + duration_in_ns = 
@app.command(help="Clean up all the cached models from huggingface")
def model_cache(verbose: bool = False):
    """Remove the Huggingface hub cache directory after user confirmation.

    Args:
        verbose: when set, raise the global verbosity level to 20.
    """
    if verbose:
        VERBOSE_LEVEL.set(20)
    # Count regular files only. stat() follows symlinks, so entries that are
    # links to other cached files would be counted twice, and a dangling link
    # would raise; directories carry no payload worth reporting either.
    used_space = sum(
        f.lstat().st_size
        for f in HUGGINGFACE_CACHE.rglob("*")
        if f.is_file() and not f.is_symlink()
    )
    sure = questionary.confirm(
        f"This will remove all models cached by Huggingface (~{used_space / 1024 / 1024:.2f}MB), are you sure?"
    ).ask()
    # ask() yields a falsy value when the user declines or aborts the prompt.
    if not sure:
        return
    shutil.rmtree(HUGGINGFACE_CACHE, ignore_errors=True)
    output("All models cached by Huggingface have been removed", style="green")
@app.command(help="Reset configurations to default")
def configs(verbose: bool = False):
    """Delete the OpenLLM config file so the defaults apply again.

    Args:
        verbose: when set, raise the global verbosity level to 20.
    """
    if verbose:
        VERBOSE_LEVEL.set(20)
    # CONFIG_FILE is a regular file; shutil.rmtree only removes directories
    # and, with ignore_errors=True, silently left the file in place.
    try:
        CONFIG_FILE.unlink(missing_ok=True)
    except OSError as e:
        output(f"Failed to remove {CONFIG_FILE}: {e}", style="red")
        return
    output("All configurations have been reset", style="green")
os.environ.get(env_info["name"]): + default = os.environ[env_info["name"]] + elif "value" in env_info: + default = env_info["value"] + else: + default = "" + + if INTERACTIVE.get(): + import questionary + + value = questionary.text( + f"{env_info['name']}:", + default=default, + ).ask() + else: + if default == "": + output( + f"Environment variable {env_info['name']} is required but not provided", + style="red", + ) + raise typer.Exit(1) + else: + value = default + + if value is None: + raise typer.Exit(1) + cmd += ["--env", f"{env_info['name']}={value}"] + + if target: + cmd += ["--instance-type", target.name] + + assert (pathlib.Path.home() / "bentoml" / ".yatai.yaml").exists() + shutil.copy( + pathlib.Path.home() / "bentoml" / ".yatai.yaml", + bento.repo.path / "bentoml" / ".yatai.yaml", + ) + + return cmd, env, None + + +def ensure_cloud_context(): + import questionary + + cmd = ["bentoml", "cloud", "current-context"] + try: + result = subprocess.check_output(cmd, stderr=subprocess.DEVNULL) + context = json.loads(result) + output(f" bentoml already logged in: {context['endpoint']}", style="green") + except subprocess.CalledProcessError: + output(" bentoml not logged in", style="red") + if not INTERACTIVE.get(): + output( + "\n get bentoml logged in by:", + ) + output( + " $ bentoml cloud login", + style="orange", + ) + output("") + output( + """ * you may need to visit https://cloud.bentoml.com to get an account. 
def get_cloud_machine_spec():
    """List the instance types offered by the current BentoCloud context.

    Runs ``bentoml deployment list-instance-types -o json`` after making sure
    a cloud context exists.

    Returns:
        One ``DeploymentTarget`` per instance type. ``accelerators`` repeats
        the matching ``ACCELERATOR_SPECS`` entry once per GPU, and is empty
        when the instance has no GPU or its GPU type is not in the table.
        An empty list is returned when the command fails or prints invalid
        JSON.
    """
    ensure_cloud_context()
    cmd = ["bentoml", "deployment", "list-instance-types", "-o", "json"]
    try:
        result = subprocess.check_output(cmd, stderr=subprocess.DEVNULL)
        instance_types = json.loads(result)
        return [
            DeploymentTarget(
                source="cloud",
                name=it["name"],
                price=it["price"],
                platform="linux",
                accelerators=(
                    [ACCELERATOR_SPECS[it["gpu_type"]] for _ in range(int(it["gpu"]))]
                    # .get() on both keys: an entry that reports a GPU count
                    # without a gpu_type is treated as having no known
                    # accelerator instead of raising KeyError.
                    if it.get("gpu") and it.get("gpu_type") in ACCELERATOR_SPECS
                    else []
                ),
            )
            for it in instance_types
        ]
    except (subprocess.CalledProcessError, json.JSONDecodeError):
        output("Failed to get cloud instance types", style="red")
        return []
class Config(SimpleNamespace):
    """User configuration: the known model repos and the default one.

    Any field not passed to the constructor falls back to the class-level
    default.
    """

    # Maps repo name -> repo url of the form "git+https://<server>/<owner>/<repo>@<branch>".
    repos: dict[str, str] = {
        "default": "git+https://github.com/bentoml/openllm-models@main"
    }
    # Name of the repo used when none is given explicitly.
    default_repo: str = "default"

    def __init__(self, **kwargs):
        # Give every instance its own copy of the default mapping; otherwise
        # ``config.repos[name] = ...`` / ``del config.repos[name]`` on a
        # default-constructed Config would mutate the dict shared by the class.
        kwargs.setdefault("repos", dict(type(self).repos))
        super().__init__(**kwargs)

    def tolist(self):
        """Return the configuration as a plain, JSON-serializable dict."""
        return dict(
            repos=self.repos,
            default_repo=self.default_repo,
        )
"linux").split(",") + + @functools.cached_property + def pretty_yaml(self) -> dict: + def _pretty_routes(routes): + return { + route["route"]: { + "input": { + k: v["type"] for k, v in route["input"]["properties"].items() + }, + "output": route["output"]["type"], + } + for route in routes + } + + if len(self.bento_yaml["services"]) == 1: + pretty_yaml = { + "apis": _pretty_routes(self.bento_yaml["schema"]["routes"]), + "resources": self.bento_yaml["services"][0]["config"]["resources"], + "envs": self.bento_yaml["envs"], + "platforms": self.platforms, + } + return pretty_yaml + return self.bento_yaml + + @functools.cached_property + def pretty_gpu(self) -> str: + from openllm_next.accelerator_spec import ACCELERATOR_SPECS + + try: + resources = self.bento_yaml["services"][0]["config"]["resources"] + if resources["gpu"] > 1: + acc = ACCELERATOR_SPECS[resources["gpu_type"]] + return f"{acc.memory_size:.0f}Gx{resources['gpu']}" + elif resources["gpu"] > 0: + acc = ACCELERATOR_SPECS[resources["gpu_type"]] + return f"{acc.memory_size:.0f}G" + except KeyError: + pass + return "" + + def tolist(self): + verbose = VERBOSE_LEVEL.get() + if verbose <= 0: + return str(self) + if verbose <= 10: + return dict( + tag=self.tag, + repo=self.repo.tolist(), + path=str(self.path), + model_card=self.pretty_yaml, + ) + if verbose <= 20: + return dict( + tag=self.tag, + repo=self.repo.tolist(), + path=str(self.path), + bento_yaml=self.bento_yaml, + ) + + +class VenvSpec(SimpleNamespace): + python_version: str + python_packages: dict[str, str] + name_prefix = "" + + def __hash__(self): + return md5( + # self.python_version, + *sorted(self.python_packages), + ) + + +class Accelerator(SimpleNamespace): + model: str + memory_size: float + + def __gt__(self, other): + return self.memory_size > other.memory_size + + def __eq__(self, other): + return self.memory_size == other.memory_size + + +class DeploymentTarget(SimpleNamespace): + source: str = "local" + name: str = "local" + price: str = 
def run_command(
    cmd,
    cwd=None,
    env=None,
    copy_env=True,
    venv=None,
    silent=False,
) -> subprocess.CompletedProcess:
    """Run a command synchronously, echoing a shell-like transcript first.

    Args:
        cmd: argument list; every item is converted with ``str``. A leading
            ``"bentoml"`` is rewritten to ``<python> -m bentoml`` and a
            leading ``"python"`` to the selected interpreter.
        cwd: working directory for the child process.
        env: extra environment variables for the child process.
        copy_env: when true, ``env`` is layered on top of ``os.environ``;
            otherwise the child only sees ``env``.
        venv: path of a virtual environment whose interpreter should be used
            instead of ``sys.executable``.
        silent: suppress the transcript and discard the child's output.

    Returns:
        The ``CompletedProcess`` of the finished command.

    Raises:
        typer.Exit: with code 1 when the command exits with a non-zero status.
    """
    import shlex

    env = env or {}
    cmd = [str(c) for c in cmd]
    bin_dir = "Scripts" if os.name == "nt" else "bin"
    if not silent:
        output("\n")
        if cwd:
            output(f"$ cd {cwd}", style="orange")
        if env:
            for k, v in env.items():
                output(f"$ export {k}={shlex.quote(v)}", style="orange")
        if venv:
            output(f"$ source {venv / 'bin' / 'activate'}", style="orange")
        output(f"$ {' '.join(cmd)}", style="orange")

    if venv:
        py = venv / bin_dir / f"python{sysconfig.get_config_var('EXE')}"
    else:
        py = sys.executable

    if copy_env:
        env = {**os.environ, **env}

    if cmd and cmd[0] == "bentoml":
        cmd = [py, "-m", "bentoml"] + cmd[1:]
    if cmd and cmd[0] == "python":
        cmd = [py] + cmd[1:]

    # Silent runs throw the child's output away; otherwise it is inherited.
    stdio = (
        {"stdout": subprocess.DEVNULL, "stderr": subprocess.DEVNULL}
        if silent
        else {}
    )
    try:
        # check=True is what makes a non-zero exit status raise
        # CalledProcessError; without it the handler below never ran and
        # failed commands looked like successes to the caller.
        return subprocess.run(  # type: ignore
            cmd,
            cwd=cwd,
            env=env,
            check=True,
            **stdio,
        )
    except subprocess.CalledProcessError:
        output("Command failed", style="red")
        raise typer.Exit(1)
def md5(*strings: str) -> int:
    """Return the MD5 digest of the concatenated strings as an integer.

    The strings are encoded with the default ``str.encode`` codec and fed to
    the hash back to back, so ``md5("a", "bc") == md5("abc")``.
    """
    payload = b"".join(part.encode() for part in strings)
    digest = hashlib.md5(payload).digest()
    # Big-endian interpretation of the raw digest equals int(hexdigest, 16).
    return int.from_bytes(digest, "big")
cwd=cwd, + venv=venv, + silent=False, + ) as server_proc: + + output(f"Model server started {server_proc.pid}") + + stdout_streamer = None + stderr_streamer = None + start_time = time.time() + + output("Model loading...", style="green") + for _ in range(timeout): + try: + resp = httpx.get(f"http://localhost:{port}/readyz", timeout=3) + if resp.status_code == 200: + break + except httpx.RequestError: + if time.time() - start_time > 30: + if not stdout_streamer: + stdout_streamer = asyncio.create_task( + stream_command_output(server_proc.stdout, style="gray") + ) + if not stderr_streamer: + stderr_streamer = asyncio.create_task( + stream_command_output(server_proc.stderr, style="#BD2D0F") + ) + await asyncio.sleep(1) + else: + output("Model failed to load", style="red") + server_proc.terminate() + return + + if stdout_streamer: + stdout_streamer.cancel() + if stderr_streamer: + stderr_streamer.cancel() + + output("Model is ready", style="green") + messages: list[dict[str, str]] = [] + + from openai import AsyncOpenAI + + client = AsyncOpenAI(base_url=f"http://localhost:{port}/v1", api_key="local") + model_id = (await client.models.list()).data[0].id + while True: + try: + message = input("user: ") + if message == "": + output("empty message, please enter something", style="yellow") + continue + messages.append(dict(role="user", content=message)) + output("assistant: ", end="", style="lightgreen") + assistant_message = "" + stream = await client.chat.completions.create( + model=model_id, + messages=messages, # type: ignore + stream=True, + ) + async for chunk in stream: + text = chunk.choices[0].delta.content or "" + assistant_message += text + output(text, end="", style="lightgreen") + messages.append(dict(role="assistant", content=assistant_message)) + output("") + except KeyboardInterrupt: + break + output("\nStopping model server...", style="green") + output("Stopped model server", style="green") + + +def run(bento: BentoInfo, port: int = 3000, timeout: int = 
def ensure_bento(
    model: str,
    target: Optional[DeploymentTarget] = None,
    repo_name: Optional[str] = None,
) -> BentoInfo:
    """Resolve a model tag to exactly one bento.

    Args:
        model: tag passed to ``list_bento`` (``name`` or ``name:version``).
        target: deployment target used to narrow several matches down to the
            ones that can run on it.
        repo_name: restrict the lookup to this repo.

    Returns:
        The single matching bento.

    Raises:
        typer.Exit: with code 1 when nothing matches, when several bentos
            match and no target is given, when none of the matches can run
            on the target, or when more than one of them can.
    """
    bentos = list_bento(model, repo_name=repo_name)
    if len(bentos) == 0:
        output(f"No model found for {model}", style="red")
        raise typer.Exit(1)

    if len(bentos) == 1:
        only = bentos[0]
        # A unique match is always returned; it is announced when FORCE is
        # set, or when a target is given and the bento can run on it.
        if FORCE.get() or (target is not None and can_run(only, target) > 0):
            output(f"Found model {only}", style="green")
        return only

    if target is None:
        output(
            f"Multiple models match {model}, did you mean one of these?",
            style="red",
        )
        for bento in bentos:
            output(f" {bento}")
        raise typer.Exit(1)

    filtered = [bento for bento in bentos if can_run(bento, target) > 0]
    if len(filtered) == 0:
        output(f"No deployment target found for {model}", style="red")
        raise typer.Exit(1)

    # Decide on the candidates that fit the target. Testing the unfiltered
    # list here made this branch fire every time, so the filter had no effect
    # and the final return was unreachable.
    if len(filtered) > 1:
        output(
            f"Multiple models match {model}, did you mean one of these?",
            style="red",
        )
        for bento in filtered:
            output(f" {bento}")
        raise typer.Exit(1)

    return filtered[0]
@app.command()
def update():
    """Re-fetch every configured repo and refresh the local repo cache.

    For each repo in the config the cached checkout is removed and cloned
    again (shallow, at the configured branch). Cache directories that no
    longer belong to a configured repo are deleted, the ``last_update``
    timestamp is rewritten, and alias files are regenerated for every repo.
    A repo that fails to clone or pull is reported and skipped.
    """
    import dulwich.porcelain

    config = load_config()
    repos_in_use = set()
    for repo_name, repo in config.repos.items():
        repo = parse_repo_url(repo, repo_name)
        repos_in_use.add((repo.server, repo.owner, repo.repo))
        remote_url = f"https://{repo.server}/{repo.owner}/{repo.repo}.git"
        if repo.path.exists():  # TODO: use update instead of remove and clone
            shutil.rmtree(repo.path, ignore_errors=True)
        if not repo.path.exists():
            repo.path.parent.mkdir(parents=True, exist_ok=True)
            try:
                dulwich.porcelain.clone(
                    remote_url,
                    str(repo.path),
                    checkout=True,
                    depth=1,
                    branch=repo.branch,
                )
                output("")
                output(f"Repo `{repo.name}` updated", style="green")
            # Exception rather than a bare except, so Ctrl-C and SystemExit
            # still abort the whole update.
            except Exception as e:
                shutil.rmtree(repo.path, ignore_errors=True)
                output(f"Failed to clone repo {repo.name}", style="red")
                output(f"Error: {e}", style="red", level=20)
        else:
            # Only reached when the removal above could not delete the
            # checkout; fall back to pulling into it.
            try:
                dulwich.porcelain.pull(
                    str(repo.path),
                    remote_url,
                    refspecs=repo.branch,
                    force=True,
                )
                dulwich.porcelain.clean(str(repo.path), str(repo.path))
                output("")
                output(f"Repo `{repo.name}` updated", style="green")
            except Exception as e:
                shutil.rmtree(repo.path, ignore_errors=True)
                output(f"Failed to update repo {repo.name}", style="red")
                output(f"Error: {e}", style="red", level=20)
    # Cache layout is REPO_DIR/<server>/<owner>/<repo>; drop anything that is
    # not backed by a configured repo.
    for c in REPO_DIR.glob("*/*/*"):
        repo_spec = tuple(c.parts[-3:])
        if repo_spec not in repos_in_use:
            shutil.rmtree(c, ignore_errors=True)
            output(f"Removed unused repo cache {c}")
    with open(REPO_DIR / "last_update", "w") as f:
        f.write(datetime.datetime.now().isoformat())
    for repo_name in config.repos:
        _complete_alias(repo_name)
# git+https://<server>/<owner>/<repo>[@<branch>]
GIT_REPO_RE = re.compile(
    r"git\+https://(?P<server>.+)/(?P<owner>.+)/(?P<repo>.+?)(@(?P<branch>.+))?$"
)


def parse_repo_url(repo_url, repo_name=None) -> RepoInfo:
    """
    Parse a git repo url into server, owner, repo name and branch.

    The branch defaults to ``main`` when the url carries no ``@<branch>``
    suffix, and the local checkout path is ``REPO_DIR/<server>/<owner>/<repo>``.

    >>> info = parse_repo_url("git+https://github.com/bentoml/bentovllm@main")
    >>> (info.server, info.owner, info.repo, info.branch)
    ('github.com', 'bentoml', 'bentovllm', 'main')

    >>> info = parse_repo_url("git+https://github.com/bentoml/bentovllm")
    >>> (info.server, info.owner, info.repo, info.branch)
    ('github.com', 'bentoml', 'bentovllm', 'main')

    Args:
        repo_url: url of the form ``git+https://<server>/<owner>/<repo>[@<branch>]``.
        repo_name: name to record for the repo; defaults to the ``<repo>``
            part of the url.

    Raises:
        ValueError: if ``repo_url`` does not match the expected form.
    """
    match = GIT_REPO_RE.match(repo_url)
    if not match:
        raise ValueError(f"Invalid git repo url: {repo_url}")
    server = match.group("server")
    owner = match.group("owner")
    repo = match.group("repo")
    branch = match.group("branch") or "main"
    path = REPO_DIR / server / owner / repo
    return RepoInfo(
        name=repo if repo_name is None else repo_name,
        url=repo_url,
        server=server,
        owner=owner,
        repo=repo,
        branch=branch,
        path=path,
    )
+ ).ask() + if not override: + return + + config.repos[name] = repo + save_config(config) + output(f"Repo {name} added", style="green") + + +if __name__ == "__main__": + app() diff --git a/openllm_next/venv.py b/openllm_next/venv.py new file mode 100644 index 00000000..0e7e5124 --- /dev/null +++ b/openllm_next/venv.py @@ -0,0 +1,164 @@ +import functools +import os +import pathlib +import shutil +import typing +from typing import Iterable + +import typer + +from openllm_next.common import ( + VENV_DIR, + VERBOSE_LEVEL, + BentoInfo, + VenvSpec, + output, + run_command, +) + + +@functools.lru_cache +def _resolve_packages(requirement: typing.Union[pathlib.Path, str]): + from pip_requirements_parser import RequirementsFile + + requirements_txt = RequirementsFile.from_file( + str(requirement), + include_nested=True, + ) + return requirements_txt.requirements + + +def _filter_preheat_packages(requirements: Iterable) -> list[str]: + PREHEAT_PIP_PACKAGES = ["torch", "vllm"] + + deps: list[str] = [] + for req in requirements: + if ( + req.is_editable + or req.is_local_path + or req.is_url + or req.is_wheel + or not req.name + or not req.specifier + ): + continue + for sp in req.specifier: + if sp.operator == "==" and req.name in PREHEAT_PIP_PACKAGES: + assert req.line is not None + deps.append(req.line) + break + return deps + + +@functools.lru_cache +def _resolve_bento_env_specs(bento: BentoInfo): + ver_file = bento.path / "env" / "python" / "version.txt" + assert ver_file.exists(), f"cannot find version file in {bento.path}" + + lock_file = bento.path / "env" / "python" / "requirements.lock.txt" + if not lock_file.exists(): + lock_file = bento.path / "env" / "python" / "requirements.txt" + + reqs = _resolve_packages(lock_file) + preheat_packages = _filter_preheat_packages(reqs) + ver = ver_file.read_text().strip() + return ( + VenvSpec( + python_version=ver, + python_packages=preheat_packages, + name_prefix=f"{bento.tag.replace(':', '_')}-1-", + ), + VenvSpec( + 
python_version=ver, + python_packages=[v.line for v in reqs], + name_prefix=f"{bento.tag.replace(':', '_')}-2-", + ), + ) + + +def _get_lib_dir(venv: pathlib.Path) -> pathlib.Path: + if os.name == "nt": + return venv / "Lib/site-packages" + else: + return next(venv.glob("lib/python*")) / "site-packages" + + +def _ensure_venv( + env_spec: VenvSpec, + parrent_venv: typing.Optional[pathlib.Path] = None, +) -> pathlib.Path: + venv = VENV_DIR / str(hash(env_spec)) + if venv.exists() and not (venv / "DONE").exists(): + shutil.rmtree(venv, ignore_errors=True) + if not venv.exists(): + output(f"Installing model dependencies({venv})...", style="green") + + venv_py = ( + venv / "Scripts" / "python.exe" + if os.name == "nt" + else venv / "bin" / "python" + ) + try: + run_command( + ["python", "-m", "uv", "venv", venv], + silent=VERBOSE_LEVEL.get() < 10, + ) + lib_dir = _get_lib_dir(venv) + if parrent_venv is not None: + parent_lib_dir = _get_lib_dir(parrent_venv) + with open(lib_dir / f"{parrent_venv.name}.pth", "w+") as f: + f.write(str(parent_lib_dir)) + with open(venv / "requirements.txt", "w") as f: + f.write("\n".join(sorted(env_spec.python_packages))) + run_command( + [ + "python", + "-m", + "uv", + "pip", + "install", + "-p", + str(venv_py), + "-r", + venv / "requirements.txt", + ], + silent=VERBOSE_LEVEL.get() < 10, + ) + with open(venv / "DONE", "w") as f: + f.write("DONE") + except Exception: + shutil.rmtree(venv, ignore_errors=True) + output( + f"Failed to install dependencies to {venv}. 
Cleaned up.", + style="red", + ) + raise typer.Exit(1) + output(f"Successfully installed dependencies to {venv}.", style="green") + return venv + else: + return venv + + +def _ensure_venvs(env_spec_list: Iterable[VenvSpec]) -> pathlib.Path: + last_venv = None + for env_spec in env_spec_list: + last_venv = _ensure_venv(env_spec, last_venv) + assert last_venv is not None + return last_venv + + +def ensure_venv(bento: BentoInfo) -> pathlib.Path: + return _ensure_venvs(_resolve_bento_env_specs(bento)) + + +def _check_venv(env_spec: VenvSpec) -> bool: + venv = VENV_DIR / str(hash(env_spec)) + if not venv.exists(): + return False + if venv.exists() and not (venv / "DONE").exists(): + return False + return True + + +def check_venv(bento: BentoInfo) -> bool: + return all(_check_venv(env_spec) for env_spec in _resolve_bento_env_specs(bento)) diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..95968631 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,34 @@ +[build-system] +requires = ["setuptools>=42", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "openllm-next" +version = "0.0.1" +description = "A description of your package." +authors = [{name = "oasiszero", email = "oasis0.com@gmail.com"}] +license = {file = "LICENSE"} +dependencies = [ + "bentoml", + "typer", + "questionary", + "pyaml", + "psutil", + "pathlib", + "pip_requirements_parser", + "nvidia-ml-py", + "dulwich", + "tabulate", + "uv", + "openai==1.35.9", +] + +[project.scripts] +openllm = "openllm_next.__main__:main" + +[tool.typer] +src-dir = "openllm_next" + +[tool.isort] +multi_line_output = 3 +include_trailing_comma = true