From 58fa8a70cb39a65ca478b778fe058da9bafbe308 Mon Sep 17 00:00:00 2001 From: bojiang Date: Sat, 18 May 2024 12:41:54 +0800 Subject: [PATCH] feat: repo/model/serve --- .gitignore | 161 ++++++++++++ LICENSE | 201 +++++++++++++++ README.md | 150 +++++++++++ cllama/__init__.py | 0 cllama/__main__.py | 295 +++++++++++++++++++++ cllama/aws.py | 630 +++++++++++++++++++++++++++++++++++++++++++++ cllama/spec.py | 11 + pyproject.toml | 22 ++ req.txt | 7 + 9 files changed, 1477 insertions(+) create mode 100644 .gitignore create mode 100644 LICENSE create mode 100644 README.md create mode 100644 cllama/__init__.py create mode 100644 cllama/__main__.py create mode 100644 cllama/aws.py create mode 100644 cllama/spec.py create mode 100644 pyproject.toml create mode 100644 req.txt diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..7751b092 --- /dev/null +++ b/.gitignore @@ -0,0 +1,161 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ +*.whl diff --git a/LICENSE b/LICENSE new file mode 100644 index 00000000..261eeb9e --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/README.md b/README.md new file mode 100644 index 00000000..ceb0ae1c --- /dev/null +++ b/README.md @@ -0,0 +1,150 @@ +
+# Self-host LLMs with vLLM and BentoML
+
+This is a BentoML example project, showing you how to serve and deploy open-source Large Language Models using [vLLM](https://vllm.ai), a high-throughput and memory-efficient inference engine.
+
+See [here](https://github.com/bentoml/BentoML?tab=readme-ov-file#%EF%B8%8F-what-you-can-build-with-bentoml) for a full list of BentoML example projects.
+
+💡 This example serves as a basis for advanced code customization, such as custom models, inference logic, or vLLM options. For simple LLM hosting with an OpenAI-compatible endpoint, without writing any code, see [OpenLLM](https://github.com/bentoml/OpenLLM).
+
+## Prerequisites
+
+- You have installed Python 3.8+ and `pip`. See the [Python downloads page](https://www.python.org/downloads/) to learn more.
+- You have a basic understanding of key concepts in BentoML, such as Services. We recommend you read [Quickstart](https://docs.bentoml.com/en/1.2/get-started/quickstart.html) first.
+- If you want to test the Service locally, you need an Nvidia GPU with at least 16 GB of VRAM.
+- (Optional) We recommend you create a virtual environment for dependency isolation for this project. See the [Conda documentation](https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html) or the [Python documentation](https://docs.python.org/3/library/venv.html) for details.
+
+## Install dependencies
+
+```bash
+git clone https://github.com/bentoml/BentoVLLM.git
+cd BentoVLLM/mistral-7b-instruct
+pip install -r requirements.txt && pip install -U "pydantic>=2.0"
+```
+
+## Run the BentoML Service
+
+We have defined a BentoML Service in `service.py`. Run `bentoml serve` in your project directory to start the Service.
+
+```bash
+$ bentoml serve .
+
+2024-01-18T07:51:30+0800 [INFO] [cli] Starting production HTTP BentoServer from "service:VLLM" listening on http://localhost:3000 (Press CTRL+C to quit)
+INFO 01-18 07:51:40 model_runner.py:501] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
+INFO 01-18 07:51:40 model_runner.py:505] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode.
+INFO 01-18 07:51:46 model_runner.py:547] Graph capturing finished in 6 secs.
+```
+
+The server is now active at [http://localhost:3000](http://localhost:3000/). You can interact with it using the Swagger UI or in other ways:
+
+### CURL
+
+```bash
+curl -X 'POST' \
+  'http://localhost:3000/generate' \
+  -H 'accept: text/event-stream' \
+  -H 'Content-Type: application/json' \
+  -d '{
+    "prompt": "Explain superconductors like I'\''m five years old",
+    "tokens": null
+}'
+```
+
+### Python client
+
+```python
+import bentoml
+
+with bentoml.SyncHTTPClient("http://localhost:3000") as client:
+    response_generator = client.generate(
+        prompt="Explain superconductors like I'm five years old",
+        tokens=None
+    )
+    for response in response_generator:
+        print(response)
+```
+
+### OpenAI-compatible endpoints
+
+This Service uses the `@openai_endpoints` decorator to set up OpenAI-compatible endpoints (`chat/completions` and `completions`). This means your client can interact with the backend Service (in this case, the VLLM class) as if it were communicating directly with OpenAI's API. This [utility](mistral-7b-instruct/bentovllm_openai/) does not affect your BentoML Service code, and you can use it for other LLMs as well.
+
+```python
+from openai import OpenAI
+
+client = OpenAI(base_url='http://localhost:3000/v1', api_key='na')
+
+# List the available models
+client.models.list()
+
+chat_completion = client.chat.completions.create(
+    model="mistralai/Mistral-7B-Instruct-v0.2",
+    messages=[
+        {
+            "role": "user",
+            "content": "Explain superconductors like I'm five years old"
+        }
+    ],
+    stream=True,
+)
+for chunk in chat_completion:
+    # Extract and print the content of the model's reply
+    print(chunk.choices[0].delta.content or "", end="")
+```
+
+**Note**: If your Service is deployed with [protected endpoints on BentoCloud](https://docs.bentoml.com/en/latest/bentocloud/how-tos/manage-access-token.html#access-protected-deployments), you need to set the environment variable `OPENAI_API_KEY` to your BentoCloud API key first.
+
+```bash
+export OPENAI_API_KEY={YOUR_BENTOCLOUD_API_TOKEN}
+```
+
+You can then use the following line to replace the client in the above code snippet. Refer to [Obtain the endpoint URL](https://docs.bentoml.com/en/latest/bentocloud/how-tos/call-deployment-endpoints.html#obtain-the-endpoint-url) to retrieve the endpoint URL.
+
+```python
+client = OpenAI(base_url='your_bentocloud_deployment_endpoint_url/v1')
+```
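+
+Besides `chat/completions`, the plain `completions` endpoint is exposed as well. A minimal sketch, reusing the `client` configured above (the prompt and `max_tokens` values are illustrative):
+
+```python
+completion = client.completions.create(
+    model="mistralai/Mistral-7B-Instruct-v0.2",
+    prompt="Explain superconductors like I'm five years old",
+    max_tokens=256,
+)
+print(completion.choices[0].text)
+```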
+
+For detailed explanations of the Service code, see [vLLM inference](https://docs.bentoml.org/en/latest/use-cases/large-language-models/vllm.html).
+
+## Deploy to BentoCloud
+
+After the Service is ready, you can deploy the application to BentoCloud for better management and scalability. [Sign up](https://www.bentoml.com/) if you haven't got a BentoCloud account.
+
+Make sure you have [logged in to BentoCloud](https://docs.bentoml.com/en/latest/bentocloud/how-tos/manage-access-token.html), then run the following command to deploy it.
+
+```bash
+bentoml deploy .
+```
+
+Once the application is up and running on BentoCloud, you can access it via the exposed URL.
+
+**Note**: For custom deployment in your own infrastructure, use [BentoML to generate an OCI-compliant image](https://docs.bentoml.com/en/latest/guides/containerization.html).
+
+## Different LLM Models
+
+Besides the mistral-7b-instruct model, we have examples for other models in subdirectories of this repository. Below is a list of these models and links to the example subdirectories.
+
+- [Mistral-7B-Instruct-v0.2](mistral-7b-instruct/)
+- [Mixtral-8x7B-Instruct-v0.1 with GPTQ quantization](mistral-7b-instruct/)
+- [Llama-2-7b-chat-hf](llama2-7b-chat/)
+- [SOLAR-10.7B-v1.0](solar-10.7b-instruct/)
+
+## LLM tools integration examples
+
+- Every model directory contains code to add OpenAI-compatible endpoints to the BentoML Service.
+- [outlines-integration/](outlines-integration/) contains the code to integrate with [outlines](https://github.com/outlines-dev/outlines) for structured generation.
diff --git a/cllama/__init__.py b/cllama/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/cllama/__main__.py b/cllama/__main__.py
new file mode 100644
index 00000000..9c08f408
--- /dev/null
+++ b/cllama/__main__.py
@@ -0,0 +1,295 @@
+import typer
+import shlex
+import os
+from typing_extensions import TypedDict
+import collections
+
+import prompt_toolkit
+import shutil
+import pydantic
+import yaml
+import json
+import questionary
+import re
+import subprocess
+import pyaml
+import pathlib
+from cllama.spec import GPU_MEMORY
+
+
+ERROR_STYLE = "red"
+SUCCESS_STYLE = "green"
+
+
+CLLAMA_HOME = pathlib.Path.home() / ".openllm_next"
+REPO_DIR = CLLAMA_HOME / "repos"
+TEMP_DIR = CLLAMA_HOME / "temp"
+VENV_DIR = CLLAMA_HOME / "venv"
+
+REPO_DIR.mkdir(exist_ok=True, parents=True)
+TEMP_DIR.mkdir(exist_ok=True, parents=True)
+VENV_DIR.mkdir(exist_ok=True, parents=True)
+
+CONFIG_FILE = CLLAMA_HOME / "config.json"
+
+
+app = typer.Typer()
+repo_app = typer.Typer()
+model_app = typer.Typer()
+
+app.add_typer(repo_app, name="repo")
+app.add_typer(model_app, name="model")
+
+
+class Config(pydantic.BaseModel):
+    repos: dict[str, str] = {
+        "default": "git+https://github.com/bojiang/bentovllm@main#subdirectory=bentoml"
+    }
+    default_repo: str = "default"
+
+
+def _load_config():
+    if CONFIG_FILE.exists():
+        with open(CONFIG_FILE) as f:
+            return Config(**json.load(f))
+    return Config()
+
+
+def _save_config(config):
+    with open(CONFIG_FILE, "w") as f:
+        json.dump(config.dict(), f, indent=2)
+
+
+class RepoInfo(TypedDict):
+    name: str
+    path: str
+    url: str
+    server: str
+    owner: str
+    repo: str
+    branch: str
+
+
+class ModelInfo(TypedDict):
+    repo: RepoInfo
+    path: str
+
+
+class BentoInfo(TypedDict):
+    model: ModelInfo
+    bento_yaml: dict
+
+
+def _load_model_map() -> dict[str, dict[str, ModelInfo]]:
+    model_map = collections.defaultdict(dict)
+    config = _load_config()
+    for repo_name, repo_url in config.repos.items():
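+        # A repo checkout lives under REPO_DIR/<server>/<owner>/<repo>. Every
+        # directory under bentoml/bentos/<bento>/<version> is a packaged bento;
+        # a plain file at that path is an alias whose content names the real
+        # version directory next to it.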
+        server, owner, repo, branch = _parse_repo_url(repo_url)
+        repo_dir = REPO_DIR / server / owner / repo
+        for path in repo_dir.glob("bentoml/bentos/*/*"):
+            if path.is_dir():
+                model_map[path.parent.name][path.name] = ModelInfo(
+                    repo=RepoInfo(
+                        name=repo_name,
+                        url=repo_url,
+                        server=server,
+                        owner=owner,
+                        repo=repo,
+                        branch=branch,
+                        path=str(repo_dir),
+                    ),
+                    path=str(path),
+                )
+            elif path.is_file():
+                with open(path) as f:
+                    origin_name = f.read().strip()
+                origin_path = path.parent / origin_name
+                model_map[path.parent.name][path.name] = ModelInfo(
+                    repo=RepoInfo(
+                        name=repo_name,
+                        url=repo_url,
+                        server=server,
+                        owner=owner,
+                        repo=repo,
+                        branch=branch,
+                        path=str(repo_dir),
+                    ),
+                    path=str(origin_path),
+                )
+    return model_map
+
+
+# The optional trailing "#subdirectory=..." fragment must not leak into the
+# branch group, otherwise `git clone --branch` fails on the default repo URL.
+GIT_REPO_RE = re.compile(
+    r"git\+https://(?P<server>[^/]+)/(?P<owner>[^/]+)/(?P<repo>[^@#]+)"
+    r"(@(?P<branch>[^#]+))?(#.*)?$"
+)
+
+
+@repo_app.command(name="list")
+def repo_list():
+    config = _load_config()
+    pyaml.pprint(config.repos)
+
+
+def _parse_repo_url(repo_url):
+    """
+    Parse a git repo url into (server, owner, repo name, branch).
+    >>> _parse_repo_url("git+https://github.com/bojiang/bentovllm@main")
+    ('github.com', 'bojiang', 'bentovllm', 'main')
+
+    >>> _parse_repo_url("git+https://github.com/bojiang/bentovllm")
+    ('github.com', 'bojiang', 'bentovllm', 'main')
+
+    >>> _parse_repo_url("git+https://github.com/bojiang/bentovllm@main#subdirectory=bentoml")
+    ('github.com', 'bojiang', 'bentovllm', 'main')
+    """
+    match = GIT_REPO_RE.match(repo_url)
+    if not match:
+        raise ValueError(f"Invalid git repo url: {repo_url}")
+    return (
+        match.group("server"),
+        match.group("owner"),
+        match.group("repo"),
+        match.group("branch") or "main",
+    )
+
+
+@repo_app.command(name="add")
+def repo_add(name: str, repo: str):
+    name = name.lower()
+    if not name.isidentifier():
+        questionary.print(
+            f"Invalid repo name: {name}, should only contain letters, numbers and underscores",
+            style=ERROR_STYLE,
+        )
+        return
+
+    config = _load_config()
+    if name in config.repos:
+        override = questionary.confirm(
+            f"Repo {name} already exists ({config.repos[name]}), override?"
+ ).ask() + if not override: + return + + config.repos[name] = repo + _save_config(config) + pyaml.pprint(config.repos) + + +@repo_app.command(name="remove") +def repo_remove(name: str): + config = _load_config() + if name not in config.repos: + questionary.print(f"Repo {name} does not exist", style=ERROR_STYLE) + return + + del config.repos[name] + _save_config(config) + pyaml.pprint(config.repos) + + +def _run_command(cmd, cwd=None, env=None, copy_env=True): + questionary.print("\n") + env = env or {} + if cwd: + questionary.print(f"$ cd {cwd}", style="bold") + if env: + for k, v in env.items(): + questionary.print(f"$ export {k}={shlex.quote(v)}", style="bold") + if copy_env: + env = {**os.environ, **env} + questionary.print(f"$ {' '.join(cmd)}", style="bold") + try: + subprocess.run(cmd, cwd=cwd, env=env, check=True) + except subprocess.CalledProcessError: + questionary.print("Command failed", style=ERROR_STYLE) + return + + +@repo_app.command(name="update") +def repo_update(): + config = _load_config() + repos_in_use = set() + for name, repo in config.repos.items(): + server, owner, repo_name, branch = _parse_repo_url(repo) + repos_in_use.add((server, owner, repo_name)) + repo_dir = REPO_DIR / server / owner / repo_name + if not repo_dir.exists(): + repo_dir.parent.mkdir(parents=True, exist_ok=True) + try: + cmd = [ + "git", + "clone", + "--branch", + branch, + f"https://{server}/{owner}/{repo_name}.git", + str(repo_dir), + ] + _run_command(cmd) + except subprocess.CalledProcessError: + shutil.rmtree(repo_dir, ignore_errors=True) + questionary.print(f"Failed to clone repo {name}", style=ERROR_STYLE) + else: + try: + cmd = ["git", "fetch", "origin", branch] + _run_command(cmd, cwd=repo_dir) + cmd = ["git", "reset", "--hard", f"origin/{branch}"] + _run_command(cmd, cwd=repo_dir) + except: + shutil.rmtree(repo_dir, ignore_errors=True) + questionary.print(f"Failed to update repo {name}", style=ERROR_STYLE) + for repo_dir in REPO_DIR.glob("*/*/*"): + if tuple(repo_dir.parts[-3:]) not in repos_in_use: + shutil.rmtree(repo_dir, ignore_errors=True) + questionary.print(f"Removed unused repo {repo_dir}") + questionary.print("Repos updated", style=SUCCESS_STYLE) + + +@model_app.command(name="list") +def model_list(): + pyaml.pprint(_load_model_map()) + + +def _get_bento_info(tag): + model_map = _load_model_map() + bento, version = tag.split(":") + if bento not in model_map or version not in model_map[bento]: + questionary.print(f"Model {tag} not found", style=ERROR_STYLE) + return + model_info = model_map[bento][version] + path = pathlib.Path(model_info["path"]) + + bento_file = path / "bento.yaml" + bento_info = yaml.safe_load(bento_file.read_text()) + return BentoInfo( + model=model_info, + bento_yaml=bento_info, + ) + + +@model_app.command(name="get") +def model_get(tag: str): + bento_info = _get_bento_info(tag) + if bento_info: + pyaml.pprint(bento_info) + + +def _serve_model(model: str): + if ":" not in model: + model = f"{model}:latest" + bento_info = _get_bento_info(model) + if not bento_info: + questionary.print(f"Model {model} not found", style=ERROR_STYLE) + return + cmd = ["bentoml", "serve", model] + env = { + "CLLAMA_MODEL": model, + "BENTOML_HOME": bento_info["model"]["repo"]["path"] + "/bentoml", + } + _run_command(cmd, env=env) + + +@app.command() +def serve(model: str): + _serve_model(model) + + +if __name__ == "__main__": + app() diff --git a/cllama/aws.py b/cllama/aws.py new file mode 100644 index 00000000..27d86ba9 --- /dev/null +++ b/cllama/aws.py @@ -0,0 +1,630 @@ +import 
typer +import typing +import collections + +import prompt_toolkit +from prompt_toolkit import print_formatted_text as print +import time +import uuid +import shutil +import pydantic +from urllib.parse import urlparse +import yaml +import json +import bentoml +import questionary +import os +import re +import subprocess +import pyaml +import pathlib +from cllama.spec import GPU_MEMORY + +ERROR_STYLE = "red" +SUCCESS_STYLE = "green" + + +CLLAMA_HOME = pathlib.Path.home() / ".openllm_next" +REPO_DIR = CLLAMA_HOME / "repos" +TEMP_DIR = CLLAMA_HOME / "temp" +VENV_DIR = CLLAMA_HOME / "venv" + +REPO_DIR.mkdir(exist_ok=True, parents=True) +TEMP_DIR.mkdir(exist_ok=True, parents=True) +VENV_DIR.mkdir(exist_ok=True, parents=True) + +CONFIG_FILE = CLLAMA_HOME / "config.json" + + +app = typer.Typer() +repo_app = typer.Typer() +model_app = typer.Typer() + +app.add_typer(repo_app, name="repo") +app.add_typer(model_app, name="model") + + +class Config(pydantic.BaseModel): + repos: dict[str, str] = { + "default": "git+https://github.com/bojiang/bentovllm@main#subdirectory=bentoml" + } + default_repo: str = "default" + + +def _load_config(): + if CONFIG_FILE.exists(): + with open(CONFIG_FILE) as f: + return Config(**json.load(f)) + return Config() + + +def _save_config(config): + with open(CONFIG_FILE, "w") as f: + json.dump(config.dict(), f, indent=2) + + +class ModelInfo(typing.TypedDict): + repo: str + path: str + + +def _load_model_map() -> dict[str, dict[str, ModelInfo]]: + model_map = collections.defaultdict(dict) + config = _load_config() + for repo_name, repo_url in config.repos.items(): + server, owner, repo, _ = _parse_repo_url(repo_url) + repo_dir = REPO_DIR / server / owner / repo + for path in repo_dir.glob("bentoml/bentos/*/*"): + if path.is_dir(): + model_map[path.parent.name][path.name] = ModelInfo( + repo=repo_name, + path=str(path), + ) + elif path.is_file(): + with open(path) as f: + origin_name = f.read().strip() + origin_path = path.parent / origin_name + model_map[path.parent.name][path.name] = ModelInfo( + repo=repo_name, + path=str(origin_path), + ) + return model_map + + +GIT_REPO_RE = re.compile( + r"git\+https://(?P.+)/(?P.+)/(?P.+?)(@(?P.+))?$" +) + + +@repo_app.command(name="list") +def repo_list(): + config = _load_config() + pyaml.pprint(config.repos) + + +def _parse_repo_url(repo_url): + """ + parse the git repo url to server, owner, repo name, branch + >>> _parse_repo_url("git+https://github.com/bojiang/bentovllm@main") + ('github.com', 'bojiang', 'bentovllm', 'main') + + >>> _parse_repo_url("git+https://github.com/bojiang/bentovllm") + ('github.com', 'bojiang', 'bentovllm', 'main') + """ + match = GIT_REPO_RE.match(repo_url) + if not match: + raise ValueError(f"Invalid git repo url: {repo_url}") + return ( + match.group("server"), + match.group("owner"), + match.group("repo"), + match.group("branch") or "main", + ) + + +@repo_app.command(name="add") +def repo_add(name: str, repo: str): + name = name.lower() + if not name.isidentifier(): + questionary.print( + f"Invalid repo name: {name}, should only contain letters, numbers and underscores", + style=ERROR_STYLE, + ) + return + + config = _load_config() + if name in config.repos: + override = questionary.confirm( + f"Repo {name} already exists({config.repos[name]}), override?" 
+ ).ask() + if not override: + return + + config.repos[name] = repo + _save_config(config) + pyaml.pprint(config.repos) + + +@repo_app.command(name="remove") +def repo_remove(name: str): + config = _load_config() + if name not in config.repos: + questionary.print(f"Repo {name} does not exist", style=ERROR_STYLE) + return + + del config.repos[name] + _save_config(config) + pyaml.pprint(config.repos) + + +def _run_command(cmd, cwd=None): + questionary.print(f"\n$ {' '.join(cmd)}", style="bold") + subprocess.run(cmd, cwd=cwd, check=True) + + +@repo_app.command(name="update") +def repo_update(): + config = _load_config() + repos_in_use = set() + for name, repo in config.repos.items(): + server, owner, repo_name, branch = _parse_repo_url(repo) + repos_in_use.add((server, owner, repo_name)) + repo_dir = REPO_DIR / server / owner / repo_name + if not repo_dir.exists(): + repo_dir.parent.mkdir(parents=True, exist_ok=True) + try: + cmd = [ + "git", + "clone", + "--branch", + branch, + f"https://{server}/{owner}/{repo_name}.git", + str(repo_dir), + ] + _run_command(cmd) + except subprocess.CalledProcessError: + shutil.rmtree(repo_dir, ignore_errors=True) + questionary.print(f"Failed to clone repo {name}", style=ERROR_STYLE) + else: + try: + cmd = ["git", "fetch", "origin", branch] + _run_command(cmd, cwd=repo_dir) + cmd = ["git", "reset", "--hard", f"origin/{branch}"] + _run_command(cmd, cwd=repo_dir) + except: + shutil.rmtree(repo_dir, ignore_errors=True) + questionary.print(f"Failed to update repo {name}", style=ERROR_STYLE) + for repo_dir in REPO_DIR.glob("*/*/*"): + if tuple(repo_dir.parts[-3:]) not in repos_in_use: + shutil.rmtree(repo_dir, ignore_errors=True) + questionary.print(f"Removed unused repo {repo_dir}") + questionary.print("Repos updated", style=SUCCESS_STYLE) + + +@model_app.command(name="list") +def model_list(): + pyaml.pprint(_load_model_map()) + + +def _get_bento_info(tag): + model_map = _load_model_map() + bento, version = tag.split(":") + if bento not in model_map or version not in model_map[bento]: + questionary.print(f"Model {tag} not found", style=ERROR_STYLE) + return + model_info = model_map[bento][version] + repo_name = model_info["repo"] + path = pathlib.Path(model_info["path"]) + + bento_file = path / "bento.yaml" + bento_info = yaml.safe_load(bento_file.read_text()) + return bento_info + + +@model_app.command(name="get") +def model_get(tag: str): + bento_info = _get_bento_info(tag) + if bento_info: + pyaml.pprint(bento_info) + + +def _filter_instance_types( + instance_types, + gpu_count, + gpu_memory=None, + gpu_type=None, + level="match", +): + if gpu_memory is None: + if gpu_type is None: + raise ValueError("Either gpu_memory or gpu_type must be provided") + gpu_memory = GPU_MEMORY[gpu_type] + + def _check_instance(spec): + if gpu_count == 0 or gpu_count is None: + if "GpuInfo" in spec: + return False + else: + return True + else: + gpus = spec.get("GpuInfo", {}).get("Gpus", []) + if len(gpus) == 0: + return False + it_gpu = gpus[0] + it_gpu_mem = it_gpu["MemoryInfo"]["SizeInMiB"] / 1024 + + if it_gpu["Count"] == gpu_count and it_gpu_mem == gpu_memory: + return True + elif it_gpu["Count"] >= gpu_count and it_gpu_mem >= gpu_memory: + if level == "match": + return False + elif level == "usable": + return True + else: + assert False + else: + return False + + def _sort_key(spec): + return ( + spec["InstanceType"].split(".")[0], + spec.get("GpuInfo", {}).get("Gpus", [{}])[0].get("Count", 0), + spec.get("VCpuInfo", {}).get("DefaultVCpus", 0), + spec.get("MemoryInfo", 
{}).get("SizeInMiB", 0), + ) + + return sorted(filter(_check_instance, instance_types), key=_sort_key) + + +def _resolve_git_package(package): + match = REG_GITPACKAGE.match(package) + if not match: + raise ValueError(f"Invalid git package: {package}") + repo_url, branch, subdirectory = match.groups() + parsed = urlparse(repo_url) + + path_parts = [parsed.netloc] + parsed.path.split("/") + + return repo_url, branch, subdirectory, path_parts + + +def _get_it_card(spec): + """ + InstanceType: g4dn.2xlarge + VCpuInfo: + DefaultCores: 32 + DefaultThreadsPerCore: 2 + DefaultVCpus: 64 + + MemoryInfo: + SizeInMiB: 32768 + + GpuInfo: + Gpus: + - Count: 1 + Manufacturer: NVIDIA + MemoryInfo: + SizeInMiB: 16384 + Name: T4 + TotalGpuMemoryInMiB: 16384 + """ + return f"cpus: {spec['VCpuInfo']['DefaultVCpus']}, mem: {spec['MemoryInfo']['SizeInMiB']}, gpu: {spec['GpuInfo']['Gpus'][0]['Name']} x {spec['GpuInfo']['Gpus'][0]['Count']}, cost: $0.1/hour" + + +def _ensure_aws_security_group(group_name="cllama-http-default"): + try: + existing_groups = subprocess.check_output( + [ + "aws", + "ec2", + "describe-security-groups", + "--filters", + f"Name=group-name,Values={group_name}", + "--no-cli-pager", + ] + ) + existing_groups = json.loads(existing_groups) + if existing_groups["SecurityGroups"]: + return existing_groups["SecurityGroups"][0]["GroupId"] + + result = subprocess.check_output( + [ + "aws", + "ec2", + "create-security-group", + "--group-name", + group_name, + "--description", + "Default VPC security group for cllama services", + "--no-cli-pager", + ] + ) + result = json.loads(result) + security_group_id = result["GroupId"] + + subprocess.check_call( + [ + "aws", + "ec2", + "authorize-security-group-ingress", + "--group-id", + security_group_id, + "--protocol", + "tcp", + "--port", + "80", + "--cidr", + "0.0.0.0/0", + "--no-cli-pager", + ] + ) + subprocess.check_call( + [ + "aws", + "ec2", + "authorize-security-group-ingress", + "--group-id", + security_group_id, + "--protocol", + "tcp", + "--port", + "443", + "--cidr", + "0.0.0.0/0", + "--no-cli-pager", + ] + ) + subprocess.check_call( + [ + "aws", + "ec2", + "authorize-security-group-ingress", + "--group-id", + security_group_id, + "--protocol", + "tcp", + "--port", + "22", + "--cidr", + "0.0.0.0/0", + "--no-cli-pager", + ] + ) + return security_group_id + except subprocess.CalledProcessError as e: + raise RuntimeError(f"Failed to create security group: {e}") + + +@app.command() +def serve(model: str, tag: str = "latest", force_rebuild: bool = False): + if ":" in model: + model, tag = model.split(":") + if tag == "latest": + tag = next(iter(MODEL_INFOS[model].keys())) + + package = MODEL_INFOS[model][tag] + repo, branch, subdirectory, path_parts = _resolve_git_package(package) + repo_dir = REPO_DIR.joinpath(*path_parts) + bento_project_dir = repo_dir / subdirectory + + if force_rebuild: + shutil.rmtree(repo_dir, ignore_errors=True) + + if not repo_dir.exists(): + repo_dir.parent.mkdir(parents=True, exist_ok=True) + try: + cmd = ["git", "clone", "--branch", branch, repo, str(repo_dir)] + print(f"\n$ {' '.join(cmd)}") + subprocess.run(cmd, check=True) + except: + shutil.rmtree(repo_dir, ignore_errors=True) + raise + + bento_info = _get_bento_info(f"{model}:{tag}", bento_project_dir) + + if len(bento_info["services"]) != 1: + raise ValueError("Only support one service currently") + + envs = {} + if len(bento_info.get("envs", [])) > 0: + for env in bento_info["envs"]: + if env["name"] == "CLLAMA_MODEL": + envs[env["name"]] = f"{model}:{tag}" + 
continue + if env["name"] in os.environ: + value = os.environ.get(env["name"]) + questionary.print(f"Using environment value for {env['name']}") + elif env.get("value"): + value = questionary.text( + f"Enter value for {env['name']}", + default=env["value"], + ).ask() + else: + value = questionary.text( + f"Enter value for {env['name']}", + ).ask() + envs[env["name"]] = value + + cloud_provider = questionary.select( + "Select a cloud provider", + choices=[ + questionary.Choice(title="Local", value="aws"), + questionary.Choice(title="BentoCloud", value="cloud"), + ], + ).ask() + + if cloud_provider == "cloud": + cloud_provider = questionary.select( + "You haven't logged in to BentoCloud, select an action", + choices=[ + questionary.Choice(title="Login with Token", value="login"), + questionary.Choice(title="Sign up ($10 free credit)", value="signup"), + ], + ).ask() + if cloud_provider == "login": + token = questionary.text("Enter your token").ask() + cmd = ["bentoml", "cloud", "login", "--token", token] + # print(f"\n$ {' '.join(cmd)}") + try: + subprocess.check_call(cmd) + except subprocess.CalledProcessError: + raise RuntimeError("Failed to login") + elif cloud_provider == "signup": + token = questionary.text( + "Open https://cloud.bentoml.org/signup in your browser", + ).ask() + # cmd = ["bentoml", "cloud", "signup"] + # print(f"\n$ {' '.join(cmd)}") + # try: + # subprocess.check_call(cmd) + # except subprocess.CalledProcessError: + # raise RuntimeError("Failed to sign up") + + elif cloud_provider == "aws": + try: + cmd = ["aws", "ec2", "describe-instance-types", "--no-cli-pager"] + print(f"\n$ {' '.join(cmd)}") + _instance_types = subprocess.check_output(cmd, text=True) + except subprocess.CalledProcessError: + raise + # print(e) + # _cli_install_aws() + available_it_infos = json.loads(_instance_types)["InstanceTypes"] + # pyaml.p(available_it_infos) + + service = bento_info["services"][0] + if "config" not in service or "resources" not in service["config"]: + raise ValueError("Service config is missing") + elif "gpu" in service["config"]["resources"]: + gpu_count = service["config"]["resources"]["gpu"] + gpu_type = service["config"]["resources"].get("gpu_type") + gpu_memory = service["config"]["resources"].get("gpu_memory") + supported_its = _filter_instance_types( + available_it_infos, + gpu_count, + gpu_memory, + gpu_type, + ) + it = questionary.select( + "Select an instance type", + choices=[ + questionary.Choice( + title=_get_it_card(it_info), + value=it_info["InstanceType"], + ) + for it_info in supported_its + ], + ).ask() + security_group_id = _ensure_aws_security_group() + AMI = "ami-02623cf022763d4a1" + + init_script_file = TEMP_DIR / f"init_script_{str(uuid.uuid4())[:8]}.sh" + with open(init_script_file, "w") as f: + f.write( + INIT_SCRIPT_TEMPLATE.format( + repo=repo, + subdirectory=subdirectory, + model=model, + tag=tag, + env_args=" ".join([f"-e {k}={v}" for k, v in envs.items()]), + ) + ) + # grant permission + os.chmod(init_script_file, 0o755) + cmd = [ + "aws", + "ec2", + "run-instances", + "--image-id", + AMI, + "--instance-type", + it, + "--security-group-ids", + security_group_id, + "--user-data", + f"file://{init_script_file}", + "--key-name", + "jiang", + "--count", + "1", + "--no-cli-pager", + ] + # print(f"\n$ {' '.join(cmd)}") + try: + result = subprocess.check_output(cmd) + except subprocess.CalledProcessError: + raise RuntimeError("Failed to create instance") + result = json.loads(result) + instance_id = result["Instances"][0]["InstanceId"] + 
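+        # run-instances returns as soon as EC2 accepts the request; the user-data
+        # script still has to clone the repo, containerize the bento, and start
+        # the container, so readiness is polled over HTTP further below.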
print(f"Deployment {instance_id} is created") + + cmd = [ + "aws", + "ec2", + "describe-instances", + "--instance-ids", + instance_id, + "--no-cli-pager", + ] + # print(f"\n$ {' '.join(cmd)}") + result = subprocess.check_output(cmd) + result = json.loads(result) + public_ip = result["Reservations"][0]["Instances"][0]["PublicIpAddress"] + print(f"Public IP: {public_ip}") + + server_start_time = time.time() + print("Server is starting...") + with prompt_toolkit.shortcuts.ProgressBar() as pb: + for _ in pb(range(100)): + start_time = time.time() + try: + with bentoml.SyncHTTPClient(f"http://{public_ip}"): + break + except Exception: + time.sleep(max(0, 6 - (time.time() - start_time))) + else: + raise RuntimeError("Instance is not ready after 10 minutes") + print(f"Server started in {time.time() - server_start_time:.2f} seconds") + print(f"HTTP server is ready at http://{public_ip}") + return + else: + raise ValueError("GPU is required for now") + if cloud_provider == "bentocloud": + cmd = ["bentoml", "cloud", "current-context"] + # print(f"\n$ {' '.join(cmd)}") + try: + output = subprocess.check_output(cmd, text=True) + except subprocess.CalledProcessError: + raise RuntimeError( + "Failed to get bentocloud login context, please login first", + ) + + +@app.command() +def run(model: str, tag: str = "latest", force_rebuild: bool = False): + serve(model, tag, force_rebuild) + + +INIT_SCRIPT_TEMPLATE = """#!/bin/bash +pip3 install bentoml +rm -r /usr/local/cuda* +git clone {repo} /root/bento_repo +export BENTOML_HOME=/root/bento_repo/{subdirectory} +bentoml containerize {model}:{tag} --image-tag {model}:{tag} +docker run --restart always --gpus all -d -p 80:3000 {env_args} {model}:{tag} + +nvidia-smi -q | grep -A2 "ECC Mode" | grep "Current" | grep "Enabled" +ECC_ENABLED=$? + +if [[ $ECC_ENABLED -eq 0 ]]; then + echo "ECC is enabled. Disabling now..." + nvidia-smi -e 0 + reboot +else + echo "ECC is not enabled. No changes made." +fi +""" + + +if __name__ == "__main__": + app() diff --git a/cllama/spec.py b/cllama/spec.py new file mode 100644 index 00000000..006041fc --- /dev/null +++ b/cllama/spec.py @@ -0,0 +1,11 @@ +GPU_MEMORY = { + "nvidia-tesla-t4": 16, + "nvidia-tesla-v100": 16, + "nvidia-tesla-p100": 16, + "nvidia-tesla-p4": 8, + "nvidia-tesla-k80": 12, + "nvidia-tesla-a100": 40, + "nvidia-tesla-a100-80gb": 80, + "nvidia-tesla-a10g": 24, + "nvidia-l4": 24, +} diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..75d7a40c --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,22 @@ +[build-system] +requires = ["setuptools>=42", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "cllama" +version = "0.0.1" +description = "A description of your package." +authors = [{name = "oasiszero", email = "oasis0.com@gmail.com"}] +license = {file = "LICENSE"} +dependencies = [ + "typer", + "bentoml", + "pyaml", + "fastapi", + "questionary", + "psutil", + "pathlib" +] + +[tool.typer] +src-dir = "cllama" diff --git a/req.txt b/req.txt new file mode 100644 index 00000000..4f08fb7b --- /dev/null +++ b/req.txt @@ -0,0 +1,7 @@ +typer +bentoml +pyaml +fastapi +questionary +psutil +pathlib