feat: repo/model/serve

bojiang
2024-05-18 12:41:54 +08:00
commit 58fa8a70cb
9 changed files with 1477 additions and 0 deletions

.gitignore vendored Normal file

@@ -0,0 +1,161 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
*.whl

LICENSE Normal file

@@ -0,0 +1,201 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

README.md Normal file

@@ -0,0 +1,150 @@
<div align="center">
<h1 align="center">Self-host LLMs with vLLM and BentoML</h1>
</div>
This is a BentoML example project, showing you how to serve and deploy open-source Large Language Models using [vLLM](https://vllm.ai), a high-throughput and memory-efficient inference engine.
See [here](https://github.com/bentoml/BentoML?tab=readme-ov-file#%EF%B8%8F-what-you-can-build-with-bentoml) for a full list of BentoML example projects.
💡 This example serves as a basis for advanced code customization, such as custom models, inference logic, or vLLM options. For simple LLM hosting with an OpenAI-compatible endpoint and no code to write, see [OpenLLM](https://github.com/bentoml/OpenLLM).
## Prerequisites
- You have installed Python 3.8+ and `pip`. See the [Python downloads page](https://www.python.org/downloads/) to learn more.
- You have a basic understanding of key concepts in BentoML, such as Services. We recommend you read [Quickstart](https://docs.bentoml.com/en/1.2/get-started/quickstart.html) first.
- If you want to test the Service locally, you need an Nvidia GPU with at least 16 GB of VRAM.
- (Optional) We recommend you create a virtual environment for dependency isolation for this project. See the [Conda documentation](https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html) or the [Python documentation](https://docs.python.org/3/library/venv.html) for details.
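For example, using the built-in `venv` module:
```bash
python -m venv venv
source venv/bin/activate
```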
## Install dependencies
```bash
git clone https://github.com/bentoml/BentoVLLM.git
cd BentoVLLM/mistral-7b-instruct
pip install -r requirements.txt && pip install -U "pydantic>=2.0"
```
## Run the BentoML Service
We have defined a BentoML Service in `service.py`. Run `bentoml serve` in your project directory to start the Service.
```bash
$ bentoml serve .
2024-01-18T07:51:30+0800 [INFO] [cli] Starting production HTTP BentoServer from "service:VLLM" listening on http://localhost:3000 (Press CTRL+C to quit)
INFO 01-18 07:51:40 model_runner.py:501] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 01-18 07:51:40 model_runner.py:505] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode.
INFO 01-18 07:51:46 model_runner.py:547] Graph capturing finished in 6 secs.
```
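For reference, `service.py` defines the `VLLM` Service started above. A minimal sketch of this pattern, following the documented BentoML + vLLM recipe, looks roughly like the code below; values such as `max_tokens=1024` are illustrative assumptions, not the exact code shipped in this repo:
```python
import uuid
from typing import AsyncGenerator, Optional

import bentoml


@bentoml.service(resources={"gpu": 1}, traffic={"timeout": 300})
class VLLM:
    def __init__(self) -> None:
        from vllm import AsyncEngineArgs, AsyncLLMEngine

        # load the model into a vLLM async engine on service start-up
        self.engine = AsyncLLMEngine.from_engine_args(
            AsyncEngineArgs(model="mistralai/Mistral-7B-Instruct-v0.2")
        )

    @bentoml.api
    async def generate(
        self,
        prompt: str = "Explain superconductors like I'm five years old",
        tokens: Optional[int] = None,
    ) -> AsyncGenerator[str, None]:
        from vllm import SamplingParams

        stream = self.engine.generate(
            prompt, SamplingParams(max_tokens=tokens or 1024), uuid.uuid4().hex
        )
        cursor = 0
        async for request_output in stream:
            text = request_output.outputs[0].text
            yield text[cursor:]  # emit only the newly generated part
            cursor = len(text)
```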
The server is now active at [http://localhost:3000](http://localhost:3000/). You can interact with it using the Swagger UI or in other ways:
<details>
<summary>CURL</summary>
```bash
curl -X 'POST' \
  'http://localhost:3000/generate' \
  -H 'accept: text/event-stream' \
  -H 'Content-Type: application/json' \
  -d '{
    "prompt": "Explain superconductors like I'\''m five years old",
    "tokens": null
  }'
```
</details>
<details>
<summary>Python client</summary>
```python
import bentoml

with bentoml.SyncHTTPClient("http://localhost:3000") as client:
    response_generator = client.generate(
        prompt="Explain superconductors like I'm five years old",
        tokens=None,
    )
    for response in response_generator:
        print(response)
```
</details>
<details>
<summary>OpenAI-compatible endpoints</summary>
This Service uses the `@openai_endpoints` decorator to set up OpenAI-compatible endpoints (`chat/completions` and `completions`). This means your client can interact with the backend Service (in this case, the `VLLM` class) as if it were communicating directly with OpenAI's API. This [utility](mistral-7b-instruct/bentovllm_openai/) does not affect your BentoML Service code, and you can use it for other LLMs as well.
```python
from openai import OpenAI

client = OpenAI(base_url='http://localhost:3000/v1', api_key='na')

# List the available models
client.models.list()

chat_completion = client.chat.completions.create(
    model="mistralai/Mistral-7B-Instruct-v0.2",
    messages=[
        {
            "role": "user",
            "content": "Explain superconductors like I'm five years old"
        }
    ],
    stream=True,
)
for chunk in chat_completion:
    # Extract and print the content of the model's reply
    print(chunk.choices[0].delta.content or "", end="")
```
**Note**: If your Service is deployed with [protected endpoints on BentoCloud](https://docs.bentoml.com/en/latest/bentocloud/how-tos/manage-access-token.html#access-protected-deployments), you need to set the environment variable `OPENAI_API_KEY` to your BentoCloud API key first.
```bash
export OPENAI_API_KEY={YOUR_BENTOCLOUD_API_TOKEN}
```
You can then use the following line to replace the client in the above code snippet. Refer to [Obtain the endpoint URL](https://docs.bentoml.com/en/latest/bentocloud/how-tos/call-deployment-endpoints.html#obtain-the-endpoint-url) to retrieve the endpoint URL.
```python
client = OpenAI(base_url='your_bentocloud_deployment_endpoint_url/v1')
```
</details>
For detailed explanations of the Service code, see [vLLM inference](https://docs.bentoml.org/en/latest/use-cases/large-language-models/vllm.html).
## Deploy to BentoCloud
After the Service is ready, you can deploy the application to BentoCloud for better management and scalability. [Sign up](https://www.bentoml.com/) if you don't have a BentoCloud account yet.
Make sure you have [logged in to BentoCloud](https://docs.bentoml.com/en/latest/bentocloud/how-tos/manage-access-token.html), then run the following command to deploy it.
```bash
bentoml deploy .
```
Once the application is up and running on BentoCloud, you can access it via the exposed URL.
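For example, you can call it with the same Python client shown earlier; the URL below is a placeholder for your deployment's exposed endpoint:
```python
import bentoml

# replace the URL with your deployment's exposed endpoint
with bentoml.SyncHTTPClient("https://my-deployment.example.bentoml.ai") as client:
    response_generator = client.generate(
        prompt="Explain superconductors like I'm five years old",
        tokens=None,
    )
    for response in response_generator:
        print(response)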
**Note**: For custom deployment in your own infrastructure, use [BentoML to generate an OCI-compliant image](https://docs.bentoml.com/en/latest/guides/containerization.html).
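A typical flow with the standard BentoML CLI looks like this; the Bento tag below is a placeholder, use the tag that `bentoml build` prints:
```bash
bentoml build .
bentoml containerize vllm:latest
docker run --gpus all -p 3000:3000 vllm:latest
```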
## Different LLM Models
Besides the mistral-7b-instruct model, this repository also contains examples for other models in its subdirectories. Below is a list of these models and links to their example subdirectories.
- [Mistral-7B-Instruct-v0.2](mistral-7b-instruct/)
- [Mixtral-8x7B-Instruct-v0.1 with gptq quantization](mistral-7b-instruct/)
- [Llama-2-7b-chat-hf](llama2-7b-chat/)
- [SOLAR-10.7B-v1.0](solar-10.7b-instruct/)
## LLM tools integration examples
- Every model directory contains code to add OpenAI-compatible endpoints to the BentoML Service.
- [outlines-integration/](outlines-integration/) contains the code to integrate with [outlines](https://github.com/outlines-dev/outlines) for structured generation.

cllama/__init__.py Normal file

cllama/__main__.py Normal file

@@ -0,0 +1,295 @@
import typer
import shlex
import os
from typing_extensions import TypedDict
import collections
import prompt_toolkit
import shutil
import pydantic
import yaml
import json
import questionary
import re
import subprocess
import pyaml
import pathlib

from cllama.spec import GPU_MEMORY

ERROR_STYLE = "red"
SUCCESS_STYLE = "green"

CLLAMA_HOME = pathlib.Path.home() / ".openllm_next"
REPO_DIR = CLLAMA_HOME / "repos"
TEMP_DIR = CLLAMA_HOME / "temp"
VENV_DIR = CLLAMA_HOME / "venv"
REPO_DIR.mkdir(exist_ok=True, parents=True)
TEMP_DIR.mkdir(exist_ok=True, parents=True)
VENV_DIR.mkdir(exist_ok=True, parents=True)
CONFIG_FILE = CLLAMA_HOME / "config.json"

app = typer.Typer()
repo_app = typer.Typer()
model_app = typer.Typer()
app.add_typer(repo_app, name="repo")
app.add_typer(model_app, name="model")


class Config(pydantic.BaseModel):
    repos: dict[str, str] = {
        "default": "git+https://github.com/bojiang/bentovllm@main#subdirectory=bentoml"
    }
    default_repo: str = "default"


def _load_config():
    if CONFIG_FILE.exists():
        with open(CONFIG_FILE) as f:
            return Config(**json.load(f))
    return Config()


def _save_config(config):
    with open(CONFIG_FILE, "w") as f:
        json.dump(config.dict(), f, indent=2)
class RepoInfo(TypedDict):
    name: str
    path: str
    url: str
    server: str
    owner: str
    repo: str
    branch: str


class ModelInfo(TypedDict):
    repo: RepoInfo
    path: str


class BentoInfo(TypedDict):
    model: ModelInfo
    bento_yaml: dict


def _load_model_map() -> dict[str, dict[str, ModelInfo]]:
    model_map = collections.defaultdict(dict)
    config = _load_config()
    for repo_name, repo_url in config.repos.items():
        server, owner, repo, branch = _parse_repo_url(repo_url)
        repo_dir = REPO_DIR / server / owner / repo
        for path in repo_dir.glob("bentoml/bentos/*/*"):
            if path.is_dir():
                model_map[path.parent.name][path.name] = ModelInfo(
                    repo=RepoInfo(
                        name=repo_name,
                        url=repo_url,
                        server=server,
                        owner=owner,
                        repo=repo,
                        branch=branch,
                        path=str(repo_dir),
                    ),
                    path=str(path),
                )
            elif path.is_file():
                # a plain file is a pointer holding the name of the real bento dir
                with open(path) as f:
                    origin_name = f.read().strip()
                origin_path = path.parent / origin_name
                model_map[path.parent.name][path.name] = ModelInfo(
                    repo=RepoInfo(
                        name=repo_name,
                        url=repo_url,
                        server=server,
                        owner=owner,
                        repo=repo,
                        branch=branch,
                        path=str(repo_dir),
                    ),
                    path=str(origin_path),
                )
    return model_map


# the optional trailing "#..." (e.g. "#subdirectory=bentoml") is kept out of
# the branch group so that the default repo URL above parses correctly
GIT_REPO_RE = re.compile(
    r"git\+https://(?P<server>.+)/(?P<owner>.+)/(?P<repo>.+?)(@(?P<branch>[^#]+))?(#.*)?$"
)
@repo_app.command(name="list")
def repo_list():
    config = _load_config()
    pyaml.pprint(config.repos)


def _parse_repo_url(repo_url):
    """
    Parse a git repo url into (server, owner, repo name, branch).

    >>> _parse_repo_url("git+https://github.com/bojiang/bentovllm@main")
    ('github.com', 'bojiang', 'bentovllm', 'main')
    >>> _parse_repo_url("git+https://github.com/bojiang/bentovllm")
    ('github.com', 'bojiang', 'bentovllm', 'main')
    >>> _parse_repo_url("git+https://github.com/bojiang/bentovllm@main#subdirectory=bentoml")
    ('github.com', 'bojiang', 'bentovllm', 'main')
    """
    match = GIT_REPO_RE.match(repo_url)
    if not match:
        raise ValueError(f"Invalid git repo url: {repo_url}")
    return (
        match.group("server"),
        match.group("owner"),
        match.group("repo"),
        match.group("branch") or "main",
    )
@repo_app.command(name="add")
def repo_add(name: str, repo: str):
    name = name.lower()
    if not name.isidentifier():
        questionary.print(
            f"Invalid repo name: {name}, should only contain letters, numbers and underscores",
            style=ERROR_STYLE,
        )
        return
    config = _load_config()
    if name in config.repos:
        override = questionary.confirm(
            f"Repo {name} already exists ({config.repos[name]}), override?"
        ).ask()
        if not override:
            return
    config.repos[name] = repo
    _save_config(config)
    pyaml.pprint(config.repos)


@repo_app.command(name="remove")
def repo_remove(name: str):
    config = _load_config()
    if name not in config.repos:
        questionary.print(f"Repo {name} does not exist", style=ERROR_STYLE)
        return
    del config.repos[name]
    _save_config(config)
    pyaml.pprint(config.repos)


def _run_command(cmd, cwd=None, env=None, copy_env=True):
    questionary.print("\n")
    env = env or {}
    if cwd:
        questionary.print(f"$ cd {cwd}", style="bold")
    if env:
        for k, v in env.items():
            questionary.print(f"$ export {k}={shlex.quote(v)}", style="bold")
    if copy_env:
        env = {**os.environ, **env}
    questionary.print(f"$ {' '.join(cmd)}", style="bold")
    try:
        subprocess.run(cmd, cwd=cwd, env=env, check=True)
    except subprocess.CalledProcessError:
        questionary.print("Command failed", style=ERROR_STYLE)
        # re-raise so callers such as repo_update() can clean up after a failure
        raise
@repo_app.command(name="update")
def repo_update():
    config = _load_config()
    repos_in_use = set()
    for name, repo in config.repos.items():
        server, owner, repo_name, branch = _parse_repo_url(repo)
        repos_in_use.add((server, owner, repo_name))
        repo_dir = REPO_DIR / server / owner / repo_name
        if not repo_dir.exists():
            repo_dir.parent.mkdir(parents=True, exist_ok=True)
            try:
                cmd = [
                    "git",
                    "clone",
                    "--branch",
                    branch,
                    f"https://{server}/{owner}/{repo_name}.git",
                    str(repo_dir),
                ]
                _run_command(cmd)
            except subprocess.CalledProcessError:
                shutil.rmtree(repo_dir, ignore_errors=True)
                questionary.print(f"Failed to clone repo {name}", style=ERROR_STYLE)
        else:
            try:
                cmd = ["git", "fetch", "origin", branch]
                _run_command(cmd, cwd=repo_dir)
                cmd = ["git", "reset", "--hard", f"origin/{branch}"]
                _run_command(cmd, cwd=repo_dir)
            except subprocess.CalledProcessError:
                shutil.rmtree(repo_dir, ignore_errors=True)
                questionary.print(f"Failed to update repo {name}", style=ERROR_STYLE)
    # prune checkouts that no longer correspond to a configured repo
    for repo_dir in REPO_DIR.glob("*/*/*"):
        if tuple(repo_dir.parts[-3:]) not in repos_in_use:
            shutil.rmtree(repo_dir, ignore_errors=True)
            questionary.print(f"Removed unused repo {repo_dir}")
    questionary.print("Repos updated", style=SUCCESS_STYLE)
@model_app.command(name="list")
def model_list():
    pyaml.pprint(_load_model_map())


def _get_bento_info(tag):
    model_map = _load_model_map()
    bento, version = tag.split(":")
    if bento not in model_map or version not in model_map[bento]:
        questionary.print(f"Model {tag} not found", style=ERROR_STYLE)
        return
    model_info = model_map[bento][version]
    path = pathlib.Path(model_info["path"])
    bento_file = path / "bento.yaml"
    bento_info = yaml.safe_load(bento_file.read_text())
    return BentoInfo(
        model=model_info,
        bento_yaml=bento_info,
    )


@model_app.command(name="get")
def model_get(tag: str):
    bento_info = _get_bento_info(tag)
    if bento_info:
        pyaml.pprint(bento_info)


def _serve_model(model: str):
    if ":" not in model:
        model = f"{model}:latest"
    bento_info = _get_bento_info(model)
    if not bento_info:
        # _get_bento_info() has already reported the missing model
        return
    cmd = ["bentoml", "serve", model]
    env = {
        "CLLAMA_MODEL": model,
        "BENTOML_HOME": bento_info["model"]["repo"]["path"] + "/bentoml",
    }
    _run_command(cmd, env=env)


@app.command()
def serve(model: str):
    _serve_model(model)


if __name__ == "__main__":
    app()

cllama/aws.py Normal file

@@ -0,0 +1,630 @@
import typer
import typing
import collections
import prompt_toolkit
from prompt_toolkit import print_formatted_text as print
import time
import uuid
import shutil
import pydantic
from urllib.parse import urlparse
import yaml
import json
import bentoml
import questionary
import os
import re
import subprocess
import pyaml
import pathlib

from cllama.spec import GPU_MEMORY

ERROR_STYLE = "red"
SUCCESS_STYLE = "green"

CLLAMA_HOME = pathlib.Path.home() / ".openllm_next"
REPO_DIR = CLLAMA_HOME / "repos"
TEMP_DIR = CLLAMA_HOME / "temp"
VENV_DIR = CLLAMA_HOME / "venv"
REPO_DIR.mkdir(exist_ok=True, parents=True)
TEMP_DIR.mkdir(exist_ok=True, parents=True)
VENV_DIR.mkdir(exist_ok=True, parents=True)
CONFIG_FILE = CLLAMA_HOME / "config.json"

app = typer.Typer()
repo_app = typer.Typer()
model_app = typer.Typer()
app.add_typer(repo_app, name="repo")
app.add_typer(model_app, name="model")


class Config(pydantic.BaseModel):
    repos: dict[str, str] = {
        "default": "git+https://github.com/bojiang/bentovllm@main#subdirectory=bentoml"
    }
    default_repo: str = "default"


def _load_config():
    if CONFIG_FILE.exists():
        with open(CONFIG_FILE) as f:
            return Config(**json.load(f))
    return Config()


def _save_config(config):
    with open(CONFIG_FILE, "w") as f:
        json.dump(config.dict(), f, indent=2)
class ModelInfo(typing.TypedDict):
    repo: str
    path: str


def _load_model_map() -> dict[str, dict[str, ModelInfo]]:
    model_map = collections.defaultdict(dict)
    config = _load_config()
    for repo_name, repo_url in config.repos.items():
        server, owner, repo, _ = _parse_repo_url(repo_url)
        repo_dir = REPO_DIR / server / owner / repo
        for path in repo_dir.glob("bentoml/bentos/*/*"):
            if path.is_dir():
                model_map[path.parent.name][path.name] = ModelInfo(
                    repo=repo_name,
                    path=str(path),
                )
            elif path.is_file():
                # a plain file is a pointer holding the name of the real bento dir
                with open(path) as f:
                    origin_name = f.read().strip()
                origin_path = path.parent / origin_name
                model_map[path.parent.name][path.name] = ModelInfo(
                    repo=repo_name,
                    path=str(origin_path),
                )
    return model_map
# the optional trailing "#..." (e.g. "#subdirectory=bentoml") is kept out of
# the branch group so that the default repo URL above parses correctly
GIT_REPO_RE = re.compile(
    r"git\+https://(?P<server>.+)/(?P<owner>.+)/(?P<repo>.+?)(@(?P<branch>[^#]+))?(#.*)?$"
)


@repo_app.command(name="list")
def repo_list():
    config = _load_config()
    pyaml.pprint(config.repos)


def _parse_repo_url(repo_url):
    """
    Parse a git repo url into (server, owner, repo name, branch).

    >>> _parse_repo_url("git+https://github.com/bojiang/bentovllm@main")
    ('github.com', 'bojiang', 'bentovllm', 'main')
    >>> _parse_repo_url("git+https://github.com/bojiang/bentovllm")
    ('github.com', 'bojiang', 'bentovllm', 'main')
    >>> _parse_repo_url("git+https://github.com/bojiang/bentovllm@main#subdirectory=bentoml")
    ('github.com', 'bojiang', 'bentovllm', 'main')
    """
    match = GIT_REPO_RE.match(repo_url)
    if not match:
        raise ValueError(f"Invalid git repo url: {repo_url}")
    return (
        match.group("server"),
        match.group("owner"),
        match.group("repo"),
        match.group("branch") or "main",
    )
@repo_app.command(name="add")
def repo_add(name: str, repo: str):
    name = name.lower()
    if not name.isidentifier():
        questionary.print(
            f"Invalid repo name: {name}, should only contain letters, numbers and underscores",
            style=ERROR_STYLE,
        )
        return
    config = _load_config()
    if name in config.repos:
        override = questionary.confirm(
            f"Repo {name} already exists ({config.repos[name]}), override?"
        ).ask()
        if not override:
            return
    config.repos[name] = repo
    _save_config(config)
    pyaml.pprint(config.repos)


@repo_app.command(name="remove")
def repo_remove(name: str):
    config = _load_config()
    if name not in config.repos:
        questionary.print(f"Repo {name} does not exist", style=ERROR_STYLE)
        return
    del config.repos[name]
    _save_config(config)
    pyaml.pprint(config.repos)


def _run_command(cmd, cwd=None):
    questionary.print(f"\n$ {' '.join(cmd)}", style="bold")
    subprocess.run(cmd, cwd=cwd, check=True)
@repo_app.command(name="update")
def repo_update():
    config = _load_config()
    repos_in_use = set()
    for name, repo in config.repos.items():
        server, owner, repo_name, branch = _parse_repo_url(repo)
        repos_in_use.add((server, owner, repo_name))
        repo_dir = REPO_DIR / server / owner / repo_name
        if not repo_dir.exists():
            repo_dir.parent.mkdir(parents=True, exist_ok=True)
            try:
                cmd = [
                    "git",
                    "clone",
                    "--branch",
                    branch,
                    f"https://{server}/{owner}/{repo_name}.git",
                    str(repo_dir),
                ]
                _run_command(cmd)
            except subprocess.CalledProcessError:
                shutil.rmtree(repo_dir, ignore_errors=True)
                questionary.print(f"Failed to clone repo {name}", style=ERROR_STYLE)
        else:
            try:
                cmd = ["git", "fetch", "origin", branch]
                _run_command(cmd, cwd=repo_dir)
                cmd = ["git", "reset", "--hard", f"origin/{branch}"]
                _run_command(cmd, cwd=repo_dir)
            except subprocess.CalledProcessError:
                shutil.rmtree(repo_dir, ignore_errors=True)
                questionary.print(f"Failed to update repo {name}", style=ERROR_STYLE)
    # prune checkouts that no longer correspond to a configured repo
    for repo_dir in REPO_DIR.glob("*/*/*"):
        if tuple(repo_dir.parts[-3:]) not in repos_in_use:
            shutil.rmtree(repo_dir, ignore_errors=True)
            questionary.print(f"Removed unused repo {repo_dir}")
    questionary.print("Repos updated", style=SUCCESS_STYLE)
@model_app.command(name="list")
def model_list():
    pyaml.pprint(_load_model_map())


def _get_bento_info(tag, project_dir=None):
    # `project_dir` is accepted for the call in serve() below; the lookup
    # itself still goes through the locally checked-out model map
    model_map = _load_model_map()
    bento, version = tag.split(":")
    if bento not in model_map or version not in model_map[bento]:
        questionary.print(f"Model {tag} not found", style=ERROR_STYLE)
        return
    model_info = model_map[bento][version]
    path = pathlib.Path(model_info["path"])
    bento_file = path / "bento.yaml"
    bento_info = yaml.safe_load(bento_file.read_text())
    return bento_info


@model_app.command(name="get")
def model_get(tag: str):
    bento_info = _get_bento_info(tag)
    if bento_info:
        pyaml.pprint(bento_info)
def _filter_instance_types(
    instance_types,
    gpu_count,
    gpu_memory=None,
    gpu_type=None,
    level="match",
):
    if gpu_memory is None:
        if gpu_type is None:
            raise ValueError("Either gpu_memory or gpu_type must be provided")
        gpu_memory = GPU_MEMORY[gpu_type]

    def _check_instance(spec):
        if gpu_count == 0 or gpu_count is None:
            # CPU-only request: keep instance types without GPUs
            return "GpuInfo" not in spec
        gpus = spec.get("GpuInfo", {}).get("Gpus", [])
        if len(gpus) == 0:
            return False
        it_gpu = gpus[0]
        it_gpu_mem = it_gpu["MemoryInfo"]["SizeInMiB"] / 1024  # MiB -> GiB
        if it_gpu["Count"] == gpu_count and it_gpu_mem == gpu_memory:
            return True
        elif it_gpu["Count"] >= gpu_count and it_gpu_mem >= gpu_memory:
            # larger than requested: only acceptable at the "usable" level
            if level == "match":
                return False
            elif level == "usable":
                return True
            else:
                assert False
        else:
            return False

    def _sort_key(spec):
        return (
            spec["InstanceType"].split(".")[0],
            spec.get("GpuInfo", {}).get("Gpus", [{}])[0].get("Count", 0),
            spec.get("VCpuInfo", {}).get("DefaultVCpus", 0),
            spec.get("MemoryInfo", {}).get("SizeInMiB", 0),
        )

    return sorted(filter(_check_instance, instance_types), key=_sort_key)
# NOTE: this pattern is an assumption reconstructed from how
# _resolve_git_package() unpacks it: "git+<url>[@<branch>][#subdirectory=<dir>]"
REG_GITPACKAGE = re.compile(
    r"git\+(https://[^@#]+)(?:@([^#]+))?(?:#subdirectory=(.+))?$"
)


def _resolve_git_package(package):
    match = REG_GITPACKAGE.match(package)
    if not match:
        raise ValueError(f"Invalid git package: {package}")
    repo_url, branch, subdirectory = match.groups()
    parsed = urlparse(repo_url)
    path_parts = [parsed.netloc] + parsed.path.split("/")
    return repo_url, branch, subdirectory, path_parts
def _get_it_card(spec):
    """
    Render a one-line summary card for an EC2 instance type spec like:

        InstanceType: g4dn.2xlarge
        VCpuInfo:
          DefaultCores: 32
          DefaultThreadsPerCore: 2
          DefaultVCpus: 64
        MemoryInfo:
          SizeInMiB: 32768
        GpuInfo:
          Gpus:
          - Count: 1
            Manufacturer: NVIDIA
            MemoryInfo:
              SizeInMiB: 16384
            Name: T4
            TotalGpuMemoryInMiB: 16384
    """
    # NOTE: the hourly cost shown here is a hard-coded placeholder, not a real quote
    return f"cpus: {spec['VCpuInfo']['DefaultVCpus']}, mem: {spec['MemoryInfo']['SizeInMiB']}, gpu: {spec['GpuInfo']['Gpus'][0]['Name']} x {spec['GpuInfo']['Gpus'][0]['Count']}, cost: $0.1/hour"
def _ensure_aws_security_group(group_name="cllama-http-default"):
    try:
        existing_groups = subprocess.check_output(
            [
                "aws",
                "ec2",
                "describe-security-groups",
                "--filters",
                f"Name=group-name,Values={group_name}",
                "--no-cli-pager",
            ]
        )
        existing_groups = json.loads(existing_groups)
        if existing_groups["SecurityGroups"]:
            return existing_groups["SecurityGroups"][0]["GroupId"]
        result = subprocess.check_output(
            [
                "aws",
                "ec2",
                "create-security-group",
                "--group-name",
                group_name,
                "--description",
                "Default VPC security group for cllama services",
                "--no-cli-pager",
            ]
        )
        result = json.loads(result)
        security_group_id = result["GroupId"]
        # open HTTP, HTTPS and SSH to the world
        for port in ("80", "443", "22"):
            subprocess.check_call(
                [
                    "aws",
                    "ec2",
                    "authorize-security-group-ingress",
                    "--group-id",
                    security_group_id,
                    "--protocol",
                    "tcp",
                    "--port",
                    port,
                    "--cidr",
                    "0.0.0.0/0",
                    "--no-cli-pager",
                ]
            )
        return security_group_id
    except subprocess.CalledProcessError as e:
        raise RuntimeError(f"Failed to create security group: {e}")
@app.command()
def serve(model: str, tag: str = "latest", force_rebuild: bool = False):
    if ":" in model:
        model, tag = model.split(":")
    if tag == "latest":
        # NOTE: MODEL_INFOS is used here but is not defined in this module;
        # it presumably maps model name -> {tag: git package spec}
        tag = next(iter(MODEL_INFOS[model].keys()))
    package = MODEL_INFOS[model][tag]
    repo, branch, subdirectory, path_parts = _resolve_git_package(package)
    repo_dir = REPO_DIR.joinpath(*path_parts)
    bento_project_dir = repo_dir / subdirectory
    if force_rebuild:
        shutil.rmtree(repo_dir, ignore_errors=True)
    if not repo_dir.exists():
        repo_dir.parent.mkdir(parents=True, exist_ok=True)
        try:
            cmd = ["git", "clone", "--branch", branch, repo, str(repo_dir)]
            print(f"\n$ {' '.join(cmd)}")
            subprocess.run(cmd, check=True)
        except BaseException:
            # clean up the partial checkout before propagating
            shutil.rmtree(repo_dir, ignore_errors=True)
            raise
    bento_info = _get_bento_info(f"{model}:{tag}", bento_project_dir)
    if len(bento_info["services"]) != 1:
        raise ValueError("Only support one service currently")
    envs = {}
    if len(bento_info.get("envs", [])) > 0:
        for env in bento_info["envs"]:
            if env["name"] == "CLLAMA_MODEL":
                envs[env["name"]] = f"{model}:{tag}"
                continue
            if env["name"] in os.environ:
                value = os.environ.get(env["name"])
                questionary.print(f"Using environment value for {env['name']}")
            elif env.get("value"):
                value = questionary.text(
                    f"Enter value for {env['name']}",
                    default=env["value"],
                ).ask()
            else:
                value = questionary.text(
                    f"Enter value for {env['name']}",
                ).ask()
            envs[env["name"]] = value
    cloud_provider = questionary.select(
        "Select a cloud provider",
        choices=[
            # NOTE: the "Local" choice provisions an AWS EC2 instance (see the
            # "aws" branch below), despite its title
            questionary.Choice(title="Local", value="aws"),
            questionary.Choice(title="BentoCloud", value="cloud"),
        ],
    ).ask()
    if cloud_provider == "cloud":
        cloud_provider = questionary.select(
            "You haven't logged in to BentoCloud, select an action",
            choices=[
                questionary.Choice(title="Login with Token", value="login"),
                questionary.Choice(title="Sign up ($10 free credit)", value="signup"),
            ],
        ).ask()
        if cloud_provider == "login":
            token = questionary.text("Enter your token").ask()
            cmd = ["bentoml", "cloud", "login", "--token", token]
            try:
                subprocess.check_call(cmd)
            except subprocess.CalledProcessError:
                raise RuntimeError("Failed to login")
        elif cloud_provider == "signup":
            token = questionary.text(
                "Open https://cloud.bentoml.org/signup in your browser",
            ).ask()
            # cmd = ["bentoml", "cloud", "signup"]
            # try:
            #     subprocess.check_call(cmd)
            # except subprocess.CalledProcessError:
            #     raise RuntimeError("Failed to sign up")
    elif cloud_provider == "aws":
        try:
            cmd = ["aws", "ec2", "describe-instance-types", "--no-cli-pager"]
            print(f"\n$ {' '.join(cmd)}")
            _instance_types = subprocess.check_output(cmd, text=True)
        except subprocess.CalledProcessError:
            raise
        available_it_infos = json.loads(_instance_types)["InstanceTypes"]
        service = bento_info["services"][0]
        if "config" not in service or "resources" not in service["config"]:
            raise ValueError("Service config is missing")
        elif "gpu" in service["config"]["resources"]:
            gpu_count = service["config"]["resources"]["gpu"]
            gpu_type = service["config"]["resources"].get("gpu_type")
            gpu_memory = service["config"]["resources"].get("gpu_memory")
            supported_its = _filter_instance_types(
                available_it_infos,
                gpu_count,
                gpu_memory,
                gpu_type,
            )
            it = questionary.select(
                "Select an instance type",
                choices=[
                    questionary.Choice(
                        title=_get_it_card(it_info),
                        value=it_info["InstanceType"],
                    )
                    for it_info in supported_its
                ],
            ).ask()
            security_group_id = _ensure_aws_security_group()
            AMI = "ami-02623cf022763d4a1"  # hard-coded image id; assumed to be a GPU-ready AMI
            init_script_file = TEMP_DIR / f"init_script_{str(uuid.uuid4())[:8]}.sh"
            with open(init_script_file, "w") as f:
                f.write(
                    INIT_SCRIPT_TEMPLATE.format(
                        repo=repo,
                        subdirectory=subdirectory,
                        model=model,
                        tag=tag,
                        env_args=" ".join([f"-e {k}={v}" for k, v in envs.items()]),
                    )
                )
            # grant execute permission
            os.chmod(init_script_file, 0o755)
            cmd = [
                "aws",
                "ec2",
                "run-instances",
                "--image-id",
                AMI,
                "--instance-type",
                it,
                "--security-group-ids",
                security_group_id,
                "--user-data",
                f"file://{init_script_file}",
                "--key-name",
                "jiang",  # NOTE: hard-coded EC2 key pair name
                "--count",
                "1",
                "--no-cli-pager",
            ]
            try:
                result = subprocess.check_output(cmd)
            except subprocess.CalledProcessError:
                raise RuntimeError("Failed to create instance")
            result = json.loads(result)
            instance_id = result["Instances"][0]["InstanceId"]
            print(f"Deployment {instance_id} is created")
            cmd = [
                "aws",
                "ec2",
                "describe-instances",
                "--instance-ids",
                instance_id,
                "--no-cli-pager",
            ]
            result = subprocess.check_output(cmd)
            result = json.loads(result)
            public_ip = result["Reservations"][0]["Instances"][0]["PublicIpAddress"]
            print(f"Public IP: {public_ip}")
            server_start_time = time.time()
            print("Server is starting...")
            # poll the endpoint every ~6 seconds, up to 100 tries (10 minutes)
            with prompt_toolkit.shortcuts.ProgressBar() as pb:
                for _ in pb(range(100)):
                    start_time = time.time()
                    try:
                        with bentoml.SyncHTTPClient(f"http://{public_ip}"):
                            break
                    except Exception:
                        time.sleep(max(0, 6 - (time.time() - start_time)))
                else:
                    raise RuntimeError("Instance is not ready after 10 minutes")
            print(f"Server started in {time.time() - server_start_time:.2f} seconds")
            print(f"HTTP server is ready at http://{public_ip}")
            return
        else:
            raise ValueError("GPU is required for now")
    if cloud_provider == "bentocloud":
        # NOTE: this branch looks unreachable as written: cloud_provider is set
        # to "aws", "login" or "signup" above, never to "bentocloud"
        cmd = ["bentoml", "cloud", "current-context"]
        try:
            output = subprocess.check_output(cmd, text=True)
        except subprocess.CalledProcessError:
            raise RuntimeError(
                "Failed to get bentocloud login context, please login first",
            )
@app.command()
def run(model: str, tag: str = "latest", force_rebuild: bool = False):
    serve(model, tag, force_rebuild)


# EC2 user-data script: installs BentoML, builds the bento's container image
# and runs it on port 80; the placeholders are filled in by serve() above
INIT_SCRIPT_TEMPLATE = """#!/bin/bash
pip3 install bentoml
rm -r /usr/local/cuda*
git clone {repo} /root/bento_repo
export BENTOML_HOME=/root/bento_repo/{subdirectory}
bentoml containerize {model}:{tag} --image-tag {model}:{tag}
docker run --restart always --gpus all -d -p 80:3000 {env_args} {model}:{tag}

# disable ECC if it is enabled (frees some GPU memory); requires a reboot
nvidia-smi -q | grep -A2 "ECC Mode" | grep "Current" | grep "Enabled"
ECC_ENABLED=$?
if [[ $ECC_ENABLED -eq 0 ]]; then
    echo "ECC is enabled. Disabling now..."
    nvidia-smi -e 0
    reboot
else
    echo "ECC is not enabled. No changes made."
fi
"""

if __name__ == "__main__":
    app()

cllama/spec.py Normal file

@@ -0,0 +1,11 @@
# GPU memory capacity in GiB, keyed by accelerator type name
# (compared against EC2 GpuInfo SizeInMiB / 1024 in aws.py)
GPU_MEMORY = {
    "nvidia-tesla-t4": 16,
    "nvidia-tesla-v100": 16,
    "nvidia-tesla-p100": 16,
    "nvidia-tesla-p4": 8,
    "nvidia-tesla-k80": 12,
    "nvidia-tesla-a100": 40,
    "nvidia-tesla-a100-80gb": 80,
    "nvidia-tesla-a10g": 24,
    "nvidia-l4": 24,
}

pyproject.toml Normal file

@@ -0,0 +1,22 @@
[build-system]
requires = ["setuptools>=42", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "cllama"
version = "0.0.1"
description = "CLI to manage model repos and serve LLM bentos with BentoML."
authors = [{name = "oasiszero", email = "oasis0.com@gmail.com"}]
license = {file = "LICENSE"}
# direct imports of this package; "pathlib" is omitted because it is part of
# the standard library (the PyPI package of that name is an obsolete backport)
dependencies = [
    "typer",
    "bentoml",
    "pyaml",
    "PyYAML",
    "fastapi",
    "questionary",
    "prompt_toolkit",
    "pydantic",
    "typing_extensions",
    "psutil",
]

[tool.typer]
src-dir = "cllama"

req.txt Normal file

@@ -0,0 +1,7 @@
typer
bentoml
pyaml
PyYAML
fastapi
questionary
prompt_toolkit
pydantic
typing_extensions
psutil