mirror of
https://github.com/bentoml/OpenLLM.git
synced 2025-12-23 23:57:46 -05:00
feat: repo/model/serve
This commit is contained in:
161
.gitignore
vendored
Normal file
161
.gitignore
vendored
Normal file
@@ -0,0 +1,161 @@
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
|
||||
# C extensions
|
||||
*.so
|
||||
|
||||
# Distribution / packaging
|
||||
.Python
|
||||
build/
|
||||
develop-eggs/
|
||||
dist/
|
||||
downloads/
|
||||
eggs/
|
||||
.eggs/
|
||||
lib/
|
||||
lib64/
|
||||
parts/
|
||||
sdist/
|
||||
var/
|
||||
wheels/
|
||||
share/python-wheels/
|
||||
*.egg-info/
|
||||
.installed.cfg
|
||||
*.egg
|
||||
MANIFEST
|
||||
|
||||
# PyInstaller
|
||||
# Usually these files are written by a python script from a template
|
||||
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
||||
*.manifest
|
||||
*.spec
|
||||
|
||||
# Installer logs
|
||||
pip-log.txt
|
||||
pip-delete-this-directory.txt
|
||||
|
||||
# Unit test / coverage reports
|
||||
htmlcov/
|
||||
.tox/
|
||||
.nox/
|
||||
.coverage
|
||||
.coverage.*
|
||||
.cache
|
||||
nosetests.xml
|
||||
coverage.xml
|
||||
*.cover
|
||||
*.py,cover
|
||||
.hypothesis/
|
||||
.pytest_cache/
|
||||
cover/
|
||||
|
||||
# Translations
|
||||
*.mo
|
||||
*.pot
|
||||
|
||||
# Django stuff:
|
||||
*.log
|
||||
local_settings.py
|
||||
db.sqlite3
|
||||
db.sqlite3-journal
|
||||
|
||||
# Flask stuff:
|
||||
instance/
|
||||
.webassets-cache
|
||||
|
||||
# Scrapy stuff:
|
||||
.scrapy
|
||||
|
||||
# Sphinx documentation
|
||||
docs/_build/
|
||||
|
||||
# PyBuilder
|
||||
.pybuilder/
|
||||
target/
|
||||
|
||||
# Jupyter Notebook
|
||||
.ipynb_checkpoints
|
||||
|
||||
# IPython
|
||||
profile_default/
|
||||
ipython_config.py
|
||||
|
||||
# pyenv
|
||||
# For a library or package, you might want to ignore these files since the code is
|
||||
# intended to run in multiple environments; otherwise, check them in:
|
||||
# .python-version
|
||||
|
||||
# pipenv
|
||||
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
||||
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
||||
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
||||
# install all needed dependencies.
|
||||
#Pipfile.lock
|
||||
|
||||
# poetry
|
||||
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
||||
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
||||
# commonly ignored for libraries.
|
||||
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
||||
#poetry.lock
|
||||
|
||||
# pdm
|
||||
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
||||
#pdm.lock
|
||||
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
||||
# in version control.
|
||||
# https://pdm.fming.dev/#use-with-ide
|
||||
.pdm.toml
|
||||
|
||||
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
||||
__pypackages__/
|
||||
|
||||
# Celery stuff
|
||||
celerybeat-schedule
|
||||
celerybeat.pid
|
||||
|
||||
# SageMath parsed files
|
||||
*.sage.py
|
||||
|
||||
# Environments
|
||||
.env
|
||||
.venv
|
||||
env/
|
||||
venv/
|
||||
ENV/
|
||||
env.bak/
|
||||
venv.bak/
|
||||
|
||||
# Spyder project settings
|
||||
.spyderproject
|
||||
.spyproject
|
||||
|
||||
# Rope project settings
|
||||
.ropeproject
|
||||
|
||||
# mkdocs documentation
|
||||
/site
|
||||
|
||||
# mypy
|
||||
.mypy_cache/
|
||||
.dmypy.json
|
||||
dmypy.json
|
||||
|
||||
# Pyre type checker
|
||||
.pyre/
|
||||
|
||||
# pytype static type analyzer
|
||||
.pytype/
|
||||
|
||||
# Cython debug symbols
|
||||
cython_debug/
|
||||
|
||||
# PyCharm
|
||||
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
||||
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
||||
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
||||
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||
#.idea/
|
||||
*.whl
|
||||
201
LICENSE
Normal file
201
LICENSE
Normal file
@@ -0,0 +1,201 @@
|
||||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
APPENDIX: How to apply the Apache License to your work.
|
||||
|
||||
To apply the Apache License to your work, attach the following
|
||||
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||
replaced with your own identifying information. (Don't include
|
||||
the brackets!) The text should be enclosed in the appropriate
|
||||
comment syntax for the file format. We also recommend that a
|
||||
file or class name and description of purpose be included on the
|
||||
same "printed page" as the copyright notice for easier
|
||||
identification within third-party archives.
|
||||
|
||||
Copyright [yyyy] [name of copyright owner]
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
150
README.md
Normal file
150
README.md
Normal file
@@ -0,0 +1,150 @@
|
||||
<div align="center">
|
||||
<h1 align="center">Self-host LLMs with vLLM and BentoML</h1>
|
||||
</div>
|
||||
|
||||
This is a BentoML example project, showing you how to serve and deploy open-source Large Language Models using [vLLM](https://vllm.ai), a high-throughput and memory-efficient inference engine.
|
||||
|
||||
See [here](https://github.com/bentoml/BentoML?tab=readme-ov-file#%EF%B8%8F-what-you-can-build-with-bentoml) for a full list of BentoML example projects.
|
||||
|
||||
💡 This example is served as a basis for advanced code customization, such as custom model, inference logic or vLLM options. For simple LLM hosting with OpenAI compatible endpoint without writing any code, see [OpenLLM](https://github.com/bentoml/OpenLLM).
|
||||
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- You have installed Python 3.8+ and `pip`. See the [Python downloads page](https://www.python.org/downloads/) to learn more.
|
||||
- You have a basic understanding of key concepts in BentoML, such as Services. We recommend you read [Quickstart](https://docs.bentoml.com/en/1.2/get-started/quickstart.html) first.
|
||||
- If you want to test the Service locally, you need a Nvidia GPU with at least 16G VRAM.
|
||||
- (Optional) We recommend you create a virtual environment for dependency isolation for this project. See the [Conda documentation](https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html) or the [Python documentation](https://docs.python.org/3/library/venv.html) for details.
|
||||
|
||||
## Install dependencies
|
||||
|
||||
```bash
|
||||
git clone https://github.com/bentoml/BentoVLLM.git
|
||||
cd BentoVLLM/mistral-7b-instruct
|
||||
pip install -r requirements.txt && pip install -U "pydantic>=2.0"
|
||||
```
|
||||
|
||||
## Run the BentoML Service
|
||||
|
||||
We have defined a BentoML Service in `service.py`. Run `bentoml serve` in your project directory to start the Service.
|
||||
|
||||
```bash
|
||||
$ bentoml serve .
|
||||
|
||||
2024-01-18T07:51:30+0800 [INFO] [cli] Starting production HTTP BentoServer from "service:VLLM" listening on http://localhost:3000 (Press CTRL+C to quit)
|
||||
INFO 01-18 07:51:40 model_runner.py:501] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
|
||||
INFO 01-18 07:51:40 model_runner.py:505] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode.
|
||||
INFO 01-18 07:51:46 model_runner.py:547] Graph capturing finished in 6 secs.
|
||||
```
|
||||
|
||||
The server is now active at [http://localhost:3000](http://localhost:3000/). You can interact with it using the Swagger UI or in other different ways.
|
||||
|
||||
<details>
|
||||
|
||||
<summary>CURL</summary>
|
||||
|
||||
```bash
|
||||
curl -X 'POST' \
|
||||
'http://localhost:3000/generate' \
|
||||
-H 'accept: text/event-stream' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d '{
|
||||
"prompt": "Explain superconductors like I'\''m five years old",
|
||||
"tokens": null
|
||||
}'
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
|
||||
<summary>Python client</summary>
|
||||
|
||||
```python
|
||||
import bentoml
|
||||
|
||||
with bentoml.SyncHTTPClient("http://localhost:3000") as client:
|
||||
response_generator = client.generate(
|
||||
prompt="Explain superconductors like I'm five years old",
|
||||
tokens=None
|
||||
)
|
||||
for response in response_generator:
|
||||
print(response)
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
|
||||
<summary>OpenAI-compatible endpoints</summary>
|
||||
|
||||
This Service uses the `@openai_endpoints` decorator to set up OpenAI-compatible endpoints (`chat/completions` and `completions`). This means your client can interact with the backend Service (in this case, the VLLM class) as if they were communicating directly with OpenAI's API. This [utility](mistral-7b-instruct/bentovllm_openai/) does not affect your BentoML Service code, and you can use it for other LLMs as well.
|
||||
|
||||
```python
|
||||
from openai import OpenAI
|
||||
|
||||
client = OpenAI(base_url='http://localhost:3000/v1', api_key='na')
|
||||
|
||||
# Use the following func to get the available models
|
||||
client.models.list()
|
||||
|
||||
chat_completion = client.chat.completions.create(
|
||||
model="mistralai/Mistral-7B-Instruct-v0.2",
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Explain superconductors like I'm five years old"
|
||||
}
|
||||
],
|
||||
stream=True,
|
||||
)
|
||||
for chunk in chat_completion:
|
||||
# Extract and print the content of the model's reply
|
||||
print(chunk.choices[0].delta.content or "", end="")
|
||||
```
|
||||
|
||||
**Note**: If your Service is deployed with [protected endpoints on BentoCloud](https://docs.bentoml.com/en/latest/bentocloud/how-tos/manage-access-token.html#access-protected-deployments), you need to set the environment variable `OPENAI_API_KEY` to your BentoCloud API key first.
|
||||
|
||||
```bash
|
||||
export OPENAI_API_KEY={YOUR_BENTOCLOUD_API_TOKEN}
|
||||
```
|
||||
|
||||
You can then use the following line to replace the client in the above code snippet. Refer to [Obtain the endpoint URL](https://docs.bentoml.com/en/latest/bentocloud/how-tos/call-deployment-endpoints.html#obtain-the-endpoint-url) to retrieve the endpoint URL.
|
||||
|
||||
```python
|
||||
client = OpenAI(base_url='your_bentocloud_deployment_endpoint_url/v1')
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
For detailed explanations of the Service code, see [vLLM inference](https://docs.bentoml.org/en/latest/use-cases/large-language-models/vllm.html).
|
||||
|
||||
## Deploy to BentoCloud
|
||||
|
||||
After the Service is ready, you can deploy the application to BentoCloud for better management and scalability. [Sign up](https://www.bentoml.com/) if you haven't got a BentoCloud account.
|
||||
|
||||
Make sure you have [logged in to BentoCloud](https://docs.bentoml.com/en/latest/bentocloud/how-tos/manage-access-token.html), then run the following command to deploy it.
|
||||
|
||||
```bash
|
||||
bentoml deploy .
|
||||
```
|
||||
|
||||
Once the application is up and running on BentoCloud, you can access it via the exposed URL.
|
||||
|
||||
**Note**: For custom deployment in your own infrastructure, use [BentoML to generate an OCI-compliant image](https://docs.bentoml.com/en/latest/guides/containerization.html).
|
||||
|
||||
|
||||
## Different LLM Models
|
||||
|
||||
Besides the mistral-7b-instruct model, we have examples for other models in subdirectories of this repository. Below is a list of these models and links to the example subdirectories.
|
||||
|
||||
- [Mistral-7B-Instruct-v0.2](mistral-7b-instruct/)
|
||||
- [Mixtral-8x7B-Instruct-v0.1 with gptq quantization](mistral-7b-instruct/)
|
||||
- [Llama-2-7b-chat-hf](llama2-7b-chat/)
|
||||
- [SOLAR-10.7B-v1.0](solar-10.7b-instruct/)
|
||||
|
||||
|
||||
## LLM tools integration examples
|
||||
|
||||
- Every model directory contains code to add OpenAI-compatible endpoints to the BentoML service.
|
||||
- [outlines-integration/](outlines-integration/) contains the code to integrate with [outlines](https://github.com/outlines-dev/outlines) for structured generation.
|
||||
0
cllama/__init__.py
Normal file
0
cllama/__init__.py
Normal file
295
cllama/__main__.py
Normal file
295
cllama/__main__.py
Normal file
@@ -0,0 +1,295 @@
|
||||
import typer
|
||||
import shlex
|
||||
import os
|
||||
from typing_extensions import TypedDict
|
||||
import collections
|
||||
|
||||
import prompt_toolkit
|
||||
import shutil
|
||||
import pydantic
|
||||
import yaml
|
||||
import json
|
||||
import questionary
|
||||
import re
|
||||
import subprocess
|
||||
import pyaml
|
||||
import pathlib
|
||||
from cllama.spec import GPU_MEMORY
|
||||
|
||||
|
||||
# Styles passed to questionary.print for status messages.
ERROR_STYLE = "red"
SUCCESS_STYLE = "green"


# Working directories under the user's home; created eagerly at import time.
CLLAMA_HOME = pathlib.Path.home() / ".openllm_next"
REPO_DIR = CLLAMA_HOME / "repos"  # git checkouts of configured bento repos
TEMP_DIR = CLLAMA_HOME / "temp"
VENV_DIR = CLLAMA_HOME / "venv"

REPO_DIR.mkdir(exist_ok=True, parents=True)
TEMP_DIR.mkdir(exist_ok=True, parents=True)
VENV_DIR.mkdir(exist_ok=True, parents=True)

# JSON file holding the persisted Config (see _load_config/_save_config).
CONFIG_FILE = CLLAMA_HOME / "config.json"
|
||||
|
||||
|
||||
# Typer CLI wiring: `repo` and `model` sub-command groups hang off the root app.
app = typer.Typer()
repo_app = typer.Typer()
model_app = typer.Typer()

app.add_typer(repo_app, name="repo")
app.add_typer(model_app, name="model")
|
||||
|
||||
|
||||
class Config(pydantic.BaseModel):
    """Persisted CLI configuration, stored as JSON at CONFIG_FILE."""

    # Maps repo alias -> pip-style git URL of a bento repo.
    repos: dict[str, str] = {
        "default": "git+https://github.com/bojiang/bentovllm@main#subdirectory=bentoml"
    }
    # Alias used when no repo is specified explicitly.
    default_repo: str = "default"
|
||||
|
||||
|
||||
def _load_config():
    """Read the persisted CLI config, falling back to defaults when absent."""
    if not CONFIG_FILE.exists():
        return Config()
    with open(CONFIG_FILE) as fh:
        data = json.load(fh)
    return Config(**data)
|
||||
|
||||
|
||||
def _save_config(config):
    """Persist *config* to CONFIG_FILE as pretty-printed JSON."""
    payload = json.dumps(config.dict(), indent=2)
    with open(CONFIG_FILE, "w") as fh:
        fh.write(payload)
|
||||
|
||||
|
||||
class RepoInfo(TypedDict):
    # Metadata describing one configured bento repo checkout.
    name: str  # alias key from Config.repos
    path: str  # local checkout directory under REPO_DIR
    url: str  # original pip-style git URL
    server: str  # e.g. "github.com"
    owner: str
    repo: str
    branch: str
|
||||
|
||||
|
||||
class ModelInfo(TypedDict):
    # One discovered bento version inside a repo checkout.
    repo: RepoInfo
    path: str  # path to the bento version directory
|
||||
|
||||
|
||||
class BentoInfo(TypedDict):
    # A resolved model plus its parsed bento.yaml contents.
    model: ModelInfo
    bento_yaml: dict
|
||||
|
||||
|
||||
def _load_model_map() -> dict[str, dict[str, ModelInfo]]:
    """Scan every configured repo checkout and index its bundled bentos.

    Returns a mapping of {bento_name: {version: ModelInfo}}.

    Under ``<repo>/bentoml/bentos/<name>/<version>``, a directory entry is a
    bento itself; a plain-file entry is an alias whose file content names the
    real version directory (e.g. a "latest" pointer).
    """
    model_map = collections.defaultdict(dict)
    config = _load_config()
    for repo_name, repo_url in config.repos.items():
        server, owner, repo, branch = _parse_repo_url(repo_url)
        repo_dir = REPO_DIR / server / owner / repo
        # Shared repo metadata for every bento found in this checkout
        # (previously duplicated verbatim in both branches below).
        repo_info = RepoInfo(
            name=repo_name,
            url=repo_url,
            server=server,
            owner=owner,
            repo=repo,
            branch=branch,
            path=str(repo_dir),
        )
        for path in repo_dir.glob("bentoml/bentos/*/*"):
            if path.is_dir():
                target = path
            elif path.is_file():
                # Alias file: its content is the name of the real version dir.
                with open(path) as f:
                    origin_name = f.read().strip()
                target = path.parent / origin_name
            else:
                # Neither dir nor file (e.g. dangling symlink): skip, as before.
                continue
            model_map[path.parent.name][path.name] = ModelInfo(
                repo=repo_info,
                path=str(target),
            )
    return model_map
|
||||
|
||||
|
||||
# Parses pip-style git URLs: git+https://<server>/<owner>/<repo>[@<branch>]
# NOTE(review): the greedy branch group also swallows any "#fragment"
# (e.g. "#subdirectory=..." as used by the default repo URL), so such URLs
# yield a bogus branch name — confirm callers strip the fragment first.
GIT_REPO_RE = re.compile(
    r"git\+https://(?P<server>.+)/(?P<owner>.+)/(?P<repo>.+?)(@(?P<branch>.+))?$"
)
|
||||
|
||||
|
||||
@repo_app.command(name="list")
def repo_list():
    """Print the configured model repos as YAML."""
    pyaml.pprint(_load_config().repos)
|
||||
|
||||
|
||||
def _parse_repo_url(repo_url):
|
||||
"""
|
||||
parse the git repo url to server, owner, repo name, branch
|
||||
>>> _parse_repo_url("git+https://github.com/bojiang/bentovllm@main")
|
||||
('github.com', 'bojiang', 'bentovllm', 'main')
|
||||
|
||||
>>> _parse_repo_url("git+https://github.com/bojiang/bentovllm")
|
||||
('github.com', 'bojiang', 'bentovllm', 'main')
|
||||
"""
|
||||
match = GIT_REPO_RE.match(repo_url)
|
||||
if not match:
|
||||
raise ValueError(f"Invalid git repo url: {repo_url}")
|
||||
return (
|
||||
match.group("server"),
|
||||
match.group("owner"),
|
||||
match.group("repo"),
|
||||
match.group("branch") or "main",
|
||||
)
|
||||
|
||||
|
||||
@repo_app.command(name="add")
def repo_add(name: str, repo: str):
    """Register (or interactively overwrite) a named model repo."""
    name = name.lower()
    if not name.isidentifier():
        questionary.print(
            f"Invalid repo name: {name}, should only contain letters, numbers and underscores",
            style=ERROR_STYLE,
        )
        return

    config = _load_config()
    # Ask before clobbering an existing alias.
    if name in config.repos and not questionary.confirm(
        f"Repo {name} already exists({config.repos[name]}), override?"
    ).ask():
        return

    config.repos[name] = repo
    _save_config(config)
    pyaml.pprint(config.repos)
|
||||
|
||||
|
||||
@repo_app.command(name="remove")
def repo_remove(name: str):
    """Delete a named repo from the config, if present."""
    config = _load_config()
    if name not in config.repos:
        questionary.print(f"Repo {name} does not exist", style=ERROR_STYLE)
        return
    config.repos.pop(name)
    _save_config(config)
    pyaml.pprint(config.repos)
|
||||
|
||||
|
||||
def _run_command(cmd, cwd=None, env=None, copy_env=True):
    """Echo *cmd* shell-style, then run it.

    Args:
        cmd: argv list passed to subprocess.run (shell=False).
        cwd: working directory for the child process, echoed as `$ cd ...`.
        env: extra environment variables, echoed as `$ export ...`.
        copy_env: when True, merge *env* on top of os.environ.

    Raises:
        subprocess.CalledProcessError: if the command exits non-zero.
    """
    questionary.print("\n")
    env = env or {}
    if cwd:
        questionary.print(f"$ cd {cwd}", style="bold")
    for k, v in env.items():
        questionary.print(f"$ export {k}={shlex.quote(v)}", style="bold")
    if copy_env:
        env = {**os.environ, **env}
    questionary.print(f"$ {' '.join(cmd)}", style="bold")
    try:
        subprocess.run(cmd, cwd=cwd, env=env, check=True)
    except subprocess.CalledProcessError:
        questionary.print("Command failed", style=ERROR_STYLE)
        # Re-raise so callers can run their cleanup handlers. Previously the
        # error was swallowed here, which made the except-blocks in
        # repo_update (rmtree of half-cloned checkouts) unreachable.
        raise
|
||||
|
||||
|
||||
@repo_app.command(name="update")
def repo_update():
    """Clone or fast-forward every configured repo, then prune unused checkouts."""
    config = _load_config()
    repos_in_use = set()
    for name, repo in config.repos.items():
        server, owner, repo_name, branch = _parse_repo_url(repo)
        repos_in_use.add((server, owner, repo_name))
        repo_dir = REPO_DIR / server / owner / repo_name
        if not repo_dir.exists():
            repo_dir.parent.mkdir(parents=True, exist_ok=True)
            try:
                cmd = [
                    "git",
                    "clone",
                    "--branch",
                    branch,
                    f"https://{server}/{owner}/{repo_name}.git",
                    str(repo_dir),
                ]
                _run_command(cmd)
            except subprocess.CalledProcessError:
                # Leave no half-cloned checkout behind.
                shutil.rmtree(repo_dir, ignore_errors=True)
                questionary.print(f"Failed to clone repo {name}", style=ERROR_STYLE)
        else:
            try:
                # Hard-reset to the remote branch so local edits never block updates.
                _run_command(["git", "fetch", "origin", branch], cwd=repo_dir)
                _run_command(["git", "reset", "--hard", f"origin/{branch}"], cwd=repo_dir)
            except subprocess.CalledProcessError:
                # Was a bare `except:`, which would also swallow
                # KeyboardInterrupt/SystemExit; narrowed to the expected failure.
                shutil.rmtree(repo_dir, ignore_errors=True)
                questionary.print(f"Failed to update repo {name}", style=ERROR_STYLE)
    # Prune checkouts whose (server, owner, repo) no longer appear in config.
    for repo_dir in REPO_DIR.glob("*/*/*"):
        if tuple(repo_dir.parts[-3:]) not in repos_in_use:
            shutil.rmtree(repo_dir, ignore_errors=True)
            questionary.print(f"Removed unused repo {repo_dir}")
    questionary.print("Repos updated", style=SUCCESS_STYLE)
|
||||
|
||||
|
||||
@model_app.command(name="list")
def model_list():
    """Dump the full bento/version index as YAML."""
    model_map = _load_model_map()
    pyaml.pprint(model_map)
|
||||
|
||||
|
||||
def _get_bento_info(tag):
    """Resolve a "name[:version]" tag to a BentoInfo.

    The version defaults to "latest" when omitted (mirrors _serve_model's
    normalization; previously a version-less tag crashed on tag.split(":")
    with a ValueError instead of printing a helpful message).

    Returns None (after printing an error) when the tag is unknown.
    """
    bento, _, version = tag.partition(":")
    if not version:
        version = "latest"
    model_map = _load_model_map()
    if bento not in model_map or version not in model_map[bento]:
        questionary.print(f"Model {tag} not found", style=ERROR_STYLE)
        return
    model_info = model_map[bento][version]
    path = pathlib.Path(model_info["path"])

    # bento.yaml carries the bento's build/serve metadata.
    bento_file = path / "bento.yaml"
    bento_info = yaml.safe_load(bento_file.read_text())
    return BentoInfo(
        model=model_info,
        bento_yaml=bento_info,
    )
|
||||
|
||||
|
||||
@model_app.command(name="get")
def model_get(tag: str):
    """Show resolved metadata for a single model tag."""
    info = _get_bento_info(tag)
    if info:
        pyaml.pprint(info)
|
||||
|
||||
|
||||
def _serve_model(model: str):
    # Serve a locally available bento via `bentoml serve`, defaulting the
    # version to ":latest" when the caller passes a bare model name.
    if ":" not in model:
        model = f"{model}:latest"
    bento_info = _get_bento_info(model)
    if not bento_info:
        # _get_bento_info already printed its own "not found" message, so the
        # user sees this error twice.
        questionary.print(f"Model {model} not found", style=ERROR_STYLE)
        return
    cmd = ["bentoml", "serve", model]
    env = {
        "CLLAMA_MODEL": model,
        # NOTE(review): assumes bento_info["model"]["repo"] is a mapping with
        # a "path" key; the ModelInfo TypedDict elsewhere declares "repo" as a
        # plain string -- confirm which shape this file's model map uses.
        "BENTOML_HOME": bento_info["model"]["repo"]["path"] + "/bentoml",
    }
    # NOTE(review): verify that this module's _run_command accepts an `env`
    # keyword; the variant in cllama/aws.py only takes (cmd, cwd=None).
    _run_command(cmd, env=env)
|
||||
|
||||
|
||||
@app.command()
def serve(model: str):
    """CLI entry point: serve the given model tag locally."""
    _serve_model(model=model)
|
||||
|
||||
|
||||
# Allow running this module directly as the CLI entry point.
if __name__ == "__main__":
    app()
|
||||
630
cllama/aws.py
Normal file
630
cllama/aws.py
Normal file
@@ -0,0 +1,630 @@
|
||||
import typer
|
||||
import typing
|
||||
import collections
|
||||
|
||||
import prompt_toolkit
|
||||
from prompt_toolkit import print_formatted_text as print
|
||||
import time
|
||||
import uuid
|
||||
import shutil
|
||||
import pydantic
|
||||
from urllib.parse import urlparse
|
||||
import yaml
|
||||
import json
|
||||
import bentoml
|
||||
import questionary
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import pyaml
|
||||
import pathlib
|
||||
from cllama.spec import GPU_MEMORY
|
||||
|
||||
# questionary print styles used throughout the CLI.
ERROR_STYLE = "red"
SUCCESS_STYLE = "green"


# Per-user state root; repos, temp files and virtualenvs live beneath it.
CLLAMA_HOME = pathlib.Path.home() / ".openllm_next"
REPO_DIR = CLLAMA_HOME / "repos"  # cloned bento repositories
TEMP_DIR = CLLAMA_HOME / "temp"  # generated init scripts etc.
VENV_DIR = CLLAMA_HOME / "venv"  # reserved for virtual environments

# Create the directory tree eagerly at import time so commands can assume it.
REPO_DIR.mkdir(exist_ok=True, parents=True)
TEMP_DIR.mkdir(exist_ok=True, parents=True)
VENV_DIR.mkdir(exist_ok=True, parents=True)

# JSON file holding the persisted Config model.
CONFIG_FILE = CLLAMA_HOME / "config.json"


# Top-level Typer application with `repo` and `model` sub-command groups.
app = typer.Typer()
repo_app = typer.Typer()
model_app = typer.Typer()

app.add_typer(repo_app, name="repo")
app.add_typer(model_app, name="model")
|
||||
|
||||
class Config(pydantic.BaseModel):
    """Persisted CLI configuration (stored as JSON at CONFIG_FILE)."""

    # Named git repositories to pull bentos from. pydantic copies field
    # defaults per instance, so the mutable dict default is safe here.
    repos: dict[str, str] = {
        "default": "git+https://github.com/bojiang/bentovllm@main#subdirectory=bentoml"
    }
    # Key into `repos` used when no repository is specified.
    default_repo: str = "default"
|
||||
|
||||
|
||||
def _load_config():
    """Read the persisted Config from CONFIG_FILE, or return defaults."""
    if not CONFIG_FILE.exists():
        return Config()
    raw = json.loads(CONFIG_FILE.read_text())
    return Config(**raw)
|
||||
|
||||
|
||||
def _save_config(config):
    """Persist the Config model to CONFIG_FILE as pretty-printed JSON."""
    payload = json.dumps(config.dict(), indent=2)
    with open(CONFIG_FILE, "w") as f:
        f.write(payload)
|
||||
|
||||
|
||||
class ModelInfo(typing.TypedDict):
    """Where a locally available bento version comes from."""

    # Name of the configured repo (key in Config.repos) providing the bento.
    repo: str
    # Filesystem path of the bento directory inside the cloned repo.
    path: str
|
||||
|
||||
|
||||
def _load_model_map() -> dict[str, dict[str, ModelInfo]]:
    """Scan every configured repo checkout for bentos.

    Returns a mapping of bento name -> version -> ModelInfo. A regular file
    (rather than a directory) under ``bentoml/bentos/<name>/`` is treated as
    an alias: its text content names the sibling version directory it
    resolves to.
    """
    model_map = collections.defaultdict(dict)
    config = _load_config()
    for repo_name, repo_url in config.repos.items():
        server, owner, repo, _ = _parse_repo_url(repo_url)
        # Checkouts are cached under REPO_DIR/<server>/<owner>/<repo>.
        repo_dir = REPO_DIR / server / owner / repo
        # Layout inside a checkout: bentoml/bentos/<bento-name>/<version>.
        for path in repo_dir.glob("bentoml/bentos/*/*"):
            if path.is_dir():
                model_map[path.parent.name][path.name] = ModelInfo(
                    repo=repo_name,
                    path=str(path),
                )
            elif path.is_file():
                # Alias file: its content is the version directory name it
                # points at; record the resolved path instead.
                with open(path) as f:
                    origin_name = f.read().strip()
                origin_path = path.parent / origin_name
                model_map[path.parent.name][path.name] = ModelInfo(
                    repo=repo_name,
                    path=str(origin_path),
                )
    return model_map
|
||||
|
||||
|
||||
# Parses "git+https://<server>/<owner>/<repo>[@<branch>][#fragment]".
# Fix: the branch group must not contain "#", otherwise pip-style URLs such
# as the default Config entry ".../bentovllm@main#subdirectory=bentoml"
# parse as branch "main#subdirectory=bentoml", breaking `git clone --branch`.
GIT_REPO_RE = re.compile(
    r"git\+https://(?P<server>.+)/(?P<owner>.+)/(?P<repo>.+?)"
    r"(@(?P<branch>[^#]+))?(#.*)?$"
)
|
||||
|
||||
|
||||
@repo_app.command(name="list")
def repo_list():
    """Print the configured repositories as YAML."""
    repos = _load_config().repos
    pyaml.pprint(repos)
|
||||
|
||||
|
||||
def _parse_repo_url(repo_url):
    """
    parse the git repo url to server, owner, repo name, branch
    >>> _parse_repo_url("git+https://github.com/bojiang/bentovllm@main")
    ('github.com', 'bojiang', 'bentovllm', 'main')

    >>> _parse_repo_url("git+https://github.com/bojiang/bentovllm")
    ('github.com', 'bojiang', 'bentovllm', 'main')
    """
    match = GIT_REPO_RE.match(repo_url)
    if match is None:
        raise ValueError(f"Invalid git repo url: {repo_url}")
    server = match.group("server")
    owner = match.group("owner")
    repo = match.group("repo")
    # A URL without an explicit "@<branch>" defaults to "main".
    branch = match.group("branch")
    return server, owner, repo, branch if branch else "main"
|
||||
|
||||
|
||||
@repo_app.command(name="add")
def repo_add(name: str, repo: str):
    """Register (or interactively override) a named bento repository."""
    name = name.lower()
    # isidentifier() enforces letters/digits/underscores with no leading
    # digit, matching the message below.
    if not name.isidentifier():
        questionary.print(
            f"Invalid repo name: {name}, should only contain letters, numbers and underscores",
            style=ERROR_STYLE,
        )
        return

    config = _load_config()
    if name in config.repos:
        # Ask before silently clobbering an existing entry.
        override = questionary.confirm(
            f"Repo {name} already exists({config.repos[name]}), override?"
        ).ask()
        if not override:
            return

    config.repos[name] = repo
    _save_config(config)
    # Echo the resulting repo table back to the user.
    pyaml.pprint(config.repos)
|
||||
|
||||
|
||||
@repo_app.command(name="remove")
def repo_remove(name: str):
    """Delete a named repository from the config, if present."""
    config = _load_config()
    if name not in config.repos:
        questionary.print(f"Repo {name} does not exist", style=ERROR_STYLE)
        return

    config.repos.pop(name)
    _save_config(config)
    pyaml.pprint(config.repos)
|
||||
|
||||
|
||||
def _run_command(cmd, cwd=None, env=None):
    """Echo *cmd* and run it, raising CalledProcessError on failure.

    Parameters
    ----------
    cmd : list[str]
        Argument vector (run without a shell).
    cwd : working directory for the child process, or None.
    env : optional mapping of extra environment variables, merged over the
        current process environment so PATH etc. are preserved. Added
        (backward-compatibly) because sibling code calls
        ``_run_command(cmd, env=...)``.
    """
    questionary.print(f"\n$ {' '.join(cmd)}", style="bold")
    run_env = None if env is None else {**os.environ, **env}
    subprocess.run(cmd, cwd=cwd, env=run_env, check=True)
|
||||
|
||||
|
||||
@repo_app.command(name="update")
def repo_update():
    """Clone or fast-forward every configured repo, then prune stale clones."""
    config = _load_config()
    repos_in_use = set()
    for name, repo in config.repos.items():
        server, owner, repo_name, branch = _parse_repo_url(repo)
        repos_in_use.add((server, owner, repo_name))
        repo_dir = REPO_DIR / server / owner / repo_name
        if not repo_dir.exists():
            # Fresh clone of just the requested branch.
            repo_dir.parent.mkdir(parents=True, exist_ok=True)
            try:
                cmd = [
                    "git",
                    "clone",
                    "--branch",
                    branch,
                    f"https://{server}/{owner}/{repo_name}.git",
                    str(repo_dir),
                ]
                _run_command(cmd)
            except subprocess.CalledProcessError:
                # Drop the half-cloned directory so the next run retries.
                shutil.rmtree(repo_dir, ignore_errors=True)
                questionary.print(f"Failed to clone repo {name}", style=ERROR_STYLE)
        else:
            # Existing checkout: hard-reset to the remote branch tip.
            try:
                cmd = ["git", "fetch", "origin", branch]
                _run_command(cmd, cwd=repo_dir)
                cmd = ["git", "reset", "--hard", f"origin/{branch}"]
                _run_command(cmd, cwd=repo_dir)
            # Fix: was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit; narrowed to the error
            # _run_command actually raises (as in the clone branch above).
            except subprocess.CalledProcessError:
                shutil.rmtree(repo_dir, ignore_errors=True)
                questionary.print(f"Failed to update repo {name}", style=ERROR_STYLE)
    # Prune checkouts (server/owner/repo) no longer referenced by the config.
    for repo_dir in REPO_DIR.glob("*/*/*"):
        if tuple(repo_dir.parts[-3:]) not in repos_in_use:
            shutil.rmtree(repo_dir, ignore_errors=True)
            questionary.print(f"Removed unused repo {repo_dir}")
    questionary.print("Repos updated", style=SUCCESS_STYLE)
|
||||
|
||||
|
||||
@model_app.command(name="list")
def model_list():
    """Dump the bento name -> version -> ModelInfo map as YAML."""
    mapping = _load_model_map()
    pyaml.pprint(mapping)
|
||||
|
||||
|
||||
def _get_bento_info(tag):
    """Load the parsed bento.yaml for a "<name>:<version>" tag.

    Returns the YAML mapping, or None (after printing an error) when the tag
    is not in the local model map.

    Fixes: removed the unused local ``repo_name``; ``partition`` instead of
    ``split`` so a bare "<name>" tag reports "not found" instead of raising
    ValueError.
    """
    # NOTE(review): serve() below calls this with a second positional
    # argument (bento_project_dir) that this signature does not accept --
    # confirm the intended contract and reconcile.
    model_map = _load_model_map()
    bento, _, version = tag.partition(":")
    if bento not in model_map or version not in model_map[bento]:
        questionary.print(f"Model {tag} not found", style=ERROR_STYLE)
        return None
    model_info = model_map[bento][version]
    path = pathlib.Path(model_info["path"])

    bento_file = path / "bento.yaml"
    return yaml.safe_load(bento_file.read_text())
|
||||
|
||||
|
||||
@model_app.command(name="get")
def model_get(tag: str):
    """Show the bento.yaml metadata for one model tag."""
    info = _get_bento_info(tag)
    if not info:
        return
    pyaml.pprint(info)
|
||||
|
||||
|
||||
def _filter_instance_types(
|
||||
instance_types,
|
||||
gpu_count,
|
||||
gpu_memory=None,
|
||||
gpu_type=None,
|
||||
level="match",
|
||||
):
|
||||
if gpu_memory is None:
|
||||
if gpu_type is None:
|
||||
raise ValueError("Either gpu_memory or gpu_type must be provided")
|
||||
gpu_memory = GPU_MEMORY[gpu_type]
|
||||
|
||||
def _check_instance(spec):
|
||||
if gpu_count == 0 or gpu_count is None:
|
||||
if "GpuInfo" in spec:
|
||||
return False
|
||||
else:
|
||||
return True
|
||||
else:
|
||||
gpus = spec.get("GpuInfo", {}).get("Gpus", [])
|
||||
if len(gpus) == 0:
|
||||
return False
|
||||
it_gpu = gpus[0]
|
||||
it_gpu_mem = it_gpu["MemoryInfo"]["SizeInMiB"] / 1024
|
||||
|
||||
if it_gpu["Count"] == gpu_count and it_gpu_mem == gpu_memory:
|
||||
return True
|
||||
elif it_gpu["Count"] >= gpu_count and it_gpu_mem >= gpu_memory:
|
||||
if level == "match":
|
||||
return False
|
||||
elif level == "usable":
|
||||
return True
|
||||
else:
|
||||
assert False
|
||||
else:
|
||||
return False
|
||||
|
||||
def _sort_key(spec):
|
||||
return (
|
||||
spec["InstanceType"].split(".")[0],
|
||||
spec.get("GpuInfo", {}).get("Gpus", [{}])[0].get("Count", 0),
|
||||
spec.get("VCpuInfo", {}).get("DefaultVCpus", 0),
|
||||
spec.get("MemoryInfo", {}).get("SizeInMiB", 0),
|
||||
)
|
||||
|
||||
return sorted(filter(_check_instance, instance_types), key=_sort_key)
|
||||
|
||||
|
||||
def _resolve_git_package(package):
|
||||
match = REG_GITPACKAGE.match(package)
|
||||
if not match:
|
||||
raise ValueError(f"Invalid git package: {package}")
|
||||
repo_url, branch, subdirectory = match.groups()
|
||||
parsed = urlparse(repo_url)
|
||||
|
||||
path_parts = [parsed.netloc] + parsed.path.split("/")
|
||||
|
||||
return repo_url, branch, subdirectory, path_parts
|
||||
|
||||
|
||||
def _get_it_card(spec):
|
||||
"""
|
||||
InstanceType: g4dn.2xlarge
|
||||
VCpuInfo:
|
||||
DefaultCores: 32
|
||||
DefaultThreadsPerCore: 2
|
||||
DefaultVCpus: 64
|
||||
|
||||
MemoryInfo:
|
||||
SizeInMiB: 32768
|
||||
|
||||
GpuInfo:
|
||||
Gpus:
|
||||
- Count: 1
|
||||
Manufacturer: NVIDIA
|
||||
MemoryInfo:
|
||||
SizeInMiB: 16384
|
||||
Name: T4
|
||||
TotalGpuMemoryInMiB: 16384
|
||||
"""
|
||||
return f"cpus: {spec['VCpuInfo']['DefaultVCpus']}, mem: {spec['MemoryInfo']['SizeInMiB']}, gpu: {spec['GpuInfo']['Gpus'][0]['Name']} x {spec['GpuInfo']['Gpus'][0]['Count']}, cost: $0.1/hour"
|
||||
|
||||
|
||||
def _ensure_aws_security_group(group_name="cllama-http-default"):
    """Return the id of *group_name*, creating and opening it if needed.

    On first creation, ingress from 0.0.0.0/0 is authorized on ports 80
    (HTTP), 443 (HTTPS) and 22 (SSH). Raises RuntimeError (chained to the
    failing command) when any underlying `aws ec2` call fails.
    """
    try:
        existing_groups = subprocess.check_output(
            [
                "aws",
                "ec2",
                "describe-security-groups",
                "--filters",
                f"Name=group-name,Values={group_name}",
                "--no-cli-pager",
            ]
        )
        existing_groups = json.loads(existing_groups)
        if existing_groups["SecurityGroups"]:
            # Reuse the existing group rather than failing on a duplicate name.
            return existing_groups["SecurityGroups"][0]["GroupId"]

        result = subprocess.check_output(
            [
                "aws",
                "ec2",
                "create-security-group",
                "--group-name",
                group_name,
                "--description",
                "Default VPC security group for cllama services",
                "--no-cli-pager",
            ]
        )
        result = json.loads(result)
        security_group_id = result["GroupId"]

        # One ingress rule per exposed port (was three copy-pasted blocks).
        for port in ("80", "443", "22"):
            subprocess.check_call(
                [
                    "aws",
                    "ec2",
                    "authorize-security-group-ingress",
                    "--group-id",
                    security_group_id,
                    "--protocol",
                    "tcp",
                    "--port",
                    port,
                    "--cidr",
                    "0.0.0.0/0",
                    "--no-cli-pager",
                ]
            )
        return security_group_id
    except subprocess.CalledProcessError as e:
        # Chain the original error so the failing aws command stays visible.
        raise RuntimeError(f"Failed to create security group: {e}") from e
|
||||
|
||||
|
||||
@app.command()
def serve(model: str, tag: str = "latest", force_rebuild: bool = False):
    """Interactively deploy a model: clone its repo, collect env vars, then
    launch it either on a fresh EC2 instance or (stub) on BentoCloud.
    """
    # Accept "name:tag" in the first argument as a shorthand.
    if ":" in model:
        model, tag = model.split(":")
    if tag == "latest":
        # NOTE(review): MODEL_INFOS is not defined anywhere in this module --
        # this raises NameError as written; confirm where it should come from.
        tag = next(iter(MODEL_INFOS[model].keys()))

    package = MODEL_INFOS[model][tag]
    repo, branch, subdirectory, path_parts = _resolve_git_package(package)
    repo_dir = REPO_DIR.joinpath(*path_parts)
    bento_project_dir = repo_dir / subdirectory

    if force_rebuild:
        # Wipe the cached checkout so it is re-cloned below.
        shutil.rmtree(repo_dir, ignore_errors=True)

    if not repo_dir.exists():
        repo_dir.parent.mkdir(parents=True, exist_ok=True)
        try:
            cmd = ["git", "clone", "--branch", branch, repo, str(repo_dir)]
            print(f"\n$ {' '.join(cmd)}")
            subprocess.run(cmd, check=True)
        # NOTE(review): bare except; it re-raises after cleanup, but also
        # intercepts KeyboardInterrupt -- consider narrowing.
        except:
            shutil.rmtree(repo_dir, ignore_errors=True)
            raise

    # NOTE(review): _get_bento_info in this module takes a single argument;
    # this two-argument call does not match its signature -- confirm intent.
    bento_info = _get_bento_info(f"{model}:{tag}", bento_project_dir)

    if len(bento_info["services"]) != 1:
        raise ValueError("Only support one service currently")

    # Collect the container environment: CLLAMA_MODEL is fixed, everything
    # else comes from the process env or an interactive prompt.
    envs = {}
    if len(bento_info.get("envs", [])) > 0:
        for env in bento_info["envs"]:
            if env["name"] == "CLLAMA_MODEL":
                envs[env["name"]] = f"{model}:{tag}"
                continue
            if env["name"] in os.environ:
                value = os.environ.get(env["name"])
                questionary.print(f"Using environment value for {env['name']}")
            elif env.get("value"):
                # Declared default becomes the prompt's pre-filled value.
                value = questionary.text(
                    f"Enter value for {env['name']}",
                    default=env["value"],
                ).ask()
            else:
                value = questionary.text(
                    f"Enter value for {env['name']}",
                ).ask()
            envs[env["name"]] = value

    # NOTE(review): the choice labeled "Local" maps to the AWS deploy path.
    cloud_provider = questionary.select(
        "Select a cloud provider",
        choices=[
            questionary.Choice(title="Local", value="aws"),
            questionary.Choice(title="BentoCloud", value="cloud"),
        ],
    ).ask()

    if cloud_provider == "cloud":
        # cloud_provider is reused to hold the follow-up action here.
        cloud_provider = questionary.select(
            "You haven't logged in to BentoCloud, select an action",
            choices=[
                questionary.Choice(title="Login with Token", value="login"),
                questionary.Choice(title="Sign up ($10 free credit)", value="signup"),
            ],
        ).ask()
        if cloud_provider == "login":
            token = questionary.text("Enter your token").ask()
            cmd = ["bentoml", "cloud", "login", "--token", token]
            # print(f"\n$ {' '.join(cmd)}")
            try:
                subprocess.check_call(cmd)
            except subprocess.CalledProcessError:
                raise RuntimeError("Failed to login")
        elif cloud_provider == "signup":
            token = questionary.text(
                "Open https://cloud.bentoml.org/signup in your browser",
            ).ask()
            # cmd = ["bentoml", "cloud", "signup"]
            # print(f"\n$ {' '.join(cmd)}")
            # try:
            # subprocess.check_call(cmd)
            # except subprocess.CalledProcessError:
            # raise RuntimeError("Failed to sign up")

    elif cloud_provider == "aws":
        try:
            cmd = ["aws", "ec2", "describe-instance-types", "--no-cli-pager"]
            print(f"\n$ {' '.join(cmd)}")
            _instance_types = subprocess.check_output(cmd, text=True)
        except subprocess.CalledProcessError:
            raise
            # print(e)
            # _cli_install_aws()
        available_it_infos = json.loads(_instance_types)["InstanceTypes"]
        # pyaml.p(available_it_infos)

        service = bento_info["services"][0]
        if "config" not in service or "resources" not in service["config"]:
            raise ValueError("Service config is missing")
        elif "gpu" in service["config"]["resources"]:
            gpu_count = service["config"]["resources"]["gpu"]
            gpu_type = service["config"]["resources"].get("gpu_type")
            gpu_memory = service["config"]["resources"].get("gpu_memory")
            supported_its = _filter_instance_types(
                available_it_infos,
                gpu_count,
                gpu_memory,
                gpu_type,
            )
            it = questionary.select(
                "Select an instance type",
                choices=[
                    questionary.Choice(
                        title=_get_it_card(it_info),
                        value=it_info["InstanceType"],
                    )
                    for it_info in supported_its
                ],
            ).ask()
            security_group_id = _ensure_aws_security_group()
            # NOTE(review): hard-coded, region-specific AMI id -- confirm it
            # matches the account's default region.
            AMI = "ami-02623cf022763d4a1"

            # Render the cloud-init user-data script to a temp file.
            init_script_file = TEMP_DIR / f"init_script_{str(uuid.uuid4())[:8]}.sh"
            with open(init_script_file, "w") as f:
                f.write(
                    INIT_SCRIPT_TEMPLATE.format(
                        repo=repo,
                        subdirectory=subdirectory,
                        model=model,
                        tag=tag,
                        env_args=" ".join([f"-e {k}={v}" for k, v in envs.items()]),
                    )
                )
            # grant permission
            os.chmod(init_script_file, 0o755)
            cmd = [
                "aws",
                "ec2",
                "run-instances",
                "--image-id",
                AMI,
                "--instance-type",
                it,
                "--security-group-ids",
                security_group_id,
                "--user-data",
                f"file://{init_script_file}",
                # NOTE(review): hard-coded personal key pair name.
                "--key-name",
                "jiang",
                "--count",
                "1",
                "--no-cli-pager",
            ]
            # print(f"\n$ {' '.join(cmd)}")
            try:
                result = subprocess.check_output(cmd)
            except subprocess.CalledProcessError:
                raise RuntimeError("Failed to create instance")
            result = json.loads(result)
            instance_id = result["Instances"][0]["InstanceId"]
            print(f"Deployment {instance_id} is created")

            cmd = [
                "aws",
                "ec2",
                "describe-instances",
                "--instance-ids",
                instance_id,
                "--no-cli-pager",
            ]
            # print(f"\n$ {' '.join(cmd)}")
            result = subprocess.check_output(cmd)
            result = json.loads(result)
            public_ip = result["Reservations"][0]["Instances"][0]["PublicIpAddress"]
            print(f"Public IP: {public_ip}")

            # Poll the HTTP endpoint: up to 100 attempts spaced ~6s apart
            # (~10 minutes total); the for/else raises when no attempt broke
            # out of the loop.
            server_start_time = time.time()
            print("Server is starting...")
            with prompt_toolkit.shortcuts.ProgressBar() as pb:
                for _ in pb(range(100)):
                    start_time = time.time()
                    try:
                        with bentoml.SyncHTTPClient(f"http://{public_ip}"):
                            break
                    except Exception:
                        time.sleep(max(0, 6 - (time.time() - start_time)))
                else:
                    raise RuntimeError("Instance is not ready after 10 minutes")
            print(f"Server started in {time.time() - server_start_time:.2f} seconds")
            print(f"HTTP server is ready at http://{public_ip}")
            return
        else:
            raise ValueError("GPU is required for now")
    # NOTE(review): cloud_provider can only be "aws", "login", "signup" or
    # None at this point, so this branch is unreachable -- confirm whether
    # "cloud" was meant. The `output` below is also never used.
    if cloud_provider == "bentocloud":
        cmd = ["bentoml", "cloud", "current-context"]
        # print(f"\n$ {' '.join(cmd)}")
        try:
            output = subprocess.check_output(cmd, text=True)
        except subprocess.CalledProcessError:
            raise RuntimeError(
                "Failed to get bentocloud login context, please login first",
            )
|
||||
|
||||
|
||||
@app.command()
def run(model: str, tag: str = "latest", force_rebuild: bool = False):
    """Alias for `serve`: deploy the model with the same options."""
    serve(model=model, tag=tag, force_rebuild=force_rebuild)
|
||||
|
||||
|
||||
INIT_SCRIPT_TEMPLATE = """#!/bin/bash
|
||||
pip3 install bentoml
|
||||
rm -r /usr/local/cuda*
|
||||
git clone {repo} /root/bento_repo
|
||||
export BENTOML_HOME=/root/bento_repo/{subdirectory}
|
||||
bentoml containerize {model}:{tag} --image-tag {model}:{tag}
|
||||
docker run --restart always --gpus all -d -p 80:3000 {env_args} {model}:{tag}
|
||||
|
||||
nvidia-smi -q | grep -A2 "ECC Mode" | grep "Current" | grep "Enabled"
|
||||
ECC_ENABLED=$?
|
||||
|
||||
if [[ $ECC_ENABLED -eq 0 ]]; then
|
||||
echo "ECC is enabled. Disabling now..."
|
||||
nvidia-smi -e 0
|
||||
reboot
|
||||
else
|
||||
echo "ECC is not enabled. No changes made."
|
||||
fi
|
||||
"""
|
||||
|
||||
|
||||
# Allow running this module directly as the CLI entry point.
if __name__ == "__main__":
    app()
|
||||
11
cllama/spec.py
Normal file
11
cllama/spec.py
Normal file
@@ -0,0 +1,11 @@
|
||||
# Per-card GPU memory in GiB, keyed by accelerator type name. Used to turn a
# requested gpu_type into a memory requirement when filtering EC2 instance
# types.
GPU_MEMORY = {
    "nvidia-tesla-t4": 16,
    "nvidia-tesla-v100": 16,
    "nvidia-tesla-p100": 16,
    "nvidia-tesla-p4": 8,
    "nvidia-tesla-k80": 12,
    "nvidia-tesla-a100": 40,
    "nvidia-tesla-a100-80gb": 80,
    "nvidia-tesla-a10g": 24,
    "nvidia-l4": 24,
}
|
||||
22
pyproject.toml
Normal file
22
pyproject.toml
Normal file
@@ -0,0 +1,22 @@
|
||||
[build-system]
requires = ["setuptools>=42", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "cllama"
version = "0.0.1"
description = "A description of your package."
authors = [{name = "oasiszero", email = "oasis0.com@gmail.com"}]
license = {file = "LICENSE"}
dependencies = [
    "typer",
    "bentoml",
    "pyaml",
    "fastapi",
    "questionary",
    "psutil",
    # Added: imported by cllama/aws.py but previously undeclared.
    "pydantic",
    "pyyaml",
    "prompt_toolkit",
    # Removed "pathlib": it is part of the Python 3 standard library; the
    # PyPI package of that name is an obsolete Python 2 backport that can
    # shadow the stdlib module and break installs.
]

[tool.typer]
src-dir = "cllama"
|
||||
Reference in New Issue
Block a user