Mirror of https://github.com/bentoml/OpenLLM.git, synced 2025-12-23 23:57:46 -05:00
chore(cli): add a smol helper to generate README.md tables
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
README.md: 134 changed lines
@@ -1,4 +1,6 @@
# 🦾 OpenLLM: Self-Hosting LLMs Made Easy
<div align="center">
<h1>🦾 OpenLLM: Self-Hosting LLMs Made Easy</h1>
</div>

[](https://github.com/bentoml/OpenLLM/blob/main/LICENSE)
[](https://pypi.org/project/openllm)
@@ -25,16 +27,110 @@ openllm hello

OpenLLM supports a wide range of state-of-the-art open-source LLMs. You can also add a [model repository to run custom models](#set-up-a-custom-repository) with OpenLLM.

| Model            | Parameters | Quantization | Required GPU | Start a Server                      |
| ---------------- | ---------- | ------------ | ------------ | ----------------------------------- |
| Llama 3.3        | 70B        | -            | 80Gx2        | `openllm serve llama3.3:70b`        |
| Llama 3.2        | 3B         | -            | 12G          | `openllm serve llama3.2:3b`         |
| Llama 3.2 Vision | 11B        | -            | 80G          | `openllm serve llama3.2:11b-vision` |
| Mistral          | 7B         | -            | 24G          | `openllm serve mistral:7b`          |
| Qwen 2.5         | 1.5B       | -            | 12G          | `openllm serve qwen2.5:1.5b`        |
| Qwen 2.5 Coder   | 7B         | -            | 24G          | `openllm serve qwen2.5-coder:7b`    |
| Gemma 2          | 9B         | -            | 24G          | `openllm serve gemma2:9b`           |
| Phi3             | 3.8B       | -            | 12G          | `openllm serve phi3:3.8b`           |
<table>
<tr>
<th>Model</th>
<th>Parameters</th>
<th>Required GPU</th>
<th>Start a Server</th>
</tr>
<tr>
<td>deepseek-r1</td>
<td>671B</td>
<td>80Gx16</td>
<td><code>openllm serve deepseek-r1:671b-fc3d</code></td>
</tr>
<tr>
<td>deepseek-r1-distill</td>
<td>14B</td>
<td>80G</td>
<td><code>openllm serve deepseek-r1-distill:qwen2.5-14b-98a9</code></td>
</tr>
<tr>
<td>deepseek-v3</td>
<td>671B</td>
<td>80Gx16</td>
<td><code>openllm serve deepseek-v3:671b-instruct-d7ec</code></td>
</tr>
<tr>
<td>gemma2</td>
<td>2B</td>
<td>12G</td>
<td><code>openllm serve gemma2:2b-instruct-747d</code></td>
</tr>
<tr>
<td>llama3.1</td>
<td>8B</td>
<td>24G</td>
<td><code>openllm serve llama3.1:8b-instruct-3c0c</code></td>
</tr>
<tr>
<td>llama3.2</td>
<td>1B</td>
<td>24G</td>
<td><code>openllm serve llama3.2:1b-instruct-f041</code></td>
</tr>
<tr>
<td>llama3.3</td>
<td>70B</td>
<td>80Gx2</td>
<td><code>openllm serve llama3.3:70b-instruct-b850</code></td>
</tr>
<tr>
<td>mistral</td>
<td>8B</td>
<td>24G</td>
<td><code>openllm serve mistral:8b-instruct-50e8</code></td>
</tr>
<tr>
<td>mistral-large</td>
<td>123B</td>
<td>80Gx4</td>
<td><code>openllm serve mistral-large:123b-instruct-1022</code></td>
</tr>
<tr>
<td>mistralai</td>
<td>24B</td>
<td>80G</td>
<td><code>openllm serve mistralai:24b-small-instruct-2501-0e69</code></td>
</tr>
<tr>
<td>mixtral</td>
<td>7B</td>
<td>80Gx2</td>
<td><code>openllm serve mixtral:8x7b-instruct-v0.1-b752</code></td>
</tr>
<tr>
<td>phi4</td>
<td>14B</td>
<td>80G</td>
<td><code>openllm serve phi4:14b-c12d</code></td>
</tr>
<tr>
<td>pixtral</td>
<td>12B</td>
<td>80G</td>
<td><code>openllm serve pixtral:12b-240910-c344</code></td>
</tr>
<tr>
<td>qwen2.5</td>
<td>7B</td>
<td>24G</td>
<td><code>openllm serve qwen2.5:7b-instruct-3260</code></td>
</tr>
<tr>
<td>qwen2.5-coder</td>
<td>7B</td>
<td>24G</td>
<td><code>openllm serve qwen2.5-coder:7b-instruct-e75d</code></td>
</tr>
<tr>
<td>qwen2.5vl</td>
<td>3B</td>
<td>24G</td>
<td><code>openllm serve qwen2.5vl:3b-instruct-4686</code></td>
</tr>
</table>

...

@@ -46,15 +142,16 @@ To start an LLM server locally, use the `openllm serve` command and specify the

> [!NOTE]
> OpenLLM does not store model weights. A Hugging Face token (HF_TOKEN) is required for gated models.
>
> 1. Create your Hugging Face token [here](https://huggingface.co/settings/tokens).
> 2. Request access to the gated model, such as [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B).
> 2. Request access to the gated model, such as [meta-llama/Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct).
> 3. Set your token as an environment variable by running:
> ```bash
> export HF_TOKEN=<your token>
> ```

```bash
openllm serve llama3:8b
openllm serve llama3.2:1b-instruct-f041
```

The server will be accessible at [http://localhost:3000](http://localhost:3000/), providing OpenAI-compatible APIs for interaction. You can call the endpoints with different frameworks and tools that support OpenAI-compatible APIs. Typically, you may need to specify the following:
@@ -79,7 +176,7 @@ client = OpenAI(base_url='http://localhost:3000/v1', api_key='na')
# print(model_list)

chat_completion = client.chat.completions.create(
    model="meta-llama/Meta-Llama-3-8B-Instruct",
    model="meta-llama/Llama-3.2-1B-Instruct",
    messages=[
        {
            "role": "user",
@@ -94,7 +191,6 @@ for chunk in chat_completion:

</details>


<details>

<summary>LlamaIndex</summary>
@@ -102,9 +198,10 @@ for chunk in chat_completion:
```python
from llama_index.llms.openai import OpenAI

llm = OpenAI(api_base="http://localhost:3000/v1", model="meta-llama/Meta-Llama-3-8B-Instruct", api_key="dummy")
llm = OpenAI(api_base="http://localhost:3000/v1", model="meta-llama/Llama-3.2-1B-Instruct", api_key="dummy")
...
```

</details>

## Chat UI
@@ -138,7 +235,7 @@ openllm repo update
To review a model’s information, run:

```bash
openllm model get llama3:8b
openllm model get llama3.2:1b-instruct-f041
```

### Add a model to the default model repository
@@ -166,7 +263,7 @@ OpenLLM supports LLM cloud deployment via BentoML, the unified model serving fra
[Sign up for BentoCloud](https://www.bentoml.com/) for free and [log in](https://docs.bentoml.com/en/latest/bentocloud/how-tos/manage-access-token.html). Then, run `openllm deploy` to deploy a model to BentoCloud:

```bash
openllm deploy llama3:8b
openllm deploy llama3.2:1b-instruct-f041
```

> [!NOTE]
@@ -196,7 +293,6 @@ This project uses the following open-source projects:
- [bentoml/bentoml](https://github.com/bentoml/bentoml) for production level model serving
- [vllm-project/vllm](https://github.com/vllm-project/vllm) for production level LLM backend
- [blrchen/chatgpt-lite](https://github.com/blrchen/chatgpt-lite) for a fancy Web Chat UI
- [chujiezheng/chat_templates](https://github.com/chujiezheng/chat_templates)
- [astral-sh/uv](https://github.com/astral-sh/uv) for blazing fast model requirements installing

We are grateful to the developers and contributors of these projects for their hard work and dedication.

README.md.tpl: 210 lines (new file)
@@ -0,0 +1,210 @@
<div align="center">
<h1>🦾 OpenLLM: Self-Hosting LLMs Made Easy</h1>
</div>

[](https://github.com/bentoml/OpenLLM/blob/main/LICENSE)
[](https://pypi.org/project/openllm)
[](https://results.pre-commit.ci/latest/github/bentoml/OpenLLM/main)
[](https://twitter.com/bentomlai)
[](https://l.bentoml.com/join-slack)

OpenLLM allows developers to run **any open-source LLMs** (Llama 3.3, Qwen2.5, Phi3 and [more](#supported-models)) or **custom models** as **OpenAI-compatible APIs** with a single command. It features a [built-in chat UI](#chat-ui), state-of-the-art inference backends, and a simplified workflow for creating enterprise-grade cloud deployment with Docker, Kubernetes, and [BentoCloud](#deploy-to-bentocloud).

Understand the [design philosophy of OpenLLM](https://www.bentoml.com/blog/from-ollama-to-openllm-running-llms-in-the-cloud).

## Get Started

Run the following commands to install OpenLLM and explore it interactively.

```bash
pip install openllm  # or pip3 install openllm
openllm hello
```



## Supported models

OpenLLM supports a wide range of state-of-the-art open-source LLMs. You can also add a [model repository to run custom models](#set-up-a-custom-repository) with OpenLLM.

<table>
<tr>
<th>Model</th>
<th>Parameters</th>
<th>Required GPU</th>
<th>Start a Server</th>
</tr>
{%- for key, value in model_dict|items %}
<tr>
<td>{{key}}</td>
<td>{{value['version'] | upper}}</td>
<td>{{value['pretty_gpu']}}</td>
<td><code>{{value['command']}}</code></td>
</tr>
{%- endfor %}
</table>

...

For the full model list, see the [OpenLLM models repository](https://github.com/bentoml/openllm-models).

## Start an LLM server

To start an LLM server locally, use the `openllm serve` command and specify the model version.

> [!NOTE]
> OpenLLM does not store model weights. A Hugging Face token (HF_TOKEN) is required for gated models.
>
> 1. Create your Hugging Face token [here](https://huggingface.co/settings/tokens).
> 2. Request access to the gated model, such as [meta-llama/Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct).
> 3. Set your token as an environment variable by running:
> ```bash
> export HF_TOKEN=<your token>
> ```

```bash
{{model_dict.get("llama3.2")["command"]}}
```

The server will be accessible at [http://localhost:3000](http://localhost:3000/), providing OpenAI-compatible APIs for interaction. You can call the endpoints with different frameworks and tools that support OpenAI-compatible APIs. Typically, you may need to specify the following:

- **The API host address**: By default, the LLM is hosted at [http://localhost:3000](http://localhost:3000/).
- **The model name:** The name can be different depending on the tool you use.
- **The API key**: The API key used for client authentication. This is optional.

Here are some examples:

<details>

<summary>OpenAI Python client</summary>

```python
from openai import OpenAI

client = OpenAI(base_url='http://localhost:3000/v1', api_key='na')

# Use the following func to get the available models
# model_list = client.models.list()
# print(model_list)

chat_completion = client.chat.completions.create(
    model="meta-llama/Llama-3.2-1B-Instruct",
    messages=[
        {
            "role": "user",
            "content": "Explain superconductors like I'm five years old"
        }
    ],
    stream=True,
)
for chunk in chat_completion:
    print(chunk.choices[0].delta.content or "", end="")
```

</details>

<details>

<summary>LlamaIndex</summary>

```python
from llama_index.llms.openai import OpenAI

llm = OpenAI(api_base="http://localhost:3000/v1", model="meta-llama/Llama-3.2-1B-Instruct", api_key="dummy")
...
```

</details>

## Chat UI

OpenLLM provides a chat UI at the `/chat` endpoint for the launched LLM server at http://localhost:3000/chat.

<img width="800" alt="openllm_ui" src="https://github.com/bentoml/OpenLLM/assets/5886138/8b426b2b-67da-4545-8b09-2dc96ff8a707">

## Chat with a model in the CLI

To start a chat conversation in the CLI, use the `openllm run` command and specify the model version.

```bash
openllm run llama3:8b
```

## Model repository

A model repository in OpenLLM represents a catalog of available LLMs that you can run. OpenLLM provides a default model repository that includes the latest open-source LLMs like Llama 3, Mistral, and Qwen2, hosted at [this GitHub repository](https://github.com/bentoml/openllm-models). To see all available models from the default and any added repository, use:

```bash
openllm model list
```

To ensure your local list of models is synchronized with the latest updates from all connected repositories, run:

```bash
openllm repo update
```

To review a model’s information, run:

```bash
openllm model get {{model_dict.get("llama3.2")["tag"]}}
```

### Add a model to the default model repository

You can contribute to the default model repository by adding new models that others can use. This involves creating and submitting a Bento of the LLM. For more information, check out this [example pull request](https://github.com/bentoml/openllm-models/pull/1).

### Set up a custom repository

You can add your own repository to OpenLLM with custom models. To do so, follow the format in the default OpenLLM model repository with a `bentos` directory to store custom LLMs. You need to [build your Bentos with BentoML](https://docs.bentoml.com/en/latest/guides/build-options.html) and submit them to your model repository.

First, prepare your custom models in a `bentos` directory following the guidelines provided by [BentoML to build Bentos](https://docs.bentoml.com/en/latest/guides/build-options.html). Check out the [default model repository](https://github.com/bentoml/openllm-repo) for an example and read the [Developer Guide](https://github.com/bentoml/OpenLLM/blob/main/DEVELOPMENT.md) for details.

Then, register your custom model repository with OpenLLM:

```bash
openllm repo add <repo-name> <repo-url>
```

**Note**: Currently, OpenLLM only supports adding public repositories.

## Deploy to BentoCloud

OpenLLM supports LLM cloud deployment via BentoML, the unified model serving framework, and BentoCloud, an AI inference platform for enterprise AI teams. BentoCloud provides fully-managed infrastructure optimized for LLM inference with autoscaling, model orchestration, observability, and more, allowing you to run any AI model in the cloud.

[Sign up for BentoCloud](https://www.bentoml.com/) for free and [log in](https://docs.bentoml.com/en/latest/bentocloud/how-tos/manage-access-token.html). Then, run `openllm deploy` to deploy a model to BentoCloud:

```bash
openllm deploy {{model_dict.get("llama3.2")["tag"]}}
```

> [!NOTE]
> If you are deploying a gated model, make sure to set HF_TOKEN as an environment variable.

Once the deployment is complete, you can run model inference on the BentoCloud console:

<img width="800" alt="bentocloud_ui" src="https://github.com/bentoml/OpenLLM/assets/65327072/4f7819d9-73ea-488a-a66c-f724e5d063e6">

## Community

OpenLLM is actively maintained by the BentoML team. Feel free to reach out and join us in our pursuit to make LLMs more accessible and easy to use 👉 [Join our Slack community!](https://l.bentoml.com/join-slack)

## Contributing

As an open-source project, we welcome contributions of all kinds, such as new features, bug fixes, and documentation. Here are some of the ways to contribute:

- Report a bug by [creating a GitHub issue](https://github.com/bentoml/OpenLLM/issues/new/choose).
- [Submit a pull request](https://github.com/bentoml/OpenLLM/compare) or help review other developers’ [pull requests](https://github.com/bentoml/OpenLLM/pulls).
- Add an LLM to the OpenLLM default model repository so that other users can run your model. See the [pull request template](https://github.com/bentoml/openllm-models/pull/1).
- Check out the [Developer Guide](https://github.com/bentoml/OpenLLM/blob/main/DEVELOPMENT.md) to learn more.

## Acknowledgements

This project uses the following open-source projects:

- [bentoml/bentoml](https://github.com/bentoml/bentoml) for production level model serving
- [vllm-project/vllm](https://github.com/vllm-project/vllm) for production level LLM backend
- [blrchen/chatgpt-lite](https://github.com/blrchen/chatgpt-lite) for a fancy Web Chat UI
- [astral-sh/uv](https://github.com/astral-sh/uv) for blazing fast model requirements installing

We are grateful to the developers and contributors of these projects for their hard work and dedication.
gen_readme.py: 22 lines (new file)
@@ -0,0 +1,22 @@
# /// script
# requires-python = ">=3.11"
# dependencies = [
#   "jinja2",
#   "uv",
# ]
# ///

import subprocess, sys, pathlib, json

from jinja2 import Environment, FileSystemLoader

wd = pathlib.Path('.').parent
model_dict = subprocess.run(
  [sys.executable, '-m', 'uv', 'run', '--with-editable', '.', 'openllm', 'model', 'list', '--output', 'readme'],
  capture_output=True,
  text=True,
  check=True,
)
E = Environment(loader=FileSystemLoader('.'))
with (wd / 'README.md').open('w') as f:
  f.write(E.get_template('README.md.tpl').render(model_dict=json.loads(model_dict.stdout.strip())))
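For context, here is a minimal sketch (not part of the commit) of the data flow this script relies on: `openllm model list --output readme` prints a JSON object keyed by model name with `tag`, `version`, `pretty_gpu`, and `command` fields, which the Jinja loop in `README.md.tpl` turns into table rows. The sample entry below is illustrative, not taken from a real run.

```python
import json
from jinja2 import Environment, FileSystemLoader

# Hypothetical sample of the JSON emitted by `openllm model list --output readme`.
# `version` is stored lowercase; the template upper-cases it via the `| upper` filter.
sample_model_dict = json.loads("""
{
  "llama3.2": {
    "tag": "llama3.2:1b-instruct-f041",
    "version": "1b",
    "pretty_gpu": "24G",
    "command": "openllm serve llama3.2:1b-instruct-f041"
  }
}
""")

# Render the table with the same template gen_readme.py uses
# (assumes README.md.tpl is present in the current directory).
env = Environment(loader=FileSystemLoader('.'))
print(env.get_template('README.md.tpl').render(model_dict=sample_model_dict))
```

Since the file opens with PEP 723 inline script metadata (`# /// script`), the generator itself can presumably be invoked with a metadata-aware runner such as `uv run gen_readme.py`.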
@@ -230,7 +230,7 @@ def run(
  local_run(bento, port=port, timeout=timeout)


@app.command(help='deploy an production-ready OpenAI API compatible chat server to bentocloud ($100 free credit)')
@app.command(help='deploy production-ready OpenAI API-compatible server to BentoCloud')
def deploy(
  model: Annotated[str, typer.Argument()] = '',
  instance_type: Optional[str] = None,

@@ -1,29 +1,33 @@
import re
import typing
from typing import Optional
from __future__ import annotations

import tabulate
import typer
import re, typing, json

import tabulate, questionary, typer

from openllm.accelerator_spec import DeploymentTarget, can_run
from openllm.analytic import OpenLLMTyper
from openllm.common import VERBOSE_LEVEL, BentoInfo, output
from openllm.common import VERBOSE_LEVEL, BentoInfo, output as output_
from openllm.repo import ensure_repo_updated, list_repo

app = OpenLLMTyper(help='manage models')


@app.command(help='get model')
def get(tag: str, repo: Optional[str] = None, verbose: bool = False):
def get(tag: str, repo: typing.Optional[str] = None, verbose: bool = False):
  if verbose:
    VERBOSE_LEVEL.set(20)
  bento_info = ensure_bento(tag, repo_name=repo)
  if bento_info:
    output(bento_info)
    output_(bento_info)


@app.command(name='list', help='list available models')
def list_model(tag: Optional[str] = None, repo: Optional[str] = None, verbose: bool = False):
def list_model(
  tag: typing.Optional[str] = None,
  repo: typing.Optional[str] = None,
  verbose: bool = False,
  output: typing.Optional[str] = typer.Option(None, hidden=True),
):
  if verbose:
    VERBOSE_LEVEL.set(20)

@@ -38,6 +42,23 @@ def list_model(tag: Optional[str] = None, repo: Optional[str] = None, verbose: b
    seen.add(value)
    return False

  if output == 'readme':
    # Parse parameters from bento.tag (e.g. "model:671b-it" -> "671b", 'model:something-long-78b' -> '78b')
    version_pattern = re.compile(r'(\d+b|-[a-z]+b)')
    questionary.print(
      json.dumps({
        f'{bento.name}': dict(
          tag=bento.tag,
          version=version_pattern.search(bento.tag).group(1),
          pretty_gpu=bento.pretty_gpu,
          command=f'openllm serve {bento.tag}',
        )
        for bento in bentos
        if not is_seen(bento.name) and version_pattern.search(bento.tag)
      })
    )
    return

  table = tabulate.tabulate(
    [
      [
@@ -51,19 +72,21 @@ def list_model(tag: Optional[str] = None, repo: Optional[str] = None, verbose: b
    ],
    headers=['model', 'version', 'repo', 'required GPU RAM', 'platforms'],
  )
  output(table)
  output_(table)


def ensure_bento(model: str, target: Optional[DeploymentTarget] = None, repo_name: Optional[str] = None) -> BentoInfo:
def ensure_bento(
  model: str, target: typing.Optional[DeploymentTarget] = None, repo_name: typing.Optional[str] = None
) -> BentoInfo:
  bentos = list_bento(model, repo_name=repo_name)
  if len(bentos) == 0:
    output(f'No model found for {model}', style='red')
    output_(f'No model found for {model}', style='red')
    raise typer.Exit(1)

  if len(bentos) == 1:
    output(f'Found model {bentos[0]}', style='green')
    output_(f'Found model {bentos[0]}', style='green')
    if target is not None and can_run(bentos[0], target) <= 0:
      output(
      output_(
        f'The machine({target.name}) with {target.accelerators_repr} does not appear to have sufficient '
        f'resources to run model {bentos[0]}\n',
        style='yellow',
@@ -71,7 +94,7 @@ def ensure_bento(model: str, target: Optional[DeploymentTarget] = None, repo_nam
    return bentos[0]

  # multiple models, pick one according to target
  output(f'Multiple models match {model}, did you mean one of these?', style='red')
  output_(f'Multiple models match {model}, did you mean one of these?', style='red')
  list_model(model, repo=repo_name)
  raise typer.Exit(1)

@@ -99,9 +122,9 @@ def list_bento(
  if repo_name is not None:
    repo_map = {repo.name: repo for repo in repo_list}
    if repo_name not in repo_map:
      output(f'Repo `{repo_name}` not found, did you mean one of these?')
      output_(f'Repo `{repo_name}` not found, did you mean one of these?')
      for repo_name in repo_map:
        output(f' {repo_name}')
        output_(f' {repo_name}')
      raise typer.Exit(1)

  if not tag:

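To make the new `--output readme` branch above easier to follow, here is a small standalone sketch (not part of the commit) of what the `version_pattern` regex extracts from a few bento tags taken from the README table:

```python
import re

# Same pattern as in list_model's readme branch: pull the first parameter-count
# token (e.g. "70b") out of a bento tag.
version_pattern = re.compile(r'(\d+b|-[a-z]+b)')

for tag in ('llama3.3:70b-instruct-b850', 'deepseek-r1:671b-fc3d', 'mixtral:8x7b-instruct-v0.1-b752'):
    match = version_pattern.search(tag)
    print(tag, '->', match.group(1) if match else None)

# Prints 70b, 671b, and 7b respectively; for the mixtral tag only the trailing
# "7b" of "8x7b" matches, which is why the generated table lists Mixtral as 7B.
```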