mirror of
https://github.com/bentoml/OpenLLM.git
synced 2026-04-20 23:18:16 -04:00
refactor: openllm hello
This commit is contained in:
@@ -1,6 +1,8 @@
|
||||
from typing import Annotated
|
||||
|
||||
from typing import Annotated, Optional
|
||||
from collections import defaultdict
|
||||
import sys
|
||||
import questionary
|
||||
|
||||
import typer
|
||||
|
||||
from openllm_next.accelerator_spec import (
|
||||
@@ -8,11 +10,10 @@ from openllm_next.accelerator_spec import (
|
||||
can_run,
|
||||
get_local_machine_spec,
|
||||
)
|
||||
from openllm_next.cloud import app as cloud_app
|
||||
from openllm_next.cloud import app as cloud_app, ensure_cloud_context
|
||||
from openllm_next.cloud import get_cloud_machine_spec
|
||||
from openllm_next.cloud import run as cloud_run
|
||||
from openllm_next.cloud import serve as cloud_serve
|
||||
from openllm_next.common import VERBOSE_LEVEL, BentoInfo
|
||||
from openllm_next.cloud import serve as cloud_deploy
|
||||
from openllm_next.common import VERBOSE_LEVEL, BentoInfo, FORCE, output
|
||||
from openllm_next.local import run as local_run
|
||||
from openllm_next.local import serve as local_serve
|
||||
from openllm_next.model import app as model_app
|
||||
@@ -26,97 +27,249 @@ app.add_typer(model_app, name="model")
|
||||
app.add_typer(cloud_app, name="cloud")
|
||||
|
||||
|
||||
def _pre_select(model: str) -> tuple[BentoInfo, DeploymentTarget]:
|
||||
def _pick_bento(model: str, target: Optional[DeploymentTarget] = None) -> BentoInfo:
|
||||
bentos = list_bento(model)
|
||||
if len(bentos) == 0:
|
||||
typer.echo(f"No model found for {model}", err=True)
|
||||
output(f"No model found for {model}", level=20, style="red")
|
||||
raise typer.Exit(1)
|
||||
|
||||
local = get_local_machine_spec()
|
||||
|
||||
if len(bentos) == 1:
|
||||
bento = bentos[0]
|
||||
if can_run(bento, local) <= 0:
|
||||
questionary.print(
|
||||
f"No deployment target found for {bento.name}:{bento.version}",
|
||||
style="red",
|
||||
)
|
||||
raise typer.Exit(1)
|
||||
return bento, local
|
||||
if FORCE.get():
|
||||
output(f"Found model {bentos[0]}", level=10, style="green")
|
||||
return bentos[0]
|
||||
if target is None:
|
||||
return bentos[0]
|
||||
if can_run(bentos[0], target) <= 0:
|
||||
return bentos[0]
|
||||
output(f"Found model {bentos[0]}", level=10, style="green")
|
||||
return bentos[0]
|
||||
|
||||
choices = []
|
||||
choices += [questionary.Separator("Local available models")]
|
||||
choices += [
|
||||
questionary.Choice(
|
||||
f" {bento.name}:{bento.version}",
|
||||
(bento, local),
|
||||
if target is None:
|
||||
output(
|
||||
f"Multiple models match {model}, did you mean one of these?",
|
||||
level=20,
|
||||
style="red",
|
||||
)
|
||||
for bento in bentos
|
||||
if can_run(bento) > 0
|
||||
]
|
||||
choices += [questionary.Separator("Cloud available models")]
|
||||
choices += [
|
||||
questionary.Choice(
|
||||
f" {bento.name}:{bento.version}",
|
||||
(bento, None),
|
||||
)
|
||||
for bento in bentos
|
||||
]
|
||||
|
||||
choosen: tuple[BentoInfo, DeploymentTarget] = questionary.select(
|
||||
"Select a model to run",
|
||||
choices=choices,
|
||||
).ask()
|
||||
|
||||
if not choosen:
|
||||
questionary.print("No model selected", style="red")
|
||||
for bento in bentos:
|
||||
output(f" {bento}", level=20, style="red")
|
||||
raise typer.Exit(1)
|
||||
|
||||
bento, target = choosen
|
||||
if target is None:
|
||||
cloud_targets = get_cloud_machine_spec()
|
||||
cloud_targets = [
|
||||
target for target in cloud_targets if can_run(bento, target) > 0
|
||||
]
|
||||
if len(cloud_targets) == 0:
|
||||
questionary.print(
|
||||
f"No suitable instance type found for {bento.name}:{bento.version}",
|
||||
style="red",
|
||||
)
|
||||
raise typer.Exit(1)
|
||||
target = questionary.select(
|
||||
"Select a cloud target",
|
||||
choices=[
|
||||
questionary.Choice(
|
||||
f" {target.name}",
|
||||
target,
|
||||
)
|
||||
for target in cloud_targets
|
||||
],
|
||||
).ask()
|
||||
if not target:
|
||||
questionary.print("No target selected", style="red")
|
||||
raise typer.Exit(1)
|
||||
filtered = [bento for bento in bentos if can_run(bento, target) > 0]
|
||||
if len(filtered) == 0:
|
||||
output(f"No deployment target found for {model}", level=20, style="red")
|
||||
raise typer.Exit(1)
|
||||
|
||||
return bento, target
|
||||
if len(filtered) == 0:
|
||||
output(f"No deployment target found for {model}", level=20, style="red")
|
||||
raise typer.Exit(1)
|
||||
|
||||
if len(bentos) > 1:
|
||||
output(
|
||||
f"Multiple models match {model}, did you mean one of these?",
|
||||
level=20,
|
||||
style="red",
|
||||
)
|
||||
for bento in bentos:
|
||||
output(f" {bento}", level=20, style="red")
|
||||
raise typer.Exit(1)
|
||||
|
||||
return bentos[0]
|
||||
|
||||
|
||||
def _select_bento_name(models, target):
    """Interactively ask which model (name + repo) the user wants.

    Every entry of ``models`` is scored against ``target`` with ``can_run``.
    The scores of all entries sharing a ``(repo, name)`` pair are summed, and
    a ``*`` marks the pairs whose total is positive.

    Args:
        models: Bento entries exposing ``repo.name`` and ``name``.
        target: Deployment target handed to ``can_run`` for scoring.

    Returns:
        The ``[name, repo]`` pair of the row picked by the user.

    Raises:
        typer.Exit: With code 1 when there is nothing to choose from or the
            prompt is cancelled.
    """
    from tabulate import tabulate

    if not models:
        # Without rows the prompt would only contain the header separator,
        # leaving the user nothing to select.
        output("No model found", style="red")
        raise typer.Exit(1)

    # Sum the per-version scores of each (repo, name) pair.
    group_scores = defaultdict(int)
    for model in models:
        group_scores[(model.repo.name, model.name)] += can_run(model, target)

    rows = [
        [name, repo, "*" if score > 0 else ""]
        for (repo, name), score in group_scores.items()
    ]
    table = tabulate(
        rows,
        headers=["model", "repo", "locally runnable"],
    ).split("\n")

    # The first two rendered lines are the header and its underline; the
    # remaining lines line up one-to-one with ``rows``.
    options = [questionary.Separator(f"{table[0]}\n {table[1]}")]
    options.extend(
        questionary.Choice(line, value=row[:2])
        for row, line in zip(rows, table[2:])
    )

    selected = questionary.select("Select a model", options).ask()
    if selected is None:
        raise typer.Exit(1)
    return selected
|
||||
|
||||
|
||||
def _select_bento_version(models, target, bento_name, repo):
    """Interactively ask which version of a given model the user wants.

    Args:
        models: Bento entries exposing ``name``, ``repo.name`` and ``version``.
        target: Deployment target handed to ``can_run`` for scoring.
        bento_name: Model name the candidates must match.
        repo: Repository name the candidates must match.

    Returns:
        The ``[model, score]`` pair of the picked version, where ``score`` is
        the ``can_run`` result for that model on ``target``.

    Raises:
        typer.Exit: With code 1 when no entry matches ``bento_name``/``repo``
            or the prompt is cancelled.
    """
    from tabulate import tabulate

    # Filter once; every later step works on these candidates only.
    candidates = [
        [model, can_run(model, target)]
        for model in models
        if model.name == bento_name and model.repo.name == repo
    ]
    if not candidates:
        # Emitted at the default level: ``output`` drops any message whose
        # level is >= VERBOSE_LEVEL, so a higher level would hide this error.
        output(f"No model found for {bento_name} in {repo}", style="red")
        raise typer.Exit(1)

    table = tabulate(
        [[model.version, "yes" if score > 0 else ""] for model, score in candidates],
        headers=["version", "locally runnable"],
    ).split("\n")

    # Header + underline go into a separator; data lines map onto candidates.
    options = [questionary.Separator(f"{table[0]}\n {table[1]}")]
    options.extend(
        questionary.Choice(line, value=candidate)
        for candidate, line in zip(candidates, table[2:])
    )

    selected = questionary.select("Select a version", options).ask()
    if selected is None:
        raise typer.Exit(1)
    return selected
|
||||
|
||||
|
||||
def _select_target(bento, targets):
    """Interactively ask which instance type ``bento`` should be deployed to.

    Targets are listed best-first according to their ``can_run`` score, and
    targets whose score is not positive are labelled ``insufficient res.``.
    The caller's ``targets`` sequence is left unmodified.

    Args:
        bento: The bento to deploy; scored against each target via ``can_run``.
        targets: Candidate targets exposing ``name``, ``accelerators_repr``
            and ``price``.

    Returns:
        The target picked by the user.

    Raises:
        typer.Exit: With code 1 when ``targets`` is empty or the prompt is
            cancelled.
    """
    from tabulate import tabulate

    if not targets:
        # Emitted at the default level: ``output`` drops any message whose
        # level is >= VERBOSE_LEVEL, so a higher level would hide this error.
        output(
            "No available instance type, check your bentocloud account",
            style="red",
        )
        raise typer.Exit(1)

    # Score every target exactly once, then order best-first. ``sorted`` is
    # stable, so equally scored targets keep their incoming order.
    scored = sorted(
        ((can_run(bento, target), target) for target in targets),
        key=lambda pair: pair[0],
        reverse=True,
    )

    table = tabulate(
        [
            [
                target.name,
                target.accelerators_repr,
                target.price,
                "" if score > 0 else "insufficient res.",
            ]
            for score, target in scored
        ],
        headers=["instance type", "accelerator", "price", "deployable"],
    ).split("\n")

    # Header + underline go into a separator; data lines map onto targets.
    options = [questionary.Separator(f"{table[0]}\n {table[1]}")]
    options.extend(
        questionary.Choice(f"{line}", value=target)
        for (_, target), line in zip(scored, table[2:])
    )

    selected = questionary.select("Select an instance type", options).ask()
    if selected is None:
        raise typer.Exit(1)
    return selected
|
||||
|
||||
|
||||
def _select_action(bento, score):
    """Prompt for what to do with ``bento`` and carry the choice out.

    The menu offers run / serve / deploy. When ``score`` is not positive the
    two local actions are shown but disabled with "insufficient resources".

    Args:
        bento: The selected bento; interpolated into the displayed commands
            and passed to the chosen action.
        score: ``can_run`` score of ``bento`` on the local machine.

    Raises:
        typer.Exit: With code 1 when the prompt is cancelled.
    """
    runnable = score > 0

    def entry(title, value, shortcut, blocked=None):
        # Runnable menus use plain choices; otherwise each choice carries an
        # explicit shortcut key and, for local actions, a disabled reason.
        if runnable:
            return questionary.Choice(title, value=value)
        return questionary.Choice(
            title,
            value=value,
            disabled=blocked,
            shortcut_key=shortcut,
        )

    no_resources = "insufficient resources"
    options = [
        questionary.Separator("Available actions"),
        questionary.Separator("0. Run the model in terminal"),
        entry(f" $ openllm run {bento}", "run", "0", no_resources),
        questionary.Separator(" "),
        questionary.Separator("1. Serve the model locally and get a chat server"),
        entry(f" $ openllm serve {bento}", "serve", "1", no_resources),
        questionary.Separator(" "),
        questionary.Separator(
            "2. Deploy the model to bentocloud and get a scalable chat server"
        ),
        entry(f" $ openllm deploy {bento}", "deploy", "2"),
    ]

    action = questionary.select("Select an action", options).ask()
    if action is None:
        raise typer.Exit(1)

    if action == "deploy":
        # Deploying needs a logged-in cloud context and a chosen instance type.
        ensure_cloud_context()
        cloud_targets = get_cloud_machine_spec()
        chosen_target = _select_target(bento, cloud_targets)
        cloud_deploy(bento, chosen_target)
        return

    local_actions = {"run": local_run, "serve": local_serve}
    handler = local_actions.get(action)
    if handler is not None:
        handler(bento)
|
||||
|
||||
|
||||
@app.command()
def hello():
    # Guided entry point: report the detected local hardware, then walk the
    # user through picking a model, a version and an action.
    # (Kept as comments rather than a docstring so the CLI help is unchanged.)
    local_spec = get_local_machine_spec()
    output(f" Detected Platform: {local_spec.platform}", style="green")

    if not local_spec.accelerators:
        output(" Detected Accelerators: None", style="yellow")
    else:
        output(" Detected Accelerators: ", style="green")
        for accelerator in local_spec.accelerators:
            output(
                f" - {accelerator.model} {accelerator.memory_size}GB",
                style="green",
            )

    available = list_bento()

    name, repo_name = _select_bento_name(available, local_spec)
    chosen, chosen_score = _select_bento_version(
        available, local_spec, name, repo_name
    )
    _select_action(chosen, chosen_score)
|
||||
|
||||
|
||||
@app.command()
|
||||
def serve(model: Annotated[str, typer.Argument()] = ""):
|
||||
bento, target = _pre_select(model)
|
||||
if target and target.source == "local":
|
||||
local_serve(bento)
|
||||
else:
|
||||
cloud_serve(bento, target)
|
||||
target = get_local_machine_spec()
|
||||
bento = _pick_bento(model, target)
|
||||
local_serve(bento)
|
||||
|
||||
|
||||
@app.command()
|
||||
def run(model: Annotated[str, typer.Argument()] = ""):
|
||||
bento, target = _pre_select(model)
|
||||
if target and target.source == "local":
|
||||
local_run(bento)
|
||||
else:
|
||||
cloud_run(bento, target)
|
||||
target = get_local_machine_spec()
|
||||
bento = _pick_bento(model, target)
|
||||
local_run(bento)
|
||||
|
||||
|
||||
@app.command()
|
||||
def deploy(model: Annotated[str, typer.Argument()] = ""):
|
||||
targets = get_cloud_machine_spec()
|
||||
|
||||
|
||||
def typer_callback(verbose: int = 0):
|
||||
@@ -125,6 +278,9 @@ def typer_callback(verbose: int = 0):
|
||||
|
||||
|
||||
def main():
    """Run the CLI application.

    Exits with status 1 on interpreters older than Python 3.9; otherwise
    registers ``typer_callback`` as the app-level callback and starts the app.
    """
    if sys.version_info < (3, 9):
        # The message must name the version the guard actually enforces, and
        # it is emitted at the default level because ``output`` drops any
        # message whose level is >= VERBOSE_LEVEL.
        output("Python 3.9 or higher is required", style="red")
        sys.exit(1)
    app.callback()(typer_callback)
    app()
|
||||
|
||||
|
||||
@@ -21,6 +21,9 @@ class Accelerator(SimpleNamespace):
|
||||
def __eq__(self, other):
|
||||
return self.memory_size == other.memory_size
|
||||
|
||||
def __repr__(self):
|
||||
return f"{self.model}({self.memory_size}GB)"
|
||||
|
||||
|
||||
class Resource(SimpleNamespace):
|
||||
cpu: int
|
||||
@@ -93,7 +96,7 @@ def get_local_machine_spec():
|
||||
memory_info = nvmlDeviceGetMemoryInfo(handle)
|
||||
accelerators.append(
|
||||
Accelerator(
|
||||
name=name, memory_size=math.ceil(int(memory_info.total) / 1024**3)
|
||||
model=name, memory_size=math.ceil(int(memory_info.total) / 1024**3)
|
||||
)
|
||||
)
|
||||
nvmlShutdown()
|
||||
|
||||
@@ -13,6 +13,7 @@ from openllm_next.common import (
|
||||
ERROR_STYLE,
|
||||
BentoInfo,
|
||||
DeploymentTarget,
|
||||
output,
|
||||
run_command,
|
||||
)
|
||||
|
||||
@@ -62,14 +63,12 @@ def _get_deploy_cmd(bento: BentoInfo, target: typing.Optional[DeploymentTarget]
|
||||
return cmd, env, None
|
||||
|
||||
|
||||
def _ensure_cloud_context():
|
||||
def ensure_cloud_context():
|
||||
cmd = ["bentoml", "cloud", "current-context"]
|
||||
try:
|
||||
result = subprocess.check_output(cmd, stderr=subprocess.DEVNULL)
|
||||
context = json.loads(result)
|
||||
questionary.print(
|
||||
f"BentoCloud already logged in: {context['endpoint']}", style="green"
|
||||
)
|
||||
output(f" BentoCloud already logged in: {context['endpoint']}", style="green")
|
||||
except subprocess.CalledProcessError:
|
||||
action = questionary.select(
|
||||
"BentoCloud not logged in",
|
||||
@@ -79,10 +78,9 @@ def _ensure_cloud_context():
|
||||
],
|
||||
).ask()
|
||||
if action is None:
|
||||
questionary.print("Cancelled", style=ERROR_STYLE)
|
||||
raise typer.Exit(1)
|
||||
elif action == "get an account in two minutes":
|
||||
questionary.print(
|
||||
output(
|
||||
"Please visit https://cloud.bentoml.com to get your token",
|
||||
style="yellow",
|
||||
)
|
||||
@@ -105,14 +103,13 @@ def _ensure_cloud_context():
|
||||
]
|
||||
try:
|
||||
result = subprocess.check_output(cmd)
|
||||
questionary.print("Logged in successfully", style="green")
|
||||
output(" Logged in successfully", style="green")
|
||||
except subprocess.CalledProcessError:
|
||||
questionary.print("Failed to login", style=ERROR_STYLE)
|
||||
output(" Failed to login", style=ERROR_STYLE)
|
||||
raise typer.Exit(1)
|
||||
|
||||
|
||||
def get_cloud_machine_spec():
|
||||
_ensure_cloud_context()
|
||||
cmd = ["bentoml", "deployment", "list-instance-types", "-o", "json"]
|
||||
try:
|
||||
result = subprocess.check_output(cmd, stderr=subprocess.DEVNULL)
|
||||
@@ -137,14 +134,6 @@ def get_cloud_machine_spec():
|
||||
|
||||
|
||||
def serve(bento: BentoInfo, target: DeploymentTarget):
|
||||
_ensure_cloud_context()
|
||||
ensure_cloud_context()
|
||||
cmd, env, cwd = _get_deploy_cmd(bento, target)
|
||||
run_command(cmd, env=env, cwd=cwd)
|
||||
|
||||
|
||||
def run(bento: BentoInfo, target: DeploymentTarget):
|
||||
questionary.print(
|
||||
"`run` with bentocloud is not supported yet, please use `serve` instead",
|
||||
style=ERROR_STYLE,
|
||||
)
|
||||
raise typer.Exit(1)
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
import functools
|
||||
import signal
|
||||
import io
|
||||
from collections import UserList
|
||||
import asyncio
|
||||
import hashlib
|
||||
import json
|
||||
@@ -55,7 +57,30 @@ class ContextVar(typing.Generic[T]):
|
||||
self._stack.pop()
|
||||
|
||||
|
||||
VERBOSE_LEVEL = ContextVar(0)
|
||||
VERBOSE_LEVEL = ContextVar(10)
|
||||
INTERACTIVE = ContextVar(True)
|
||||
FORCE = ContextVar(False)
|
||||
|
||||
|
||||
def output(content, level=0, style=None):
    """Print ``content`` through questionary unless its level is filtered out.

    Nothing is printed when ``level`` is greater than or equal to the current
    ``VERBOSE_LEVEL``. Strings are printed as-is; dicts and lists are rendered
    with ``pyaml`` first. Any other content type is ignored.

    Args:
        content: A ``str``, ``dict`` or ``list`` to display.
        level: Verbosity level of the message.
        style: Style string forwarded to ``questionary.print``.
    """
    if level >= VERBOSE_LEVEL.get():
        return

    if isinstance(content, str):
        questionary.print(content, style=style)
        return

    if isinstance(content, (dict, list)):
        import pyaml

        with io.StringIO() as buffer:
            pyaml.pprint(
                content,
                dst=buffer,
                sort_dicts=False,
                sort_keys=False,
            )
            # end="" because the rendered text is printed exactly as produced.
            questionary.print(buffer.getvalue(), style=style, end="")
|
||||
|
||||
|
||||
class Config(SimpleNamespace):
|
||||
@@ -202,6 +227,16 @@ class DeploymentTarget(SimpleNamespace):
|
||||
def __hash__(self):
|
||||
return hash(self.source)
|
||||
|
||||
@property
def accelerators_repr(self) -> str:
    """Return a short human-readable summary of ``self.accelerators``.

    Returns ``"null"`` when there are no accelerators, ``"<model> x<count>"``
    when they all share one model, and otherwise the comma-separated list of
    every accelerator's model.
    """
    models = [acc.model for acc in self.accelerators]
    distinct = set(models)
    if not distinct:
        return "null"
    if len(distinct) == 1:
        return f"{models[0]} x{len(models)}"
    return ", ".join(f"{name}" for name in models)
|
||||
|
||||
|
||||
def run_command(
|
||||
cmd,
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
import asyncio
|
||||
import signal
|
||||
import time
|
||||
import httpx
|
||||
|
||||
|
||||
@@ -41,20 +41,26 @@ def list_():
|
||||
)
|
||||
|
||||
|
||||
def list_bento(tag: typing.Optional[str] = None) -> typing.List[BentoInfo]:
|
||||
def list_bento(
|
||||
tag: typing.Optional[str] = None,
|
||||
repo_name: typing.Optional[str] = None,
|
||||
include_alias: bool = False,
|
||||
) -> typing.List[BentoInfo]:
|
||||
ensure_repo_updated()
|
||||
if not tag:
|
||||
glob_pattern = "bentoml/bentos/*/*"
|
||||
elif ":" in tag:
|
||||
repo_name, version = tag.split(":")
|
||||
glob_pattern = f"bentoml/bentos/{repo_name}/{version}"
|
||||
bento_name, version = tag.split(":")
|
||||
glob_pattern = f"bentoml/bentos/{bento_name}/{version}"
|
||||
else:
|
||||
glob_pattern = f"bentoml/bentos/{tag}/*"
|
||||
|
||||
model_list = []
|
||||
config = load_config()
|
||||
for repo_name, repo_url in config.repos.items():
|
||||
repo = parse_repo_url(repo_url, repo_name)
|
||||
for _repo_name, repo_url in config.repos.items():
|
||||
if repo_name is not None and _repo_name != repo_name:
|
||||
continue
|
||||
repo = parse_repo_url(repo_url, _repo_name)
|
||||
for path in repo.path.glob(glob_pattern):
|
||||
if path.is_dir() and (path / "bento.yaml").exists():
|
||||
model = BentoInfo(
|
||||
@@ -74,7 +80,7 @@ def list_bento(tag: typing.Optional[str] = None) -> typing.List[BentoInfo]:
|
||||
if model:
|
||||
model_list.append(model)
|
||||
model_list.sort(key=lambda x: x.tag)
|
||||
if VERBOSE_LEVEL.get() <= 0:
|
||||
if not include_alias:
|
||||
seen = set()
|
||||
model_list = [
|
||||
x
|
||||
|
||||
@@ -18,6 +18,7 @@ dependencies = [
|
||||
"pip_requirements_parser",
|
||||
"nvidia-ml-py",
|
||||
"dulwich",
|
||||
"tabulate",
|
||||
]
|
||||
|
||||
[project.scripts]
|
||||
|
||||
Reference in New Issue
Block a user