mirror of
https://github.com/bentoml/OpenLLM.git
synced 2025-12-23 23:57:46 -05:00
631 lines
20 KiB
Python
631 lines
20 KiB
Python
import typer
|
|
import typing
|
|
import collections
|
|
|
|
import prompt_toolkit
|
|
from prompt_toolkit import print_formatted_text as print
|
|
import time
|
|
import uuid
|
|
import shutil
|
|
import pydantic
|
|
from urllib.parse import urlparse
|
|
import yaml
|
|
import json
|
|
import bentoml
|
|
import questionary
|
|
import os
|
|
import re
|
|
import subprocess
|
|
import pyaml
|
|
import pathlib
|
|
from cllama.spec import GPU_MEMORY
|
|
|
|
ERROR_STYLE = "red"
|
|
SUCCESS_STYLE = "green"
|
|
|
|
|
|
CLLAMA_HOME = pathlib.Path.home() / ".openllm_next"
|
|
REPO_DIR = CLLAMA_HOME / "repos"
|
|
TEMP_DIR = CLLAMA_HOME / "temp"
|
|
VENV_DIR = CLLAMA_HOME / "venv"
|
|
|
|
REPO_DIR.mkdir(exist_ok=True, parents=True)
|
|
TEMP_DIR.mkdir(exist_ok=True, parents=True)
|
|
VENV_DIR.mkdir(exist_ok=True, parents=True)
|
|
|
|
CONFIG_FILE = CLLAMA_HOME / "config.json"
|
|
|
|
|
|
app = typer.Typer()
|
|
repo_app = typer.Typer()
|
|
model_app = typer.Typer()
|
|
|
|
app.add_typer(repo_app, name="repo")
|
|
app.add_typer(model_app, name="model")
|
|
|
|
|
|
class Config(pydantic.BaseModel):
|
|
repos: dict[str, str] = {
|
|
"default": "git+https://github.com/bojiang/bentovllm@main#subdirectory=bentoml"
|
|
}
|
|
default_repo: str = "default"
|
|
|
|
|
|
def _load_config():
|
|
if CONFIG_FILE.exists():
|
|
with open(CONFIG_FILE) as f:
|
|
return Config(**json.load(f))
|
|
return Config()
|
|
|
|
|
|
def _save_config(config):
|
|
with open(CONFIG_FILE, "w") as f:
|
|
json.dump(config.dict(), f, indent=2)
|
|
|
|
|
|
class ModelInfo(typing.TypedDict):
|
|
repo: str
|
|
path: str
|
|
|
|
|
|
def _load_model_map() -> dict[str, dict[str, ModelInfo]]:
|
|
model_map = collections.defaultdict(dict)
|
|
config = _load_config()
|
|
for repo_name, repo_url in config.repos.items():
|
|
server, owner, repo, _ = _parse_repo_url(repo_url)
|
|
repo_dir = REPO_DIR / server / owner / repo
|
|
for path in repo_dir.glob("bentoml/bentos/*/*"):
|
|
if path.is_dir():
|
|
model_map[path.parent.name][path.name] = ModelInfo(
|
|
repo=repo_name,
|
|
path=str(path),
|
|
)
|
|
elif path.is_file():
|
|
with open(path) as f:
|
|
origin_name = f.read().strip()
|
|
origin_path = path.parent / origin_name
|
|
model_map[path.parent.name][path.name] = ModelInfo(
|
|
repo=repo_name,
|
|
path=str(origin_path),
|
|
)
|
|
return model_map
|
|
|
|
|
|
GIT_REPO_RE = re.compile(
|
|
r"git\+https://(?P<server>.+)/(?P<owner>.+)/(?P<repo>.+?)(@(?P<branch>.+))?$"
|
|
)
|
|
|
|
|
|
@repo_app.command(name="list")
|
|
def repo_list():
|
|
config = _load_config()
|
|
pyaml.pprint(config.repos)
|
|
|
|
|
|
def _parse_repo_url(repo_url):
|
|
"""
|
|
parse the git repo url to server, owner, repo name, branch
|
|
>>> _parse_repo_url("git+https://github.com/bojiang/bentovllm@main")
|
|
('github.com', 'bojiang', 'bentovllm', 'main')
|
|
|
|
>>> _parse_repo_url("git+https://github.com/bojiang/bentovllm")
|
|
('github.com', 'bojiang', 'bentovllm', 'main')
|
|
"""
|
|
match = GIT_REPO_RE.match(repo_url)
|
|
if not match:
|
|
raise ValueError(f"Invalid git repo url: {repo_url}")
|
|
return (
|
|
match.group("server"),
|
|
match.group("owner"),
|
|
match.group("repo"),
|
|
match.group("branch") or "main",
|
|
)
|
|
|
|
|
|
@repo_app.command(name="add")
|
|
def repo_add(name: str, repo: str):
|
|
name = name.lower()
|
|
if not name.isidentifier():
|
|
questionary.print(
|
|
f"Invalid repo name: {name}, should only contain letters, numbers and underscores",
|
|
style=ERROR_STYLE,
|
|
)
|
|
return
|
|
|
|
config = _load_config()
|
|
if name in config.repos:
|
|
override = questionary.confirm(
|
|
f"Repo {name} already exists({config.repos[name]}), override?"
|
|
).ask()
|
|
if not override:
|
|
return
|
|
|
|
config.repos[name] = repo
|
|
_save_config(config)
|
|
pyaml.pprint(config.repos)
|
|
|
|
|
|
@repo_app.command(name="remove")
|
|
def repo_remove(name: str):
|
|
config = _load_config()
|
|
if name not in config.repos:
|
|
questionary.print(f"Repo {name} does not exist", style=ERROR_STYLE)
|
|
return
|
|
|
|
del config.repos[name]
|
|
_save_config(config)
|
|
pyaml.pprint(config.repos)
|
|
|
|
|
|
def _run_command(cmd, cwd=None):
|
|
questionary.print(f"\n$ {' '.join(cmd)}", style="bold")
|
|
subprocess.run(cmd, cwd=cwd, check=True)
|
|
|
|
|
|
@repo_app.command(name="update")
|
|
def repo_update():
|
|
config = _load_config()
|
|
repos_in_use = set()
|
|
for name, repo in config.repos.items():
|
|
server, owner, repo_name, branch = _parse_repo_url(repo)
|
|
repos_in_use.add((server, owner, repo_name))
|
|
repo_dir = REPO_DIR / server / owner / repo_name
|
|
if not repo_dir.exists():
|
|
repo_dir.parent.mkdir(parents=True, exist_ok=True)
|
|
try:
|
|
cmd = [
|
|
"git",
|
|
"clone",
|
|
"--branch",
|
|
branch,
|
|
f"https://{server}/{owner}/{repo_name}.git",
|
|
str(repo_dir),
|
|
]
|
|
_run_command(cmd)
|
|
except subprocess.CalledProcessError:
|
|
shutil.rmtree(repo_dir, ignore_errors=True)
|
|
questionary.print(f"Failed to clone repo {name}", style=ERROR_STYLE)
|
|
else:
|
|
try:
|
|
cmd = ["git", "fetch", "origin", branch]
|
|
_run_command(cmd, cwd=repo_dir)
|
|
cmd = ["git", "reset", "--hard", f"origin/{branch}"]
|
|
_run_command(cmd, cwd=repo_dir)
|
|
except:
|
|
shutil.rmtree(repo_dir, ignore_errors=True)
|
|
questionary.print(f"Failed to update repo {name}", style=ERROR_STYLE)
|
|
for repo_dir in REPO_DIR.glob("*/*/*"):
|
|
if tuple(repo_dir.parts[-3:]) not in repos_in_use:
|
|
shutil.rmtree(repo_dir, ignore_errors=True)
|
|
questionary.print(f"Removed unused repo {repo_dir}")
|
|
questionary.print("Repos updated", style=SUCCESS_STYLE)
|
|
|
|
|
|
@model_app.command(name="list")
|
|
def model_list():
|
|
pyaml.pprint(_load_model_map())
|
|
|
|
|
|
def _get_bento_info(tag):
|
|
model_map = _load_model_map()
|
|
bento, version = tag.split(":")
|
|
if bento not in model_map or version not in model_map[bento]:
|
|
questionary.print(f"Model {tag} not found", style=ERROR_STYLE)
|
|
return
|
|
model_info = model_map[bento][version]
|
|
repo_name = model_info["repo"]
|
|
path = pathlib.Path(model_info["path"])
|
|
|
|
bento_file = path / "bento.yaml"
|
|
bento_info = yaml.safe_load(bento_file.read_text())
|
|
return bento_info
|
|
|
|
|
|
@model_app.command(name="get")
|
|
def model_get(tag: str):
|
|
bento_info = _get_bento_info(tag)
|
|
if bento_info:
|
|
pyaml.pprint(bento_info)
|
|
|
|
|
|
def _filter_instance_types(
|
|
instance_types,
|
|
gpu_count,
|
|
gpu_memory=None,
|
|
gpu_type=None,
|
|
level="match",
|
|
):
|
|
if gpu_memory is None:
|
|
if gpu_type is None:
|
|
raise ValueError("Either gpu_memory or gpu_type must be provided")
|
|
gpu_memory = GPU_MEMORY[gpu_type]
|
|
|
|
def _check_instance(spec):
|
|
if gpu_count == 0 or gpu_count is None:
|
|
if "GpuInfo" in spec:
|
|
return False
|
|
else:
|
|
return True
|
|
else:
|
|
gpus = spec.get("GpuInfo", {}).get("Gpus", [])
|
|
if len(gpus) == 0:
|
|
return False
|
|
it_gpu = gpus[0]
|
|
it_gpu_mem = it_gpu["MemoryInfo"]["SizeInMiB"] / 1024
|
|
|
|
if it_gpu["Count"] == gpu_count and it_gpu_mem == gpu_memory:
|
|
return True
|
|
elif it_gpu["Count"] >= gpu_count and it_gpu_mem >= gpu_memory:
|
|
if level == "match":
|
|
return False
|
|
elif level == "usable":
|
|
return True
|
|
else:
|
|
assert False
|
|
else:
|
|
return False
|
|
|
|
def _sort_key(spec):
|
|
return (
|
|
spec["InstanceType"].split(".")[0],
|
|
spec.get("GpuInfo", {}).get("Gpus", [{}])[0].get("Count", 0),
|
|
spec.get("VCpuInfo", {}).get("DefaultVCpus", 0),
|
|
spec.get("MemoryInfo", {}).get("SizeInMiB", 0),
|
|
)
|
|
|
|
return sorted(filter(_check_instance, instance_types), key=_sort_key)
|
|
|
|
|
|
def _resolve_git_package(package):
|
|
match = REG_GITPACKAGE.match(package)
|
|
if not match:
|
|
raise ValueError(f"Invalid git package: {package}")
|
|
repo_url, branch, subdirectory = match.groups()
|
|
parsed = urlparse(repo_url)
|
|
|
|
path_parts = [parsed.netloc] + parsed.path.split("/")
|
|
|
|
return repo_url, branch, subdirectory, path_parts
|
|
|
|
|
|
def _get_it_card(spec):
|
|
"""
|
|
InstanceType: g4dn.2xlarge
|
|
VCpuInfo:
|
|
DefaultCores: 32
|
|
DefaultThreadsPerCore: 2
|
|
DefaultVCpus: 64
|
|
|
|
MemoryInfo:
|
|
SizeInMiB: 32768
|
|
|
|
GpuInfo:
|
|
Gpus:
|
|
- Count: 1
|
|
Manufacturer: NVIDIA
|
|
MemoryInfo:
|
|
SizeInMiB: 16384
|
|
Name: T4
|
|
TotalGpuMemoryInMiB: 16384
|
|
"""
|
|
return f"cpus: {spec['VCpuInfo']['DefaultVCpus']}, mem: {spec['MemoryInfo']['SizeInMiB']}, gpu: {spec['GpuInfo']['Gpus'][0]['Name']} x {spec['GpuInfo']['Gpus'][0]['Count']}, cost: $0.1/hour"
|
|
|
|
|
|
def _ensure_aws_security_group(group_name="cllama-http-default"):
|
|
try:
|
|
existing_groups = subprocess.check_output(
|
|
[
|
|
"aws",
|
|
"ec2",
|
|
"describe-security-groups",
|
|
"--filters",
|
|
f"Name=group-name,Values={group_name}",
|
|
"--no-cli-pager",
|
|
]
|
|
)
|
|
existing_groups = json.loads(existing_groups)
|
|
if existing_groups["SecurityGroups"]:
|
|
return existing_groups["SecurityGroups"][0]["GroupId"]
|
|
|
|
result = subprocess.check_output(
|
|
[
|
|
"aws",
|
|
"ec2",
|
|
"create-security-group",
|
|
"--group-name",
|
|
group_name,
|
|
"--description",
|
|
"Default VPC security group for cllama services",
|
|
"--no-cli-pager",
|
|
]
|
|
)
|
|
result = json.loads(result)
|
|
security_group_id = result["GroupId"]
|
|
|
|
subprocess.check_call(
|
|
[
|
|
"aws",
|
|
"ec2",
|
|
"authorize-security-group-ingress",
|
|
"--group-id",
|
|
security_group_id,
|
|
"--protocol",
|
|
"tcp",
|
|
"--port",
|
|
"80",
|
|
"--cidr",
|
|
"0.0.0.0/0",
|
|
"--no-cli-pager",
|
|
]
|
|
)
|
|
subprocess.check_call(
|
|
[
|
|
"aws",
|
|
"ec2",
|
|
"authorize-security-group-ingress",
|
|
"--group-id",
|
|
security_group_id,
|
|
"--protocol",
|
|
"tcp",
|
|
"--port",
|
|
"443",
|
|
"--cidr",
|
|
"0.0.0.0/0",
|
|
"--no-cli-pager",
|
|
]
|
|
)
|
|
subprocess.check_call(
|
|
[
|
|
"aws",
|
|
"ec2",
|
|
"authorize-security-group-ingress",
|
|
"--group-id",
|
|
security_group_id,
|
|
"--protocol",
|
|
"tcp",
|
|
"--port",
|
|
"22",
|
|
"--cidr",
|
|
"0.0.0.0/0",
|
|
"--no-cli-pager",
|
|
]
|
|
)
|
|
return security_group_id
|
|
except subprocess.CalledProcessError as e:
|
|
raise RuntimeError(f"Failed to create security group: {e}")
|
|
|
|
|
|
@app.command()
|
|
def serve(model: str, tag: str = "latest", force_rebuild: bool = False):
|
|
if ":" in model:
|
|
model, tag = model.split(":")
|
|
if tag == "latest":
|
|
tag = next(iter(MODEL_INFOS[model].keys()))
|
|
|
|
package = MODEL_INFOS[model][tag]
|
|
repo, branch, subdirectory, path_parts = _resolve_git_package(package)
|
|
repo_dir = REPO_DIR.joinpath(*path_parts)
|
|
bento_project_dir = repo_dir / subdirectory
|
|
|
|
if force_rebuild:
|
|
shutil.rmtree(repo_dir, ignore_errors=True)
|
|
|
|
if not repo_dir.exists():
|
|
repo_dir.parent.mkdir(parents=True, exist_ok=True)
|
|
try:
|
|
cmd = ["git", "clone", "--branch", branch, repo, str(repo_dir)]
|
|
print(f"\n$ {' '.join(cmd)}")
|
|
subprocess.run(cmd, check=True)
|
|
except:
|
|
shutil.rmtree(repo_dir, ignore_errors=True)
|
|
raise
|
|
|
|
bento_info = _get_bento_info(f"{model}:{tag}", bento_project_dir)
|
|
|
|
if len(bento_info["services"]) != 1:
|
|
raise ValueError("Only support one service currently")
|
|
|
|
envs = {}
|
|
if len(bento_info.get("envs", [])) > 0:
|
|
for env in bento_info["envs"]:
|
|
if env["name"] == "CLLAMA_MODEL":
|
|
envs[env["name"]] = f"{model}:{tag}"
|
|
continue
|
|
if env["name"] in os.environ:
|
|
value = os.environ.get(env["name"])
|
|
questionary.print(f"Using environment value for {env['name']}")
|
|
elif env.get("value"):
|
|
value = questionary.text(
|
|
f"Enter value for {env['name']}",
|
|
default=env["value"],
|
|
).ask()
|
|
else:
|
|
value = questionary.text(
|
|
f"Enter value for {env['name']}",
|
|
).ask()
|
|
envs[env["name"]] = value
|
|
|
|
cloud_provider = questionary.select(
|
|
"Select a cloud provider",
|
|
choices=[
|
|
questionary.Choice(title="Local", value="aws"),
|
|
questionary.Choice(title="BentoCloud", value="cloud"),
|
|
],
|
|
).ask()
|
|
|
|
if cloud_provider == "cloud":
|
|
cloud_provider = questionary.select(
|
|
"You haven't logged in to BentoCloud, select an action",
|
|
choices=[
|
|
questionary.Choice(title="Login with Token", value="login"),
|
|
questionary.Choice(title="Sign up ($10 free credit)", value="signup"),
|
|
],
|
|
).ask()
|
|
if cloud_provider == "login":
|
|
token = questionary.text("Enter your token").ask()
|
|
cmd = ["bentoml", "cloud", "login", "--token", token]
|
|
# print(f"\n$ {' '.join(cmd)}")
|
|
try:
|
|
subprocess.check_call(cmd)
|
|
except subprocess.CalledProcessError:
|
|
raise RuntimeError("Failed to login")
|
|
elif cloud_provider == "signup":
|
|
token = questionary.text(
|
|
"Open https://cloud.bentoml.org/signup in your browser",
|
|
).ask()
|
|
# cmd = ["bentoml", "cloud", "signup"]
|
|
# print(f"\n$ {' '.join(cmd)}")
|
|
# try:
|
|
# subprocess.check_call(cmd)
|
|
# except subprocess.CalledProcessError:
|
|
# raise RuntimeError("Failed to sign up")
|
|
|
|
elif cloud_provider == "aws":
|
|
try:
|
|
cmd = ["aws", "ec2", "describe-instance-types", "--no-cli-pager"]
|
|
print(f"\n$ {' '.join(cmd)}")
|
|
_instance_types = subprocess.check_output(cmd, text=True)
|
|
except subprocess.CalledProcessError:
|
|
raise
|
|
# print(e)
|
|
# _cli_install_aws()
|
|
available_it_infos = json.loads(_instance_types)["InstanceTypes"]
|
|
# pyaml.p(available_it_infos)
|
|
|
|
service = bento_info["services"][0]
|
|
if "config" not in service or "resources" not in service["config"]:
|
|
raise ValueError("Service config is missing")
|
|
elif "gpu" in service["config"]["resources"]:
|
|
gpu_count = service["config"]["resources"]["gpu"]
|
|
gpu_type = service["config"]["resources"].get("gpu_type")
|
|
gpu_memory = service["config"]["resources"].get("gpu_memory")
|
|
supported_its = _filter_instance_types(
|
|
available_it_infos,
|
|
gpu_count,
|
|
gpu_memory,
|
|
gpu_type,
|
|
)
|
|
it = questionary.select(
|
|
"Select an instance type",
|
|
choices=[
|
|
questionary.Choice(
|
|
title=_get_it_card(it_info),
|
|
value=it_info["InstanceType"],
|
|
)
|
|
for it_info in supported_its
|
|
],
|
|
).ask()
|
|
security_group_id = _ensure_aws_security_group()
|
|
AMI = "ami-02623cf022763d4a1"
|
|
|
|
init_script_file = TEMP_DIR / f"init_script_{str(uuid.uuid4())[:8]}.sh"
|
|
with open(init_script_file, "w") as f:
|
|
f.write(
|
|
INIT_SCRIPT_TEMPLATE.format(
|
|
repo=repo,
|
|
subdirectory=subdirectory,
|
|
model=model,
|
|
tag=tag,
|
|
env_args=" ".join([f"-e {k}={v}" for k, v in envs.items()]),
|
|
)
|
|
)
|
|
# grant permission
|
|
os.chmod(init_script_file, 0o755)
|
|
cmd = [
|
|
"aws",
|
|
"ec2",
|
|
"run-instances",
|
|
"--image-id",
|
|
AMI,
|
|
"--instance-type",
|
|
it,
|
|
"--security-group-ids",
|
|
security_group_id,
|
|
"--user-data",
|
|
f"file://{init_script_file}",
|
|
"--key-name",
|
|
"jiang",
|
|
"--count",
|
|
"1",
|
|
"--no-cli-pager",
|
|
]
|
|
# print(f"\n$ {' '.join(cmd)}")
|
|
try:
|
|
result = subprocess.check_output(cmd)
|
|
except subprocess.CalledProcessError:
|
|
raise RuntimeError("Failed to create instance")
|
|
result = json.loads(result)
|
|
instance_id = result["Instances"][0]["InstanceId"]
|
|
print(f"Deployment {instance_id} is created")
|
|
|
|
cmd = [
|
|
"aws",
|
|
"ec2",
|
|
"describe-instances",
|
|
"--instance-ids",
|
|
instance_id,
|
|
"--no-cli-pager",
|
|
]
|
|
# print(f"\n$ {' '.join(cmd)}")
|
|
result = subprocess.check_output(cmd)
|
|
result = json.loads(result)
|
|
public_ip = result["Reservations"][0]["Instances"][0]["PublicIpAddress"]
|
|
print(f"Public IP: {public_ip}")
|
|
|
|
server_start_time = time.time()
|
|
print("Server is starting...")
|
|
with prompt_toolkit.shortcuts.ProgressBar() as pb:
|
|
for _ in pb(range(100)):
|
|
start_time = time.time()
|
|
try:
|
|
with bentoml.SyncHTTPClient(f"http://{public_ip}"):
|
|
break
|
|
except Exception:
|
|
time.sleep(max(0, 6 - (time.time() - start_time)))
|
|
else:
|
|
raise RuntimeError("Instance is not ready after 10 minutes")
|
|
print(f"Server started in {time.time() - server_start_time:.2f} seconds")
|
|
print(f"HTTP server is ready at http://{public_ip}")
|
|
return
|
|
else:
|
|
raise ValueError("GPU is required for now")
|
|
if cloud_provider == "bentocloud":
|
|
cmd = ["bentoml", "cloud", "current-context"]
|
|
# print(f"\n$ {' '.join(cmd)}")
|
|
try:
|
|
output = subprocess.check_output(cmd, text=True)
|
|
except subprocess.CalledProcessError:
|
|
raise RuntimeError(
|
|
"Failed to get bentocloud login context, please login first",
|
|
)
|
|
|
|
|
|
@app.command()
|
|
def run(model: str, tag: str = "latest", force_rebuild: bool = False):
|
|
serve(model, tag, force_rebuild)
|
|
|
|
|
|
INIT_SCRIPT_TEMPLATE = """#!/bin/bash
|
|
pip3 install bentoml
|
|
rm -r /usr/local/cuda*
|
|
git clone {repo} /root/bento_repo
|
|
export BENTOML_HOME=/root/bento_repo/{subdirectory}
|
|
bentoml containerize {model}:{tag} --image-tag {model}:{tag}
|
|
docker run --restart always --gpus all -d -p 80:3000 {env_args} {model}:{tag}
|
|
|
|
nvidia-smi -q | grep -A2 "ECC Mode" | grep "Current" | grep "Enabled"
|
|
ECC_ENABLED=$?
|
|
|
|
if [[ $ECC_ENABLED -eq 0 ]]; then
|
|
echo "ECC is enabled. Disabling now..."
|
|
nvidia-smi -e 0
|
|
reboot
|
|
else
|
|
echo "ECC is not enabled. No changes made."
|
|
fi
|
|
"""
|
|
|
|
|
|
if __name__ == "__main__":
|
|
app()
|