mirror of https://github.com/bentoml/OpenLLM.git (synced 2025-12-23 23:57:46 -05:00)
chore: cleanup code and env requirements
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
2 .github/workflows/dependabot-auto-merge.yml (vendored)
@@ -12,7 +12,7 @@ jobs:
     steps:
       - name: Dependabot metadata
         id: metadata
-        uses: dependabot/fetch-metadata@v2.3.0
+        uses: dependabot/fetch-metadata@d7267f607e9d3fb96fc2fbe83e0af444713e90b7 # ratchet:dependabot/fetch-metadata@v2.3.0
         with:
           github-token: "${{ secrets.GITHUB_TOKEN }}"
       - name: Enable auto-merge for Dependabot PRs
35 .github/workflows/tests.yml (vendored, new file)
@@ -0,0 +1,35 @@
+name: Run Tests
+
+on:
+  push:
+    branches: [main, master]
+  pull_request:
+    branches: [main, master]
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.9", "3.12"]
+
+    steps:
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # ratchet:actions/checkout@v4
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@65d7f2d534ac1bc67fcd62888c5f4f3d2cb2b236 # ratchet:actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install uv
+        run: |
+          pip install uv
+
+      - name: Install dependencies with uv
+        run: |
+          uv pip install -e .
+          uv pip install pytest pexpect
+
+      - name: Run tests
+        run: |
+          pytest tests -v
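
The workflow installs pytest and pexpect, but the tests themselves sit outside this diff. A minimal sketch of the kind of interactive CLI check those two dependencies point at; the test body and the `-m openllm` invocation are assumptions, not code from the repo:

# Hypothetical smoke test -- the tests/ content is not part of this commit.
import sys

import pexpect


def test_cli_help_smoke() -> None:
  # Spawn the CLI via the active interpreter so the `uv pip install -e .` env is used.
  child = pexpect.spawn(sys.executable, ['-m', 'openllm', '--help'], encoding='utf-8', timeout=60)
  child.expect('openllm')  # the help text should mention the program name
  child.expect(pexpect.EOF)
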
1 .gitignore (vendored)
@@ -163,3 +163,4 @@ cython_debug/
 venv/
 .envrc
 _version.py
+.cursor
.pre-commit-config.yaml
@@ -7,7 +7,7 @@ default_language_version:
   python: python3.11 # NOTE: sync with .python-version-default
 repos:
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: "v0.11.2"
+    rev: "v0.11.4"
     hooks:
       - id: ruff
         alias: r
15 .ruff.toml
@@ -1,7 +1,7 @@
 extend-include = ["*.ipynb"]
 preview = true
-line-length = 119
-indent-width = 4
+line-length = 100
+indent-width = 2

 [format]
 preview = true
@@ -18,21 +18,16 @@ ignore = [
 ]
 select = [
   "F",
   "G",    # flake8-logging-format
   "PERF", # perflint
   "RUF",  # Ruff-specific rules
   "W6",
   "E71",
   "E72",
   "E112",
   "E113",
-  # "E124",
   "E203",
   "E272",
-  # "E303",
-  # "E304",
-  # "E501",
-  # "E502",
   "E702",
   "E703",
   "E731",
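
For reference, the kinds of code the retained pycodestyle selections reject; an illustrative snippet, not taken from the repo:

x = 1; y = 2  # E702: multiple statements on one line (semicolon)
square = lambda n: n * n  # E731: assigning a lambda instead of defining a function
if x == None:  # E711 (in the selected E71 family): comparison to None should use `is None`
  pass
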
@@ -9,30 +9,30 @@
 import subprocess, sys, pathlib, json, jinja2

 if __name__ == '__main__':
-    with (pathlib.Path('.').parent / 'README.md').open('w') as f:
-        f.write(
-            jinja2.Environment(loader=jinja2.FileSystemLoader('.'))
-            .get_template('README.md.tpl')
-            .render(
-                model_dict=json.loads(
-                    subprocess.run(
-                        [
-                            sys.executable,
-                            '-m',
-                            'uv',
-                            'run',
-                            '--with-editable',
-                            '.',
-                            'openllm',
-                            'model',
-                            'list',
-                            '--output',
-                            'readme',
-                        ],
-                        text=True,
-                        check=True,
-                        capture_output=True,
-                    ).stdout.strip()
-                )
-            )
-        )
+  with (pathlib.Path('.').parent / 'README.md').open('w') as f:
+    f.write(
+      jinja2.Environment(loader=jinja2.FileSystemLoader('.'))
+      .get_template('README.md.tpl')
+      .render(
+        model_dict=json.loads(
+          subprocess.run(
+            [
+              sys.executable,
+              '-m',
+              'uv',
+              'run',
+              '--with-editable',
+              '.',
+              'openllm',
+              'model',
+              'list',
+              '--output',
+              'readme',
+            ],
+            text=True,
+            check=True,
+            capture_output=True,
+          ).stdout.strip()
+        )
+      )
+    )
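
A condensed sketch of the same render pipeline, with an inline template standing in for README.md.tpl and a made-up model_dict; the real data comes from `openllm model list --output readme`:

import jinja2

# Inline stand-in for README.md.tpl; the real template ships in the repo.
template = jinja2.Environment().from_string(
  '{% for name, info in model_dict.items() %}- {{ name }}: {{ info }}\n{% endfor %}'
)
# Hypothetical payload, for illustration only.
print(template.render(model_dict={'llama3.3': '70b-instruct'}))
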
pyproject.toml
@@ -43,6 +43,7 @@ dependencies = [
   "uv",
   "openai==1.70.0",
   "huggingface-hub",
+  "hf-xet",
   "typing-extensions>=4.12.2",
 ]
 keywords = [
@@ -87,6 +88,12 @@ src-dir = "src/openllm"
 requires = ["hatchling==1.27.0", "hatch-vcs==0.4.0"]
 build-backend = 'hatchling.build'

+[dependency-groups]
+tests = [
+  "pexpect>=4.9.0",
+  "pytest>=8.3.5",
+]
+
 [tool.hatch.version]
 source = "vcs"
 fallback-version = "0.0.0"
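
The new [dependency-groups] table is standard PEP 735 metadata; one way to read it back with the stdlib (Python 3.11+):

import tomllib

with open('pyproject.toml', 'rb') as f:
  data = tomllib.load(f)
print(data['dependency-groups']['tests'])  # ['pexpect>=4.9.0', 'pytest>=8.3.5']
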
@@ -14,12 +14,12 @@ from openllm.model import app as model_app, ensure_bento, list_bento
 from openllm.repo import app as repo_app, cmd_update

 if typing.TYPE_CHECKING:
   from openllm.common import DeploymentTarget

 app = OpenLLMTyper(
   help='`openllm hello` to get started. '
   'OpenLLM is a CLI tool to manage and deploy open source LLMs and'
   ' get an OpenAI API compatible chat server in seconds.'
 )

 app.add_typer(repo_app, name='repo')
@@ -28,263 +28,274 @@ app.add_typer(clean_app, name='clean')


 def _select_bento_name(models: list[BentoInfo], target: DeploymentTarget) -> tuple[str, str]:
   from tabulate import tabulate

   model_infos = [(model.repo.name, model.name, can_run(model, target)) for model in models]
   model_name_groups: defaultdict[tuple[str, str], float] = defaultdict(lambda: 0.0)
   for repo, name, score in model_infos:
     model_name_groups[repo, name] += score
-  table_data = [(name, repo, CHECKED if score > 0 else '') for (repo, name), score in model_name_groups.items()]
+  table_data = [
+    (name, repo, CHECKED if score > 0 else '') for (repo, name), score in model_name_groups.items()
+  ]
   if not table_data:
     output('No model found', style='red')
     raise typer.Exit(1)
   table: list[str] = tabulate(table_data, headers=['model', 'repo', 'locally runnable']).split('\n')

   selected: tuple[str, str] | None = questionary.select(
     'Select a model',
     [
       questionary.Separator(f'{table[0]}\n {table[1]}'),
       *[questionary.Choice(line, value=value[:2]) for value, line in zip(table_data, table[2:])],
     ],
   ).ask()
   if selected is None:
     raise typer.Exit(1)
   return selected


 def _select_bento_version(
   models: list[BentoInfo], target: DeploymentTarget | None, bento_name: str, repo: str
 ) -> tuple[BentoInfo, float]:
   from tabulate import tabulate

-  model_infos: list[tuple[BentoInfo, float]] = [
-    (model, can_run(model, target)) for model in models if model.name == bento_name and model.repo.name == repo
-  ]
+  model_infos: list[tuple[BentoInfo, float]] = [
+    (model, can_run(model, target))
+    for model in models
+    if model.name == bento_name and model.repo.name == repo
+  ]

   table_data = [
     [model.tag, CHECKED if score > 0 else '']
     for model, score in model_infos
     if model.name == bento_name and model.repo.name == repo
   ]
   if not table_data:
     output(f'No model found for {bento_name} in {repo}', style='red')
     raise typer.Exit(1)
   table: list[str] = tabulate(table_data, headers=['version', 'locally runnable']).split('\n')

   selected: tuple[BentoInfo, float] | None = questionary.select(
     'Select a version',
     [
       questionary.Separator(f'{table[0]}\n {table[1]}'),
       *[questionary.Choice(line, value=value[:2]) for value, line in zip(model_infos, table[2:])],
     ],
   ).ask()
   if selected is None:
     raise typer.Exit(1)
   return selected


 def _select_target(bento: BentoInfo, targets: list[DeploymentTarget]) -> DeploymentTarget:
   from tabulate import tabulate

   targets.sort(key=lambda x: can_run(bento, x), reverse=True)
   if not targets:
     output('No available instance type, check your bentocloud account', style='red')
     raise typer.Exit(1)

   table = tabulate(
     [
       [
         target.name,
         target.accelerators_repr,
         f'${target.price}',
         CHECKED if can_run(bento, target) else 'insufficient res.',
       ]
       for target in targets
     ],
     headers=['instance type', 'accelerator', 'price/hr', 'deployable'],
   ).split('\n')

   selected: DeploymentTarget | None = questionary.select(
     'Select an instance type',
     [
       questionary.Separator(f'{table[0]}\n {table[1]}'),
       *[questionary.Choice(f'{line}', value=target) for target, line in zip(targets, table[2:])],
     ],
   ).ask()
   if selected is None:
     raise typer.Exit(1)
   return selected


 def _select_action(bento: BentoInfo, score: float) -> None:
   if score > 0:
     options: list[typing.Any] = [
       questionary.Separator('Available actions'),
       questionary.Choice('0. Run the model in terminal', value='run', shortcut_key='0'),
       questionary.Separator(f' $ openllm run {bento}'),
       questionary.Separator(' '),
-      questionary.Choice('1. Serve the model locally and get a chat server', value='serve', shortcut_key='1'),
+      questionary.Choice(
+        '1. Serve the model locally and get a chat server', value='serve', shortcut_key='1'
+      ),
       questionary.Separator(f' $ openllm serve {bento}'),
       questionary.Separator(' '),
-      questionary.Choice(
-        '2. Deploy the model to bentocloud and get a scalable chat server', value='deploy', shortcut_key='2'
-      ),
+      questionary.Choice(
+        '2. Deploy the model to bentocloud and get a scalable chat server',
+        value='deploy',
+        shortcut_key='2',
+      ),
       questionary.Separator(f' $ openllm deploy {bento}'),
     ]
   else:
     options = [
       questionary.Separator('Available actions'),
       questionary.Choice(
         '0. Run the model in terminal', value='run', disabled='insufficient res.', shortcut_key='0'
       ),
       questionary.Separator(f' $ openllm run {bento}'),
       questionary.Separator(' '),
       questionary.Choice(
         '1. Serve the model locally and get a chat server',
         value='serve',
         disabled='insufficient res.',
         shortcut_key='1',
       ),
       questionary.Separator(f' $ openllm serve {bento}'),
       questionary.Separator(' '),
-      questionary.Choice(
-        '2. Deploy the model to bentocloud and get a scalable chat server', value='deploy', shortcut_key='2'
-      ),
+      questionary.Choice(
+        '2. Deploy the model to bentocloud and get a scalable chat server',
+        value='deploy',
+        shortcut_key='2',
+      ),
       questionary.Separator(f' $ openllm deploy {bento}'),
     ]
   action: str | None = questionary.select('Select an action', options).ask()
   if action is None:
     raise typer.Exit(1)
   if action == 'run':
     try:
       port = random.randint(30000, 40000)
       local_run(bento, port=port)
     finally:
       output('\nUse this command to run the action again:', style='green')
       output(f' $ openllm run {bento}', style='orange')
   elif action == 'serve':
     try:
       local_serve(bento)
     finally:
       output('\nUse this command to run the action again:', style='green')
       output(f' $ openllm serve {bento}', style='orange')
   elif action == 'deploy':
     ensure_cloud_context()
     targets = get_cloud_machine_spec()
     target = _select_target(bento, targets)
     try:
       cloud_deploy(bento, target)
     finally:
       output('\nUse this command to run the action again:', style='green')
       output(f' $ openllm deploy {bento} --instance-type {target.name}', style='orange')


 @app.command(help='get started interactively')
-def hello() -> None:
+def hello(repo: typing.Optional[str] = None) -> None:
+  cmd_update()
   INTERACTIVE.set(True)

   target = get_local_machine_spec()
   output(f' Detected Platform: {target.platform}', style='green')
   if target.accelerators:
     output(' Detected Accelerators: ', style='green')
     for a in target.accelerators:
       output(f' - {a.model} {a.memory_size}GB', style='green')
   else:
-    output(' Detected Accelerators: None', style='yellow')
+    output(' Detected Accelerators: None', style='green')

-  models = list_bento()
+  models = list_bento(repo_name=repo)
   if not models:
     output('No model found, you probably need to update the model repo:', style='red')
     output(' $ openllm repo update', style='orange')
     raise typer.Exit(1)

   bento_name, repo = _select_bento_name(models, target)
   bento, score = _select_bento_version(models, target, bento_name, repo)
   _select_action(bento, score)


 @app.command(help='start an OpenAI API compatible chat server and chat in browser')
 def serve(
   model: typing.Annotated[str, typer.Argument()] = '',
   repo: typing.Optional[str] = None,
   port: int = 3000,
   verbose: bool = False,
 ) -> None:
   cmd_update()
   if verbose:
     VERBOSE_LEVEL.set(20)
   target = get_local_machine_spec()
   bento = ensure_bento(model, target=target, repo_name=repo)
   local_serve(bento, port=port)


 @app.command(help='run the model and chat in terminal')
 def run(
   model: typing.Annotated[str, typer.Argument()] = '',
   repo: typing.Optional[str] = None,
   port: typing.Optional[int] = None,
   timeout: int = 600,
   verbose: bool = False,
 ) -> None:
   cmd_update()
   if verbose:
     VERBOSE_LEVEL.set(20)
   target = get_local_machine_spec()
   bento = ensure_bento(model, target=target, repo_name=repo)
   if port is None:
     port = random.randint(30000, 40000)
   local_run(bento, port=port, timeout=timeout)


 @app.command(help='deploy production-ready OpenAI API-compatible server to BentoCloud')
 def deploy(
   model: typing.Annotated[str, typer.Argument()] = '',
   instance_type: typing.Optional[str] = None,
   repo: typing.Optional[str] = None,
   verbose: bool = False,
   env: typing.Optional[list[str]] = typer.Option(
     None,
     '--env',
     help='Environment variables to pass to the deployment command. Format: NAME or NAME=value. Can be specified multiple times.',
   ),
 ) -> None:
   cmd_update()
   if verbose:
     VERBOSE_LEVEL.set(20)
   bento = ensure_bento(model, repo_name=repo)
   if instance_type is not None:
     return cloud_deploy(bento, DeploymentTarget(accelerators=[], name=instance_type), cli_envs=env)
   targets = sorted(
     filter(lambda x: can_run(bento, x) > 0, get_cloud_machine_spec()),
     key=lambda x: can_run(bento, x),
     reverse=True,
   )
   if not targets:
     output('No available instance type, check your bentocloud account', style='red')
     raise typer.Exit(1)
   target = targets[0]
   output(f'Recommended instance type: {target.name}', style='green')
   cloud_deploy(bento, target, cli_envs=env)


 @app.callback(invoke_without_command=True)
 def typer_callback(
   verbose: int = 0,
   do_not_track: bool = typer.Option(
     False, '--do-not-track', help='Whether to disable usage tracking', envvar=DO_NOT_TRACK
   ),
   version: bool = typer.Option(False, '--version', '-v', help='Show version'),
 ) -> None:
   if verbose:
     VERBOSE_LEVEL.set(verbose)
   if version:
     output(
       f'openllm, {importlib.metadata.version("openllm")}\nPython ({platform.python_implementation()}) {platform.python_version()}'
     )
     sys.exit(0)
   if do_not_track:
     os.environ[DO_NOT_TRACK] = str(True)


 if __name__ == '__main__':
   app()
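
All three selection helpers above share one pattern: render rows with tabulate, then feed the rendered lines to questionary as choices whose values are the underlying objects. A standalone toy version of that pattern, with fake rows instead of the repo's BentoInfo objects:

# Sketch of the tabulate + questionary pattern used throughout this file (toy data).
import questionary
from tabulate import tabulate

rows = [('llama3.3', 'default', 'yes'), ('phi4', 'default', '')]
table = tabulate(rows, headers=['model', 'repo', 'locally runnable']).split('\n')
choice = questionary.select(
  'Select a model',
  [
    questionary.Separator(f'{table[0]}\n {table[1]}'),  # header rows shown as a separator
    *[questionary.Choice(line, value=row[:2]) for row, line in zip(rows, table[2:])],
  ],
).ask()
print(choice)
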
@@ -9,129 +9,141 @@ from openllm.common import BentoInfo, DeploymentTarget, output, Accelerator


 def parse_memory_string(v: typing.Any) -> typing.Any:
   """Parse memory strings like "60Gi" into float."""
   if isinstance(v, str):
     match = re.match(r'(\d+(\.\d+)?)\s*Gi$', v, re.IGNORECASE)
     if match:
       return float(match.group(1))
   # Pass other types (including numbers or other strings for standard float conversion) through
   return v


 class Resource(pydantic.BaseModel):
   memory: typing.Annotated[float, BeforeValidator(parse_memory_string)] = 0.0
   cpu: int = 0
   gpu: int = 0
   gpu_type: str = ''

   @override
   def __hash__(self) -> int:
     return hash((self.cpu, self.memory, self.gpu, self.gpu_type))

   def __bool__(self) -> bool:
     return any(value is not None for value in self.__dict__.values())


 ACCELERATOR_SPECS: dict[str, Accelerator] = {
   'nvidia-gtx-1650': Accelerator(model='GTX 1650', memory_size=4.0),
   'nvidia-gtx-1060': Accelerator(model='GTX 1060', memory_size=6.0),
   'nvidia-gtx-1080-ti': Accelerator(model='GTX 1080 Ti', memory_size=11.0),
   'nvidia-rtx-3060': Accelerator(model='RTX 3060', memory_size=12.0),
   'nvidia-rtx-3060-ti': Accelerator(model='RTX 3060 Ti', memory_size=8.0),
   'nvidia-rtx-3070-ti': Accelerator(model='RTX 3070 Ti', memory_size=8.0),
   'nvidia-rtx-3080': Accelerator(model='RTX 3080', memory_size=10.0),
   'nvidia-rtx-3080-ti': Accelerator(model='RTX 3080 Ti', memory_size=12.0),
   'nvidia-rtx-3090': Accelerator(model='RTX 3090', memory_size=24.0),
   'nvidia-rtx-4070-ti': Accelerator(model='RTX 4070 Ti', memory_size=12.0),
   'nvidia-tesla-p4': Accelerator(model='P4', memory_size=8.0),
   'nvidia-tesla-p100': Accelerator(model='P100', memory_size=16.0),
   'nvidia-tesla-k80': Accelerator(model='K80', memory_size=12.0),
   'nvidia-tesla-t4': Accelerator(model='T4', memory_size=16.0),
   'nvidia-tesla-v100': Accelerator(model='V100', memory_size=16.0),
   'nvidia-l4': Accelerator(model='L4', memory_size=24.0),
   'nvidia-tesla-l4': Accelerator(model='L4', memory_size=24.0),
   'nvidia-tesla-a10g': Accelerator(model='A10G', memory_size=24.0),
   'nvidia-a100-80g': Accelerator(model='A100', memory_size=80.0),
   'nvidia-a100-80gb': Accelerator(model='A100', memory_size=80.0),
   'nvidia-tesla-a100': Accelerator(model='A100', memory_size=40.0),
   'nvidia-tesla-h100': Accelerator(model='H100', memory_size=80.0),
   'nvidia-h200-141gb': Accelerator(model='H200', memory_size=141.0),
   'nvidia-blackwell-b100': Accelerator(model='B100', memory_size=192.0),
   'nvidia-blackwell-gb200': Accelerator(model='GB200', memory_size=192.0),
 }


 @functools.lru_cache
 def get_local_machine_spec() -> DeploymentTarget:
   if psutil.MACOS:
     return DeploymentTarget(accelerators=[], source='local', platform='macos')

   if psutil.WINDOWS:
     platform = 'windows'
   elif psutil.LINUX:
     platform = 'linux'
   else:
     raise NotImplementedError('Unsupported platform')

   from pynvml import (
     nvmlDeviceGetCount,
     nvmlDeviceGetCudaComputeCapability,
     nvmlDeviceGetHandleByIndex,
     nvmlDeviceGetMemoryInfo,
     nvmlDeviceGetName,
     nvmlInit,
     nvmlShutdown,
   )

   try:
     nvmlInit()
     device_count = nvmlDeviceGetCount()
     accelerators: list[Accelerator] = []
     for i in range(device_count):
       handle = nvmlDeviceGetHandleByIndex(i)
       name = nvmlDeviceGetName(handle)
       memory_info = nvmlDeviceGetMemoryInfo(handle)
-      accelerators.append(Accelerator(model=name, memory_size=math.ceil(int(memory_info.total) / 1024**3)))
+      accelerators.append(
+        Accelerator(model=name, memory_size=math.ceil(int(memory_info.total) / 1024**3))
+      )
       compute_capability = nvmlDeviceGetCudaComputeCapability(handle)
       if compute_capability < (7, 5):
         output(
           f'GPU {name} with compute capability {compute_capability} '
           'may not be supported, 7.5 or higher is recommended. check '
           'https://developer.nvidia.com/cuda-gpus for more information',
           style='yellow',
         )
     nvmlShutdown()
     return DeploymentTarget(accelerators=accelerators, source='local', platform=platform)
   except Exception as e:
     output(
       'Failed to get local GPU info. Ensure nvidia driver is installed to enable local GPU deployment',
       style='yellow',
     )
     output(f'Error: {e}', style='red', level=20)
     return DeploymentTarget(accelerators=[], source='local', platform=platform)


 @functools.lru_cache(typed=True)
 def can_run(bento: BentoInfo, target: DeploymentTarget | None = None) -> float:
   """
   Calculate if the bento can be deployed on the target.
   """
   if target is None:
     target = get_local_machine_spec()

   resource_spec = Resource(**(bento.bento_yaml['services'][0]['config'].get('resources', {})))
   labels = bento.bento_yaml.get('labels', {})
   platforms = labels.get('platforms', 'linux').split(',')

   if target.platform not in platforms:
     return 0.0

   # return 1.0 if no resource is specified
   if not resource_spec:
     return 0.5

   if resource_spec.gpu > 0:
     required_gpu = ACCELERATOR_SPECS[resource_spec.gpu_type]
-    filtered_accelerators = [ac for ac in target.accelerators if ac.memory_size >= required_gpu.memory_size]
+    filtered_accelerators = [
+      ac for ac in target.accelerators if ac.memory_size >= required_gpu.memory_size
+    ]
     if resource_spec.gpu > len(filtered_accelerators):
       return 0.0
-    return required_gpu.memory_size * resource_spec.gpu / sum(ac.memory_size for ac in target.accelerators)
+    return (
+      required_gpu.memory_size
+      * resource_spec.gpu
+      / sum(ac.memory_size for ac in target.accelerators)
+    )
   if target.accelerators:
     return 0.01 / sum(ac.memory_size for ac in target.accelerators)
   return 1.0
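
parse_memory_string feeds pydantic's BeforeValidator, so "60Gi" strings coerce to floats before validation. A self-contained replica showing that behavior (toy values):

import re
import typing

import pydantic
from pydantic import BeforeValidator


def parse_memory_string(v: typing.Any) -> typing.Any:
  # Same rule as above: "60Gi" -> 60.0; everything else passes through.
  if isinstance(v, str):
    match = re.match(r'(\d+(\.\d+)?)\s*Gi$', v, re.IGNORECASE)
    if match:
      return float(match.group(1))
  return v


class Resource(pydantic.BaseModel):
  memory: typing.Annotated[float, BeforeValidator(parse_memory_string)] = 0.0


print(Resource(memory='60Gi').memory)  # 60.0
print(Resource(memory=8).memory)       # 8.0 -- non-"Gi" values fall through to float coercion

The can_run score is then plain arithmetic: a bento wanting one 24 GB GPU on a single 24 GB card scores 24 * 1 / 24 = 1.0, while the same bento on an 80 GB card scores 24 / 80 = 0.3, which is how the CLI ranks instance types.
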
@@ -7,99 +7,99 @@ DO_NOT_TRACK = 'BENTOML_DO_NOT_TRACK'


 class EventMeta(abc.ABC):
   @property
   def event_name(self) -> str:
     # camel case to snake case
     event_name = re.sub(r'(?<!^)(?=[A-Z])', '_', self.__class__.__name__).lower()
     # remove "_event" suffix
     suffix_to_remove = '_event'
     if event_name.endswith(suffix_to_remove):
       event_name = event_name[: -len(suffix_to_remove)]
     return event_name


 @attr.define
 class CliEvent(EventMeta):
   cmd_group: str
   cmd_name: str
   duration_in_ms: float = attr.field(default=0)
   error_type: typing.Optional[str] = attr.field(default=None)
   return_code: typing.Optional[int] = attr.field(default=None)


 @attr.define
 class OpenllmCliEvent(CliEvent):
   pass


 class OrderedCommands(typer.core.TyperGroup):
   def list_commands(self, ctx: click.Context) -> list[str]:
     return list(self.commands)


 class OpenLLMTyper(typer.Typer):
   def __init__(self, *args: typing.Any, **kwargs: typing.Any):
     no_args_is_help: bool = kwargs.pop('no_args_is_help', True)
     context_settings: dict[str, typing.Any] = kwargs.pop('context_settings', {})
     if 'help_option_names' not in context_settings:
       context_settings['help_option_names'] = ('-h', '--help')
     if 'max_content_width' not in context_settings:
       context_settings['max_content_width'] = int(os.environ.get('COLUMNS', str(120)))
     klass = kwargs.pop('cls', OrderedCommands)

     super().__init__(
       *args, cls=klass, no_args_is_help=no_args_is_help, context_settings=context_settings, **kwargs
     )

   # NOTE: Since OpenLLMTyper only wraps command to add analytics, the default type-hint for @app.command
   # does not change, hence the below hijacking.
   if typing.TYPE_CHECKING:
     command = typer.Typer.command
   else:

     def command(self, *args: typing.Any, **kwargs: typing.Any):
       def decorator(f):
         @functools.wraps(f)
         @click.pass_context
         def wrapped(ctx: click.Context, *args, **kwargs):
           from bentoml._internal.utils.analytics import track

           do_not_track = os.environ.get(DO_NOT_TRACK, str(False)).lower() == 'true'

           # so we know that the root program is openllm
           command_name = ctx.info_name
           if ctx.parent.parent is not None:
             # openllm model list
             command_group = ctx.parent.info_name
           elif ctx.parent.info_name == ctx.find_root().info_name:
             # openllm run
             command_group = 'openllm'

           if do_not_track:
             return f(*args, **kwargs)
           start_time = time.time_ns()
           try:
             return_value = f(*args, **kwargs)
             duration_in_ns = time.time_ns() - start_time
             track(
               OpenllmCliEvent(
                 cmd_group=command_group, cmd_name=command_name, duration_in_ms=duration_in_ns / 1e6
               )
             )
             return return_value
           except BaseException as e:
             duration_in_ns = time.time_ns() - start_time
             track(
               OpenllmCliEvent(
                 cmd_group=command_group,
                 cmd_name=command_name,
                 duration_in_ms=duration_in_ns / 1e6,
                 error_type=type(e).__name__,
                 return_code=(2 if isinstance(e, KeyboardInterrupt) else 1),
               )
             )
             raise

         return typer.Typer.command(self, *args, **kwargs)(wrapped)

       return decorator
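
The event_name property is a two-step string transform; run standalone it behaves like this:

# Camel case -> snake case, then strip the "_event" suffix, as in EventMeta.event_name.
import re

name = re.sub(r'(?<!^)(?=[A-Z])', '_', 'OpenllmCliEvent').lower()  # 'openllm_cli_event'
if name.endswith('_event'):
  name = name[: -len('_event')]
print(name)  # 'openllm_cli'
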
@@ -12,72 +12,72 @@ HUGGINGFACE_CACHE = pathlib.Path.home() / '.cache' / 'huggingface' / 'hub'


 def _du(path: pathlib.Path) -> int:
   seen_paths = set()
   used_space = 0

   for f in path.rglob('*'):
     if os.name == 'nt':  # Windows system
       # On Windows, directly add file sizes without considering hard links
       used_space += f.stat().st_size
     else:
       # On non-Windows systems, use inodes to avoid double counting
       stat = f.stat()
       if stat.st_ino not in seen_paths:
         seen_paths.add(stat.st_ino)
         used_space += stat.st_size
   return used_space


 @app.command(help='Clean up all the cached models from huggingface')
 def model_cache(verbose: bool = False) -> None:
   if verbose:
     VERBOSE_LEVEL.set(20)
   used_space = _du(HUGGINGFACE_CACHE)
   sure = questionary.confirm(
     f'This will remove all models cached by Huggingface (~{used_space / 1024 / 1024:.2f}MB), are you sure?'
   ).ask()
   if not sure:
     return
   shutil.rmtree(HUGGINGFACE_CACHE, ignore_errors=True)
   output('All models cached by Huggingface have been removed', style='green')


 @app.command(help='Clean up all the virtual environments created by OpenLLM')
 def venvs(verbose: bool = False) -> None:
   if verbose:
     VERBOSE_LEVEL.set(20)

   used_space = _du(VENV_DIR)
   sure = questionary.confirm(
     f'This will remove all virtual environments created by OpenLLM (~{used_space / 1024 / 1024:.2f}MB), are you sure?'
   ).ask()
   if not sure:
     return
   shutil.rmtree(VENV_DIR, ignore_errors=True)
   output('All virtual environments have been removed', style='green')


 @app.command(help='Clean up all the repositories cloned by OpenLLM')
 def repos(verbose: bool = False) -> None:
   if verbose:
     VERBOSE_LEVEL.set(20)
   shutil.rmtree(REPO_DIR, ignore_errors=True)
   output('All repositories have been removed', style='green')


 @app.command(help='Reset configurations to default')
 def configs(verbose: bool = False) -> None:
   if verbose:
     VERBOSE_LEVEL.set(20)
   shutil.rmtree(CONFIG_FILE, ignore_errors=True)
   output('All configurations have been reset', style='green')


 @app.command(name='all', help='Clean up all above and bring OpenLLM to a fresh start')
 def all_cache(verbose: bool = False) -> None:
   if verbose:
     VERBOSE_LEVEL.set(20)
   repos()
   venvs()
   model_cache()
   configs()
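
The inode bookkeeping in _du is what keeps hard-linked caches (common under huggingface_hub) from being counted twice. The POSIX branch isolated, with the same logic under standalone names:

import pathlib


def du(path: pathlib.Path) -> int:
  seen_inodes = set()
  used = 0
  for f in path.rglob('*'):
    stat = f.stat()
    if stat.st_ino not in seen_inodes:  # count each inode once, so hard links are not double counted
      seen_inodes.add(stat.st_ino)
      used += stat.st_size
  return used


print(du(pathlib.Path.home() / '.cache'))  # size in bytes
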
@@ -11,158 +11,171 @@ app = OpenLLMTyper()


 def resolve_cloud_config() -> pathlib.Path:
   env = os.environ.get('BENTOML_HOME')
   if env is not None:
     return pathlib.Path(env) / '.yatai.yaml'
   return pathlib.Path.home() / 'bentoml' / '.yatai.yaml'


 def _get_deploy_cmd(
-  bento: BentoInfo, target: typing.Optional[DeploymentTarget] = None, cli_envs: typing.Optional[list[str]] = None
+  bento: BentoInfo,
+  target: typing.Optional[DeploymentTarget] = None,
+  cli_envs: typing.Optional[list[str]] = None,
 ) -> tuple[list[str], EnvVars]:
   cmd = ['bentoml', 'deploy', bento.bentoml_tag]
   env = EnvVars({'BENTOML_HOME': f'{bento.repo.path}/bentoml'})

   # Process CLI env vars first to determine overrides
   explicit_envs: dict[str, str] = {}
   if cli_envs:
     for env_var in cli_envs:
       if '=' in env_var:
         name, value = env_var.split('=', 1)
         explicit_envs[name] = value
       else:
         name = env_var
         value = typing.cast(str, os.environ.get(name))
         if value is None:
           output(
             f"Environment variable '{name}' specified via --env but not found in the current environment.",
             style='red',
           )
           raise typer.Exit(1)
         explicit_envs[name] = value

   # Process envs defined in bento.yaml, skipping those overridden by CLI
   required_envs = bento.bento_yaml.get('envs', [])
-  required_env_names = [env['name'] for env in required_envs if 'name' in env and env['name'] not in explicit_envs]
+  required_env_names = [
+    env['name']
+    for env in required_envs
+    if 'name' in env and env['name'] not in explicit_envs and not env.get('value')
+  ]
   if required_env_names:
     output(
       f'This model requires the following environment variables to run (unless overridden via --env): {required_env_names!r}',
-      style='yellow',
+      style='green',
     )

   for env_info in required_envs:
     name = typing.cast(str, env_info.get('name'))
-    if not name or name in explicit_envs:
+    if not name or name in explicit_envs or env_info.get('value', None) is not None:
       continue

     if os.environ.get(name):
       default = os.environ[name]
     elif 'value' in env_info:
       default = env_info['value']
     else:
       default = ''

     if INTERACTIVE.get():
       import questionary

       value = questionary.text(f'{name}: (from bento.yaml)', default=default).ask()
     else:
       if default == '':
-        output(f'Environment variable {name} (from bento.yaml) is required but not provided', style='red')
+        output(
+          f'Environment variable {name} (from bento.yaml) is required but not provided', style='red'
+        )
         raise typer.Exit(1)
       else:
         value = default

     if value is None:
       raise typer.Exit(1)
     cmd += ['--env', f'{name}={value}']

   # Add explicitly provided env vars from CLI
   for name, value in explicit_envs.items():
     cmd += ['--env', f'{name}={value}']

   if target:
     cmd += ['--instance-type', target.name]

   base_config = resolve_cloud_config()
   if not base_config.exists():
     raise Exception('Cannot find cloud config.')
   # remove before copy
   if (bento.repo.path / 'bentoml' / '.yatai.yaml').exists():
     (bento.repo.path / 'bentoml' / '.yatai.yaml').unlink()
   shutil.copy(base_config, bento.repo.path / 'bentoml' / '.yatai.yaml')

   return cmd, env


 def ensure_cloud_context() -> None:
   import questionary

   cmd = ['bentoml', 'cloud', 'current-context']
   try:
     result = subprocess.check_output(cmd, stderr=subprocess.DEVNULL)
     context = json.loads(result)
     output(f' bentoml already logged in: {context["endpoint"]}', style='green', level=20)
   except subprocess.CalledProcessError:
     output(' bentoml not logged in', style='red')
     if not INTERACTIVE.get():
       output('\n get bentoml logged in by:')
       output(' $ bentoml cloud login', style='orange')
       output('')
       output(
         """ * you may need to visit https://cloud.bentoml.com to get an account. you can also bring your own bentoml cluster (BYOC) to your team from https://bentoml.com/contact""",
         style='yellow',
       )
       raise typer.Exit(1)
     else:
-      action = questionary.select(
-        'Choose an action:', choices=['I have a BentoCloud account', 'get an account in two minutes']
-      ).ask()
+      action = questionary.select(
+        'Choose an action:',
+        choices=['I have a BentoCloud account', 'get an account in two minutes'],
+      ).ask()
       if action is None:
         raise typer.Exit(1)
       elif action == 'get an account in two minutes':
         output('Please visit https://cloud.bentoml.com to get your token', style='yellow')
-      endpoint = questionary.text('Enter the endpoint: (similar to https://my-org.cloud.bentoml.com)').ask()
+      endpoint = questionary.text(
+        'Enter the endpoint: (similar to https://my-org.cloud.bentoml.com)'
+      ).ask()
       if endpoint is None:
         raise typer.Exit(1)
       token = questionary.text('Enter your token: (similar to cniluaxxxxxxxx)').ask()
       if token is None:
         raise typer.Exit(1)
       cmd = ['bentoml', 'cloud', 'login', '--api-token', token, '--endpoint', endpoint]
       try:
         result = subprocess.check_output(cmd)
         output(' Logged in successfully', style='green')
       except subprocess.CalledProcessError:
         output(' Failed to login', style='red')
         raise typer.Exit(1)


 def get_cloud_machine_spec() -> list[DeploymentTarget]:
   ensure_cloud_context()
   cmd = ['bentoml', 'deployment', 'list-instance-types', '-o', 'json']
   try:
     result = subprocess.check_output(cmd, stderr=subprocess.DEVNULL)
     instance_types = json.loads(result)
     return [
       DeploymentTarget(
         source='cloud',
         name=it['name'],
         price=it['price'],
         platform='linux',
         accelerators=(
           [ACCELERATOR_SPECS[it['gpu_type']] for _ in range(int(it['gpu']))]
           if it.get('gpu') and it['gpu_type'] in ACCELERATOR_SPECS
           else []
         ),
       )
       for it in instance_types
     ]
   except (subprocess.CalledProcessError, json.JSONDecodeError):
     output('Failed to get cloud instance types', style='red')
     return []


-def deploy(bento: BentoInfo, target: DeploymentTarget, cli_envs: typing.Optional[list[str]] = None) -> None:
+def deploy(
+  bento: BentoInfo, target: DeploymentTarget, cli_envs: typing.Optional[list[str]] = None
+) -> None:
   ensure_cloud_context()
   cmd, env = _get_deploy_cmd(bento, target, cli_envs=cli_envs)
   run_command(cmd, env=env, cwd=None)
@@ -31,401 +31,413 @@ T = typing.TypeVar('T')
|
||||
|
||||
|
||||
class ContextVar(typing.Generic[T]):
|
||||
def __init__(self, default: T):
|
||||
self._stack: list[T] = []
|
||||
self._default = default
|
||||
def __init__(self, default: T):
|
||||
self._stack: list[T] = []
|
||||
self._default = default
|
||||
|
||||
def get(self) -> T:
|
||||
if self._stack:
|
||||
return self._stack[-1]
|
||||
return self._default
|
||||
def get(self) -> T:
|
||||
if self._stack:
|
||||
return self._stack[-1]
|
||||
return self._default
|
||||
|
||||
def set(self, value: T) -> None:
|
||||
self._stack.append(value)
|
||||
def set(self, value: T) -> None:
|
||||
self._stack.append(value)
|
||||
|
||||
@contextmanager
|
||||
def patch(self, value: T) -> typing.Iterator[None]:
|
||||
self._stack.append(value)
|
||||
try:
|
||||
yield
|
||||
finally:
|
||||
self._stack.pop()
|
||||
@contextmanager
|
||||
def patch(self, value: T) -> typing.Iterator[None]:
|
||||
self._stack.append(value)
|
||||
try:
|
||||
yield
|
||||
finally:
|
||||
self._stack.pop()
|
||||
|
||||
|
||||
VERBOSE_LEVEL = ContextVar(10)
|
||||
VERBOSE_LEVEL = ContextVar(0)
|
||||
INTERACTIVE = ContextVar(False)
|
||||
|
||||
|
||||
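
The ContextVar above is a small stack-backed override holder (a local helper, not the stdlib contextvars module): set() pushes a value, patch() scopes one to a with-block, and get() falls back to the default. A minimal usage sketch, assuming only the names defined in this file:

  # hypothetical sketch; VERBOSE_LEVEL and output live in openllm.common
  from openllm.common import VERBOSE_LEVEL, output

  output('always shown', level=0)
  output('debug detail', level=20)      # dropped while verbosity is below 20
  with VERBOSE_LEVEL.patch(20):
    output('debug detail', level=20)    # shown: patch() raised the level for this block
  # the pop() inside patch() restores the previous verbosity here
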
def output(content: typing.Any, level: int = 0, style: str | None = None, end: str | None = None) -> None:
    if level > VERBOSE_LEVEL.get():
        return
def output(
  content: typing.Any, level: int = 0, style: str | None = None, end: str | None = None
) -> None:
  if level > VERBOSE_LEVEL.get():
    return

    if not isinstance(content, str):
        out = io.StringIO()
        pyaml.pprint(content, dst=out, sort_dicts=False, sort_keys=False)
        questionary.print(out.getvalue(), style=style, end='' if end is None else end)
        out.close()
    else:
        questionary.print(content, style=style, end='\n' if end is None else end)
  if not isinstance(content, str):
    out = io.StringIO()
    pyaml.pprint(content, dst=out, sort_dicts=False, sort_keys=False)
    questionary.print(out.getvalue(), style=style, end='' if end is None else end)
    out.close()
  else:
    questionary.print(content, style=style, end='\n' if end is None else end)


class Config(pydantic.BaseModel):
    repos: dict[str, str] = pydantic.Field(
        default_factory=lambda: {'default': 'https://github.com/bentoml/openllm-models@main'}
    )
    default_repo: str = 'default'
  repos: dict[str, str] = pydantic.Field(
    default_factory=lambda: {'default': 'https://github.com/bentoml/openllm-models@main'}
  )
  default_repo: str = 'default'

    def tolist(self) -> dict[str, typing.Any]:
        return dict(repos=self.repos, default_repo=self.default_repo)
  def tolist(self) -> dict[str, typing.Any]:
    return dict(repos=self.repos, default_repo=self.default_repo)


def load_config() -> Config:
    if CONFIG_FILE.exists():
        try:
            with open(CONFIG_FILE) as f:
                return Config(**json.load(f))
        except json.JSONDecodeError:
            return Config()
    return Config()
  if CONFIG_FILE.exists():
    try:
      with open(CONFIG_FILE) as f:
        return Config(**json.load(f))
    except json.JSONDecodeError:
      return Config()
  return Config()


def save_config(config: Config) -> None:
    with open(CONFIG_FILE, 'w') as f:
        json.dump(config.tolist(), f, indent=2)
  with open(CONFIG_FILE, 'w') as f:
    json.dump(config.tolist(), f, indent=2)


class BentoMetadata(typing.TypedDict):
    name: str
    version: str
    labels: dict[str, str]
    envs: list[dict[str, str]]
    services: list[dict[str, typing.Any]]
    schema: dict[str, typing.Any]
  name: str
  version: str
  labels: dict[str, str]
  envs: list[dict[str, str]]
  services: list[dict[str, typing.Any]]
  schema: dict[str, typing.Any]

class EnvVars(UserDict[str, str]):
    """
    A dictionary-like object that is sorted by key and only keeps the environment variables that have a value.
    """
  """
  A dictionary-like object that is sorted by key and only keeps the environment variables that have a value.
  """

    @classmethod
    def __get_pydantic_core_schema__(
        cls: type[EnvVars], source_type: type[typing.Any], handler: typing.Callable[..., typing.Any]
    ) -> core_schema.DictSchema:
        return core_schema.dict_schema(core_schema.str_schema(), core_schema.str_schema())
  @classmethod
  def __get_pydantic_core_schema__(
    cls: type[EnvVars], source_type: type[typing.Any], handler: typing.Callable[..., typing.Any]
  ) -> core_schema.DictSchema:
    return core_schema.dict_schema(core_schema.str_schema(), core_schema.str_schema())

    def __init__(self, data: typing.Mapping[str, str] | None = None):
        super().__init__(data or {})
        self.data = {k: v for k, v in sorted(self.data.items()) if v}
  def __init__(self, data: typing.Mapping[str, str] | None = None):
    super().__init__(data or {})
    self.data = {k: v for k, v in sorted(self.data.items()) if v}

    def __hash__(self) -> int:
        return hash(tuple(sorted(self.data.items())))
  def __hash__(self) -> int:
    return hash(tuple(sorted(self.data.items())))
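
As the docstring says, EnvVars normalizes at construction time: keys are sorted and entries with empty values are dropped, which also keeps __hash__ stable. An illustrative sketch (the values are hypothetical, not part of the diff):

  env = EnvVars({'B': '2', 'A': '1', 'EMPTY': ''})
  assert dict(env) == {'A': '1', 'B': '2'}                  # 'EMPTY' was dropped
  assert hash(env) == hash(EnvVars({'A': '1', 'B': '2'}))   # insertion order does not affect the hash
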
class RepoInfo(pydantic.BaseModel):
    name: str
    path: pathlib.Path
    url: str
    server: str
    owner: str
    repo: str
    branch: str
  name: str
  path: pathlib.Path
  url: str
  server: str
  owner: str
  repo: str
  branch: str

    def tolist(self) -> str | dict[str, typing.Any] | None:
        if VERBOSE_LEVEL.get() <= 0:
            return f'{self.name} ({self.url}@{self.branch})'
        if VERBOSE_LEVEL.get() <= 10:
            return dict(name=self.name, url=f'{self.url}@{self.branch}', path=str(self.path))
        if VERBOSE_LEVEL.get() <= 20:
            return dict(
                name=self.name,
                url=f'{self.url}@{self.branch}',
                path=str(self.path),
                server=self.server,
                owner=self.owner,
                repo=self.repo,
            )
        return None
  def tolist(self) -> str | dict[str, typing.Any] | None:
    if VERBOSE_LEVEL.get() <= 0:
      return f'{self.name} ({self.url}@{self.branch})'
    if VERBOSE_LEVEL.get() <= 10:
      return dict(name=self.name, url=f'{self.url}@{self.branch}', path=str(self.path))
    if VERBOSE_LEVEL.get() <= 20:
      return dict(
        name=self.name,
        url=f'{self.url}@{self.branch}',
        path=str(self.path),
        server=self.server,
        owner=self.owner,
        repo=self.repo,
      )
    return None


class BentoInfo(pydantic.BaseModel):
    repo: RepoInfo
    path: pathlib.Path
    alias: str = ''
  repo: RepoInfo
  path: pathlib.Path
  alias: str = ''

    def __str__(self) -> str:
        if self.repo.name == 'default':
            return f'{self.tag}'
        else:
            return f'{self.repo.name}/{self.tag}'
  def __str__(self) -> str:
    if self.repo.name == 'default':
      return f'{self.tag}'
    else:
      return f'{self.repo.name}/{self.tag}'

    @override
    def __hash__(self) -> int:
        return md5(str(self.path))
  @override
  def __hash__(self) -> int:
    return md5(str(self.path))

    @property
    def tag(self) -> str:
        if self.alias:
            return f'{self.path.parent.name}:{self.alias}'
        return f'{self.path.parent.name}:{self.path.name}'
  @property
  def tag(self) -> str:
    if self.alias:
      return f'{self.path.parent.name}:{self.alias}'
    return f'{self.path.parent.name}:{self.path.name}'

    @property
    def bentoml_tag(self) -> str:
        return f'{self.path.parent.name}:{self.path.name}'
  @property
  def bentoml_tag(self) -> str:
    return f'{self.path.parent.name}:{self.path.name}'

    @property
    def name(self) -> str:
        return self.path.parent.name
  @property
  def name(self) -> str:
    return self.path.parent.name

    @property
    def version(self) -> str:
        return self.path.name
  @property
  def version(self) -> str:
    return self.path.name

    @property
    def labels(self) -> dict[str, str]:
        return self.bento_yaml['labels']
  @property
  def labels(self) -> dict[str, str]:
    return self.bento_yaml['labels']

    @property
    def envs(self) -> list[dict[str, str]]:
        return self.bento_yaml['envs']
  @property
  def envs(self) -> list[dict[str, str]]:
    return self.bento_yaml['envs']

    @functools.cached_property
    def bento_yaml(self) -> BentoMetadata:
        bento: BentoMetadata = yaml.safe_load((self.path / 'bento.yaml').read_text())
        return bento
  @functools.cached_property
  def bento_yaml(self) -> BentoMetadata:
    bento: BentoMetadata = yaml.safe_load((self.path / 'bento.yaml').read_text())
    return bento

    @functools.cached_property
    def platforms(self) -> list[str]:
        return self.bento_yaml['labels'].get('platforms', 'linux').split(',')
  @functools.cached_property
  def platforms(self) -> list[str]:
    return self.bento_yaml['labels'].get('platforms', 'linux').split(',')

    @functools.cached_property
    def pretty_yaml(self) -> BentoMetadata | dict[str, typing.Any]:
        def _pretty_routes(routes: list[dict[str, typing.Any]]) -> dict[str, typing.Any]:
            return {
                route['route']: {
                    'input': {k: v['type'] for k, v in route['input']['properties'].items()},
                    'output': route['output']['type'],
                }
                for route in routes
            }
  @functools.cached_property
  def pretty_yaml(self) -> BentoMetadata | dict[str, typing.Any]:
    def _pretty_routes(routes: list[dict[str, typing.Any]]) -> dict[str, typing.Any]:
      return {
        route['route']: {
          'input': {k: v['type'] for k, v in route['input']['properties'].items()},
          'output': route['output']['type'],
        }
        for route in routes
      }

        if len(self.bento_yaml['services']) == 1:
            pretty_yaml: dict[str, typing.Any] = {
                'apis': _pretty_routes(self.bento_yaml['schema']['routes']),
                'resources': self.bento_yaml['services'][0]['config']['resources'],
                'envs': self.bento_yaml['envs'],
                'platforms': self.platforms,
            }
            return pretty_yaml
        return self.bento_yaml
    if len(self.bento_yaml['services']) == 1:
      pretty_yaml: dict[str, typing.Any] = {
        'apis': _pretty_routes(self.bento_yaml['schema']['routes']),
        'resources': self.bento_yaml['services'][0]['config']['resources'],
        'envs': self.bento_yaml['envs'],
        'platforms': self.platforms,
      }
      return pretty_yaml
    return self.bento_yaml

    @functools.cached_property
    def pretty_gpu(self) -> str:
        from openllm.accelerator_spec import ACCELERATOR_SPECS
  @functools.cached_property
  def pretty_gpu(self) -> str:
    from openllm.accelerator_spec import ACCELERATOR_SPECS

        try:
            resources = self.bento_yaml['services'][0]['config']['resources']
            if resources['gpu'] > 1:
                acc = ACCELERATOR_SPECS[resources['gpu_type']]
                return f'{acc.memory_size:.0f}Gx{resources["gpu"]}'
            elif resources['gpu'] > 0:
                acc = ACCELERATOR_SPECS[resources['gpu_type']]
                return f'{acc.memory_size:.0f}G'
        except KeyError:
            pass
        return ''
    try:
      resources = self.bento_yaml['services'][0]['config']['resources']
      if resources['gpu'] > 1:
        acc = ACCELERATOR_SPECS[resources['gpu_type']]
        return f'{acc.memory_size:.0f}Gx{resources["gpu"]}'
      elif resources['gpu'] > 0:
        acc = ACCELERATOR_SPECS[resources['gpu_type']]
        return f'{acc.memory_size:.0f}G'
    except KeyError:
      pass
    return ''
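
pretty_gpu condenses the resource block of bento.yaml into the short GPU-RAM label used by the model table. A reading of the code with hypothetical numbers:

  # resources = {'gpu': 2, 'gpu_type': 'nvidia-rtx-4090'} and a spec with memory_size 24.0
  # -> pretty_gpu returns '24Gx2'; with gpu == 1 it returns '24G';
  # a CPU-only bento has no 'gpu' key, so the KeyError path returns ''.
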
    def tolist(self) -> str | dict[str, typing.Any] | None:
        verbose = VERBOSE_LEVEL.get()
        if verbose <= 0:
            return str(self)
        if verbose <= 10:
            return dict(tag=self.tag, repo=self.repo.tolist(), path=str(self.path), model_card=self.pretty_yaml)
        if verbose <= 20:
            return dict(tag=self.tag, repo=self.repo.tolist(), path=str(self.path), bento_yaml=self.bento_yaml)
        return None
  def tolist(self) -> str | dict[str, typing.Any] | None:
    verbose = VERBOSE_LEVEL.get()
    if verbose <= 0:
      return str(self)
    if verbose <= 10:
      return dict(
        tag=self.tag, repo=self.repo.tolist(), path=str(self.path), model_card=self.pretty_yaml
      )
    if verbose <= 20:
      return dict(
        tag=self.tag, repo=self.repo.tolist(), path=str(self.path), bento_yaml=self.bento_yaml
      )
    return None


class VenvSpec(pydantic.BaseModel):
    python_version: str
    requirements_txt: str
    envs: EnvVars
    name_prefix: str = ''
  python_version: str
  requirements_txt: str
  envs: EnvVars
  name_prefix: str = ''

    @functools.cached_property
    def normalized_requirements_txt(self) -> str:
        parameter_lines: list[str] = []
        dependency_lines: list[str] = []
        comment_lines: list[str] = []
  @functools.cached_property
  def normalized_requirements_txt(self) -> str:
    parameter_lines: list[str] = []
    dependency_lines: list[str] = []
    comment_lines: list[str] = []

        for line in self.requirements_txt.splitlines():
            if not line.strip():
                continue
            elif line.strip().startswith('#'):
                comment_lines.append(line.strip())
            elif line.strip().startswith('-'):
                parameter_lines.append(line.strip())
            else:
                dependency_lines.append(line.strip())
    for line in self.requirements_txt.splitlines():
      if not line.strip():
        continue
      elif line.strip().startswith('#'):
        comment_lines.append(line.strip())
      elif line.strip().startswith('-'):
        parameter_lines.append(line.strip())
      else:
        dependency_lines.append(line.strip())

        parameter_lines.sort()
        dependency_lines.sort()
        return '\n'.join(parameter_lines + dependency_lines).strip()
    parameter_lines.sort()
    dependency_lines.sort()
    return '\n'.join(parameter_lines + dependency_lines).strip()

    @functools.cached_property
    def normalized_envs(self) -> str:
        return '\n'.join(f'{k}={v}' for k, v in sorted(self.envs.items(), key=lambda x: x[0]) if not v)
  @functools.cached_property
  def normalized_envs(self) -> str:
    return '\n'.join(f'{k}={v}' for k, v in sorted(self.envs.items(), key=lambda x: x[0]) if not v)

    @override
    def __hash__(self) -> int:
        return md5(self.normalized_requirements_txt, str(hash(self.normalized_envs)))
  @override
  def __hash__(self) -> int:
    return md5(self.normalized_requirements_txt, str(hash(self.normalized_envs)))


class Accelerator(pydantic.BaseModel):
    model: str
    memory_size: float
  model: str
  memory_size: float

    def __gt__(self, other: Accelerator) -> bool:
        return self.memory_size > other.memory_size
  def __gt__(self, other: Accelerator) -> bool:
    return self.memory_size > other.memory_size

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, Accelerator):
            return NotImplemented
        return self.memory_size == other.memory_size
  def __eq__(self, other: object) -> bool:
    if not isinstance(other, Accelerator):
      return NotImplemented
    return self.memory_size == other.memory_size

    def __repr__(self) -> str:
        return f'{self.model}({self.memory_size}GB)'
  def __repr__(self) -> str:
    return f'{self.model}({self.memory_size}GB)'


class DeploymentTarget(pydantic.BaseModel):
    accelerators: list[Accelerator]
    source: str = 'local'
    name: str = 'local'
    price: str = ''
    platform: str = 'linux'
  accelerators: list[Accelerator]
  source: str = 'local'
  name: str = 'local'
  price: str = ''
  platform: str = 'linux'

    @override
    def __hash__(self) -> int:
        return hash(self.source)
  @override
  def __hash__(self) -> int:
    return hash(self.source)

    @property
    def accelerators_repr(self) -> str:
        accs = {a.model for a in self.accelerators}
        if len(accs) == 0:
            return 'null'
        if len(accs) == 1:
            a = self.accelerators[0]
            return f'{a.model} x{len(self.accelerators)}'
        return ', '.join((f'{a.model}' for a in self.accelerators))
  @property
  def accelerators_repr(self) -> str:
    accs = {a.model for a in self.accelerators}
    if len(accs) == 0:
      return 'null'
    if len(accs) == 1:
      a = self.accelerators[0]
      return f'{a.model} x{len(self.accelerators)}'
    return ', '.join((f'{a.model}' for a in self.accelerators))
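
accelerators_repr collapses the accelerator list into the short label shown when picking a deployment target. A sketch with hypothetical hardware names:

  target = DeploymentTarget(accelerators=[Accelerator(model='A100', memory_size=80.0)] * 4)
  assert target.accelerators_repr == 'A100 x4'   # one distinct model -> 'model xN'
  assert DeploymentTarget(accelerators=[]).accelerators_repr == 'null'
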
def run_command(
    cmd: list[str],
    cwd: str | None = None,
    env: EnvVars | None = None,
    copy_env: bool = True,
    venv: pathlib.Path | None = None,
    silent: bool = False,
  cmd: list[str],
  cwd: str | None = None,
  env: EnvVars | None = None,
  copy_env: bool = True,
  venv: pathlib.Path | None = None,
  silent: bool = False,
) -> subprocess.CompletedProcess[typing.Any]:
    env = env or EnvVars({})
    cmd = [str(c) for c in cmd]
    bin_dir = 'Scripts' if os.name == 'nt' else 'bin'
    if not silent:
        output('\n')
        if cwd:
            output(f'$ cd {cwd}', style='orange')
        if env:
            for k, v in env.items():
                output(f'$ export {k}={shlex.quote(v)}', style='orange')
        if venv:
            output(f'$ source {venv / "bin" / "activate"}', style='orange')
        output(f'$ {" ".join(cmd)}', style='orange')

  env = env or EnvVars({})
  cmd = [str(c) for c in cmd]
  bin_dir = 'Scripts' if os.name == 'nt' else 'bin'
  if not silent:
    output('\n')
    if cwd:
      output(f'$ cd {cwd}', style='orange')
    if env:
      for k, v in env.items():
        output(f'$ export {k}={shlex.quote(v)}', style='orange')
    if venv:
      py = venv / bin_dir / f'python{sysconfig.get_config_var("EXE")}'
      output(f'$ source {venv / "bin" / "activate"}', style='orange')
    output(f'$ {" ".join(cmd)}', style='orange')

    if venv:
        py = venv / bin_dir / f'python{sysconfig.get_config_var("EXE")}'
    else:
        py = pathlib.Path(sys.executable)

    if copy_env:
        env = EnvVars({**os.environ, **env})

    if cmd and cmd[0] == 'bentoml':
        cmd = [py.__fspath__(), '-m', 'bentoml', *cmd[1:]]
    if cmd and cmd[0] == 'python':
        cmd = [py.__fspath__(), *cmd[1:]]

    try:
        if silent:
            return subprocess.run(
                cmd, cwd=cwd, env=env, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True
            )
        else:
  else:
    py = pathlib.Path(sys.executable)

  if copy_env:
    env = EnvVars({**os.environ, **env})

  if cmd and cmd[0] == 'bentoml':
    cmd = [py.__fspath__(), '-m', 'bentoml'] + cmd[1:]
  if cmd and cmd[0] == 'python':
    cmd = [py.__fspath__()] + cmd[1:]

  try:
    if silent:
      return subprocess.run(
        cmd, cwd=cwd, env=env, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True
      )
    else:
      return subprocess.run(cmd, cwd=cwd, env=env, check=True)
  except Exception as e:
    if VERBOSE_LEVEL.get() >= 20:
      output(str(e), style='red')
    raise typer.Exit(1)
            return subprocess.run(cmd, cwd=cwd, env=env, check=True)
    except Exception as e:
        if VERBOSE_LEVEL.get() >= 20:
            output(str(e), style='red')
        raise typer.Exit(1)


async def stream_command_output(stream: asyncio.streams.StreamReader | None, style: str = 'gray') -> None:
    if stream:
        async for line in stream:
            output(line.decode(), style=style, end='')
async def stream_command_output(
  stream: asyncio.streams.StreamReader | None, style: str = 'gray'
) -> None:
  if stream:
    async for line in stream:
      output(line.decode(), style=style, end='')


@asynccontextmanager
async def async_run_command(
    cmd: list[str],
    cwd: str | None = None,
    env: EnvVars | None = None,
    copy_env: bool = True,
    venv: pathlib.Path | None = None,
    silent: bool = True,
  cmd: list[str],
  cwd: str | None = None,
  env: EnvVars | None = None,
  copy_env: bool = True,
  venv: pathlib.Path | None = None,
  silent: bool = True,
) -> typing.AsyncGenerator[asyncio.subprocess.Process]:
    env = env or EnvVars({})
    cmd = [str(c) for c in cmd]

    if not silent:
        output('\n')
        if cwd:
            output(f'$ cd {cwd}', style='orange')
        if env:
            for k, v in env.items():
                output(f'$ export {k}={shlex.quote(v)}', style='orange')
        if venv:
            output(f'$ source {venv / "bin" / "activate"}', style='orange')
        output(f'$ {" ".join(cmd)}', style='orange')
  env = env or EnvVars({})
  cmd = [str(c) for c in cmd]

  if not silent:
    output('\n')
    if cwd:
      output(f'$ cd {cwd}', style='orange')
    if env:
      for k, v in env.items():
        output(f'$ export {k}={shlex.quote(v)}', style='orange')
    if venv:
      py = venv / 'bin' / 'python'
    else:
      py = pathlib.Path(sys.executable)
    output(f'$ source {venv / "bin" / "activate"}', style='orange')
    output(f'$ {" ".join(cmd)}', style='orange')

    if copy_env:
        env = EnvVars({**os.environ, **env})
    if venv:
        py = venv / 'bin' / 'python'
    else:
        py = pathlib.Path(sys.executable)

    if cmd and cmd[0] == 'bentoml':
        cmd = [py.__fspath__(), '-m', 'bentoml'] + cmd[1:]
    if cmd and cmd[0] == 'python':
        cmd = [py.__fspath__()] + cmd[1:]
  if copy_env:
    env = EnvVars({**os.environ, **env})

    proc = None
    try:
        proc = await asyncio.create_subprocess_shell(
            ' '.join(map(str, cmd)), stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE, cwd=cwd, env=env
        )
        yield proc
    except subprocess.CalledProcessError:
        output('Command failed', style='red')
        raise typer.Exit(1)
    finally:
        if proc:
            proc.send_signal(signal.SIGINT)
            await proc.wait()
  if cmd and cmd[0] == 'bentoml':
    cmd = [py.__fspath__(), '-m', 'bentoml', *cmd[1:]]
  if cmd and cmd[0] == 'python':
    cmd = [py.__fspath__(), *cmd[1:]]

  proc = None
  try:
    proc = await asyncio.create_subprocess_shell(
      ' '.join(map(str, cmd)),
      stdout=asyncio.subprocess.PIPE,
      stderr=asyncio.subprocess.PIPE,
      cwd=cwd,
      env=env,
    )
    yield proc
  except subprocess.CalledProcessError:
    output('Command failed', style='red')
    raise typer.Exit(1)
  finally:
    if proc:
      proc.send_signal(signal.SIGINT)
      await proc.wait()


def md5(*strings: str) -> int:
    m = hashlib.md5()
    for s in strings:
        m.update(s.encode())
    return int(m.hexdigest(), 16)
  m = hashlib.md5()
  for s in strings:
    m.update(s.encode())
  return int(m.hexdigest(), 16)
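
Unlike Python's builtin hash(), which is salted per process, this md5 helper yields a stable integer digest, which is why BentoInfo.__hash__ and VenvSpec.__hash__ build on it. A small sketch:

  assert md5('a', 'b') == md5('a', 'b')   # deterministic across runs and machines
  assert md5('a', 'b') == md5('ab')       # inputs are concatenated into one digest stream
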
@@ -4,103 +4,114 @@ import asyncio, time, typing
import httpx, openai

from openai.types.chat import ChatCompletionAssistantMessageParam, ChatCompletionUserMessageParam
from openllm.common import BentoInfo, EnvVars, async_run_command, output, run_command, stream_command_output
from openllm.common import (
  BentoInfo,
  EnvVars,
  async_run_command,
  output,
  run_command,
  stream_command_output,
)
from openllm.venv import ensure_venv

if typing.TYPE_CHECKING:
    from openai.types.chat import ChatCompletionMessageParam
  from openai.types.chat import ChatCompletionMessageParam


def prep_env_vars(bento: BentoInfo) -> None:
    import os
  import os

    env_vars = bento.envs
    for env_var in env_vars:
        if not env_var.get('value'):
            continue
        key = env_var['name']
        value = env_var['value']
        os.environ[key] = value
  env_vars = bento.envs
  for env_var in env_vars:
    if not env_var.get('value'):
      continue
    key = env_var['name']
    value = env_var['value']
    os.environ[key] = value


def _get_serve_cmd(bento: BentoInfo, port: int = 3000) -> tuple[list[str], EnvVars]:
    cmd = ['bentoml', 'serve', bento.bentoml_tag]
    if port != 3000:
        cmd += ['--port', str(port)]
    return cmd, EnvVars({'BENTOML_HOME': f'{bento.repo.path}/bentoml'})
  cmd = ['bentoml', 'serve', bento.bentoml_tag]
  if port != 3000:
    cmd += ['--port', str(port)]
  return cmd, EnvVars({'BENTOML_HOME': f'{bento.repo.path}/bentoml'})
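
_get_serve_cmd only assembles the argv and the BENTOML_HOME override; run_command later rewrites the leading 'bentoml' token to '<python> -m bentoml' from the resolved venv. An illustrative call, assuming a hypothetical bento whose bentoml_tag is 'llama3.1:8b':

  cmd, env = _get_serve_cmd(bento, port=3001)
  # cmd -> ['bentoml', 'serve', 'llama3.1:8b', '--port', '3001']
  # env -> EnvVars({'BENTOML_HOME': f'{bento.repo.path}/bentoml'})
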
def serve(bento: BentoInfo, port: int = 3000) -> None:
    prep_env_vars(bento)
    cmd, env = _get_serve_cmd(bento, port=port)
    venv = ensure_venv(bento, runtime_envs=env)
    output(f'Access the Chat UI at http://localhost:{port}/chat (or with your IP)')
    run_command(cmd, env=env, cwd=None, venv=venv)
  prep_env_vars(bento)
  cmd, env = _get_serve_cmd(bento, port=port)
  venv = ensure_venv(bento, runtime_envs=env)
  output(f'Access the Chat UI at http://localhost:{port}/chat (or with your IP)')
  run_command(cmd, env=env, cwd=None, venv=venv)

async def _run_model(bento: BentoInfo, port: int = 3000, timeout: int = 600) -> None:
    cmd, env = _get_serve_cmd(bento, port)
    venv = ensure_venv(bento, runtime_envs=env)
    async with async_run_command(cmd, env=env, cwd=None, venv=venv, silent=False) as server_proc:
        output(f'Model server started {server_proc.pid}')
  cmd, env = _get_serve_cmd(bento, port)
  venv = ensure_venv(bento, runtime_envs=env)
  async with async_run_command(cmd, env=env, cwd=None, venv=venv, silent=False) as server_proc:
    output(f'Model server started {server_proc.pid}')

        stdout_streamer = None
        stderr_streamer = None
        start_time = time.time()
    stdout_streamer = None
    stderr_streamer = None
    start_time = time.time()

        output('Model loading...', style='green')
        for _ in range(timeout):
            try:
                resp = httpx.get(f'http://localhost:{port}/readyz', timeout=3)
                if resp.status_code == 200:
                    break
            except httpx.RequestError:
                if time.time() - start_time > 30:
                    if not stdout_streamer:
                        stdout_streamer = asyncio.create_task(stream_command_output(server_proc.stdout, style='gray'))
                    if not stderr_streamer:
                        stderr_streamer = asyncio.create_task(
                            stream_command_output(server_proc.stderr, style='#BD2D0F')
                        )
                await asyncio.sleep(1)
        else:
            output('Model failed to load', style='red')
            server_proc.terminate()
            return
    output('Model loading...', style='green')
    for _ in range(timeout):
      try:
        resp = httpx.get(f'http://localhost:{port}/readyz', timeout=3)
        if resp.status_code == 200:
          break
      except httpx.RequestError:
        if time.time() - start_time > 30:
          if not stdout_streamer:
            stdout_streamer = asyncio.create_task(
              stream_command_output(server_proc.stdout, style='gray')
            )
          if not stderr_streamer:
            stderr_streamer = asyncio.create_task(
              stream_command_output(server_proc.stderr, style='#BD2D0F')
            )
        await asyncio.sleep(1)
    else:
      output('Model failed to load', style='red')
      server_proc.terminate()
      return

        if stdout_streamer:
            stdout_streamer.cancel()
        if stderr_streamer:
            stderr_streamer.cancel()
    if stdout_streamer:
      stdout_streamer.cancel()
    if stderr_streamer:
      stderr_streamer.cancel()

        output('Model is ready', style='green')
        messages: list[ChatCompletionMessageParam] = []
    output('Model is ready', style='green')
    messages: list[ChatCompletionMessageParam] = []

        client = openai.AsyncOpenAI(base_url=f'http://localhost:{port}/v1', api_key='local')
        while True:
            try:
                message = input('user: ')
                if message == '':
                    output('empty message, please enter something', style='yellow')
                    continue
                messages.append(ChatCompletionUserMessageParam(role='user', content=message))
                output('assistant: ', end='', style='lightgreen')
                assistant_message = ''
                stream = await client.chat.completions.create(
                    model=(await client.models.list()).data[0].id, messages=messages, stream=True
                )
                async for chunk in stream:
                    text = chunk.choices[0].delta.content or ''
                    assistant_message += text
                    output(text, end='', style='lightgreen')
                messages.append(ChatCompletionAssistantMessageParam(role='assistant', content=assistant_message))
                output('')
            except KeyboardInterrupt:
                break
        output('\nStopping model server...', style='green')
    output('Stopped model server', style='green')
    client = openai.AsyncOpenAI(base_url=f'http://localhost:{port}/v1', api_key='local')
    while True:
      try:
        message = input('user: ')
        if message == '':
          output('empty message, please enter something', style='yellow')
          continue
        messages.append(ChatCompletionUserMessageParam(role='user', content=message))
        output('assistant: ', end='', style='lightgreen')
        assistant_message = ''
        stream = await client.chat.completions.create(
          model=(await client.models.list()).data[0].id, messages=messages, stream=True
        )
        async for chunk in stream:
          text = chunk.choices[0].delta.content or ''
          assistant_message += text
          output(text, end='', style='lightgreen')
        messages.append(
          ChatCompletionAssistantMessageParam(role='assistant', content=assistant_message)
        )
        output('')
      except KeyboardInterrupt:
        break
    output('\nStopping model server...', style='green')
  output('Stopped model server', style='green')


def run(bento: BentoInfo, port: int = 3000, timeout: int = 600) -> None:
    prep_env_vars(bento)
    asyncio.run(_run_model(bento, port=port, timeout=timeout))
  prep_env_vars(bento)
  asyncio.run(_run_model(bento, port=port, timeout=timeout))

@@ -14,155 +14,159 @@ app = OpenLLMTyper(help='manage models')

@app.command(help='get model')
def get(tag: str, repo: typing.Optional[str] = None, verbose: bool = False) -> None:
    if verbose:
        VERBOSE_LEVEL.set(20)
    bento_info = ensure_bento(tag, repo_name=repo)
    if bento_info:
        output_(bento_info)
  if verbose:
    VERBOSE_LEVEL.set(20)
  bento_info = ensure_bento(tag, repo_name=repo)
  if bento_info:
    output_(bento_info)


@app.command(name='list', help='list available models')
def list_model(
    tag: typing.Optional[str] = None,
    repo: typing.Optional[str] = None,
    verbose: bool = False,
    output: typing.Optional[str] = typer.Option(None, hidden=True),
  tag: typing.Optional[str] = None,
  repo: typing.Optional[str] = None,
  verbose: bool = False,
  output: typing.Optional[str] = typer.Option(None, hidden=True),
) -> None:
    if verbose:
        VERBOSE_LEVEL.set(20)
  if verbose:
    VERBOSE_LEVEL.set(20)

    bentos = list_bento(tag=tag, repo_name=repo)
    bentos.sort(key=lambda x: x.name)
  bentos = list_bento(tag=tag, repo_name=repo)
  bentos.sort(key=lambda x: x.name)

    seen = set()
  seen = set()

    def is_seen(value: str) -> bool:
        if value in seen:
            return True
        seen.add(value)
        return False
  def is_seen(value: str) -> bool:
    if value in seen:
      return True
    seen.add(value)
    return False

    if output == 'readme':
        # Parse parameters from bento.tag (e.g. "model:671b-it" -> "671b", 'model:something-long-78b' -> '78b')
        questionary.print(
            json.dumps({
                f'{bento.name}': dict(
                    tag=bento.tag,
                    version=bento.tag.split(':')[-1],
                    pretty_gpu=bento.pretty_gpu,
                    command=f'openllm serve {bento.tag}',
                )
                for bento in bentos
                if not is_seen(bento.name)
            })
  if output == 'readme':
    # Parse parameters from bento.tag (e.g. "model:671b-it" -> "671b", 'model:something-long-78b' -> '78b')
    questionary.print(
      json.dumps({
        f'{bento.name}': dict(
          tag=bento.tag,
          version=bento.tag.split(':')[-1],
          pretty_gpu=bento.pretty_gpu,
          command=f'openllm serve {bento.tag}',
        )
        return

    table = tabulate.tabulate(
        [
            [
                '' if is_seen(bento.name) else bento.name,
                bento.tag,
                bento.repo.name,
                bento.pretty_gpu,
                ','.join(bento.platforms),
            ]
            for bento in bentos
        ],
        headers=['model', 'version', 'repo', 'required GPU RAM', 'platforms'],
        for bento in bentos
        if not is_seen(bento.name)
      })
    )
    output_(table)
    return

  table = tabulate.tabulate(
    [
      [
        '' if is_seen(bento.name) else bento.name,
        bento.tag,
        bento.repo.name,
        bento.pretty_gpu,
        ','.join(bento.platforms),
      ]
      for bento in bentos
    ],
    headers=['model', 'version', 'repo', 'required GPU RAM', 'platforms'],
  )
  output_(table)


def ensure_bento(
    model: str, target: typing.Optional[DeploymentTarget] = None, repo_name: typing.Optional[str] = None
  model: str,
  target: typing.Optional[DeploymentTarget] = None,
  repo_name: typing.Optional[str] = None,
) -> BentoInfo:
    bentos = list_bento(model, repo_name=repo_name)
    if len(bentos) == 0:
        output_(f'No model found for {model}', style='red')
        raise typer.Exit(1)

    if len(bentos) == 1:
        output_(f'Found model {bentos[0]}', style='green')
        if target is not None and can_run(bentos[0], target) <= 0:
            output_(
                f'The machine({target.name}) with {target.accelerators_repr} does not appear to have sufficient '
                f'resources to run model {bentos[0]}\n',
                style='yellow',
            )
        return bentos[0]

    # multiple models, pick one according to target
    output_(f'Multiple models match {model}, did you mean one of these?', style='red')
    list_model(model, repo=repo_name)
  bentos = list_bento(model, repo_name=repo_name)
  if len(bentos) == 0:
    output_(f'No model found for {model}', style='red')
    raise typer.Exit(1)

  if len(bentos) == 1:
    output_(f'Found model {bentos[0]}', style='green')
    if target is not None and can_run(bentos[0], target) <= 0:
      output_(
        f'The machine({target.name}) with {target.accelerators_repr} does not appear to have sufficient '
        f'resources to run model {bentos[0]}\n',
        style='yellow',
      )
    return bentos[0]

  # multiple models, pick one according to target
  output_(f'Multiple models match {model}, did you mean one of these?', style='red')
  list_model(model, repo=repo_name)
  raise typer.Exit(1)


NUMBER_RE = re.compile(r'\d+')


def _extract_first_number(s: str) -> int:
    match = NUMBER_RE.search(s)
    if match:
        return int(match.group())
    else:
        return 100
  match = NUMBER_RE.search(s)
  if match:
    return int(match.group())
  else:
    return 100
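
_extract_first_number supplies the numeric component of the sort key in list_bento below, so version directories order by their parameter count, with digit-free names pushed behind the numbered ones. For instance:

  assert _extract_first_number('8b-instruct') == 8
  assert _extract_first_number('70b') == 70
  assert _extract_first_number('latest') == 100   # no digits -> sorts after numbered tags
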
def list_bento(
    tag: typing.Optional[str] = None, repo_name: typing.Optional[str] = None, include_alias: bool = False
  tag: typing.Optional[str] = None,
  repo_name: typing.Optional[str] = None,
  include_alias: bool = False,
) -> typing.List[BentoInfo]:
    ensure_repo_updated()
  ensure_repo_updated()

    if repo_name is None and tag and '/' in tag:
        repo_name, tag = tag.split('/', 1)
  if repo_name is None and tag and '/' in tag:
    repo_name, tag = tag.split('/', 1)

    repo_list = list_repo(repo_name)
    if repo_name is not None:
        repo_map = {repo.name: repo for repo in repo_list}
        if repo_name not in repo_map:
            output_(f'Repo `{repo_name}` not found, did you mean one of these?')
            for repo_name in repo_map:
                output_(f' {repo_name}')
            raise typer.Exit(1)
  repo_list = list_repo(repo_name)
  if repo_name is not None:
    repo_map = {repo.name: repo for repo in repo_list}
    if repo_name not in repo_map:
      output_(f'Repo `{repo_name}` not found, did you mean one of these?')
      for repo_name in repo_map:
        output_(f' {repo_name}')
      raise typer.Exit(1)

    if not tag:
        glob_pattern = 'bentoml/bentos/*/*'
    elif ':' in tag:
        bento_name, version = tag.split(':')
        glob_pattern = f'bentoml/bentos/{bento_name}/{version}'
    else:
        glob_pattern = f'bentoml/bentos/{tag}/*'
  if not tag:
    glob_pattern = 'bentoml/bentos/*/*'
  elif ':' in tag:
    bento_name, version = tag.split(':')
    glob_pattern = f'bentoml/bentos/{bento_name}/{version}'
  else:
    glob_pattern = f'bentoml/bentos/{tag}/*'

    model_list: list[BentoInfo] = []
    repo_list = list_repo(repo_name)
    for repo in repo_list:
        paths = sorted(
            repo.path.glob(glob_pattern),
            key=lambda x: (x.parent.name, _extract_first_number(x.name), len(x.name), x.name),
        )
        for path in paths:
            if path.is_dir() and (path / 'bento.yaml').exists():
                model = BentoInfo(repo=repo, path=path)
            elif path.is_file():
                with open(path) as f:
                    origin_name = f.read().strip()
                origin_path = path.parent / origin_name
                model = BentoInfo(alias=path.name, repo=repo, path=origin_path)
            else:
                model = None
            if model:
                model_list.append(model)
  model_list: list[BentoInfo] = []
  repo_list = list_repo(repo_name)
  for repo in repo_list:
    paths = sorted(
      repo.path.glob(glob_pattern),
      key=lambda x: (x.parent.name, _extract_first_number(x.name), len(x.name), x.name),
    )
    for path in paths:
      if path.is_dir() and (path / 'bento.yaml').exists():
        model = BentoInfo(repo=repo, path=path)
      elif path.is_file():
        with open(path) as f:
          origin_name = f.read().strip()
        origin_path = path.parent / origin_name
        model = BentoInfo(alias=path.name, repo=repo, path=origin_path)
      else:
        model = None
      if model:
        model_list.append(model)

    if not include_alias:
        seen: set[str] = set()
        # we rely on the side effect of seen.add() inside the condition here.
        model_list = [
            x
            for x in model_list
            if not (
                f'{x.bento_yaml["name"]}:{x.bento_yaml["version"]}' in seen
                or seen.add(f'{x.bento_yaml["name"]}:{x.bento_yaml["version"]}')  # type: ignore
            )
        ]
    return model_list
  if not include_alias:
    seen: set[str] = set()
    # we rely on the side effect of seen.add() inside the condition here.
    model_list = [
      x
      for x in model_list
      if not (
        f'{x.bento_yaml["name"]}:{x.bento_yaml["version"]}' in seen
        or seen.add(f'{x.bento_yaml["name"]}:{x.bento_yaml["version"]}')  # type: ignore
      )
    ]
  return model_list

@@ -4,7 +4,15 @@ import datetime, subprocess, re, shutil, typing, os, pathlib
|
||||
import pyaml, questionary, typer
|
||||
|
||||
from openllm.analytic import OpenLLMTyper
|
||||
from openllm.common import INTERACTIVE, REPO_DIR, VERBOSE_LEVEL, RepoInfo, load_config, output, save_config
|
||||
from openllm.common import (
|
||||
INTERACTIVE,
|
||||
REPO_DIR,
|
||||
VERBOSE_LEVEL,
|
||||
RepoInfo,
|
||||
load_config,
|
||||
output,
|
||||
save_config,
|
||||
)
|
||||
|
||||
UPDATE_INTERVAL = datetime.timedelta(days=3)
|
||||
TEST_REPO = os.getenv('OPENLLM_TEST_REPO', None) # for testing
|
||||
@@ -15,223 +23,248 @@ app = OpenLLMTyper(help='manage repos')
|
||||
|
||||
@app.command(name='list', help='list available repo')
|
||||
def cmd_list(verbose: bool = False) -> None:
|
||||
if verbose:
|
||||
VERBOSE_LEVEL.set(20)
|
||||
pyaml.pprint(list_repo(), sort_dicts=False, sort_keys=False)
|
||||
if verbose:
|
||||
VERBOSE_LEVEL.set(20)
|
||||
pyaml.pprint(list_repo(), sort_dicts=False, sort_keys=False)
|
||||
|
||||
|
||||
@app.command(name='remove', help='remove given repo')
|
||||
def cmd_remove(name: str) -> None:
|
||||
if TEST_REPO:
|
||||
return
|
||||
config = load_config()
|
||||
if name not in config.repos:
|
||||
output(f'Repo {name} does not exist', style='red')
|
||||
return
|
||||
if TEST_REPO:
|
||||
return
|
||||
config = load_config()
|
||||
if name not in config.repos:
|
||||
output(f'Repo {name} does not exist', style='red')
|
||||
return
|
||||
|
||||
del config.repos[name]
|
||||
save_config(config)
|
||||
output(f'Repo {name} removed', style='green')
|
||||
del config.repos[name]
|
||||
save_config(config)
|
||||
output(f'Repo {name} removed', style='green')
|
||||
|
||||
|
||||
@app.command(name='update', help='update default repo')
|
||||
def cmd_update() -> None:
|
||||
if TEST_REPO:
|
||||
return
|
||||
repos_in_use = set()
|
||||
for repo in list_repo():
|
||||
repos_in_use.add((repo.server, repo.owner, repo.repo, repo.branch))
|
||||
if repo.path.exists():
|
||||
shutil.rmtree(repo.path, ignore_errors=True)
|
||||
repo.path.parent.mkdir(parents=True, exist_ok=True)
|
||||
try:
|
||||
_clone_repo(repo)
|
||||
output('')
|
||||
output(f'Repo `{repo.name}` updated', style='green')
|
||||
except Exception as e:
|
||||
shutil.rmtree(repo.path, ignore_errors=True)
|
||||
output(f'Failed to clone repo {repo.name}', style='red')
|
||||
output(e)
|
||||
for c in REPO_DIR.glob('*/*/*/*'):
|
||||
repo_spec = tuple(c.parts[-4:])
|
||||
if repo_spec not in repos_in_use:
|
||||
shutil.rmtree(c, ignore_errors=True)
|
||||
output(f'Removed unused repo cache {c}')
|
||||
with open(REPO_DIR / 'last_update', 'w') as f:
|
||||
f.write(datetime.datetime.now().isoformat())
|
||||
for repo in list_repo():
|
||||
_complete_alias(repo.name)
|
||||
if TEST_REPO:
|
||||
return
|
||||
|
||||
repos_in_use = set()
|
||||
for repo in list_repo():
|
||||
# Show simplified output if not in verbose mode
|
||||
if VERBOSE_LEVEL.get() <= 0:
|
||||
output(f'updating repo {repo.name}', style='green')
|
||||
|
||||
repos_in_use.add((repo.server, repo.owner, repo.repo, repo.branch))
|
||||
if repo.path.exists():
|
||||
shutil.rmtree(repo.path, ignore_errors=True)
|
||||
repo.path.parent.mkdir(parents=True, exist_ok=True)
|
||||
try:
|
||||
_clone_repo(repo)
|
||||
if VERBOSE_LEVEL.get() > 0:
|
||||
output('')
|
||||
output(f'Repo `{repo.name}` updated', style='green')
|
||||
except Exception as e:
|
||||
shutil.rmtree(repo.path, ignore_errors=True)
|
||||
if VERBOSE_LEVEL.get() > 0:
|
||||
output(f'Failed to clone repo {repo.name}', style='red')
|
||||
output(e)
|
||||
for c in REPO_DIR.glob('*/*/*/*'):
|
||||
repo_spec = tuple(c.parts[-4:])
|
||||
if repo_spec not in repos_in_use:
|
||||
shutil.rmtree(c, ignore_errors=True)
|
||||
if VERBOSE_LEVEL.get() > 0:
|
||||
output(f'Removed unused repo cache {c}')
|
||||
with open(REPO_DIR / 'last_update', 'w') as f:
|
||||
f.write(datetime.datetime.now().isoformat())
|
||||
for repo in list_repo():
|
||||
_complete_alias(repo.name)
|
||||
|
||||
|
||||
@app.command(name='add', help='add new repo')
|
||||
def cmd_add(name: str, repo: str) -> None:
|
||||
if TEST_REPO:
|
||||
return
|
||||
name = name.lower()
|
||||
if not name.isidentifier():
|
||||
output(f'Invalid repo name: {name}, should only contain letters, numbers and underscores', style='red')
|
||||
return
|
||||
if TEST_REPO:
|
||||
return
|
||||
name = name.lower()
|
||||
if not name.isidentifier():
|
||||
output(
|
||||
f'Invalid repo name: {name}, should only contain letters, numbers and underscores',
|
||||
style='red',
|
||||
)
|
||||
return
|
||||
|
||||
try:
|
||||
parse_repo_url(repo)
|
||||
except ValueError:
|
||||
output(f'Invalid repo url: {repo}', style='red')
|
||||
return
|
||||
try:
|
||||
parse_repo_url(repo)
|
||||
except ValueError:
|
||||
output(f'Invalid repo url: {repo}', style='red')
|
||||
return
|
||||
|
||||
config = load_config()
|
||||
if name in config.repos:
|
||||
override = questionary.confirm(f'Repo {name} already exists({config.repos[name]}), override?').ask()
|
||||
if not override:
|
||||
return
|
||||
config = load_config()
|
||||
if name in config.repos:
|
||||
override = questionary.confirm(
|
||||
f'Repo {name} already exists({config.repos[name]}), override?'
|
||||
).ask()
|
||||
if not override:
|
||||
return
|
||||
|
||||
config.repos[name] = repo
|
||||
save_config(config)
|
||||
output(f'Repo {name} added', style='green')
|
||||
config.repos[name] = repo
|
||||
save_config(config)
|
||||
output(f'Repo {name} added', style='green')
|
||||
|
||||
|
||||
@app.command(name='default', help='get default repo path')
|
||||
def default() -> typing.Optional[pathlib.Path]:
|
||||
if TEST_REPO:
|
||||
return None
|
||||
output((info := parse_repo_url(load_config().repos['default'], 'default')).path)
|
||||
return info.path
|
||||
if TEST_REPO:
|
||||
return None
|
||||
output((info := parse_repo_url(load_config().repos['default'], 'default')).path)
|
||||
return info.path
|
||||
|
||||
|
||||
def list_repo(repo_name: typing.Optional[str] = None) -> typing.List[RepoInfo]:
|
||||
if TEST_REPO:
|
||||
return [
|
||||
RepoInfo(
|
||||
name='default',
|
||||
url='',
|
||||
server='test',
|
||||
owner='test',
|
||||
repo='test',
|
||||
branch='main',
|
||||
path=pathlib.Path(TEST_REPO),
|
||||
)
|
||||
]
|
||||
config = load_config()
|
||||
repos = []
|
||||
for _repo_name, repo_url in config.repos.items():
|
||||
if repo_name is not None and _repo_name != repo_name:
|
||||
continue
|
||||
repo = parse_repo_url(repo_url, _repo_name)
|
||||
repos.append(repo)
|
||||
return repos
|
||||
if TEST_REPO:
|
||||
return [
|
||||
RepoInfo(
|
||||
name='default',
|
||||
url='',
|
||||
server='test',
|
||||
owner='test',
|
||||
repo='test',
|
||||
branch='main',
|
||||
path=pathlib.Path(TEST_REPO),
|
||||
)
|
||||
]
|
||||
config = load_config()
|
||||
repos = []
|
||||
for _repo_name, repo_url in config.repos.items():
|
||||
if repo_name is not None and _repo_name != repo_name:
|
||||
continue
|
||||
repo = parse_repo_url(repo_url, _repo_name)
|
||||
repos.append(repo)
|
||||
return repos
|
||||
|
||||
|
||||
def _complete_alias(repo_name: str) -> None:
|
||||
from openllm.model import list_bento
|
||||
from openllm.model import list_bento
|
||||
|
||||
for bento in list_bento(repo_name=repo_name):
|
||||
alias = bento.labels.get('aliases', '').strip()
|
||||
if alias:
|
||||
for a in alias.split(','):
|
||||
with open(bento.path.parent / a, 'w') as f:
|
||||
f.write(bento.version)
|
||||
for bento in list_bento(repo_name=repo_name):
|
||||
alias = bento.labels.get('aliases', '').strip()
|
||||
if alias:
|
||||
for a in alias.split(','):
|
||||
with open(bento.path.parent / a, 'w') as f:
|
||||
f.write(bento.version)
|
||||
|
||||
|
||||
def _clone_repo(repo: RepoInfo) -> None:
|
||||
try:
|
||||
subprocess.run(['git', 'clone', '--depth=1', '-b', repo.branch, repo.url, str(repo.path)], check=True)
|
||||
except (subprocess.CalledProcessError, FileNotFoundError):
|
||||
import dulwich
|
||||
import dulwich.porcelain
|
||||
try:
|
||||
# Suppress output if verbosity level is low
|
||||
if VERBOSE_LEVEL.get() <= 0:
|
||||
subprocess.run(
|
||||
['git', 'clone', '--depth=1', '-b', repo.branch, repo.url, str(repo.path)],
|
||||
check=True,
|
||||
stdout=subprocess.DEVNULL,
|
||||
stderr=subprocess.DEVNULL,
|
||||
)
|
||||
else:
|
||||
subprocess.run(
|
||||
['git', 'clone', '--depth=1', '-b', repo.branch, repo.url, str(repo.path)], check=True
|
||||
)
|
||||
except (subprocess.CalledProcessError, FileNotFoundError):
|
||||
import dulwich
|
||||
import dulwich.porcelain
|
||||
|
||||
dulwich.porcelain.clone(repo.url, str(repo.path), checkout=True, depth=1, branch=repo.branch)
|
||||
# Dulwich doesn't have easy output suppression, but we rarely get here
|
||||
dulwich.porcelain.clone(repo.url, str(repo.path), checkout=True, depth=1, branch=repo.branch)
|
||||
|
||||
|
||||
def ensure_repo_updated() -> None:
|
||||
if TEST_REPO:
|
||||
return
|
||||
last_update_file = REPO_DIR / 'last_update'
|
||||
if not last_update_file.exists():
|
||||
if INTERACTIVE.get():
|
||||
choice = questionary.confirm(
|
||||
'The repo cache is never updated, do you want to update it to fetch the latest model list?'
|
||||
).ask()
|
||||
if choice:
|
||||
cmd_update()
|
||||
return
|
||||
else:
|
||||
output(
|
||||
'The repo cache is never updated, please run `openllm repo update` to fetch the latest model list',
|
||||
style='red',
|
||||
)
|
||||
raise typer.Exit(1)
|
||||
last_update = datetime.datetime.fromisoformat(last_update_file.read_text().strip())
|
||||
if datetime.datetime.now() - last_update > UPDATE_INTERVAL:
|
||||
if INTERACTIVE.get():
|
||||
choice = questionary.confirm(
|
||||
'The repo cache is outdated, do you want to update it to fetch the latest model list?'
|
||||
).ask()
|
||||
if choice:
|
||||
cmd_update()
|
||||
else:
|
||||
output(
|
||||
'The repo cache is outdated, please run `openllm repo update` to fetch the latest model list',
|
||||
style='yellow',
|
||||
)
|
||||
if TEST_REPO:
|
||||
return
|
||||
last_update_file = REPO_DIR / 'last_update'
|
||||
if not last_update_file.exists():
|
||||
if INTERACTIVE.get():
|
||||
choice = questionary.confirm(
|
||||
'The repo cache is never updated, do you want to update it to fetch the latest model list?'
|
||||
).ask()
|
||||
if choice:
|
||||
cmd_update()
|
||||
return
|
||||
else:
|
||||
output(
|
||||
'The repo cache is never updated, please run `openllm repo update` to fetch the latest model list',
|
||||
style='red',
|
||||
)
|
||||
raise typer.Exit(1)
|
||||
last_update = datetime.datetime.fromisoformat(last_update_file.read_text().strip())
|
||||
if datetime.datetime.now() - last_update > UPDATE_INTERVAL:
|
||||
if INTERACTIVE.get():
|
||||
choice = questionary.confirm(
|
||||
'The repo cache is outdated, do you want to update it to fetch the latest model list?'
|
||||
).ask()
|
||||
if choice:
|
||||
cmd_update()
|
||||
else:
|
||||
output(
|
||||
'The repo cache is outdated, please run `openllm repo update` to fetch the latest model list',
|
||||
style='yellow',
|
||||
)
|
||||
|
||||
|
||||
GIT_HTTP_RE = re.compile(
|
||||
r'(?P<schema>git|ssh|http|https):\/\/(?P<server>[\.\w\d\-]+)\/(?P<owner>[\w\d\-]+)\/(?P<repo>[\w\d\-\_\.]+)(@(?P<branch>.+))?(\/)?$'
|
||||
r'(?P<schema>git|ssh|http|https):\/\/(?P<server>[\.\w\d\-]+)\/(?P<owner>[\w\d\-]+)\/(?P<repo>[\w\d\-\_\.]+)(@(?P<branch>.+))?(\/)?$'
|
||||
)
|
||||
GIT_SSH_RE = re.compile(
|
||||
r'git@(?P<server>[\.\w\d-]+):(?P<owner>[\w\d\-]+)\/(?P<repo>[\w\d\-\_\.]+)(@(?P<branch>.+))?(\/)?$'
|
||||
r'git@(?P<server>[\.\w\d-]+):(?P<owner>[\w\d\-]+)\/(?P<repo>[\w\d\-\_\.]+)(@(?P<branch>.+))?(\/)?$'
|
||||
)
|
||||
|
||||
|
||||
def parse_repo_url(repo_url: str, repo_name: typing.Optional[str] = None) -> RepoInfo:
  """
  Parse a git repo URL into server, owner, repo name, and branch.

  >>> parse_repo_url('https://github.com/bentoml/bentovllm@main')
  ('github.com', 'bentoml', 'bentovllm', 'main')
  >>> parse_repo_url('https://github.com/bentoml/bentovllm.git@main')
  ('github.com', 'bentoml', 'bentovllm', 'main')
  >>> parse_repo_url('https://github.com/bentoml/bentovllm')
  ('github.com', 'bentoml', 'bentovllm', 'main')
  >>> parse_repo_url('git@github.com:bentoml/openllm-models.git')
  ('github.com', 'bentoml', 'openllm-models', 'main')
  """
  match = GIT_HTTP_RE.match(repo_url)
  if match:
    schema = match.group('schema')
  else:
    match = GIT_SSH_RE.match(repo_url)
    if not match:
      raise ValueError(f'Invalid git repo url: {repo_url}')
    schema = None

  if match.group('branch') is not None:
    repo_url = repo_url[: match.start('branch') - 1]

  server = match.group('server')
  owner = match.group('owner')
  repo = match.group('repo')
  if repo.endswith('.git'):
    repo = repo[:-4]
  branch = match.group('branch') or 'main'

  if schema is not None:
    repo_url = f'{schema}://{server}/{owner}/{repo}'
  else:
    repo_url = f'git@{server}:{owner}/{repo}'

  path = REPO_DIR / server / owner / repo / branch
  return RepoInfo(
    name=repo if repo_name is None else repo_name,
    url=repo_url,
    server=server,
    owner=owner,
    repo=repo,
    branch=branch,
    path=path,
  )
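RepoInfo itself is defined elsewhere (presumably in openllm.common, alongside BentoInfo); this diff only shows its constructor call. A sketch of the shape that call implies, for orientation only; note the doctests above print a 4-tuple, so the real class evidently customizes its repr:

import pathlib, typing

class RepoInfo(typing.NamedTuple):  # assumed shape, not the actual definition
  name: str
  url: str
  server: str
  owner: str
  repo: str
  branch: str
  path: pathlib.Path
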

if __name__ == '__main__':
  app()

@@ -3,92 +3,100 @@ from __future__ import annotations
import functools, os, pathlib, shutil
import typer, yaml

from openllm.common import VENV_DIR, VERBOSE_LEVEL, BentoInfo, EnvVars, VenvSpec, output, run_command
from openllm.common import (
  VENV_DIR,
  VERBOSE_LEVEL,
  BentoInfo,
  EnvVars,
  VenvSpec,
  output,
  run_command,
)

@functools.lru_cache
def _resolve_bento_venv_spec(bento: BentoInfo, runtime_envs: EnvVars | None = None) -> VenvSpec:
  lock_file = bento.path / 'env' / 'python' / 'requirements.lock.txt'
  if not lock_file.exists():
    lock_file = bento.path / 'env' / 'python' / 'requirements.txt'

  reqs = lock_file.read_text().strip()
  bentofile = bento.path / 'bento.yaml'
  data = yaml.safe_load(bentofile.read_text())
  bento_env_list = data.get('envs', [])
  python_version = data.get('image', {})['python_version']
  bento_envs = {e['name']: e.get('value') for e in bento_env_list}
  envs = {k: runtime_envs.get(k, v) for k, v in bento_envs.items()} if runtime_envs else {}

  return VenvSpec(
    python_version=python_version,
    requirements_txt=reqs,
    name_prefix=f'{bento.tag.replace(":", "_")}-1-',
    envs=EnvVars(envs),
  )

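For reference, the bento.yaml fields read above look roughly like the following; this is an assumed minimal shape for illustration, not the full bento schema:

import yaml

data = yaml.safe_load("""
image:
  python_version: "3.11"
envs:
  - name: HF_TOKEN
  - name: MY_FLAG
    value: "1"
""")
# Same dict comprehension as in _resolve_bento_venv_spec above:
bento_envs = {e['name']: e.get('value') for e in data.get('envs', [])}
assert bento_envs == {'HF_TOKEN': None, 'MY_FLAG': '1'}
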
def _ensure_venv(venv_spec: VenvSpec) -> pathlib.Path:
  venv = VENV_DIR / str(hash(venv_spec))
  if venv.exists() and not (venv / 'DONE').exists():
    shutil.rmtree(venv, ignore_errors=True)
  if not venv.exists():
    output(f'Installing model dependencies({venv})...', style='green')

    venv_py = venv / 'Scripts' / 'python.exe' if os.name == 'nt' else venv / 'bin' / 'python'
    try:
      run_command(
        ['python', '-m', 'uv', 'venv', venv.__fspath__(), '-p', venv_spec.python_version],
        silent=VERBOSE_LEVEL.get() < 10,
      )
      run_command(
        ['python', '-m', 'uv', 'pip', 'install', '-p', str(venv_py), 'bentoml'],
        silent=VERBOSE_LEVEL.get() < 10,
        env=venv_spec.envs,
      )
      with open(venv / 'requirements.txt', 'w') as f:
        f.write(venv_spec.normalized_requirements_txt)
      run_command(
        [
          'python',
          '-m',
          'uv',
          'pip',
          'install',
          '-p',
          str(venv_py),
          '-r',
          (venv / 'requirements.txt').__fspath__(),
        ],
        silent=VERBOSE_LEVEL.get() < 10,
        env=venv_spec.envs,
      )
      with open(venv / 'DONE', 'w') as f:
        f.write('DONE')
    except Exception as e:
      shutil.rmtree(venv, ignore_errors=True)
      if VERBOSE_LEVEL.get() >= 10:
        output(str(e), style='red')
      output(f'Failed to install dependencies to {venv}. Cleaned up.', style='red')
      raise typer.Exit(1)
    output(f'Successfully installed dependencies to {venv}.', style='green')
    return venv
  else:
    return venv


def ensure_venv(bento: BentoInfo, runtime_envs: EnvVars | None = None) -> pathlib.Path:
  venv_spec = _resolve_bento_venv_spec(bento, runtime_envs=EnvVars(runtime_envs))
  venv = _ensure_venv(venv_spec)
  assert venv is not None
  return venv

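Putting the helpers together, the typical call flow looks like this; a sketch only, where `bento` stands in for a BentoInfo resolved elsewhere in the CLI and the token value is a placeholder:

# Hypothetical usage; `bento` is a BentoInfo obtained elsewhere.
venv = ensure_venv(bento, runtime_envs=EnvVars({'HF_TOKEN': '<your-token>'}))
# The venv path is keyed by hash(venv_spec), so the same bento + env combination
# reuses its cached environment; the 'DONE' marker guards against half-built venvs.
venv_py = venv / ('Scripts/python.exe' if os.name == 'nt' else 'bin/python')
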
def check_venv(bento: BentoInfo) -> bool:
  venv_spec = _resolve_bento_venv_spec(bento)
  venv = VENV_DIR / str(hash(venv_spec))
  if not venv.exists():
    return False
  if venv.exists() and not (venv / 'DONE').exists():
    return False
  return True

75
tests/test_cli_flow.py
Normal file
@@ -0,0 +1,75 @@
from __future__ import annotations

import sys, typing

import pytest, pexpect


@pytest.fixture
def pexpect_process() -> typing.Generator[pexpect.spawn[typing.Any], None, None]:
  child = pexpect.spawn(
    f'{sys.executable} -m openllm hello', encoding='utf-8', timeout=20, echo=False
  )
  try:
    yield child
  finally:
    try:
      child.sendcontrol('c')
      child.close(force=True)
    except Exception:
      pass


def safe_expect(
  child: pexpect.spawn, pattern: str, timeout: int = 10, debug_msg: str = 'Expecting pattern'
) -> int:
  try:
    print(f"\n{debug_msg}: '{pattern}'")
    index = child.expect(pattern, timeout=timeout)
    print(f'Found match at index {index}')
    print(f'Before match: {child.before}')
    print(f'After match: {child.after}')
    return index
  except pexpect.TIMEOUT:
    print(f'TIMEOUT while {debug_msg}')
    print(f'Last output: {child.before}')
    raise
  except pexpect.EOF:
    print(f'EOF while {debug_msg}')
    print(f'Last output: {child.before}')
    raise


def test_hello_flow_to_deploy(pexpect_process: pexpect.spawn) -> None:
  child = pexpect_process

  try:
    safe_expect(child, 'Select a model', timeout=10, debug_msg='Waiting for model selection prompt')

    child.sendline('\x1b[B')
    child.sendline('\r')

    safe_expect(
      child, 'Select a version', timeout=10, debug_msg='Waiting for version selection prompt'
    )

    child.sendline('\r')

    safe_expect(
      child, 'Select an action', timeout=10, debug_msg='Waiting for action selection prompt'
    )

    child.sendline('\x1b[B')
    child.sendline('\x1b[B')

    child.sendline('\r')

    safe_expect(
      child, 'Select an instance type', timeout=10, debug_msg='Waiting for instance type prompt'
    )

    child.sendline('\r')

    child.expect('Error: .*HF_TOKEN', timeout=10)
  except Exception as e:
    pytest.fail(f'Test failed with exception: {e}')
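A note on the control sequences used above: they are standard terminal escape codes that questionary's prompts interpret as key presses. Named constants would read better; an editorial sketch, not part of the commit:

DOWN_ARROW = '\x1b[B'  # ANSI "cursor down" escape sequence
ENTER = '\r'           # carriage return confirms the highlighted choice
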
101
uv.lock
generated
@@ -214,7 +214,7 @@ wheels = [

[[package]]
name = "bentoml"
version = "1.4.5"
version = "1.4.8"
source = { registry = "https://pypi.org/simple" }
dependencies = [
    { name = "a2wsgi" },
@@ -261,9 +261,9 @@ dependencies = [
    { name = "uvicorn" },
    { name = "watchfiles" },
]
sdist = { url = "https://files.pythonhosted.org/packages/dc/df/6e5a260aaf2ee5da3d797374f81bba087fdcb8b521c7cb7441d390e266b6/bentoml-1.4.5.tar.gz", hash = "sha256:372d6d2f93dbcef38eefd568d0a9c99bfd8b5fbb7202983d948de03efa5cc961", size = 967625 }
sdist = { url = "https://files.pythonhosted.org/packages/87/a4/7ba2d3cfea05e4d9505b4aedfec17477771bc5dc98ed4d818f83cdc23093/bentoml-1.4.8.tar.gz", hash = "sha256:fb7e1d21a415645afdeb928f45a1950b7409960b5d9360189b777640c96f7103", size = 970299 }
wheels = [
    { url = "https://files.pythonhosted.org/packages/23/26/64bfa28ce0b9e29e825a656e4785eb39b5ab4ca7abb6dbe1e25d856ac716/bentoml-1.4.5-py3-none-any.whl", hash = "sha256:31ecdf26e4addcf62c03a356b629925f5c3aca304d73a5cdf60c1bcbf5e19eb2", size = 1147638 },
    { url = "https://files.pythonhosted.org/packages/cb/3e/c4adc9c48ceab6bfd8735f125f1b2ec58c6a636b4f2c092349c02e1beb71/bentoml-1.4.8-py3-none-any.whl", hash = "sha256:b33765e15101348fa6ca1fe68f07b3309ad4ea5c8823e56c2358a1b09b29edbb", size = 1150381 },
]

[[package]]
@@ -681,6 +681,21 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/95/04/ff642e65ad6b90db43e668d70ffb6736436c7ce41fcc549f4e9472234127/h11-0.14.0-py3-none-any.whl", hash = "sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761", size = 58259 },
]

[[package]]
name = "hf-xet"
version = "1.0.3"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/95/68/4c363b2e62cb3dbe12d2257ba9b22f101384692d4b9727c5f72433472cff/hf_xet-1.0.3.tar.gz", hash = "sha256:a6d16861a06dd4b8f7229c16b392c5fb8b9588ced89a6ee9bc3e66227f794353", size = 257227 }
wheels = [
    { url = "https://files.pythonhosted.org/packages/26/12/ebbba4b64cb9c908bd5dee355da27f3cc5ad4f29b4b2835041d363388363/hf_xet-1.0.3-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:0705e5db0da5794ab048a8662a7b3aba220f963270b26abc92e8d05abca22451", size = 4979740 },
    { url = "https://files.pythonhosted.org/packages/58/8f/34eadc408b834bcb55886b242a9783da3f63508c4bcbfda7a4f21e61f3d1/hf_xet-1.0.3-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:09a9565ca84049d48c99c83a82d08fbc21d63c04811fd2f7dd088292c1185bc5", size = 4806773 },
    { url = "https://files.pythonhosted.org/packages/a1/de/00b2e2568a39c01b0e013db3300f4d5841f2e597d7b0518923c7881bd166/hf_xet-1.0.3-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:70e18534d46ab92bbc3125addaebc145f9b27e06eecd67b40c4342f4b92b677f", size = 53812632 },
    { url = "https://files.pythonhosted.org/packages/e2/d8/4ff790370a6795418196553c33e7bcceaa73a7d587e21e4ccb7661b54a2a/hf_xet-1.0.3-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:da28fd32213ad5b8f60771aba44ac032ba19d752928cfd95914f09146b3f51ec", size = 52277180 },
    { url = "https://files.pythonhosted.org/packages/83/dd/7b432918a3e9e09794674b81e852acc6e14177c0a4466ac0566b7e7f47a4/hf_xet-1.0.3-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:1b71118b8f7e9edf1ae56282388794f351163c7de5c22ea3737dffa9313f500e", size = 53309852 },
    { url = "https://files.pythonhosted.org/packages/4d/a2/d7a5f452a3a8faaa82aeb3aceddab2e103c1b7028a00bbc4caebca5d79fe/hf_xet-1.0.3-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:5927d1986f87b7b80616eb6353a1402be1d72c46b6b0709b01ffc7623a159563", size = 53739471 },
    { url = "https://files.pythonhosted.org/packages/82/81/966f800933043c0be989306f5224ef058543f7848f1e78d7ef3305bd069a/hf_xet-1.0.3-cp37-abi3-win_amd64.whl", hash = "sha256:014b5a40e62ad334f21513e5ba39b419117396031e9264dfc15dd598a1595029", size = 4123538 },
]

[[package]]
name = "httpcore"
version = "1.0.7"
@@ -763,6 +778,15 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/a0/d9/a1e041c5e7caa9a05c925f4bdbdfb7f006d1f74996af53467bc394c97be7/importlib_metadata-8.5.0-py3-none-any.whl", hash = "sha256:45e54197d28b7a7f1559e60b95e7c567032b602131fbd588f1497f47880aa68b", size = 26514 },
]

[[package]]
name = "iniconfig"
version = "2.1.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/f2/97/ebf4da567aa6827c909642694d71c9fcf53e5b504f2d96afea02718862f3/iniconfig-2.1.0.tar.gz", hash = "sha256:3abbd2e30b36733fee78f9c7f7308f2d0050e88f0087fd25c2645f63c773e1c7", size = 4793 }
wheels = [
    { url = "https://files.pythonhosted.org/packages/2c/e1/e6716421ea10d38022b952c159d5161ca1193197fb744506875fbb87ea7b/iniconfig-2.1.0-py3-none-any.whl", hash = "sha256:9deba5723312380e77435581c6bf4935c94cbfab9b1ed33ef8d238ea168eb760", size = 6050 },
]

[[package]]
name = "jinja2"
version = "3.1.5"
@@ -1167,7 +1191,7 @@ wheels = [

[[package]]
name = "openai"
version = "1.66.3"
version = "1.70.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
    { name = "anyio" },
@@ -1179,9 +1203,9 @@ dependencies = [
    { name = "tqdm" },
    { name = "typing-extensions" },
]
sdist = { url = "https://files.pythonhosted.org/packages/a3/77/5172104ca1df35ed2ed8fb26dbc787f721c39498fc51d666c4db07756a0c/openai-1.66.3.tar.gz", hash = "sha256:8dde3aebe2d081258d4159c4cb27bdc13b5bb3f7ea2201d9bd940b9a89faf0c9", size = 397244 }
sdist = { url = "https://files.pythonhosted.org/packages/87/f5/ae0f3cd226c2993b4ac1cc4b5f6ca099764689f403c14922c9356accec66/openai-1.70.0.tar.gz", hash = "sha256:e52a8d54c3efeb08cf58539b5b21a5abef25368b5432965e4de88cdf4e091b2b", size = 409640 }
wheels = [
    { url = "https://files.pythonhosted.org/packages/78/5a/e20182f7b6171642d759c548daa0ba20a1d3ac10d2bd0a13fd75704a9ac3/openai-1.66.3-py3-none-any.whl", hash = "sha256:a427c920f727711877ab17c11b95f1230b27767ba7a01e5b66102945141ceca9", size = 567400 },
    { url = "https://files.pythonhosted.org/packages/e2/39/c4b38317d2c702c4bc763957735aaeaf30dfc43b5b824121c49a4ba7ba0f/openai-1.70.0-py3-none-any.whl", hash = "sha256:f6438d053fd8b2e05fd6bef70871e832d9bbdf55e119d0ac5b92726f1ae6f614", size = 599070 },
]

[[package]]
@@ -1190,6 +1214,7 @@ source = { editable = "." }
dependencies = [
    { name = "bentoml" },
    { name = "dulwich" },
    { name = "hf-xet" },
    { name = "huggingface-hub" },
    { name = "nvidia-ml-py" },
    { name = "openai" },
@@ -1204,13 +1229,20 @@ dependencies = [
    { name = "uv" },
]

[package.dev-dependencies]
tests = [
    { name = "pexpect" },
    { name = "pytest" },
]

[package.metadata]
requires-dist = [
    { name = "bentoml", specifier = "==1.4.5" },
    { name = "bentoml", specifier = "==1.4.8" },
    { name = "dulwich" },
    { name = "hf-xet" },
    { name = "huggingface-hub" },
    { name = "nvidia-ml-py" },
    { name = "openai", specifier = "==1.66.3" },
    { name = "openai", specifier = "==1.70.0" },
    { name = "pathlib" },
    { name = "pip-requirements-parser" },
    { name = "psutil" },
@@ -1222,6 +1254,12 @@ requires-dist = [
    { name = "uv" },
]

[package.metadata.requires-dev]
tests = [
    { name = "pexpect", specifier = ">=4.9.0" },
    { name = "pytest", specifier = ">=8.3.5" },
]

[[package]]
name = "opentelemetry-api"
version = "1.30.0"
@@ -1345,6 +1383,18 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/cc/20/ff623b09d963f88bfde16306a54e12ee5ea43e9b597108672ff3a408aad6/pathspec-0.12.1-py3-none-any.whl", hash = "sha256:a0d503e138a4c123b27490a4f7beda6a01c6f288df0e4a8b79c7eb0dc7b4cc08", size = 31191 },
]

[[package]]
name = "pexpect"
version = "4.9.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
    { name = "ptyprocess" },
]
sdist = { url = "https://files.pythonhosted.org/packages/42/92/cc564bf6381ff43ce1f4d06852fc19a2f11d180f23dc32d9588bee2f149d/pexpect-4.9.0.tar.gz", hash = "sha256:ee7d41123f3c9911050ea2c2dac107568dc43b2d3b0c7557a33212c398ead30f", size = 166450 }
wheels = [
    { url = "https://files.pythonhosted.org/packages/9e/c3/059298687310d527a58bb01f3b1965787ee3b40dce76752eda8b44e9a2c5/pexpect-4.9.0-py2.py3-none-any.whl", hash = "sha256:7236d1e080e4936be2dc3e326cec0af72acf9212a7e1d060210e70a47e253523", size = 63772 },
]

[[package]]
name = "pip-requirements-parser"
version = "32.0.1"
@@ -1358,6 +1408,15 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/54/d0/d04f1d1e064ac901439699ee097f58688caadea42498ec9c4b4ad2ef84ab/pip_requirements_parser-32.0.1-py3-none-any.whl", hash = "sha256:4659bc2a667783e7a15d190f6fccf8b2486685b6dba4c19c3876314769c57526", size = 35648 },
]

[[package]]
name = "pluggy"
version = "1.5.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/96/2d/02d4312c973c6050a18b314a5ad0b3210edb65a906f868e31c111dede4a6/pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1", size = 67955 }
wheels = [
    { url = "https://files.pythonhosted.org/packages/88/5f/e351af9a41f866ac3f1fac4ca0613908d9a41741cfcf2228f4ad853b697d/pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669", size = 20556 },
]

[[package]]
name = "prometheus-client"
version = "0.21.1"
@@ -1483,6 +1542,15 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/50/1b/6921afe68c74868b4c9fa424dad3be35b095e16687989ebbb50ce4fceb7c/psutil-7.0.0-cp37-abi3-win_amd64.whl", hash = "sha256:4cf3d4eb1aa9b348dec30105c55cd9b7d4629285735a102beb4441e38db90553", size = 244885 },
]

[[package]]
name = "ptyprocess"
version = "0.7.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/20/e5/16ff212c1e452235a90aeb09066144d0c5a6a8c0834397e03f5224495c4e/ptyprocess-0.7.0.tar.gz", hash = "sha256:5c5d0a3b48ceee0b48485e0c26037c0acd7d29765ca3fbb5cb3831d347423220", size = 70762 }
wheels = [
    { url = "https://files.pythonhosted.org/packages/22/a6/858897256d0deac81a172289110f31629fc4cee19b6f01283303e18c8db3/ptyprocess-0.7.0-py2.py3-none-any.whl", hash = "sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35", size = 13993 },
]

[[package]]
name = "pyaml"
version = "25.1.0"
@@ -1633,6 +1701,23 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/1c/a7/c8a2d361bf89c0d9577c934ebb7421b25dc84bf3a8e3ac0a40aed9acc547/pyparsing-3.2.1-py3-none-any.whl", hash = "sha256:506ff4f4386c4cec0590ec19e6302d3aedb992fdc02c761e90416f158dacf8e1", size = 107716 },
]

[[package]]
name = "pytest"
version = "8.3.5"
source = { registry = "https://pypi.org/simple" }
dependencies = [
    { name = "colorama", marker = "sys_platform == 'win32'" },
    { name = "exceptiongroup", marker = "python_full_version < '3.11'" },
    { name = "iniconfig" },
    { name = "packaging" },
    { name = "pluggy" },
    { name = "tomli", marker = "python_full_version < '3.11'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/ae/3c/c9d525a414d506893f0cd8a8d0de7706446213181570cdbd766691164e40/pytest-8.3.5.tar.gz", hash = "sha256:f4efe70cc14e511565ac476b57c279e12a855b11f48f212af1080ef2263d3845", size = 1450891 }
wheels = [
    { url = "https://files.pythonhosted.org/packages/30/3d/64ad57c803f1fa1e963a7946b6e0fea4a70df53c1a7fed304586539c2bac/pytest-8.3.5-py3-none-any.whl", hash = "sha256:c69214aa47deac29fad6c2a4f590b9c4a9fdb16a403176fe154b79c0b4d4d820", size = 343634 },
]

[[package]]
name = "python-dateutil"
version = "2.9.0.post0"