fix(breaking): remove embeddings and update client implementation (#500)

Author: Aaron Pham (committed by GitHub)
Date: 2023-10-14 16:04:35 -04:00
Parent: 9f347b189a
Commit: c1ca7ccd3b
19 changed files with 339 additions and 736 deletions
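Since this is a breaking change, a minimal migration sketch may help. It assumes only what the hunks below show: `openllm.client.HTTPClient(endpoint, timeout=...)`, a blocking `generate` returning a `Response` with `.responses`, and `generate_stream` yielding `StreamResponse` items with `.text`; the endpoint URL is a placeholder.

```python
import openllm

# Placeholder endpoint; point this at a running OpenLLM server.
client = openllm.client.HTTPClient('http://localhost:3000', timeout=30)

# Before this commit (removed): client.embed([...]) returned embeddings.
# After this commit, generation is the supported client path:
res = client.generate('What is the meaning of life?')
print(res.responses[0])

# Or stream chunks as they arrive:
for chunk in client.generate_stream('What is the meaning of life?'):
  print(chunk.text, end='', flush=True)
```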


@@ -21,7 +21,6 @@ bentomodel = openllm.import_model("falcon", model_id='tiiuae/falcon-7b-instruct'
"""
from __future__ import annotations
import functools
import http.client
import inspect
import itertools
import logging
@@ -112,7 +111,8 @@ if t.TYPE_CHECKING:
from bentoml._internal.bento import BentoStore
from bentoml._internal.container import DefaultBuilder
from openllm_core._schema import EmbeddingsOutput
from openllm_client._schemas import Response
from openllm_client._schemas import StreamResponse
from openllm_core._typing_compat import LiteralContainerRegistry
from openllm_core._typing_compat import LiteralContainerVersionStrategy
else:
@@ -130,17 +130,20 @@ OPENLLM_FIGLET = '''\
'''
ServeCommand = t.Literal['serve', 'serve-grpc']
@attr.define
class GlobalOptions:
cloud_context: str | None = attr.field(default=None)
def with_options(self, **attrs: t.Any) -> Self:
return attr.evolve(self, **attrs)
GrpType = t.TypeVar('GrpType', bound=click.Group)
_object_setattr = object.__setattr__
_EXT_FOLDER = os.path.abspath(os.path.join(os.path.dirname(__file__), 'extension'))
class Extensions(click.MultiCommand):
def list_commands(self, ctx: click.Context) -> list[str]:
return sorted([filename[:-3] for filename in os.listdir(_EXT_FOLDER) if filename.endswith('.py') and not filename.startswith('__')])
@@ -151,6 +154,7 @@ class Extensions(click.MultiCommand):
except ImportError:
return None
return mod.cli
class OpenLLMCommandGroup(BentoMLCommandGroup):
NUMBER_OF_COMMON_PARAMS = 5 # parameters in common_params + 1 faked group option header
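The hunk above keeps only the tail of `Extensions.get_command`. Below is a hedged reconstruction of the lookup it implies; the `importlib` package path is an assumption, since the diff reveals only `_EXT_FOLDER`, the `ImportError` fallback, and `mod.cli`.

```python
from __future__ import annotations
import importlib
import click

class Extensions(click.MultiCommand):
  def get_command(self, ctx: click.Context, cmd_name: str) -> click.Command | None:
    try:
      # Hypothetical package path for the modules discovered in _EXT_FOLDER.
      mod = importlib.import_module(f'openllm.cli.extension.{cmd_name}')
    except ImportError:
      return None
    return mod.cli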
@@ -284,10 +288,12 @@ class OpenLLMCommandGroup(BentoMLCommandGroup):
if rows:
with formatter.section(_('Extensions')):
formatter.write_dl(rows)
@click.group(cls=OpenLLMCommandGroup, context_settings=termui.CONTEXT_SETTINGS, name='openllm')
@click.version_option(
None, '--version', '-v', message=f"%(prog)s, %(version)s (compiled: {'yes' if openllm.COMPILED else 'no'})\nPython ({platform.python_implementation()}) {platform.python_version()}"
)
@click.version_option(None,
'--version',
'-v',
message=f"%(prog)s, %(version)s (compiled: {'yes' if openllm.COMPILED else 'no'})\nPython ({platform.python_implementation()}) {platform.python_version()}")
def cli() -> None:
'''\b
██████╗ ██████╗ ███████╗███╗ ██╗██╗ ██╗ ███╗ ███╗
@@ -301,6 +307,7 @@ def cli() -> None:
An open platform for operating large language models in production.
Fine-tune, serve, deploy, and monitor any LLMs with ease.
'''
@cli.group(cls=OpenLLMCommandGroup, context_settings=termui.CONTEXT_SETTINGS, name='start', aliases=['start-http'])
def start_command() -> None:
'''Start any LLM as a REST server.
@@ -310,6 +317,7 @@ def start_command() -> None:
$ openllm <start|start-http> <model_name> --<options> ...
```
'''
@cli.group(cls=OpenLLMCommandGroup, context_settings=termui.CONTEXT_SETTINGS, name='start-grpc')
def start_grpc_command() -> None:
'''Start any LLM as a gRPC server.
@@ -319,6 +327,7 @@ def start_grpc_command() -> None:
$ openllm start-grpc <model_name> --<options> ...
```
'''
_start_mapping = {
'start': {
key: start_command_factory(start_command, key, _context_settings=termui.CONTEXT_SETTINGS) for key in CONFIG_MAPPING
@@ -327,6 +336,7 @@ _start_mapping = {
key: start_command_factory(start_grpc_command, key, _context_settings=termui.CONTEXT_SETTINGS, _serve_grpc=True) for key in CONFIG_MAPPING
}
}
@cli.command(name='import', aliases=['download'])
@model_name_argument
@click.argument('model_id', type=click.STRING, default=None, metavar='Optional[REMOTE_REPO/MODEL_ID | /path/to/local/model]', required=False)
@@ -337,17 +347,9 @@ _start_mapping = {
@machine_option
@backend_option
@serialisation_option
def import_command(
model_name: str,
model_id: str | None,
converter: str | None,
model_version: str | None,
output: LiteralOutput,
machine: bool,
backend: LiteralBackend,
quantize: LiteralQuantise | None,
serialisation: LiteralSerialisation | None,
) -> bentoml.Model:
def import_command(model_name: str, model_id: str | None, converter: str | None, model_version: str | None, output: LiteralOutput, machine: bool, backend: LiteralBackend,
quantize: LiteralQuantise | None, serialisation: LiteralSerialisation | None,
) -> bentoml.Model:
"""Setup LLM interactively.
It accepts two positional arguments: `model_name` and `model_id`. The first name determine
@@ -402,7 +404,13 @@ def import_command(
_serialisation = openllm_core.utils.first_not_none(serialisation, default=llm_config['serialisation'])
env = EnvVarMixin(model_name, backend=llm_config.default_backend(), model_id=model_id, quantize=quantize)
backend = first_not_none(backend, default=env['backend_value'])
llm = infer_auto_class(backend).for_model(model_name, model_id=env['model_id_value'], llm_config=llm_config, model_version=model_version, ensure_available=False, quantize=env['quantize_value'], serialisation=_serialisation)
llm = infer_auto_class(backend).for_model(model_name,
model_id=env['model_id_value'],
llm_config=llm_config,
model_version=model_version,
ensure_available=False,
quantize=env['quantize_value'],
serialisation=_serialisation)
_previously_saved = False
try:
_ref = openllm.serialisation.get(llm)
@@ -434,66 +442,40 @@ def import_command(
@workers_per_resource_option(factory=click, build=True)
@cog.optgroup.group(cls=cog.MutuallyExclusiveOptionGroup, name='Optimisation options')
@quantize_option(factory=cog.optgroup, build=True)
@click.option(
'--enable-features',
multiple=True,
nargs=1,
metavar='FEATURE[,FEATURE]',
help='Enable additional features for building this LLM Bento. Available: {}'.format(', '.join(OPTIONAL_DEPENDENCIES))
)
@click.option(
'--adapter-id',
default=None,
multiple=True,
metavar='[PATH | [remote/][adapter_name:]adapter_id][, ...]',
help="Optional adapters id to be included within the Bento. Note that if you are using relative path, '--build-ctx' must be passed."
)
@click.option('--enable-features',
multiple=True,
nargs=1,
metavar='FEATURE[,FEATURE]',
help='Enable additional features for building this LLM Bento. Available: {}'.format(', '.join(OPTIONAL_DEPENDENCIES)))
@click.option('--adapter-id',
default=None,
multiple=True,
metavar='[PATH | [remote/][adapter_name:]adapter_id][, ...]',
help="Optional adapters id to be included within the Bento. Note that if you are using relative path, '--build-ctx' must be passed.")
@click.option('--build-ctx', help='Build context. This is required if --adapter-id uses a relative path', default=None)
@model_version_option
@click.option('--dockerfile-template', default=None, type=click.File(), help='Optional custom dockerfile template to be used with this BentoLLM.')
@serialisation_option
@container_registry_option
@click.option(
'--container-version-strategy', type=click.Choice(['release', 'latest', 'nightly']), default='release', help="Default container version strategy for the image from '--container-registry'"
)
@click.option('--container-version-strategy',
type=click.Choice(['release', 'latest', 'nightly']),
default='release',
help="Default container version strategy for the image from '--container-registry'")
@cog.optgroup.group(cls=cog.MutuallyExclusiveOptionGroup, name='Utilities options')
@cog.optgroup.option(
'--containerize',
default=False,
is_flag=True,
type=click.BOOL,
help="Whether to containerize the Bento after building. '--containerize' is the shortcut of 'openllm build && bentoml containerize'."
)
@cog.optgroup.option('--containerize',
default=False,
is_flag=True,
type=click.BOOL,
help="Whether to containerize the Bento after building. '--containerize' is the shortcut of 'openllm build && bentoml containerize'.")
@cog.optgroup.option('--push', default=False, is_flag=True, type=click.BOOL, help="Whether to push the resulting bento to BentoCloud. Make sure to log in with 'bentoml cloud login' first.")
@click.option('--force-push', default=False, is_flag=True, type=click.BOOL, help='Whether to force push.')
@click.pass_context
def build_command(
ctx: click.Context,
/,
model_name: str,
model_id: str | None,
bento_version: str | None,
overwrite: bool,
output: LiteralOutput,
quantize: LiteralQuantise | None,
enable_features: tuple[str, ...] | None,
workers_per_resource: float | None,
adapter_id: tuple[str, ...],
build_ctx: str | None,
backend: LiteralBackend,
system_message: str | None,
prompt_template_file: t.IO[t.Any] | None,
machine: bool,
model_version: str | None,
dockerfile_template: t.TextIO | None,
containerize: bool,
push: bool,
serialisation: LiteralSerialisation | None,
container_registry: LiteralContainerRegistry,
container_version_strategy: LiteralContainerVersionStrategy,
force_push: bool,
**attrs: t.Any,
) -> bentoml.Bento:
def build_command(ctx: click.Context, /, model_name: str, model_id: str | None, bento_version: str | None, overwrite: bool, output: LiteralOutput, quantize: LiteralQuantise | None,
enable_features: tuple[str, ...] | None, workers_per_resource: float | None, adapter_id: tuple[str, ...], build_ctx: str | None, backend: LiteralBackend,
system_message: str | None, prompt_template_file: t.IO[t.Any] | None, machine: bool, model_version: str | None, dockerfile_template: t.TextIO | None, containerize: bool,
push: bool, serialisation: LiteralSerialisation | None, container_registry: LiteralContainerRegistry, container_version_strategy: LiteralContainerVersionStrategy,
force_push: bool, **attrs: t.Any,
) -> bentoml.Bento:
'''Package a given model into a Bento.
\b
@@ -530,7 +512,16 @@ def build_command(
if system_message: os.environ['OPENLLM_SYSTEM_MESSAGE'] = system_message
if prompt_template: os.environ['OPENLLM_PROMPT_TEMPLATE'] = prompt_template
llm = infer_auto_class(env['backend_value']).for_model(model_name, model_id=env['model_id_value'], prompt_template=prompt_template, system_message=system_message, llm_config=llm_config, ensure_available=True, model_version=model_version, quantize=env['quantize_value'], serialisation=_serialisation, **attrs)
llm = infer_auto_class(env['backend_value']).for_model(model_name,
model_id=env['model_id_value'],
prompt_template=prompt_template,
system_message=system_message,
llm_config=llm_config,
ensure_available=True,
model_version=model_version,
quantize=env['quantize_value'],
serialisation=_serialisation,
**attrs)
labels = dict(llm.identifying_params)
labels.update({'_type': llm.llm_type, '_framework': env['backend_value']})
@@ -575,18 +566,16 @@ def build_command(
raise bentoml.exceptions.NotFound(f'Rebuilding existing Bento {bento_tag}') from None
_previously_built = True
except bentoml.exceptions.NotFound:
bento = bundle.create_bento(
bento_tag,
llm_fs,
llm,
workers_per_resource=workers_per_resource,
adapter_map=adapter_map,
quantize=quantize,
extra_dependencies=enable_features,
dockerfile_template=dockerfile_template_path,
container_registry=container_registry,
container_version_strategy=container_version_strategy
)
bento = bundle.create_bento(bento_tag,
llm_fs,
llm,
workers_per_resource=workers_per_resource,
adapter_map=adapter_map,
quantize=quantize,
extra_dependencies=enable_features,
dockerfile_template=dockerfile_template_path,
container_registry=container_registry,
container_version_strategy=container_version_strategy)
except Exception as err:
raise err from None
@@ -596,12 +585,11 @@ def build_command(
termui.echo('\n' + OPENLLM_FIGLET, fg='white')
if not _previously_built: termui.echo(f'Successfully built {bento}.', fg='green')
elif not overwrite: termui.echo(f"'{model_name}' already has a Bento built [{bento}]. To overwrite it, pass '--overwrite'.", fg='yellow')
termui.echo(
'📖 Next steps:\n\n' + f"* Push to BentoCloud with 'bentoml push':\n\t$ bentoml push {bento.tag}\n\n" +
f"* Containerize your Bento with 'bentoml containerize':\n\t$ bentoml containerize {bento.tag} --opt progress=plain\n\n" +
"\tTip: To enable additional BentoML features for 'containerize', use '--enable-features=FEATURE[,FEATURE]' [see 'bentoml containerize -h' for more advanced usage]\n",
fg='blue',
)
termui.echo('📖 Next steps:\n\n' + f"* Push to BentoCloud with 'bentoml push':\n\t$ bentoml push {bento.tag}\n\n" +
f"* Containerize your Bento with 'bentoml containerize':\n\t$ bentoml containerize {bento.tag} --opt progress=plain\n\n" +
"\tTip: To enable additional BentoML features for 'containerize', use '--enable-features=FEATURE[,FEATURE]' [see 'bentoml containerize -h' for more advanced usage]\n",
fg='blue',
)
elif output == 'json':
termui.echo(orjson.dumps(bento.info.to_dict(), option=orjson.OPT_INDENT_2).decode())
else:
@@ -688,7 +676,7 @@ def models_command(ctx: click.Context, output: LiteralOutput, show_available: bo
data: list[str | tuple[str, str, list[str], str, tuple[LiteralBackend, ...]]] = []
for m, v in json_data.items():
data.extend([(m, v['architecture'], v['model_id'], v['installation'], v['backend'])])
column_widths = [int(termui.COLUMNS / 12), int(termui.COLUMNS / 6), int(termui.COLUMNS / 4), int(termui.COLUMNS / 6), int(termui.COLUMNS / 4)]
if len(data) == 0 and len(failed_initialized) > 0:
termui.echo('Exception found while parsing models:\n', fg='yellow')
@@ -716,14 +704,17 @@ def models_command(ctx: click.Context, output: LiteralOutput, show_available: bo
if show_available: json_data['local'] = local_models
termui.echo(orjson.dumps(json_data, option=orjson.OPT_INDENT_2,).decode(), fg='white')
ctx.exit(0)
@cli.command()
@model_name_argument(required=False)
@click.option('-y', '--yes', '--assume-yes', is_flag=True, help='Skip confirmation when deleting a specific model')
@click.option('--include-bentos/--no-include-bentos', is_flag=True, default=False, help='Whether to also include pruning bentos.')
@inject
def prune_command(
model_name: str | None, yes: bool, include_bentos: bool, model_store: ModelStore = Provide[BentoMLContainer.model_store], bento_store: BentoStore = Provide[BentoMLContainer.bento_store]
) -> None:
def prune_command(model_name: str | None,
yes: bool,
include_bentos: bool,
model_store: ModelStore = Provide[BentoMLContainer.model_store],
bento_store: BentoStore = Provide[BentoMLContainer.bento_store]) -> None:
'''Remove all saved models (and optionally bentos) built with OpenLLM locally.
\b
@@ -744,6 +735,7 @@ def prune_command(
if delete_confirmed:
store.delete(store_item.tag)
termui.echo(f"{store_item} deleted from {'model' if isinstance(store, ModelStore) else 'bento'} store.", fg='yellow')
def parsing_instruction_callback(ctx: click.Context, param: click.Parameter, value: list[str] | str | None) -> tuple[str, bool | str] | list[str] | str | None:
if value is None:
return value
@@ -762,6 +754,7 @@ def parsing_instruction_callback(ctx: click.Context, param: click.Parameter, val
return key, values[0]
else:
raise click.BadParameter(f'Invalid option format: {value}')
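Only fragments of `parsing_instruction_callback` survive in this hunk. As a sketch, its `tuple[str, bool | str]` return type and the `BadParameter` branch suggest a contract like the one below; the exact parsing rules are an assumption, not copied from the source.

```python
def parse_instruction(value: str) -> tuple[str, bool | str]:
  # Assumed contract: 'key=value' yields (key, 'value'); a bare 'key'
  # reads as a boolean flag; anything malformed is rejected.
  if '=' in value:
    key, _, val = value.partition('=')
    if not key:
      raise ValueError(f'Invalid option format: {value}')
    return key, val
  return value, True
```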
def shared_client_options(f: _AnyCallable | None = None, output_value: t.Literal['json', 'porcelain', 'pretty'] = 'pretty') -> t.Callable[[FC], FC]:
options = [
click.option('--endpoint', type=click.STRING, help='OpenLLM Server endpoint, e.g. http://localhost:3000', envvar='OPENLLM_ENDPOINT', default='http://localhost:3000',
@@ -770,20 +763,19 @@ def shared_client_options(f: _AnyCallable | None = None, output_value: t.Literal
output_option(default_value=output_value),
]
return compose(*options)(f) if f is not None else compose(*options)
@cli.command()
@click.argument('task', type=click.STRING, metavar='TASK')
@shared_client_options
@click.option('--agent', type=click.Choice(['hf']), default='hf', help='Whether to interact with Agents from the given server endpoint.', show_default=True)
@click.option('--remote', is_flag=True, default=False, help='Whether or not to use remote tools (inference endpoints) instead of local ones.', show_default=True)
@click.option(
'--opt',
help="Define prompt options. "
"(format: ``--opt text='I love this' --opt audio:./path/to/audio --opt image:/path/to/file``)",
required=False,
multiple=True,
callback=opt_callback,
metavar='ARG=VALUE[,ARG=VALUE]'
)
@click.option('--opt',
help="Define prompt options. "
"(format: ``--opt text='I love this' --opt audio:./path/to/audio --opt image:/path/to/file``)",
required=False,
multiple=True,
callback=opt_callback,
metavar='ARG=VALUE[,ARG=VALUE]')
def instruct_command(endpoint: str, timeout: int, agent: LiteralString, output: LiteralOutput, remote: bool, task: str, _memoized: DictStrAny, **attrs: t.Any) -> str:
'''Instruct agents interactively for given tasks, from a terminal.
@@ -795,66 +787,37 @@ def instruct_command(endpoint: str, timeout: int, agent: LiteralString, output:
```
'''
raise click.ClickException("'instruct' is currently disabled")
client = openllm.client.HTTPClient(endpoint, timeout=timeout)
# client = openllm.client.HTTPClient(endpoint, timeout=timeout)
#
# try:
# client.call('metadata')
# except http.client.BadStatusLine:
# raise click.ClickException(f'{endpoint} is neither an HTTP server nor reachable.') from None
# if agent == 'hf':
# _memoized = {k: v[0] for k, v in _memoized.items() if v}
# client._hf_agent.set_stream(logger.info)
# if output != 'porcelain': termui.echo(f"Sending the following prompt ('{task}') with the following vars: {_memoized}", fg='magenta')
# result = client.ask_agent(task, agent_type=agent, return_code=False, remote=remote, **_memoized)
# if output == 'json': termui.echo(orjson.dumps(result, option=orjson.OPT_INDENT_2).decode(), fg='white')
# else: termui.echo(result, fg='white')
# return result
# else:
# raise click.BadOptionUsage('agent', f'Unknown agent type {agent}')
try:
client.call('metadata')
except http.client.BadStatusLine:
raise click.ClickException(f'{endpoint} is neither an HTTP server nor reachable.') from None
if agent == 'hf':
_memoized = {k: v[0] for k, v in _memoized.items() if v}
client._hf_agent.set_stream(logger.info)
if output != 'porcelain': termui.echo(f"Sending the following prompt ('{task}') with the following vars: {_memoized}", fg='magenta')
result = client.ask_agent(task, agent_type=agent, return_code=False, remote=remote, **_memoized)
if output == 'json': termui.echo(orjson.dumps(result, option=orjson.OPT_INDENT_2).decode(), fg='white')
else: termui.echo(result, fg='white')
return result
else:
raise click.BadOptionUsage('agent', f'Unknown agent type {agent}')
@cli.command()
@shared_client_options(output_value='json')
@click.option('--server-type', type=click.Choice(['grpc', 'http']), help='Server type', default='http', show_default=True)
@click.argument('text', type=click.STRING, nargs=-1)
@machine_option
@click.pass_context
def embed_command(
ctx: click.Context, text: tuple[str, ...], endpoint: str, timeout: int, server_type: t.Literal['http', 'grpc'], output: LiteralOutput, machine: bool
) -> EmbeddingsOutput | None:
'''Get embeddings interactively, from a terminal.
\b
```bash
$ openllm embed --endpoint http://12.323.2.1:3000 "What is the meaning of life?" "How many stars are there in the sky?"
```
'''
client = openllm.client.HTTPClient(endpoint, timeout=timeout) if server_type == 'http' else openllm.client.GrpcClient(endpoint, timeout=timeout)
try:
gen_embed = client.embed(text)
except ValueError:
raise click.ClickException(f'Endpoint {endpoint} does not support embeddings.') from None
if machine: return gen_embed
elif output == 'pretty':
termui.echo('Generated embeddings: ', fg='magenta', nl=False)
termui.echo(gen_embed.embeddings, fg='white')
termui.echo('\nNumber of tokens: ', fg='magenta', nl=False)
termui.echo(gen_embed.num_tokens, fg='white')
elif output == 'json':
termui.echo(orjson.dumps(bentoml_cattr.unstructure(gen_embed), option=orjson.OPT_INDENT_2).decode(), fg='white')
else:
termui.echo(gen_embed.embeddings, fg='white')
ctx.exit(0)
@cli.command()
@shared_client_options(output_value='porcelain')
@click.option('--server-type', type=click.Choice(['grpc', 'http']), help='Server type', default='http', show_default=True)
@click.option('--stream/--no-stream', type=click.BOOL, is_flag=True, default=True, help='Whether to stream the response.')
@click.argument('prompt', type=click.STRING)
@click.option(
'--sampling-params', help='Define query options. (format: ``--sampling-params temperature=0.8 --sampling-params top_k=12``)', required=False, multiple=True, callback=opt_callback, metavar='ARG=VALUE[,ARG=VALUE]'
)
@click.option('--sampling-params',
help='Define query options. (format: ``--sampling-params temperature=0.8 --sampling-params top_k=12``)',
required=False,
multiple=True,
callback=opt_callback,
metavar='ARG=VALUE[,ARG=VALUE]')
@click.pass_context
def query_command(
ctx: click.Context, /, prompt: str, endpoint: str, timeout: int, stream: bool, server_type: t.Literal['http', 'grpc'], output: LiteralOutput, _memoized: DictStrAny, **attrs: t.Any
) -> None:
def query_command(ctx: click.Context, /, prompt: str, endpoint: str, timeout: int, stream: bool, server_type: t.Literal['http', 'grpc'], output: LiteralOutput, _memoized: DictStrAny,
**attrs: t.Any) -> None:
'''Ask an LLM interactively, from a terminal.
\b
@@ -870,24 +833,32 @@ def query_command(
if output != 'porcelain':
termui.echo('==Input==\n', fg='white')
termui.echo(f'{prompt}', fg=input_fg)
fn = client.generate_stream if stream else client.generate
res = fn(prompt, **{**client._config(), **_memoized})
if output == 'pretty':
termui.echo('\n\n==Responses==\n', fg='white')
if stream:
for it in res: termui.echo(it.text, fg=generated_fg, nl=False)
else: termui.echo(res.responses[0], fg=generated_fg)
elif output == 'json':
if stream:
for it in res: termui.echo(orjson.dumps(bentoml_cattr.unstructure(it), option=orjson.OPT_INDENT_2).decode(), fg='white')
else: termui.echo(orjson.dumps(bentoml_cattr.unstructure(res), option=orjson.OPT_INDENT_2).decode(), fg='white')
else: # noqa: PLR5501
if stream:
for it in res: termui.echo(it.text, fg=generated_fg, nl=False)
else: termui.echo(res.responses, fg='white')
if stream:
stream_res: t.Iterator[StreamResponse] = client.generate_stream(prompt, **{**client._config(), **_memoized})
if output == 'pretty':
termui.echo('\n\n==Responses==\n', fg='white')
for it in stream_res:
termui.echo(it.text, fg=generated_fg, nl=False)
elif output == 'json':
for it in stream_res:
termui.echo(orjson.dumps(bentoml_cattr.unstructure(it), option=orjson.OPT_INDENT_2).decode(), fg='white')
else:
for it in stream_res:
termui.echo(it.text, fg=generated_fg, nl=False)
else:
res: Response = client.generate(prompt, **{**client._config(), **_memoized})
if output == 'pretty':
termui.echo('\n\n==Responses==\n', fg='white')
termui.echo(res.responses[0], fg=generated_fg)
elif output == 'json':
termui.echo(orjson.dumps(bentoml_cattr.unstructure(res), option=orjson.OPT_INDENT_2).decode(), fg='white')
else:
termui.echo(res.responses, fg='white')
ctx.exit(0)
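Both branches above merge server-side defaults with CLI overrides via `{**client._config(), **_memoized}`. A small sketch of that idiom with stand-in values:

```python
defaults = {'temperature': 0.2, 'top_k': 40}  # stand-in for client._config()
overrides = {'temperature': 0.8}              # stand-in for _memoized
merged = {**defaults, **overrides}            # later keys win
assert merged == {'temperature': 0.8, 'top_k': 40}
```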
@cli.group(cls=Extensions, hidden=True, name='extension')
def extension_command() -> None:
'''Extension for OpenLLM CLI.'''
if __name__ == '__main__': cli()