feat: continuous batching with vLLM (#349)

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com> * feat: continuous batching Signed-off-by: paperspace <29749331+aarnphm@users.noreply.github.com> * chore: add changelog Signed-off-by: paperspace <29749331+aarnphm@users.noreply.github.com> * chore: add one shot generation Signed-off-by: paperspace <29749331+aarnphm@users.noreply.github.com> --------- Signed-off-by: paperspace <29749331+aarnphm@users.noreply.github.com> Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
2026-03-08 18:20:39 -04:00 · 2023-09-14 03:09:36 -04:00
parent e35e143093
commit ad9107958d
22 changed files with 336 additions and 232 deletions
--- a/openllm-python/src/openllm/cli/_factory.py
+++ b/openllm-python/src/openllm/cli/_factory.py
@@ -112,15 +112,7 @@ Available official model_id(s): [default: {llm_config['default_id']}]

 \b
 {orjson.dumps(llm_config['model_ids'], option=orjson.OPT_INDENT_2).decode()}
-''',
-                                   )
-
-  if llm_config['requires_gpu'] and openllm.utils.device_count() < 1:
-    # NOTE: The model requires GPU, therefore we will return a dummy command
-    command_attrs.update({
-        'short_help': '(Disabled because there is no GPU available)', 'help': f'{model} is currently not available to run on your local machine because it requires GPU for inference.'
-    })
-    return noop_command(group, llm_config, _serve_grpc, **command_attrs)
+''')

  @group.command(**command_attrs)
  @start_decorator(llm_config, serve_grpc=_serve_grpc)
@@ -230,19 +222,6 @@ Available official model_id(s): [default: {llm_config['default_id']}]

  return start_cmd

-def noop_command(group: click.Group, llm_config: LLMConfig, _serve_grpc: bool, **command_attrs: t.Any) -> click.Command:
-  context_settings = command_attrs.pop('context_settings', {})
-  context_settings.update({'ignore_unknown_options': True, 'allow_extra_args': True})
-  command_attrs['context_settings'] = context_settings
-  # NOTE: The model requires GPU, therefore we will return a dummy command
-  @group.command(**command_attrs)
-  def noop(**_: t.Any) -> LLMConfig:
-    termui.echo('No GPU available, therefore this command is disabled', fg='red')
-    openllm.utils.analytics.track_start_init(llm_config)
-    return llm_config
-
-  return noop
-
 def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callable[[FC], t.Callable[[FC], FC]]:
  def wrapper(fn: FC) -> t.Callable[[FC], FC]:
    composed = openllm.utils.compose(