mirror of
https://github.com/bentoml/OpenLLM.git
synced 2026-04-25 09:32:37 -04:00
feat: continuous batching with vLLM (#349)
Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com> * feat: continuous batching Signed-off-by: paperspace <29749331+aarnphm@users.noreply.github.com> * chore: add changeloe Signed-off-by: paperspace <29749331+aarnphm@users.noreply.github.com> * chore: add one shot generation Signed-off-by: paperspace <29749331+aarnphm@users.noreply.github.com> --------- Signed-off-by: paperspace <29749331+aarnphm@users.noreply.github.com> Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
This commit is contained in:
@@ -112,15 +112,7 @@ Available official model_id(s): [default: {llm_config['default_id']}]
|
||||
|
||||
\b
|
||||
{orjson.dumps(llm_config['model_ids'], option=orjson.OPT_INDENT_2).decode()}
|
||||
''',
|
||||
)
|
||||
|
||||
if llm_config['requires_gpu'] and openllm.utils.device_count() < 1:
|
||||
# NOTE: The model requires GPU, therefore we will return a dummy command
|
||||
command_attrs.update({
|
||||
'short_help': '(Disabled because there is no GPU available)', 'help': f'{model} is currently not available to run on your local machine because it requires GPU for inference.'
|
||||
})
|
||||
return noop_command(group, llm_config, _serve_grpc, **command_attrs)
|
||||
''')
|
||||
|
||||
@group.command(**command_attrs)
|
||||
@start_decorator(llm_config, serve_grpc=_serve_grpc)
|
||||
@@ -230,19 +222,6 @@ Available official model_id(s): [default: {llm_config['default_id']}]
|
||||
|
||||
return start_cmd
|
||||
|
||||
def noop_command(group: click.Group, llm_config: LLMConfig, _serve_grpc: bool, **command_attrs: t.Any) -> click.Command:
    """Register a placeholder command on ``group`` for a model that needs a GPU.

    The registered command does no real work: when invoked it prints a
    "No GPU available" notice, records the start event through
    ``openllm.utils.analytics.track_start_init`` and returns ``llm_config``.

    Args:
        group: The click group the placeholder command is attached to.
        llm_config: Configuration passed to analytics and returned by the
            placeholder command.
        _serve_grpc: Not used inside this function.
        **command_attrs: Keyword arguments forwarded to ``group.command``.
            A ``context_settings`` entry, if supplied, is updated in place.

    Returns:
        The command object produced by ``group.command``.
    """
    # Force the two context flags on so the placeholder is registered with
    # ``ignore_unknown_options`` and ``allow_extra_args`` enabled, whatever
    # the caller supplied.
    settings = command_attrs.pop('context_settings', {})
    settings['ignore_unknown_options'] = True
    settings['allow_extra_args'] = True
    command_attrs['context_settings'] = settings

    # NOTE: The model requires GPU, therefore we will return a dummy command.
    # The inner function keeps the name ``noop`` because click may derive the
    # command name from it when ``name`` is not given in ``command_attrs``.
    def noop(**_: t.Any) -> LLMConfig:
        termui.echo('No GPU available, therefore this command is disabled', fg='red')
        openllm.utils.analytics.track_start_init(llm_config)
        return llm_config

    register = group.command(**command_attrs)
    return register(noop)
|
||||
|
||||
def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callable[[FC], t.Callable[[FC], FC]]:
|
||||
def wrapper(fn: FC) -> t.Callable[[FC], FC]:
|
||||
composed = openllm.utils.compose(
|
||||
|
||||
@@ -651,8 +651,6 @@ def models_command(ctx: click.Context, output: LiteralOutput, show_available: bo
|
||||
json_data[m] = {
|
||||
'architecture': config['architecture'],
|
||||
'model_id': config['model_ids'],
|
||||
'cpu': not config['requires_gpu'],
|
||||
'gpu': True,
|
||||
'backend': backend,
|
||||
'installation': f'"openllm[{m}]"' if m in OPTIONAL_DEPENDENCIES or config['requirements'] else 'openllm',
|
||||
}
|
||||
@@ -680,13 +678,11 @@ def models_command(ctx: click.Context, output: LiteralOutput, show_available: bo
|
||||
import tabulate
|
||||
|
||||
tabulate.PRESERVE_WHITESPACE = True
|
||||
# llm, architecture, url, model_id, installation, cpu, gpu, backend
|
||||
data: list[str | tuple[str, str, list[str], str, LiteralString, LiteralString, tuple[LiteralBackend, ...]]] = []
|
||||
# llm, architecture, url, model_id, installation, backend
|
||||
data: list[str | tuple[str, str, list[str], str, tuple[LiteralBackend, ...]]] = []
|
||||
for m, v in json_data.items():
|
||||
data.extend([(m, v['architecture'], v['model_id'], v['installation'], '❌' if not v['cpu'] else '✅', '✅', v['backend'],)])
|
||||
column_widths = [
|
||||
int(termui.COLUMNS / 12), int(termui.COLUMNS / 6), int(termui.COLUMNS / 4), int(termui.COLUMNS / 12), int(termui.COLUMNS / 12), int(termui.COLUMNS / 12), int(termui.COLUMNS / 4),
|
||||
]
|
||||
data.extend([(m, v['architecture'], v['model_id'], v['installation'], v['backend'])])
|
||||
column_widths = [int(termui.COLUMNS / 12), int(termui.COLUMNS / 6), int(termui.COLUMNS / 4), int(termui.COLUMNS / 6), int(termui.COLUMNS / 4)]
|
||||
|
||||
if len(data) == 0 and len(failed_initialized) > 0:
|
||||
termui.echo('Exception found while parsing models:\n', fg='yellow')
|
||||
@@ -868,6 +864,7 @@ def query_command(
|
||||
res = client.query(prompt, return_response='raw', **{**client.configuration, **_memoized})
|
||||
if output == 'pretty':
|
||||
response = client.config.postprocess_generate(prompt, res['responses'])
|
||||
if isinstance(response, dict) and 'text' in response: response = response['text']
|
||||
termui.echo('\n\n==Responses==\n', fg='white')
|
||||
termui.echo(response, fg=generated_fg)
|
||||
elif output == 'json':
|
||||
|
||||
Reference in New Issue
Block a user