feat(engine): CTranslate2 (#698)

* chore: update instruction for dependencies

  Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

* feat(experimental): CTranslate2

  Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

---------

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
2026-03-06 08:08:03 -05:00 · 2023-11-19 10:25:08 -05:00
parent 539f250c0f
commit 816c1ee80e
31 changed files with 945 additions and 350 deletions
--- a/openllm-python/src/openllm_cli/entrypoint.py
+++ b/openllm-python/src/openllm_cli/entrypoint.py
@@ -450,7 +450,7 @@ def start_command(

  import torch

-  if not torch.cuda.is_available():
+  if backend == 'pt' and not torch.cuda.is_available():
    if dtype == 'auto':
      dtype = 'float'
    elif dtype not in {'float', 'float32'} and not get_disable_warnings() and not get_quiet_mode():
@@ -465,7 +465,7 @@ def start_command(
    adapter_map=adapter_map,
    quantize=quantize,
    serialisation=serialisation,
-    torch_dtype=dtype,
+    dtype=dtype,
  )
  backend_warning(llm.__llm_backend__)

@@ -580,7 +580,7 @@ def start_grpc_command(

  import torch

-  if not torch.cuda.is_available():
+  if backend == 'pt' and not torch.cuda.is_available():
    if dtype == 'auto':
      dtype = 'float'
    elif dtype not in {'float', 'float32'} and not get_disable_warnings() and not get_quiet_mode():
@@ -595,7 +595,7 @@ def start_grpc_command(
    adapter_map=adapter_map,
    quantize=quantize,
    serialisation=serialisation,
-    torch_dtype=dtype,
+    dtype=dtype,
    trust_remote_code=check_bool_env('TRUST_REMOTE_CODE'),
  )
  backend_warning(llm.__llm_backend__)
@@ -661,14 +661,14 @@ def process_environ(
      'BENTOML_HOME': os.environ.get('BENTOML_HOME', BentoMLContainer.bentoml_home.get()),
      'OPENLLM_ADAPTER_MAP': orjson.dumps(adapter_map).decode(),
      'OPENLLM_SERIALIZATION': serialisation,
-      'OPENLLM_BACKEND': llm.__llm_backend__,
      'OPENLLM_CONFIG': config.model_dump_json(flatten=True).decode(),
-      'TORCH_DTYPE': str(llm._torch_dtype).split('.')[-1],
+      'BACKEND': llm.__llm_backend__,
+      'DTYPE': str(llm._torch_dtype).split('.')[-1],
      'TRUST_REMOTE_CODE': str(llm.trust_remote_code),
    }
  )
  if llm.quantise:
-    environ['OPENLLM_QUANTIZE'] = str(llm.quantise)
+    environ['QUANTIZE'] = str(llm.quantise)
  if system_message:
    environ['OPENLLM_SYSTEM_MESSAGE'] = system_message
  if prompt_template:
@@ -695,10 +695,11 @@ def process_workers_per_resource(wpr: str | float | int, device: tuple[str, ...]


 def build_bento_instruction(llm, model_id, serialisation, adapter_map):
-  cmd_name = f'openllm build {model_id}'
+  cmd_name = f'openllm build {model_id} --backend {llm.__llm_backend__}'
  if llm.quantise:
    cmd_name += f' --quantize {llm.quantise}'
-  cmd_name += f' --serialization {serialisation}'
+  if llm.__llm_backend__ in {'pt', 'vllm'}:
+    cmd_name += f' --serialization {serialisation}'
  if adapter_map is not None:
    cmd_name += ' ' + ' '.join(
      [
@@ -1042,7 +1043,7 @@ def build_command(
    system_message=system_message,
    backend=backend,
    quantize=quantize,
-    torch_dtype=dtype,
+    dtype=dtype,
    serialisation=first_not_none(
      serialisation, default='safetensors' if has_safetensors_weights(model_id, model_version) else 'legacy'
    ),