chore(vllm): add arguments for gpu memory utilization

Probably not going to fix anything, just delaying the problem.

Signed-off-by: Aaron Pham <29749331+aarnphm@users.noreply.github.com>
This commit is contained in:
Aaron Pham
2023-11-29 06:45:14 +00:00
parent f0fa06004b
commit 9706228956
6 changed files with 40 additions and 5 deletions

View File

@@ -123,6 +123,7 @@ class LLM(t.Generic[M, T]):
_quantise: t.Optional[LiteralQuantise]; _model_decls: t.Tuple[t.Any, ...]; __model_attrs: t.Dict[str, t.Any] #
__tokenizer_attrs: t.Dict[str, t.Any]; _tag: bentoml.Tag; _adapter_map: t.Optional[AdapterMap] #
_serialisation: LiteralSerialisation; _local: bool; _max_model_len: t.Optional[int] #
_gpu_memory_utilization: float
__llm_dtype__: t.Union[LiteralDtype, t.Literal['auto', 'half', 'float']] = 'auto'
__llm_torch_dtype__: 'torch.dtype' = None
@@ -152,6 +153,7 @@ class LLM(t.Generic[M, T]):
dtype='auto',
low_cpu_mem_usage=True,
max_model_len=None,
gpu_memory_utilization=0.9,
_eager=True,
**attrs,
):
@@ -182,7 +184,8 @@ class LLM(t.Generic[M, T]):
adapter_map=convert_peft_config_type(adapter_map) if adapter_map is not None else None,
serialisation=serialisation,
local=_local,
max_model_len=max_model_len,
max_model_len=getenv('max_model_len', default=max_model_len),
gpu_memory_utilization=getenv('gpu_memory_utilization', default=gpu_memory_utilization),
LLM__model_attrs=model_attrs,
LLM__tokenizer_attrs=tokenizer_attrs,
llm_dtype__=dtype.lower(),

View File

@@ -109,7 +109,7 @@ class vLLMRunnable(bentoml.Runnable):
tokenizer_mode='auto', tensor_parallel_size=num_gpus, #
model=llm.bentomodel.path, tokenizer=llm.bentomodel.path, #
trust_remote_code=llm.trust_remote_code, dtype=llm._torch_dtype, #
max_model_len=llm._max_model_len,
max_model_len=llm._max_model_len, gpu_memory_utilization=llm._gpu_memory_utilization, #
quantization=llm.quantise if llm.quantise and llm.quantise in {'awq', 'squeezellm'} else None,
)
)

View File

@@ -8,7 +8,8 @@ from bentoml.io import JSON, Text
logger = logging.getLogger(__name__)
llm = openllm.LLM[t.Any, t.Any](
model_id=svars.model_id, model_tag=svars.model_tag, adapter_map=svars.adapter_map, #
serialisation=svars.serialization, trust_remote_code=svars.trust_remote_code,
serialisation=svars.serialization, trust_remote_code=svars.trust_remote_code, #
max_model_len=svars.max_model_len, gpu_memory_utilization=svars.gpu_memory_utilization, #
)
svc = bentoml.Service(name=f"llm-{llm.config['start_name']}-service", runners=[llm.runner])
llm_model_class = openllm.GenerationInput.from_llm_config(llm.config)

View File

@@ -1,2 +1,3 @@
import os, orjson, openllm_core.utils as coreutils
# Service variables resolved from the environment at import time.
# OPENLLM_MODEL_ID is required (KeyError if unset); model_tag is always None here;
# OPENLLM_ADAPTER_MAP is a JSON-encoded value defaulting to null -> None;
# OPENLLM_SERIALIZATION defaults to 'safetensors'; TRUST_REMOTE_CODE is a boolean env flag.
model_id, model_tag, adapter_map, serialization, trust_remote_code = os.environ['OPENLLM_MODEL_ID'], None, orjson.loads(os.getenv('OPENLLM_ADAPTER_MAP', orjson.dumps(None))), os.getenv('OPENLLM_SERIALIZATION', default='safetensors'), coreutils.check_bool_env('TRUST_REMOTE_CODE', False)
# MAX_MODEL_LEN is JSON-decoded with a default of null -> None; GPU_MEMORY_UTILIZATION
# JSON-decodes to a float, defaulting to 0.9 (a 0-1 fraction of GPU memory).
# NOTE(review): the default above uses orjson.dumps(None) without .decode() (bytes),
# while this line decodes — orjson.loads accepts both, but the style is inconsistent.
max_model_len, gpu_memory_utilization = orjson.loads(os.getenv('MAX_MODEL_LEN', orjson.dumps(None).decode())), orjson.loads(os.getenv('GPU_MEMORY_UTILIZATION', orjson.dumps(0.9).decode()))

View File

@@ -10,7 +10,7 @@ logger = logging.getLogger(__name__)
OPENLLM_DEV_BUILD = 'OPENLLM_DEV_BUILD'
_service_file = pathlib.Path(os.path.abspath(__file__)).parent.parent / '_service.py'
_SERVICE_VARS = '''import orjson;model_id,model_tag,adapter_map,serialization,trust_remote_code='{__model_id__}','{__model_tag__}',orjson.loads("""{__model_adapter_map__}"""),'{__model_serialization__}',{__model_trust_remote_code__}'''
_SERVICE_VARS = '''import orjson;model_id,model_tag,adapter_map,serialization,trust_remote_code,max_model_len,gpu_memory_utilization='{__model_id__}','{__model_tag__}',orjson.loads("""{__model_adapter_map__}"""),'{__model_serialization__}',{__model_trust_remote_code__},{__max_model_len__},{__gpu_memory_utilization__}'''
def build_editable(path, package='openllm'):
if not check_bool_env(OPENLLM_DEV_BUILD, default=False): return None
@@ -74,7 +74,7 @@ def create_bento(
script = f"# fmt: off\n# GENERATED BY 'openllm build {llm.model_id}'. DO NOT EDIT\n" + _SERVICE_VARS.format(
__model_id__=llm.model_id, __model_tag__=str(llm.tag), #
__model_adapter_map__=orjson.dumps(adapter_map).decode(), __model_serialization__=llm.config['serialisation'], #
__model_trust_remote_code__=str(llm.trust_remote_code),
__model_trust_remote_code__=str(llm.trust_remote_code), __max_model_len__ = llm._max_model_len, __gpu_memory_utilization__=llm._gpu_memory_utilization, #
)
if SHOW_CODEGEN: logger.info('Generated _service_vars.py:\n%s', script)
llm_fs.writetext('_service_vars.py', script)

View File

@@ -343,9 +343,17 @@ def cli() -> None:
'--max-model-len',
'--max_model_len',
'max_model_len',
type=int,
default=None,
help='Maximum sequence length for the model. If not specified, we will use the default value from the model config.',
)
@click.option(
'--gpu-memory-utilization',
'--gpu_memory_utilization',
'gpu_memory_utilization',
default=0.9,
help='The percentage of GPU memory to be used for the model executor',
)
@start_decorator
def start_command(
model_id: str,
@@ -362,6 +370,7 @@ def start_command(
dtype: LiteralDtype,
deprecated_model_id: str | None,
max_model_len: int | None,
gpu_memory_utilization:float,
**attrs: t.Any,
) -> LLMConfig | subprocess.Popen[bytes]:
'''Start any LLM as a REST server.
@@ -412,6 +421,7 @@ def start_command(
serialisation=serialisation,
dtype=dtype,
max_model_len=max_model_len,
gpu_memory_utilization=gpu_memory_utilization,
trust_remote_code=check_bool_env('TRUST_REMOTE_CODE', False),
)
backend_warning(llm.__llm_backend__)
@@ -461,6 +471,8 @@ def process_environ(config, server_timeout, wpr, device, cors, model_id, adapter
'BACKEND': llm.__llm_backend__,
'DTYPE': str(llm._torch_dtype).split('.')[-1],
'TRUST_REMOTE_CODE': str(llm.trust_remote_code),
'MAX_MODEL_LEN': orjson.dumps(llm._max_model_len).decode(),
'GPU_MEMORY_UTILIZATION': orjson.dumps(llm._gpu_memory_utilization).decode(),
}
)
if llm.quantise: environ['QUANTIZE'] = str(llm.quantise)
@@ -752,6 +764,20 @@ class BuildBentoOutput(t.TypedDict):
help="Whether to push the result bento to BentoCloud. Make sure to login with 'bentoml cloud login' first.",
)
@click.option('--force-push', default=False, is_flag=True, type=click.BOOL, help='Whether to force push.')
@click.option(
'--max-model-len',
'--max_model_len',
'max_model_len',
default=None,
help='Maximum sequence length for the model. If not specified, we will use the default value from the model config.',
)
@click.option(
'--gpu-memory-utilization',
'--gpu_memory_utilization',
'gpu_memory_utilization',
default=0.9,
help='The percentage of GPU memory to be used for the model executor',
)
@machine_option
@click.pass_context
def build_command(
@@ -770,6 +796,8 @@ def build_command(
backend: LiteralBackend | None,
model_version: str | None,
dockerfile_template: t.TextIO | None,
max_model_len: int | None,
gpu_memory_utilization:float,
containerize: bool,
push: bool,
serialisation: LiteralSerialisation | None,
@@ -820,6 +848,8 @@ def build_command(
backend=backend,
quantize=quantize,
dtype=dtype,
max_model_len=max_model_len,
gpu_memory_utilization=gpu_memory_utilization,
serialisation=first_not_none(
serialisation, default='safetensors' if has_safetensors_weights(model_id, model_version) else 'legacy'
),