From 9706228956a075b372aafa55096c81d0424271a7 Mon Sep 17 00:00:00 2001
From: Aaron Pham <29749331+aarnphm@users.noreply.github.com>
Date: Wed, 29 Nov 2023 06:45:14 +0000
Subject: [PATCH] chore(vllm): add arguments for gpu memory utilization

Probably not going to fix anything, just delaying the problem.

Signed-off-by: Aaron Pham <29749331+aarnphm@users.noreply.github.com>
---
 openllm-python/src/openllm/_llm.py            |  5 +++-
 openllm-python/src/openllm/_runners.py        |  2 +-
 openllm-python/src/openllm/_service.py        |  3 +-
 openllm-python/src/openllm/_service_vars.py   |  1 +
 openllm-python/src/openllm/bundle/_package.py |  4 +--
 openllm-python/src/openllm_cli/entrypoint.py  | 30 +++++++++++++++++++
 6 files changed, 40 insertions(+), 5 deletions(-)

diff --git a/openllm-python/src/openllm/_llm.py b/openllm-python/src/openllm/_llm.py
index 2dbbc563..d8457885 100644
--- a/openllm-python/src/openllm/_llm.py
+++ b/openllm-python/src/openllm/_llm.py
@@ -123,6 +123,7 @@ class LLM(t.Generic[M, T]):
   _quantise: t.Optional[LiteralQuantise]; _model_decls: t.Tuple[t.Any, ...]; __model_attrs: t.Dict[str, t.Any]  #
   __tokenizer_attrs: t.Dict[str, t.Any]; _tag: bentoml.Tag; _adapter_map: t.Optional[AdapterMap]  #
   _serialisation: LiteralSerialisation; _local: bool; _max_model_len: t.Optional[int]  #
+  _gpu_memory_utilization: float

   __llm_dtype__: t.Union[LiteralDtype, t.Literal['auto', 'half', 'float']] = 'auto'
   __llm_torch_dtype__: 'torch.dtype' = None
@@ -152,6 +153,7 @@ class LLM(t.Generic[M, T]):
     dtype='auto',
     low_cpu_mem_usage=True,
     max_model_len=None,
+    gpu_memory_utilization=0.9,
     _eager=True,
     **attrs,
   ):
@@ -182,7 +184,8 @@ class LLM(t.Generic[M, T]):
       adapter_map=convert_peft_config_type(adapter_map) if adapter_map is not None else None,
       serialisation=serialisation,
       local=_local,
-      max_model_len=max_model_len,
+      max_model_len=getenv('max_model_len', default=max_model_len),
+      gpu_memory_utilization=getenv('gpu_memory_utilization', default=gpu_memory_utilization),
       LLM__model_attrs=model_attrs,
       LLM__tokenizer_attrs=tokenizer_attrs,
       llm_dtype__=dtype.lower(),
diff --git a/openllm-python/src/openllm/_runners.py b/openllm-python/src/openllm/_runners.py
index ba692b10..2b5de8f4 100644
--- a/openllm-python/src/openllm/_runners.py
+++ b/openllm-python/src/openllm/_runners.py
@@ -109,7 +109,7 @@ class vLLMRunnable(bentoml.Runnable):
         tokenizer_mode='auto', tensor_parallel_size=num_gpus,  #
         model=llm.bentomodel.path, tokenizer=llm.bentomodel.path,  #
         trust_remote_code=llm.trust_remote_code, dtype=llm._torch_dtype,  #
-        max_model_len=llm._max_model_len,
+        max_model_len=llm._max_model_len, gpu_memory_utilization=llm._gpu_memory_utilization,  #
         quantization=llm.quantise if llm.quantise and llm.quantise in {'awq', 'squeezellm'} else None,
       )
     )
diff --git a/openllm-python/src/openllm/_service.py b/openllm-python/src/openllm/_service.py
index 1ecea673..46c119d9 100644
--- a/openllm-python/src/openllm/_service.py
+++ b/openllm-python/src/openllm/_service.py
@@ -8,7 +8,8 @@ from bentoml.io import JSON, Text
 logger = logging.getLogger(__name__)
 llm = openllm.LLM[t.Any, t.Any](
   model_id=svars.model_id, model_tag=svars.model_tag, adapter_map=svars.adapter_map,  #
-  serialisation=svars.serialization, trust_remote_code=svars.trust_remote_code,
+  serialisation=svars.serialization, trust_remote_code=svars.trust_remote_code,  #
+  max_model_len=svars.max_model_len, gpu_memory_utilization=svars.gpu_memory_utilization,  #
 )
 svc = bentoml.Service(name=f"llm-{llm.config['start_name']}-service", runners=[llm.runner])
 llm_model_class = openllm.GenerationInput.from_llm_config(llm.config)
diff --git a/openllm-python/src/openllm/_service_vars.py b/openllm-python/src/openllm/_service_vars.py
index 9d6f5da4..efe8afd9 100644
--- a/openllm-python/src/openllm/_service_vars.py
+++ b/openllm-python/src/openllm/_service_vars.py
@@ -1,2 +1,3 @@
 import os, orjson, openllm_core.utils as coreutils
 model_id, model_tag, adapter_map, serialization, trust_remote_code = os.environ['OPENLLM_MODEL_ID'], None, orjson.loads(os.getenv('OPENLLM_ADAPTER_MAP', orjson.dumps(None))), os.getenv('OPENLLM_SERIALIZATION', default='safetensors'), coreutils.check_bool_env('TRUST_REMOTE_CODE', False)
+max_model_len, gpu_memory_utilization = orjson.loads(os.getenv('MAX_MODEL_LEN', orjson.dumps(None).decode())), orjson.loads(os.getenv('GPU_MEMORY_UTILIZATION', orjson.dumps(0.9).decode()))
diff --git a/openllm-python/src/openllm/bundle/_package.py b/openllm-python/src/openllm/bundle/_package.py
index 83458f67..b3ed98db 100644
--- a/openllm-python/src/openllm/bundle/_package.py
+++ b/openllm-python/src/openllm/bundle/_package.py
@@ -10,7 +10,7 @@ logger = logging.getLogger(__name__)
 OPENLLM_DEV_BUILD = 'OPENLLM_DEV_BUILD'
 _service_file = pathlib.Path(os.path.abspath(__file__)).parent.parent / '_service.py'

-_SERVICE_VARS = '''import orjson;model_id,model_tag,adapter_map,serialization,trust_remote_code='{__model_id__}','{__model_tag__}',orjson.loads("""{__model_adapter_map__}"""),'{__model_serialization__}',{__model_trust_remote_code__}'''
+_SERVICE_VARS = '''import orjson;model_id,model_tag,adapter_map,serialization,trust_remote_code,max_model_len,gpu_memory_utilization='{__model_id__}','{__model_tag__}',orjson.loads("""{__model_adapter_map__}"""),'{__model_serialization__}',{__model_trust_remote_code__},{__max_model_len__},{__gpu_memory_utilization__}'''

 def build_editable(path, package='openllm'):
   if not check_bool_env(OPENLLM_DEV_BUILD, default=False): return None
@@ -74,7 +74,7 @@ def create_bento(
   script = f"# fmt: off\n# GENERATED BY 'openllm build {llm.model_id}'. DO NOT EDIT\n" + _SERVICE_VARS.format(
     __model_id__=llm.model_id, __model_tag__=str(llm.tag),  #
     __model_adapter_map__=orjson.dumps(adapter_map).decode(), __model_serialization__=llm.config['serialisation'],  #
-    __model_trust_remote_code__=str(llm.trust_remote_code),
+    __model_trust_remote_code__=str(llm.trust_remote_code), __max_model_len__=llm._max_model_len, __gpu_memory_utilization__=llm._gpu_memory_utilization,  #
   )
   if SHOW_CODEGEN: logger.info('Generated _service_vars.py:\n%s', script)
   llm_fs.writetext('_service_vars.py', script)
diff --git a/openllm-python/src/openllm_cli/entrypoint.py b/openllm-python/src/openllm_cli/entrypoint.py
index 2b9cb25a..b93cec6f 100644
--- a/openllm-python/src/openllm_cli/entrypoint.py
+++ b/openllm-python/src/openllm_cli/entrypoint.py
@@ -343,9 +343,17 @@ def cli() -> None:
   '--max-model-len',
   '--max_model_len',
   'max_model_len',
+  type=int,
   default=None,
   help='Maximum sequence length for the model. If not specified, we will use the default value from the model config.',
 )
+@click.option(
+  '--gpu-memory-utilization',
+  '--gpu_memory_utilization',
+  'gpu_memory_utilization',
+  default=0.9,
+  help='The fraction (0 to 1) of GPU memory to be used for the model executor.',
+)
 @start_decorator
 def start_command(
   model_id: str,
@@ -362,6 +370,7 @@ def start_command(
   dtype: LiteralDtype,
   deprecated_model_id: str | None,
   max_model_len: int | None,
+  gpu_memory_utilization: float,
   **attrs: t.Any,
 ) -> LLMConfig | subprocess.Popen[bytes]:
   '''Start any LLM as a REST server.
@@ -412,6 +421,7 @@ def start_command(
     serialisation=serialisation,
     dtype=dtype,
     max_model_len=max_model_len,
+    gpu_memory_utilization=gpu_memory_utilization,
     trust_remote_code=check_bool_env('TRUST_REMOTE_CODE', False),
   )
   backend_warning(llm.__llm_backend__)
@@ -461,6 +471,8 @@ def process_environ(config, server_timeout, wpr, device, cors, model_id, adapter
       'BACKEND': llm.__llm_backend__,
       'DTYPE': str(llm._torch_dtype).split('.')[-1],
       'TRUST_REMOTE_CODE': str(llm.trust_remote_code),
+      'MAX_MODEL_LEN': orjson.dumps(llm._max_model_len).decode(),
+      'GPU_MEMORY_UTILIZATION': orjson.dumps(llm._gpu_memory_utilization).decode(),
     }
   )
   if llm.quantise: environ['QUANTIZE'] = str(llm.quantise)
@@ -752,6 +764,20 @@ class BuildBentoOutput(t.TypedDict):
   help="Whether to push the result bento to BentoCloud. Make sure to login with 'bentoml cloud login' first.",
 )
 @click.option('--force-push', default=False, is_flag=True, type=click.BOOL, help='Whether to force push.')
+@click.option(
+  '--max-model-len',
+  '--max_model_len',
+  'max_model_len',
+  default=None,
+  help='Maximum sequence length for the model. If not specified, we will use the default value from the model config.',
+)
+@click.option(
+  '--gpu-memory-utilization',
+  '--gpu_memory_utilization',
+  'gpu_memory_utilization',
+  default=0.9,
+  help='The fraction (0 to 1) of GPU memory to be used for the model executor.',
+)
 @machine_option
 @click.pass_context
 def build_command(
@@ -770,6 +796,8 @@ def build_command(
   backend: LiteralBackend | None,
   model_version: str | None,
   dockerfile_template: t.TextIO | None,
+  max_model_len: int | None,
+  gpu_memory_utilization: float,
   containerize: bool,
   push: bool,
   serialisation: LiteralSerialisation | None,
@@ -820,6 +848,8 @@ def build_command(
     backend=backend,
     quantize=quantize,
     dtype=dtype,
+    max_model_len=max_model_len,
+    gpu_memory_utilization=gpu_memory_utilization,
     serialisation=first_not_none(
       serialisation, default='safetensors' if has_safetensors_weights(model_id, model_version) else 'legacy'
     ),
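
Usage sketch: the new value can be set programmatically as well as through the --gpu-memory-utilization / --max-model-len flags added to openllm start and openllm build above, and openllm start forwards it to the generated service through the GPU_MEMORY_UTILIZATION / MAX_MODEL_LEN environment variables that _service_vars.py reads. The snippet below is a minimal example, assuming the keyword arguments are accepted exactly as declared in the _llm.py hunk; the model id is a placeholder.

    import openllm

    # Let vLLM use up to 85% of GPU memory for weights and KV cache, and cap the
    # context window at 4096 tokens (defaults: 0.9 and the model-config value).
    llm = openllm.LLM(
      model_id='mistralai/Mistral-7B-Instruct-v0.1',  # placeholder model id
      gpu_memory_utilization=0.85,
      max_model_len=4096,
    )

The equivalent CLI invocation would be: openllm start mistralai/Mistral-7B-Instruct-v0.1 --gpu-memory-utilization 0.85 --max-model-len 4096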