diff --git a/openllm-python/src/_openllm_tiny/_entrypoint.py b/openllm-python/src/_openllm_tiny/_entrypoint.py index 51b7748d..38fac992 100644 --- a/openllm-python/src/_openllm_tiny/_entrypoint.py +++ b/openllm-python/src/_openllm_tiny/_entrypoint.py @@ -46,8 +46,8 @@ quantise=coreutils.getenv('quantize',default='{__model_quantise__}',var=['QUANTI serialisation=coreutils.getenv('serialization',default='{__model_serialization__}',var=['SERIALISATION']) dtype=coreutils.getenv('dtype', default='{__model_dtype__}', var=['TORCH_DTYPE']) trust_remote_code=coreutils.check_bool_env("TRUST_REMOTE_CODE",{__model_trust_remote_code__}) -max_model_len={__max_model_len__} -gpu_memory_utilization={__gpu_memory_utilization__} +max_model_len=orjson.loads(coreutils.getenv('max_model_len', default=orjson.dumps({__max_model_len__}))) +gpu_memory_utilization=orjson.loads(coreutils.getenv('gpu_memory_utilization', default=orjson.dumps({__gpu_memory_utilization__}), var=['GPU_MEMORY_UTILISATION'])) services_config=orjson.loads(coreutils.getenv('services_config',"""{__services_config__}""")) ''' _DOCKERFILE_TEMPLATE = """\ diff --git a/openllm-python/src/_openllm_tiny/_llm.py b/openllm-python/src/_openllm_tiny/_llm.py index b6742f8b..3b4ad873 100644 --- a/openllm-python/src/_openllm_tiny/_llm.py +++ b/openllm-python/src/_openllm_tiny/_llm.py @@ -65,7 +65,6 @@ class LLM: quantise = self.quantise if self.quantise and self.quantise in {'gptq', 'awq', 'squeezellm'} else None dtype = 'float16' if quantise == 'gptq' else self.dtype # NOTE: quantise GPTQ doesn't support bfloat16 yet. - self.engine_args.setdefault('gpu_memory_utilization', 0.9) self.engine_args.update({ 'worker_use_ray': False, 'engine_use_ray': False, @@ -81,6 +80,8 @@ class LLM: self.engine_args['disable_log_stats'] = not get_debug_mode() if 'disable_log_requests' not in self.engine_args: self.engine_args['disable_log_requests'] = not get_debug_mode() + if 'gpu_memory_utilization' not in self.engine_args: + self.engine_args['gpu_memory_utilization'] = 0.9 try: from vllm import AsyncEngineArgs, AsyncLLMEngine