diff --git a/openllm-python/src/_openllm_tiny/_entrypoint.py b/openllm-python/src/_openllm_tiny/_entrypoint.py
index 51b7748d..38fac992 100644
--- a/openllm-python/src/_openllm_tiny/_entrypoint.py
+++ b/openllm-python/src/_openllm_tiny/_entrypoint.py
@@ -46,8 +46,8 @@ quantise=coreutils.getenv('quantize',default='{__model_quantise__}',var=['QUANTI
 serialisation=coreutils.getenv('serialization',default='{__model_serialization__}',var=['SERIALISATION'])
 dtype=coreutils.getenv('dtype', default='{__model_dtype__}', var=['TORCH_DTYPE'])
 trust_remote_code=coreutils.check_bool_env("TRUST_REMOTE_CODE",{__model_trust_remote_code__})
-max_model_len={__max_model_len__}
-gpu_memory_utilization={__gpu_memory_utilization__}
+max_model_len=orjson.loads(coreutils.getenv('max_model_len', default=orjson.dumps({__max_model_len__})))
+gpu_memory_utilization=orjson.loads(coreutils.getenv('gpu_memory_utilization', default=orjson.dumps({__gpu_memory_utilization__}), var=['GPU_MEMORY_UTILISATION']))
 services_config=orjson.loads(coreutils.getenv('services_config',"""{__services_config__}"""))
 '''
 _DOCKERFILE_TEMPLATE = """\
diff --git a/openllm-python/src/_openllm_tiny/_llm.py b/openllm-python/src/_openllm_tiny/_llm.py
index b6742f8b..3b4ad873 100644
--- a/openllm-python/src/_openllm_tiny/_llm.py
+++ b/openllm-python/src/_openllm_tiny/_llm.py
@@ -65,7 +65,6 @@ class LLM:
       quantise = self.quantise if self.quantise and self.quantise in {'gptq', 'awq', 'squeezellm'} else None
       dtype = 'float16' if quantise == 'gptq' else self.dtype  # NOTE: quantise GPTQ doesn't support bfloat16 yet.
 
-      self.engine_args.setdefault('gpu_memory_utilization', 0.9)
       self.engine_args.update({
         'worker_use_ray': False,
         'engine_use_ray': False,
@@ -81,6 +80,8 @@ class LLM:
         self.engine_args['disable_log_stats'] = not get_debug_mode()
       if 'disable_log_requests' not in self.engine_args:
         self.engine_args['disable_log_requests'] = not get_debug_mode()
+      if 'gpu_memory_utilization' not in self.engine_args:
+        self.engine_args['gpu_memory_utilization'] = 0.9
 
       try:
         from vllm import AsyncEngineArgs, AsyncLLMEngine