chore(codegen): update generated var to read from envvar

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
Aaron
2024-03-20 21:51:39 -04:00
parent 12ac99867f
commit 295a3b1061
2 changed files with 4 additions and 3 deletions


@@ -46,8 +46,8 @@ quantise=coreutils.getenv('quantize',default='{__model_quantise__}',var=['QUANTI
 serialisation=coreutils.getenv('serialization',default='{__model_serialization__}',var=['SERIALISATION'])
 dtype=coreutils.getenv('dtype', default='{__model_dtype__}', var=['TORCH_DTYPE'])
 trust_remote_code=coreutils.check_bool_env("TRUST_REMOTE_CODE",{__model_trust_remote_code__})
-max_model_len={__max_model_len__}
-gpu_memory_utilization={__gpu_memory_utilization__}
+max_model_len=orjson.loads(coreutils.getenv('max_model_len', default=orjson.dumps({__max_model_len__})))
+gpu_memory_utilization=orjson.loads(coreutils.getenv('gpu_memory_utilization', default=orjson.dumps({__gpu_memory_utilization__}), var=['GPU_MEMORY_UTILISATION']))
 services_config=orjson.loads(coreutils.getenv('services_config',"""{__services_config__}"""))
 '''
 _DOCKERFILE_TEMPLATE = """\
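The new template lines above round-trip the default through orjson, so a value injected via the environment and the baked-in default are parsed identically and keep their numeric types. A minimal sketch of that pattern, using plain os.environ in place of the coreutils.getenv helper and 0.9 as a placeholder for {__gpu_memory_utilization__} (GPU_MEMORY_UTILISATION is the variable named in the template; everything else here is illustrative):

import os
import orjson

# Default is serialised once with orjson.dumps, mirroring the generated line.
default = orjson.dumps(0.9)  # b'0.9'
# An exported GPU_MEMORY_UTILISATION value (e.g. '0.85') takes precedence.
raw = os.environ.get('GPU_MEMORY_UTILISATION', default)
gpu_memory_utilization = orjson.loads(raw)  # parsed to a float in both cases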


@@ -65,7 +65,6 @@ class LLM:
 quantise = self.quantise if self.quantise and self.quantise in {'gptq', 'awq', 'squeezellm'} else None
 dtype = 'float16' if quantise == 'gptq' else self.dtype  # NOTE: quantise GPTQ doesn't support bfloat16 yet.
-self.engine_args.setdefault('gpu_memory_utilization', 0.9)
 self.engine_args.update({
   'worker_use_ray': False,
   'engine_use_ray': False,
@@ -81,6 +80,8 @@ class LLM:
   self.engine_args['disable_log_stats'] = not get_debug_mode()
 if 'disable_log_requests' not in self.engine_args:
   self.engine_args['disable_log_requests'] = not get_debug_mode()
+if 'gpu_memory_utilization' not in self.engine_args:
+  self.engine_args['gpu_memory_utilization'] = 0.9
 try:
   from vllm import AsyncEngineArgs, AsyncLLMEngine
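With this change the 0.9 fallback for gpu_memory_utilization sits alongside the other conditional defaults and is applied only when no value was supplied, so an explicitly provided setting is preserved. A standalone sketch of that guard (illustrative values, with a plain dict standing in for self.engine_args):

engine_args = {'max_model_len': 4096}  # no gpu_memory_utilization provided
if 'gpu_memory_utilization' not in engine_args:
    engine_args['gpu_memory_utilization'] = 0.9
assert engine_args['gpu_memory_utilization'] == 0.9

engine_args = {'gpu_memory_utilization': 0.85}  # explicit value is preserved
if 'gpu_memory_utilization' not in engine_args:
    engine_args['gpu_memory_utilization'] = 0.9
assert engine_args['gpu_memory_utilization'] == 0.85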