chore(codegen): update generated var to read from envvar

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
Aaron
2024-03-20 21:51:39 -04:00
parent 12ac99867f
commit 295a3b1061
2 changed files with 4 additions and 3 deletions


@@ -46,8 +46,8 @@ quantise=coreutils.getenv('quantize',default='{__model_quantise__}',var=['QUANTI
 serialisation=coreutils.getenv('serialization',default='{__model_serialization__}',var=['SERIALISATION'])
 dtype=coreutils.getenv('dtype', default='{__model_dtype__}', var=['TORCH_DTYPE'])
 trust_remote_code=coreutils.check_bool_env("TRUST_REMOTE_CODE",{__model_trust_remote_code__})
-max_model_len={__max_model_len__}
-gpu_memory_utilization={__gpu_memory_utilization__}
+max_model_len=orjson.loads(coreutils.getenv('max_model_len', default=orjson.dumps({__max_model_len__})))
+gpu_memory_utilization=orjson.loads(coreutils.getenv('gpu_memory_utilization', default=orjson.dumps({__gpu_memory_utilization__}), var=['GPU_MEMORY_UTILISATION']))
 services_config=orjson.loads(coreutils.getenv('services_config',"""{__services_config__}"""))
 '''
 _DOCKERFILE_TEMPLATE = """\
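The new template lines above round-trip the default through orjson, so a value injected via the environment and the baked-in default are parsed identically and keep their numeric types. A minimal sketch of that pattern, using plain os.environ in place of the coreutils.getenv helper and 0.9 as a placeholder for {__gpu_memory_utilization__} (GPU_MEMORY_UTILISATION is the variable named in the template; everything else here is illustrative):

import os
import orjson

# Default is serialised once with orjson.dumps, mirroring the generated line.
default = orjson.dumps(0.9)  # b'0.9'
# An exported GPU_MEMORY_UTILISATION value (e.g. '0.85') takes precedence.
raw = os.environ.get('GPU_MEMORY_UTILISATION', default)
gpu_memory_utilization = orjson.loads(raw)  # parsed to a float in both cases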


@@ -65,7 +65,6 @@ class LLM:
 quantise = self.quantise if self.quantise and self.quantise in {'gptq', 'awq', 'squeezellm'} else None
 dtype = 'float16' if quantise == 'gptq' else self.dtype  # NOTE: quantise GPTQ doesn't support bfloat16 yet.
-self.engine_args.setdefault('gpu_memory_utilization', 0.9)
 self.engine_args.update({
   'worker_use_ray': False,
   'engine_use_ray': False,
@@ -81,6 +80,8 @@ class LLM:
   self.engine_args['disable_log_stats'] = not get_debug_mode()
 if 'disable_log_requests' not in self.engine_args:
   self.engine_args['disable_log_requests'] = not get_debug_mode()
+if 'gpu_memory_utilization' not in self.engine_args:
+  self.engine_args['gpu_memory_utilization'] = 0.9
 try:
   from vllm import AsyncEngineArgs, AsyncLLMEngine
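With this change the 0.9 fallback for gpu_memory_utilization sits alongside the other conditional defaults and is applied only when no value was supplied, so an explicitly provided setting is preserved. A standalone sketch of that guard (illustrative values, with a plain dict standing in for self.engine_args):

engine_args = {'max_model_len': 4096}  # no gpu_memory_utilization provided
if 'gpu_memory_utilization' not in engine_args:
    engine_args['gpu_memory_utilization'] = 0.9
assert engine_args['gpu_memory_utilization'] == 0.9

engine_args = {'gpu_memory_utilization': 0.85}  # explicit value is preserved
if 'gpu_memory_utilization' not in engine_args:
    engine_args['gpu_memory_utilization'] = 0.9
assert engine_args['gpu_memory_utilization'] == 0.85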