mirror of
https://github.com/bentoml/OpenLLM.git
synced 2026-04-28 19:08:29 -04:00
chore(codegen): update generated var to read from envvar
Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
This commit is contained in:
@@ -46,8 +46,8 @@ quantise=coreutils.getenv('quantize',default='{__model_quantise__}',var=['QUANTI
|
||||
serialisation=coreutils.getenv('serialization',default='{__model_serialization__}',var=['SERIALISATION'])
|
||||
dtype=coreutils.getenv('dtype', default='{__model_dtype__}', var=['TORCH_DTYPE'])
|
||||
trust_remote_code=coreutils.check_bool_env("TRUST_REMOTE_CODE",{__model_trust_remote_code__})
|
||||
-max_model_len={__max_model_len__}
|
||||
-gpu_memory_utilization={__gpu_memory_utilization__}
|
||||
+max_model_len=orjson.loads(coreutils.getenv('max_model_len', default=orjson.dumps({__max_model_len__})))
|
||||
+gpu_memory_utilization=orjson.loads(coreutils.getenv('gpu_memory_utilization', default=orjson.dumps({__gpu_memory_utilization__}), var=['GPU_MEMORY_UTILISATION']))
|
||||
services_config=orjson.loads(coreutils.getenv('services_config',"""{__services_config__}"""))
|
||||
'''
|
||||
_DOCKERFILE_TEMPLATE = """\
|
||||
|
||||
@@ -65,7 +65,6 @@ class LLM:
|
||||
quantise = self.quantise if self.quantise and self.quantise in {'gptq', 'awq', 'squeezellm'} else None
|
||||
dtype = 'float16' if quantise == 'gptq' else self.dtype # NOTE: quantise GPTQ doesn't support bfloat16 yet.
|
||||
|
||||
-self.engine_args.setdefault('gpu_memory_utilization', 0.9)
|
||||
self.engine_args.update({
|
||||
'worker_use_ray': False,
|
||||
'engine_use_ray': False,
|
||||
@@ -81,6 +80,8 @@ class LLM:
|
||||
self.engine_args['disable_log_stats'] = not get_debug_mode()
|
||||
if 'disable_log_requests' not in self.engine_args:
|
||||
self.engine_args['disable_log_requests'] = not get_debug_mode()
|
||||
+if 'gpu_memory_utilization' not in self.engine_args:
|
||||
+self.engine_args['gpu_memory_utilization'] = 0.9
|
||||
|
||||
try:
|
||||
from vllm import AsyncEngineArgs, AsyncLLMEngine
|
||||
|
||||
Reference in New Issue
Block a user