Mirror of https://github.com/bentoml/OpenLLM.git (synced 2026-02-19 07:06:02 -05:00)
chore(vllm): add arguments for gpu memory utilization
Probably not going to fix anything, just delaying the problem.

Signed-off-by: Aaron Pham <29749331+aarnphm@users.noreply.github.com>
@@ -123,6 +123,7 @@ class LLM(t.Generic[M, T]):
   _quantise: t.Optional[LiteralQuantise]; _model_decls: t.Tuple[t.Any, ...]; __model_attrs: t.Dict[str, t.Any] #
   __tokenizer_attrs: t.Dict[str, t.Any]; _tag: bentoml.Tag; _adapter_map: t.Optional[AdapterMap] #
   _serialisation: LiteralSerialisation; _local: bool; _max_model_len: t.Optional[int] #
+  _gpu_memory_utilization: float

   __llm_dtype__: t.Union[LiteralDtype, t.Literal['auto', 'half', 'float']] = 'auto'
   __llm_torch_dtype__: 'torch.dtype' = None
@@ -152,6 +153,7 @@ class LLM(t.Generic[M, T]):
     dtype='auto',
     low_cpu_mem_usage=True,
     max_model_len=None,
+    gpu_memory_utilization=0.9,
     _eager=True,
     **attrs,
   ):
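For context, a minimal usage sketch of the new constructor knob; the model id and values below are placeholders, not part of this commit:

import openllm

llm = openllm.LLM(
  model_id='facebook/opt-125m',  # placeholder model id
  max_model_len=4096,            # optional cap on the context window
  gpu_memory_utilization=0.8,    # below the 0.9 default, leaving VRAM headroom
)

Lowering the value trades KV-cache capacity for headroom when the GPU is shared with other processes.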
@@ -182,7 +184,8 @@ class LLM(t.Generic[M, T]):
       adapter_map=convert_peft_config_type(adapter_map) if adapter_map is not None else None,
       serialisation=serialisation,
       local=_local,
-      max_model_len=max_model_len,
+      max_model_len=getenv('max_model_len', default=max_model_len),
+      gpu_memory_utilization=getenv('gpu_memory_utilization', default=gpu_memory_utilization),
       LLM__model_attrs=model_attrs,
       LLM__tokenizer_attrs=tokenizer_attrs,
       llm_dtype__=dtype.lower(),
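Both values are now resolved through getenv, so a deployment can override them without code changes. The exact lookup rules live in the real helper; the standalone approximation below assumes an OPENLLM_-prefixed, upper-cased variable name wins over the constructor default:

import os

def getenv_sketch(name, default=None):
  # Assumed behavior, for illustration only: check an OPENLLM_-prefixed
  # environment variable, fall back to the given default, and parse
  # numerically when the default is a float.
  raw = os.environ.get(f'OPENLLM_{name.upper()}')
  if raw is None:
    return default
  return float(raw) if isinstance(default, float) else raw

os.environ['OPENLLM_GPU_MEMORY_UTILIZATION'] = '0.85'
print(getenv_sketch('gpu_memory_utilization', default=0.9))  # -> 0.85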
@@ -109,7 +109,7 @@ class vLLMRunnable(bentoml.Runnable):
         tokenizer_mode='auto', tensor_parallel_size=num_gpus, #
         model=llm.bentomodel.path, tokenizer=llm.bentomodel.path, #
         trust_remote_code=llm.trust_remote_code, dtype=llm._torch_dtype, #
-        max_model_len=llm._max_model_len,
+        max_model_len=llm._max_model_len, gpu_memory_utilization=llm._gpu_memory_utilization, #
         quantization=llm.quantise if llm.quantise and llm.quantise in {'awq', 'squeezellm'} else None,
       )
     )
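On the vLLM side, gpu_memory_utilization is the fraction of each GPU's memory the engine may preallocate for weights and KV-cache blocks, so forwarding it here is what actually makes the knob effective. A standalone sketch against vLLM's offline API (the model id is a placeholder; OpenLLM passes the bento model path instead):

from vllm import LLM

engine = LLM(
  model='facebook/opt-125m',    # placeholder model
  max_model_len=2048,           # also shrinks the KV cache by capping sequence length
  gpu_memory_utilization=0.85,  # leave ~15% of VRAM for other processes
)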
@@ -8,7 +8,8 @@ from bentoml.io import JSON, Text
 logger = logging.getLogger(__name__)
 llm = openllm.LLM[t.Any, t.Any](
   model_id=svars.model_id, model_tag=svars.model_tag, adapter_map=svars.adapter_map, #
-  serialisation=svars.serialization, trust_remote_code=svars.trust_remote_code,
+  serialisation=svars.serialization, trust_remote_code=svars.trust_remote_code, #
+  max_model_len=svars.max_model_len, gpu_memory_utilization=svars.gpu_memory_utilization, #
 )
 svc = bentoml.Service(name=f"llm-{llm.config['start_name']}-service", runners=[llm.runner])
 llm_model_class = openllm.GenerationInput.from_llm_config(llm.config)
@@ -1,2 +1,3 @@
 import os, orjson, openllm_core.utils as coreutils
 model_id, model_tag, adapter_map, serialization, trust_remote_code = os.environ['OPENLLM_MODEL_ID'], None, orjson.loads(os.getenv('OPENLLM_ADAPTER_MAP', orjson.dumps(None))), os.getenv('OPENLLM_SERIALIZATION', default='safetensors'), coreutils.check_bool_env('TRUST_REMOTE_CODE', False)
+max_model_len, gpu_memory_utilization = orjson.loads(os.getenv('MAX_MODEL_LEN', orjson.dumps(None).decode())), orjson.loads(os.getenv('GPU_MEMORY_UTILIZATION', orjson.dumps(0.9).decode()))
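The JSON round-trip lets one parser cover both the unset case (the 'null' default decodes to None) and numeric overrides. A self-contained rerun of the same pattern, with an example override:

import os, orjson

os.environ['GPU_MEMORY_UTILIZATION'] = '0.8'  # e.g. set by the deployment
max_model_len = orjson.loads(os.getenv('MAX_MODEL_LEN', orjson.dumps(None).decode()))
gpu_memory_utilization = orjson.loads(os.getenv('GPU_MEMORY_UTILIZATION', orjson.dumps(0.9).decode()))
print(max_model_len, gpu_memory_utilization)  # None 0.8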
@@ -10,7 +10,7 @@ logger = logging.getLogger(__name__)

 OPENLLM_DEV_BUILD = 'OPENLLM_DEV_BUILD'
 _service_file = pathlib.Path(os.path.abspath(__file__)).parent.parent / '_service.py'
-_SERVICE_VARS = '''import orjson;model_id,model_tag,adapter_map,serialization,trust_remote_code='{__model_id__}','{__model_tag__}',orjson.loads("""{__model_adapter_map__}"""),'{__model_serialization__}',{__model_trust_remote_code__}'''
+_SERVICE_VARS = '''import orjson;model_id,model_tag,adapter_map,serialization,trust_remote_code,max_model_len,gpu_memory_utilization='{__model_id__}','{__model_tag__}',orjson.loads("""{__model_adapter_map__}"""),'{__model_serialization__}',{__model_trust_remote_code__},{__max_model_len__},{__gpu_memory_utilization__}'''

 def build_editable(path, package='openllm'):
   if not check_bool_env(OPENLLM_DEV_BUILD, default=False): return None
@@ -74,7 +74,7 @@ def create_bento(
   script = f"# fmt: off\n# GENERATED BY 'openllm build {llm.model_id}'. DO NOT EDIT\n" + _SERVICE_VARS.format(
     __model_id__=llm.model_id, __model_tag__=str(llm.tag), #
     __model_adapter_map__=orjson.dumps(adapter_map).decode(), __model_serialization__=llm.config['serialisation'], #
-    __model_trust_remote_code__=str(llm.trust_remote_code),
+    __model_trust_remote_code__=str(llm.trust_remote_code), __max_model_len__=llm._max_model_len, __gpu_memory_utilization__=llm._gpu_memory_utilization, #
   )
   if SHOW_CODEGEN: logger.info('Generated _service_vars.py:\n%s', script)
   llm_fs.writetext('_service_vars.py', script)
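To see what this codegen step emits, the template can be exercised standalone. All values below are illustrative; note that None and 0.9 format into valid Python literals in the generated _service_vars.py:

# The same template as _SERVICE_VARS above, rebuilt by concatenation to avoid
# nesting triple quotes; every value passed to .format() is illustrative.
template = (
  "import orjson;model_id,model_tag,adapter_map,serialization,"
  "trust_remote_code,max_model_len,gpu_memory_utilization="
  "'{__model_id__}','{__model_tag__}',"
  'orjson.loads("""{__model_adapter_map__}"""),'
  "'{__model_serialization__}',{__model_trust_remote_code__},"
  "{__max_model_len__},{__gpu_memory_utilization__}"
)
print(template.format(
  __model_id__='facebook/opt-125m',              # illustrative
  __model_tag__='vllm-facebook-opt-125m:latest', # illustrative
  __model_adapter_map__='null',
  __model_serialization__='safetensors',
  __model_trust_remote_code__='False',
  __max_model_len__=None,                        # formats as the literal None
  __gpu_memory_utilization__=0.9,
))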