fix: do not rely on env var for built bento/docker (#477)

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Author: Zhao Shenyang
Date: 2023-10-11 00:29:20 +08:00
Committed by: GitHub
Parent: d8da2cc3a5
Commit: bf96570eab
5 changed files with 38 additions and 17 deletions
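
Note: the change moves model configuration for built bentos and docker images from runtime environment variables to values baked in at build time. A rough before/after sketch, assuming hypothetical model values (the module and variable names come from the diff below):

# Before: the packaged service resolved everything from env vars set on the image
import os
model = os.environ.get('OPENLLM_MODEL', 'flan_t5')            # hypothetical value
adapter_map = os.environ.get('OPENLLM_ADAPTER_MAP', 'null')   # hypothetical value

# After: 'openllm build' renders a _service_vars.py with the values already filled in,
# and _service.py simply imports it
import _service_vars as svars
model, model_id, adapter_map = svars.model, svars.model_id, svars.adapter_map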


@@ -9,7 +9,6 @@ import types
 import typing as t
 import attr
-import fs.path
 import inflection
 import orjson
@@ -500,10 +499,6 @@ class LLM(LLMInterface[M, T], ReprMixin):
     Returns:
         ``str``: Generated tag format that can be parsed by ``bentoml.Tag``
     '''
-    # specific branch for running in docker or kubernetes, this is very hacky,
-    # and probably need a better way to support custom path
-    if os.environ.get('BENTO_PATH') is not None: return ':'.join(fs.path.parts(model_id)[-2:])
     model_name = normalise_model_name(model_id)
     model_id, *maybe_revision = model_id.rsplit(':')
     if len(maybe_revision) > 0:
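
The deleted branch existed because, inside a container, model_id used to be an on-disk path under /home/bentoml/bento/models (see the env.model_id entry removed further down), so the tag had to be recovered from the last two path segments. A sketch of what that hack did, with a hypothetical path:

import fs.path
model_id = '/home/bentoml/bento/models/vicuna-7b-v1.5/ab12cd34'   # hypothetical
tag = ':'.join(fs.path.parts(model_id)[-2:])                      # -> 'vicuna-7b-v1.5:ab12cd34'

With the docker options no longer overriding model_id with that path, the special case is unnecessary.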
@@ -1122,11 +1117,12 @@ def Runner(model_name: str,
       init_local: If True, it will initialize the model locally. This is useful if you want to run the model locally. (Symmetrical to bentoml.Runner.init_local())
       **attrs: The rest of kwargs will then be passed to the LLM. Refer to the LLM documentation for the kwargs behaviour
   '''
   if llm_config is not None:
     attrs.update({
-        'model_id': llm_config['env']['model_id_value'],
+        'model_id': attrs.get('model_id') or llm_config['env']['model_id_value'],
         'quantize': llm_config['env']['quantize_value'],
-        'serialisation': first_not_none(os.environ.get('OPENLLM_SERIALIZATION'), attrs.get('serialisation'), default=llm_config['serialisation']),
+        'serialisation': first_not_none(attrs.get('serialisation'), os.environ.get('OPENLLM_SERIALIZATION'), default=llm_config['serialisation']),
         'system_message': first_not_none(os.environ.get('OPENLLM_SYSTEM_MESSAGE'), attrs.get('system_message'), None),
         'prompt_template': first_not_none(os.environ.get('OPENLLM_PROMPT_TEMPLATE'), attrs.get('prompt_template'), None),
     })
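
The reordering flips precedence so that keyword arguments passed to Runner(...) now win over environment variables (the model_id line gets the same treatment via attrs.get(...) or ...). A minimal sketch, assuming first_not_none returns the first non-None argument and otherwise the default:

def first_not_none(*args, default=None):
    # assumed semantics of the openllm_core helper
    return next((arg for arg in args if arg is not None), default)

env_value, kwarg_value = 'legacy', 'safetensors'                 # hypothetical serialisation values
first_not_none(env_value, kwarg_value, default='safetensors')    # old order: env var wins -> 'legacy'
first_not_none(kwarg_value, env_value, default='safetensors')    # new order: explicit kwarg wins -> 'safetensors'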


@@ -1,9 +1,9 @@
 # mypy: disable-error-code="call-arg,misc,attr-defined,type-abstract,type-arg,valid-type,arg-type"
 from __future__ import annotations
-import os
 import typing as t
 import warnings
+import _service_vars as svars
 import orjson
 from starlette.applications import Starlette
@@ -28,10 +28,17 @@ warnings.filterwarnings('ignore', message='MatMul8bitLt: inputs will be cast fro
 warnings.filterwarnings('ignore', message='MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization')
 warnings.filterwarnings('ignore', message='The installed version of bitsandbytes was compiled without GPU support.')
-model = os.environ.get('OPENLLM_MODEL', '{__model_name__}') # openllm: model name
-adapter_map = os.environ.get('OPENLLM_ADAPTER_MAP', '''{__model_adapter_map__}''') # openllm: model adapter map
+model = svars.model
+model_id = svars.model_id
+adapter_map = svars.adapter_map
 llm_config = openllm.AutoConfig.for_model(model)
-runner = openllm.Runner(model, llm_config=llm_config, ensure_available=False, adapter_map=orjson.loads(adapter_map))
+runner = openllm.Runner(
+    model,
+    llm_config=llm_config,
+    model_id=model_id,
+    ensure_available=False,
+    adapter_map=orjson.loads(adapter_map)
+)
 generic_embedding_runner = bentoml.Runner(openllm.GenericEmbeddingRunnable, # XXX: remove arg-type once bentoml.Runner is correct set with type
     name='llm-generic-embedding',
     scheduling_strategy=openllm_core.CascadingResourceStrategy,
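
With the service module reading svars, the runner now receives model_id explicitly instead of the LLM re-deriving it from BENTO_PATH or OPENLLM_* variables. A usage sketch of the updated call, with hypothetical model values:

import orjson
import openllm

llm_config = openllm.AutoConfig.for_model('flan_t5')   # hypothetical model name
runner = openllm.Runner(
    'flan_t5',
    llm_config=llm_config,
    model_id='google/flan-t5-large',    # passed through from the generated _service_vars
    ensure_available=False,
    adapter_map=orjson.loads('null'),   # no adapters in this sketch
)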


@@ -0,0 +1,6 @@
+from __future__ import annotations
+import os
+model = os.environ['OPENLLM_MODEL'] # openllm: model name
+model_id = os.environ['OPENLLM_MODEL_ID'] # openllm: model name
+adapter_map = os.environ['OPENLLM_ADAPTER_MAP'] # openllm: model adapter map
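
Unlike the old defaults in _service.py, this module uses mandatory lookups: os.environ[...] raises KeyError when a variable is missing instead of silently falling back to a placeholder. A small illustration of the difference:

import os

os.environ.pop('OPENLLM_MODEL', None)
os.environ.get('OPENLLM_MODEL', '{__model_name__}')   # old style: silently returns the placeholder
try:
    os.environ['OPENLLM_MODEL']                       # new style: fails loudly if the variable is unset
except KeyError:
    print('OPENLLM_MODEL must be set for this code path')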


@@ -0,0 +1,5 @@
+from __future__ import annotations
+model = '{__model_name__}' # openllm: model name
+model_id = '{__model_id__}' # openllm: model id
+adapter_map = '''{__model_adapter_map__}''' # openllm: model adapter map
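
This file is a template: write_service (further down) substitutes the {__model_name__}-style placeholders and strips the trailing '# openllm: ...' marker comments before writing the result into the bento as _service_vars.py. A plausible rendered output, with hypothetical model values:

# GENERATED BY 'openllm build flan_t5'. DO NOT EDIT

from __future__ import annotations
model = 'flan_t5'
model_id = 'google/flan-t5-large'
adapter_map = '''null'''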


@@ -124,10 +124,7 @@ def construct_docker_options(llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_
   env_dict = {
       env.backend: env['backend_value'],
       env.config: f"'{llm.config.model_dump_json().decode()}'",
-      env.model_id: f'/home/bentoml/bento/models/{llm.tag.path()}',
-      'OPENLLM_MODEL': llm.config['model_name'],
       'OPENLLM_SERIALIZATION': serialisation,
-      'OPENLLM_ADAPTER_MAP': f"'{orjson.dumps(adapter_map).decode()}'",
       'BENTOML_DEBUG': str(True),
       'BENTOML_QUIET': str(False),
       'BENTOML_CONFIG_OPTIONS': f"'{environ['BENTOML_CONFIG_OPTIONS']}'",
@@ -143,6 +140,7 @@ def construct_docker_options(llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_
   return DockerOptions(base_image=f'{oci.CONTAINER_NAMES[container_registry]}:{oci.get_base_container_tag(container_version_strategy)}', env=env_dict, dockerfile_template=dockerfile_template)
 OPENLLM_MODEL_NAME = '# openllm: model name'
+OPENLLM_MODEL_ID = '# openllm: model id'
 OPENLLM_MODEL_ADAPTER_MAP = '# openllm: model adapter map'
 class ModelNameFormatter(string.Formatter):
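
These marker comments are how the build step finds the template lines to rewrite: each formatter substitutes one __model_*__ placeholder, and the slice in write_service below then drops the marker from the finished line. A minimal sketch of the idea, assuming the formatters simply map their keyword to the supplied value and that the marker is preceded by two spaces (so len(marker) + 3 covers the spaces, the marker, and the newline); the real classes may differ:

import string

class KeywordFormatter(string.Formatter):
    keyword = '__model_id__'
    def __init__(self, value: str) -> None:
        super().__init__()
        self.value = value
    def vformat(self, format_string, args=(), kwargs=None):
        # substitute only our keyword, leave everything else untouched
        return super().vformat(format_string, (), {self.keyword: self.value})

MARKER = '# openllm: model id'
line = "model_id = '{__model_id__}'  # openllm: model id\n"
rendered = KeywordFormatter('google/flan-t5-large').vformat(line)[:-(len(MARKER) + 3)] + '\n'
# rendered == "model_id = 'google/flan-t5-large'\n"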
@@ -170,21 +168,30 @@ class ModelAdapterMapFormatter(ModelNameFormatter):
   model_keyword: LiteralString = '__model_adapter_map__'
 _service_file = Path(os.path.abspath(__file__)).parent.parent / '_service.py'
+_service_vars_file = Path(os.path.abspath(__file__)).parent.parent / '_service_vars_pkg.py'
 def write_service(llm: openllm.LLM[t.Any, t.Any], adapter_map: dict[str, str | None] | None, llm_fs: FS) -> None:
   from openllm_core.utils import DEBUG
   model_name = llm.config['model_name']
-  logger.debug('Generating service file for %s at %s (dir=%s)', model_name, llm.config['service_name'], llm_fs.getsyspath('/'))
-  with open(_service_file.__fspath__(), 'r') as f:
+  model_id = llm.model_id
+  logger.debug('Generating service vars file for %s at %s (dir=%s)', model_name, '_service_vars.py', llm_fs.getsyspath('/'))
+  with open(_service_vars_file.__fspath__(), 'r') as f:
     src_contents = f.readlines()
   for it in src_contents:
     if OPENLLM_MODEL_NAME in it:
       src_contents[src_contents.index(it)] = (ModelNameFormatter(model_name).vformat(it)[:-(len(OPENLLM_MODEL_NAME) + 3)] + '\n')
+    if OPENLLM_MODEL_ID in it:
+      src_contents[src_contents.index(it)] = (ModelIdFormatter(model_id).vformat(it)[:-(len(OPENLLM_MODEL_ID) + 3)] + '\n')
     elif OPENLLM_MODEL_ADAPTER_MAP in it:
       src_contents[src_contents.index(it)] = (ModelAdapterMapFormatter(orjson.dumps(adapter_map).decode()).vformat(it)[:-(len(OPENLLM_MODEL_ADAPTER_MAP) + 3)] + '\n')
   script = f"# GENERATED BY 'openllm build {model_name}'. DO NOT EDIT\n\n" + ''.join(src_contents)
   if DEBUG: logger.info('Generated script:\n%s', script)
-  llm_fs.writetext(llm.config['service_name'], script)
+  llm_fs.writetext('_service_vars.py', script)
+  logger.debug('Generating service file for %s at %s (dir=%s)', model_name, llm.config['service_name'], llm_fs.getsyspath('/'))
+  with open(_service_file.__fspath__(), 'r') as f:
+    service_src = f.read()
+  llm_fs.writetext(llm.config['service_name'], service_src)
 @inject
 def create_bento(bento_tag: bentoml.Tag,