fix(vllm): correctly load given model id from envvar (#181)

2026-05-24 16:44:39 -04:00 · 2023-08-03 16:34:35 -04:00
parent db8e47bc5b
commit 2cc264aa72
12 changed files with 119 additions and 75 deletions
--- a/changelog.d/181.fix.md
+++ b/changelog.d/181.fix.md
@@ -0,0 +1,6 @@
+Fixes a bug in `EnvVarMixin` where it did not respect the environment variables for specific fields
+
+This led to confusing behaviour with `--model-id`. This has now been addressed on main
+
+The base Docker image now also includes an installation of xformers built from source, locked at a given commit hash, since the latest release of xformers
+is too old and would fail with vLLM when running within k8s
--- a/src/openllm/_configuration.py
+++ b/src/openllm/_configuration.py
@@ -622,7 +622,7 @@ def structure_settings(cl_: type[LLMConfig], cls: type[_ModelSettingsAttr]) -> _
  _final_value_dct["env"] = env

  # bettertransformer support
-  if _settings_attr["bettertransformer"] is None: _final_value_dct["bettertransformer"] = str(env.bettertransformer_value).upper() in ENV_VARS_TRUE_VALUES
+  if _settings_attr["bettertransformer"] is None: _final_value_dct["bettertransformer"] = str(env["bettertransformer_value"]).upper() in ENV_VARS_TRUE_VALUES
  # if requires_gpu is True, then disable BetterTransformer for quantization.
  if _settings_attr["requires_gpu"]: _final_value_dct["bettertransformer"] = False
  _final_value_dct["service_name"] = f"generated_{model_name}_service.py"
@@ -1485,4 +1485,6 @@ def structure_llm_config(data: DictStrAny, cls: type[LLMConfig]) -> LLMConfig:

 bentoml_cattr.register_structure_hook_func(lambda cls: lenient_issubclass(cls, LLMConfig), structure_llm_config)

-openllm_home = os.path.expanduser(os.getenv("OPENLLM_HOME", os.path.join(os.getenv("XDG_CACHE_HOME", os.path.join(os.path.expanduser("~"), ".cache")), "openllm")))
+openllm_home = os.path.expanduser(os.environ.get("OPENLLM_HOME", os.path.join(os.environ.get("XDG_CACHE_HOME", os.path.join(os.path.expanduser("~"), ".cache")), "openllm")))
+
+__all__ = ["LLMConfig", "field_env_key"]
--- a/src/openllm/_llm.py
+++ b/src/openllm/_llm.py
@@ -419,6 +419,11 @@ def _make_assignment_script(cls: type[LLM[M, T]]) -> t.Callable[[type[LLM[M, T]]
    else: func_call = f"_impl_{cls.__name__}_{func}={cached_func_name} if {cached_func_name} is not _cached_LLMInterface_get('{func}') else __serialisation_{func}"
    lines.extend([f"{cached_func_name}=cls.{func}", func_call, _setattr_class(func, f"{impl_name}(_impl_{cls.__name__}_{func})"),])

+  # assign vllm specific implementation
+  if cls.__llm_implementation__ == "vllm":
+    globs.update({"_vllm_generate": vllm_generate, "_vllm_postprocess_generate": vllm_postprocess_generate})
+    lines.extend([_setattr_class(it, f"_vllm_{it}") for it in {"generate", "postprocess_generate"}])
+
  # cached attribute initialisation
  interface_anns = codegen.get_annotations(LLMInterface)
  for v in {"bentomodel", "model", "tokenizer", "adapter_map"}:
@@ -432,6 +437,17 @@ def _make_assignment_script(cls: type[LLM[M, T]]) -> t.Callable[[type[LLM[M, T]]
    anns[key] = interface_anns.get(key)
  return codegen.generate_function(cls, "__assign_llm_attr", lines, args=("cls", *args), globs=globs, annotations=anns)

+def vllm_postprocess_generate(self: LLM["vllm.LLMEngine", T], prompt: str, generation_result: list[dict[str, t.Any]], **_: t.Any) -> str:
+  return generation_result[0]["outputs"][0]["text"]
+
+def vllm_generate(self: LLM["vllm.LLMEngine", T], prompt: str, **attrs: t.Any) -> list[dict[str, t.Any]]:
+  outputs: list[vllm.RequestOutput] = []
+  # TODO: support prompt_token_ids
+  self.model.add_request(request_id=str(uuid.uuid4().hex), prompt=prompt, sampling_params=self.config.model_construct_env(**attrs).to_sampling_config())
+  while self.model.has_unfinished_requests():
+    outputs.extend([r for r in self.model.step() if r.finished])
+  return [unmarshal_vllm_outputs(i) for i in outputs]
+
 _AdaptersTuple: type[AdaptersTuple] = codegen.make_attr_tuple_class("AdaptersTuple", ["adapter_id", "name", "config"])

@attr.define(slots=True, repr=False, init=False)
@@ -470,19 +486,6 @@ class LLM(LLMInterface[M, T], ReprMixin):
    _make_assignment_script(cls)(cls)
    if "tokenizer_id" not in cd and cls.__llm_implementation__ == "vllm": cls.tokenizer_id = _DEFAULT_TOKENIZER

-    if implementation == "vllm":
-      def vllm_postprocess_generate(self: LLM["vllm.LLMEngine", T], prompt: str, generation_result: list[dict[str, t.Any]], **_: t.Any) -> str: return generation_result[0]["outputs"][0]["text"]
-      def vllm_generate(self: LLM["vllm.LLMEngine", T], prompt: str, **attrs: t.Any) -> list[dict[str, t.Any]]:
-        outputs: list[vllm.RequestOutput] = []
-        # TODO: support prompt_token_ids
-        self.model.add_request(request_id=str(uuid.uuid4().hex), prompt=prompt, sampling_params=self.config.model_construct_env(**attrs).to_sampling_config())
-        while self.model.has_unfinished_requests():
-          outputs.extend([r for r in self.model.step() if r.finished])
-        return [unmarshal_vllm_outputs(i) for i in outputs]
-
-      _object_setattr(cls, "postprocess_generate", vllm_postprocess_generate)
-      _object_setattr(cls, "generate", vllm_generate)
-
  # fmt: off
  @overload
  def __getitem__(self, item: t.Literal["trust_remote_code"]) -> bool: ...
@@ -586,10 +589,10 @@ class LLM(LLMInterface[M, T], ReprMixin):
        **attrs: The kwargs to be passed to the model.
    """
    cfg_cls = cls.config_class
-    model_id = first_not_none(model_id, os.getenv(cfg_cls.__openllm_env__["model_id"]), cfg_cls.__openllm_default_id__)
+    model_id = first_not_none(model_id, os.environ.get(cfg_cls.__openllm_env__["model_id"]), cfg_cls.__openllm_default_id__)
    if model_id is None: raise RuntimeError("Failed to resolve a valid model_id.")
    if validate_is_path(model_id): model_id = resolve_filepath(model_id)
-    quantize = first_not_none(quantize, t.cast(t.Optional[t.Literal["int8", "int4", "gptq"]], os.getenv(cfg_cls.__openllm_env__["quantize"])), default=None)
+    quantize = first_not_none(quantize, t.cast(t.Optional[t.Literal["int8", "int4", "gptq"]], os.environ.get(cfg_cls.__openllm_env__["quantize"])), default=None)

    # quantization setup
    if quantization_config and quantize: raise ValueError("'quantization_config' and 'quantize' are mutually exclusive. Either customise your quantization_config or use the 'quantize' argument.")
@@ -614,10 +617,9 @@ class LLM(LLMInterface[M, T], ReprMixin):
    except Exception as err: raise OpenLLMException(f"Failed to generate a valid tag for {cfg_cls.__openllm_start_name__} with 'model_id={model_id}' (lookup to see its traceback):\n{err}") from err

    return cls(
-        *args, model_id=model_id, llm_config=llm_config, quantization_config=quantization_config,
-        bettertransformer=str(first_not_none(bettertransformer, os.getenv(cfg_cls.__openllm_env__["bettertransformer"]), default=None)).upper() in ENV_VARS_TRUE_VALUES,
-        _runtime=first_not_none(runtime, t.cast(t.Optional[t.Literal["ggml", "transformers"]], os.getenv(cfg_cls.__openllm_env__["runtime"])), default=cfg_cls.__openllm_runtime__),
-        _adapters_mapping=resolve_peft_config_type(adapter_map) if adapter_map is not None else None, _quantize_method=quantize, _model_version=_tag.version, _tag=_tag, _serialisation_format=serialisation, **attrs
+        *args, model_id=model_id, llm_config=llm_config, quantization_config=quantization_config, bettertransformer=str(first_not_none(bettertransformer, os.environ.get(cfg_cls.__openllm_env__["bettertransformer"]), default=None)).upper() in ENV_VARS_TRUE_VALUES,
+        _runtime=first_not_none(runtime, t.cast(t.Optional[t.Literal["ggml", "transformers"]], os.environ.get(cfg_cls.__openllm_env__["runtime"])), default=cfg_cls.__openllm_runtime__), _adapters_mapping=resolve_peft_config_type(adapter_map)
+        if adapter_map is not None else None, _quantize_method=quantize, _model_version=_tag.version, _tag=_tag, _serialisation_format=serialisation, **attrs
    )

  @classmethod
@@ -640,7 +642,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
        ``str``: Generated tag format that can be parsed by ``bentoml.Tag``
    """
    # specific branch for running in docker, this is very hacky, needs change upstream
-    if in_docker() and os.getenv("BENTO_PATH") is not None: return ":".join(fs.path.parts(model_id)[-2:])
+    if in_docker() and os.environ.get("BENTO_PATH") is not None: return ":".join(fs.path.parts(model_id)[-2:])

    model_name = normalise_model_name(model_id)
    model_id, *maybe_revision = model_id.rsplit(":")
@@ -649,7 +651,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
      return f"{cls.__llm_implementation__}-{model_name}:{maybe_revision[0]}"

    tag_name = f"{cls.__llm_implementation__}-{model_name}"
-    if os.getenv("OPENLLM_USE_LOCAL_LATEST", str(False)).upper() in ENV_VARS_TRUE_VALUES: return bentoml_cattr.unstructure(bentoml.models.get(f"{tag_name}{':'+model_version if model_version is not None else ''}").tag)
+    if os.environ.get("OPENLLM_USE_LOCAL_LATEST", str(False)).upper() in ENV_VARS_TRUE_VALUES: return bentoml_cattr.unstructure(bentoml.models.get(f"{tag_name}{':'+model_version if model_version is not None else ''}").tag)
    if validate_is_path(model_id): model_id, model_version = resolve_filepath(model_id), first_not_none(model_version, default=generate_hash_from_file(model_id))
    else:
      _config = transformers.AutoConfig.from_pretrained(model_id, trust_remote_code=cls.config_class.__openllm_trust_remote_code__, revision=first_not_none(model_version, default="main"))
@@ -1015,8 +1017,7 @@ def Runner(model_name: str, ensure_available: bool | None = None, init_local: bo
    behaviour
  """
  if llm_config is not None:
-    attrs.update({"model_id": llm_config["env"]["model_id_value"], "bettertransformer": llm_config["env"]["bettertransformer_value"], "quantize": llm_config["env"]["quantize_value"], "runtime": llm_config["env"]["runtime_value"],
-                  "serialisation": first_not_none(os.getenv("OPENLLM_SERIALIZATION"), attrs.get("serialisation"), default="safetensors")})
+    attrs.update({"model_id": llm_config["env"]["model_id_value"], "bettertransformer": llm_config["env"]["bettertransformer_value"], "quantize": llm_config["env"]["quantize_value"], "runtime": llm_config["env"]["runtime_value"], "serialisation": first_not_none(os.environ.get("OPENLLM_SERIALIZATION"), attrs.get("serialisation"), default="safetensors")})

  default_implementation = llm_config.default_implementation() if llm_config is not None else "pt"
  implementation = first_not_none(implementation, default=EnvVarMixin(model_name, default_implementation)["framework_value"])
--- a/src/openllm/_strategies.py
+++ b/src/openllm/_strategies.py
@@ -86,7 +86,7 @@ def _parse_visible_devices(default_var: str = ..., *, respect_env: t.Literal[Fal
 def _parse_visible_devices(default_var: str | None = None, respect_env: bool = True) -> list[str] | None:
  """CUDA_VISIBLE_DEVICES aware with default var for parsing spec."""
  if respect_env:
-    spec = os.getenv("CUDA_VISIBLE_DEVICES", default_var)
+    spec = os.environ.get("CUDA_VISIBLE_DEVICES", default_var)
    if not spec: return None
  else:
    if default_var is None: raise ValueError("spec is required to be not None when parsing spec.")
@@ -370,11 +370,11 @@ class CascadingResourceStrategy(bentoml.Strategy, ReprMixin):
      if runnable_class.SUPPORTS_CPU_MULTI_THREADING:
        thread_count = math.ceil(cpus)
        for thread_env in THREAD_ENVS:
-          environ[thread_env] = os.getenv(thread_env, str(thread_count))
+          environ[thread_env] = os.environ.get(thread_env, str(thread_count))
        logger.debug("Environ for worker %s: %s", worker_index, environ)
        return environ
      for thread_env in THREAD_ENVS:
-        environ[thread_env] = os.getenv(thread_env, "1")
+        environ[thread_env] = os.environ.get(thread_env, "1")
      return environ
    return environ

--- a/src/openllm/bundle/_package.py
+++ b/src/openllm/bundle/_package.py
@@ -135,17 +135,17 @@ def construct_docker_options(
  _bentoml_config_options += " " if _bentoml_config_options else "" + " ".join(_bentoml_config_options_opts)
  env: EnvVarMixin = llm.config["env"]
  env_dict = {
-      env.framework: env.framework_value, env.config: f"'{llm.config.model_dump_json().decode()}'", "OPENLLM_MODEL": llm.config["model_name"], "OPENLLM_SERIALIZATION": serialisation_format,
-      "OPENLLM_ADAPTER_MAP": f"'{orjson.dumps(adapter_map).decode()}'", "BENTOML_DEBUG": str(True), "BENTOML_QUIET": str(False), "BENTOML_CONFIG_OPTIONS": f"'{_bentoml_config_options}'",
-      env.model_id: f"/home/bentoml/bento/models/{llm.tag.path()}"}
+      env.framework: env["framework_value"], env.config: f"'{llm.config.model_dump_json().decode()}'", "OPENLLM_MODEL": llm.config["model_name"], "OPENLLM_SERIALIZATION": serialisation_format, "OPENLLM_ADAPTER_MAP": f"'{orjson.dumps(adapter_map).decode()}'", "BENTOML_DEBUG": str(True), "BENTOML_QUIET": str(False), "BENTOML_CONFIG_OPTIONS": f"'{_bentoml_config_options}'",
+      env.model_id: f"/home/bentoml/bento/models/{llm.tag.path()}"
+  }
  if adapter_map: env_dict["BITSANDBYTES_NOWELCOME"] = os.environ.get("BITSANDBYTES_NOWELCOME", "1")

  # We need to handle None separately here, as env from subprocess doesn't accept None value.
  _env = EnvVarMixin(llm.config["model_name"], bettertransformer=bettertransformer, quantize=quantize, runtime=runtime)

-  if _env.bettertransformer_value is not None: env_dict[_env.bettertransformer] = str(_env.bettertransformer_value)
-  if _env.quantize_value is not None: env_dict[_env.quantize] = _env.quantize_value
-  env_dict[_env.runtime] = _env.runtime_value
+  env_dict[_env.bettertransformer] = str(_env["bettertransformer_value"])
+  if _env["quantize_value"] is not None: env_dict[_env.quantize] = t.cast(str, _env["quantize_value"])
+  env_dict[_env.runtime] = _env["runtime_value"]
  return DockerOptions(base_image=f"{oci.CONTAINER_NAMES[container_registry]}:{oci.get_base_container_tag(container_version_strategy)}", env=env_dict, dockerfile_template=dockerfile_template)

@inject
--- a/src/openllm/bundle/oci/Dockerfile
+++ b/src/openllm/bundle/oci/Dockerfile
@@ -118,6 +118,20 @@ git fetch && git checkout ${COMMIT_HASH}
 TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6+PTX;8.9;9.0" python setup.py build
 EOT

+# NOTE: Build xformers from source since the latest xformers release is too old
+FROM kernel-builder as xformers-builder
+
+ENV COMMIT_HASH 2d3a2217c263419243b70c53f725213d1c386b0f
+ARG COMMIT_HASH=${COMMIT_HASH}
+
+WORKDIR /usr/src
+
+RUN <<EOT
+git clone https://github.com/facebookresearch/xformers.git && cd xformers
+git fetch && git checkout ${COMMIT_HASH}
+TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6+PTX;8.9;9.0" python setup.py build
+EOT
+
 # base image
 FROM nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu22.04 as base-container

@@ -145,6 +159,9 @@ COPY --from=flash-attn-v2-builder /usr/src/flash-attention-v2/build/lib.linux-x8
 # Copy build artefacts for auto-gptq
 COPY --from=auto-gptq-builder /usr/src/AutoGPTQ/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages

+# Copy build artefacts for xformers (built in the dedicated xformers-builder stage above)
+COPY --from=xformers-builder /usr/src/xformers/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
+
 # Install required dependencies
 COPY src src
 COPY hatch.toml README.md CHANGELOG.md pyproject.toml ./
@@ -158,7 +175,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins
        rm -rf /var/lib/apt/lists/*

 # Install all required dependencies
-RUN pip install  "ray==2.6.0" "jax[cuda11_local]" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html ".[opt,mpt,fine-tune,llama,falcon,chatglm]" -v --no-cache-dir
+RUN pip install  "ray==2.6.0" "einops" "jax[cuda11_local]" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html ".[opt,mpt,fine-tune,llama,chatglm]" -v --no-cache-dir

 FROM base-container

--- a/src/openllm/cli/_factory.py
+++ b/src/openllm/cli/_factory.py
@@ -138,7 +138,7 @@ Available official model_id(s): [default: {llm_config['default_id']}]
      serialisation_format: t.Literal["safetensors", "legacy"], adapter_id: str | None, return_process: bool, **attrs: t.Any,
  ) -> LLMConfig | subprocess.Popen[bytes]:
    fast = str(fast).upper() in ENV_VARS_TRUE_VALUES
-    if serialisation_format == "safetensors" and quantize is not None and os.getenv("OPENLLM_SERIALIZATION_WARNING", str(True)).upper() in ENV_VARS_TRUE_VALUES:
+    if serialisation_format == "safetensors" and quantize is not None and os.environ.get("OPENLLM_SERIALIZATION_WARNING", str(True)).upper() in ENV_VARS_TRUE_VALUES:
      termui.echo(f"'--quantize={quantize}' might not work with 'safetensors' serialisation format. Use with caution!. To silence this warning, set \"OPENLLM_SERIALIZATION_WARNING=False\"\nNote: You can always fallback to '--serialisation legacy' when running quantisation.", fg="yellow")
    adapter_map: dict[str, str | None] | None = attrs.pop(_adapter_mapping_key, None)
    config, server_attrs = llm_config.model_validate_click(**attrs)
@@ -173,14 +173,14 @@ Available official model_id(s): [default: {llm_config['default_id']}]
    start_env = parse_config_options(config, server_timeout, wpr, device, start_env)
    if fast: termui.echo(f"Fast mode is enabled. Make sure the model is available in local store before 'start': 'openllm import {model}{' --model-id ' + model_id if model_id else ''}'", fg="yellow")

-    start_env.update({"OPENLLM_MODEL": model, "BENTOML_DEBUG": str(get_debug_mode()), "BENTOML_HOME": os.getenv("BENTOML_HOME", BentoMLContainer.bentoml_home.get()), "OPENLLM_ADAPTER_MAP": orjson.dumps(adapter_map).decode(), "OPENLLM_SERIALIZATION": serialisation_format, env.runtime: env.runtime_value, env.framework: env.framework_value})
-    if env.model_id_value: start_env[env.model_id] = str(env.model_id_value)
+    start_env.update({"OPENLLM_MODEL": model, "BENTOML_DEBUG": str(get_debug_mode()), "BENTOML_HOME": os.environ.get("BENTOML_HOME", BentoMLContainer.bentoml_home.get()), "OPENLLM_ADAPTER_MAP": orjson.dumps(adapter_map).decode(), "OPENLLM_SERIALIZATION": serialisation_format, env.runtime: env["runtime_value"], env.framework: env["framework_value"]})
+    start_env[env.model_id] = str(env["model_id_value"])
    # NOTE: quantize and bettertransformer value is already assigned within env
-    if bettertransformer is not None: start_env[env.bettertransformer] = str(env.bettertransformer_value)
-    if quantize is not None: start_env[env.quantize] = str(env.quantize_value)
+    if bettertransformer is not None: start_env[env.bettertransformer] = str(env["bettertransformer_value"])
+    if quantize is not None: start_env[env.quantize] = str(t.cast(str, env["quantize_value"]))

-    llm = infer_auto_class(env.framework_value).for_model(model, model_version=model_version, llm_config=config, ensure_available=not fast, adapter_map=adapter_map, serialisation=serialisation_format)
-    start_env.update({env.config: llm.config.model_dump_json().decode(), env.model_id: llm.model_id})
+    llm = infer_auto_class(env["framework_value"]).for_model(model, model_id=start_env[env.model_id], model_version=model_version, llm_config=config, ensure_available=not fast, adapter_map=adapter_map, serialisation=serialisation_format)
+    start_env.update({env.config: llm.config.model_dump_json().decode()})

    server = bentoml.GrpcServer("_service.py:svc", **server_attrs) if _serve_grpc else bentoml.HTTPServer("_service.py:svc", **server_attrs)
    analytics.track_start_init(llm.config)
--- a/src/openllm/cli/entrypoint.py
+++ b/src/openllm/cli/entrypoint.py
@@ -455,15 +455,15 @@ def import_command(model_name: str, model_id: str | None, converter: str | None,
  """
  llm_config = AutoConfig.for_model(model_name)
  env = EnvVarMixin(model_name, llm_config.default_implementation(), model_id=model_id, runtime=runtime, quantize=quantize)
-  impl: LiteralRuntime = first_not_none(implementation, default=env.framework_value)
-  llm = infer_auto_class(impl).for_model(model_name, llm_config=llm_config, model_version=model_version, ensure_available=False, serialisation=serialisation_format)
+  impl: LiteralRuntime = first_not_none(implementation, default=env["framework_value"])
+  llm = infer_auto_class(impl).for_model(model_name, model_id=env["model_id_value"], llm_config=llm_config, model_version=model_version, ensure_available=False, serialisation=serialisation_format)
  _previously_saved = False
  try:
    _ref = serialisation.get(llm)
    _previously_saved = True
  except bentoml.exceptions.NotFound:
    if not machine and output == "pretty":
-      msg = f"'{model_name}' {'with model_id='+ model_id if model_id is not None else ''} does not exists in local store. Saving to BENTOML_HOME{' (path=' + os.getenv('BENTOML_HOME', BentoMLContainer.bentoml_home.get()) + ')' if get_debug_mode() else ''}..."
+      msg = f"'{model_name}' {'with model_id='+ model_id if model_id is not None else ''} does not exists in local store. Saving to BENTOML_HOME{' (path=' + os.environ.get('BENTOML_HOME', BentoMLContainer.bentoml_home.get()) + ')' if get_debug_mode() else ''}..."
      termui.echo(msg, fg="yellow", nl=True)
    _ref = serialisation.get(llm, auto_import=True)
    if impl == "pt" and is_torch_available() and torch.cuda.is_available(): torch.cuda.empty_cache()
@@ -518,16 +518,16 @@ def _start(
      framework: The framework to use for this LLM. By default, this is set to ``pt``.
      additional_args: Additional arguments to pass to ``openllm start``.
  """
-  fast = os.getenv("OPENLLM_FAST", str(fast)).upper() in ENV_VARS_TRUE_VALUES
+  fast = os.environ.get("OPENLLM_FAST", str(fast)).upper() in ENV_VARS_TRUE_VALUES
  llm_config = AutoConfig.for_model(model_name)
  _ModelEnv = EnvVarMixin(model_name, first_not_none(framework, default=llm_config.default_implementation()), model_id=model_id, bettertransformer=bettertransformer, quantize=quantize, runtime=runtime)
-  os.environ[_ModelEnv.framework] = _ModelEnv.framework_value
+  os.environ[_ModelEnv.framework] = _ModelEnv["framework_value"]

  args: ListStr = ["--runtime", runtime]
  if model_id: args.extend(["--model-id", model_id])
  if timeout: args.extend(["--server-timeout", str(timeout)])
  if workers_per_resource: args.extend(["--workers-per-resource", str(workers_per_resource) if not isinstance(workers_per_resource, str) else workers_per_resource])
-  if device and not os.getenv("CUDA_VISIBLE_DEVICES"): args.extend(["--device", ",".join(device)])
+  if device and not os.environ.get("CUDA_VISIBLE_DEVICES"): args.extend(["--device", ",".join(device)])
  if quantize and bettertransformer: raise OpenLLMException("'quantize' and 'bettertransformer' are currently mutually exclusive.")
  if quantize: args.extend(["--quantize", str(quantize)])
  elif bettertransformer: args.append("--bettertransformer")
@@ -722,15 +722,15 @@ def build_command(
  # NOTE: We set this environment variable so that our service.py logic won't raise RuntimeError
  # during build. This is a current limitation of bentoml build where we actually import the service.py into sys.path
  try:
-    os.environ.update({"OPENLLM_MODEL": inflection.underscore(model_name), env.runtime: str(env.runtime_value), "OPENLLM_SERIALIZATION": serialisation_format})
-    os.environ[env.model_id] = str(env.model_id_value)
-    os.environ[env.quantize] = str(env.quantize_value)
-    os.environ[env.bettertransformer] = str(env.bettertransformer_value)
+    os.environ.update({"OPENLLM_MODEL": inflection.underscore(model_name), env.runtime: str(env["runtime_value"]), "OPENLLM_SERIALIZATION": serialisation_format})
+    os.environ[env.model_id] = str(env["model_id_value"])
+    os.environ[env.quantize] = str(env["quantize_value"])
+    os.environ[env.bettertransformer] = str(env["bettertransformer_value"])

-    llm = infer_auto_class(env.framework_value).for_model(model_name, llm_config=llm_config, ensure_available=not fast, model_version=model_version, serialisation=serialisation_format, **attrs)
+    llm = infer_auto_class(env["framework_value"]).for_model(model_name, model_id=env["model_id_value"], llm_config=llm_config, ensure_available=not fast, model_version=model_version, serialisation=serialisation_format, **attrs)

    labels = dict(llm.identifying_params)
-    labels.update({"_type": llm.llm_type, "_framework": env.framework_value})
+    labels.update({"_type": llm.llm_type, "_framework": env["framework_value"]})
    workers_per_resource = first_not_none(workers_per_resource, default=llm_config["workers_per_resource"])

    with fs.open_fs(f"temp://llm_{llm_config['model_name']}") as llm_fs:
@@ -796,7 +796,7 @@ def build_command(

  if push: BentoMLContainer.bentocloud_client.get().push_bento(bento, context=t.cast(GlobalOptions, ctx.obj).cloud_context, force=True)
  elif containerize:
-    backend = t.cast("DefaultBuilder", os.getenv("BENTOML_CONTAINERIZE_BACKEND", "docker"))
+    backend = t.cast("DefaultBuilder", os.environ.get("BENTOML_CONTAINERIZE_BACKEND", "docker"))
    try:
      bentoml.container.health(backend)
    except subprocess.CalledProcessError:
--- a/src/openllm/cli/termui.py
+++ b/src/openllm/cli/termui.py
@@ -26,7 +26,7 @@ def echo(text: t.Any, fg: str = "green", _with_style: bool = True, **attrs: t.An
  attrs["fg"], call = fg if not get_debug_mode() else None, click.echo if not _with_style else click.secho
  if not get_quiet_mode(): call(text, **attrs)

-COLUMNS = int(os.getenv("COLUMNS", str(120)))
+COLUMNS: int = int(os.environ.get("COLUMNS", str(120)))

 CONTEXT_SETTINGS = {"help_option_names": ["-h", "--help"], "max_content_width": COLUMNS, "token_normalize_func": inflection.underscore}

--- a/src/openllm/models/auto/factory.py
+++ b/src/openllm/models/auto/factory.py
@@ -48,7 +48,7 @@ class BaseAutoLLMClass:
        >>> llm = openllm.AutoLLM.for_model("flan-t5")
        ```
        """
-    llm = cls.infer_class_from_name(model).from_pretrained(model_id, model_version=model_version, llm_config=llm_config, **attrs)
+    llm = cls.infer_class_from_name(model).from_pretrained(model_id=model_id, model_version=model_version, llm_config=llm_config, **attrs)
    if ensure_available: llm.ensure_model_id_exists()
    return llm

--- a/src/openllm/utils/init.py
+++ b/src/openllm/utils/init.py
@@ -124,7 +124,7 @@ def field_env_key(model_name: str, key: str, suffix: str | t.Literal[""] | None
  return "_".join(filter(None, map(str.upper, ["OPENLLM", model_name, suffix.strip("_") if suffix else "", key])))

 # Special debug flag controled via OPENLLMDEVDEBUG
-DEBUG = sys.flags.dev_mode or (not sys.flags.ignore_environment and bool(os.getenv(DEV_DEBUG_VAR)))
+DEBUG: bool = sys.flags.dev_mode or (not sys.flags.ignore_environment and bool(os.environ.get(DEV_DEBUG_VAR)))
 # MYPY is like t.TYPE_CHECKING, but reserved for Mypy plugins
 MYPY = False
 SHOW_CODEGEN = DEBUG and int(os.environ.get("OPENLLMDEVDEBUG", str(0))) > 3
--- a/src/openllm/utils/import_utils.py
+++ b/src/openllm/utils/import_utils.py
@@ -373,10 +373,11 @@ class EnvVarMixin(ReprMixin):
  def __getitem__(self, item: t.Literal["runtime_value"]) -> t.Literal["ggml", "transformers"]: ...
  # fmt: on
  def __getitem__(self, item: str | t.Any) -> t.Any:
-    if hasattr(self, item): return getattr(self, item)
+    if item.endswith("_value") and hasattr(self, f"_{item}"): return object.__getattribute__(self, f"_{item}")()
+    elif hasattr(self, item): return getattr(self, item)
    raise KeyError(f"Key {item} not found in {self}")
-  def __init__(self, model_name: str, implementation: LiteralRuntime = "pt", model_id: str | None = None, bettertransformer: bool | None = None, quantize: t.LiteralString | None = None,
-              runtime: t.Literal["ggml", "transformers"] = "transformers") -> None:
+
+  def __init__(self, model_name: str, implementation: LiteralRuntime = "pt", model_id: str | None = None, bettertransformer: bool | None = None, quantize: t.LiteralString | None = None, runtime: t.Literal["ggml", "transformers"] = "transformers") -> None:
    """EnvVarMixin is a mixin class that returns the value extracted from environment variables."""
    from .._configuration import field_env_key
    self.model_name = inflection.underscore(model_name)
@@ -385,20 +386,37 @@ class EnvVarMixin(ReprMixin):
    self._bettertransformer = bettertransformer
    self._quantize = quantize
    self._runtime = runtime
-    for att in {"config", "model_id", "quantize", "framework", "bettertransformer", "runtime"}: setattr(self, att, field_env_key(self.model_name, att.upper()))
+    for att in {"config", "model_id", "quantize", "framework", "bettertransformer", "runtime"}:
+      setattr(self, att, field_env_key(self.model_name, att.upper()))
+
  @property
-  def __repr_keys__(self) -> set[str]: return {"config", "model_id", "quantize", "framework", "bettertransformer", "runtime"}
+  def __repr_keys__(self) -> set[str]:
+    return {"config", "model_id", "quantize", "framework", "bettertransformer", "runtime"}
+
+  def _quantize_value(self) -> t.Literal["int8", "int4", "gptq"] | None:
+    from . import first_not_none
+    return t.cast(t.Optional[t.Literal["int8", "int4", "gptq"]], first_not_none(os.environ.get(self["quantize"]), default=self._quantize))
+
+  def _framework_value(self) -> LiteralRuntime:
+    from . import first_not_none
+    return t.cast(t.Literal["pt", "tf", "flax", "vllm"], first_not_none(os.environ.get(self["framework"]), default=self._implementation))
+
+  def _bettertransformer_value(self) -> bool:
+    """Return whether BetterTransformer is enabled: the environment variable wins when set, otherwise the value given to ``__init__`` is used."""
+    # NOTE: the truthiness check must happen *after* the fallback is resolved; comparing first always yields a bool and the fallback would never be consulted.
+    return os.environ.get(self["bettertransformer"], str(self._bettertransformer)).upper() in ENV_VARS_TRUE_VALUES
+
+  def _model_id_value(self) -> str | None:
+    from . import first_not_none
+    return first_not_none(os.environ.get(self["model_id"]), default=self._model_id)
+
+  def _runtime_value(self) -> t.Literal["ggml", "transformers"]:
+    from . import first_not_none
+    return t.cast(t.Literal["ggml", "transformers"], first_not_none(os.environ.get(self["runtime"]), default=self._runtime))
+
  @property
-  def quantize_value(self) -> t.Literal["int8", "int4", "gptq"] | None: return t.cast(t.Optional[t.Literal["int8", "int4", "gptq"]], os.getenv(self["quantize"], self._quantize))
+  def start_docstring(self) -> str:
+    return getattr(self.module, f"START_{self.model_name.upper()}_COMMAND_DOCSTRING")
+
  @property
-  def framework_value(self) -> LiteralRuntime: return t.cast(t.Literal["pt", "tf", "flax", "vllm"], os.getenv(self["framework"], self._implementation))
-  @property
-  def bettertransformer_value(self) -> bool: return os.getenv(self["bettertransformer"], str(self._bettertransformer)).upper() in ENV_VARS_TRUE_VALUES
-  @property
-  def model_id_value(self) -> str | None: return os.getenv(self["model_id"], self._model_id)
-  @property
-  def runtime_value(self) -> t.Literal["ggml", "transformers"]: return t.cast(t.Literal["ggml", "transformers"], os.getenv(self["runtime"], self._runtime))
-  @property
-  def start_docstring(self) -> str: return getattr(self.module, f"START_{self.model_name.upper()}_COMMAND_DOCSTRING")
-  @property
-  def module(self) -> _AnnotatedLazyLoader[t.LiteralString]: return _AnnotatedLazyLoader(self.model_name, globals(), f"openllm.models.{self.model_name}")
+  def module(self) -> _AnnotatedLazyLoader[t.LiteralString]:
+    return _AnnotatedLazyLoader(self.model_name, globals(), f"openllm.models.{self.model_name}")