fix(vllm): correctly load given model id from envvar (#181)

This commit is contained in:
Aaron Pham
2023-08-03 16:34:35 -04:00
committed by GitHub
parent db8e47bc5b
commit 2cc264aa72
12 changed files with 119 additions and 75 deletions

6
changelog.d/181.fix.md Normal file
View File

@@ -0,0 +1,6 @@
Fixes a bug with `EnvVarMixin` where it didn't respect environment variables for specific fields.
This inherently provided confusing behaviour with `--model-id`. This has now been addressed on main.
The base Docker image will now also include an installation of xformers built from source, locked at a given commit hash, since the latest release of xformers
is too old and would fail with vLLM when running within Kubernetes.

View File

@@ -622,7 +622,7 @@ def structure_settings(cl_: type[LLMConfig], cls: type[_ModelSettingsAttr]) -> _
_final_value_dct["env"] = env
# bettertransformer support
if _settings_attr["bettertransformer"] is None: _final_value_dct["bettertransformer"] = str(env.bettertransformer_value).upper() in ENV_VARS_TRUE_VALUES
if _settings_attr["bettertransformer"] is None: _final_value_dct["bettertransformer"] = str(env["bettertransformer_value"]).upper() in ENV_VARS_TRUE_VALUES
# if requires_gpu is True, then disable BetterTransformer for quantization.
if _settings_attr["requires_gpu"]: _final_value_dct["bettertransformer"] = False
_final_value_dct["service_name"] = f"generated_{model_name}_service.py"
@@ -1485,4 +1485,6 @@ def structure_llm_config(data: DictStrAny, cls: type[LLMConfig]) -> LLMConfig:
bentoml_cattr.register_structure_hook_func(lambda cls: lenient_issubclass(cls, LLMConfig), structure_llm_config)
openllm_home = os.path.expanduser(os.getenv("OPENLLM_HOME", os.path.join(os.getenv("XDG_CACHE_HOME", os.path.join(os.path.expanduser("~"), ".cache")), "openllm")))
openllm_home = os.path.expanduser(os.environ.get("OPENLLM_HOME", os.path.join(os.environ.get("XDG_CACHE_HOME", os.path.join(os.path.expanduser("~"), ".cache")), "openllm")))
__all__ = ["LLMConfig", "field_env_key"]

View File

@@ -419,6 +419,11 @@ def _make_assignment_script(cls: type[LLM[M, T]]) -> t.Callable[[type[LLM[M, T]]
else: func_call = f"_impl_{cls.__name__}_{func}={cached_func_name} if {cached_func_name} is not _cached_LLMInterface_get('{func}') else __serialisation_{func}"
lines.extend([f"{cached_func_name}=cls.{func}", func_call, _setattr_class(func, f"{impl_name}(_impl_{cls.__name__}_{func})"),])
# assign vllm specific implementation
if cls.__llm_implementation__ == "vllm":
globs.update({"_vllm_generate": vllm_generate, "_vllm_postprocess_generate": vllm_postprocess_generate})
lines.extend([_setattr_class(it, f"_vllm_{it}") for it in {"generate", "postprocess_generate"}])
# cached attribute initialisation
interface_anns = codegen.get_annotations(LLMInterface)
for v in {"bentomodel", "model", "tokenizer", "adapter_map"}:
@@ -432,6 +437,17 @@ def _make_assignment_script(cls: type[LLM[M, T]]) -> t.Callable[[type[LLM[M, T]]
anns[key] = interface_anns.get(key)
return codegen.generate_function(cls, "__assign_llm_attr", lines, args=("cls", *args), globs=globs, annotations=anns)
def vllm_postprocess_generate(self: LLM["vllm.LLMEngine", T], prompt: str, generation_result: list[dict[str, t.Any]], **_: t.Any) -> str:
return generation_result[0]["outputs"][0]["text"]
def vllm_generate(self: LLM["vllm.LLMEngine", T], prompt: str, **attrs: t.Any) -> list[dict[str, t.Any]]:
outputs: list[vllm.RequestOutput] = []
# TODO: support prompt_token_ids
self.model.add_request(request_id=str(uuid.uuid4().hex), prompt=prompt, sampling_params=self.config.model_construct_env(**attrs).to_sampling_config())
while self.model.has_unfinished_requests():
outputs.extend([r for r in self.model.step() if r.finished])
return [unmarshal_vllm_outputs(i) for i in outputs]
_AdaptersTuple: type[AdaptersTuple] = codegen.make_attr_tuple_class("AdaptersTuple", ["adapter_id", "name", "config"])
@attr.define(slots=True, repr=False, init=False)
@@ -470,19 +486,6 @@ class LLM(LLMInterface[M, T], ReprMixin):
_make_assignment_script(cls)(cls)
if "tokenizer_id" not in cd and cls.__llm_implementation__ == "vllm": cls.tokenizer_id = _DEFAULT_TOKENIZER
if implementation == "vllm":
def vllm_postprocess_generate(self: LLM["vllm.LLMEngine", T], prompt: str, generation_result: list[dict[str, t.Any]], **_: t.Any) -> str: return generation_result[0]["outputs"][0]["text"]
def vllm_generate(self: LLM["vllm.LLMEngine", T], prompt: str, **attrs: t.Any) -> list[dict[str, t.Any]]:
outputs: list[vllm.RequestOutput] = []
# TODO: support prompt_token_ids
self.model.add_request(request_id=str(uuid.uuid4().hex), prompt=prompt, sampling_params=self.config.model_construct_env(**attrs).to_sampling_config())
while self.model.has_unfinished_requests():
outputs.extend([r for r in self.model.step() if r.finished])
return [unmarshal_vllm_outputs(i) for i in outputs]
_object_setattr(cls, "postprocess_generate", vllm_postprocess_generate)
_object_setattr(cls, "generate", vllm_generate)
# fmt: off
@overload
def __getitem__(self, item: t.Literal["trust_remote_code"]) -> bool: ...
@@ -586,10 +589,10 @@ class LLM(LLMInterface[M, T], ReprMixin):
**attrs: The kwargs to be passed to the model.
"""
cfg_cls = cls.config_class
model_id = first_not_none(model_id, os.getenv(cfg_cls.__openllm_env__["model_id"]), cfg_cls.__openllm_default_id__)
model_id = first_not_none(model_id, os.environ.get(cfg_cls.__openllm_env__["model_id"]), cfg_cls.__openllm_default_id__)
if model_id is None: raise RuntimeError("Failed to resolve a valid model_id.")
if validate_is_path(model_id): model_id = resolve_filepath(model_id)
quantize = first_not_none(quantize, t.cast(t.Optional[t.Literal["int8", "int4", "gptq"]], os.getenv(cfg_cls.__openllm_env__["quantize"])), default=None)
quantize = first_not_none(quantize, t.cast(t.Optional[t.Literal["int8", "int4", "gptq"]], os.environ.get(cfg_cls.__openllm_env__["quantize"])), default=None)
# quantization setup
if quantization_config and quantize: raise ValueError("'quantization_config' and 'quantize' are mutually exclusive. Either customise your quantization_config or use the 'quantize' argument.")
@@ -614,10 +617,9 @@ class LLM(LLMInterface[M, T], ReprMixin):
except Exception as err: raise OpenLLMException(f"Failed to generate a valid tag for {cfg_cls.__openllm_start_name__} with 'model_id={model_id}' (lookup to see its traceback):\n{err}") from err
return cls(
*args, model_id=model_id, llm_config=llm_config, quantization_config=quantization_config,
bettertransformer=str(first_not_none(bettertransformer, os.getenv(cfg_cls.__openllm_env__["bettertransformer"]), default=None)).upper() in ENV_VARS_TRUE_VALUES,
_runtime=first_not_none(runtime, t.cast(t.Optional[t.Literal["ggml", "transformers"]], os.getenv(cfg_cls.__openllm_env__["runtime"])), default=cfg_cls.__openllm_runtime__),
_adapters_mapping=resolve_peft_config_type(adapter_map) if adapter_map is not None else None, _quantize_method=quantize, _model_version=_tag.version, _tag=_tag, _serialisation_format=serialisation, **attrs
*args, model_id=model_id, llm_config=llm_config, quantization_config=quantization_config, bettertransformer=str(first_not_none(bettertransformer, os.environ.get(cfg_cls.__openllm_env__["bettertransformer"]), default=None)).upper() in ENV_VARS_TRUE_VALUES,
_runtime=first_not_none(runtime, t.cast(t.Optional[t.Literal["ggml", "transformers"]], os.environ.get(cfg_cls.__openllm_env__["runtime"])), default=cfg_cls.__openllm_runtime__), _adapters_mapping=resolve_peft_config_type(adapter_map)
if adapter_map is not None else None, _quantize_method=quantize, _model_version=_tag.version, _tag=_tag, _serialisation_format=serialisation, **attrs
)
@classmethod
@@ -640,7 +642,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
``str``: Generated tag format that can be parsed by ``bentoml.Tag``
"""
# specific branch for running in docker, this is very hacky, needs change upstream
if in_docker() and os.getenv("BENTO_PATH") is not None: return ":".join(fs.path.parts(model_id)[-2:])
if in_docker() and os.environ.get("BENTO_PATH") is not None: return ":".join(fs.path.parts(model_id)[-2:])
model_name = normalise_model_name(model_id)
model_id, *maybe_revision = model_id.rsplit(":")
@@ -649,7 +651,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
return f"{cls.__llm_implementation__}-{model_name}:{maybe_revision[0]}"
tag_name = f"{cls.__llm_implementation__}-{model_name}"
if os.getenv("OPENLLM_USE_LOCAL_LATEST", str(False)).upper() in ENV_VARS_TRUE_VALUES: return bentoml_cattr.unstructure(bentoml.models.get(f"{tag_name}{':'+model_version if model_version is not None else ''}").tag)
if os.environ.get("OPENLLM_USE_LOCAL_LATEST", str(False)).upper() in ENV_VARS_TRUE_VALUES: return bentoml_cattr.unstructure(bentoml.models.get(f"{tag_name}{':'+model_version if model_version is not None else ''}").tag)
if validate_is_path(model_id): model_id, model_version = resolve_filepath(model_id), first_not_none(model_version, default=generate_hash_from_file(model_id))
else:
_config = transformers.AutoConfig.from_pretrained(model_id, trust_remote_code=cls.config_class.__openllm_trust_remote_code__, revision=first_not_none(model_version, default="main"))
@@ -1015,8 +1017,7 @@ def Runner(model_name: str, ensure_available: bool | None = None, init_local: bo
behaviour
"""
if llm_config is not None:
attrs.update({"model_id": llm_config["env"]["model_id_value"], "bettertransformer": llm_config["env"]["bettertransformer_value"], "quantize": llm_config["env"]["quantize_value"], "runtime": llm_config["env"]["runtime_value"],
"serialisation": first_not_none(os.getenv("OPENLLM_SERIALIZATION"), attrs.get("serialisation"), default="safetensors")})
attrs.update({"model_id": llm_config["env"]["model_id_value"], "bettertransformer": llm_config["env"]["bettertransformer_value"], "quantize": llm_config["env"]["quantize_value"], "runtime": llm_config["env"]["runtime_value"], "serialisation": first_not_none(os.environ.get("OPENLLM_SERIALIZATION"), attrs.get("serialisation"), default="safetensors")})
default_implementation = llm_config.default_implementation() if llm_config is not None else "pt"
implementation = first_not_none(implementation, default=EnvVarMixin(model_name, default_implementation)["framework_value"])

View File

@@ -86,7 +86,7 @@ def _parse_visible_devices(default_var: str = ..., *, respect_env: t.Literal[Fal
def _parse_visible_devices(default_var: str | None = None, respect_env: bool = True) -> list[str] | None:
"""CUDA_VISIBLE_DEVICES aware with default var for parsing spec."""
if respect_env:
spec = os.getenv("CUDA_VISIBLE_DEVICES", default_var)
spec = os.environ.get("CUDA_VISIBLE_DEVICES", default_var)
if not spec: return None
else:
if default_var is None: raise ValueError("spec is required to be not None when parsing spec.")
@@ -370,11 +370,11 @@ class CascadingResourceStrategy(bentoml.Strategy, ReprMixin):
if runnable_class.SUPPORTS_CPU_MULTI_THREADING:
thread_count = math.ceil(cpus)
for thread_env in THREAD_ENVS:
environ[thread_env] = os.getenv(thread_env, str(thread_count))
environ[thread_env] = os.environ.get(thread_env, str(thread_count))
logger.debug("Environ for worker %s: %s", worker_index, environ)
return environ
for thread_env in THREAD_ENVS:
environ[thread_env] = os.getenv(thread_env, "1")
environ[thread_env] = os.environ.get(thread_env, "1")
return environ
return environ

View File

@@ -135,17 +135,17 @@ def construct_docker_options(
_bentoml_config_options += " " if _bentoml_config_options else "" + " ".join(_bentoml_config_options_opts)
env: EnvVarMixin = llm.config["env"]
env_dict = {
env.framework: env.framework_value, env.config: f"'{llm.config.model_dump_json().decode()}'", "OPENLLM_MODEL": llm.config["model_name"], "OPENLLM_SERIALIZATION": serialisation_format,
"OPENLLM_ADAPTER_MAP": f"'{orjson.dumps(adapter_map).decode()}'", "BENTOML_DEBUG": str(True), "BENTOML_QUIET": str(False), "BENTOML_CONFIG_OPTIONS": f"'{_bentoml_config_options}'",
env.model_id: f"/home/bentoml/bento/models/{llm.tag.path()}"}
env.framework: env["framework_value"], env.config: f"'{llm.config.model_dump_json().decode()}'", "OPENLLM_MODEL": llm.config["model_name"], "OPENLLM_SERIALIZATION": serialisation_format, "OPENLLM_ADAPTER_MAP": f"'{orjson.dumps(adapter_map).decode()}'", "BENTOML_DEBUG": str(True), "BENTOML_QUIET": str(False), "BENTOML_CONFIG_OPTIONS": f"'{_bentoml_config_options}'",
env.model_id: f"/home/bentoml/bento/models/{llm.tag.path()}"
}
if adapter_map: env_dict["BITSANDBYTES_NOWELCOME"] = os.environ.get("BITSANDBYTES_NOWELCOME", "1")
# We need to handle None separately here, as env from subprocess doesn't accept None value.
_env = EnvVarMixin(llm.config["model_name"], bettertransformer=bettertransformer, quantize=quantize, runtime=runtime)
if _env.bettertransformer_value is not None: env_dict[_env.bettertransformer] = str(_env.bettertransformer_value)
if _env.quantize_value is not None: env_dict[_env.quantize] = _env.quantize_value
env_dict[_env.runtime] = _env.runtime_value
env_dict[_env.bettertransformer] = str(_env["bettertransformer_value"])
if _env["quantize_value"] is not None: env_dict[_env.quantize] = t.cast(str, _env["quantize_value"])
env_dict[_env.runtime] = _env["runtime_value"]
return DockerOptions(base_image=f"{oci.CONTAINER_NAMES[container_registry]}:{oci.get_base_container_tag(container_version_strategy)}", env=env_dict, dockerfile_template=dockerfile_template)
@inject

View File

@@ -118,6 +118,20 @@ git fetch && git checkout ${COMMIT_HASH}
TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6+PTX;8.9;9.0" python setup.py build
EOT
# NOTE: Build xformers from source since the latest xformers are too old
FROM kernel-builder as xformers-builder
ENV COMMIT_HASH 2d3a2217c263419243b70c53f725213d1c386b0f
ARG COMMIT_HASH=${COMMIT_HASH}
WORKDIR /usr/src
RUN <<EOT
git clone https://github.com/facebookresearch/xformers.git && cd xformers
git fetch && git checkout ${COMMIT_HASH}
TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6+PTX;8.9;9.0" python setup.py build
EOT
# base image
FROM nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu22.04 as base-container
@@ -145,6 +159,9 @@ COPY --from=flash-attn-v2-builder /usr/src/flash-attention-v2/build/lib.linux-x8
# Copy build artefacts for auto-gptq
COPY --from=auto-gptq-builder /usr/src/AutoGPTQ/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
# Copy build artefacts for xformers
COPY --from=xformers-builder /usr/src/xformers/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
# Install required dependencies
COPY src src
COPY hatch.toml README.md CHANGELOG.md pyproject.toml ./
@@ -158,7 +175,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins
rm -rf /var/lib/apt/lists/*
# Install all required dependencies
RUN pip install "ray==2.6.0" "jax[cuda11_local]" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html ".[opt,mpt,fine-tune,llama,falcon,chatglm]" -v --no-cache-dir
RUN pip install "ray==2.6.0" "einops" "jax[cuda11_local]" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html ".[opt,mpt,fine-tune,llama,chatglm]" -v --no-cache-dir
FROM base-container

View File

@@ -138,7 +138,7 @@ Available official model_id(s): [default: {llm_config['default_id']}]
serialisation_format: t.Literal["safetensors", "legacy"], adapter_id: str | None, return_process: bool, **attrs: t.Any,
) -> LLMConfig | subprocess.Popen[bytes]:
fast = str(fast).upper() in ENV_VARS_TRUE_VALUES
if serialisation_format == "safetensors" and quantize is not None and os.getenv("OPENLLM_SERIALIZATION_WARNING", str(True)).upper() in ENV_VARS_TRUE_VALUES:
if serialisation_format == "safetensors" and quantize is not None and os.environ.get("OPENLLM_SERIALIZATION_WARNING", str(True)).upper() in ENV_VARS_TRUE_VALUES:
termui.echo(f"'--quantize={quantize}' might not work with 'safetensors' serialisation format. Use with caution!. To silence this warning, set \"OPENLLM_SERIALIZATION_WARNING=False\"\nNote: You can always fallback to '--serialisation legacy' when running quantisation.", fg="yellow")
adapter_map: dict[str, str | None] | None = attrs.pop(_adapter_mapping_key, None)
config, server_attrs = llm_config.model_validate_click(**attrs)
@@ -173,14 +173,14 @@ Available official model_id(s): [default: {llm_config['default_id']}]
start_env = parse_config_options(config, server_timeout, wpr, device, start_env)
if fast: termui.echo(f"Fast mode is enabled. Make sure the model is available in local store before 'start': 'openllm import {model}{' --model-id ' + model_id if model_id else ''}'", fg="yellow")
start_env.update({"OPENLLM_MODEL": model, "BENTOML_DEBUG": str(get_debug_mode()), "BENTOML_HOME": os.getenv("BENTOML_HOME", BentoMLContainer.bentoml_home.get()), "OPENLLM_ADAPTER_MAP": orjson.dumps(adapter_map).decode(), "OPENLLM_SERIALIZATION": serialisation_format, env.runtime: env.runtime_value, env.framework: env.framework_value})
if env.model_id_value: start_env[env.model_id] = str(env.model_id_value)
start_env.update({"OPENLLM_MODEL": model, "BENTOML_DEBUG": str(get_debug_mode()), "BENTOML_HOME": os.environ.get("BENTOML_HOME", BentoMLContainer.bentoml_home.get()), "OPENLLM_ADAPTER_MAP": orjson.dumps(adapter_map).decode(), "OPENLLM_SERIALIZATION": serialisation_format, env.runtime: env["runtime_value"], env.framework: env["framework_value"]})
start_env[env.model_id] = str(env["model_id_value"])
# NOTE: quantize and bettertransformer value is already assigned within env
if bettertransformer is not None: start_env[env.bettertransformer] = str(env.bettertransformer_value)
if quantize is not None: start_env[env.quantize] = str(env.quantize_value)
if bettertransformer is not None: start_env[env.bettertransformer] = str(env["bettertransformer_value"])
if quantize is not None: start_env[env.quantize] = str(t.cast(str, env["quantize_value"]))
llm = infer_auto_class(env.framework_value).for_model(model, model_version=model_version, llm_config=config, ensure_available=not fast, adapter_map=adapter_map, serialisation=serialisation_format)
start_env.update({env.config: llm.config.model_dump_json().decode(), env.model_id: llm.model_id})
llm = infer_auto_class(env["framework_value"]).for_model(model, model_id=start_env[env.model_id], model_version=model_version, llm_config=config, ensure_available=not fast, adapter_map=adapter_map, serialisation=serialisation_format)
start_env.update({env.config: llm.config.model_dump_json().decode()})
server = bentoml.GrpcServer("_service.py:svc", **server_attrs) if _serve_grpc else bentoml.HTTPServer("_service.py:svc", **server_attrs)
analytics.track_start_init(llm.config)

View File

@@ -455,15 +455,15 @@ def import_command(model_name: str, model_id: str | None, converter: str | None,
"""
llm_config = AutoConfig.for_model(model_name)
env = EnvVarMixin(model_name, llm_config.default_implementation(), model_id=model_id, runtime=runtime, quantize=quantize)
impl: LiteralRuntime = first_not_none(implementation, default=env.framework_value)
llm = infer_auto_class(impl).for_model(model_name, llm_config=llm_config, model_version=model_version, ensure_available=False, serialisation=serialisation_format)
impl: LiteralRuntime = first_not_none(implementation, default=env["framework_value"])
llm = infer_auto_class(impl).for_model(model_name, model_id=env["model_id_value"], llm_config=llm_config, model_version=model_version, ensure_available=False, serialisation=serialisation_format)
_previously_saved = False
try:
_ref = serialisation.get(llm)
_previously_saved = True
except bentoml.exceptions.NotFound:
if not machine and output == "pretty":
msg = f"'{model_name}' {'with model_id='+ model_id if model_id is not None else ''} does not exists in local store. Saving to BENTOML_HOME{' (path=' + os.getenv('BENTOML_HOME', BentoMLContainer.bentoml_home.get()) + ')' if get_debug_mode() else ''}..."
msg = f"'{model_name}' {'with model_id='+ model_id if model_id is not None else ''} does not exists in local store. Saving to BENTOML_HOME{' (path=' + os.environ.get('BENTOML_HOME', BentoMLContainer.bentoml_home.get()) + ')' if get_debug_mode() else ''}..."
termui.echo(msg, fg="yellow", nl=True)
_ref = serialisation.get(llm, auto_import=True)
if impl == "pt" and is_torch_available() and torch.cuda.is_available(): torch.cuda.empty_cache()
@@ -518,16 +518,16 @@ def _start(
framework: The framework to use for this LLM. By default, this is set to ``pt``.
additional_args: Additional arguments to pass to ``openllm start``.
"""
fast = os.getenv("OPENLLM_FAST", str(fast)).upper() in ENV_VARS_TRUE_VALUES
fast = os.environ.get("OPENLLM_FAST", str(fast)).upper() in ENV_VARS_TRUE_VALUES
llm_config = AutoConfig.for_model(model_name)
_ModelEnv = EnvVarMixin(model_name, first_not_none(framework, default=llm_config.default_implementation()), model_id=model_id, bettertransformer=bettertransformer, quantize=quantize, runtime=runtime)
os.environ[_ModelEnv.framework] = _ModelEnv.framework_value
os.environ[_ModelEnv.framework] = _ModelEnv["framework_value"]
args: ListStr = ["--runtime", runtime]
if model_id: args.extend(["--model-id", model_id])
if timeout: args.extend(["--server-timeout", str(timeout)])
if workers_per_resource: args.extend(["--workers-per-resource", str(workers_per_resource) if not isinstance(workers_per_resource, str) else workers_per_resource])
if device and not os.getenv("CUDA_VISIBLE_DEVICES"): args.extend(["--device", ",".join(device)])
if device and not os.environ.get("CUDA_VISIBLE_DEVICES"): args.extend(["--device", ",".join(device)])
if quantize and bettertransformer: raise OpenLLMException("'quantize' and 'bettertransformer' are currently mutually exclusive.")
if quantize: args.extend(["--quantize", str(quantize)])
elif bettertransformer: args.append("--bettertransformer")
@@ -722,15 +722,15 @@ def build_command(
# NOTE: We set this environment variable so that our service.py logic won't raise RuntimeError
# during build. This is a current limitation of bentoml build where we actually import the service.py into sys.path
try:
os.environ.update({"OPENLLM_MODEL": inflection.underscore(model_name), env.runtime: str(env.runtime_value), "OPENLLM_SERIALIZATION": serialisation_format})
os.environ[env.model_id] = str(env.model_id_value)
os.environ[env.quantize] = str(env.quantize_value)
os.environ[env.bettertransformer] = str(env.bettertransformer_value)
os.environ.update({"OPENLLM_MODEL": inflection.underscore(model_name), env.runtime: str(env["runtime_value"]), "OPENLLM_SERIALIZATION": serialisation_format})
os.environ[env.model_id] = str(env["model_id_value"])
os.environ[env.quantize] = str(env["quantize_value"])
os.environ[env.bettertransformer] = str(env["bettertransformer_value"])
llm = infer_auto_class(env.framework_value).for_model(model_name, llm_config=llm_config, ensure_available=not fast, model_version=model_version, serialisation=serialisation_format, **attrs)
llm = infer_auto_class(env["framework_value"]).for_model(model_name, model_id=env["model_id_value"], llm_config=llm_config, ensure_available=not fast, model_version=model_version, serialisation=serialisation_format, **attrs)
labels = dict(llm.identifying_params)
labels.update({"_type": llm.llm_type, "_framework": env.framework_value})
labels.update({"_type": llm.llm_type, "_framework": env["framework_value"]})
workers_per_resource = first_not_none(workers_per_resource, default=llm_config["workers_per_resource"])
with fs.open_fs(f"temp://llm_{llm_config['model_name']}") as llm_fs:
@@ -796,7 +796,7 @@ def build_command(
if push: BentoMLContainer.bentocloud_client.get().push_bento(bento, context=t.cast(GlobalOptions, ctx.obj).cloud_context, force=True)
elif containerize:
backend = t.cast("DefaultBuilder", os.getenv("BENTOML_CONTAINERIZE_BACKEND", "docker"))
backend = t.cast("DefaultBuilder", os.environ.get("BENTOML_CONTAINERIZE_BACKEND", "docker"))
try:
bentoml.container.health(backend)
except subprocess.CalledProcessError:

View File

@@ -26,7 +26,7 @@ def echo(text: t.Any, fg: str = "green", _with_style: bool = True, **attrs: t.An
attrs["fg"], call = fg if not get_debug_mode() else None, click.echo if not _with_style else click.secho
if not get_quiet_mode(): call(text, **attrs)
COLUMNS = int(os.getenv("COLUMNS", str(120)))
COLUMNS: int = int(os.environ.get("COLUMNS", str(120)))
CONTEXT_SETTINGS = {"help_option_names": ["-h", "--help"], "max_content_width": COLUMNS, "token_normalize_func": inflection.underscore}

View File

@@ -48,7 +48,7 @@ class BaseAutoLLMClass:
>>> llm = openllm.AutoLLM.for_model("flan-t5")
```
"""
llm = cls.infer_class_from_name(model).from_pretrained(model_id, model_version=model_version, llm_config=llm_config, **attrs)
llm = cls.infer_class_from_name(model).from_pretrained(model_id=model_id, model_version=model_version, llm_config=llm_config, **attrs)
if ensure_available: llm.ensure_model_id_exists()
return llm

View File

@@ -124,7 +124,7 @@ def field_env_key(model_name: str, key: str, suffix: str | t.Literal[""] | None
return "_".join(filter(None, map(str.upper, ["OPENLLM", model_name, suffix.strip("_") if suffix else "", key])))
# Special debug flag controlled via OPENLLMDEVDEBUG
DEBUG = sys.flags.dev_mode or (not sys.flags.ignore_environment and bool(os.getenv(DEV_DEBUG_VAR)))
DEBUG: bool = sys.flags.dev_mode or (not sys.flags.ignore_environment and bool(os.environ.get(DEV_DEBUG_VAR)))
# MYPY is like t.TYPE_CHECKING, but reserved for Mypy plugins
MYPY = False
SHOW_CODEGEN = DEBUG and int(os.environ.get("OPENLLMDEVDEBUG", str(0))) > 3

View File

@@ -373,10 +373,11 @@ class EnvVarMixin(ReprMixin):
def __getitem__(self, item: t.Literal["runtime_value"]) -> t.Literal["ggml", "transformers"]: ...
# fmt: on
def __getitem__(self, item: str | t.Any) -> t.Any:
if hasattr(self, item): return getattr(self, item)
if item.endswith("_value") and hasattr(self, f"_{item}"): return object.__getattribute__(self, f"_{item}")()
elif hasattr(self, item): return getattr(self, item)
raise KeyError(f"Key {item} not found in {self}")
def __init__(self, model_name: str, implementation: LiteralRuntime = "pt", model_id: str | None = None, bettertransformer: bool | None = None, quantize: t.LiteralString | None = None,
runtime: t.Literal["ggml", "transformers"] = "transformers") -> None:
def __init__(self, model_name: str, implementation: LiteralRuntime = "pt", model_id: str | None = None, bettertransformer: bool | None = None, quantize: t.LiteralString | None = None, runtime: t.Literal["ggml", "transformers"] = "transformers") -> None:
"""EnvVarMixin is a mixin class that returns the value extracted from environment variables."""
from .._configuration import field_env_key
self.model_name = inflection.underscore(model_name)
@@ -385,20 +386,37 @@ class EnvVarMixin(ReprMixin):
self._bettertransformer = bettertransformer
self._quantize = quantize
self._runtime = runtime
for att in {"config", "model_id", "quantize", "framework", "bettertransformer", "runtime"}: setattr(self, att, field_env_key(self.model_name, att.upper()))
for att in {"config", "model_id", "quantize", "framework", "bettertransformer", "runtime"}:
setattr(self, att, field_env_key(self.model_name, att.upper()))
@property
def __repr_keys__(self) -> set[str]: return {"config", "model_id", "quantize", "framework", "bettertransformer", "runtime"}
def __repr_keys__(self) -> set[str]:
return {"config", "model_id", "quantize", "framework", "bettertransformer", "runtime"}
def _quantize_value(self) -> t.Literal["int8", "int4", "gptq"] | None:
from . import first_not_none
return t.cast(t.Optional[t.Literal["int8", "int4", "gptq"]], first_not_none(os.environ.get(self["quantize"]), default=self._quantize))
def _framework_value(self) -> LiteralRuntime:
from . import first_not_none
return t.cast(t.Literal["pt", "tf", "flax", "vllm"], first_not_none(os.environ.get(self["framework"]), default=self._implementation))
def _bettertransformer_value(self) -> bool:
from . import first_not_none
return t.cast(bool, first_not_none(os.environ.get(self["bettertransformer"], str(False)).upper() in ENV_VARS_TRUE_VALUES, default=self._bettertransformer))
def _model_id_value(self) -> str | None:
from . import first_not_none
return first_not_none(os.environ.get(self["model_id"]), default=self._model_id)
def _runtime_value(self) -> t.Literal["ggml", "transformers"]:
from . import first_not_none
return t.cast(t.Literal["ggml", "transformers"], first_not_none(os.environ.get(self["runtime"]), default=self._runtime))
@property
def quantize_value(self) -> t.Literal["int8", "int4", "gptq"] | None: return t.cast(t.Optional[t.Literal["int8", "int4", "gptq"]], os.getenv(self["quantize"], self._quantize))
def start_docstring(self) -> str:
return getattr(self.module, f"START_{self.model_name.upper()}_COMMAND_DOCSTRING")
@property
def framework_value(self) -> LiteralRuntime: return t.cast(t.Literal["pt", "tf", "flax", "vllm"], os.getenv(self["framework"], self._implementation))
@property
def bettertransformer_value(self) -> bool: return os.getenv(self["bettertransformer"], str(self._bettertransformer)).upper() in ENV_VARS_TRUE_VALUES
@property
def model_id_value(self) -> str | None: return os.getenv(self["model_id"], self._model_id)
@property
def runtime_value(self) -> t.Literal["ggml", "transformers"]: return t.cast(t.Literal["ggml", "transformers"], os.getenv(self["runtime"], self._runtime))
@property
def start_docstring(self) -> str: return getattr(self.module, f"START_{self.model_name.upper()}_COMMAND_DOCSTRING")
@property
def module(self) -> _AnnotatedLazyLoader[t.LiteralString]: return _AnnotatedLazyLoader(self.model_name, globals(), f"openllm.models.{self.model_name}")
def module(self) -> _AnnotatedLazyLoader[t.LiteralString]:
return _AnnotatedLazyLoader(self.model_name, globals(), f"openllm.models.{self.model_name}")