mirror of
https://github.com/bentoml/OpenLLM.git
synced 2026-04-22 16:07:24 -04:00
fix(vllm): correctly load given model id from envvar (#181)
This commit is contained in:
6
changelog.d/181.fix.md
Normal file
6
changelog.d/181.fix.md
Normal file
@@ -0,0 +1,6 @@
|
||||
Fixes a bug with `EnvVarMixin` where it did not respect environment variables for specific fields
|
||||
|
||||
This inherently caused confusing behaviour with `--model-id`. This has now been addressed on main
|
||||
|
||||
The base Docker image will now also include an installation of xformers built from source, pinned to a given commit hash, since the latest release of xformers
|
||||
is too old and would fail with vLLM when running within k8s
|
||||
@@ -622,7 +622,7 @@ def structure_settings(cl_: type[LLMConfig], cls: type[_ModelSettingsAttr]) -> _
|
||||
_final_value_dct["env"] = env
|
||||
|
||||
# bettertransformer support
|
||||
if _settings_attr["bettertransformer"] is None: _final_value_dct["bettertransformer"] = str(env.bettertransformer_value).upper() in ENV_VARS_TRUE_VALUES
|
||||
if _settings_attr["bettertransformer"] is None: _final_value_dct["bettertransformer"] = str(env["bettertransformer_value"]).upper() in ENV_VARS_TRUE_VALUES
|
||||
# if requires_gpu is True, then disable BetterTransformer for quantization.
|
||||
if _settings_attr["requires_gpu"]: _final_value_dct["bettertransformer"] = False
|
||||
_final_value_dct["service_name"] = f"generated_{model_name}_service.py"
|
||||
@@ -1485,4 +1485,6 @@ def structure_llm_config(data: DictStrAny, cls: type[LLMConfig]) -> LLMConfig:
|
||||
|
||||
bentoml_cattr.register_structure_hook_func(lambda cls: lenient_issubclass(cls, LLMConfig), structure_llm_config)
|
||||
|
||||
openllm_home = os.path.expanduser(os.getenv("OPENLLM_HOME", os.path.join(os.getenv("XDG_CACHE_HOME", os.path.join(os.path.expanduser("~"), ".cache")), "openllm")))
|
||||
openllm_home = os.path.expanduser(os.environ.get("OPENLLM_HOME", os.path.join(os.environ.get("XDG_CACHE_HOME", os.path.join(os.path.expanduser("~"), ".cache")), "openllm")))
|
||||
|
||||
__all__ = ["LLMConfig", "field_env_key"]
|
||||
|
||||
@@ -419,6 +419,11 @@ def _make_assignment_script(cls: type[LLM[M, T]]) -> t.Callable[[type[LLM[M, T]]
|
||||
else: func_call = f"_impl_{cls.__name__}_{func}={cached_func_name} if {cached_func_name} is not _cached_LLMInterface_get('{func}') else __serialisation_{func}"
|
||||
lines.extend([f"{cached_func_name}=cls.{func}", func_call, _setattr_class(func, f"{impl_name}(_impl_{cls.__name__}_{func})"),])
|
||||
|
||||
# assign vllm specific implementation
|
||||
if cls.__llm_implementation__ == "vllm":
|
||||
globs.update({"_vllm_generate": vllm_generate, "_vllm_postprocess_generate": vllm_postprocess_generate})
|
||||
lines.extend([_setattr_class(it, f"_vllm_{it}") for it in {"generate", "postprocess_generate"}])
|
||||
|
||||
# cached attribute initialisation
|
||||
interface_anns = codegen.get_annotations(LLMInterface)
|
||||
for v in {"bentomodel", "model", "tokenizer", "adapter_map"}:
|
||||
@@ -432,6 +437,17 @@ def _make_assignment_script(cls: type[LLM[M, T]]) -> t.Callable[[type[LLM[M, T]]
|
||||
anns[key] = interface_anns.get(key)
|
||||
return codegen.generate_function(cls, "__assign_llm_attr", lines, args=("cls", *args), globs=globs, annotations=anns)
|
||||
|
||||
def vllm_postprocess_generate(self: LLM["vllm.LLMEngine", T], prompt: str, generation_result: list[dict[str, t.Any]], **_: t.Any) -> str:
  """Extract the generated text from a vLLM generation result.

  Only the first output of the first result entry is returned; ``prompt`` and any
  extra keyword arguments are accepted for interface compatibility and ignored.
  """
  first_result = generation_result[0]
  first_output = first_result["outputs"][0]
  return first_output["text"]
|
||||
|
||||
def vllm_generate(self: LLM["vllm.LLMEngine", T], prompt: str, **attrs: t.Any) -> list[dict[str, t.Any]]:
  """Submit ``prompt`` to the vLLM engine and collect every finished output.

  A single request is queued under a random hex id, using sampling parameters derived from
  ``self.config`` overridden by ``attrs``. The engine is then stepped until it reports no
  unfinished requests, and each finished output is converted with ``unmarshal_vllm_outputs``.
  """
  # TODO: support prompt_token_ids
  request_id = uuid.uuid4().hex
  sampling_params = self.config.model_construct_env(**attrs).to_sampling_config()
  self.model.add_request(request_id=request_id, prompt=prompt, sampling_params=sampling_params)

  finished: list[vllm.RequestOutput] = []
  while self.model.has_unfinished_requests():
    for step_output in self.model.step():
      if step_output.finished: finished.append(step_output)
  return [unmarshal_vllm_outputs(output) for output in finished]
|
||||
|
||||
_AdaptersTuple: type[AdaptersTuple] = codegen.make_attr_tuple_class("AdaptersTuple", ["adapter_id", "name", "config"])
|
||||
|
||||
@attr.define(slots=True, repr=False, init=False)
|
||||
@@ -470,19 +486,6 @@ class LLM(LLMInterface[M, T], ReprMixin):
|
||||
_make_assignment_script(cls)(cls)
|
||||
if "tokenizer_id" not in cd and cls.__llm_implementation__ == "vllm": cls.tokenizer_id = _DEFAULT_TOKENIZER
|
||||
|
||||
if implementation == "vllm":
|
||||
def vllm_postprocess_generate(self: LLM["vllm.LLMEngine", T], prompt: str, generation_result: list[dict[str, t.Any]], **_: t.Any) -> str: return generation_result[0]["outputs"][0]["text"]
|
||||
def vllm_generate(self: LLM["vllm.LLMEngine", T], prompt: str, **attrs: t.Any) -> list[dict[str, t.Any]]:
|
||||
outputs: list[vllm.RequestOutput] = []
|
||||
# TODO: support prompt_token_ids
|
||||
self.model.add_request(request_id=str(uuid.uuid4().hex), prompt=prompt, sampling_params=self.config.model_construct_env(**attrs).to_sampling_config())
|
||||
while self.model.has_unfinished_requests():
|
||||
outputs.extend([r for r in self.model.step() if r.finished])
|
||||
return [unmarshal_vllm_outputs(i) for i in outputs]
|
||||
|
||||
_object_setattr(cls, "postprocess_generate", vllm_postprocess_generate)
|
||||
_object_setattr(cls, "generate", vllm_generate)
|
||||
|
||||
# fmt: off
|
||||
@overload
|
||||
def __getitem__(self, item: t.Literal["trust_remote_code"]) -> bool: ...
|
||||
@@ -586,10 +589,10 @@ class LLM(LLMInterface[M, T], ReprMixin):
|
||||
**attrs: The kwargs to be passed to the model.
|
||||
"""
|
||||
cfg_cls = cls.config_class
|
||||
model_id = first_not_none(model_id, os.getenv(cfg_cls.__openllm_env__["model_id"]), cfg_cls.__openllm_default_id__)
|
||||
model_id = first_not_none(model_id, os.environ.get(cfg_cls.__openllm_env__["model_id"]), cfg_cls.__openllm_default_id__)
|
||||
if model_id is None: raise RuntimeError("Failed to resolve a valid model_id.")
|
||||
if validate_is_path(model_id): model_id = resolve_filepath(model_id)
|
||||
quantize = first_not_none(quantize, t.cast(t.Optional[t.Literal["int8", "int4", "gptq"]], os.getenv(cfg_cls.__openllm_env__["quantize"])), default=None)
|
||||
quantize = first_not_none(quantize, t.cast(t.Optional[t.Literal["int8", "int4", "gptq"]], os.environ.get(cfg_cls.__openllm_env__["quantize"])), default=None)
|
||||
|
||||
# quantization setup
|
||||
if quantization_config and quantize: raise ValueError("'quantization_config' and 'quantize' are mutually exclusive. Either customise your quantization_config or use the 'quantize' argument.")
|
||||
@@ -614,10 +617,9 @@ class LLM(LLMInterface[M, T], ReprMixin):
|
||||
except Exception as err: raise OpenLLMException(f"Failed to generate a valid tag for {cfg_cls.__openllm_start_name__} with 'model_id={model_id}' (lookup to see its traceback):\n{err}") from err
|
||||
|
||||
return cls(
|
||||
*args, model_id=model_id, llm_config=llm_config, quantization_config=quantization_config,
|
||||
bettertransformer=str(first_not_none(bettertransformer, os.getenv(cfg_cls.__openllm_env__["bettertransformer"]), default=None)).upper() in ENV_VARS_TRUE_VALUES,
|
||||
_runtime=first_not_none(runtime, t.cast(t.Optional[t.Literal["ggml", "transformers"]], os.getenv(cfg_cls.__openllm_env__["runtime"])), default=cfg_cls.__openllm_runtime__),
|
||||
_adapters_mapping=resolve_peft_config_type(adapter_map) if adapter_map is not None else None, _quantize_method=quantize, _model_version=_tag.version, _tag=_tag, _serialisation_format=serialisation, **attrs
|
||||
*args, model_id=model_id, llm_config=llm_config, quantization_config=quantization_config, bettertransformer=str(first_not_none(bettertransformer, os.environ.get(cfg_cls.__openllm_env__["bettertransformer"]), default=None)).upper() in ENV_VARS_TRUE_VALUES,
|
||||
_runtime=first_not_none(runtime, t.cast(t.Optional[t.Literal["ggml", "transformers"]], os.environ.get(cfg_cls.__openllm_env__["runtime"])), default=cfg_cls.__openllm_runtime__), _adapters_mapping=resolve_peft_config_type(adapter_map)
|
||||
if adapter_map is not None else None, _quantize_method=quantize, _model_version=_tag.version, _tag=_tag, _serialisation_format=serialisation, **attrs
|
||||
)
|
||||
|
||||
@classmethod
|
||||
@@ -640,7 +642,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
|
||||
``str``: Generated tag format that can be parsed by ``bentoml.Tag``
|
||||
"""
|
||||
# specific branch for running in docker, this is very hacky, needs change upstream
|
||||
if in_docker() and os.getenv("BENTO_PATH") is not None: return ":".join(fs.path.parts(model_id)[-2:])
|
||||
if in_docker() and os.environ.get("BENTO_PATH") is not None: return ":".join(fs.path.parts(model_id)[-2:])
|
||||
|
||||
model_name = normalise_model_name(model_id)
|
||||
model_id, *maybe_revision = model_id.rsplit(":")
|
||||
@@ -649,7 +651,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
|
||||
return f"{cls.__llm_implementation__}-{model_name}:{maybe_revision[0]}"
|
||||
|
||||
tag_name = f"{cls.__llm_implementation__}-{model_name}"
|
||||
if os.getenv("OPENLLM_USE_LOCAL_LATEST", str(False)).upper() in ENV_VARS_TRUE_VALUES: return bentoml_cattr.unstructure(bentoml.models.get(f"{tag_name}{':'+model_version if model_version is not None else ''}").tag)
|
||||
if os.environ.get("OPENLLM_USE_LOCAL_LATEST", str(False)).upper() in ENV_VARS_TRUE_VALUES: return bentoml_cattr.unstructure(bentoml.models.get(f"{tag_name}{':'+model_version if model_version is not None else ''}").tag)
|
||||
if validate_is_path(model_id): model_id, model_version = resolve_filepath(model_id), first_not_none(model_version, default=generate_hash_from_file(model_id))
|
||||
else:
|
||||
_config = transformers.AutoConfig.from_pretrained(model_id, trust_remote_code=cls.config_class.__openllm_trust_remote_code__, revision=first_not_none(model_version, default="main"))
|
||||
@@ -1015,8 +1017,7 @@ def Runner(model_name: str, ensure_available: bool | None = None, init_local: bo
|
||||
behaviour
|
||||
"""
|
||||
if llm_config is not None:
|
||||
attrs.update({"model_id": llm_config["env"]["model_id_value"], "bettertransformer": llm_config["env"]["bettertransformer_value"], "quantize": llm_config["env"]["quantize_value"], "runtime": llm_config["env"]["runtime_value"],
|
||||
"serialisation": first_not_none(os.getenv("OPENLLM_SERIALIZATION"), attrs.get("serialisation"), default="safetensors")})
|
||||
attrs.update({"model_id": llm_config["env"]["model_id_value"], "bettertransformer": llm_config["env"]["bettertransformer_value"], "quantize": llm_config["env"]["quantize_value"], "runtime": llm_config["env"]["runtime_value"], "serialisation": first_not_none(os.environ.get("OPENLLM_SERIALIZATION"), attrs.get("serialisation"), default="safetensors")})
|
||||
|
||||
default_implementation = llm_config.default_implementation() if llm_config is not None else "pt"
|
||||
implementation = first_not_none(implementation, default=EnvVarMixin(model_name, default_implementation)["framework_value"])
|
||||
|
||||
@@ -86,7 +86,7 @@ def _parse_visible_devices(default_var: str = ..., *, respect_env: t.Literal[Fal
|
||||
def _parse_visible_devices(default_var: str | None = None, respect_env: bool = True) -> list[str] | None:
|
||||
"""CUDA_VISIBLE_DEVICES aware with default var for parsing spec."""
|
||||
if respect_env:
|
||||
spec = os.getenv("CUDA_VISIBLE_DEVICES", default_var)
|
||||
spec = os.environ.get("CUDA_VISIBLE_DEVICES", default_var)
|
||||
if not spec: return None
|
||||
else:
|
||||
if default_var is None: raise ValueError("spec is required to be not None when parsing spec.")
|
||||
@@ -370,11 +370,11 @@ class CascadingResourceStrategy(bentoml.Strategy, ReprMixin):
|
||||
if runnable_class.SUPPORTS_CPU_MULTI_THREADING:
|
||||
thread_count = math.ceil(cpus)
|
||||
for thread_env in THREAD_ENVS:
|
||||
environ[thread_env] = os.getenv(thread_env, str(thread_count))
|
||||
environ[thread_env] = os.environ.get(thread_env, str(thread_count))
|
||||
logger.debug("Environ for worker %s: %s", worker_index, environ)
|
||||
return environ
|
||||
for thread_env in THREAD_ENVS:
|
||||
environ[thread_env] = os.getenv(thread_env, "1")
|
||||
environ[thread_env] = os.environ.get(thread_env, "1")
|
||||
return environ
|
||||
return environ
|
||||
|
||||
|
||||
@@ -135,17 +135,17 @@ def construct_docker_options(
|
||||
_bentoml_config_options += " " if _bentoml_config_options else "" + " ".join(_bentoml_config_options_opts)
|
||||
env: EnvVarMixin = llm.config["env"]
|
||||
env_dict = {
|
||||
env.framework: env.framework_value, env.config: f"'{llm.config.model_dump_json().decode()}'", "OPENLLM_MODEL": llm.config["model_name"], "OPENLLM_SERIALIZATION": serialisation_format,
|
||||
"OPENLLM_ADAPTER_MAP": f"'{orjson.dumps(adapter_map).decode()}'", "BENTOML_DEBUG": str(True), "BENTOML_QUIET": str(False), "BENTOML_CONFIG_OPTIONS": f"'{_bentoml_config_options}'",
|
||||
env.model_id: f"/home/bentoml/bento/models/{llm.tag.path()}"}
|
||||
env.framework: env["framework_value"], env.config: f"'{llm.config.model_dump_json().decode()}'", "OPENLLM_MODEL": llm.config["model_name"], "OPENLLM_SERIALIZATION": serialisation_format, "OPENLLM_ADAPTER_MAP": f"'{orjson.dumps(adapter_map).decode()}'", "BENTOML_DEBUG": str(True), "BENTOML_QUIET": str(False), "BENTOML_CONFIG_OPTIONS": f"'{_bentoml_config_options}'",
|
||||
env.model_id: f"/home/bentoml/bento/models/{llm.tag.path()}"
|
||||
}
|
||||
if adapter_map: env_dict["BITSANDBYTES_NOWELCOME"] = os.environ.get("BITSANDBYTES_NOWELCOME", "1")
|
||||
|
||||
# We need to handle None separately here, as env from subprocess doesn't accept None value.
|
||||
_env = EnvVarMixin(llm.config["model_name"], bettertransformer=bettertransformer, quantize=quantize, runtime=runtime)
|
||||
|
||||
if _env.bettertransformer_value is not None: env_dict[_env.bettertransformer] = str(_env.bettertransformer_value)
|
||||
if _env.quantize_value is not None: env_dict[_env.quantize] = _env.quantize_value
|
||||
env_dict[_env.runtime] = _env.runtime_value
|
||||
env_dict[_env.bettertransformer] = str(_env["bettertransformer_value"])
|
||||
if _env["quantize_value"] is not None: env_dict[_env.quantize] = t.cast(str, _env["quantize_value"])
|
||||
env_dict[_env.runtime] = _env["runtime_value"]
|
||||
return DockerOptions(base_image=f"{oci.CONTAINER_NAMES[container_registry]}:{oci.get_base_container_tag(container_version_strategy)}", env=env_dict, dockerfile_template=dockerfile_template)
|
||||
|
||||
@inject
|
||||
|
||||
@@ -118,6 +118,20 @@ git fetch && git checkout ${COMMIT_HASH}
|
||||
TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6+PTX;8.9;9.0" python setup.py build
|
||||
EOT
|
||||
|
||||
# NOTE: Build xformers from source since the latest xformers release is too old
|
||||
FROM kernel-builder as xformers-builder
|
||||
|
||||
ENV COMMIT_HASH 2d3a2217c263419243b70c53f725213d1c386b0f
|
||||
ARG COMMIT_HASH=${COMMIT_HASH}
|
||||
|
||||
WORKDIR /usr/src
|
||||
|
||||
RUN <<EOT
|
||||
git clone https://github.com/facebookresearch/xformers.git && cd xformers
|
||||
git fetch && git checkout ${COMMIT_HASH}
|
||||
TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6+PTX;8.9;9.0" python setup.py build
|
||||
EOT
|
||||
|
||||
# base image
|
||||
FROM nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu22.04 as base-container
|
||||
|
||||
@@ -145,6 +159,9 @@ COPY --from=flash-attn-v2-builder /usr/src/flash-attention-v2/build/lib.linux-x8
|
||||
# Copy build artefacts for auto-gptq
|
||||
COPY --from=auto-gptq-builder /usr/src/AutoGPTQ/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
|
||||
|
||||
# Copy build artefacts for xformers
|
||||
# xformers is cloned and built in the dedicated `xformers-builder` stage, so its artefacts must be copied from there.
COPY --from=xformers-builder /usr/src/xformers/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
|
||||
|
||||
# Install required dependencies
|
||||
COPY src src
|
||||
COPY hatch.toml README.md CHANGELOG.md pyproject.toml ./
|
||||
@@ -158,7 +175,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Install all required dependencies
|
||||
RUN pip install "ray==2.6.0" "jax[cuda11_local]" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html ".[opt,mpt,fine-tune,llama,falcon,chatglm]" -v --no-cache-dir
|
||||
RUN pip install "ray==2.6.0" "einops" "jax[cuda11_local]" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html ".[opt,mpt,fine-tune,llama,chatglm]" -v --no-cache-dir
|
||||
|
||||
FROM base-container
|
||||
|
||||
|
||||
@@ -138,7 +138,7 @@ Available official model_id(s): [default: {llm_config['default_id']}]
|
||||
serialisation_format: t.Literal["safetensors", "legacy"], adapter_id: str | None, return_process: bool, **attrs: t.Any,
|
||||
) -> LLMConfig | subprocess.Popen[bytes]:
|
||||
fast = str(fast).upper() in ENV_VARS_TRUE_VALUES
|
||||
if serialisation_format == "safetensors" and quantize is not None and os.getenv("OPENLLM_SERIALIZATION_WARNING", str(True)).upper() in ENV_VARS_TRUE_VALUES:
|
||||
if serialisation_format == "safetensors" and quantize is not None and os.environ.get("OPENLLM_SERIALIZATION_WARNING", str(True)).upper() in ENV_VARS_TRUE_VALUES:
|
||||
termui.echo(f"'--quantize={quantize}' might not work with 'safetensors' serialisation format. Use with caution!. To silence this warning, set \"OPENLLM_SERIALIZATION_WARNING=False\"\nNote: You can always fallback to '--serialisation legacy' when running quantisation.", fg="yellow")
|
||||
adapter_map: dict[str, str | None] | None = attrs.pop(_adapter_mapping_key, None)
|
||||
config, server_attrs = llm_config.model_validate_click(**attrs)
|
||||
@@ -173,14 +173,14 @@ Available official model_id(s): [default: {llm_config['default_id']}]
|
||||
start_env = parse_config_options(config, server_timeout, wpr, device, start_env)
|
||||
if fast: termui.echo(f"Fast mode is enabled. Make sure the model is available in local store before 'start': 'openllm import {model}{' --model-id ' + model_id if model_id else ''}'", fg="yellow")
|
||||
|
||||
start_env.update({"OPENLLM_MODEL": model, "BENTOML_DEBUG": str(get_debug_mode()), "BENTOML_HOME": os.getenv("BENTOML_HOME", BentoMLContainer.bentoml_home.get()), "OPENLLM_ADAPTER_MAP": orjson.dumps(adapter_map).decode(), "OPENLLM_SERIALIZATION": serialisation_format, env.runtime: env.runtime_value, env.framework: env.framework_value})
|
||||
if env.model_id_value: start_env[env.model_id] = str(env.model_id_value)
|
||||
start_env.update({"OPENLLM_MODEL": model, "BENTOML_DEBUG": str(get_debug_mode()), "BENTOML_HOME": os.environ.get("BENTOML_HOME", BentoMLContainer.bentoml_home.get()), "OPENLLM_ADAPTER_MAP": orjson.dumps(adapter_map).decode(), "OPENLLM_SERIALIZATION": serialisation_format, env.runtime: env["runtime_value"], env.framework: env["framework_value"]})
|
||||
start_env[env.model_id] = str(env["model_id_value"])
|
||||
# NOTE: quantize and bettertransformer value is already assigned within env
|
||||
if bettertransformer is not None: start_env[env.bettertransformer] = str(env.bettertransformer_value)
|
||||
if quantize is not None: start_env[env.quantize] = str(env.quantize_value)
|
||||
if bettertransformer is not None: start_env[env.bettertransformer] = str(env["bettertransformer_value"])
|
||||
if quantize is not None: start_env[env.quantize] = str(t.cast(str, env["quantize_value"]))
|
||||
|
||||
llm = infer_auto_class(env.framework_value).for_model(model, model_version=model_version, llm_config=config, ensure_available=not fast, adapter_map=adapter_map, serialisation=serialisation_format)
|
||||
start_env.update({env.config: llm.config.model_dump_json().decode(), env.model_id: llm.model_id})
|
||||
llm = infer_auto_class(env["framework_value"]).for_model(model, model_id=start_env[env.model_id], model_version=model_version, llm_config=config, ensure_available=not fast, adapter_map=adapter_map, serialisation=serialisation_format)
|
||||
start_env.update({env.config: llm.config.model_dump_json().decode()})
|
||||
|
||||
server = bentoml.GrpcServer("_service.py:svc", **server_attrs) if _serve_grpc else bentoml.HTTPServer("_service.py:svc", **server_attrs)
|
||||
analytics.track_start_init(llm.config)
|
||||
|
||||
@@ -455,15 +455,15 @@ def import_command(model_name: str, model_id: str | None, converter: str | None,
|
||||
"""
|
||||
llm_config = AutoConfig.for_model(model_name)
|
||||
env = EnvVarMixin(model_name, llm_config.default_implementation(), model_id=model_id, runtime=runtime, quantize=quantize)
|
||||
impl: LiteralRuntime = first_not_none(implementation, default=env.framework_value)
|
||||
llm = infer_auto_class(impl).for_model(model_name, llm_config=llm_config, model_version=model_version, ensure_available=False, serialisation=serialisation_format)
|
||||
impl: LiteralRuntime = first_not_none(implementation, default=env["framework_value"])
|
||||
llm = infer_auto_class(impl).for_model(model_name, model_id=env["model_id_value"], llm_config=llm_config, model_version=model_version, ensure_available=False, serialisation=serialisation_format)
|
||||
_previously_saved = False
|
||||
try:
|
||||
_ref = serialisation.get(llm)
|
||||
_previously_saved = True
|
||||
except bentoml.exceptions.NotFound:
|
||||
if not machine and output == "pretty":
|
||||
msg = f"'{model_name}' {'with model_id='+ model_id if model_id is not None else ''} does not exists in local store. Saving to BENTOML_HOME{' (path=' + os.getenv('BENTOML_HOME', BentoMLContainer.bentoml_home.get()) + ')' if get_debug_mode() else ''}..."
|
||||
msg = f"'{model_name}' {'with model_id='+ model_id if model_id is not None else ''} does not exists in local store. Saving to BENTOML_HOME{' (path=' + os.environ.get('BENTOML_HOME', BentoMLContainer.bentoml_home.get()) + ')' if get_debug_mode() else ''}..."
|
||||
termui.echo(msg, fg="yellow", nl=True)
|
||||
_ref = serialisation.get(llm, auto_import=True)
|
||||
if impl == "pt" and is_torch_available() and torch.cuda.is_available(): torch.cuda.empty_cache()
|
||||
@@ -518,16 +518,16 @@ def _start(
|
||||
framework: The framework to use for this LLM. By default, this is set to ``pt``.
|
||||
additional_args: Additional arguments to pass to ``openllm start``.
|
||||
"""
|
||||
fast = os.getenv("OPENLLM_FAST", str(fast)).upper() in ENV_VARS_TRUE_VALUES
|
||||
fast = os.environ.get("OPENLLM_FAST", str(fast)).upper() in ENV_VARS_TRUE_VALUES
|
||||
llm_config = AutoConfig.for_model(model_name)
|
||||
_ModelEnv = EnvVarMixin(model_name, first_not_none(framework, default=llm_config.default_implementation()), model_id=model_id, bettertransformer=bettertransformer, quantize=quantize, runtime=runtime)
|
||||
os.environ[_ModelEnv.framework] = _ModelEnv.framework_value
|
||||
os.environ[_ModelEnv.framework] = _ModelEnv["framework_value"]
|
||||
|
||||
args: ListStr = ["--runtime", runtime]
|
||||
if model_id: args.extend(["--model-id", model_id])
|
||||
if timeout: args.extend(["--server-timeout", str(timeout)])
|
||||
if workers_per_resource: args.extend(["--workers-per-resource", str(workers_per_resource) if not isinstance(workers_per_resource, str) else workers_per_resource])
|
||||
if device and not os.getenv("CUDA_VISIBLE_DEVICES"): args.extend(["--device", ",".join(device)])
|
||||
if device and not os.environ.get("CUDA_VISIBLE_DEVICES"): args.extend(["--device", ",".join(device)])
|
||||
if quantize and bettertransformer: raise OpenLLMException("'quantize' and 'bettertransformer' are currently mutually exclusive.")
|
||||
if quantize: args.extend(["--quantize", str(quantize)])
|
||||
elif bettertransformer: args.append("--bettertransformer")
|
||||
@@ -722,15 +722,15 @@ def build_command(
|
||||
# NOTE: We set this environment variable so that our service.py logic won't raise RuntimeError
|
||||
# during build. This is a current limitation of bentoml build where we actually import the service.py into sys.path
|
||||
try:
|
||||
os.environ.update({"OPENLLM_MODEL": inflection.underscore(model_name), env.runtime: str(env.runtime_value), "OPENLLM_SERIALIZATION": serialisation_format})
|
||||
os.environ[env.model_id] = str(env.model_id_value)
|
||||
os.environ[env.quantize] = str(env.quantize_value)
|
||||
os.environ[env.bettertransformer] = str(env.bettertransformer_value)
|
||||
os.environ.update({"OPENLLM_MODEL": inflection.underscore(model_name), env.runtime: str(env["runtime_value"]), "OPENLLM_SERIALIZATION": serialisation_format})
|
||||
os.environ[env.model_id] = str(env["model_id_value"])
|
||||
os.environ[env.quantize] = str(env["quantize_value"])
|
||||
os.environ[env.bettertransformer] = str(env["bettertransformer_value"])
|
||||
|
||||
llm = infer_auto_class(env.framework_value).for_model(model_name, llm_config=llm_config, ensure_available=not fast, model_version=model_version, serialisation=serialisation_format, **attrs)
|
||||
llm = infer_auto_class(env["framework_value"]).for_model(model_name, model_id=env["model_id_value"], llm_config=llm_config, ensure_available=not fast, model_version=model_version, serialisation=serialisation_format, **attrs)
|
||||
|
||||
labels = dict(llm.identifying_params)
|
||||
labels.update({"_type": llm.llm_type, "_framework": env.framework_value})
|
||||
labels.update({"_type": llm.llm_type, "_framework": env["framework_value"]})
|
||||
workers_per_resource = first_not_none(workers_per_resource, default=llm_config["workers_per_resource"])
|
||||
|
||||
with fs.open_fs(f"temp://llm_{llm_config['model_name']}") as llm_fs:
|
||||
@@ -796,7 +796,7 @@ def build_command(
|
||||
|
||||
if push: BentoMLContainer.bentocloud_client.get().push_bento(bento, context=t.cast(GlobalOptions, ctx.obj).cloud_context, force=True)
|
||||
elif containerize:
|
||||
backend = t.cast("DefaultBuilder", os.getenv("BENTOML_CONTAINERIZE_BACKEND", "docker"))
|
||||
backend = t.cast("DefaultBuilder", os.environ.get("BENTOML_CONTAINERIZE_BACKEND", "docker"))
|
||||
try:
|
||||
bentoml.container.health(backend)
|
||||
except subprocess.CalledProcessError:
|
||||
|
||||
@@ -26,7 +26,7 @@ def echo(text: t.Any, fg: str = "green", _with_style: bool = True, **attrs: t.An
|
||||
attrs["fg"], call = fg if not get_debug_mode() else None, click.echo if not _with_style else click.secho
|
||||
if not get_quiet_mode(): call(text, **attrs)
|
||||
|
||||
COLUMNS = int(os.getenv("COLUMNS", str(120)))
|
||||
COLUMNS: int = int(os.environ.get("COLUMNS", str(120)))
|
||||
|
||||
CONTEXT_SETTINGS = {"help_option_names": ["-h", "--help"], "max_content_width": COLUMNS, "token_normalize_func": inflection.underscore}
|
||||
|
||||
|
||||
@@ -48,7 +48,7 @@ class BaseAutoLLMClass:
|
||||
>>> llm = openllm.AutoLLM.for_model("flan-t5")
|
||||
```
|
||||
"""
|
||||
llm = cls.infer_class_from_name(model).from_pretrained(model_id, model_version=model_version, llm_config=llm_config, **attrs)
|
||||
llm = cls.infer_class_from_name(model).from_pretrained(model_id=model_id, model_version=model_version, llm_config=llm_config, **attrs)
|
||||
if ensure_available: llm.ensure_model_id_exists()
|
||||
return llm
|
||||
|
||||
|
||||
@@ -124,7 +124,7 @@ def field_env_key(model_name: str, key: str, suffix: str | t.Literal[""] | None
|
||||
return "_".join(filter(None, map(str.upper, ["OPENLLM", model_name, suffix.strip("_") if suffix else "", key])))
|
||||
|
||||
# Special debug flag controlled via OPENLLMDEVDEBUG
|
||||
DEBUG = sys.flags.dev_mode or (not sys.flags.ignore_environment and bool(os.getenv(DEV_DEBUG_VAR)))
|
||||
DEBUG: bool = sys.flags.dev_mode or (not sys.flags.ignore_environment and bool(os.environ.get(DEV_DEBUG_VAR)))
|
||||
# MYPY is like t.TYPE_CHECKING, but reserved for Mypy plugins
|
||||
MYPY = False
|
||||
SHOW_CODEGEN = DEBUG and int(os.environ.get("OPENLLMDEVDEBUG", str(0))) > 3
|
||||
|
||||
@@ -373,10 +373,11 @@ class EnvVarMixin(ReprMixin):
|
||||
def __getitem__(self, item: t.Literal["runtime_value"]) -> t.Literal["ggml", "transformers"]: ...
|
||||
# fmt: on
|
||||
def __getitem__(self, item: str | t.Any) -> t.Any:
|
||||
if hasattr(self, item): return getattr(self, item)
|
||||
if item.endswith("_value") and hasattr(self, f"_{item}"): return object.__getattribute__(self, f"_{item}")()
|
||||
elif hasattr(self, item): return getattr(self, item)
|
||||
raise KeyError(f"Key {item} not found in {self}")
|
||||
def __init__(self, model_name: str, implementation: LiteralRuntime = "pt", model_id: str | None = None, bettertransformer: bool | None = None, quantize: t.LiteralString | None = None,
|
||||
runtime: t.Literal["ggml", "transformers"] = "transformers") -> None:
|
||||
|
||||
def __init__(self, model_name: str, implementation: LiteralRuntime = "pt", model_id: str | None = None, bettertransformer: bool | None = None, quantize: t.LiteralString | None = None, runtime: t.Literal["ggml", "transformers"] = "transformers") -> None:
|
||||
"""EnvVarMixin is a mixin class that returns the value extracted from environment variables."""
|
||||
from .._configuration import field_env_key
|
||||
self.model_name = inflection.underscore(model_name)
|
||||
@@ -385,20 +386,37 @@ class EnvVarMixin(ReprMixin):
|
||||
self._bettertransformer = bettertransformer
|
||||
self._quantize = quantize
|
||||
self._runtime = runtime
|
||||
for att in {"config", "model_id", "quantize", "framework", "bettertransformer", "runtime"}: setattr(self, att, field_env_key(self.model_name, att.upper()))
|
||||
for att in {"config", "model_id", "quantize", "framework", "bettertransformer", "runtime"}:
|
||||
setattr(self, att, field_env_key(self.model_name, att.upper()))
|
||||
|
||||
@property
|
||||
def __repr_keys__(self) -> set[str]: return {"config", "model_id", "quantize", "framework", "bettertransformer", "runtime"}
|
||||
def __repr_keys__(self) -> set[str]:
|
||||
return {"config", "model_id", "quantize", "framework", "bettertransformer", "runtime"}
|
||||
|
||||
def _quantize_value(self) -> t.Literal["int8", "int4", "gptq"] | None:
  """Resolve the quantization method.

  Looks up the environment variable named by ``self["quantize"]`` and falls back to the
  value given to the constructor (``self._quantize``).
  NOTE(review): relies on ``first_not_none`` returning its first non-None argument, else ``default`` -- confirm.
  """
  from . import first_not_none
  env_override = os.environ.get(self["quantize"])
  resolved = first_not_none(env_override, default=self._quantize)
  return t.cast(t.Optional[t.Literal["int8", "int4", "gptq"]], resolved)
|
||||
|
||||
def _framework_value(self) -> LiteralRuntime:
  """Resolve the framework implementation.

  Looks up the environment variable named by ``self["framework"]`` and falls back to the
  implementation given to the constructor (``self._implementation``).
  NOTE(review): relies on ``first_not_none`` returning its first non-None argument, else ``default`` -- confirm.
  """
  from . import first_not_none
  env_override = os.environ.get(self["framework"])
  resolved = first_not_none(env_override, default=self._implementation)
  return t.cast(t.Literal["pt", "tf", "flax", "vllm"], resolved)
|
||||
|
||||
def _bettertransformer_value(self) -> bool:
  """Resolve whether BetterTransformer is enabled.

  The environment variable named by ``self["bettertransformer"]`` takes precedence when it is set
  (compared case-insensitively against ``ENV_VARS_TRUE_VALUES``). When it is unset, the value passed
  to the constructor (``self._bettertransformer``) is used, with ``None`` treated as ``False``.
  """
  env_value = os.environ.get(self["bettertransformer"])
  # Only coerce to bool once we know the variable is set. Converting first always yields a
  # non-None bool, which made the constructor value unreachable as a fallback.
  if env_value is not None: return env_value.upper() in ENV_VARS_TRUE_VALUES
  return bool(self._bettertransformer)
|
||||
|
||||
def _model_id_value(self) -> str | None:
  """Resolve the model id.

  Looks up the environment variable named by ``self["model_id"]`` and falls back to the
  model id given to the constructor (``self._model_id``), which may be ``None``.
  NOTE(review): relies on ``first_not_none`` returning its first non-None argument, else ``default`` -- confirm.
  """
  from . import first_not_none
  env_override = os.environ.get(self["model_id"])
  return first_not_none(env_override, default=self._model_id)
|
||||
|
||||
def _runtime_value(self) -> t.Literal["ggml", "transformers"]:
  """Resolve the runtime.

  Looks up the environment variable named by ``self["runtime"]`` and falls back to the
  runtime given to the constructor (``self._runtime``).
  NOTE(review): relies on ``first_not_none`` returning its first non-None argument, else ``default`` -- confirm.
  """
  from . import first_not_none
  env_override = os.environ.get(self["runtime"])
  resolved = first_not_none(env_override, default=self._runtime)
  return t.cast(t.Literal["ggml", "transformers"], resolved)
|
||||
|
||||
@property
|
||||
def quantize_value(self) -> t.Literal["int8", "int4", "gptq"] | None: return t.cast(t.Optional[t.Literal["int8", "int4", "gptq"]], os.getenv(self["quantize"], self._quantize))
|
||||
def start_docstring(self) -> str:
|
||||
return getattr(self.module, f"START_{self.model_name.upper()}_COMMAND_DOCSTRING")
|
||||
|
||||
@property
|
||||
def framework_value(self) -> LiteralRuntime: return t.cast(t.Literal["pt", "tf", "flax", "vllm"], os.getenv(self["framework"], self._implementation))
|
||||
@property
|
||||
def bettertransformer_value(self) -> bool: return os.getenv(self["bettertransformer"], str(self._bettertransformer)).upper() in ENV_VARS_TRUE_VALUES
|
||||
@property
|
||||
def model_id_value(self) -> str | None: return os.getenv(self["model_id"], self._model_id)
|
||||
@property
|
||||
def runtime_value(self) -> t.Literal["ggml", "transformers"]: return t.cast(t.Literal["ggml", "transformers"], os.getenv(self["runtime"], self._runtime))
|
||||
@property
|
||||
def start_docstring(self) -> str: return getattr(self.module, f"START_{self.model_name.upper()}_COMMAND_DOCSTRING")
|
||||
@property
|
||||
def module(self) -> _AnnotatedLazyLoader[t.LiteralString]: return _AnnotatedLazyLoader(self.model_name, globals(), f"openllm.models.{self.model_name}")
|
||||
def module(self) -> _AnnotatedLazyLoader[t.LiteralString]:
|
||||
return _AnnotatedLazyLoader(self.model_name, globals(), f"openllm.models.{self.model_name}")
|
||||
|
||||
Reference in New Issue
Block a user