diff --git a/changelog.d/669.change.md b/changelog.d/669.change.md
new file mode 100644
index 00000000..e3a08931
--- /dev/null
+++ b/changelog.d/669.change.md
@@ -0,0 +1,6 @@
+`openllm build` from 0.4.10 will start locking packages for hermeticity.
+
+We also remove some packages that are not required, since they should already be available in the base image.
+
+General codegen for service vars is improved: all variables are now saved statically in `_service_vars.py`, saving two environment-variable lookups per access.
+The environment variables for all of these variables are still set in the container for backwards compatibility.
diff --git a/mypy.ini b/mypy.ini
index 4d5342fd..e04ff06e 100644
--- a/mypy.ini
+++ b/mypy.ini
@@ -7,4 +7,4 @@ warn_unused_configs = True
 ignore_missing_imports = true
 check_untyped_defs = true
 warn_unreachable = true
-files = openllm-python/src/openllm/bundle/__init__.pyi, openllm-python/src/openllm/serialisation/__init__.pyi, openllm-client/src/openllm_client/__init__.pyi, openllm-client/src/openllm_client/_utils.pyi, openllm-python/src/openllm/__init__.pyi, openllm-client/src/openllm_client/_typing_compat.py, openllm-core/src/openllm_core/_typing_compat.py, openllm-python/src/openllm/client.pyi, openllm-python/src/openllm/bundle/_package.pyi, openllm-python/src/openllm/_runners.pyi, openllm-python/src/openllm/_quantisation.pyi, openllm-python/src/openllm/_llm.pyi, openllm-python/src/openllm/_generation.pyi, openllm-python/src/openllm/entrypoints/openai.pyi, openllm-python/src/openllm/entrypoints/__init__.pyi, openllm-python/src/openllm/entrypoints/hf.pyi, openllm-python/src/openllm/entrypoints/_openapi.pyi, openllm-python/src/openllm/entrypoints/cohere.pyi
+files = openllm-python/src/openllm/bundle/__init__.pyi, openllm-python/src/openllm/serialisation/__init__.pyi, openllm-client/src/openllm_client/__init__.pyi, openllm-client/src/openllm_client/_utils.pyi, openllm-python/src/openllm/__init__.pyi, openllm-client/src/openllm_client/_typing_compat.py, openllm-core/src/openllm_core/_typing_compat.py, openllm-python/src/openllm/client.pyi, openllm-python/src/openllm/bundle/_package.pyi, openllm-python/src/openllm/_runners.pyi, openllm-python/src/openllm/_quantisation.pyi, openllm-python/src/openllm/_llm.pyi, openllm-python/src/openllm/_generation.pyi, openllm-python/src/openllm/entrypoints/openai.pyi, openllm-python/src/openllm/entrypoints/__init__.pyi, openllm-python/src/openllm/entrypoints/hf.pyi, openllm-python/src/openllm/entrypoints/_openapi.pyi, openllm-python/src/openllm/entrypoints/cohere.pyi, openllm-python/src/openllm/_service_vars.pyi
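To illustrate the changelog entry, here is a minimal sketch (not part of the diff; names are hypothetical) of the difference between resolving configuration through `os.getenv` on every access and materialising it once at import time, which is what the generated `_service_vars.py` now does:

```python
# Hypothetical sketch -- not part of the diff. Shows why baking values into a
# generated module is cheaper than resolving them from the environment on
# every access.
import os

# Before: every call re-reads the process environment and re-applies the default.
def serialisation_dynamic() -> str:
  return os.getenv('OPENLLM_SERIALIZATION', 'safetensors')

# After: the generated _service_vars.py resolves the value once at import time,
# so every later use is a plain module-attribute read.
serialization = os.getenv('OPENLLM_SERIALIZATION', 'safetensors')

def serialisation_static() -> str:
  return serialization
```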
diff --git a/openllm-python/src/openllm/_service.py b/openllm-python/src/openllm/_service.py
index c61f7e26..72d60d8a 100644
--- a/openllm-python/src/openllm/_service.py
+++ b/openllm-python/src/openllm/_service.py
@@ -1,11 +1,8 @@
-# mypy: disable-error-code="call-arg,misc,attr-defined,type-abstract,type-arg,valid-type,arg-type"
 from __future__ import annotations
 import logging
-import os
 import typing as t

 import _service_vars as svars
-import orjson

 import bentoml
 import openllm
@@ -14,18 +11,17 @@ from bentoml.io import JSON, Text
 logger = logging.getLogger(__name__)

 llm = openllm.LLM[t.Any, t.Any](
-  svars.model_id,
+  model_id=svars.model_id,
   model_tag=svars.model_tag,
-  prompt_template=openllm.utils.first_not_none(os.getenv('OPENLLM_PROMPT_TEMPLATE'), None),
-  system_message=openllm.utils.first_not_none(os.getenv('OPENLLM_SYSTEM_MESSAGE'), None),
-  serialisation=openllm.utils.first_not_none(os.getenv('OPENLLM_SERIALIZATION'), 'safetensors'),
-  adapter_map=orjson.loads(svars.adapter_map),
-  trust_remote_code=openllm.utils.check_bool_env('TRUST_REMOTE_CODE', default=False),
+  prompt_template=svars.prompt_template,
+  system_message=svars.system_message,
+  serialisation=svars.serialization,
+  adapter_map=svars.adapter_map,
+  trust_remote_code=svars.trust_remote_code,
 )
-llm_config = llm.config
-svc = bentoml.Service(name=f"llm-{llm_config['start_name']}-service", runners=[llm.runner])
+svc = bentoml.Service(name=f"llm-{llm.config['start_name']}-service", runners=[llm.runner])

-llm_model_class = openllm.GenerationInput.from_llm_config(llm_config)
+llm_model_class = openllm.GenerationInput.from_llm_config(llm.config)

 @svc.api(
@@ -49,11 +45,11 @@ async def generate_stream_v1(input_dict: dict[str, t.Any]) -> t.AsyncGenerator[s

 _Metadata = openllm.MetadataOutput(
-  timeout=llm_config['timeout'],
-  model_name=llm_config['model_name'],
+  timeout=llm.config['timeout'],
+  model_name=llm.config['model_name'],
   backend=llm.__llm_backend__,
   model_id=llm.model_id,
-  configuration=llm_config.model_dump_json().decode(),
+  configuration=llm.config.model_dump_json().decode(),
   prompt_template=llm.runner.prompt_template,
   system_message=llm.runner.system_message,
 )
diff --git a/openllm-python/src/openllm/_service_vars.py b/openllm-python/src/openllm/_service_vars.py
index 49ceedc3..05900ff9 100644
--- a/openllm-python/src/openllm/_service_vars.py
+++ b/openllm-python/src/openllm/_service_vars.py
@@ -1,5 +1,13 @@
 import os
-model_id = os.environ['OPENLLM_MODEL_ID']  # openllm: model name
-model_tag = None  # openllm: model tag
-adapter_map = os.environ['OPENLLM_ADAPTER_MAP']  # openllm: model adapter map
+import orjson
+
+from openllm_core.utils import ENV_VARS_TRUE_VALUES
+
+model_id = os.environ['OPENLLM_MODEL_ID']
+model_tag = None
+adapter_map = orjson.loads(os.getenv('OPENLLM_ADAPTER_MAP', orjson.dumps(None)))
+prompt_template = os.getenv('OPENLLM_PROMPT_TEMPLATE')
+system_message = os.getenv('OPENLLM_SYSTEM_MESSAGE')
+serialization = os.getenv('OPENLLM_SERIALIZATION', default='safetensors')
+trust_remote_code = str(os.getenv('TRUST_REMOTE_CODE', default=str(False))).upper() in ENV_VARS_TRUE_VALUES
diff --git a/openllm-python/src/openllm/_service_vars.pyi b/openllm-python/src/openllm/_service_vars.pyi
new file mode 100644
index 00000000..bf89944c
--- /dev/null
+++ b/openllm-python/src/openllm/_service_vars.pyi
@@ -0,0 +1,11 @@
+from typing import Dict, Optional
+
+from openllm_core._typing_compat import LiteralSerialisation
+
+model_id: str = ...
+model_tag: Optional[str] = ...
+adapter_map: Optional[Dict[str, str]] = ...
+prompt_template: Optional[str] = ...
+system_message: Optional[str] = ...
+serialization: LiteralSerialisation = ...
+trust_remote_code: bool = ...
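Two details of the new `_service_vars.py` are worth spelling out: the `orjson` round-trip gives `adapter_map` a safe `None` default, and boolean flags are parsed by string comparison. A rough sketch of that behaviour, with `TRUE_VALUES` as a local stand-in for `openllm_core.utils.ENV_VARS_TRUE_VALUES` (assumed here to be a set of upper-cased truthy strings):

```python
# Rough sketch of the parsing behaviour in the new _service_vars.py.
# TRUE_VALUES is a local stand-in for openllm_core.utils.ENV_VARS_TRUE_VALUES.
import os
import orjson

TRUE_VALUES = {'1', 'ON', 'YES', 'TRUE'}

# orjson.dumps(None) == b'null', so a missing OPENLLM_ADAPTER_MAP now decodes
# to None instead of raising KeyError as os.environ['OPENLLM_ADAPTER_MAP'] did.
adapter_map = orjson.loads(os.getenv('OPENLLM_ADAPTER_MAP', orjson.dumps(None)))

# Boolean flags are parsed by upper-casing the raw string and testing membership.
trust_remote_code = str(os.getenv('TRUST_REMOTE_CODE', str(False))).upper() in TRUE_VALUES

print(adapter_map, trust_remote_code)  # None False when neither variable is set
```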
diff --git a/openllm-python/src/openllm/_service_vars_pkg.py b/openllm-python/src/openllm/_service_vars_pkg.py
index f7ed217b..8b1b1d97 100644
--- a/openllm-python/src/openllm/_service_vars_pkg.py
+++ b/openllm-python/src/openllm/_service_vars_pkg.py
@@ -1,3 +1,9 @@
+import orjson
+
 model_id = '{__model_id__}'  # openllm: model id
 model_tag = '{__model_tag__}'  # openllm: model tag
-adapter_map = """{__model_adapter_map__}"""  # openllm: model adapter map
+adapter_map = orjson.loads("""{__model_adapter_map__}""")  # openllm: model adapter map
+serialization = '{__model_serialization__}'  # openllm: model serialization
+prompt_template = {__model_prompt_template__}  # openllm: model prompt template
+system_message = {__model_system_message__}  # openllm: model system message
+trust_remote_code = {__model_trust_remote_code__}  # openllm: model trust remote code
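Since `_service_vars_pkg.py` is a template whose `{__...__}` placeholders are filled at build time, it may help to see what a rendered `_service_vars.py` could look like. Every value below is invented for illustration and would come from `openllm build` in practice:

```python
# Hypothetical render of the _service_vars_pkg.py template above; all values
# are illustrative, not taken from a real build.
import orjson

model_id = 'facebook/opt-125m'  # openllm: model id
model_tag = 'vllm-facebook--opt-125m:abc1234'  # openllm: model tag
adapter_map = orjson.loads("""null""")  # openllm: model adapter map
serialization = 'safetensors'  # openllm: model serialization
prompt_template = None  # openllm: model prompt template
system_message = None  # openllm: model system message
trust_remote_code = False  # openllm: model trust remote code
```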
f"'{environ['BENTOML_CONFIG_OPTIONS']}'", - 'TRUST_REMOTE_CODE': str(llm.trust_remote_code), - } - if adapter_map: - env_dict['BITSANDBYTES_NOWELCOME'] = os.environ.get('BITSANDBYTES_NOWELCOME', '1') - if llm._system_message: - env_dict['OPENLLM_SYSTEM_MESSAGE'] = repr(llm._system_message) - if llm._prompt_template: - env_dict['OPENLLM_PROMPT_TEMPLATE'] = repr(llm._prompt_template.to_string()) - if quantize: - env_dict['OPENLLM_QUANTIZE'] = str(quantize) + environ = process_environ( + llm.config, + llm.config['timeout'], + 1.0, + None, + True, + llm.model_id, + None, + llm._serialisation, + llm, + llm._system_message, + llm._prompt_template, + use_current_env=False, + ) return DockerOptions( base_image=oci.RefResolver.construct_base_image(container_registry, container_version_strategy), - env=env_dict, + env=environ, dockerfile_template=dockerfile_template, ) @@ -121,6 +105,10 @@ def construct_docker_options( OPENLLM_MODEL_ID = '# openllm: model id' OPENLLM_MODEL_TAG = '# openllm: model tag' OPENLLM_MODEL_ADAPTER_MAP = '# openllm: model adapter map' +OPENLLM_MODEL_PROMPT_TEMPLATE = '# openllm: model prompt template' +OPENLLM_MODEL_SYSTEM_MESSAGE = '# openllm: model system message' +OPENLLM_MODEL_SERIALIZATION = '# openllm: model serialization' +OPENLLM_MODEL_TRUST_REMOTE_CODE = '# openllm: model trust remote code' class _ServiceVarsFormatter(string.Formatter): @@ -156,6 +144,26 @@ class ModelAdapterMapFormatter(_ServiceVarsFormatter): identifier = OPENLLM_MODEL_ADAPTER_MAP +class ModelPromptTemplateFormatter(_ServiceVarsFormatter): + keyword = '__model_prompt_template__' + identifier = OPENLLM_MODEL_PROMPT_TEMPLATE + + +class ModelSystemMessageFormatter(_ServiceVarsFormatter): + keyword = '__model_system_message__' + identifier = OPENLLM_MODEL_SYSTEM_MESSAGE + + +class ModelSerializationFormatter(_ServiceVarsFormatter): + keyword = '__model_serialization__' + identifier = OPENLLM_MODEL_SERIALIZATION + + +class ModelTrustRemoteCodeFormatter(_ServiceVarsFormatter): + keyword = '__model_trust_remote_code__' + identifier = OPENLLM_MODEL_TRUST_REMOTE_CODE + + _service_file = Path(os.path.abspath(__file__)).parent.parent / '_service.py' _service_vars_file = Path(os.path.abspath(__file__)).parent.parent / '_service_vars_pkg.py' @@ -164,6 +172,8 @@ def write_service(llm, llm_fs, adapter_map): model_id_formatter = ModelIdFormatter(llm.model_id) model_tag_formatter = ModelTagFormatter(str(llm.tag)) adapter_map_formatter = ModelAdapterMapFormatter(orjson.dumps(adapter_map).decode()) + serialization_formatter = ModelSerializationFormatter(llm.config['serialisation']) + trust_remote_code_formatter = ModelTrustRemoteCodeFormatter(str(llm.trust_remote_code)) logger.debug( 'Generating service vars file for %s at %s (dir=%s)', llm.model_id, '_service_vars.py', llm_fs.getsyspath('/') @@ -177,6 +187,20 @@ def write_service(llm, llm_fs, adapter_map): src_contents[i] = model_tag_formatter.parse_line(it) elif adapter_map_formatter.identifier in it: src_contents[i] = adapter_map_formatter.parse_line(it) + elif serialization_formatter.identifier in it: + src_contents[i] = serialization_formatter.parse_line(it) + elif trust_remote_code_formatter.identifier in it: + src_contents[i] = trust_remote_code_formatter.parse_line(it) + elif OPENLLM_MODEL_PROMPT_TEMPLATE in it: + if llm._prompt_template: + src_contents[i] = ModelPromptTemplateFormatter(f'"""{llm._prompt_template.to_string()}"""').parse_line(it) + else: + src_contents[i] = ModelPromptTemplateFormatter(str(None)).parse_line(it) + elif 
@@ -164,6 +172,8 @@ def write_service(llm, llm_fs, adapter_map):
   model_id_formatter = ModelIdFormatter(llm.model_id)
   model_tag_formatter = ModelTagFormatter(str(llm.tag))
   adapter_map_formatter = ModelAdapterMapFormatter(orjson.dumps(adapter_map).decode())
+  serialization_formatter = ModelSerializationFormatter(llm.config['serialisation'])
+  trust_remote_code_formatter = ModelTrustRemoteCodeFormatter(str(llm.trust_remote_code))

   logger.debug(
     'Generating service vars file for %s at %s (dir=%s)', llm.model_id, '_service_vars.py', llm_fs.getsyspath('/')
@@ -177,6 +187,20 @@
       src_contents[i] = model_tag_formatter.parse_line(it)
     elif adapter_map_formatter.identifier in it:
       src_contents[i] = adapter_map_formatter.parse_line(it)
+    elif serialization_formatter.identifier in it:
+      src_contents[i] = serialization_formatter.parse_line(it)
+    elif trust_remote_code_formatter.identifier in it:
+      src_contents[i] = trust_remote_code_formatter.parse_line(it)
+    elif OPENLLM_MODEL_PROMPT_TEMPLATE in it:
+      if llm._prompt_template:
+        src_contents[i] = ModelPromptTemplateFormatter(f'"""{llm._prompt_template.to_string()}"""').parse_line(it)
+      else:
+        src_contents[i] = ModelPromptTemplateFormatter(str(None)).parse_line(it)
+    elif OPENLLM_MODEL_SYSTEM_MESSAGE in it:
+      if llm._system_message:
+        src_contents[i] = ModelSystemMessageFormatter(f'"""{llm._system_message}"""').parse_line(it)
+      else:
+        src_contents[i] = ModelSystemMessageFormatter(str(None)).parse_line(it)

   script = f"# GENERATED BY 'openllm build {llm.model_id}'. DO NOT EDIT\n\n" + ''.join(src_contents)
   if SHOW_CODEGEN:
diff --git a/openllm-python/src/openllm_cli/entrypoint.py b/openllm-python/src/openllm_cli/entrypoint.py
index 4d705656..91991b92 100644
--- a/openllm-python/src/openllm_cli/entrypoint.py
+++ b/openllm-python/src/openllm_cli/entrypoint.py
@@ -622,9 +622,22 @@ def start_grpc_command(

 def process_environ(
-  config, server_timeout, wpr, device, cors, model_id, adapter_map, serialisation, llm, system_message, prompt_template
+  config,
+  server_timeout,
+  wpr,
+  device,
+  cors,
+  model_id,
+  adapter_map,
+  serialisation,
+  llm,
+  system_message,
+  prompt_template,
+  use_current_env=True,
 ) -> t.Dict[str, t.Any]:
-  environ = parse_config_options(config, server_timeout, wpr, device, cors, os.environ.copy())
+  environ = parse_config_options(
+    config, server_timeout, wpr, device, cors, os.environ.copy() if use_current_env else {}
+  )
   environ.update(
     {
       'OPENLLM_MODEL_ID': model_id,
@@ -1019,22 +1032,21 @@ def build_command(
     ),
   )
   backend_warning(llm.__llm_backend__, build=True)
   os.environ.update(
-    {
-      'TORCH_DTYPE': dtype,
-      'OPENLLM_BACKEND': llm.__llm_backend__,
-      'OPENLLM_SERIALIZATION': llm._serialisation,
-      'OPENLLM_MODEL_ID': llm.model_id,
-      'OPENLLM_ADAPTER_MAP': orjson.dumps(None).decode(),
-    }
+    **process_environ(
+      llm.config,
+      llm.config['timeout'],
+      1.0,
+      None,
+      True,
+      llm.model_id,
+      None,
+      llm._serialisation,
+      llm,
+      llm._system_message,
+      llm._prompt_template,
+    )
   )
-  if llm.quantise:
-    os.environ['OPENLLM_QUANTIZE'] = str(llm.quantise)
-  if system_message:
-    os.environ['OPENLLM_SYSTEM_MESSAGE'] = system_message
-  if prompt_template:
-    os.environ['OPENLLM_PROMPT_TEMPLATE'] = prompt_template
   try:
     assert llm.bentomodel  # HACK: call it here to patch correct tag with revision and everything
@@ -1049,7 +1061,7 @@
     llm_fs.writetext('Dockerfile.template', dockerfile_template.read())
     dockerfile_template_path = llm_fs.getsyspath('/Dockerfile.template')

-  adapter_map: dict[str, str] | None = None
+  adapter_map = None
   if adapter_id and not build_ctx:
     ctx.fail("'build_ctx' is required when '--adapter-id' is passed.")
   if adapter_id:
diff --git a/ruff.toml b/ruff.toml
index 0b6f16c5..584858c3 100644
--- a/ruff.toml
+++ b/ruff.toml
@@ -122,3 +122,4 @@ docstring-quotes = "double"
 "openllm-core/src/openllm_core/utils/import_utils.py" = ["PLW0603", "F811"]
 "openllm-core/src/openllm_core/_configuration.py" = ["F811", "Q001"]
 "openllm-python/src/openllm/__init__.pyi" = ["I001"]
+"openllm-python/src/openllm/_service_vars_pkg.py" = ["F821"]
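The net effect of the new `use_current_env` flag is that `construct_docker_options` receives a dictionary containing only the computed values rather than a copy of the whole host environment. A condensed sketch of that behaviour (signature trimmed to two fields; not the real function):

```python
# Condensed sketch of the use_current_env switch; the real process_environ
# takes the full configuration set shown in the diff above.
import os
import typing as t

def process_environ_sketch(model_id: str, serialisation: str, use_current_env: bool = True) -> t.Dict[str, str]:
  environ: t.Dict[str, str] = os.environ.copy() if use_current_env else {}
  environ.update({'OPENLLM_MODEL_ID': model_id, 'OPENLLM_SERIALIZATION': serialisation})
  return environ

# Container build path: nothing from the host shell leaks into DockerOptions.env.
env = process_environ_sketch('facebook/opt-125m', 'safetensors', use_current_env=False)
assert set(env) == {'OPENLLM_MODEL_ID', 'OPENLLM_SERIALIZATION'}
```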