diff --git a/openllm-python/src/openllm/_llm.py b/openllm-python/src/openllm/_llm.py
index 61952026..4e16f9c1 100644
--- a/openllm-python/src/openllm/_llm.py
+++ b/openllm-python/src/openllm/_llm.py
@@ -9,7 +9,6 @@ import types
 import typing as t
 
 import attr
-import fs.path
 import inflection
 import orjson
 
@@ -500,10 +499,6 @@ class LLM(LLMInterface[M, T], ReprMixin):
     Returns:
         ``str``: Generated tag format that can be parsed by ``bentoml.Tag``
     '''
-    # specific branch for running in docker or kubernetes, this is very hacky,
-    # and probably need a better way to support custom path
-    if os.environ.get('BENTO_PATH') is not None: return ':'.join(fs.path.parts(model_id)[-2:])
-
     model_name = normalise_model_name(model_id)
     model_id, *maybe_revision = model_id.rsplit(':')
     if len(maybe_revision) > 0:
@@ -1122,11 +1117,12 @@ def Runner(model_name: str,
     init_local: If True, it will initialize the model locally. This is useful if you want to run the model locally. (Symmetrical to bentoml.Runner.init_local())
     **attrs: The rest of kwargs will then be passed to the LLM. Refer to the LLM documentation for the kwargs behaviour
   '''
+
   if llm_config is not None:
     attrs.update({
-        'model_id': llm_config['env']['model_id_value'],
+        'model_id': attrs.get('model_id') or llm_config['env']['model_id_value'],
         'quantize': llm_config['env']['quantize_value'],
-        'serialisation': first_not_none(os.environ.get('OPENLLM_SERIALIZATION'), attrs.get('serialisation'), default=llm_config['serialisation']),
+        'serialisation': first_not_none(attrs.get('serialisation'), os.environ.get('OPENLLM_SERIALIZATION'), default=llm_config['serialisation']),
         'system_message': first_not_none(os.environ.get('OPENLLM_SYSTEM_MESSAGE'), attrs.get('system_message'), None),
         'prompt_template': first_not_none(os.environ.get('OPENLLM_PROMPT_TEMPLATE'), attrs.get('prompt_template'), None),
     })
diff --git a/openllm-python/src/openllm/_service.py b/openllm-python/src/openllm/_service.py
index 37e2201b..e5aeea27 100644
--- a/openllm-python/src/openllm/_service.py
+++ b/openllm-python/src/openllm/_service.py
@@ -1,9 +1,9 @@
 # mypy: disable-error-code="call-arg,misc,attr-defined,type-abstract,type-arg,valid-type,arg-type"
 from __future__ import annotations
-import os
 import typing as t
 import warnings
 
+import _service_vars as svars
 import orjson
 
 from starlette.applications import Starlette
@@ -28,10 +28,17 @@ warnings.filterwarnings('ignore', message='MatMul8bitLt: inputs will be cast fro
 warnings.filterwarnings('ignore', message='MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization')
 warnings.filterwarnings('ignore', message='The installed version of bitsandbytes was compiled without GPU support.')
 
-model = os.environ.get('OPENLLM_MODEL', '{__model_name__}')  # openllm: model name
-adapter_map = os.environ.get('OPENLLM_ADAPTER_MAP', '''{__model_adapter_map__}''')  # openllm: model adapter map
+model = svars.model
+model_id = svars.model_id
+adapter_map = svars.adapter_map
 llm_config = openllm.AutoConfig.for_model(model)
-runner = openllm.Runner(model, llm_config=llm_config, ensure_available=False, adapter_map=orjson.loads(adapter_map))
+runner = openllm.Runner(
+  model,
+  llm_config=llm_config,
+  model_id=model_id,
+  ensure_available=False,
+  adapter_map=orjson.loads(adapter_map)
+)
 generic_embedding_runner = bentoml.Runner(openllm.GenericEmbeddingRunnable,  # XXX: remove arg-type once bentoml.Runner is correct set with type
                                           name='llm-generic-embedding',
                                           scheduling_strategy=openllm_core.CascadingResourceStrategy,
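With this change, kwargs passed explicitly to `openllm.Runner` take precedence over the values recorded in `llm_config`'s env (previously the config-derived `model_id` and the `OPENLLM_SERIALIZATION` variable always won). A minimal usage sketch; the `'llama'` model name, the Hugging Face checkpoint, and the `'safetensors'` value are illustrative assumptions, not values taken from this diff:

```python
import openllm

llm_config = openllm.AutoConfig.for_model('llama')  # 'llama' is an illustrative model name

# An explicit model_id now overrides llm_config['env']['model_id_value'], and an
# explicit serialisation kwarg now wins over the OPENLLM_SERIALIZATION env var.
runner = openllm.Runner(
  'llama',
  llm_config=llm_config,
  model_id='meta-llama/Llama-2-7b-chat-hf',  # illustrative checkpoint
  serialisation='safetensors',
  ensure_available=False,
)
```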
diff --git a/openllm-python/src/openllm/_service_vars.py b/openllm-python/src/openllm/_service_vars.py
new file mode 100644
index 00000000..e0b56828
--- /dev/null
+++ b/openllm-python/src/openllm/_service_vars.py
@@ -0,0 +1,6 @@
+from __future__ import annotations
+import os
+
+model = os.environ['OPENLLM_MODEL']  # openllm: model name
+model_id = os.environ['OPENLLM_MODEL_ID']  # openllm: model id
+adapter_map = os.environ['OPENLLM_ADAPTER_MAP']  # openllm: model adapter map
diff --git a/openllm-python/src/openllm/_service_vars_pkg.py b/openllm-python/src/openllm/_service_vars_pkg.py
new file mode 100644
index 00000000..49c12c58
--- /dev/null
+++ b/openllm-python/src/openllm/_service_vars_pkg.py
@@ -0,0 +1,5 @@
+from __future__ import annotations
+
+model = '{__model_name__}'  # openllm: model name
+model_id = '{__model_id__}'  # openllm: model id
+adapter_map = '''{__model_adapter_map__}'''  # openllm: model adapter map
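The env-driven variant switches from `os.environ.get(..., default)` to `os.environ[...]`, so all three variables are now required when the service is started outside a packaged Bento. A rough sketch of how `_service.py` resolves them; the values below are assumptions for illustration, and the import path assumes the openllm source directory is on `sys.path` as it is for the generated service:

```python
import os

# Illustrative values, assumed to be provided by whatever launches the service.
os.environ['OPENLLM_MODEL'] = 'llama'
os.environ['OPENLLM_MODEL_ID'] = 'meta-llama/Llama-2-7b-chat-hf'
os.environ['OPENLLM_ADAPTER_MAP'] = '{}'

import orjson
import _service_vars as svars  # same import _service.py now performs

model_id = svars.model_id                      # 'meta-llama/Llama-2-7b-chat-hf'
adapter_map = orjson.loads(svars.adapter_map)  # '{}' -> {}

# A missing variable now raises KeyError at import time instead of silently
# falling back to an unformatted '{__model_name__}' placeholder.
```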
diff --git a/openllm-python/src/openllm/bundle/_package.py b/openllm-python/src/openllm/bundle/_package.py
index ae59052b..d167f342 100644
--- a/openllm-python/src/openllm/bundle/_package.py
+++ b/openllm-python/src/openllm/bundle/_package.py
@@ -124,10 +124,7 @@ def construct_docker_options(llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_
   env_dict = {
       env.backend: env['backend_value'],
       env.config: f"'{llm.config.model_dump_json().decode()}'",
-      env.model_id: f'/home/bentoml/bento/models/{llm.tag.path()}',
-      'OPENLLM_MODEL': llm.config['model_name'],
       'OPENLLM_SERIALIZATION': serialisation,
-      'OPENLLM_ADAPTER_MAP': f"'{orjson.dumps(adapter_map).decode()}'",
       'BENTOML_DEBUG': str(True),
       'BENTOML_QUIET': str(False),
       'BENTOML_CONFIG_OPTIONS': f"'{environ['BENTOML_CONFIG_OPTIONS']}'",
@@ -143,6 +140,7 @@ def construct_docker_options(llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_
   return DockerOptions(base_image=f'{oci.CONTAINER_NAMES[container_registry]}:{oci.get_base_container_tag(container_version_strategy)}', env=env_dict, dockerfile_template=dockerfile_template)
 
 OPENLLM_MODEL_NAME = '# openllm: model name'
+OPENLLM_MODEL_ID = '# openllm: model id'
 OPENLLM_MODEL_ADAPTER_MAP = '# openllm: model adapter map'
 
 class ModelNameFormatter(string.Formatter):
@@ -170,21 +168,30 @@ class ModelAdapterMapFormatter(ModelNameFormatter):
   model_keyword: LiteralString = '__model_adapter_map__'
 
 _service_file = Path(os.path.abspath(__file__)).parent.parent / '_service.py'
+_service_vars_file = Path(os.path.abspath(__file__)).parent.parent / '_service_vars_pkg.py'
 
 def write_service(llm: openllm.LLM[t.Any, t.Any], adapter_map: dict[str, str | None] | None, llm_fs: FS) -> None:
   from openllm_core.utils import DEBUG
   model_name = llm.config['model_name']
-  logger.debug('Generating service file for %s at %s (dir=%s)', model_name, llm.config['service_name'], llm_fs.getsyspath('/'))
-  with open(_service_file.__fspath__(), 'r') as f:
+  model_id = llm.model_id
+  logger.debug('Generating service vars file for %s at %s (dir=%s)', model_name, '_service_vars.py', llm_fs.getsyspath('/'))
+  with open(_service_vars_file.__fspath__(), 'r') as f:
     src_contents = f.readlines()
   for it in src_contents:
     if OPENLLM_MODEL_NAME in it:
       src_contents[src_contents.index(it)] = (ModelNameFormatter(model_name).vformat(it)[:-(len(OPENLLM_MODEL_NAME) + 3)] + '\n')
+    if OPENLLM_MODEL_ID in it:
+      src_contents[src_contents.index(it)] = (ModelIdFormatter(model_id).vformat(it)[:-(len(OPENLLM_MODEL_ID) + 3)] + '\n')
     elif OPENLLM_MODEL_ADAPTER_MAP in it:
       src_contents[src_contents.index(it)] = (ModelAdapterMapFormatter(orjson.dumps(adapter_map).decode()).vformat(it)[:-(len(OPENLLM_MODEL_ADAPTER_MAP) + 3)] + '\n')
   script = f"# GENERATED BY 'openllm build {model_name}'. DO NOT EDIT\n\n" + ''.join(src_contents)
   if DEBUG: logger.info('Generated script:\n%s', script)
-  llm_fs.writetext(llm.config['service_name'], script)
+  llm_fs.writetext('_service_vars.py', script)
+
+  logger.debug('Generating service file for %s at %s (dir=%s)', model_name, llm.config['service_name'], llm_fs.getsyspath('/'))
+  with open(_service_file.__fspath__(), 'r') as f:
+    service_src = f.read()
+  llm_fs.writetext(llm.config['service_name'], service_src)
 
 @inject
 def create_bento(bento_tag: bentoml.Tag,
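With these changes, `write_service` renders only the `_service_vars_pkg.py` template (writing the result into the Bento as `_service_vars.py`) and copies `_service.py` verbatim, while the Docker env no longer carries `OPENLLM_MODEL`, `OPENLLM_ADAPTER_MAP`, or a hard-coded `/home/bentoml/bento/models/...` model path. The formatter fills each `{__...__}` placeholder and strips the trailing `# openllm: ...` marker comment, so the generated module is plain assignments. A hypothetical render for a `llama` build; the model name, checkpoint, and empty adapter map are illustrative, not taken from this diff:

```python
# GENERATED BY 'openllm build llama'. DO NOT EDIT

from __future__ import annotations

model = 'llama'
model_id = 'meta-llama/Llama-2-7b-chat-hf'
adapter_map = '''{}'''
```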