From e86dc35ec5d5c4ad02492f5c8ca88a46d345a7db Mon Sep 17 00:00:00 2001
From: Aaron <29749331+aarnphm@users.noreply.github.com>
Date: Thu, 1 Jun 2023 00:27:48 -0700
Subject: [PATCH] chore: migrate service to use JSON until we have attrs io
 descriptor, this should do it

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
---
 src/openllm/_service.py      |   7 +-
 src/openllm/cli.py           | 159 ++++++++++++++++++-----------------
 src/openllm/utils/codegen.py |   5 ++
 3 files changed, 89 insertions(+), 82 deletions(-)

diff --git a/src/openllm/_service.py b/src/openllm/_service.py
index f54a3e8f..ebc3dd64 100644
--- a/src/openllm/_service.py
+++ b/src/openllm/_service.py
@@ -25,11 +25,12 @@ svc = bentoml.Service(name=f"llm-{llm_config.__openllm_start_name__}-service", r
 
 
 @svc.api(
-    input=bentoml.io.JSON(attr_model=openllm.GenerationInput.for_model(model)),
-    output=bentoml.io.JSON(attr_model=openllm.GenerationOutput),
+    input=bentoml.io.JSON.from_sample(sample={"prompt": "", "llm_config": {}}),
+    output=bentoml.io.JSON.from_sample(sample={"responses": [], "configuration": {}}),
     route="/v1/generate",
 )
-async def generate_v1(qa: openllm.GenerationInput) -> openllm.GenerationOutput:
+async def generate_v1(input_dict: dict[str, t.Any]) -> openllm.GenerationOutput:
+    qa = openllm.GenerationInput.for_model(model)(**input_dict)
     config = llm_config.model_construct_env(__llm_config__=qa.llm_config).model_dump()
     responses = await runner.generate.async_run(qa.prompt, **config)
     return openllm.GenerationOutput(responses=responses, configuration=config)
diff --git a/src/openllm/cli.py b/src/openllm/cli.py
index 1bc8b494..08b001da 100644
--- a/src/openllm/cli.py
+++ b/src/openllm/cli.py
@@ -287,22 +287,19 @@ def start_model_command(
         whether the server is run with GPU or not.
     """
     from bentoml._internal.configuration import get_debug_mode
-    from bentoml._internal.log import configure_logging
-
-    configure_logging()
 
     ModelEnv = openllm.utils.ModelEnv(model_name)
     model_command_decr: dict[str, t.Any] = {"name": ModelEnv.model_name, "context_settings": _context_settings or {}}
     # TODO: Probably want to use docstring for the COMMAND_DOCSTRING here instead of just importing the module.
-    config = openllm.AutoConfig.for_model(model_name)
+    llm_config = openllm.AutoConfig.for_model(model_name)
 
     aliases: list[str] = []
-    if config.name_type == "dasherize":
-        aliases.append(config.__openllm_start_name__)
+    if llm_config.name_type == "dasherize":
+        aliases.append(llm_config.__openllm_start_name__)
 
     model_command_decr.update(
         {
-            "name": config.__openllm_model_name__,
+            "name": llm_config.__openllm_model_name__,
             "short_help": f"Start a LLMServer for '{model_name}' ('--help' for more details)",
             "help": ModelEnv.start_docstring,
             "aliases": aliases if len(aliases) > 0 else None,
@@ -311,7 +308,7 @@ def start_model_command(
 
     gpu_available = False
     try:
-        config.check_if_gpu_is_available(ModelEnv.get_framework_env())
+        llm_config.check_if_gpu_is_available(ModelEnv.get_framework_env())
         gpu_available = True
     except openllm.exceptions.GpuNotAvailableError:
         # NOTE: The model requires GPU, therefore we will return a dummy command
@@ -326,13 +323,13 @@ def start_model_command(
         @group.command(**model_command_decr)
         def noop() -> openllm.LLMConfig:
             click.secho("No GPU available, therefore this command is disabled", fg="red")
-            openllm.utils.analytics.track_start_init(config, gpu_available)
-            return config
+            openllm.utils.analytics.track_start_init(llm_config, gpu_available)
+            return llm_config
 
         return noop
 
     @group.command(**model_command_decr)
-    @config.to_click_options
+    @llm_config.to_click_options
     @parse_serve_args(_serve_grpc)
     @click.option("--server-timeout", type=int, default=3600, help="Server timeout in seconds")
     @click.option(
@@ -340,84 +337,90 @@ def start_model_command(
     )
     def model_start(server_timeout: int, pretrained: str | None, **attrs: t.Any) -> openllm.LLMConfig:
         from bentoml._internal.configuration.containers import BentoMLContainer
+        from bentoml._internal.log import configure_logging
 
-        nonlocal config
-        config, server_attrs = config.model_validate_click(**attrs)
+        configure_logging()
 
-        if ModelEnv.get_framework_env() == "flax":
-            llm = openllm.AutoFlaxLLM.for_model(model_name, pretrained=pretrained, llm_config=config)
-        elif ModelEnv.get_framework_env() == "tf":
-            llm = openllm.AutoTFLLM.for_model(model_name, pretrained=pretrained, llm_config=config)
-        else:
-            llm = openllm.AutoLLM.for_model(model_name, pretrained=pretrained, llm_config=config)
-
-        # NOTE: We need to initialize llm here first to check if the model is already downloaded to
-        # avoid deadlock before the subprocess forking.
-        llm.ensure_pretrained_exists()
-
-        # NOTE: check for GPU one more time in cases this model doesn't requires GPU but users can still
-        # run this model on GPU
         try:
-            llm.config.check_if_gpu_is_available(ModelEnv.get_framework_env())
-            gpu_available = True
-        except openllm.exceptions.GpuNotAvailableError:
-            gpu_available = False
+            config, server_attrs = llm_config.model_validate_click(**attrs)
 
-        openllm.utils.analytics.track_start_init(llm.config, gpu_available)
+            if ModelEnv.get_framework_env() == "flax":
+                llm = openllm.AutoFlaxLLM.for_model(model_name, pretrained=pretrained, llm_config=config)
+            elif ModelEnv.get_framework_env() == "tf":
+                llm = openllm.AutoTFLLM.for_model(model_name, pretrained=pretrained, llm_config=config)
+            else:
+                llm = openllm.AutoLLM.for_model(model_name, pretrained=pretrained, llm_config=config)
 
-        server_attrs.update({"working_dir": os.path.dirname(__file__)})
-        if _serve_grpc:
-            server_attrs["grpc_protocol_version"] = "v1"
-        # NOTE: currently, theres no development args in bentoml.Server. To be fixed upstream.
-        development = server_attrs.pop("development")
-        server_attrs.setdefault("production", not development)
+            # NOTE: We need to initialize llm here first to check if the model is already downloaded to
+            # avoid deadlock before the subprocess forking.
+            llm.ensure_pretrained_exists()
 
-        start_env = os.environ.copy()
+            # NOTE: check for GPU one more time in cases this model doesn't requires GPU but users can still
+            # run this model on GPU
+            try:
+                llm.config.check_if_gpu_is_available(ModelEnv.get_framework_env(), force=True)
+                gpu_available = True
+            except openllm.exceptions.GpuNotAvailableError:
+                gpu_available = False
 
-        # NOTE: This is a hack to set current configuration
-        _bentoml_config_options = start_env.pop("BENTOML_CONFIG_OPTIONS", "")
-        _bentoml_config_options += (
-            " "
-            if _bentoml_config_options
-            else ""
-            + f"api_server.timeout={server_timeout}"
-            + f' runners."llm-{llm.config.__openllm_start_name__}-runner".timeout={llm.config.__openllm_timeout__}'
-        )
+            openllm.utils.analytics.track_start_init(llm.config, gpu_available)
 
-        start_env.update(
-            {
-                ModelEnv.framework: ModelEnv.get_framework_env(),
-                ModelEnv.model_config: llm.config.model_dump_json(),
-                "OPENLLM_MODEL": model_name,
-                "BENTOML_DEBUG": str(get_debug_mode()),
-                "BENTOML_CONFIG_OPTIONS": _bentoml_config_options,
-                "BENTOML_HOME": os.environ.get("BENTOML_HOME", BentoMLContainer.bentoml_home.get()),
-            }
-        )
+            server_attrs.update({"working_dir": os.path.dirname(__file__)})
+            if _serve_grpc:
+                server_attrs["grpc_protocol_version"] = "v1"
+            # NOTE: currently, theres no development args in bentoml.Server. To be fixed upstream.
+            development = server_attrs.pop("development")
+            server_attrs.setdefault("production", not development)
 
-        if llm.requirements is not None:
-            click.secho(
-                f"Make sure that you have the following dependencies available: {llm.requirements}\n", fg="yellow"
+            start_env = os.environ.copy()
+
+            # NOTE: This is a hack to set current configuration
+            _bentoml_config_options = start_env.pop("BENTOML_CONFIG_OPTIONS", "")
+            _bentoml_config_options += (
+                " "
+                if _bentoml_config_options
+                else ""
+                + f"api_server.timeout={server_timeout}"
+                + f' runners."llm-{llm.config.__openllm_start_name__}-runner".timeout={llm.config.__openllm_timeout__}'
             )
-        click.secho(f"\nStarting LLM Server for '{model_name}'\n", fg="blue")
-        server_cls = getattr(bentoml, "HTTPServer" if not _serve_grpc else "GrpcServer")
-        server: bentoml.server.Server = server_cls("_service.py:svc", **server_attrs)
-        server.timeout = 90
-        try:
-            server.start(env=start_env, text=True)
+            start_env.update(
+                {
+                    ModelEnv.framework: ModelEnv.get_framework_env(),
+                    ModelEnv.model_config: llm.config.model_dump_json().decode(),
+                    "OPENLLM_MODEL": model_name,
+                    "BENTOML_DEBUG": str(get_debug_mode()),
+                    "BENTOML_CONFIG_OPTIONS": _bentoml_config_options,
+                    "BENTOML_HOME": os.environ.get("BENTOML_HOME", BentoMLContainer.bentoml_home.get()),
+                }
+            )
+
+            if llm.requirements is not None:
+                click.secho(
+                    f"Make sure that you have the following dependencies available: {llm.requirements}\n", fg="yellow"
+                )
+            click.secho(f"\nStarting LLM Server for '{model_name}'\n", fg="blue")
+            if t.TYPE_CHECKING:
+                server_cls: type[bentoml.HTTPServer] if not _serve_grpc else type[bentoml.GrpcServer]
+            server_cls = getattr(bentoml, "HTTPServer" if not _serve_grpc else "GrpcServer")
+            server_attrs["timeout"] = 90
+            server = server_cls("_service.py:svc", **server_attrs)
+
+            server.start(env=start_env, text=True, blocking=True if get_debug_mode() else False)
             assert server.process and server.process.stdout
             with server.process.stdout:
                 for f in iter(server.process.stdout.readline, b""):
-                    click.secho(f, fg="green", nl=False)
+                    click.echo(f, nl=False)
         except Exception as err:
             click.secho(f"Error caught while starting LLM Server:\n{err}", fg="red")
             raise
-        finally:
-            click.secho("\nStopping LLM Server...\n", fg="yellow")
-            click.secho(
-                f"Next step: you can run 'openllm bundle {model_name}' to create a Bento for {model_name}", fg="blue"
-            )
+        else:
+            if not get_debug_mode():
+                click.secho("\nStopping LLM Server...\n", fg="yellow")
+                click.secho(
+                    f"Next step: you can run 'openllm bundle {model_name}' to create a Bento for {model_name}",
+                    fg="blue",
+                )
 
         # NOTE: Return the configuration for telemetry purposes.
         return config
@@ -471,11 +474,13 @@ def cli():
     \b
     OpenLLM: Your one stop-and-go-solution for serving any Open Large-Language Model
 
-    - StableLM, Llama, Alpaca, Dolly, Flan-T5, and more
+    - StableLM, Falcon, ChatGLM, Dolly, Flan-T5, and more
 
     \b
    - Powered by BentoML 🍱 + HuggingFace 🤗
     """
+    if psutil.WINDOWS:
+        sys.stdout.reconfigure(encoding="utf-8")  # type: ignore
 
 
 @cli.command()
@@ -519,12 +524,12 @@ def start_grpc_cli():
     """
 
 
-@cli.command(name="bundle", aliases=["build"])
+@cli.command(aliases=["build"])
 @click.argument("model_name", type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING.keys()]))
 @click.option("--pretrained", default=None, help="Given pretrained model name for the given model name [Optional].")
 @click.option("--overwrite", is_flag=True, help="Overwrite existing Bento for given LLM if it already exists.")
 @output_option
-def _(model_name: str, pretrained: str | None, overwrite: bool, output: t.Literal["json", "pretty", "porcelain"]):
+def bundle(model_name: str, pretrained: str | None, overwrite: bool, output: t.Literal["json", "pretty", "porcelain"]):
     """Package a given models into a Bento.
 
     $ openllm bundle flan-t5
@@ -663,9 +668,5 @@ def download_models(model_name: str, pretrained: str | None, output: t.Literal["
         return m
 
 
-if psutil.WINDOWS:
-    sys.stdout.reconfigure(encoding="utf-8")  # type: ignore
-
-
 if __name__ == "__main__":
     cli()
diff --git a/src/openllm/utils/codegen.py b/src/openllm/utils/codegen.py
index 26a9d9e2..25aab09a 100644
--- a/src/openllm/utils/codegen.py
+++ b/src/openllm/utils/codegen.py
@@ -34,6 +34,11 @@ if t.TYPE_CHECKING:
     from fs.base import FS
 
     class ModifyNodeProtocol(t.Protocol):
+        @t.overload
+        def __call__(self, node: Node, model_name: str) -> None:
+            ...
+
+        @t.overload
         def __call__(self, node: Node, *args: t.Any, **attrs: t.Any) -> None:
             ...
 
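Note (not part of the patch): with the service migrated to plain JSON I/O, a client now posts a raw dict to /v1/generate and the handler rebuilds the typed payload server-side via GenerationInput.for_model(model)(**input_dict). The sketch below is a minimal, hypothetical client under assumptions the commit does not ship: a server already started with `openllm start <model>`, BentoML's default HTTP address (localhost:3000), the `requests` library, and an arbitrary example prompt.

    # Hypothetical client for the JSON-based /v1/generate route introduced above.
    # Assumes BentoML's default HTTP address; adjust the URL for your deployment.
    import requests

    payload = {
        # Mirrors the `from_sample` input shape: a prompt plus optional llm_config overrides.
        "prompt": "What is the meaning of life?",
        "llm_config": {},
    }

    resp = requests.post("http://localhost:3000/v1/generate", json=payload, timeout=3600)
    resp.raise_for_status()

    data = resp.json()
    # The response mirrors GenerationOutput: {"responses": [...], "configuration": {...}}
    for text in data["responses"]:
        print(text)

Because the JSON descriptor only documents a sample shape rather than validating against an attrs schema, malformed payloads surface as errors from the GenerationInput reconstruction inside generate_v1; stricter request validation presumably returns once the attrs io descriptor mentioned in the commit subject lands.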