From e86dc35ec5d5c4ad02492f5c8ca88a46d345a7db Mon Sep 17 00:00:00 2001
From: Aaron <29749331+aarnphm@users.noreply.github.com>
Date: Thu, 1 Jun 2023 00:27:48 -0700
Subject: [PATCH] chore: migrate service to use JSON until we have attrs io
 descriptor, this should do it

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
---
 src/openllm/_service.py      |   7 +-
 src/openllm/cli.py           | 159 ++++++++++++++++++-----------------
 src/openllm/utils/codegen.py |   5 ++
 3 files changed, 89 insertions(+), 82 deletions(-)

diff --git a/src/openllm/_service.py b/src/openllm/_service.py
index f54a3e8f..ebc3dd64 100644
--- a/src/openllm/_service.py
+++ b/src/openllm/_service.py
@@ -25,11 +25,12 @@ svc = bentoml.Service(name=f"llm-{llm_config.__openllm_start_name__}-service", r
 
 
 @svc.api(
-    input=bentoml.io.JSON(attr_model=openllm.GenerationInput.for_model(model)),
-    output=bentoml.io.JSON(attr_model=openllm.GenerationOutput),
+    input=bentoml.io.JSON.from_sample(sample={"prompt": "", "llm_config": {}}),
+    output=bentoml.io.JSON.from_sample(sample={"responses": [], "configuration": {}}),
     route="/v1/generate",
 )
-async def generate_v1(qa: openllm.GenerationInput) -> openllm.GenerationOutput:
+async def generate_v1(input_dict: dict[str, t.Any]) -> openllm.GenerationOutput:
+    qa = openllm.GenerationInput.for_model(model)(**input_dict)
     config = llm_config.model_construct_env(__llm_config__=qa.llm_config).model_dump()
     responses = await runner.generate.async_run(qa.prompt, **config)
     return openllm.GenerationOutput(responses=responses, configuration=config)
diff --git a/src/openllm/cli.py b/src/openllm/cli.py
index 1bc8b494..08b001da 100644
--- a/src/openllm/cli.py
+++ b/src/openllm/cli.py
@@ -287,22 +287,19 @@ def start_model_command(
         whether the server is run with GPU or not.
     """
     from bentoml._internal.configuration import get_debug_mode
-    from bentoml._internal.log import configure_logging
-
-    configure_logging()
 
     ModelEnv = openllm.utils.ModelEnv(model_name)
     model_command_decr: dict[str, t.Any] = {"name": ModelEnv.model_name, "context_settings": _context_settings or {}}
     # TODO: Probably want to use docstring for the COMMAND_DOCSTRING here instead of just importing the module.
-    config = openllm.AutoConfig.for_model(model_name)
+    llm_config = openllm.AutoConfig.for_model(model_name)
 
     aliases: list[str] = []
-    if config.name_type == "dasherize":
-        aliases.append(config.__openllm_start_name__)
+    if llm_config.name_type == "dasherize":
+        aliases.append(llm_config.__openllm_start_name__)
 
     model_command_decr.update(
         {
-            "name": config.__openllm_model_name__,
+            "name": llm_config.__openllm_model_name__,
             "short_help": f"Start a LLMServer for '{model_name}' ('--help' for more details)",
             "help": ModelEnv.start_docstring,
             "aliases": aliases if len(aliases) > 0 else None,
@@ -311,7 +308,7 @@ def start_model_command(
 
     gpu_available = False
     try:
-        config.check_if_gpu_is_available(ModelEnv.get_framework_env())
+        llm_config.check_if_gpu_is_available(ModelEnv.get_framework_env())
         gpu_available = True
     except openllm.exceptions.GpuNotAvailableError:
         # NOTE: The model requires GPU, therefore we will return a dummy command
@@ -326,13 +323,13 @@ def start_model_command(
         @group.command(**model_command_decr)
         def noop() -> openllm.LLMConfig:
             click.secho("No GPU available, therefore this command is disabled", fg="red")
-            openllm.utils.analytics.track_start_init(config, gpu_available)
-            return config
+            openllm.utils.analytics.track_start_init(llm_config, gpu_available)
+            return llm_config
 
         return noop
 
     @group.command(**model_command_decr)
-    @config.to_click_options
+    @llm_config.to_click_options
     @parse_serve_args(_serve_grpc)
     @click.option("--server-timeout", type=int, default=3600, help="Server timeout in seconds")
     @click.option(
@@ -340,84 +337,90 @@ def start_model_command(
     )
     def model_start(server_timeout: int, pretrained: str | None, **attrs: t.Any) -> openllm.LLMConfig:
         from bentoml._internal.configuration.containers import BentoMLContainer
+        from bentoml._internal.log import configure_logging
 
-        nonlocal config
-        config, server_attrs = config.model_validate_click(**attrs)
+        configure_logging()
 
-        if ModelEnv.get_framework_env() == "flax":
-            llm = openllm.AutoFlaxLLM.for_model(model_name, pretrained=pretrained, llm_config=config)
-        elif ModelEnv.get_framework_env() == "tf":
-            llm = openllm.AutoTFLLM.for_model(model_name, pretrained=pretrained, llm_config=config)
-        else:
-            llm = openllm.AutoLLM.for_model(model_name, pretrained=pretrained, llm_config=config)
-
-        # NOTE: We need to initialize llm here first to check if the model is already downloaded to
-        # avoid deadlock before the subprocess forking.
-        llm.ensure_pretrained_exists()
-
-        # NOTE: check for GPU one more time in cases this model doesn't requires GPU but users can still
-        # run this model on GPU
         try:
-            llm.config.check_if_gpu_is_available(ModelEnv.get_framework_env())
-            gpu_available = True
-        except openllm.exceptions.GpuNotAvailableError:
-            gpu_available = False
+            config, server_attrs = llm_config.model_validate_click(**attrs)
 
-        openllm.utils.analytics.track_start_init(llm.config, gpu_available)
+            if ModelEnv.get_framework_env() == "flax":
+                llm = openllm.AutoFlaxLLM.for_model(model_name, pretrained=pretrained, llm_config=config)
+            elif ModelEnv.get_framework_env() == "tf":
+                llm = openllm.AutoTFLLM.for_model(model_name, pretrained=pretrained, llm_config=config)
+            else:
+                llm = openllm.AutoLLM.for_model(model_name, pretrained=pretrained, llm_config=config)
 
-        server_attrs.update({"working_dir": os.path.dirname(__file__)})
-        if _serve_grpc:
-            server_attrs["grpc_protocol_version"] = "v1"
-        # NOTE: currently, theres no development args in bentoml.Server. To be fixed upstream.
-        development = server_attrs.pop("development")
-        server_attrs.setdefault("production", not development)
+            # NOTE: We need to initialize llm here first to check if the model is already downloaded to
+            # avoid deadlock before the subprocess forking.
+            llm.ensure_pretrained_exists()
 
-        start_env = os.environ.copy()
+            # NOTE: check for GPU one more time in cases this model doesn't requires GPU but users can still
+            # run this model on GPU
+            try:
+                llm.config.check_if_gpu_is_available(ModelEnv.get_framework_env(), force=True)
+                gpu_available = True
+            except openllm.exceptions.GpuNotAvailableError:
+                gpu_available = False
 
-        # NOTE: This is a hack to set current configuration
-        _bentoml_config_options = start_env.pop("BENTOML_CONFIG_OPTIONS", "")
-        _bentoml_config_options += (
-            " "
-            if _bentoml_config_options
-            else ""
-            + f"api_server.timeout={server_timeout}"
-            + f' runners."llm-{llm.config.__openllm_start_name__}-runner".timeout={llm.config.__openllm_timeout__}'
-        )
+            openllm.utils.analytics.track_start_init(llm.config, gpu_available)
 
-        start_env.update(
-            {
-                ModelEnv.framework: ModelEnv.get_framework_env(),
-                ModelEnv.model_config: llm.config.model_dump_json(),
-                "OPENLLM_MODEL": model_name,
-                "BENTOML_DEBUG": str(get_debug_mode()),
-                "BENTOML_CONFIG_OPTIONS": _bentoml_config_options,
-                "BENTOML_HOME": os.environ.get("BENTOML_HOME", BentoMLContainer.bentoml_home.get()),
-            }
-        )
+            server_attrs.update({"working_dir": os.path.dirname(__file__)})
+            if _serve_grpc:
+                server_attrs["grpc_protocol_version"] = "v1"
+            # NOTE: currently, theres no development args in bentoml.Server. To be fixed upstream.
+            development = server_attrs.pop("development")
+            server_attrs.setdefault("production", not development)
 
-        if llm.requirements is not None:
-            click.secho(
-                f"Make sure that you have the following dependencies available: {llm.requirements}\n", fg="yellow"
+            start_env = os.environ.copy()
+
+            # NOTE: This is a hack to set current configuration
+            _bentoml_config_options = start_env.pop("BENTOML_CONFIG_OPTIONS", "")
+            _bentoml_config_options += (
+                " "
+                if _bentoml_config_options
+                else ""
+                + f"api_server.timeout={server_timeout}"
+                + f' runners."llm-{llm.config.__openllm_start_name__}-runner".timeout={llm.config.__openllm_timeout__}'
             )
-        click.secho(f"\nStarting LLM Server for '{model_name}'\n", fg="blue")
-        server_cls = getattr(bentoml, "HTTPServer" if not _serve_grpc else "GrpcServer")
-        server: bentoml.server.Server = server_cls("_service.py:svc", **server_attrs)
-        server.timeout = 90
-        try:
-            server.start(env=start_env, text=True)
+            start_env.update(
+                {
+                    ModelEnv.framework: ModelEnv.get_framework_env(),
+                    ModelEnv.model_config: llm.config.model_dump_json().decode(),
+                    "OPENLLM_MODEL": model_name,
+                    "BENTOML_DEBUG": str(get_debug_mode()),
+                    "BENTOML_CONFIG_OPTIONS": _bentoml_config_options,
+                    "BENTOML_HOME": os.environ.get("BENTOML_HOME", BentoMLContainer.bentoml_home.get()),
+                }
+            )
+
+            if llm.requirements is not None:
+                click.secho(
+                    f"Make sure that you have the following dependencies available: {llm.requirements}\n", fg="yellow"
+                )
+            click.secho(f"\nStarting LLM Server for '{model_name}'\n", fg="blue")
+            if t.TYPE_CHECKING:
+                server_cls: type[bentoml.HTTPServer] if not _serve_grpc else type[bentoml.GrpcServer]
+            server_cls = getattr(bentoml, "HTTPServer" if not _serve_grpc else "GrpcServer")
+            server_attrs["timeout"] = 90
+            server = server_cls("_service.py:svc", **server_attrs)
+
+            server.start(env=start_env, text=True, blocking=True if get_debug_mode() else False)
             assert server.process and server.process.stdout
             with server.process.stdout:
                 for f in iter(server.process.stdout.readline, b""):
-                    click.secho(f, fg="green", nl=False)
+                    click.echo(f, nl=False)
         except Exception as err:
             click.secho(f"Error caught while starting LLM Server:\n{err}", fg="red")
             raise
-        finally:
-            click.secho("\nStopping LLM Server...\n", fg="yellow")
-            click.secho(
-                f"Next step: you can run 'openllm bundle {model_name}' to create a Bento for {model_name}", fg="blue"
-            )
+        else:
+            if not get_debug_mode():
+                click.secho("\nStopping LLM Server...\n", fg="yellow")
+                click.secho(
+                    f"Next step: you can run 'openllm bundle {model_name}' to create a Bento for {model_name}",
+                    fg="blue",
+                )
 
         # NOTE: Return the configuration for telemetry purposes.
         return config
@@ -471,11 +474,13 @@ def cli():
     \b
     OpenLLM: Your one stop-and-go-solution for serving any Open Large-Language Model
 
-    - StableLM, Llama, Alpaca, Dolly, Flan-T5, and more
+    - StableLM, Falcon, ChatGLM, Dolly, Flan-T5, and more
 
     \b
    - Powered by BentoML 🍱 + HuggingFace 🤗
     """
+    if psutil.WINDOWS:
+        sys.stdout.reconfigure(encoding="utf-8")  # type: ignore
 
 
 @cli.command()
@@ -519,12 +524,12 @@ def start_grpc_cli():
     """
 
 
-@cli.command(name="bundle", aliases=["build"])
+@cli.command(aliases=["build"])
 @click.argument("model_name", type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING.keys()]))
 @click.option("--pretrained", default=None, help="Given pretrained model name for the given model name [Optional].")
 @click.option("--overwrite", is_flag=True, help="Overwrite existing Bento for given LLM if it already exists.")
 @output_option
-def _(model_name: str, pretrained: str | None, overwrite: bool, output: t.Literal["json", "pretty", "porcelain"]):
+def bundle(model_name: str, pretrained: str | None, overwrite: bool, output: t.Literal["json", "pretty", "porcelain"]):
     """Package a given models into a Bento.
 
     $ openllm bundle flan-t5
@@ -663,9 +668,5 @@ def download_models(model_name: str, pretrained: str | None, output: t.Literal["
         return m
 
 
-if psutil.WINDOWS:
-    sys.stdout.reconfigure(encoding="utf-8")  # type: ignore
-
-
 if __name__ == "__main__":
     cli()
diff --git a/src/openllm/utils/codegen.py b/src/openllm/utils/codegen.py
index 26a9d9e2..25aab09a 100644
--- a/src/openllm/utils/codegen.py
+++ b/src/openllm/utils/codegen.py
@@ -34,6 +34,11 @@ if t.TYPE_CHECKING:
     from fs.base import FS
 
     class ModifyNodeProtocol(t.Protocol):
+        @t.overload
+        def __call__(self, node: Node, model_name: str) -> None:
+            ...
+
+        @t.overload
         def __call__(self, node: Node, *args: t.Any, **attrs: t.Any) -> None:
             ...
 
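Note (not part of the patch): with the service migrated to plain JSON I/O, a client now posts a raw dict to /v1/generate and the handler rebuilds the typed payload server-side via GenerationInput.for_model(model)(**input_dict). The sketch below is a minimal, hypothetical client under assumptions the commit does not ship: a server already started with `openllm start <model>`, BentoML's default HTTP address (localhost:3000), the `requests` library, and an arbitrary example prompt.

    # Hypothetical client for the JSON-based /v1/generate route introduced above.
    # Assumes BentoML's default HTTP address; adjust the URL for your deployment.
    import requests

    payload = {
        # Mirrors the `from_sample` input shape: a prompt plus optional llm_config overrides.
        "prompt": "What is the meaning of life?",
        "llm_config": {},
    }

    resp = requests.post("http://localhost:3000/v1/generate", json=payload, timeout=3600)
    resp.raise_for_status()

    data = resp.json()
    # The response mirrors GenerationOutput: {"responses": [...], "configuration": {...}}
    for text in data["responses"]:
        print(text)

Because the JSON descriptor only documents a sample shape rather than validating against an attrs schema, malformed payloads surface as errors from the GenerationInput reconstruction inside generate_v1; stricter request validation presumably returns once the attrs io descriptor mentioned in the commit subject lands.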