From 512cd0715c5f8cfe135fdcfce4250df7804a65cc Mon Sep 17 00:00:00 2001
From: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com>
Date: Sun, 11 Jun 2023 04:14:18 +0000
Subject: [PATCH] feat(service): ensure pretrained model exists via on_deployment lifecycle hook

Signed-off-by: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com>
---
 src/openllm/_llm.py                |  7 ++++---
 src/openllm/_service.py            |  7 +++++++
 src/openllm/cli.py                 | 18 ++++--------------
 src/openllm/models/auto/factory.py |  4 ++--
 4 files changed, 17 insertions(+), 19 deletions(-)

diff --git a/src/openllm/_llm.py b/src/openllm/_llm.py
index 0d623df0..da883bb1 100644
--- a/src/openllm/_llm.py
+++ b/src/openllm/_llm.py
@@ -43,6 +43,7 @@ if t.TYPE_CHECKING:
 
     class LLMRunner(bentoml.Runner):
         llm: openllm.LLM
+        config: openllm.LLMConfig
         llm_type: str
         identifying_params: dict[str, t.Any]
 
@@ -600,7 +601,7 @@ class LLM(LLMInterface, metaclass=LLMMetaclass):
             return tag, attrs
         return tag
 
-    def ensure_pretrained_exists(self):
+    def ensure_pretrained_exists(self) -> bentoml.Model:
         trust_remote_code = self._llm_attrs.pop("trust_remote_code", self.config.__openllm_trust_remote_code__)
         tag, kwds = self.make_tag(return_unused_kwargs=True, trust_remote_code=trust_remote_code, **self._llm_attrs)
         try:
@@ -700,7 +701,7 @@ class LLM(LLMInterface, metaclass=LLMMetaclass):
         method_configs: dict[str, ModelSignatureDict | ModelSignature] | None = None,
         embedded: bool = False,
         scheduling_strategy: type[Strategy] | None = None,
-    ) -> bentoml.Runner:
+    ) -> LLMRunner:
         """Convert this LLM into a Runner.
 
         Args:
@@ -848,7 +849,7 @@ class LLM(LLMInterface, metaclass=LLMMetaclass):
         return self.postprocess_generate(prompt, generated_result, **postprocess_kwargs)
 
 
-def Runner(start_name: str, **attrs: t.Any) -> bentoml.Runner:
+def Runner(start_name: str, **attrs: t.Any) -> LLMRunner:
     """Create a Runner for given LLM. For a list of currently supported LLM, check out 'openllm models'
 
     Args:
diff --git a/src/openllm/_service.py b/src/openllm/_service.py
index 2906752a..9eea9b0f 100644
--- a/src/openllm/_service.py
+++ b/src/openllm/_service.py
@@ -38,6 +38,13 @@ runner = openllm.Runner(model, llm_config=llm_config)
 svc = bentoml.Service(name=f"llm-{llm_config.__openllm_start_name__}-service", runners=[runner])
 
 
+@svc.on_deployment
+def ensure_exists():
+    # NOTE: Make sure the pretrained model is already downloaded here first, in order to
+    # avoid a deadlock before the subprocesses are forked.
+    runner.llm.ensure_pretrained_exists()
+
+
 @svc.api(
     input=bentoml.io.JSON.from_sample(sample={"prompt": "", "llm_config": llm_config.model_dump()}),
     output=bentoml.io.JSON.from_sample(sample={"responses": [], "configuration": llm_config.model_dump()}),
diff --git a/src/openllm/cli.py b/src/openllm/cli.py
index d4327fb3..7514465e 100644
--- a/src/openllm/cli.py
+++ b/src/openllm/cli.py
@@ -455,10 +455,6 @@ Tip: One can pass one of the aforementioned to '--model-id' to use other pretrai
         else:
             llm = openllm.AutoLLM.for_model(model_name, model_id=model_id, llm_config=config)
 
-        # NOTE: We need to initialize llm here first to check if the model is already downloaded to
-        # avoid deadlock before the subprocess forking.
-        llm.ensure_pretrained_exists()
-
         # NOTE: check for GPU one more time in cases this model doesn't requires GPU but users can still
         # run this model on GPU
         try:
@@ -522,12 +518,7 @@ Tip: One can pass one of the aforementioned to '--model-id' to use other pretrai
 
         try:
             openllm.utils.analytics.track_start_init(llm.config, gpu_available)
-            server.start(env=start_env, text=True, blocking=True if get_debug_mode() else False)
-            if not get_debug_mode():
-                assert server.process is not None and server.process.stdout is not None
-                with server.process.stdout:
-                    for f in iter(server.process.stdout.readline, b""):
-                        _echo(f, nl=False, fg="white")
+            server.start(env=start_env, text=True, blocking=True)
         except KeyboardInterrupt:
             on_start_end(model_name)
         except Exception as err:
@@ -801,20 +792,19 @@ def cli_factory() -> click.Group:
         if len(bentoml.models.list(tag)) == 0:
             if output == "pretty":
                 _echo(f"{tag} does not exists yet!. Downloading...", nl=True)
-                m = model.ensure_pretrained_exists()
+            m = model.ensure_pretrained_exists()
+            if output == "pretty":
                 _echo(f"Saved model: {m.tag}")
             elif output == "json":
-                m = model.ensure_pretrained_exists()
                 _echo(
                     orjson.dumps(
                         {"previously_setup": False, "framework": env, "tag": str(m.tag)}, option=orjson.OPT_INDENT_2
                     ).decode()
                 )
             else:
-                m = model.ensure_pretrained_exists()
                 _echo(tag)
         else:
-            m = model.ensure_pretrained_exists()
+            m = bentoml.transformers.get(tag)
             if output == "pretty":
                 _echo(f"{model_name} is already setup for framework '{env}': {str(m.tag)}", nl=True)
             elif output == "json":
diff --git a/src/openllm/models/auto/factory.py b/src/openllm/models/auto/factory.py
index 1a88d6b9..f43c2744 100644
--- a/src/openllm/models/auto/factory.py
+++ b/src/openllm/models/auto/factory.py
@@ -26,7 +26,7 @@ import openllm
 from .configuration_auto import AutoConfig
 
 if t.TYPE_CHECKING:
-    import bentoml
+    from ..._llm import LLMRunner
 
     ConfigModelOrderedDict = OrderedDict[type[openllm.LLMConfig], type[openllm.LLM]]
 else:
@@ -104,7 +104,7 @@ class _BaseAutoLLMClass:
         )
 
     @classmethod
-    def create_runner(cls, model_name: str, model_id: str | None = None, **attrs: t.Any) -> bentoml.Runner:
+    def create_runner(cls, model_name: str, model_id: str | None = None, **attrs: t.Any) -> LLMRunner:
         """
         Create a LLM Runner for the given model name.