feat(service): ensure pretrained model exists via lifecycle hooks

Signed-off-by: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com>
2026-05-05 06:12:43 -04:00 · 2023-06-11 04:14:18 +00:00
parent 5a7942574f
commit 512cd0715c
4 changed files with 17 additions and 19 deletions
--- a/src/openllm/_llm.py
+++ b/src/openllm/_llm.py
@@ -43,6 +43,7 @@ if t.TYPE_CHECKING:

    class LLMRunner(bentoml.Runner):
        llm: openllm.LLM
+        config: openllm.LLMConfig
        llm_type: str
        identifying_params: dict[str, t.Any]

@@ -600,7 +601,7 @@ class LLM(LLMInterface, metaclass=LLMMetaclass):
            return tag, attrs
        return tag

-    def ensure_pretrained_exists(self):
+    def ensure_pretrained_exists(self) -> bentoml.Model:
        trust_remote_code = self._llm_attrs.pop("trust_remote_code", self.config.__openllm_trust_remote_code__)
        tag, kwds = self.make_tag(return_unused_kwargs=True, trust_remote_code=trust_remote_code, **self._llm_attrs)
        try:
@@ -700,7 +701,7 @@ class LLM(LLMInterface, metaclass=LLMMetaclass):
        method_configs: dict[str, ModelSignatureDict | ModelSignature] | None = None,
        embedded: bool = False,
        scheduling_strategy: type[Strategy] | None = None,
-    ) -> bentoml.Runner:
+    ) -> LLMRunner:
        """Convert this LLM into a Runner.

        Args:
@@ -848,7 +849,7 @@ class LLM(LLMInterface, metaclass=LLMMetaclass):
        return self.postprocess_generate(prompt, generated_result, **postprocess_kwargs)


-def Runner(start_name: str, **attrs: t.Any) -> bentoml.Runner:
+def Runner(start_name: str, **attrs: t.Any) -> LLMRunner:
    """Create a Runner for given LLM. For a list of currently supported LLM, check out 'openllm models'

    Args:
--- a/src/openllm/_service.py
+++ b/src/openllm/_service.py
@@ -38,6 +38,13 @@ runner = openllm.Runner(model, llm_config=llm_config)
 svc = bentoml.Service(name=f"llm-{llm_config.__openllm_start_name__}-service", runners=[runner])


+@svc.on_deployment
+def ensure_exists():
+    # NOTE: We need to initialize the LLM here first to check whether the model is already
+    # downloaded, to avoid a deadlock before the subprocesses are forked.
+    runner.llm.ensure_pretrained_exists()
+
+
@svc.api(
    input=bentoml.io.JSON.from_sample(sample={"prompt": "", "llm_config": llm_config.model_dump()}),
    output=bentoml.io.JSON.from_sample(sample={"responses": [], "configuration": llm_config.model_dump()}),
--- a/src/openllm/cli.py
+++ b/src/openllm/cli.py
@@ -455,10 +455,6 @@ Tip: One can pass one of the aforementioned to '--model-id' to use other pretrai
        else:
            llm = openllm.AutoLLM.for_model(model_name, model_id=model_id, llm_config=config)

-        # NOTE: We need to initialize llm here first to check if the model is already downloaded to
-        # avoid deadlock before the subprocess forking.
-        llm.ensure_pretrained_exists()
-
        # NOTE: check for GPU one more time in cases this model doesn't requires GPU but users can still
        # run this model on GPU
        try:
@@ -522,12 +518,7 @@ Tip: One can pass one of the aforementioned to '--model-id' to use other pretrai

        try:
            openllm.utils.analytics.track_start_init(llm.config, gpu_available)
-            server.start(env=start_env, text=True, blocking=True if get_debug_mode() else False)
-            if not get_debug_mode():
-                assert server.process is not None and server.process.stdout is not None
-                with server.process.stdout:
-                    for f in iter(server.process.stdout.readline, b""):
-                        _echo(f, nl=False, fg="white")
+            server.start(env=start_env, text=True, blocking=True)
        except KeyboardInterrupt:
            on_start_end(model_name)
        except Exception as err:
@@ -801,20 +792,19 @@ def cli_factory() -> click.Group:
        if len(bentoml.models.list(tag)) == 0:
            if output == "pretty":
                _echo(f"{tag} does not exists yet!. Downloading...", nl=True)
-                m = model.ensure_pretrained_exists()
+            m = model.ensure_pretrained_exists()
+            if output == "pretty":
                _echo(f"Saved model: {m.tag}")
            elif output == "json":
-                m = model.ensure_pretrained_exists()
                _echo(
                    orjson.dumps(
                        {"previously_setup": False, "framework": env, "tag": str(m.tag)}, option=orjson.OPT_INDENT_2
                    ).decode()
                )
            else:
-                m = model.ensure_pretrained_exists()
                _echo(tag)
        else:
-            m = model.ensure_pretrained_exists()
+            m = bentoml.transformers.get(tag)
            if output == "pretty":
                _echo(f"{model_name} is already setup for framework '{env}': {str(m.tag)}", nl=True)
            elif output == "json":
--- a/src/openllm/models/auto/factory.py
+++ b/src/openllm/models/auto/factory.py
@@ -26,7 +26,7 @@ import openllm
 from .configuration_auto import AutoConfig

 if t.TYPE_CHECKING:
-    import bentoml
+    from ..._llm import LLMRunner

    ConfigModelOrderedDict = OrderedDict[type[openllm.LLMConfig], type[openllm.LLM]]
 else:
@@ -104,7 +104,7 @@ class _BaseAutoLLMClass:
        )

    @classmethod
-    def create_runner(cls, model_name: str, model_id: str | None = None, **attrs: t.Any) -> bentoml.Runner:
+    def create_runner(cls, model_name: str, model_id: str | None = None, **attrs: t.Any) -> LLMRunner:
        """
        Create a LLM Runner for the given model name.