feat(service): implement lifecycle hooks

Signed-off-by: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com>
This commit is contained in:
aarnphm-ec2-dev
2023-06-11 04:14:18 +00:00
parent 5a7942574f
commit 512cd0715c
4 changed files with 17 additions and 19 deletions

View File

@@ -43,6 +43,7 @@ if t.TYPE_CHECKING:
class LLMRunner(bentoml.Runner):
llm: openllm.LLM
config: openllm.LLMConfig
llm_type: str
identifying_params: dict[str, t.Any]
@@ -600,7 +601,7 @@ class LLM(LLMInterface, metaclass=LLMMetaclass):
return tag, attrs
return tag
def ensure_pretrained_exists(self):
def ensure_pretrained_exists(self) -> bentoml.Model:
trust_remote_code = self._llm_attrs.pop("trust_remote_code", self.config.__openllm_trust_remote_code__)
tag, kwds = self.make_tag(return_unused_kwargs=True, trust_remote_code=trust_remote_code, **self._llm_attrs)
try:
@@ -700,7 +701,7 @@ class LLM(LLMInterface, metaclass=LLMMetaclass):
method_configs: dict[str, ModelSignatureDict | ModelSignature] | None = None,
embedded: bool = False,
scheduling_strategy: type[Strategy] | None = None,
) -> bentoml.Runner:
) -> LLMRunner:
"""Convert this LLM into a Runner.
Args:
@@ -848,7 +849,7 @@ class LLM(LLMInterface, metaclass=LLMMetaclass):
return self.postprocess_generate(prompt, generated_result, **postprocess_kwargs)
def Runner(start_name: str, **attrs: t.Any) -> bentoml.Runner:
def Runner(start_name: str, **attrs: t.Any) -> LLMRunner:
"""Create a Runner for given LLM. For a list of currently supported LLM, check out 'openllm models'
Args:

View File

@@ -38,6 +38,13 @@ runner = openllm.Runner(model, llm_config=llm_config)
svc = bentoml.Service(name=f"llm-{llm_config.__openllm_start_name__}-service", runners=[runner])
@svc.on_deployment
def ensure_exists():
# NOTE: We need to initialize llm here first to check if the model is already downloaded to
# avoid deadlock before the subprocess forking.
runner.llm.ensure_pretrained_exists()
@svc.api(
input=bentoml.io.JSON.from_sample(sample={"prompt": "", "llm_config": llm_config.model_dump()}),
output=bentoml.io.JSON.from_sample(sample={"responses": [], "configuration": llm_config.model_dump()}),

View File

@@ -455,10 +455,6 @@ Tip: One can pass one of the aforementioned to '--model-id' to use other pretrai
else:
llm = openllm.AutoLLM.for_model(model_name, model_id=model_id, llm_config=config)
# NOTE: We need to initialize llm here first to check if the model is already downloaded to
# avoid deadlock before the subprocess forking.
llm.ensure_pretrained_exists()
# NOTE: check for GPU one more time in cases this model doesn't requires GPU but users can still
# run this model on GPU
try:
@@ -522,12 +518,7 @@ Tip: One can pass one of the aforementioned to '--model-id' to use other pretrai
try:
openllm.utils.analytics.track_start_init(llm.config, gpu_available)
server.start(env=start_env, text=True, blocking=True if get_debug_mode() else False)
if not get_debug_mode():
assert server.process is not None and server.process.stdout is not None
with server.process.stdout:
for f in iter(server.process.stdout.readline, b""):
_echo(f, nl=False, fg="white")
server.start(env=start_env, text=True, blocking=True)
except KeyboardInterrupt:
on_start_end(model_name)
except Exception as err:
@@ -801,20 +792,19 @@ def cli_factory() -> click.Group:
if len(bentoml.models.list(tag)) == 0:
if output == "pretty":
_echo(f"{tag} does not exists yet!. Downloading...", nl=True)
m = model.ensure_pretrained_exists()
m = model.ensure_pretrained_exists()
if output == "pretty":
_echo(f"Saved model: {m.tag}")
elif output == "json":
m = model.ensure_pretrained_exists()
_echo(
orjson.dumps(
{"previously_setup": False, "framework": env, "tag": str(m.tag)}, option=orjson.OPT_INDENT_2
).decode()
)
else:
m = model.ensure_pretrained_exists()
_echo(tag)
else:
m = model.ensure_pretrained_exists()
m = bentoml.transformers.get(tag)
if output == "pretty":
_echo(f"{model_name} is already setup for framework '{env}': {str(m.tag)}", nl=True)
elif output == "json":

View File

@@ -26,7 +26,7 @@ import openllm
from .configuration_auto import AutoConfig
if t.TYPE_CHECKING:
import bentoml
from ..._llm import LLMRunner
ConfigModelOrderedDict = OrderedDict[type[openllm.LLMConfig], type[openllm.LLM]]
else:
@@ -104,7 +104,7 @@ class _BaseAutoLLMClass:
)
@classmethod
def create_runner(cls, model_name: str, model_id: str | None = None, **attrs: t.Any) -> bentoml.Runner:
def create_runner(cls, model_name: str, model_id: str | None = None, **attrs: t.Any) -> LLMRunner:
"""
Create a LLM Runner for the given model name.