mirror of
https://github.com/bentoml/OpenLLM.git
synced 2026-03-20 07:56:18 -04:00
feat(service): implementing with lifecycle hooks
Signed-off-by: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com>
This commit is contained in:
@@ -43,6 +43,7 @@ if t.TYPE_CHECKING:
|
||||
|
||||
class LLMRunner(bentoml.Runner):
|
||||
llm: openllm.LLM
|
||||
config: openllm.LLMConfig
|
||||
llm_type: str
|
||||
identifying_params: dict[str, t.Any]
|
||||
|
||||
@@ -600,7 +601,7 @@ class LLM(LLMInterface, metaclass=LLMMetaclass):
|
||||
return tag, attrs
|
||||
return tag
|
||||
|
||||
def ensure_pretrained_exists(self):
|
||||
def ensure_pretrained_exists(self) -> bentoml.Model:
|
||||
trust_remote_code = self._llm_attrs.pop("trust_remote_code", self.config.__openllm_trust_remote_code__)
|
||||
tag, kwds = self.make_tag(return_unused_kwargs=True, trust_remote_code=trust_remote_code, **self._llm_attrs)
|
||||
try:
|
||||
@@ -700,7 +701,7 @@ class LLM(LLMInterface, metaclass=LLMMetaclass):
|
||||
method_configs: dict[str, ModelSignatureDict | ModelSignature] | None = None,
|
||||
embedded: bool = False,
|
||||
scheduling_strategy: type[Strategy] | None = None,
|
||||
) -> bentoml.Runner:
|
||||
) -> LLMRunner:
|
||||
"""Convert this LLM into a Runner.
|
||||
|
||||
Args:
|
||||
@@ -848,7 +849,7 @@ class LLM(LLMInterface, metaclass=LLMMetaclass):
|
||||
return self.postprocess_generate(prompt, generated_result, **postprocess_kwargs)
|
||||
|
||||
|
||||
def Runner(start_name: str, **attrs: t.Any) -> bentoml.Runner:
|
||||
def Runner(start_name: str, **attrs: t.Any) -> LLMRunner:
|
||||
"""Create a Runner for given LLM. For a list of currently supported LLMs, check out 'openllm models'.
|
||||
|
||||
Args:
|
||||
|
||||
@@ -38,6 +38,13 @@ runner = openllm.Runner(model, llm_config=llm_config)
|
||||
svc = bentoml.Service(name=f"llm-{llm_config.__openllm_start_name__}-service", runners=[runner])
|
||||
|
||||
|
||||
@svc.on_deployment
|
||||
def ensure_exists():
|
||||
# NOTE: We need to initialize llm here first to check if the model is already downloaded to
|
||||
# avoid deadlock before the subprocess forking.
|
||||
runner.llm.ensure_pretrained_exists()
|
||||
|
||||
|
||||
@svc.api(
|
||||
input=bentoml.io.JSON.from_sample(sample={"prompt": "", "llm_config": llm_config.model_dump()}),
|
||||
output=bentoml.io.JSON.from_sample(sample={"responses": [], "configuration": llm_config.model_dump()}),
|
||||
|
||||
@@ -455,10 +455,6 @@ Tip: One can pass one of the aforementioned to '--model-id' to use other pretrai
|
||||
else:
|
||||
llm = openllm.AutoLLM.for_model(model_name, model_id=model_id, llm_config=config)
|
||||
|
||||
# NOTE: We need to initialize llm here first to check if the model is already downloaded to
|
||||
# avoid deadlock before the subprocess forking.
|
||||
llm.ensure_pretrained_exists()
|
||||
|
||||
# NOTE: check for GPU one more time in case this model doesn't require a GPU but users can still
|
||||
# run this model on GPU
|
||||
try:
|
||||
@@ -522,12 +518,7 @@ Tip: One can pass one of the aforementioned to '--model-id' to use other pretrai
|
||||
|
||||
try:
|
||||
openllm.utils.analytics.track_start_init(llm.config, gpu_available)
|
||||
server.start(env=start_env, text=True, blocking=True if get_debug_mode() else False)
|
||||
if not get_debug_mode():
|
||||
assert server.process is not None and server.process.stdout is not None
|
||||
with server.process.stdout:
|
||||
for f in iter(server.process.stdout.readline, b""):
|
||||
_echo(f, nl=False, fg="white")
|
||||
server.start(env=start_env, text=True, blocking=True)
|
||||
except KeyboardInterrupt:
|
||||
on_start_end(model_name)
|
||||
except Exception as err:
|
||||
@@ -801,20 +792,19 @@ def cli_factory() -> click.Group:
|
||||
if len(bentoml.models.list(tag)) == 0:
|
||||
if output == "pretty":
|
||||
_echo(f"{tag} does not exists yet!. Downloading...", nl=True)
|
||||
m = model.ensure_pretrained_exists()
|
||||
m = model.ensure_pretrained_exists()
|
||||
if output == "pretty":
|
||||
_echo(f"Saved model: {m.tag}")
|
||||
elif output == "json":
|
||||
m = model.ensure_pretrained_exists()
|
||||
_echo(
|
||||
orjson.dumps(
|
||||
{"previously_setup": False, "framework": env, "tag": str(m.tag)}, option=orjson.OPT_INDENT_2
|
||||
).decode()
|
||||
)
|
||||
else:
|
||||
m = model.ensure_pretrained_exists()
|
||||
_echo(tag)
|
||||
else:
|
||||
m = model.ensure_pretrained_exists()
|
||||
m = bentoml.transformers.get(tag)
|
||||
if output == "pretty":
|
||||
_echo(f"{model_name} is already setup for framework '{env}': {str(m.tag)}", nl=True)
|
||||
elif output == "json":
|
||||
|
||||
@@ -26,7 +26,7 @@ import openllm
|
||||
from .configuration_auto import AutoConfig
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
import bentoml
|
||||
from ..._llm import LLMRunner
|
||||
|
||||
ConfigModelOrderedDict = OrderedDict[type[openllm.LLMConfig], type[openllm.LLM]]
|
||||
else:
|
||||
@@ -104,7 +104,7 @@ class _BaseAutoLLMClass:
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def create_runner(cls, model_name: str, model_id: str | None = None, **attrs: t.Any) -> bentoml.Runner:
|
||||
def create_runner(cls, model_name: str, model_id: str | None = None, **attrs: t.Any) -> LLMRunner:
|
||||
"""
|
||||
Create a LLM Runner for the given model name.
|
||||
|
||||
|
||||
Reference in New Issue
Block a user