chore: migrate service to use JSON

Until we have an attrs-based IO descriptor, plain JSON.from_sample payloads should do it.

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
Aaron
2023-06-01 00:27:48 -07:00
parent a440bea184
commit e86dc35ec5
3 changed files with 89 additions and 82 deletions

File 1 of 3

@@ -25,11 +25,12 @@ svc = bentoml.Service(name=f"llm-{llm_config.__openllm_start_name__}-service", r
@svc.api(
input=bentoml.io.JSON(attr_model=openllm.GenerationInput.for_model(model)),
output=bentoml.io.JSON(attr_model=openllm.GenerationOutput),
input=bentoml.io.JSON.from_sample(sample={"prompt": "", "llm_config": {}}),
output=bentoml.io.JSON.from_sample(sample={"responses": [], "configuration": {}}),
route="/v1/generate",
)
async def generate_v1(qa: openllm.GenerationInput) -> openllm.GenerationOutput:
async def generate_v1(input_dict: dict[str, t.Any]) -> openllm.GenerationOutput:
qa = openllm.GenerationInput.for_model(model)(**input_dict)
config = llm_config.model_construct_env(__llm_config__=qa.llm_config).model_dump()
responses = await runner.generate.async_run(qa.prompt, **config)
return openllm.GenerationOutput(responses=responses, configuration=config)
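
With the descriptor switched to JSON.from_sample, the endpoint now takes a plain JSON body and re-validates it into GenerationInput inside the handler. A minimal client sketch against the new /v1/generate route; the prompt text, the max_new_tokens key, and the port are illustrative rather than taken from this commit (3000 is BentoML's default HTTP port):

    import requests

    # Request body mirrors the input sample: {"prompt": "", "llm_config": {}}
    payload = {"prompt": "What is the meaning of life?", "llm_config": {"max_new_tokens": 128}}
    resp = requests.post("http://127.0.0.1:3000/v1/generate", json=payload)
    resp.raise_for_status()
    # Response mirrors the output sample: {"responses": [...], "configuration": {...}}
    print(resp.json()["responses"])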

File 2 of 3

@@ -287,22 +287,19 @@ def start_model_command(
whether the server is run with GPU or not.
"""
from bentoml._internal.configuration import get_debug_mode
from bentoml._internal.log import configure_logging
configure_logging()
ModelEnv = openllm.utils.ModelEnv(model_name)
model_command_decr: dict[str, t.Any] = {"name": ModelEnv.model_name, "context_settings": _context_settings or {}}
# TODO: Probably want to use docstring for the COMMAND_DOCSTRING here instead of just importing the module.
config = openllm.AutoConfig.for_model(model_name)
llm_config = openllm.AutoConfig.for_model(model_name)
aliases: list[str] = []
if config.name_type == "dasherize":
aliases.append(config.__openllm_start_name__)
if llm_config.name_type == "dasherize":
aliases.append(llm_config.__openllm_start_name__)
model_command_decr.update(
{
"name": config.__openllm_model_name__,
"name": llm_config.__openllm_model_name__,
"short_help": f"Start a LLMServer for '{model_name}' ('--help' for more details)",
"help": ModelEnv.start_docstring,
"aliases": aliases if len(aliases) > 0 else None,
@@ -311,7 +308,7 @@ def start_model_command(
gpu_available = False
try:
config.check_if_gpu_is_available(ModelEnv.get_framework_env())
llm_config.check_if_gpu_is_available(ModelEnv.get_framework_env())
gpu_available = True
except openllm.exceptions.GpuNotAvailableError:
# NOTE: The model requires a GPU, therefore we will return a dummy command
@@ -326,13 +323,13 @@ def start_model_command(
@group.command(**model_command_decr)
def noop() -> openllm.LLMConfig:
click.secho("No GPU available, therefore this command is disabled", fg="red")
openllm.utils.analytics.track_start_init(config, gpu_available)
return config
openllm.utils.analytics.track_start_init(llm_config, gpu_available)
return llm_config
return noop
@group.command(**model_command_decr)
@config.to_click_options
@llm_config.to_click_options
@parse_serve_args(_serve_grpc)
@click.option("--server-timeout", type=int, default=3600, help="Server timeout in seconds")
@click.option(
@@ -340,84 +337,90 @@ def start_model_command(
)
def model_start(server_timeout: int, pretrained: str | None, **attrs: t.Any) -> openllm.LLMConfig:
from bentoml._internal.configuration.containers import BentoMLContainer
from bentoml._internal.log import configure_logging
nonlocal config
config, server_attrs = config.model_validate_click(**attrs)
configure_logging()
if ModelEnv.get_framework_env() == "flax":
llm = openllm.AutoFlaxLLM.for_model(model_name, pretrained=pretrained, llm_config=config)
elif ModelEnv.get_framework_env() == "tf":
llm = openllm.AutoTFLLM.for_model(model_name, pretrained=pretrained, llm_config=config)
else:
llm = openllm.AutoLLM.for_model(model_name, pretrained=pretrained, llm_config=config)
# NOTE: We need to initialize llm here first to check whether the model is already downloaded,
# to avoid a deadlock before the subprocess forks.
llm.ensure_pretrained_exists()
# NOTE: check for GPU one more time in case this model doesn't require a GPU but users still want to
# run it on GPU
try:
llm.config.check_if_gpu_is_available(ModelEnv.get_framework_env())
gpu_available = True
except openllm.exceptions.GpuNotAvailableError:
gpu_available = False
config, server_attrs = llm_config.model_validate_click(**attrs)
openllm.utils.analytics.track_start_init(llm.config, gpu_available)
if ModelEnv.get_framework_env() == "flax":
llm = openllm.AutoFlaxLLM.for_model(model_name, pretrained=pretrained, llm_config=config)
elif ModelEnv.get_framework_env() == "tf":
llm = openllm.AutoTFLLM.for_model(model_name, pretrained=pretrained, llm_config=config)
else:
llm = openllm.AutoLLM.for_model(model_name, pretrained=pretrained, llm_config=config)
server_attrs.update({"working_dir": os.path.dirname(__file__)})
if _serve_grpc:
server_attrs["grpc_protocol_version"] = "v1"
# NOTE: currently, there is no development arg in bentoml.Server. To be fixed upstream.
development = server_attrs.pop("development")
server_attrs.setdefault("production", not development)
# NOTE: We need to initialize llm here first to check whether the model is already downloaded,
# to avoid a deadlock before the subprocess forks.
llm.ensure_pretrained_exists()
start_env = os.environ.copy()
# NOTE: check for GPU one more time in case this model doesn't require a GPU but users still want to
# run it on GPU
try:
llm.config.check_if_gpu_is_available(ModelEnv.get_framework_env(), force=True)
gpu_available = True
except openllm.exceptions.GpuNotAvailableError:
gpu_available = False
# NOTE: This is a hack to set the current configuration
_bentoml_config_options = start_env.pop("BENTOML_CONFIG_OPTIONS", "")
_bentoml_config_options += (
(" " if _bentoml_config_options else "")
+ f"api_server.timeout={server_timeout}"
+ f' runners."llm-{llm.config.__openllm_start_name__}-runner".timeout={llm.config.__openllm_timeout__}'
)
openllm.utils.analytics.track_start_init(llm.config, gpu_available)
start_env.update(
{
ModelEnv.framework: ModelEnv.get_framework_env(),
ModelEnv.model_config: llm.config.model_dump_json(),
"OPENLLM_MODEL": model_name,
"BENTOML_DEBUG": str(get_debug_mode()),
"BENTOML_CONFIG_OPTIONS": _bentoml_config_options,
"BENTOML_HOME": os.environ.get("BENTOML_HOME", BentoMLContainer.bentoml_home.get()),
}
)
server_attrs.update({"working_dir": os.path.dirname(__file__)})
if _serve_grpc:
server_attrs["grpc_protocol_version"] = "v1"
# NOTE: currently, there is no development arg in bentoml.Server. To be fixed upstream.
development = server_attrs.pop("development")
server_attrs.setdefault("production", not development)
if llm.requirements is not None:
click.secho(
f"Make sure that you have the following dependencies available: {llm.requirements}\n", fg="yellow"
start_env = os.environ.copy()
# NOTE: This is a hack to set the current configuration
_bentoml_config_options = start_env.pop("BENTOML_CONFIG_OPTIONS", "")
_bentoml_config_options += (
(" " if _bentoml_config_options else "")
+ f"api_server.timeout={server_timeout}"
+ f' runners."llm-{llm.config.__openllm_start_name__}-runner".timeout={llm.config.__openllm_timeout__}'
)
click.secho(f"\nStarting LLM Server for '{model_name}'\n", fg="blue")
server_cls = getattr(bentoml, "HTTPServer" if not _serve_grpc else "GrpcServer")
server: bentoml.server.Server = server_cls("_service.py:svc", **server_attrs)
server.timeout = 90
try:
server.start(env=start_env, text=True)
start_env.update(
{
ModelEnv.framework: ModelEnv.get_framework_env(),
ModelEnv.model_config: llm.config.model_dump_json().decode(),
"OPENLLM_MODEL": model_name,
"BENTOML_DEBUG": str(get_debug_mode()),
"BENTOML_CONFIG_OPTIONS": _bentoml_config_options,
"BENTOML_HOME": os.environ.get("BENTOML_HOME", BentoMLContainer.bentoml_home.get()),
}
)
if llm.requirements is not None:
click.secho(
f"Make sure that you have the following dependencies available: {llm.requirements}\n", fg="yellow"
)
click.secho(f"\nStarting LLM Server for '{model_name}'\n", fg="blue")
if t.TYPE_CHECKING:
server_cls: type[bentoml.HTTPServer] if not _serve_grpc else type[bentoml.GrpcServer]
server_cls = getattr(bentoml, "HTTPServer" if not _serve_grpc else "GrpcServer")
server_attrs["timeout"] = 90
server = server_cls("_service.py:svc", **server_attrs)
server.start(env=start_env, text=True, blocking=get_debug_mode())
assert server.process and server.process.stdout
with server.process.stdout:
for f in iter(server.process.stdout.readline, ""):
click.secho(f, fg="green", nl=False)
click.echo(f, nl=False)
except Exception as err:
click.secho(f"Error caught while starting LLM Server:\n{err}", fg="red")
raise
finally:
click.secho("\nStopping LLM Server...\n", fg="yellow")
click.secho(
f"Next step: you can run 'openllm bundle {model_name}' to create a Bento for {model_name}", fg="blue"
)
else:
if not get_debug_mode():
click.secho("\nStopping LLM Server...\n", fg="yellow")
click.secho(
f"Next step: you can run 'openllm bundle {model_name}' to create a Bento for {model_name}",
fg="blue",
)
# NOTE: Return the configuration for telemetry purposes.
return config
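
A note on the BENTOML_CONFIG_OPTIONS hack above: it composes space-separated dotted overrides that BentoML's config loader reads from the environment. A rough sketch of the resulting value, using an illustrative start name and timeouts (the real values come from llm.config and the --server-timeout option):

    server_timeout = 3600   # default of the --server-timeout option above
    runner_timeout = 3600   # stand-in for llm.config.__openllm_timeout__
    start_name = "flan-t5"  # stand-in for llm.config.__openllm_start_name__
    opts = f'api_server.timeout={server_timeout} runners."llm-{start_name}-runner".timeout={runner_timeout}'
    # -> api_server.timeout=3600 runners."llm-flan-t5-runner".timeout=3600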
@@ -471,11 +474,13 @@ def cli():
\b
OpenLLM: Your one stop-and-go-solution for serving any Open Large-Language Model
- StableLM, Llama, Alpaca, Dolly, Flan-T5, and more
- StableLM, Falcon, ChatGLM, Dolly, Flan-T5, and more
\b
- Powered by BentoML 🍱 + HuggingFace 🤗
"""
if psutil.WINDOWS:
sys.stdout.reconfigure(encoding="utf-8") # type: ignore
@cli.command()
@@ -519,12 +524,12 @@ def start_grpc_cli():
"""
@cli.command(name="bundle", aliases=["build"])
@cli.command(aliases=["build"])
@click.argument("model_name", type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING.keys()]))
@click.option("--pretrained", default=None, help="Given pretrained model name for the given model name [Optional].")
@click.option("--overwrite", is_flag=True, help="Overwrite existing Bento for given LLM if it already exists.")
@output_option
def _(model_name: str, pretrained: str | None, overwrite: bool, output: t.Literal["json", "pretty", "porcelain"]):
def bundle(model_name: str, pretrained: str | None, overwrite: bool, output: t.Literal["json", "pretty", "porcelain"]):
"""Package a given models into a Bento.
$ openllm bundle flan-t5
@@ -663,9 +668,5 @@ def download_models(model_name: str, pretrained: str | None, output: t.Literal["
return m
if psutil.WINDOWS:
sys.stdout.reconfigure(encoding="utf-8") # type: ignore
if __name__ == "__main__":
cli()

File 3 of 3

@@ -34,6 +34,11 @@ if t.TYPE_CHECKING:
from fs.base import FS
class ModifyNodeProtocol(t.Protocol):
@t.overload
def __call__(self, node: Node, model_name: str) -> None:
...
@t.overload
def __call__(self, node: Node, *args: t.Any, **attrs: t.Any) -> None:
...
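
For context, ModifyNodeProtocol only pins down the call shapes a node-modifier hook must accept; any callable matching the broader overload satisfies it. A minimal sketch of a conforming hook, assuming typing is imported as t and Node comes from the same module (the body and the commented-out attribute access are hypothetical):

    def tag_node_with_model(node: Node, *args: t.Any, **attrs: t.Any) -> None:
        # Accepts both the (node, model_name) form and the generic (*args, **attrs) form.
        model_name = args[0] if args else attrs.get("model_name")
        # Mutate `node` here as needed, e.g. record the model name on it:
        # node.meta["model_name"] = model_name  # hypothetical attribute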