feat(contrib): ClojureScript UI (#89)

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com>
This commit is contained in:
GutZuFusss
2023-08-16 09:30:44 +02:00
committed by GitHub
parent 4f740fc662
commit 4cad367ab5
78 changed files with 6531 additions and 40 deletions

View File

@@ -0,0 +1,67 @@
# Adding a New Model
OpenLLM encourages contributions by welcoming users to incorporate their custom
Large Language Models (LLMs) into the ecosystem. You can set up your development
environment by referring to our
[Developer Guide](https://github.com/bentoml/OpenLLM/blob/main/DEVELOPMENT.md).
## Procedure
All the relevant code for incorporating a new model resides within
[`src/openllm/models`](./src/openllm/models/) `model_name` in snake_case.
Here's your roadmap:
- [ ] Generate model configuration file:
`src/openllm/models/{model_name}/configuration_{model_name}.py`
- [ ] Establish model implementation files:
`src/openllm/models/{model_name}/modeling_{runtime}_{model_name}.py`
- [ ] Create module's `__init__.py`:
`src/openllm/models/{model_name}/__init__.py`
- [ ] Adjust the entrypoints for files at `src/openllm/models/auto/*` If it is a
new runtime, then add it a `src/openllm/models/auto/modeling_{runtime}_auto.py`.
See the other auto runtime for example.
- [ ] Modify the main `__init__.py`: `src/openllm/models/__init__.py`
- [ ] Run the following to update stubs: `hatch run check-stubs`
For a working example, check out any existing model.
### Model Configuration
File Name: `configuration_{model_name}.py`
This file is dedicated to specifying docstrings, default prompt templates,
default parameters, as well as additional fields for the models.
### Model Implementation
File Name: `modeling_{runtime}_{model_name}.py`
For each runtime, i.e., torch (default with no prefix), TensorFlow -`tf`, Flax -
`flax`, it is necessary to implement a class that adheres to the `openllm.LLM`
interface. The conventional class name follows the `RuntimeModelName` pattern,
e.g., `FlaxFlanT5`.
### Initialization Files
The `__init__.py` files facilitate intelligent imports, type checking, and
auto-completions for the OpenLLM codebase and CLIs.
### Entrypoint
After establishing the model config and implementation class, register them in
the `auto` folder files. There are four entrypoint files:
- `configuration_auto.py`: Registers `ModelConfig` classes
- `modeling_auto.py`: Registers a model's PyTorch implementation
- `modeling_tf_auto.py`: Registers a model's TensorFlow implementation
- `modeling_flax_auto.py`: Registers a model's Flax implementation
### Updating README.md
Run `./tools/update-readme.py` to update the README.md file with the new model.
## Raise a Pull Request
Once you have completed the checklist above, raise a PR and the OpenLLMs
maintainer will review it ASAP. Once the PR is merged, you should be able to see
your model in the next release! 🎉 🎊

View File

@@ -1,8 +1,9 @@
from __future__ import annotations
import functools, importlib.util, os, typing as t
import functools, importlib.util, os, typing as t, logging
import click, click_option_group as cog, inflection, orjson, bentoml, openllm
from bentoml_cli.utils import BentoMLCommandGroup
from click.shell_completion import CompletionItem
from openllm.utils import DEBUG
from bentoml._internal.configuration.containers import BentoMLContainer
from openllm._typing_compat import LiteralString, DictStrAny, ParamSpec, Concatenate
from . import termui
@@ -11,21 +12,27 @@ if t.TYPE_CHECKING:
import subprocess
from openllm._configuration import LLMConfig
logger = logging.getLogger(__name__)
P = ParamSpec("P")
LiteralOutput = t.Literal["json", "pretty", "porcelain"]
_AnyCallable = t.Callable[..., t.Any]
FC = t.TypeVar("FC", bound=t.Union[_AnyCallable, click.Command])
def parse_config_options(config: LLMConfig, server_timeout: int, workers_per_resource: float, device: t.Tuple[str, ...] | None, environ: DictStrAny,) -> DictStrAny:
def parse_config_options(config: LLMConfig, server_timeout: int, workers_per_resource: float, device: t.Tuple[str, ...] | None, cors: bool, environ: DictStrAny) -> DictStrAny:
# TODO: Support amd.com/gpu on k8s
_bentoml_config_options_env = environ.pop("BENTOML_CONFIG_OPTIONS", "")
_bentoml_config_options_opts = ["tracing.sample_rate=1.0", f"api_server.traffic.timeout={server_timeout}", f'runners."llm-{config["start_name"]}-runner".traffic.timeout={config["timeout"]}', f'runners."llm-{config["start_name"]}-runner".workers_per_resource={workers_per_resource}']
if device:
if len(device) > 1: _bentoml_config_options_opts.extend([f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"[{idx}]={dev}' for idx, dev in enumerate(device)])
else: _bentoml_config_options_opts.append(f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"=[{device[0]}]')
if cors:
_bentoml_config_options_opts.extend(["api_server.http.cors.enabled=true", 'api_server.http.cors.access_control_allow_origins="*"'])
_bentoml_config_options_opts.extend([f'api_server.http.cors.access_control_allow_methods[{idx}]="{it}"' for idx, it in enumerate(["GET", "OPTIONS", "POST", "HEAD", "PUT"])])
_bentoml_config_options_env += " " if _bentoml_config_options_env else "" + " ".join(_bentoml_config_options_opts)
environ["BENTOML_CONFIG_OPTIONS"] = _bentoml_config_options_env
if DEBUG: logger.debug("Setting BENTOML_CONFIG_OPTIONS=%s", _bentoml_config_options_env)
return environ
_adapter_mapping_key = "adapter_map"
@@ -89,7 +96,7 @@ Available official model_id(s): [default: {llm_config['default_id']}]
@click.pass_context
def start_cmd(
ctx: click.Context, /, server_timeout: int, model_id: str | None, model_version: str | None, workers_per_resource: t.Literal["conserved", "round_robin"] | LiteralString, device: t.Tuple[str, ...], quantize: t.Literal["int8", "int4", "gptq"] | None, bettertransformer: bool | None, runtime: t.Literal["ggml", "transformers"], fast: bool,
serialisation_format: t.Literal["safetensors", "legacy"], adapter_id: str | None, return_process: bool, **attrs: t.Any,
serialisation_format: t.Literal["safetensors", "legacy"], cors: bool, adapter_id: str | None, return_process: bool, **attrs: t.Any,
) -> LLMConfig | subprocess.Popen[bytes]:
fast = str(fast).upper() in openllm.utils.ENV_VARS_TRUE_VALUES
if serialisation_format == "safetensors" and quantize is not None and os.environ.get("OPENLLM_SERIALIZATION_WARNING", str(True)).upper() in openllm.utils.ENV_VARS_TRUE_VALUES:
@@ -124,7 +131,7 @@ Available official model_id(s): [default: {llm_config['default_id']}]
# NOTE: This is to set current configuration
start_env = os.environ.copy()
start_env = parse_config_options(config, server_timeout, wpr, device, start_env)
start_env = parse_config_options(config, server_timeout, wpr, device, cors, start_env)
if fast: termui.echo(f"Fast mode is enabled. Make sure the model is available in local store before 'start': 'openllm import {model}{' --model-id ' + model_id if model_id else ''}'", fg="yellow")
start_env.update({"OPENLLM_MODEL": model, "BENTOML_DEBUG": str(openllm.utils.get_debug_mode()), "BENTOML_HOME": os.environ.get("BENTOML_HOME", BentoMLContainer.bentoml_home.get()), "OPENLLM_ADAPTER_MAP": orjson.dumps(adapter_map).decode(), "OPENLLM_SERIALIZATION": serialisation_format, env.runtime: env["runtime_value"], env.framework: env["framework_value"]})
@@ -193,6 +200,7 @@ def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callab
model_version_option(factory=cog.optgroup),
cog.optgroup.option("--server-timeout", type=int, default=None, help="Server timeout in seconds"),
workers_per_resource_option(factory=cog.optgroup),
cors_option(factory=cog.optgroup),
fast_option(factory=cog.optgroup),
cog.optgroup.group(
"LLM Optimization Options", help="""Optimization related options.
@@ -303,11 +311,11 @@ def fast_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC
This is useful if you already downloaded or setup the model beforehand.
""", **attrs
)(f)
def cors_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]: return cli_option("--cors/--no-cors", show_default=True, default=False, envvar="OPENLLM_CORS", show_envvar=True, help="Enable CORS for the server.", **attrs)(f)
def machine_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]: return cli_option("--machine", is_flag=True, default=False, hidden=True, **attrs)(f)
def model_id_option(f: _AnyCallable | None = None, *, model_env: openllm.utils.EnvVarMixin | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]: return cli_option("--model-id", type=click.STRING, default=None, envvar=model_env.model_id if model_env is not None else None, show_envvar=model_env is not None, help="Optional model_id name or path for (fine-tune) weight.", **attrs)(f)
def model_version_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]: return cli_option("--model-version", type=click.STRING, default=None, help="Optional model version to save for this model. It will be inferred automatically from model-id.", **attrs)(f)
def model_name_argument(f: _AnyCallable | None = None, required: bool = True) -> t.Callable[[FC], FC]: return cli_argument("model_name", type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING]), required=required)(f)
def quantize_option(f: _AnyCallable | None = None, *, build: bool = False, model_env: openllm.utils.EnvVarMixin | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
return cli_option(
"--quantise", "--quantize", "quantize", type=click.Choice(["int8", "int4", "gptq"]), default=None, envvar=model_env.quantize if model_env is not None else None, show_envvar=model_env is not None, help="""Dynamic quantization for running this LLM.
@@ -327,7 +335,6 @@ def quantize_option(f: _AnyCallable | None = None, *, build: bool = False, model
) + """
> [!NOTE] that quantization are currently only available in *PyTorch* models.""", **attrs
)(f)
def workers_per_resource_option(f: _AnyCallable | None = None, *, build: bool = False, **attrs: t.Any) -> t.Callable[[FC], FC]:
return cli_option(
"--workers-per-resource", default=None, callback=workers_per_resource_callback, type=str, required=False, help="""Number of workers per resource assigned.
@@ -347,12 +354,10 @@ def workers_per_resource_option(f: _AnyCallable | None = None, *, build: bool =
> ensure it has the same effect with 'openllm start --workers ...'""" if build else ""
), **attrs
)(f)
def bettertransformer_option(f: _AnyCallable | None = None, *, build: bool = False, model_env: openllm.utils.EnvVarMixin | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
return cli_option(
"--bettertransformer", is_flag=True, default=None, envvar=model_env.bettertransformer if model_env is not None else None, show_envvar=model_env is not None, help="Apply FasterTransformer wrapper to serve model. This will applies during serving time." if not build else "Set default environment variable whether to serve this model with FasterTransformer in build time.", **attrs
)(f)
def serialisation_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
return cli_option(
"--serialisation", "--serialization", "serialisation_format", type=click.Choice(["safetensors", "legacy"]), default="safetensors", show_default=True, show_envvar=True, envvar="OPENLLM_SERIALIZATION", help="""Serialisation format for save/load LLM.
@@ -374,7 +379,6 @@ def serialisation_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Cal
> [!NOTE] that GGML format is working in progress.
""", **attrs
)(f)
def container_registry_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
return cli_option(
"--container-registry", "container_registry", type=str, default="ecr", show_default=True, show_envvar=True, envvar="OPENLLM_CONTAINER_REGISTRY", callback=container_registry_callback, help="""The default container registry to get the base image for building BentoLLM.
@@ -383,7 +387,7 @@ def container_registry_option(f: _AnyCallable | None = None, **attrs: t.Any) ->
\b
> [!NOTE] that in order to build the base image, you will need a GPUs to compile custom kernel. See ``openllm ext build-base-container`` for more information.
"""
""", **attrs
)(f)
_wpr_strategies = {"round_robin", "conserved"}

View File

@@ -15,7 +15,8 @@ if t.TYPE_CHECKING:
logger = logging.getLogger(__name__)
def _start(model_name: str, /, *, model_id: str | None = None, timeout: int = 30, workers_per_resource: t.Literal["conserved", "round_robin"] | float | None = None, device: tuple[str, ...] | t.Literal["all"] | None = None, quantize: t.Literal["int8", "int4", "gptq"] | None = None, bettertransformer: bool | None = None, runtime: t.Literal["ggml", "transformers"] = "transformers", fast: bool = False, adapter_map: dict[LiteralString, str | None] | None = None, framework: LiteralRuntime | None = None, additional_args: list[str] | None = None, _serve_grpc: bool = False, __test__: bool = False, **_: t.Any) -> LLMConfig | subprocess.Popen[bytes]:
def _start(model_name: str, /, *, model_id: str | None = None, timeout: int = 30, workers_per_resource: t.Literal["conserved", "round_robin"] | float | None = None, device: tuple[str, ...] | t.Literal["all"] | None = None, quantize: t.Literal["int8", "int4", "gptq"] | None = None, bettertransformer: bool | None = None, runtime: t.Literal["ggml", "transformers"] = "transformers",
adapter_map: dict[LiteralString, str | None] | None = None, framework: LiteralRuntime | None = None, additional_args: list[str] | None = None, cors: bool = False, _serve_grpc: bool = False, __test__: bool = False, **_: t.Any) -> LLMConfig | subprocess.Popen[bytes]:
"""Python API to start a LLM server. These provides one-to-one mapping to CLI arguments.
For all additional arguments, pass it as string to ``additional_args``. For example, if you want to
@@ -50,13 +51,12 @@ def _start(model_name: str, /, *, model_id: str | None = None, timeout: int = 30
- gptq: Quantize the model with GPTQ (auto-gptq required)
bettertransformer: Convert given model to FastTransformer with PyTorch.
runtime: The runtime to use for this LLM. By default, this is set to ``transformers``. In the future, this will include supports for GGML.
fast: Enable fast mode. This will skip downloading models, and will raise errors if given model_id does not exists under local store.
cors: Whether to enable CORS for this LLM. By default, this is set to ``False``.
adapter_map: The adapter mapping of LoRA to use for this LLM. It accepts a dictionary of ``{adapter_id: adapter_name}``.
framework: The framework to use for this LLM. By default, this is set to ``pt``.
additional_args: Additional arguments to pass to ``openllm start``.
"""
from .entrypoint import start_command, start_grpc_command
fast = os.environ.get("OPENLLM_FAST", str(fast)).upper() in openllm.utils.ENV_VARS_TRUE_VALUES
llm_config = openllm.AutoConfig.for_model(model_name)
_ModelEnv = openllm.utils.EnvVarMixin(model_name, openllm.utils.first_not_none(framework, default=llm_config.default_implementation()), model_id=model_id, bettertransformer=bettertransformer, quantize=quantize, runtime=runtime)
os.environ[_ModelEnv.framework] = _ModelEnv["framework_value"]
@@ -69,7 +69,7 @@ def _start(model_name: str, /, *, model_id: str | None = None, timeout: int = 30
if quantize and bettertransformer: raise OpenLLMException("'quantize' and 'bettertransformer' are currently mutually exclusive.")
if quantize: args.extend(["--quantize", str(quantize)])
elif bettertransformer: args.append("--bettertransformer")
if fast: args.append("--fast")
if cors: args.append("--cors")
if adapter_map: args.extend(list(itertools.chain.from_iterable([["--adapter-id", f"{k}{':'+v if v else ''}"] for k, v in adapter_map.items()])))
if additional_args: args.extend(additional_args)
if __test__: args.append("--return-process")