feat(contrib): ClojureScript UI (#89)

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com>
2026-05-19 14:16:22 -04:00 · 2023-08-16 09:30:44 +02:00
parent 4f740fc662
commit 4cad367ab5
78 changed files with 6531 additions and 40 deletions
--- a/openllm-python/ADDING_NEW_MODEL.md
+++ b/openllm-python/ADDING_NEW_MODEL.md
@@ -0,0 +1,67 @@
+# Adding a New Model
+
+OpenLLM encourages contributions by welcoming users to incorporate their custom
+Large Language Models (LLMs) into the ecosystem. You can set up your development
+environment by referring to our
+[Developer Guide](https://github.com/bentoml/OpenLLM/blob/main/DEVELOPMENT.md).
+
+## Procedure
+
+All the relevant code for incorporating a new model resides within
+[`src/openllm/models`](./src/openllm/models/), under a `model_name` directory named in snake_case.
+Here's your roadmap:
+
+- [ ] Generate model configuration file:
+      `src/openllm/models/{model_name}/configuration_{model_name}.py`
+- [ ] Establish model implementation files:
+      `src/openllm/models/{model_name}/modeling_{runtime}_{model_name}.py`
+- [ ] Create module's `__init__.py`:
+      `src/openllm/models/{model_name}/__init__.py`
+- [ ] Adjust the entrypoints for files at `src/openllm/models/auto/*`. If it is a
+      new runtime, then add a `src/openllm/models/auto/modeling_{runtime}_auto.py`.
+      See the other auto runtime files for examples.
+- [ ] Modify the main `__init__.py`: `src/openllm/models/__init__.py`
+- [ ] Run the following to update stubs: `hatch run check-stubs`
+
+For a working example, check out any existing model.
+
+### Model Configuration
+
+File Name: `configuration_{model_name}.py`
+
+This file is dedicated to specifying docstrings, default prompt templates,
+default parameters, as well as additional fields for the models.
+
+### Model Implementation
+
+File Name: `modeling_{runtime}_{model_name}.py`
+
+For each runtime, i.e., PyTorch (the default, with no prefix), TensorFlow (`tf`),
+and Flax (`flax`), it is necessary to implement a class that adheres to the
+`openllm.LLM` interface. The conventional class name follows the
+`RuntimeModelName` pattern, e.g., `FlaxFlanT5`.
+
+### Initialization Files
+
+The `__init__.py` files facilitate intelligent imports, type checking, and
+auto-completions for the OpenLLM codebase and CLIs.
+
+### Entrypoint
+
+After establishing the model config and implementation class, register them in
+the `auto` folder files. There are four entrypoint files:
+
+- `configuration_auto.py`: Registers `ModelConfig` classes
+- `modeling_auto.py`: Registers a model's PyTorch implementation
+- `modeling_tf_auto.py`: Registers a model's TensorFlow implementation
+- `modeling_flax_auto.py`: Registers a model's Flax implementation
+
+### Updating README.md
+
+Run `./tools/update-readme.py` to update the README.md file with the new model.
+
+## Raise a Pull Request
+
+Once you have completed the checklist above, raise a PR and the OpenLLM
+maintainers will review it ASAP. Once the PR is merged, you should be able to see
+your model in the next release! 🎉 🎊
--- a/openllm-python/src/openllm/cli/_factory.py
+++ b/openllm-python/src/openllm/cli/_factory.py
@@ -1,8 +1,9 @@
 from __future__ import annotations
-import functools, importlib.util, os, typing as t
+import functools, importlib.util, os, typing as t, logging
 import click, click_option_group as cog, inflection, orjson, bentoml, openllm
 from bentoml_cli.utils import BentoMLCommandGroup
 from click.shell_completion import CompletionItem
+from openllm.utils import DEBUG
 from bentoml._internal.configuration.containers import BentoMLContainer
 from openllm._typing_compat import LiteralString, DictStrAny, ParamSpec, Concatenate
 from . import termui
@@ -11,21 +12,27 @@ if t.TYPE_CHECKING:
  import subprocess
  from openllm._configuration import LLMConfig

+logger = logging.getLogger(__name__)
+
 P = ParamSpec("P")
 LiteralOutput = t.Literal["json", "pretty", "porcelain"]

 _AnyCallable = t.Callable[..., t.Any]
 FC = t.TypeVar("FC", bound=t.Union[_AnyCallable, click.Command])

-def parse_config_options(config: LLMConfig, server_timeout: int, workers_per_resource: float, device: t.Tuple[str, ...] | None, environ: DictStrAny,) -> DictStrAny:
+def parse_config_options(config: LLMConfig, server_timeout: int, workers_per_resource: float, device: t.Tuple[str, ...] | None, cors: bool, environ: DictStrAny) -> DictStrAny:
  # TODO: Support amd.com/gpu on k8s
  _bentoml_config_options_env = environ.pop("BENTOML_CONFIG_OPTIONS", "")
  _bentoml_config_options_opts = ["tracing.sample_rate=1.0", f"api_server.traffic.timeout={server_timeout}", f'runners."llm-{config["start_name"]}-runner".traffic.timeout={config["timeout"]}', f'runners."llm-{config["start_name"]}-runner".workers_per_resource={workers_per_resource}']
  if device:
    if len(device) > 1: _bentoml_config_options_opts.extend([f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"[{idx}]={dev}' for idx, dev in enumerate(device)])
    else: _bentoml_config_options_opts.append(f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"=[{device[0]}]')
+  if cors:
+    _bentoml_config_options_opts.extend(["api_server.http.cors.enabled=true", 'api_server.http.cors.access_control_allow_origins="*"'])
+    _bentoml_config_options_opts.extend([f'api_server.http.cors.access_control_allow_methods[{idx}]="{it}"' for idx, it in enumerate(["GET", "OPTIONS", "POST", "HEAD", "PUT"])])
-  _bentoml_config_options_env += " " if _bentoml_config_options_env else "" + " ".join(_bentoml_config_options_opts)
+  _bentoml_config_options_env += (" " if _bentoml_config_options_env else "") + " ".join(_bentoml_config_options_opts)  # NOTE: parenthesised so the conditional only picks the separator; unparenthesised, a pre-set BENTOML_CONFIG_OPTIONS got just a trailing space and every option built above was dropped
  environ["BENTOML_CONFIG_OPTIONS"] = _bentoml_config_options_env
+  if DEBUG: logger.debug("Setting BENTOML_CONFIG_OPTIONS=%s", _bentoml_config_options_env)
  return environ

 _adapter_mapping_key = "adapter_map"
@@ -89,7 +96,7 @@ Available official model_id(s): [default: {llm_config['default_id']}]
  @click.pass_context
  def start_cmd(
      ctx: click.Context, /, server_timeout: int, model_id: str | None, model_version: str | None, workers_per_resource: t.Literal["conserved", "round_robin"] | LiteralString, device: t.Tuple[str, ...], quantize: t.Literal["int8", "int4", "gptq"] | None, bettertransformer: bool | None, runtime: t.Literal["ggml", "transformers"], fast: bool,
-      serialisation_format: t.Literal["safetensors", "legacy"], adapter_id: str | None, return_process: bool, **attrs: t.Any,
+      serialisation_format: t.Literal["safetensors", "legacy"], cors: bool, adapter_id: str | None, return_process: bool, **attrs: t.Any,
  ) -> LLMConfig | subprocess.Popen[bytes]:
    fast = str(fast).upper() in openllm.utils.ENV_VARS_TRUE_VALUES
    if serialisation_format == "safetensors" and quantize is not None and os.environ.get("OPENLLM_SERIALIZATION_WARNING", str(True)).upper() in openllm.utils.ENV_VARS_TRUE_VALUES:
@@ -124,7 +131,7 @@ Available official model_id(s): [default: {llm_config['default_id']}]

    # NOTE: This is to set current configuration
    start_env = os.environ.copy()
-    start_env = parse_config_options(config, server_timeout, wpr, device, start_env)
+    start_env = parse_config_options(config, server_timeout, wpr, device, cors, start_env)
    if fast: termui.echo(f"Fast mode is enabled. Make sure the model is available in local store before 'start': 'openllm import {model}{' --model-id ' + model_id if model_id else ''}'", fg="yellow")

    start_env.update({"OPENLLM_MODEL": model, "BENTOML_DEBUG": str(openllm.utils.get_debug_mode()), "BENTOML_HOME": os.environ.get("BENTOML_HOME", BentoMLContainer.bentoml_home.get()), "OPENLLM_ADAPTER_MAP": orjson.dumps(adapter_map).decode(), "OPENLLM_SERIALIZATION": serialisation_format, env.runtime: env["runtime_value"], env.framework: env["framework_value"]})
@@ -193,6 +200,7 @@ def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callab
          model_version_option(factory=cog.optgroup),
          cog.optgroup.option("--server-timeout", type=int, default=None, help="Server timeout in seconds"),
          workers_per_resource_option(factory=cog.optgroup),
+          cors_option(factory=cog.optgroup),
          fast_option(factory=cog.optgroup),
          cog.optgroup.group(
              "LLM Optimization Options", help="""Optimization related options.
@@ -303,11 +311,11 @@ def fast_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC
                                                                                                          This is useful if you already downloaded or setup the model beforehand.
                                                                                                          """, **attrs
  )(f)
+def cors_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]: return cli_option("--cors/--no-cors", show_default=True, default=False, envvar="OPENLLM_CORS", show_envvar=True, help="Enable CORS for the server.", **attrs)(f)
 def machine_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]: return cli_option("--machine", is_flag=True, default=False, hidden=True, **attrs)(f)
 def model_id_option(f: _AnyCallable | None = None, *, model_env: openllm.utils.EnvVarMixin | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]: return cli_option("--model-id", type=click.STRING, default=None, envvar=model_env.model_id if model_env is not None else None, show_envvar=model_env is not None, help="Optional model_id name or path for (fine-tune) weight.", **attrs)(f)
 def model_version_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]: return cli_option("--model-version", type=click.STRING, default=None, help="Optional model version to save for this model. It will be inferred automatically from model-id.", **attrs)(f)
 def model_name_argument(f: _AnyCallable | None = None, required: bool = True) -> t.Callable[[FC], FC]: return cli_argument("model_name", type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING]), required=required)(f)
-
 def quantize_option(f: _AnyCallable | None = None, *, build: bool = False, model_env: openllm.utils.EnvVarMixin | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
  return cli_option(
      "--quantise", "--quantize", "quantize", type=click.Choice(["int8", "int4", "gptq"]), default=None, envvar=model_env.quantize if model_env is not None else None, show_envvar=model_env is not None, help="""Dynamic quantization for running this LLM.
@@ -327,7 +335,6 @@ def quantize_option(f: _AnyCallable | None = None, *, build: bool = False, model
      ) + """
                                                                                                                                                                            > [!NOTE] that quantization are currently only available in *PyTorch* models.""", **attrs
  )(f)
-
 def workers_per_resource_option(f: _AnyCallable | None = None, *, build: bool = False, **attrs: t.Any) -> t.Callable[[FC], FC]:
  return cli_option(
      "--workers-per-resource", default=None, callback=workers_per_resource_callback, type=str, required=False, help="""Number of workers per resource assigned.
@@ -347,12 +354,10 @@ def workers_per_resource_option(f: _AnyCallable | None = None, *, build: bool =
                                                                                                                                                  > ensure it has the same effect with 'openllm start --workers ...'""" if build else ""
      ), **attrs
  )(f)
-
 def bettertransformer_option(f: _AnyCallable | None = None, *, build: bool = False, model_env: openllm.utils.EnvVarMixin | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
  return cli_option(
      "--bettertransformer", is_flag=True, default=None, envvar=model_env.bettertransformer if model_env is not None else None, show_envvar=model_env is not None, help="Apply FasterTransformer wrapper to serve model. This will applies during serving time." if not build else "Set default environment variable whether to serve this model with FasterTransformer in build time.", **attrs
  )(f)
-
 def serialisation_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
  return cli_option(
      "--serialisation", "--serialization", "serialisation_format", type=click.Choice(["safetensors", "legacy"]), default="safetensors", show_default=True, show_envvar=True, envvar="OPENLLM_SERIALIZATION", help="""Serialisation format for save/load LLM.
@@ -374,7 +379,6 @@ def serialisation_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Cal
                                                                                                                  > [!NOTE] that GGML format is working in progress.
                                                                                                                  """, **attrs
  )(f)
-
 def container_registry_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
  return cli_option(
      "--container-registry", "container_registry", type=str, default="ecr", show_default=True, show_envvar=True, envvar="OPENLLM_CONTAINER_REGISTRY", callback=container_registry_callback, help="""The default container registry to get the base image for building BentoLLM.
@@ -383,7 +387,7 @@ def container_registry_option(f: _AnyCallable | None = None, **attrs: t.Any) ->

                                                                                                                        \b
                                                                                                                        > [!NOTE] that in order to build the base image, you will need a GPUs to compile custom kernel. See ``openllm ext build-base-container`` for more information.
-                                                                                                                        """
+                                                                                                                        """, **attrs
  )(f)

 _wpr_strategies = {"round_robin", "conserved"}
--- a/openllm-python/src/openllm/cli/_sdk.py
+++ b/openllm-python/src/openllm/cli/_sdk.py
@@ -15,7 +15,8 @@ if t.TYPE_CHECKING:

 logger = logging.getLogger(__name__)

-def _start(model_name: str, /, *, model_id: str | None = None, timeout: int = 30, workers_per_resource: t.Literal["conserved", "round_robin"] | float | None = None, device: tuple[str, ...] | t.Literal["all"] | None = None, quantize: t.Literal["int8", "int4", "gptq"] | None = None, bettertransformer: bool | None = None, runtime: t.Literal["ggml", "transformers"] = "transformers", fast: bool = False, adapter_map: dict[LiteralString, str | None] | None = None, framework: LiteralRuntime | None = None, additional_args: list[str] | None = None, _serve_grpc: bool = False, __test__: bool = False, **_: t.Any) -> LLMConfig | subprocess.Popen[bytes]:
+def _start(model_name: str, /, *, model_id: str | None = None, timeout: int = 30, workers_per_resource: t.Literal["conserved", "round_robin"] | float | None = None, device: tuple[str, ...] | t.Literal["all"] | None = None, quantize: t.Literal["int8", "int4", "gptq"] | None = None, bettertransformer: bool | None = None, runtime: t.Literal["ggml", "transformers"] = "transformers",
+            adapter_map: dict[LiteralString, str | None] | None = None, framework: LiteralRuntime | None = None, additional_args: list[str] | None = None, cors: bool = False, _serve_grpc: bool = False, __test__: bool = False, **_: t.Any) -> LLMConfig | subprocess.Popen[bytes]:
  """Python API to start a LLM server. These provides one-to-one mapping to CLI arguments.

  For all additional arguments, pass it as string to ``additional_args``. For example, if you want to
@@ -50,13 +51,12 @@ def _start(model_name: str, /, *, model_id: str | None = None, timeout: int = 30
                - gptq: Quantize the model with GPTQ (auto-gptq required)
      bettertransformer: Convert given model to FastTransformer with PyTorch.
      runtime: The runtime to use for this LLM. By default, this is set to ``transformers``. In the future, this will include supports for GGML.
-      fast: Enable fast mode. This will skip downloading models, and will raise errors if given model_id does not exists under local store.
+      cors: Whether to enable CORS for this LLM. By default, this is set to ``False``.
      adapter_map: The adapter mapping of LoRA to use for this LLM. It accepts a dictionary of ``{adapter_id: adapter_name}``.
      framework: The framework to use for this LLM. By default, this is set to ``pt``.
      additional_args: Additional arguments to pass to ``openllm start``.
  """
  from .entrypoint import start_command, start_grpc_command
-  fast = os.environ.get("OPENLLM_FAST", str(fast)).upper() in openllm.utils.ENV_VARS_TRUE_VALUES
  llm_config = openllm.AutoConfig.for_model(model_name)
  _ModelEnv = openllm.utils.EnvVarMixin(model_name, openllm.utils.first_not_none(framework, default=llm_config.default_implementation()), model_id=model_id, bettertransformer=bettertransformer, quantize=quantize, runtime=runtime)
  os.environ[_ModelEnv.framework] = _ModelEnv["framework_value"]
@@ -69,7 +69,7 @@ def _start(model_name: str, /, *, model_id: str | None = None, timeout: int = 30
  if quantize and bettertransformer: raise OpenLLMException("'quantize' and 'bettertransformer' are currently mutually exclusive.")
  if quantize: args.extend(["--quantize", str(quantize)])
  elif bettertransformer: args.append("--bettertransformer")
-  if fast: args.append("--fast")
+  if cors: args.append("--cors")
  if adapter_map: args.extend(list(itertools.chain.from_iterable([["--adapter-id", f"{k}{':'+v if v else ''}"] for k, v in adapter_map.items()])))
  if additional_args: args.extend(additional_args)
  if __test__: args.append("--return-process")