diff --git a/README.md b/README.md
index 4c4d026c..27e71ca5 100644
--- a/README.md
+++ b/README.md
@@ -135,7 +135,8 @@ specify different variants of the model to be served, by providing the
openllm start flan-t5 --model-id google/flan-t5-large
```
-> **Note** that `openllm` also supports all variants of fine-tuning weights,
+> [!NOTE]
+> `openllm` also supports all variants of fine-tuning weights,
> custom model path as well as quantized weights for any of the supported models
> as long as it can be loaded with the model architecture. Refer to
> [supported models](https://github.com/bentoml/OpenLLM/tree/main#-supported-models)
@@ -417,7 +418,8 @@ For example, if you want to use the Tensorflow (`tf`) implementation for the
OPENLLM_FLAN_T5_FRAMEWORK=tf openllm start flan-t5
```
-> **Note** For GPU support on Flax, refers to
+> [!NOTE]
+> For GPU support on Flax, refer to
> [Jax's installation](https://github.com/google/jax#pip-installation-gpu-cuda-installed-via-pip-easier)
> to make sure that you have Jax support for the corresponding CUDA version.
@@ -437,7 +439,8 @@ To run inference with `gptq`, simply pass `--quantize gptq`:
openllm start falcon --model-id TheBloke/falcon-40b-instruct-GPTQ --quantize gptq --device 0
```
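For readers who prefer the Python API, a minimal sketch of the same GPTQ start call, assuming the `openllm.start` signature documented further down in this diff:

```python
import openllm

# Hedged sketch: Python-side counterpart of the CLI command above.
# Requires `pip install "openllm[gptq]"` and weights that are already GPTQ-quantized.
openllm.start("falcon", model_id="TheBloke/falcon-40b-instruct-GPTQ", quantize="gptq")
```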
-> **Note**: to run GPTQ, make sure to install with
+> [!NOTE]
+> In order to run GPTQ, make sure to install with
> `pip install "openllm[gptq]"`. The weights of all supported models should be
> quantized before serving. See
> [GPTQ-for-LLaMa](https://github.com/qwopqwop200/GPTQ-for-LLaMa) for more
@@ -482,7 +485,8 @@ To include this into the Bento, one can also provide a `--adapter-id` into
openllm build opt --model-id facebook/opt-6.7b --adapter-id ...
```
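Programmatically, the adapter can be supplied through `adapter_map` on the build API documented later in this diff; the adapter id below is a hypothetical placeholder rather than a real repository:

```python
import openllm

# Hypothetical adapter id, for illustration only; keys are adapter ids, values are adapter names.
openllm.build("opt", model_id="facebook/opt-6.7b", adapter_map={"my-org/opt-6.7b-lora": "default"})
```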
-> **Note**: We will gradually roll out support for fine-tuning all models. The
+> [!NOTE]
+> We will gradually roll out support for fine-tuning all models. The
> following models contain fine-tuning support: OPT, Falcon, LlaMA.
### Integrating a New Model
@@ -527,8 +531,8 @@ client = openllm.client.HTTPClient("http://localhost:3000")
client.embed("I like to eat apples")
```
-> **Note**: Currently, the following model framily supports embeddings: Llama,
-> T5 (Flan-T5, FastChat, etc.), ChatGLM
+> [!NOTE]
+> Currently, the following model families support embeddings: Llama, T5 (Flan-T5, FastChat, etc.), ChatGLM
## ⚙️ Integrations
@@ -606,7 +610,8 @@ def chat(input_text: str):
return agent.run(input_text)
```
-> **Note** You can find out more examples under the
+> [!NOTE]
+> You can find more examples under the
> [examples](https://github.com/bentoml/OpenLLM/tree/main/examples) folder.
### Transformers Agents
@@ -614,7 +619,8 @@ def chat(input_text: str):
OpenLLM seamlessly integrates with
[Transformers Agents](https://huggingface.co/docs/transformers/transformers_agents).
-> **Warning** The Transformers Agent is still at an experimental stage. It is
+> [!WARNING]
+> The Transformers Agent is still at an experimental stage. It is
> recommended to install OpenLLM with `pip install -r nightly-requirements.txt`
> to get the latest API update for HuggingFace agent.
@@ -626,7 +632,8 @@ agent = transformers.HfAgent("http://localhost:3000/hf/agent") # URL that runs
agent.run("Is the following `text` positive or negative?", text="I don't like how this models is generate inputs")
```
-> **Note** Only `starcoder` is currently supported with Agent integration. The
+> [!IMPORTANT]
+> Only `starcoder` is currently supported with Agent integration. The
> example above was also run with four T4s on EC2 `g4dn.12xlarge`
If you want to use OpenLLM client to ask questions to the running agent, you can
@@ -646,6 +653,7 @@ client.ask_agent(

+
@@ -691,7 +699,8 @@ serverless cloud for shipping and scaling AI applications.
bentoml cloud login --api-token --endpoint
```
-> **Note**: Replace `` and `` with your
+> [!NOTE]
+> Replace `<your-api-token>` and `<bento-cloud-endpoint>` with your
> specific API token and the BentoCloud endpoint respectively.
3. **Bulding a Bento**: With OpenLLM, you can easily build a Bento for a
diff --git a/hatch.toml b/hatch.toml
index c4ad5170..bb7459b7 100644
--- a/hatch.toml
+++ b/hatch.toml
@@ -11,7 +11,7 @@ text = """
"""
[[metadata.hooks.fancy-pypi-readme.fragments]]
-end-before = "\n\n"
+end-before = "\n"
path = "README.md"
start-after = "\n"
[[metadata.hooks.fancy-pypi-readme.fragments]]
@@ -22,11 +22,12 @@ text = """
"""
[[metadata.hooks.fancy-pypi-readme.fragments]]
-end-before = "\n\n"
+end-before = "\n"
path = "README.md"
start-after = "\n"
[[metadata.hooks.fancy-pypi-readme.fragments]]
text = """
+
@@ -43,6 +44,7 @@ text = """
"""
[[tool.hatch.metadata.hooks.fancy-pypi-readme.fragments]]
path = "CHANGELOG.md"
+start-after = ""
pattern = "\n(###.+?\n)## "
[[metadata.hooks.fancy-pypi-readme.fragments]]
text = """
diff --git a/src/openllm/_configuration.py b/src/openllm/_configuration.py
index cbfb3f4e..5a575717 100644
--- a/src/openllm/_configuration.py
+++ b/src/openllm/_configuration.py
@@ -1423,8 +1423,8 @@ class LLMConfig(_ConfigAttr):
This can be used as a decorator for click commands.
- > **Note**: that the identifier for all LLMConfig will be prefixed with '_*', and the generation config
- will be prefixed with '_generation_*'.
+ > [!NOTE]
+ > The identifier for all LLMConfig will be prefixed with '_*', and the generation config will be prefixed with '_generation_*'.
"""
for name, field in attr.fields_dict(cls.__openllm_generation_class__).items():
ty = cls.__openllm_hints__.get(name)
diff --git a/src/openllm/_llm.py b/src/openllm/_llm.py
index 28a07b3d..173bd908 100644
--- a/src/openllm/_llm.py
+++ b/src/openllm/_llm.py
@@ -223,7 +223,8 @@ class LLMInterface(ABC, t.Generic[M, T]):
You can customize how the output of the LLM looks with this hook. By default, it is a simple echo.
- NOTE: this will be used from the client side.
+ > [!NOTE]
+ > This will be used from the client side.
"""
return generation_result
def llm_post_init(self) -> None:
@@ -271,7 +272,7 @@ class LLMInterface(ABC, t.Generic[M, T]):
- If `self.bettertransformer` is set within `llm_post_init`.
- Finally, if none of the above, default to self.config['bettertransformer']
- > **Note** that if LoRA is enabled, bettertransformer will be disabled.
+ > [!NOTE] If LoRA is enabled, bettertransformer will be disabled.
"""
device: "torch.device"
"""The device to be used for this LLM. If the implementation is 'pt', then it will be torch.device, else string."""
@@ -562,7 +563,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
Args:
model_id: The pretrained model to use. Defaults to None. If None, 'self.default_id' will be used.
- > **Warning**: If custom path is passed, make sure it contains all available file to construct
+ > [!WARNING] If a custom path is passed, make sure it contains all the files required to construct
> ``transformers.PretrainedConfig``, ``transformers.PreTrainedModel``, and ``transformers.PreTrainedTokenizer``.
model_name: Optional model name to be saved with this LLM. Default to None. It will be inferred automatically from model_id.
If model_id is a custom path, it will be the basename of the given path.
@@ -629,7 +630,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
If model_id contains the revision itself, then the same format above
If model_id is a path, then it will be -: if model_version is not passesd, otherwise -:
- **Note** here that the generated SHA1 for path cases is that it will be based on last modified time.
+ > [!NOTE] For path cases, the generated SHA1 will be based on the last modified time.
Args:
model_id: Model id for this given LLM. It can be pretrained weights URL, custom path.
@@ -664,14 +665,14 @@ class LLM(LLMInterface[M, T], ReprMixin):
):
"""Initialize the LLM with given pretrained model.
- > **Warning**
+ > [!WARNING]
> To initializing any LLM, you should use `openllm.AutoLLM` or `openllm.LLM.from_pretrained` instead.
> `__init__` initialization is only for internal use.
- Note:
- - *args to be passed to the model.
- - **attrs will first be parsed to the AutoConfig, then the rest will be parsed to the import_model
- - for tokenizer kwargs, it should be prefixed with _tokenizer_*
+ > [!NOTE]
+ > - *args will be passed to the model.
+ > - **attrs will first be parsed by the AutoConfig, and the rest will be passed to import_model.
+ > - tokenizer kwargs should be prefixed with _tokenizer_*.
For custom pretrained path, it is recommended to pass in 'model_version' alongside with the path
to ensure that it won't be loaded multiple times.
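A short construction sketch for the recommended entry points, assuming `openllm.AutoLLM.for_model` mirrors the `AutoConfig.for_model` factory used elsewhere in this diff; the tokenizer kwarg is illustrative and follows the `_tokenizer_*` prefix convention described above:

```python
import openllm

# Prefer the factory over calling `__init__` directly, per the warning above.
llm = openllm.AutoLLM.for_model(
    "flan-t5",
    model_id="google/flan-t5-large",
    _tokenizer_padding_side="left",  # tokenizer kwargs use the `_tokenizer_*` prefix
)
```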
@@ -925,10 +926,11 @@ class LLM(LLMInterface[M, T], ReprMixin):
Returns:
A generated LLMRunner for this LLM.
- > **Note**: There are some difference between bentoml.models.get().to_runner() and LLM.to_runner(): 'name'.
- - 'name': will be generated by OpenLLM, hence users don't shouldn't worry about this. The generated name will be 'llm--runner' (ex: llm-dolly-v2-runner, llm-chatglm-runner)
- - 'embedded': Will be disabled by default. There is no reason to run LLM in embedded mode.
- - 'method_configs': The method configs for the runner will be managed internally by OpenLLM.
+ > [!NOTE] There are some differences between bentoml.models.get().to_runner() and LLM.to_runner():
+ >
+ > - 'name': generated by OpenLLM, so users shouldn't need to worry about it. The generated name will be 'llm-<model_name>-runner' (ex: llm-dolly-v2-runner, llm-chatglm-runner)
+ > - 'embedded': Will be disabled by default. There is no reason to run LLM in embedded mode.
+ > - 'method_configs': The method configs for the runner will be managed internally by OpenLLM.
"""
models = models if models is not None else []
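For context, a sketch of how the generated runner is typically wired into a BentoML service; the service name is illustrative:

```python
import bentoml
import openllm

llm = openllm.AutoLLM.for_model("dolly-v2")
runner = llm.to_runner()  # named `llm-dolly-v2-runner`, embedded mode disabled by default
svc = bentoml.Service("llm-dolly-v2-service", runners=[runner])
```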
@@ -946,7 +948,8 @@ class LLM(LLMInterface[M, T], ReprMixin):
# NOTE: returning the two langchain API's to the runner
return llm_runner_class(self)(
llm_runnable_class(self, embeddings_sig, generate_sig, generate_iterator_sig), name=self.runner_name, embedded=False, models=models, max_batch_size=max_batch_size, max_latency_ms=max_latency_ms,
- method_configs=bentoml_cattr.unstructure({"embeddings": embeddings_sig, "__call__": generate_sig, "generate": generate_sig, "generate_one": generate_sig, "generate_iterator": generate_iterator_sig}), scheduling_strategy=scheduling_strategy,
+ method_configs=bentoml_cattr.unstructure({"embeddings": embeddings_sig, "__call__": generate_sig, "generate": generate_sig, "generate_one": generate_sig, "generate_iterator": generate_iterator_sig}),
+ scheduling_strategy=scheduling_strategy,
)
# NOTE: Scikit API
@@ -972,16 +975,11 @@ class LLM(LLMInterface[M, T], ReprMixin):
@overload
def Runner(model_name: str, *, model_id: str | None = None, model_version: str | None = ..., init_local: t.Literal[False, True] = ..., **attrs: t.Any) -> LLMRunner[t.Any, t.Any]: ...
@overload
-def Runner(
- model_name: str, *, model_id: str = ..., model_version: str | None = ..., models: list[bentoml.Model] | None = ..., max_batch_size: int | None = ..., max_latency_ms: int | None = ..., method_configs: dict[str, ModelSignatureDict | ModelSignature] | None = ..., embedded: t.Literal[True, False] = ..., scheduling_strategy: type[bentoml.Strategy] | None = ..., **attrs: t.Any
-) -> LLMRunner[t.Any, t.Any]: ...
+def Runner(model_name: str, *, model_id: str = ..., model_version: str | None = ..., models: list[bentoml.Model] | None = ..., max_batch_size: int | None = ..., max_latency_ms: int | None = ..., method_configs: dict[str, ModelSignatureDict | ModelSignature] | None = ..., embedded: t.Literal[True, False] = ..., scheduling_strategy: type[bentoml.Strategy] | None = ..., **attrs: t.Any) -> LLMRunner[t.Any, t.Any]: ...
@overload
def Runner(model_name: str, *, ensure_available: bool | None = None, init_local: bool = ..., implementation: LiteralRuntime | None = None, llm_config: LLMConfig | None = None, **attrs: t.Any) -> LLMRunner[t.Any, t.Any]: ...
@overload
-def Runner(
- model_name: str, *, model_id: str | None = ..., model_version: str | None = ..., llm_config: LLMConfig | None = ..., runtime: t.Literal["ggml", "transformers"] | None = ..., quantize: t.Literal["int8", "int4", "gptq"] | None = ..., bettertransformer: str | bool | None = ..., adapter_id: str | None = ..., adapter_name: str | None = ...,
- adapter_map: dict[str, str | None] | None = ..., quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None = None, serialisation: t.Literal["safetensors", "legacy"] = ..., **attrs: t.Any
-) -> LLMRunner[t.Any, t.Any]: ...
+def Runner(model_name: str, *, model_id: str | None = ..., model_version: str | None = ..., llm_config: LLMConfig | None = ..., runtime: t.Literal["ggml", "transformers"] | None = ..., quantize: t.Literal["int8", "int4", "gptq"] | None = ..., bettertransformer: str | bool | None = ..., adapter_id: str | None = ..., adapter_name: str | None = ..., adapter_map: dict[str, str | None] | None = ..., quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None = None, serialisation: t.Literal["safetensors", "legacy"] = ..., **attrs: t.Any) -> LLMRunner[t.Any, t.Any]: ...
# fmt: on
def Runner(model_name: str, ensure_available: bool | None = None, init_local: bool = False, implementation: LiteralRuntime | None = None, llm_config: LLMConfig | None = None, **attrs: t.Any) -> LLMRunner[t.Any, t.Any]:
@@ -1017,7 +1015,8 @@ def Runner(model_name: str, ensure_available: bool | None = None, init_local: bo
behaviour
"""
if llm_config is not None:
- attrs.update({"model_id": llm_config["env"]["model_id_value"], "bettertransformer": llm_config["env"]["bettertransformer_value"], "quantize": llm_config["env"]["quantize_value"], "runtime": llm_config["env"]["runtime_value"], "serialisation": first_not_none(os.getenv("OPENLLM_SERIALIZATION"), attrs.get("serialisation"), default="safetensors"),})
+ attrs.update({"model_id": llm_config["env"]["model_id_value"], "bettertransformer": llm_config["env"]["bettertransformer_value"], "quantize": llm_config["env"]["quantize_value"], "runtime": llm_config["env"]["runtime_value"],
+ "serialisation": first_not_none(os.getenv("OPENLLM_SERIALIZATION"), attrs.get("serialisation"), default="safetensors")})
default_implementation = llm_config.default_implementation() if llm_config is not None else "pt"
implementation = first_not_none(implementation, default=EnvVarMixin(model_name, default_implementation)["framework_value"])
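A minimal usage sketch for the `Runner` factory, using only arguments shown in the overloads above:

```python
import openllm

# `init_local=True` loads the model in the current process, handy for local debugging.
runner = openllm.Runner("dolly-v2", init_local=True)
```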
diff --git a/src/openllm/cli/_factory.py b/src/openllm/cli/_factory.py
index 456c8159..4a4ec176 100644
--- a/src/openllm/cli/_factory.py
+++ b/src/openllm/cli/_factory.py
@@ -373,12 +373,12 @@ def quantize_option(f: _AnyCallable | None = None, *, build: bool = False, model
- ``gptq``: ``GPTQ`` [quantization](https://arxiv.org/abs/2210.17323)
- **Note** that the model can also be served with quantized weights.
+  > [!NOTE] The model can also be served with quantized weights.
""" + (
"""
- **Note** that this will set the mode for serving within deployment.""" if build else ""
+  > [!NOTE] This will set the mode for serving within deployment.""" if build else ""
) + """
- **Note** that quantization are currently only available in *PyTorch* models.""", **attrs)(f)
+  > [!NOTE] Quantization is currently only available for *PyTorch* models.""", **attrs)(f)
def workers_per_resource_option(f: _AnyCallable | None = None, *, build: bool = False, **attrs: t.Any) -> t.Callable[[FC], FC]:
return cli_option(
@@ -387,16 +387,16 @@ def workers_per_resource_option(f: _AnyCallable | None = None, *, build: bool =
See https://docs.bentoml.org/en/latest/guides/scheduling.html#resource-scheduling-strategy
for more information. By default, this is set to 1.
- **Note**: ``--workers-per-resource`` will also accept the following strategies:
+ > [!NOTE] ``--workers-per-resource`` will also accept the following strategies:
- ``round_robin``: Similar behaviour when setting ``--workers-per-resource 1``. This is useful for smaller models.
- ``conserved``: This will determine the number of available GPU resources, and only assign one worker for the LLMRunner. For example, if ther are 4 GPUs available, then ``conserved`` is equivalent to ``--workers-per-resource 0.25``.
""" + (
"""\n
- **Note**: The workers value passed into 'build' will determine how the LLM can
- be provisioned in Kubernetes as well as in standalone container. This will
- ensure it has the same effect with 'openllm start --workers ...'""" if build else ""
+  > [!NOTE] The workers value passed into 'build' will determine how the LLM can
+  > be provisioned in Kubernetes as well as in a standalone container. This will
+  > ensure it has the same effect as 'openllm start --workers ...'""" if build else ""
), **attrs)(f)
def bettertransformer_option(f: _AnyCallable | None = None, *, build: bool = False, model_env: EnvVarMixin | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
@@ -416,13 +416,13 @@ def serialisation_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Cal
``safe_serialization=True``.
\b
- **Note** that this format might not work for every cases, and
+  > [!NOTE] This format might not work for every case, and
you can always fallback to ``legacy`` if needed.
- ``legacy``: This will use PyTorch serialisation format, often as ``.bin`` files.
This should be used if the model doesn't yet support safetensors.
- **Note** that GGML format is working in progress.
+  > [!NOTE] GGML format support is a work in progress.
""", **attrs)(f)
def container_registry_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
@@ -432,7 +432,7 @@ def container_registry_option(f: _AnyCallable | None = None, **attrs: t.Any) ->
Currently, it supports 'ecr', 'ghcr.io', 'docker.io'
\b
- **Note** that in order to build the base image, you will need a GPUs to compile custom kernel. See ``openllm ext build-base-container`` for more information.
+  > [!NOTE] In order to build the base image, you will need a GPU to compile custom kernels. See ``openllm ext build-base-container`` for more information.
""")(f)
_wpr_strategies = {"round_robin", "conserved"}
diff --git a/src/openllm/cli/entrypoint.py b/src/openllm/cli/entrypoint.py
index 45a4fb58..63c7730a 100644
--- a/src/openllm/cli/entrypoint.py
+++ b/src/openllm/cli/entrypoint.py
@@ -332,17 +332,17 @@ class OpenLLMCommandGroup(BentoMLCommandGroup):
def group(self, name: _AnyCallable) -> click.Group:
...
+ # variant: name omitted, cls _must_ be a keyword argument, @group(cmd=GroupCls, ...)
+ @overload
+ def group(self, name: None = None, *, cls: t.Type[GrpType], **attrs: t.Any) -> t.Callable[[_AnyCallable], GrpType]:
+ ...
+
# variant: with positional name and with positional or keyword cls argument:
# @group(namearg, GroupCls, ...) or @group(namearg, cls=GroupCls, ...)
@overload
def group(self, name: str | None, cls: type[GrpType], **attrs: t.Any) -> t.Callable[[_AnyCallable], GrpType]:
...
- # variant: name omitted, cls _must_ be a keyword argument, @group(cmd=GroupCls, ...)
- @overload
- def group(self, name: None = None, *, cls: t.Type[GrpType], **attrs: t.Any) -> t.Callable[[_AnyCallable], GrpType]:
- ...
-
# variant: with optional string name, no cls argument provided.
@overload
def group(self, name: str | None = ..., cls: None = None, **attrs: t.Any) -> t.Callable[[_AnyCallable], click.Group]:
@@ -451,7 +451,7 @@ def import_command(model_name: str, model_id: str | None, converter: str | None,
$ CONVERTER=llama2-hf openllm import llama /path/to/llama-2
```
- > **Note**: This behaviour will override ``--runtime``. Therefore make sure that the LLM contains correct conversion strategies to both GGML and HF.
+  > [!WARNING] This behaviour will override ``--runtime``. Therefore, make sure that the LLM contains the correct conversion strategies for both GGML and HF.
"""
llm_config = AutoConfig.for_model(model_name)
env = EnvVarMixin(model_name, llm_config.default_implementation(), model_id=model_id, runtime=runtime, quantize=quantize)
@@ -484,41 +484,39 @@ def _start(
For all additional arguments, pass it as string to ``additional_args``. For example, if you want to
pass ``--port 5001``, you can pass ``additional_args=["--port", "5001"]``
- > **Note**: This will create a blocking process, so if you use this API, you can create a running sub thread
+  > [!NOTE] This will create a blocking process, so if you use this API, you may want to spawn a separate thread
> to start the server instead of blocking the main thread.
``openllm.start`` will invoke ``click.Command`` under the hood, so it behaves exactly the same as the CLI interaction.
- > **Note**: ``quantize`` and ``bettertransformer`` are mutually exclusive.
+ > [!NOTE] ``quantize`` and ``bettertransformer`` are mutually exclusive.
Args:
- model_name: The model name to start this LLM
- model_id: Optional model id for this given LLM
- timeout: The server timeout
- workers_per_resource: Number of workers per resource assigned.
- See https://docs.bentoml.org/en/latest/guides/scheduling.html#resource-scheduling-strategy
- for more information. By default, this is set to 1.
+ model_name: The model name to start this LLM
+ model_id: Optional model id for this given LLM
+ timeout: The server timeout
+ workers_per_resource: Number of workers per resource assigned.
+ See [resource scheduling](https://docs.bentoml.org/en/latest/guides/scheduling.html#resource-scheduling-strategy)
+ for more information. By default, this is set to 1.
- > **Note**: ``--workers-per-resource`` will also accept the following strategies:
-
- > - ``round_robin``: Similar behaviour when setting ``--workers-per-resource 1``. This is useful for smaller models.
-
- > - ``conserved``: Thjis will determine the number of available GPU resources, and only assign
- one worker for the LLMRunner. For example, if ther are 4 GPUs available, then ``conserved`` is
- equivalent to ``--workers-per-resource 0.25``.
- device: Assign GPU devices (if available) to this LLM. By default, this is set to ``None``. It also accepts 'all'
- argument to assign all available GPUs to this LLM.
- quantize: Quantize the model weights. This is only applicable for PyTorch models.
- Possible quantisation strategies:
- - int8: Quantize the model with 8bit (bitsandbytes required)
- - int4: Quantize the model with 4bit (bitsandbytes required)
- - gptq: Quantize the model with GPTQ (auto-gptq required)
- bettertransformer: Convert given model to FastTransformer with PyTorch.
- runtime: The runtime to use for this LLM. By default, this is set to ``transformers``. In the future, this will include supports for GGML.
- fast: Enable fast mode. This will skip downloading models, and will raise errors if given model_id does not exists under local store.
- adapter_map: The adapter mapping of LoRA to use for this LLM. It accepts a dictionary of ``{adapter_id: adapter_name}``.
- framework: The framework to use for this LLM. By default, this is set to ``pt``.
- additional_args: Additional arguments to pass to ``openllm start``.
+ > [!NOTE] ``--workers-per-resource`` will also accept the following strategies:
+ > - ``round_robin``: Similar behaviour when setting ``--workers-per-resource 1``. This is useful for smaller models.
+ > - ``conserved``: This will determine the number of available GPU resources, and only assign
+  >   one worker for the LLMRunner. For example, if there are 4 GPUs available, then ``conserved`` is
+ > equivalent to ``--workers-per-resource 0.25``.
+ device: Assign GPU devices (if available) to this LLM. By default, this is set to ``None``. It also accepts 'all'
+ argument to assign all available GPUs to this LLM.
+ quantize: Quantize the model weights. This is only applicable for PyTorch models.
+ Possible quantisation strategies:
+ - int8: Quantize the model with 8bit (bitsandbytes required)
+ - int4: Quantize the model with 4bit (bitsandbytes required)
+ - gptq: Quantize the model with GPTQ (auto-gptq required)
+  bettertransformer: Convert the given model to BetterTransformer with PyTorch.
+  runtime: The runtime to use for this LLM. By default, this is set to ``transformers``. In the future, this will include support for GGML.
+  fast: Enable fast mode. This will skip downloading models and will raise an error if the given model_id does not exist in the local store.
+ adapter_map: The adapter mapping of LoRA to use for this LLM. It accepts a dictionary of ``{adapter_id: adapter_name}``.
+ framework: The framework to use for this LLM. By default, this is set to ``pt``.
+ additional_args: Additional arguments to pass to ``openllm start``.
"""
fast = os.getenv("OPENLLM_FAST", str(fast)).upper() in ENV_VARS_TRUE_VALUES
llm_config = AutoConfig.for_model(model_name)
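Putting the documented arguments together, a hedged sketch of a programmatic start call; the port value is illustrative:

```python
import openllm

# Blocking call: run it in a separate thread if the main thread must stay responsive.
openllm.start(
    "flan-t5",
    model_id="google/flan-t5-large",
    workers_per_resource="conserved",    # or a numeric value such as 0.25
    additional_args=["--port", "5001"],  # forwarded to the `openllm start` CLI
)
```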
@@ -554,48 +552,45 @@ def _build(
``openllm.build`` will invoke ``click.Command`` under the hood, so it behaves exactly the same as ``openllm build`` CLI.
- > **Note**: ``quantize`` and ``bettertransformer`` are mutually exclusive.
+ > [!NOTE] ``quantize`` and ``bettertransformer`` are mutually exclusive.
Args:
- model_name: The model name to start this LLM
- model_id: Optional model id for this given LLM
- model_version: Optional model version for this given LLM
- quantize: Quantize the model weights. This is only applicable for PyTorch models.
- Possible quantisation strategies:
- - int8: Quantize the model with 8bit (bitsandbytes required)
- - int4: Quantize the model with 4bit (bitsandbytes required)
- - gptq: Quantize the model with GPTQ (auto-gptq required)
- bettertransformer: Convert given model to FastTransformer with PyTorch.
- adapter_map: The adapter mapping of LoRA to use for this LLM. It accepts a dictionary of ``{adapter_id: adapter_name}``.
- build_ctx: The build context to use for building BentoLLM. By default, it sets to current directory.
- enable_features: Additional OpenLLM features to be included with this BentoLLM.
- workers_per_resource: Number of workers per resource assigned.
- See https://docs.bentoml.org/en/latest/guides/scheduling.html#resource-scheduling-strategy
- for more information. By default, this is set to 1.
+ model_name: The model name to start this LLM
+ model_id: Optional model id for this given LLM
+ model_version: Optional model version for this given LLM
+ quantize: Quantize the model weights. This is only applicable for PyTorch models.
+ Possible quantisation strategies:
+ - int8: Quantize the model with 8bit (bitsandbytes required)
+ - int4: Quantize the model with 4bit (bitsandbytes required)
+ - gptq: Quantize the model with GPTQ (auto-gptq required)
+  bettertransformer: Convert the given model to BetterTransformer with PyTorch.
+ adapter_map: The adapter mapping of LoRA to use for this LLM. It accepts a dictionary of ``{adapter_id: adapter_name}``.
+ build_ctx: The build context to use for building BentoLLM. By default, it sets to current directory.
+ enable_features: Additional OpenLLM features to be included with this BentoLLM.
+ workers_per_resource: Number of workers per resource assigned.
+ See [resource scheduling](https://docs.bentoml.org/en/latest/guides/scheduling.html#resource-scheduling-strategy)
+ for more information. By default, this is set to 1.
- > **Note**: ``--workers-per-resource`` will also accept the following strategies:
-
- > - ``round_robin``: Similar behaviour when setting ``--workers-per-resource 1``. This is useful for smaller models.
-
- > - ``conserved``: This will determine the number of available GPU resources, and only assign
- one worker for the LLMRunner. For example, if ther are 4 GPUs available, then ``conserved`` is
- equivalent to ``--workers-per-resource 0.25``.
- runtime: The runtime to use for this LLM. By default, this is set to ``transformers``. In the future, this will include supports for GGML.
- dockerfile_template: The dockerfile template to use for building BentoLLM. See
- https://docs.bentoml.com/en/latest/guides/containerization.html#dockerfile-template.
- overwrite: Whether to overwrite the existing BentoLLM. By default, this is set to ``False``.
- push: Whether to push the result bento to BentoCloud. Make sure to login with 'bentoml cloud login' first.
- containerize: Whether to containerize the Bento after building. '--containerize' is the shortcut of 'openllm build && bentoml containerize'.
- Note that 'containerize' and 'push' are mutually exclusive
- container_registry: Container registry to choose the base OpenLLM container image to build from. Default to ECR.
- container_version_strategy: The container version strategy. Default to the latest release of OpenLLM.
- serialisation_format: Serialisation for saving models. Default to 'safetensors', which is equivalent to `safe_serialization=True`
- additional_args: Additional arguments to pass to ``openllm build``.
- bento_store: Optional BentoStore for saving this BentoLLM. Default to the default BentoML local store.
+ > [!NOTE] ``--workers-per-resource`` will also accept the following strategies:
+ > - ``round_robin``: Similar behaviour when setting ``--workers-per-resource 1``. This is useful for smaller models.
+ > - ``conserved``: This will determine the number of available GPU resources, and only assign
+  >   one worker for the LLMRunner. For example, if there are 4 GPUs available, then ``conserved`` is
+ > equivalent to ``--workers-per-resource 0.25``.
+  runtime: The runtime to use for this LLM. By default, this is set to ``transformers``. In the future, this will include support for GGML.
+ dockerfile_template: The dockerfile template to use for building BentoLLM. See https://docs.bentoml.com/en/latest/guides/containerization.html#dockerfile-template.
+ overwrite: Whether to overwrite the existing BentoLLM. By default, this is set to ``False``.
+ push: Whether to push the result bento to BentoCloud. Make sure to login with 'bentoml cloud login' first.
+ containerize: Whether to containerize the Bento after building. '--containerize' is the shortcut of 'openllm build && bentoml containerize'.
+ Note that 'containerize' and 'push' are mutually exclusive
+ container_registry: Container registry to choose the base OpenLLM container image to build from. Default to ECR.
+ container_version_strategy: The container version strategy. Default to the latest release of OpenLLM.
+ serialisation_format: Serialisation for saving models. Default to 'safetensors', which is equivalent to `safe_serialization=True`
+ additional_args: Additional arguments to pass to ``openllm build``.
+ bento_store: Optional BentoStore for saving this BentoLLM. Default to the default BentoML local store.
Returns:
- ``bentoml.Bento | str``: BentoLLM instance. This can be used to serve the LLM or can be pushed to BentoCloud.
- If 'format="container"', then it returns the default 'container_name:container_tag'
+ ``bentoml.Bento | str``: BentoLLM instance. This can be used to serve the LLM or can be pushed to BentoCloud.
"""
args: ListStr = [sys.executable, "-m", "openllm", "build", model_name, "--machine", "--runtime", runtime, "--serialisation", serialisation_format,]
if quantize and bettertransformer: raise OpenLLMException("'quantize' and 'bettertransformer' are currently mutually exclusive.")
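A corresponding sketch for `openllm.build`, restricted to arguments described in the docstring above; the model id is illustrative:

```python
import openllm

bento = openllm.build(
    "opt",
    model_id="facebook/opt-1.3b",  # illustrative model id
    quantize="int8",               # int8 / int4 / gptq, per the docstring
    overwrite=True,
)
print(bento)  # a bentoml.Bento that can be served locally or pushed to BentoCloud
```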
@@ -633,31 +628,33 @@ def _import_model(
) -> bentoml.Model:
"""Import a LLM into local store.
- > **Note**: If ``quantize`` is passed, the model weights will be saved as quantized weights. You should
+ > [!NOTE]
+ > If ``quantize`` is passed, the model weights will be saved as quantized weights. You should
> only use this option if you want the weight to be quantized by default. Note that OpenLLM also
> support on-demand quantisation during initial startup.
``openllm.download`` will invoke ``click.Command`` under the hood, so it behaves exactly the same as the CLI ``openllm import``.
- > **Note**: ``openllm.start`` will automatically invoke ``openllm.download`` under the hood.
+ > [!NOTE]
+ > ``openllm.start`` will automatically invoke ``openllm.download`` under the hood.
Args:
- model_name: The model name to start this LLM
- model_id: Optional model id for this given LLM
- model_version: Optional model version for this given LLM
- runtime: The runtime to use for this LLM. By default, this is set to ``transformers``. In the future, this will include supports for GGML.
- implementation: The implementation to use for this LLM. By default, this is set to ``pt``.
- quantize: Quantize the model weights. This is only applicable for PyTorch models.
- Possible quantisation strategies:
- - int8: Quantize the model with 8bit (bitsandbytes required)
- - int4: Quantize the model with 4bit (bitsandbytes required)
- - gptq: Quantize the model with GPTQ (auto-gptq required)
- serialisation_format: Type of model format to save to local store. If set to 'safetensors', then OpenLLM will save model using safetensors.
- Default behaviour is similar to ``safe_serialization=False``.
- additional_args: Additional arguments to pass to ``openllm import``.
+ model_name: The model name to start this LLM
+ model_id: Optional model id for this given LLM
+ model_version: Optional model version for this given LLM
+  runtime: The runtime to use for this LLM. By default, this is set to ``transformers``. In the future, this will include support for GGML.
+ implementation: The implementation to use for this LLM. By default, this is set to ``pt``.
+ quantize: Quantize the model weights. This is only applicable for PyTorch models.
+ Possible quantisation strategies:
+ - int8: Quantize the model with 8bit (bitsandbytes required)
+ - int4: Quantize the model with 4bit (bitsandbytes required)
+ - gptq: Quantize the model with GPTQ (auto-gptq required)
+ serialisation_format: Type of model format to save to local store. If set to 'safetensors', then OpenLLM will save model using safetensors.
+ Default behaviour is similar to ``safe_serialization=False``.
+ additional_args: Additional arguments to pass to ``openllm import``.
Returns:
- ``bentoml.Model``:BentoModel of the given LLM. This can be used to serve the LLM or can be pushed to BentoCloud.
+  ``bentoml.Model``: BentoModel of the given LLM. This can be used to serve the LLM or can be pushed to BentoCloud.
"""
args = [model_name, "--runtime", runtime, "--implementation", implementation, "--machine", "--serialisation", serialisation_format,]
if model_id is not None: args.append(model_id)
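Finally, a sketch of the import path described above; per the docstring, `openllm.download` behaves like the `openllm import` CLI, and the model id and keyword names below follow that docstring (treat them as assumptions):

```python
import openllm

# Saves the weights into the local BentoML model store ahead of `openllm.start`.
model = openllm.download("opt", model_id="facebook/opt-1.3b", serialisation_format="safetensors")
print(model.tag)
```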