feat(embedding): Adding generic endpoint (#227)

This commit is contained in:
Aaron Pham
2023-08-17 15:17:00 -04:00
committed by GitHub
parent d5c4066ff4
commit 4140d160b8
23 changed files with 621 additions and 393 deletions

View File

@@ -6,6 +6,8 @@
<h1 align="center">🦾 OpenLLM</h1>
<a href="https://pypi.org/project/openllm">
<img src="https://img.shields.io/pypi/v/openllm.svg?logo=pypi&label=PyPI&logoColor=gold" alt="pypi_status" />
</a><a href="https://test.pypi.org/project/openllm/">
<img src="https://img.shields.io/badge/Nightly-PyPI?logo=pypi&label=PyPI&color=gray&link=https%3A%2F%2Ftest.pypi.org%2Fproject%2Fopenllm%2F" alt="test_pypi_status" />
</a><a href="https://twitter.com/bentomlai">
<img src="https://badgen.net/badge/icon/@bentomlai/1DA1F2?icon=twitter&label=Follow%20Us" alt="Twitter" />
</a><a href="https://l.bentoml.com/join-openllm-discord">
@@ -493,12 +495,12 @@ openllm build opt --model-id facebook/opt-6.7b --adapter-id ...
OpenLLM encourages contributions by welcoming users to incorporate their custom
LLMs into the ecosystem. Check out
[Adding a New Model Guide](https://github.com/bentoml/OpenLLM/blob/main/ADDING_NEW_MODEL.md)
[Adding a New Model Guide](https://github.com/bentoml/OpenLLM/blob/main/openllm-python/ADDING_NEW_MODEL.md)
to see how you can do it yourself.
### Embeddings
OpenLLM tentatively provides embeddings endpoint for supported models. This can
OpenLLM provides an embeddings endpoint for computing embeddings. This can
be accessed via `/v1/embeddings`.
To use via CLI, simply call `openllm embed`:
@@ -532,8 +534,19 @@ client.embed("I like to eat apples")
```
> [!NOTE]
> Currently, the following model family supports embeddings: Llama, T5
> (Flan-T5, FastChat, etc.), ChatGLM
> Currently, the following model families support embeddings calculation: Llama, T5 (Flan-T5, FastChat, etc.), ChatGLM.
> For the remaining LLMs that don't have a specific embeddings implementation,
> we will use a generic [BertModel](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2)
> for embeddings generation. The implementation is largely based on [`bentoml/sentence-embedding-bento`](https://github.com/bentoml/sentence-embedding-bento).
### Playground and Chat UI
The following UIs are currently available for OpenLLM:
| UI | Owner | Type | Progress |
|-----------------------------------------------------------------------------------|-----------------------------------------------|----------------------|----------|
| [Clojure](https://github.com/bentoml/OpenLLM/blob/main/contrib/clojure/README.md) | [@GutZuFusss](https://github.com/GutZuFusss) | Community-maintained | 🔧 |
| TS | BentoML Team | | 🚧 |
## ⚙️ Integrations

View File

@@ -2,80 +2,80 @@
[build-system]
build-backend = "hatchling.build"
requires = [
"hatchling==1.18.0",
"hatch-vcs==0.3.0",
"hatch-fancy-pypi-readme==23.1.0",
"hatch-mypyc==0.16.0",
"hatchling==1.18.0",
"hatch-vcs==0.3.0",
"hatch-fancy-pypi-readme==23.1.0",
"hatch-mypyc==0.16.0",
]
[project]
authors = [
{name = "Aaron Pham",email = "aarnphm@bentoml.com"},
{name = "BentoML Team",email = "contact@bentoml.com"},
{ name = "Aaron Pham", email = "aarnphm@bentoml.com" },
{ name = "BentoML Team", email = "contact@bentoml.com" },
]
classifiers = [
"Development Status :: 5 - Production/Stable",
"Environment :: GPU :: NVIDIA CUDA",
"Environment :: GPU :: NVIDIA CUDA :: 12",
"Environment :: GPU :: NVIDIA CUDA :: 11.8",
"Environment :: GPU :: NVIDIA CUDA :: 11.7",
"License :: OSI Approved :: Apache Software License",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
"Topic :: Software Development :: Libraries",
"Operating System :: OS Independent",
"Intended Audience :: Developers",
"Intended Audience :: Science/Research",
"Intended Audience :: System Administrators",
"Typing :: Typed",
"Programming Language :: Python",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3 :: Only",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: Implementation :: CPython",
"Programming Language :: Python :: Implementation :: PyPy",
"Development Status :: 5 - Production/Stable",
"Environment :: GPU :: NVIDIA CUDA",
"Environment :: GPU :: NVIDIA CUDA :: 12",
"Environment :: GPU :: NVIDIA CUDA :: 11.8",
"Environment :: GPU :: NVIDIA CUDA :: 11.7",
"License :: OSI Approved :: Apache Software License",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
"Topic :: Software Development :: Libraries",
"Operating System :: OS Independent",
"Intended Audience :: Developers",
"Intended Audience :: Science/Research",
"Intended Audience :: System Administrators",
"Typing :: Typed",
"Programming Language :: Python",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3 :: Only",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: Implementation :: CPython",
"Programming Language :: Python :: Implementation :: PyPy",
]
dependencies = [
"bentoml[grpc,io]>=1.0.25",
"transformers[torch,tokenizers,accelerate]>=4.29.0",
"safetensors",
"optimum",
"attrs>=23.1.0",
"cattrs>=23.1.0",
"orjson",
"inflection",
"tabulate[widechars]>=0.9.0",
"httpx",
"click>=8.1.3",
"typing_extensions",
"mypy_extensions",
"ghapi",
"cuda-python;platform_system!=\"Darwin\"",
"bitsandbytes<0.42",
"bentoml[grpc,io]>=1.0.25",
"transformers[torch,tokenizers,accelerate]>=4.29.0",
"safetensors",
"optimum",
"attrs>=23.1.0",
"cattrs>=23.1.0",
"orjson",
"inflection",
"tabulate[widechars]>=0.9.0",
"httpx",
"click>=8.1.3",
"typing_extensions",
"mypy_extensions",
"ghapi",
"cuda-python;platform_system!=\"Darwin\"",
"bitsandbytes<0.42",
]
description = "OpenLLM: Operating LLMs in production"
dynamic = ["version", "readme"]
keywords = [
"MLOps",
"AI",
"BentoML",
"Model Serving",
"Model Deployment",
"LLMOps",
"Falcon",
"Vicuna",
"Llama 2",
"Fine tuning",
"Serverless",
"Large Language Model",
"Generative AI",
"StableLM",
"Alpaca",
"PyTorch",
"Transformers",
"MLOps",
"AI",
"BentoML",
"Model Serving",
"Model Deployment",
"LLMOps",
"Falcon",
"Vicuna",
"Llama 2",
"Fine tuning",
"Serverless",
"Large Language Model",
"Generative AI",
"StableLM",
"Alpaca",
"PyTorch",
"Transformers",
]
license = "Apache-2.0"
name = "openllm"
@@ -103,21 +103,21 @@ Twitter = "https://twitter.com/bentomlai"
[project.optional-dependencies]
agents = ["transformers[agents]>=4.30", "diffusers", "soundfile"]
all = [
"openllm[agents]",
"openllm[baichuan]",
"openllm[chatglm]",
"openllm[falcon]",
"openllm[fine-tune]",
"openllm[flan-t5]",
"openllm[ggml]",
"openllm[gptq]",
"openllm[llama]",
"openllm[mpt]",
"openllm[openai]",
"openllm[opt]",
"openllm[playground]",
"openllm[starcoder]",
"openllm[vllm]",
"openllm[agents]",
"openllm[baichuan]",
"openllm[chatglm]",
"openllm[falcon]",
"openllm[fine-tune]",
"openllm[flan-t5]",
"openllm[ggml]",
"openllm[gptq]",
"openllm[llama]",
"openllm[mpt]",
"openllm[openai]",
"openllm[opt]",
"openllm[playground]",
"openllm[starcoder]",
"openllm[vllm]",
]
baichuan = ["cpm-kernels", "sentencepiece"]
chatglm = ["cpm-kernels", "sentencepiece"]
@@ -141,12 +141,12 @@ source = "vcs"
version-file = "src/openllm/_version.py"
[tool.hatch.version.raw-options]
git_describe_command = [
"git",
"describe",
"--dirty",
"--tags",
"--long",
"--first-parent",
"git",
"describe",
"--dirty",
"--tags",
"--long",
"--first-parent",
]
local_scheme = "no-local-version"
root = ".."
@@ -157,55 +157,54 @@ only-include = ["src/openllm"]
sources = ["src"]
[tool.hatch.build.targets.sdist]
exclude = [
"/.git_archival.txt",
"tests",
"/.python-version-default",
"ADDING_NEW_MODEL.md",
"/.git_archival.txt",
"tests",
"/.python-version-default",
"ADDING_NEW_MODEL.md",
]
[tool.hatch.build.targets.wheel.hooks.mypyc]
dependencies = [
"hatch-mypyc==0.16.0",
"mypy==1.4.1",
# avoid https://github.com/pallets/click/issues/2558
"click==8.1.3",
"bentoml==1.1.1",
"transformers>=4.31.0",
"pandas-stubs",
"types-psutil",
"types-tabulate",
"types-PyYAML",
"types-protobuf",
"hatch-mypyc==0.16.0",
"mypy==1.4.1",
# avoid https://github.com/pallets/click/issues/2558
"click==8.1.3",
"bentoml==1.1.1",
"transformers>=4.31.0",
"pandas-stubs",
"types-psutil",
"types-tabulate",
"types-PyYAML",
"types-protobuf",
]
enable-by-default = false
exclude = ["src/openllm/_service.py", "src/openllm/_typing_compat.py"]
include = [
"src/openllm/bundle",
"src/openllm/models/__init__.py",
"src/openllm/models/auto/__init__.py",
"src/openllm/utils/__init__.py",
"src/openllm/utils/codegen.py",
"src/openllm/__init__.py",
"src/openllm/_prompt.py",
"src/openllm/_schema.py",
"src/openllm/_quantisation.py",
"src/openllm/_generation.py",
"src/openllm/_strategies.py",
"src/openllm/exceptions.py",
"src/openllm/testing.py",
"src/openllm/models/__init__.py",
"src/openllm/models/auto/__init__.py",
"src/openllm/utils/__init__.py",
"src/openllm/__init__.py",
"src/openllm/_prompt.py",
"src/openllm/_schema.py",
"src/openllm/_quantisation.py",
"src/openllm/_generation.py",
"src/openllm/_strategies.py",
"src/openllm/exceptions.py",
"src/openllm/testing.py",
]
# NOTE: This is consistent with pyproject.toml
mypy-args = [
"--strict",
# this is because all transient library doesn't have types
"--allow-subclassing-any",
"--follow-imports=skip",
"--check-untyped-defs",
"--ignore-missing-imports",
"--no-warn-return-any",
"--warn-unreachable",
"--no-warn-no-return",
"--no-warn-unused-ignores",
"--exclude='/src\\/openllm\\/playground\\/**'",
"--exclude='/src\\/openllm\\/_typing_compat\\.py$'",
"--strict",
# this is because the transitive libraries don't ship type information
"--follow-imports=skip",
"--allow-subclassing-any",
"--check-untyped-defs",
"--ignore-missing-imports",
"--no-warn-return-any",
"--warn-unreachable",
"--no-warn-no-return",
"--no-warn-unused-ignores",
"--exclude='/src\\/openllm\\/playground\\/**'",
"--exclude='/src\\/openllm\\/_typing_compat\\.py$'",
]
options = { verbose = true, strip_asserts = true, debug_level = "2", opt_level = "3", include_runtime_files = true }
require-runtime-dependencies = true

View File

@@ -28,7 +28,9 @@ else:
_warnings.filterwarnings("ignore", message="Neither GITHUB_TOKEN nor GITHUB_JWT_TOKEN found: running as unauthenticated")
_import_structure: dict[str, list[str]] = {
"exceptions": [], "models": [], "client": [], "bundle": [], "playground": [], "testing": [], "utils": ["infer_auto_class"], "serialisation": ["ggml", "transformers"], "cli._sdk": ["start", "start_grpc", "build", "import_model", "list_models"], "_llm": ["LLM", "Runner", "LLMRunner", "LLMRunnable", "LLMEmbeddings"], "_configuration": ["LLMConfig", "GenerationConfig", "SamplingParams"], "_generation": ["StopSequenceCriteria", "StopOnTokens", "LogitsProcessorList", "StoppingCriteriaList", "prepare_logits_processor"], "_quantisation": ["infer_quantisation_config"], "_schema": ["GenerationInput", "GenerationOutput", "MetadataOutput", "EmbeddingsOutput", "unmarshal_vllm_outputs", "HfAgentInput"],
"exceptions": [], "models": [], "client": [], "bundle": [], "playground": [], "testing": [], "utils": ["infer_auto_class"], "serialisation": ["ggml", "transformers"], "cli._sdk": ["start", "start_grpc", "build", "import_model", "list_models"],
"_llm": ["LLM", "Runner", "LLMRunner", "LLMRunnable", "LLMEmbeddings"], "_configuration": ["LLMConfig", "GenerationConfig", "SamplingParams"], "_generation": ["StopSequenceCriteria", "StopOnTokens", "LogitsProcessorList", "StoppingCriteriaList", "prepare_logits_processor"],
"_quantisation": ["infer_quantisation_config"], "_schema": ["GenerationInput", "GenerationOutput", "MetadataOutput", "EmbeddingsOutput", "unmarshal_vllm_outputs", "HfAgentInput"], "_embeddings": ["GenericEmbeddingRunnable"], "_strategies": ["CascadingResourceStrategy", "get_resource"],
"models.auto": ["AutoConfig", "CONFIG_MAPPING", "MODEL_MAPPING_NAMES", "MODEL_FLAX_MAPPING_NAMES", "MODEL_TF_MAPPING_NAMES", "MODEL_VLLM_MAPPING_NAMES"], "models.chatglm": ["ChatGLMConfig"], "models.baichuan": ["BaichuanConfig"], "models.dolly_v2": ["DollyV2Config"], "models.falcon": ["FalconConfig"], "models.flan_t5": ["FlanT5Config"], "models.gpt_neox": ["GPTNeoXConfig"], "models.llama": ["LlamaConfig"], "models.mpt": ["MPTConfig"], "models.opt": ["OPTConfig"], "models.stablelm": ["StableLMConfig"], "models.starcoder": ["StarCoderConfig"]
}
COMPILED = _Path(__file__).suffix in (".pyd", ".so")
@@ -40,6 +42,8 @@ if _t.TYPE_CHECKING:
from ._llm import LLM as LLM, LLMEmbeddings as LLMEmbeddings, LLMRunnable as LLMRunnable, LLMRunner as LLMRunner, Runner as Runner
from ._quantisation import infer_quantisation_config as infer_quantisation_config
from ._schema import EmbeddingsOutput as EmbeddingsOutput, GenerationInput as GenerationInput, GenerationOutput as GenerationOutput, HfAgentInput as HfAgentInput, MetadataOutput as MetadataOutput, unmarshal_vllm_outputs as unmarshal_vllm_outputs
from ._embeddings import GenericEmbeddingRunnable as GenericEmbeddingRunnable
from ._strategies import CascadingResourceStrategy as CascadingResourceStrategy, get_resource as get_resource
from .cli._sdk import build as build, import_model as import_model, list_models as list_models, start as start, start_grpc as start_grpc
from .models.auto import CONFIG_MAPPING as CONFIG_MAPPING, MODEL_FLAX_MAPPING_NAMES as MODEL_FLAX_MAPPING_NAMES, MODEL_MAPPING_NAMES as MODEL_MAPPING_NAMES, MODEL_TF_MAPPING_NAMES as MODEL_TF_MAPPING_NAMES, MODEL_VLLM_MAPPING_NAMES as MODEL_VLLM_MAPPING_NAMES, AutoConfig as AutoConfig
from .models.baichuan import BaichuanConfig as BaichuanConfig
@@ -54,7 +58,7 @@ if _t.TYPE_CHECKING:
from .models.stablelm import StableLMConfig as StableLMConfig
from .models.starcoder import StarCoderConfig as StarCoderConfig
from .serialisation import ggml as ggml, transformers as transformers
from openllm.utils import infer_auto_class as infer_auto_class
from .utils import infer_auto_class as infer_auto_class
try:
if not (utils.is_torch_available() and utils.is_cpm_kernels_available()): raise exceptions.MissingDependencyError

View File

@@ -0,0 +1,48 @@
# See https://github.com/bentoml/sentence-embedding-bento for more information.
from __future__ import annotations
import bentoml, openllm, transformers, typing as t
from huggingface_hub import snapshot_download
from bentoml._internal.frameworks.transformers import MODULE_NAME, API_VERSION
from bentoml._internal.models.model import ModelOptions, ModelSignature
if t.TYPE_CHECKING: import torch
_GENERIC_EMBEDDING_ID="sentence-transformers/all-MiniLM-L6-v2"
_BENTOMODEL_ID="sentence-transformers--all-MiniLM-L6-v2"
def get_or_download(ids: str = _BENTOMODEL_ID) -> bentoml.Model:
  """Return the generic embedding model from the BentoML store, fetching it when absent.

  The store is queried first via ``bentoml.transformers.get``. When that raises
  ``bentoml.exceptions.NotFound``, a new model entry named ``ids`` is created and
  the Hugging Face snapshot of ``_GENERIC_EMBEDDING_ID`` is downloaded into it.

  Args:
    ids: Name used both for the store lookup and for the newly created entry.

  Returns:
    The ``bentoml.Model`` that was found, or the one that was just created.
  """
  try:
    return bentoml.transformers.get(ids)
  except bentoml.exceptions.NotFound:
    # Every listed method gets its own non-batchable signature object.
    method_names = (
      "forward", "generate", "contrastive_search", "greedy_search", "sample",
      "beam_search", "beam_sample", "group_beam_search", "constrained_beam_search", "__call__",
    )
    signatures = {method: ModelSignature(batchable=False) for method in method_names}
    # Weight formats and repository extras that are excluded from the download.
    skipped_patterns = ["*.safetensors", "*.h5", "*.ot", "*.pdf", "*.md", ".gitattributes", "LICENSE.txt"]
    with bentoml.models.create(
      ids,
      module=MODULE_NAME,
      api_version=API_VERSION,
      options=ModelOptions(),
      context=openllm.utils.generate_context(framework_name="transformers"),
      labels={"runtime": "pt", "framework": "openllm"},
      signatures=signatures,
    ) as created:
      # NOTE: the snapshot is always taken from _GENERIC_EMBEDDING_ID, whatever ``ids`` is.
      snapshot_download(
        _GENERIC_EMBEDDING_ID,
        local_dir=created.path,
        local_dir_use_symlinks=False,
        ignore_patterns=skipped_patterns,
      )
    return created
class GenericEmbeddingRunnable(bentoml.Runnable):
  """Runnable that embeds sentences with the model returned by ``get_or_download()``.

  The tokenizer and model are both loaded from the path of that stored model, and
  the model is moved to ``"cuda"`` when ``openllm.utils.device_count()`` reports at
  least one device, otherwise to ``"cpu"``.
  """
  SUPPORTED_RESOURCES = ("nvidia.com/gpu", "cpu")
  SUPPORTS_CPU_MULTI_THREADING = True

  def __init__(self) -> None:
    """Pick the device, resolve the stored model, then load tokenizer and model from it."""
    if openllm.utils.device_count() > 0:
      self.device = "cuda"
    else:
      self.device = "cpu"
    self._bentomodel = get_or_download()
    model_dir = self._bentomodel.path
    self.tokenizer = transformers.AutoTokenizer.from_pretrained(model_dir)
    self.model = transformers.AutoModel.from_pretrained(model_dir)
    self.model.to(self.device)

  @bentoml.Runnable.method(batchable=True, batch_dim=0)
  def encode(self, sentences: list[str]) -> t.Sequence[openllm.LLMEmbeddings]:
    """Embed ``sentences`` and return the result wrapped in a one-element list.

    Args:
      sentences: Texts to tokenize (with padding and truncation) and embed.

    Returns:
      A list holding a single ``openllm.LLMEmbeddings`` whose ``embeddings`` is the
      NumPy array of mean-pooled, L2-normalised vectors for all inputs, and whose
      ``num_tokens`` is the sum of the attention mask over the whole input.
    """
    import torch
    import torch.nn.functional as F
    tokenized = self.tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
    batch = tokenized.to(self.device)
    mask = batch["attention_mask"]
    # Inference only: run the forward pass without tracking gradients.
    with torch.no_grad():
      outputs = self.model(**batch)
    pooled = self.mean_pooling(outputs, mask)
    normalized = F.normalize(pooled, p=2, dim=1)
    token_count = int(torch.sum(mask).item())
    result = openllm.LLMEmbeddings(embeddings=normalized.cpu().numpy(), num_tokens=token_count)
    return [result]

  @staticmethod
  def mean_pooling(model_output: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    """Average the token embeddings along dim 1, weighting each token by the attention mask.

    Args:
      model_output: Model result; its first element is taken as the token embeddings.
      attention_mask: Mask that is expanded to the token-embedding size and cast to float.

    Returns:
      The masked sum over dim 1 divided by the mask sum over dim 1, the latter
      clamped to at least ``1e-9`` so an all-zero mask cannot divide by zero.
    """
    import torch
    embeddings = model_output[0]  # first element holds the per-token embeddings
    weights = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
    weighted_sum = torch.sum(embeddings * weights, 1)
    weight_total = torch.clamp(weights.sum(1), min=1e-9)
    return weighted_sum / weight_total
__all__ = ["GenericEmbeddingRunnable"]

View File

@@ -926,7 +926,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
scheduling_strategy = CascadingResourceStrategy
generate_sig = ModelSignature.from_dict(t.cast("_ModelSignatureDict", ModelSignatureDict(batchable=False)))
embeddings_sig = ModelSignature.from_dict(t.cast("_ModelSignatureDict", ModelSignatureDict(batchable=False)))
embeddings_sig = ModelSignature.from_dict(t.cast("_ModelSignatureDict", ModelSignatureDict(batchable=True, batch_dim=0)))
generate_iterator_sig = ModelSignature.from_dict(t.cast("_ModelSignatureDict", ModelSignatureDict(batchable=True)))
# NOTE: returning the two langchain API's to the runner
@@ -1036,8 +1036,8 @@ def llm_runnable_class(self: LLM[M, T], embeddings_sig: ModelSignature, generate
logger.info("Successfully apply LoRA layer %s", adapter_name)
@bentoml.Runnable.method(**method_signature(embeddings_sig))
def embeddings(__self: _Runnable, prompt: str | list[str]) -> LLMEmbeddings:
return self.embeddings([prompt] if isinstance(prompt, str) else prompt)
def embeddings(__self: _Runnable, prompt: str | list[str]) -> t.Sequence[LLMEmbeddings]:
return [self.embeddings([prompt] if isinstance(prompt, str) else prompt)]
@bentoml.Runnable.method(**method_signature(generate_sig))
def __call__(__self: _Runnable, prompt: str, **attrs: t.Any) -> list[t.Any]:

View File

@@ -1,9 +1,3 @@
# mypy: disable-error-code="arg-type,misc"
"""The service definition for running any LLMService.
For line with comment '# openllm: ...', it must not be modified as it is managed internally by OpenLLM.
Codegen can be found under 'openllm.utils.codegen'
"""
from __future__ import annotations
import os, warnings, orjson, bentoml, openllm, typing as t
from starlette.applications import Starlette
@@ -12,6 +6,7 @@ from starlette.routing import Route
if t.TYPE_CHECKING:
from starlette.requests import Request
from starlette.responses import Response
from bentoml._internal.runner.runner import RunnerMethod
# The following warnings come from bitsandbytes and are probably not important for users to see
warnings.filterwarnings("ignore", message="MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization")
warnings.filterwarnings("ignore", message="MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization")
@@ -20,7 +15,10 @@ model = os.environ.get("OPENLLM_MODEL", "{__model_name__}") # openllm: model na
adapter_map = os.environ.get("OPENLLM_ADAPTER_MAP", """{__model_adapter_map__}""") # openllm: model adapter map
llm_config = openllm.AutoConfig.for_model(model)
runner = openllm.Runner(model, llm_config=llm_config, ensure_available=False, adapter_map=orjson.loads(adapter_map))
svc = bentoml.Service(name=f"llm-{llm_config['start_name']}-service", runners=[runner])
generic_embedding_runner = bentoml.Runner(openllm.GenericEmbeddingRunnable, name="llm-generic-embedding", scheduling_strategy=openllm.CascadingResourceStrategy, max_batch_size=32, max_latency_ms=300)
runners: t.Sequence[bentoml.Runner] = [runner]
if not runner.supports_embeddings: runners.append(generic_embedding_runner)
svc = bentoml.Service(name=f"llm-{llm_config['start_name']}-service", runners=runners)
@svc.api(route="/v1/generate", input=bentoml.io.JSON.from_sample({"prompt": "", "llm_config": llm_config.model_dump(flatten=True)}), output=bentoml.io.JSON.from_sample({"responses": [], "configuration": llm_config.model_dump(flatten=True)}))
async def generate_v1(input_dict: dict[str, t.Any]) -> openllm.GenerationOutput:
@@ -33,11 +31,11 @@ async def generate_v1(input_dict: dict[str, t.Any]) -> openllm.GenerationOutput:
def metadata_v1(_: str) -> openllm.MetadataOutput:
return openllm.MetadataOutput(timeout=llm_config["timeout"], model_name=llm_config["model_name"], framework=llm_config["env"]["framework_value"], model_id=runner.llm.model_id, configuration=llm_config.model_dump_json().decode(), supports_embeddings=runner.supports_embeddings, supports_hf_agent=runner.supports_hf_agent)
if runner.supports_embeddings:
@svc.api(route="/v1/embeddings", input=bentoml.io.JSON.from_sample(["Hey Jude, welcome to the jungle!", "What is the meaning of life?"]), output=bentoml.io.JSON.from_sample({"embeddings": [0.007917795330286026, -0.014421648345887661, 0.00481307040899992, 0.007331526838243008, -0.0066398633643984795, 0.00945580005645752, 0.0087016262114048, -0.010709521360695362, 0.012635177001357079, 0.010541186667978764, -0.00730888033285737, -0.001783102168701589, 0.02339819073677063, -0.010825827717781067, -0.015888236463069916, 0.01876218430697918, 0.0076906150206923485, 0.0009032754460349679, -0.010024012066423893, 0.01090280432254076, -0.008668390102684498, 0.02070549875497818, 0.0014594447566196322, -0.018775740638375282, -0.014814382418990135, 0.01796768605709076], "num_tokens": 20}))
async def embeddings_v1(phrases: list[str]) -> openllm.EmbeddingsOutput:
responses = await runner.embeddings.async_run(phrases)
return openllm.EmbeddingsOutput(embeddings=responses["embeddings"], num_tokens=responses["num_tokens"])
@svc.api(route="/v1/embeddings", input=bentoml.io.JSON.from_sample(["Hey Jude, welcome to the jungle!", "What is the meaning of life?"]), output=bentoml.io.JSON.from_sample({"embeddings": [0.007917795330286026, -0.014421648345887661, 0.00481307040899992, 0.007331526838243008, -0.0066398633643984795, 0.00945580005645752, 0.0087016262114048, -0.010709521360695362, 0.012635177001357079, 0.010541186667978764, -0.00730888033285737, -0.001783102168701589, 0.02339819073677063, -0.010825827717781067, -0.015888236463069916, 0.01876218430697918, 0.0076906150206923485, 0.0009032754460349679, -0.010024012066423893, 0.01090280432254076, -0.008668390102684498, 0.02070549875497818, 0.0014594447566196322, -0.018775740638375282, -0.014814382418990135, 0.01796768605709076], "num_tokens": 20}))
async def embeddings_v1(phrases: list[str]) -> openllm.EmbeddingsOutput:
embed_call: RunnerMethod[bentoml.Runnable | openllm.LLMRunnable[t.Any, t.Any], [list[str]], t.Sequence[openllm.LLMEmbeddings]] = runner.embeddings if runner.supports_embeddings else generic_embedding_runner.encode
responses = (await embed_call.async_run(phrases))[0]
return openllm.EmbeddingsOutput(embeddings=responses["embeddings"], num_tokens=responses["num_tokens"])
if runner.supports_hf_agent and openllm.utils.is_transformers_supports_agent():
async def hf_agent(request: Request) -> Response:

View File

@@ -330,3 +330,5 @@ class CascadingResourceStrategy(bentoml.Strategy, ReprMixin):
if idx >= len(gpus): raise ValueError(f"Number of available GPU ({gpus}) preceeds the given workers_per_resource {workers_per_resource}")
dev = str(gpus[idx])
return dev
__all__=["CascadingResourceStrategy", "get_resource"]

View File

@@ -82,7 +82,7 @@ class LLMRunner(bentoml.Runner, t.Generic[M, T]):
supports_embeddings: bool
supports_hf_agent: bool
has_adapters: bool
embeddings: RunnerMethod[LLMRunnable[M, T], [list[str]], LLMEmbeddings]
embeddings: RunnerMethod[LLMRunnable[M, T], [list[str]], t.Sequence[LLMEmbeddings]]
generate: RunnerMethod[LLMRunnable[M, T], [str], list[t.Any]]
generate_one: RunnerMethod[LLMRunnable[M, T], [str, list[str]], t.Sequence[dict[t.Literal["generated_text"], str]]]
generate_iterator: RunnerMethod[LLMRunnable[M, T], [str], t.Generator[t.Any, None, None]]

View File

@@ -78,16 +78,15 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any], llm_fs: FS, extra_d
return PythonOptions(packages=packages, wheels=wheels, lock_packages=False, extra_index_url=["https://download.pytorch.org/whl/cu118"])
def construct_docker_options(llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_resource: float, quantize: LiteralString | None, bettertransformer: bool | None, adapter_map: dict[str, str | None] | None, dockerfile_template: str | None, runtime: t.Literal["ggml", "transformers"], serialisation_format: t.Literal["safetensors", "legacy"], container_registry: LiteralContainerRegistry, container_version_strategy: LiteralContainerVersionStrategy) -> DockerOptions:
_bentoml_config_options = os.environ.pop("BENTOML_CONFIG_OPTIONS", "")
_bentoml_config_options_opts = ["tracing.sample_rate=1.0", f'runners."llm-{llm.config["start_name"]}-runner".traffic.timeout={llm.config["timeout"]}', f'api_server.traffic.timeout={llm.config["timeout"]}', f'runners."llm-{llm.config["start_name"]}-runner".traffic.timeout={llm.config["timeout"]}', f'runners."llm-{llm.config["start_name"]}-runner".workers_per_resource={workers_per_resource}']
_bentoml_config_options += " " if _bentoml_config_options else "" + " ".join(_bentoml_config_options_opts)
from openllm.cli._factory import parse_config_options
environ = parse_config_options(llm.config, llm.config["timeout"], workers_per_resource, None, True, os.environ.copy())
env: openllm.utils.EnvVarMixin = llm.config["env"]
if env["framework_value"] == "vllm": serialisation_format = "legacy"
env_dict = {
env.framework: env["framework_value"], env.config: f"'{llm.config.model_dump_json().decode()}'",
env.model_id: f"/home/bentoml/bento/models/{llm.tag.path()}",
"OPENLLM_MODEL": llm.config["model_name"], "OPENLLM_SERIALIZATION": serialisation_format,
"OPENLLM_ADAPTER_MAP": f"'{orjson.dumps(adapter_map).decode()}'", "BENTOML_DEBUG": str(True), "BENTOML_QUIET": str(False), "BENTOML_CONFIG_OPTIONS": f"'{_bentoml_config_options}'",
"OPENLLM_ADAPTER_MAP": f"'{orjson.dumps(adapter_map).decode()}'", "BENTOML_DEBUG": str(True), "BENTOML_QUIET": str(False), "BENTOML_CONFIG_OPTIONS": f"'{environ['BENTOML_CONFIG_OPTIONS']}'",
}
if adapter_map: env_dict["BITSANDBYTES_NOWELCOME"] = os.environ.get("BITSANDBYTES_NOWELCOME", "1")
@@ -125,8 +124,7 @@ def create_bento(bento_tag: bentoml.Tag, llm_fs: FS, llm: openllm.LLM[t.Any, t.A
)
bento = bentoml.Bento.create(build_config=build_config, version=bento_tag.version, build_ctx=llm_fs.getsyspath("/"))
# NOTE: the model_id_path here are only used for setting this environment variable within the container
# built with for BentoLLM.
# NOTE: the model_id_path here is only used for setting this environment variable within the container built for BentoLLM.
service_fs_path = fs.path.join("src", llm.config["service_name"])
service_path = bento._fs.getsyspath(service_fs_path)
with open(service_path, "r") as f:

View File

@@ -27,6 +27,7 @@ def parse_config_options(config: LLMConfig, server_timeout: int, workers_per_res
if device:
if len(device) > 1: _bentoml_config_options_opts.extend([f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"[{idx}]={dev}' for idx, dev in enumerate(device)])
else: _bentoml_config_options_opts.append(f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"=[{device[0]}]')
_bentoml_config_options_opts.append(f'runners."llm-generic-embedding".resources.cpu={openllm.get_resource({"cpu":"system"},"cpu")}')
if cors:
_bentoml_config_options_opts.extend(["api_server.http.cors.enabled=true", 'api_server.http.cors.access_control_allow_origins="*"'])
_bentoml_config_options_opts.extend([f'api_server.http.cors.access_control_allow_methods[{idx}]="{it}"' for idx, it in enumerate(["GET", "OPTIONS", "POST", "HEAD", "PUT"])])

View File

@@ -77,7 +77,7 @@ def _start(model_name: str, /, *, model_id: str | None = None, timeout: int = 30
return start_command_factory(start_command if not _serve_grpc else start_grpc_command, model_name, _context_settings=termui.CONTEXT_SETTINGS, _serve_grpc=_serve_grpc).main(args=args if len(args) > 0 else None, standalone_mode=False)
@inject
def _build(model_name: str, /, *, model_id: str | None = None, model_version: str | None = None, quantize: t.Literal["int8", "int4", "gptq"] | None = None, bettertransformer: bool | None = None, adapter_map: dict[str, str | None] | None = None, build_ctx: str | None = None, enable_features: tuple[str, ...] | None = None, workers_per_resource: float | None = None, runtime: t.Literal["ggml", "transformers"] = "transformers", dockerfile_template: str | None = None, overwrite: bool = False, container_registry: LiteralContainerRegistry | None = None, container_version_strategy: LiteralContainerVersionStrategy | None = None, push: bool = False, containerize: bool = False, serialisation_format: t.Literal["safetensors", "legacy"] = "safetensors", additional_args: list[str] | None = None, bento_store: BentoStore = Provide[BentoMLContainer.bento_store]) -> bentoml.Bento:
def _build(model_name: str, /, *, model_id: str | None = None, model_version: str | None = None, bento_version: str | None = None, quantize: t.Literal["int8", "int4", "gptq"] | None = None, bettertransformer: bool | None = None, adapter_map: dict[str, str | None] | None = None, build_ctx: str | None = None, enable_features: tuple[str, ...] | None = None, workers_per_resource: float | None = None, runtime: t.Literal["ggml", "transformers"] = "transformers", dockerfile_template: str | None = None, overwrite: bool = False, container_registry: LiteralContainerRegistry | None = None, container_version_strategy: LiteralContainerVersionStrategy | None = None, push: bool = False, containerize: bool = False, serialisation_format: t.Literal["safetensors", "legacy"] = "safetensors", additional_args: list[str] | None = None, bento_store: BentoStore = Provide[BentoMLContainer.bento_store]) -> bentoml.Bento:
"""Package a LLM into a Bento.
The LLM will be built into a BentoService with the following structure:
@@ -92,6 +92,7 @@ def _build(model_name: str, /, *, model_id: str | None = None, model_version: st
model_name: The model name to start this LLM
model_id: Optional model id for this given LLM
model_version: Optional model version for this given LLM
bento_version: Optional bento version for this given BentoLLM
quantize: Quantize the model weights. This is only applicable for PyTorch models.
Possible quantisation strategies:
- int8: Quantize the model with 8bit (bitsandbytes required)
@@ -126,7 +127,7 @@ def _build(model_name: str, /, *, model_id: str | None = None, model_version: st
Returns:
``bentoml.Bento | str``: BentoLLM instance. This can be used to serve the LLM or can be pushed to BentoCloud.
"""
args: list[str] = [sys.executable, "-m", "openllm", "build", model_name, "--machine", "--runtime", runtime, "--serialisation", serialisation_format,]
args: list[str] = [sys.executable, "-m", "openllm", "build", model_name, "--machine", "--runtime", runtime, "--serialisation", serialisation_format]
if quantize and bettertransformer: raise OpenLLMException("'quantize' and 'bettertransformer' are currently mutually exclusive.")
if quantize: args.extend(["--quantize", quantize])
if bettertransformer: args.append("--bettertransformer")
@@ -140,6 +141,7 @@ def _build(model_name: str, /, *, model_id: str | None = None, model_version: st
if overwrite: args.append("--overwrite")
if adapter_map: args.extend([f"--adapter-id={k}{':'+v if v is not None else ''}" for k, v in adapter_map.items()])
if model_version: args.extend(["--model-version", model_version])
if bento_version: args.extend(["--bento-version", bento_version])
if dockerfile_template: args.extend(["--dockerfile-template", dockerfile_template])
if container_registry is None: container_registry = "ecr"
if container_version_strategy is None: container_version_strategy = "release"

View File

@@ -20,7 +20,6 @@ class HTTPClient(BaseClient[DictStrAny]):
def health(self) -> t.Any: return self._cached.health()
def embed(self, prompt: t.Sequence[str] | str) -> openllm.EmbeddingsOutput:
if not self.supports_embeddings: raise ValueError("This model does not support embeddings.")
if isinstance(prompt, str): prompt = [prompt]
result = httpx.post(urljoin(self._address, f"/{self._api_version}/embeddings"), json=list(prompt), timeout=self.timeout).json() if in_async_context() else self.call("embeddings", list(prompt))
return openllm.EmbeddingsOutput(**result)
@@ -62,7 +61,6 @@ class AsyncHTTPClient(BaseAsyncClient[DictStrAny]):
async def health(self) -> t.Any: return await self._cached.async_health()
async def embed(self, prompt: t.Sequence[str] | str) -> openllm.EmbeddingsOutput:
if not self.supports_embeddings: raise ValueError("This model does not support embeddings.")
if isinstance(prompt, str): prompt = [prompt]
res = await self.acall("embeddings", list(prompt))
return openllm.EmbeddingsOutput(**res)

View File

@@ -106,8 +106,7 @@ def import_model(llm: openllm.LLM[M, T], *decls: t.Any, trust_remote_code: bool,
else:
# we will clone all the things into the bentomodel path without loading the model into memory
snapshot_download(llm.model_id, local_dir=bentomodel.path, local_dir_use_symlinks=False, ignore_patterns=HfIgnore.ignore_patterns(llm))
except Exception:
raise
except Exception: raise
else:
bentomodel.flush() # type: ignore[no-untyped-call]
bentomodel.save(_model_store)
@@ -117,7 +116,6 @@ def import_model(llm: openllm.LLM[M, T], *decls: t.Any, trust_remote_code: bool,
# NOTE: We need to free up the cache after importing the model
# in the case where users first run openllm start without the model available locally.
if openllm.utils.is_torch_available() and torch.cuda.is_available(): torch.cuda.empty_cache()
return bentomodel
def get(llm: openllm.LLM[M, T], auto_import: bool = False) -> bentoml.Model:

View File

@@ -32,7 +32,7 @@ class ModelIdFormatter(ModelNameFormatter):
class ModelAdapterMapFormatter(ModelNameFormatter):
model_keyword: LiteralString = "__model_adapter_map__"
_service_file = Path(os.path.abspath("__file__")).parent.parent/"_service.py"
_service_file = Path(os.path.abspath(__file__)).parent.parent/"_service.py"
def write_service(llm: openllm.LLM[t.Any, t.Any], adapter_map: dict[str, str | None] | None, llm_fs: FS) -> None:
from openllm.utils import DEBUG
model_name = llm.config["model_name"]