mirror of
https://github.com/bentoml/OpenLLM.git
synced 2026-05-19 14:16:22 -04:00
feat(embedding): Adding generic endpoint (#227)
This commit is contained in:
21
openllm-python/README.md
generated
21
openllm-python/README.md
generated
@@ -6,6 +6,8 @@
|
||||
<h1 align="center">🦾 OpenLLM</h1>
|
||||
<a href="https://pypi.org/project/openllm">
|
||||
<img src="https://img.shields.io/pypi/v/openllm.svg?logo=pypi&label=PyPI&logoColor=gold" alt="pypi_status" />
|
||||
</a><a href="https://test.pypi.org/project/openllm/">
|
||||
<img src="https://img.shields.io/badge/Nightly-PyPI?logo=pypi&label=PyPI&color=gray&link=https%3A%2F%2Ftest.pypi.org%2Fproject%2Fopenllm%2F" alt="test_pypi_status" />
|
||||
</a><a href="https://twitter.com/bentomlai">
|
||||
<img src="https://badgen.net/badge/icon/@bentomlai/1DA1F2?icon=twitter&label=Follow%20Us" alt="Twitter" />
|
||||
</a><a href="https://l.bentoml.com/join-openllm-discord">
|
||||
@@ -493,12 +495,12 @@ openllm build opt --model-id facebook/opt-6.7b --adapter-id ...
|
||||
|
||||
OpenLLM encourages contributions by welcoming users to incorporate their custom
|
||||
LLMs into the ecosystem. Check out
|
||||
[Adding a New Model Guide](https://github.com/bentoml/OpenLLM/blob/main/ADDING_NEW_MODEL.md)
|
||||
[Adding a New Model Guide](https://github.com/bentoml/OpenLLM/blob/main/openllm-python/ADDING_NEW_MODEL.md)
|
||||
to see how you can do it yourself.
|
||||
|
||||
### Embeddings
|
||||
|
||||
OpenLLM tentatively provides embeddings endpoint for supported models. This can
|
||||
OpenLLM provides an embeddings endpoint for computing embeddings. This can
|
||||
be accessed via `/v1/embeddings`.
|
||||
|
||||
To use via CLI, simply call `openllm embed`:
|
||||
@@ -532,8 +534,19 @@ client.embed("I like to eat apples")
|
||||
```
|
||||
|
||||
> [!NOTE]
|
||||
> Currently, the following model family supports embeddings: Llama, T5
|
||||
> (Flan-T5, FastChat, etc.), ChatGLM
|
||||
> Currently, the following model families support embeddings calculation: Llama, T5 (Flan-T5, FastChat, etc.), ChatGLM.
|
||||
> For the remaining LLMs that don't have a specific embeddings implementation,
|
||||
> we will use a generic [BertModel](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2)
|
||||
> for embeddings generation. The implementation is largely based on [`bentoml/sentence-embedding-bento`](https://github.com/bentoml/sentence-embedding-bento)
|
||||
|
||||
### Playground and Chat UI
|
||||
|
||||
The following UIs are currently available for OpenLLM:
|
||||
|
||||
| UI | Owner | Type | Progress |
|
||||
|-----------------------------------------------------------------------------------|-----------------------------------------------|----------------------|----------|
|
||||
| [Clojure](https://github.com/bentoml/OpenLLM/blob/main/contrib/clojure/README.md) | [@GutZuFusss](https://github.com/GutZuFusss) | Community-maintained | 🔧 |
|
||||
| TS | BentoML Team | | 🚧 |
|
||||
|
||||
## ⚙️ Integrations
|
||||
|
||||
|
||||
@@ -2,80 +2,80 @@
|
||||
[build-system]
|
||||
build-backend = "hatchling.build"
|
||||
requires = [
|
||||
"hatchling==1.18.0",
|
||||
"hatch-vcs==0.3.0",
|
||||
"hatch-fancy-pypi-readme==23.1.0",
|
||||
"hatch-mypyc==0.16.0",
|
||||
"hatchling==1.18.0",
|
||||
"hatch-vcs==0.3.0",
|
||||
"hatch-fancy-pypi-readme==23.1.0",
|
||||
"hatch-mypyc==0.16.0",
|
||||
]
|
||||
|
||||
[project]
|
||||
authors = [
|
||||
{name = "Aaron Pham",email = "aarnphm@bentoml.com"},
|
||||
{name = "BentoML Team",email = "contact@bentoml.com"},
|
||||
{ name = "Aaron Pham", email = "aarnphm@bentoml.com" },
|
||||
{ name = "BentoML Team", email = "contact@bentoml.com" },
|
||||
]
|
||||
classifiers = [
|
||||
"Development Status :: 5 - Production/Stable",
|
||||
"Environment :: GPU :: NVIDIA CUDA",
|
||||
"Environment :: GPU :: NVIDIA CUDA :: 12",
|
||||
"Environment :: GPU :: NVIDIA CUDA :: 11.8",
|
||||
"Environment :: GPU :: NVIDIA CUDA :: 11.7",
|
||||
"License :: OSI Approved :: Apache Software License",
|
||||
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
||||
"Topic :: Software Development :: Libraries",
|
||||
"Operating System :: OS Independent",
|
||||
"Intended Audience :: Developers",
|
||||
"Intended Audience :: Science/Research",
|
||||
"Intended Audience :: System Administrators",
|
||||
"Typing :: Typed",
|
||||
"Programming Language :: Python",
|
||||
"Programming Language :: Python :: 3",
|
||||
"Programming Language :: Python :: 3 :: Only",
|
||||
"Programming Language :: Python :: 3.8",
|
||||
"Programming Language :: Python :: 3.9",
|
||||
"Programming Language :: Python :: 3.10",
|
||||
"Programming Language :: Python :: 3.11",
|
||||
"Programming Language :: Python :: 3.12",
|
||||
"Programming Language :: Python :: Implementation :: CPython",
|
||||
"Programming Language :: Python :: Implementation :: PyPy",
|
||||
"Development Status :: 5 - Production/Stable",
|
||||
"Environment :: GPU :: NVIDIA CUDA",
|
||||
"Environment :: GPU :: NVIDIA CUDA :: 12",
|
||||
"Environment :: GPU :: NVIDIA CUDA :: 11.8",
|
||||
"Environment :: GPU :: NVIDIA CUDA :: 11.7",
|
||||
"License :: OSI Approved :: Apache Software License",
|
||||
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
||||
"Topic :: Software Development :: Libraries",
|
||||
"Operating System :: OS Independent",
|
||||
"Intended Audience :: Developers",
|
||||
"Intended Audience :: Science/Research",
|
||||
"Intended Audience :: System Administrators",
|
||||
"Typing :: Typed",
|
||||
"Programming Language :: Python",
|
||||
"Programming Language :: Python :: 3",
|
||||
"Programming Language :: Python :: 3 :: Only",
|
||||
"Programming Language :: Python :: 3.8",
|
||||
"Programming Language :: Python :: 3.9",
|
||||
"Programming Language :: Python :: 3.10",
|
||||
"Programming Language :: Python :: 3.11",
|
||||
"Programming Language :: Python :: 3.12",
|
||||
"Programming Language :: Python :: Implementation :: CPython",
|
||||
"Programming Language :: Python :: Implementation :: PyPy",
|
||||
]
|
||||
dependencies = [
|
||||
"bentoml[grpc,io]>=1.0.25",
|
||||
"transformers[torch,tokenizers,accelerate]>=4.29.0",
|
||||
"safetensors",
|
||||
"optimum",
|
||||
"attrs>=23.1.0",
|
||||
"cattrs>=23.1.0",
|
||||
"orjson",
|
||||
"inflection",
|
||||
"tabulate[widechars]>=0.9.0",
|
||||
"httpx",
|
||||
"click>=8.1.3",
|
||||
"typing_extensions",
|
||||
"mypy_extensions",
|
||||
"ghapi",
|
||||
"cuda-python;platform_system!=\"Darwin\"",
|
||||
"bitsandbytes<0.42",
|
||||
"bentoml[grpc,io]>=1.0.25",
|
||||
"transformers[torch,tokenizers,accelerate]>=4.29.0",
|
||||
"safetensors",
|
||||
"optimum",
|
||||
"attrs>=23.1.0",
|
||||
"cattrs>=23.1.0",
|
||||
"orjson",
|
||||
"inflection",
|
||||
"tabulate[widechars]>=0.9.0",
|
||||
"httpx",
|
||||
"click>=8.1.3",
|
||||
"typing_extensions",
|
||||
"mypy_extensions",
|
||||
"ghapi",
|
||||
"cuda-python;platform_system!=\"Darwin\"",
|
||||
"bitsandbytes<0.42",
|
||||
]
|
||||
description = "OpenLLM: Operating LLMs in production"
|
||||
dynamic = ["version", "readme"]
|
||||
keywords = [
|
||||
"MLOps",
|
||||
"AI",
|
||||
"BentoML",
|
||||
"Model Serving",
|
||||
"Model Deployment",
|
||||
"LLMOps",
|
||||
"Falcon",
|
||||
"Vicuna",
|
||||
"Llama 2",
|
||||
"Fine tuning",
|
||||
"Serverless",
|
||||
"Large Language Model",
|
||||
"Generative AI",
|
||||
"StableLM",
|
||||
"Alpaca",
|
||||
"PyTorch",
|
||||
"Transformers",
|
||||
"MLOps",
|
||||
"AI",
|
||||
"BentoML",
|
||||
"Model Serving",
|
||||
"Model Deployment",
|
||||
"LLMOps",
|
||||
"Falcon",
|
||||
"Vicuna",
|
||||
"Llama 2",
|
||||
"Fine tuning",
|
||||
"Serverless",
|
||||
"Large Language Model",
|
||||
"Generative AI",
|
||||
"StableLM",
|
||||
"Alpaca",
|
||||
"PyTorch",
|
||||
"Transformers",
|
||||
]
|
||||
license = "Apache-2.0"
|
||||
name = "openllm"
|
||||
@@ -103,21 +103,21 @@ Twitter = "https://twitter.com/bentomlai"
|
||||
[project.optional-dependencies]
|
||||
agents = ["transformers[agents]>=4.30", "diffusers", "soundfile"]
|
||||
all = [
|
||||
"openllm[agents]",
|
||||
"openllm[baichuan]",
|
||||
"openllm[chatglm]",
|
||||
"openllm[falcon]",
|
||||
"openllm[fine-tune]",
|
||||
"openllm[flan-t5]",
|
||||
"openllm[ggml]",
|
||||
"openllm[gptq]",
|
||||
"openllm[llama]",
|
||||
"openllm[mpt]",
|
||||
"openllm[openai]",
|
||||
"openllm[opt]",
|
||||
"openllm[playground]",
|
||||
"openllm[starcoder]",
|
||||
"openllm[vllm]",
|
||||
"openllm[agents]",
|
||||
"openllm[baichuan]",
|
||||
"openllm[chatglm]",
|
||||
"openllm[falcon]",
|
||||
"openllm[fine-tune]",
|
||||
"openllm[flan-t5]",
|
||||
"openllm[ggml]",
|
||||
"openllm[gptq]",
|
||||
"openllm[llama]",
|
||||
"openllm[mpt]",
|
||||
"openllm[openai]",
|
||||
"openllm[opt]",
|
||||
"openllm[playground]",
|
||||
"openllm[starcoder]",
|
||||
"openllm[vllm]",
|
||||
]
|
||||
baichuan = ["cpm-kernels", "sentencepiece"]
|
||||
chatglm = ["cpm-kernels", "sentencepiece"]
|
||||
@@ -141,12 +141,12 @@ source = "vcs"
|
||||
version-file = "src/openllm/_version.py"
|
||||
[tool.hatch.version.raw-options]
|
||||
git_describe_command = [
|
||||
"git",
|
||||
"describe",
|
||||
"--dirty",
|
||||
"--tags",
|
||||
"--long",
|
||||
"--first-parent",
|
||||
"git",
|
||||
"describe",
|
||||
"--dirty",
|
||||
"--tags",
|
||||
"--long",
|
||||
"--first-parent",
|
||||
]
|
||||
local_scheme = "no-local-version"
|
||||
root = ".."
|
||||
@@ -157,55 +157,54 @@ only-include = ["src/openllm"]
|
||||
sources = ["src"]
|
||||
[tool.hatch.build.targets.sdist]
|
||||
exclude = [
|
||||
"/.git_archival.txt",
|
||||
"tests",
|
||||
"/.python-version-default",
|
||||
"ADDING_NEW_MODEL.md",
|
||||
"/.git_archival.txt",
|
||||
"tests",
|
||||
"/.python-version-default",
|
||||
"ADDING_NEW_MODEL.md",
|
||||
]
|
||||
[tool.hatch.build.targets.wheel.hooks.mypyc]
|
||||
dependencies = [
|
||||
"hatch-mypyc==0.16.0",
|
||||
"mypy==1.4.1",
|
||||
# avoid https://github.com/pallets/click/issues/2558
|
||||
"click==8.1.3",
|
||||
"bentoml==1.1.1",
|
||||
"transformers>=4.31.0",
|
||||
"pandas-stubs",
|
||||
"types-psutil",
|
||||
"types-tabulate",
|
||||
"types-PyYAML",
|
||||
"types-protobuf",
|
||||
"hatch-mypyc==0.16.0",
|
||||
"mypy==1.4.1",
|
||||
# avoid https://github.com/pallets/click/issues/2558
|
||||
"click==8.1.3",
|
||||
"bentoml==1.1.1",
|
||||
"transformers>=4.31.0",
|
||||
"pandas-stubs",
|
||||
"types-psutil",
|
||||
"types-tabulate",
|
||||
"types-PyYAML",
|
||||
"types-protobuf",
|
||||
]
|
||||
enable-by-default = false
|
||||
exclude = ["src/openllm/_service.py", "src/openllm/_typing_compat.py"]
|
||||
include = [
|
||||
"src/openllm/bundle",
|
||||
"src/openllm/models/__init__.py",
|
||||
"src/openllm/models/auto/__init__.py",
|
||||
"src/openllm/utils/__init__.py",
|
||||
"src/openllm/utils/codegen.py",
|
||||
"src/openllm/__init__.py",
|
||||
"src/openllm/_prompt.py",
|
||||
"src/openllm/_schema.py",
|
||||
"src/openllm/_quantisation.py",
|
||||
"src/openllm/_generation.py",
|
||||
"src/openllm/_strategies.py",
|
||||
"src/openllm/exceptions.py",
|
||||
"src/openllm/testing.py",
|
||||
"src/openllm/models/__init__.py",
|
||||
"src/openllm/models/auto/__init__.py",
|
||||
"src/openllm/utils/__init__.py",
|
||||
"src/openllm/__init__.py",
|
||||
"src/openllm/_prompt.py",
|
||||
"src/openllm/_schema.py",
|
||||
"src/openllm/_quantisation.py",
|
||||
"src/openllm/_generation.py",
|
||||
"src/openllm/_strategies.py",
|
||||
"src/openllm/exceptions.py",
|
||||
"src/openllm/testing.py",
|
||||
]
|
||||
# NOTE: This is consistent with pyproject.toml
|
||||
mypy-args = [
|
||||
"--strict",
|
||||
# this is because all transient library doesn't have types
|
||||
"--allow-subclassing-any",
|
||||
"--follow-imports=skip",
|
||||
"--check-untyped-defs",
|
||||
"--ignore-missing-imports",
|
||||
"--no-warn-return-any",
|
||||
"--warn-unreachable",
|
||||
"--no-warn-no-return",
|
||||
"--no-warn-unused-ignores",
|
||||
"--exclude='/src\\/openllm\\/playground\\/**'",
|
||||
"--exclude='/src\\/openllm\\/_typing_compat\\.py$'",
|
||||
"--strict",
|
||||
# this is because the transitive dependencies don't ship type information
|
||||
"--follow-imports=skip",
|
||||
"--allow-subclassing-any",
|
||||
"--check-untyped-defs",
|
||||
"--ignore-missing-imports",
|
||||
"--no-warn-return-any",
|
||||
"--warn-unreachable",
|
||||
"--no-warn-no-return",
|
||||
"--no-warn-unused-ignores",
|
||||
"--exclude='/src\\/openllm\\/playground\\/**'",
|
||||
"--exclude='/src\\/openllm\\/_typing_compat\\.py$'",
|
||||
]
|
||||
options = { verbose = true, strip_asserts = true, debug_level = "2", opt_level = "3", include_runtime_files = true }
|
||||
require-runtime-dependencies = true
|
||||
|
||||
@@ -28,7 +28,9 @@ else:
|
||||
_warnings.filterwarnings("ignore", message="Neither GITHUB_TOKEN nor GITHUB_JWT_TOKEN found: running as unauthenticated")
|
||||
|
||||
_import_structure: dict[str, list[str]] = {
|
||||
"exceptions": [], "models": [], "client": [], "bundle": [], "playground": [], "testing": [], "utils": ["infer_auto_class"], "serialisation": ["ggml", "transformers"], "cli._sdk": ["start", "start_grpc", "build", "import_model", "list_models"], "_llm": ["LLM", "Runner", "LLMRunner", "LLMRunnable", "LLMEmbeddings"], "_configuration": ["LLMConfig", "GenerationConfig", "SamplingParams"], "_generation": ["StopSequenceCriteria", "StopOnTokens", "LogitsProcessorList", "StoppingCriteriaList", "prepare_logits_processor"], "_quantisation": ["infer_quantisation_config"], "_schema": ["GenerationInput", "GenerationOutput", "MetadataOutput", "EmbeddingsOutput", "unmarshal_vllm_outputs", "HfAgentInput"],
|
||||
"exceptions": [], "models": [], "client": [], "bundle": [], "playground": [], "testing": [], "utils": ["infer_auto_class"], "serialisation": ["ggml", "transformers"], "cli._sdk": ["start", "start_grpc", "build", "import_model", "list_models"],
|
||||
"_llm": ["LLM", "Runner", "LLMRunner", "LLMRunnable", "LLMEmbeddings"], "_configuration": ["LLMConfig", "GenerationConfig", "SamplingParams"], "_generation": ["StopSequenceCriteria", "StopOnTokens", "LogitsProcessorList", "StoppingCriteriaList", "prepare_logits_processor"],
|
||||
"_quantisation": ["infer_quantisation_config"], "_schema": ["GenerationInput", "GenerationOutput", "MetadataOutput", "EmbeddingsOutput", "unmarshal_vllm_outputs", "HfAgentInput"], "_embeddings": ["GenericEmbeddingRunnable"], "_strategies": ["CascadingResourceStrategy", "get_resource"],
|
||||
"models.auto": ["AutoConfig", "CONFIG_MAPPING", "MODEL_MAPPING_NAMES", "MODEL_FLAX_MAPPING_NAMES", "MODEL_TF_MAPPING_NAMES", "MODEL_VLLM_MAPPING_NAMES"], "models.chatglm": ["ChatGLMConfig"], "models.baichuan": ["BaichuanConfig"], "models.dolly_v2": ["DollyV2Config"], "models.falcon": ["FalconConfig"], "models.flan_t5": ["FlanT5Config"], "models.gpt_neox": ["GPTNeoXConfig"], "models.llama": ["LlamaConfig"], "models.mpt": ["MPTConfig"], "models.opt": ["OPTConfig"], "models.stablelm": ["StableLMConfig"], "models.starcoder": ["StarCoderConfig"]
|
||||
}
|
||||
COMPILED = _Path(__file__).suffix in (".pyd", ".so")
|
||||
@@ -40,6 +42,8 @@ if _t.TYPE_CHECKING:
|
||||
from ._llm import LLM as LLM, LLMEmbeddings as LLMEmbeddings, LLMRunnable as LLMRunnable, LLMRunner as LLMRunner, Runner as Runner
|
||||
from ._quantisation import infer_quantisation_config as infer_quantisation_config
|
||||
from ._schema import EmbeddingsOutput as EmbeddingsOutput, GenerationInput as GenerationInput, GenerationOutput as GenerationOutput, HfAgentInput as HfAgentInput, MetadataOutput as MetadataOutput, unmarshal_vllm_outputs as unmarshal_vllm_outputs
|
||||
from ._embeddings import GenericEmbeddingRunnable as GenericEmbeddingRunnable
|
||||
from ._strategies import CascadingResourceStrategy as CascadingResourceStrategy, get_resource as get_resource
|
||||
from .cli._sdk import build as build, import_model as import_model, list_models as list_models, start as start, start_grpc as start_grpc
|
||||
from .models.auto import CONFIG_MAPPING as CONFIG_MAPPING, MODEL_FLAX_MAPPING_NAMES as MODEL_FLAX_MAPPING_NAMES, MODEL_MAPPING_NAMES as MODEL_MAPPING_NAMES, MODEL_TF_MAPPING_NAMES as MODEL_TF_MAPPING_NAMES, MODEL_VLLM_MAPPING_NAMES as MODEL_VLLM_MAPPING_NAMES, AutoConfig as AutoConfig
|
||||
from .models.baichuan import BaichuanConfig as BaichuanConfig
|
||||
@@ -54,7 +58,7 @@ if _t.TYPE_CHECKING:
|
||||
from .models.stablelm import StableLMConfig as StableLMConfig
|
||||
from .models.starcoder import StarCoderConfig as StarCoderConfig
|
||||
from .serialisation import ggml as ggml, transformers as transformers
|
||||
from openllm.utils import infer_auto_class as infer_auto_class
|
||||
from .utils import infer_auto_class as infer_auto_class
|
||||
|
||||
try:
|
||||
if not (utils.is_torch_available() and utils.is_cpm_kernels_available()): raise exceptions.MissingDependencyError
|
||||
|
||||
48
openllm-python/src/openllm/_embeddings.py
Normal file
48
openllm-python/src/openllm/_embeddings.py
Normal file
@@ -0,0 +1,48 @@
|
||||
# See https://github.com/bentoml/sentence-embedding-bento for more information.
|
||||
from __future__ import annotations
|
||||
import bentoml, openllm, transformers, typing as t
|
||||
from huggingface_hub import snapshot_download
|
||||
from bentoml._internal.frameworks.transformers import MODULE_NAME, API_VERSION
|
||||
from bentoml._internal.models.model import ModelOptions, ModelSignature
|
||||
if t.TYPE_CHECKING: import torch
|
||||
|
||||
_GENERIC_EMBEDDING_ID="sentence-transformers/all-MiniLM-L6-v2"
|
||||
_BENTOMODEL_ID="sentence-transformers--all-MiniLM-L6-v2"
|
||||
|
||||
def get_or_download(ids: str = _BENTOMODEL_ID) -> bentoml.Model:
    """Return the generic embedding model from the BentoML store, creating it on a miss.

    Args:
        ids: Name looked up through ``bentoml.transformers.get`` and, when the
            lookup raises ``NotFound``, used for the newly created store entry.

    Returns:
        The ``bentoml.Model`` that was found under ``ids`` or was just created.

    Note:
        The files are always downloaded from ``_GENERIC_EMBEDDING_ID``,
        regardless of the value of ``ids``.
    """
    try:
        return bentoml.transformers.get(ids)
    except bentoml.exceptions.NotFound:
        # Every listed method is registered as non-batchable on the stored model.
        method_names = (
            "forward",
            "generate",
            "contrastive_search",
            "greedy_search",
            "sample",
            "beam_search",
            "beam_sample",
            "group_beam_search",
            "constrained_beam_search",
            "__call__",
        )
        model_signatures = {name: ModelSignature(batchable=False) for name in method_names}
        # Files excluded from the snapshot: alternative weight formats and docs.
        skipped_files = ["*.safetensors", "*.h5", "*.ot", "*.pdf", "*.md", ".gitattributes", "LICENSE.txt"]
        with bentoml.models.create(
            ids,
            module=MODULE_NAME,
            api_version=API_VERSION,
            options=ModelOptions(),
            context=openllm.utils.generate_context(framework_name="transformers"),
            labels={"runtime": "pt", "framework": "openllm"},
            signatures=model_signatures,
        ) as bentomodel:
            # Materialise real files (no symlinks) directly inside the store entry.
            snapshot_download(
                _GENERIC_EMBEDDING_ID,
                local_dir=bentomodel.path,
                local_dir_use_symlinks=False,
                ignore_patterns=skipped_files,
            )
        # NOTE(review): returned after the ``with`` has exited; presumably the
        # entry is persisted by ``bentoml.models.create`` on exit -- confirm.
        return bentomodel
|
||||
|
||||
class GenericEmbeddingRunnable(bentoml.Runnable):
    """Runnable that embeds sentences with the generic embedding model.

    The tokenizer and model are loaded from the path of the model returned by
    ``get_or_download()``. Sentence vectors are obtained by mask-aware mean
    pooling over the token embeddings, followed by L2 normalisation.
    """

    SUPPORTED_RESOURCES = ("nvidia.com/gpu", "cpu")
    SUPPORTS_CPU_MULTI_THREADING = True

    def __init__(self) -> None:
        """Pick a device, then load the tokenizer and model onto it."""
        # Use CUDA whenever openllm reports at least one device, else the CPU.
        if openllm.utils.device_count() > 0:
            self.device = "cuda"
        else:
            self.device = "cpu"
        self._bentomodel = get_or_download()
        model_dir = self._bentomodel.path
        self.tokenizer = transformers.AutoTokenizer.from_pretrained(model_dir)
        self.model = transformers.AutoModel.from_pretrained(model_dir)
        self.model.to(self.device)

    @bentoml.Runnable.method(batchable=True, batch_dim=0)
    def encode(self, sentences: list[str]) -> t.Sequence[openllm.LLMEmbeddings]:
        """Embed ``sentences`` and report how many tokens were attended to.

        Args:
            sentences: Texts to embed; they are padded and truncated together.

        Returns:
            A one-element list whose ``LLMEmbeddings`` holds the normalised
            embeddings for all sentences (as a NumPy array) and the sum of the
            attention mask as ``num_tokens``.
        """
        import torch, torch.nn.functional as F

        batch = self.tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
        encoded_input = batch.to(self.device)
        attention_mask = encoded_input["attention_mask"]
        # Inference only: no gradients are needed for the forward pass.
        with torch.no_grad():
            model_output = self.model(**encoded_input)
        pooled = self.mean_pooling(model_output, attention_mask)
        sentence_embeddings = F.normalize(pooled, p=2, dim=1)
        # The attention mask sums to the number of non-padding tokens in the batch.
        token_count = int(torch.sum(attention_mask).item())
        return [openllm.LLMEmbeddings(embeddings=sentence_embeddings.cpu().numpy(), num_tokens=token_count)]

    @staticmethod
    def mean_pooling(model_output: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        """Average token embeddings along dim 1, weighting each token by its mask value.

        Args:
            model_output: Model forward output; its first element is taken as
                the token embeddings.
            attention_mask: Mask that is expanded to the token-embedding shape.

        Returns:
            The mask-weighted sum over dim 1 divided by the mask sum, with the
            denominator clamped to at least ``1e-9`` to avoid division by zero.
        """
        import torch

        token_embeddings = model_output[0]
        # Broadcast the mask across the trailing dimension as floats.
        mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        weighted_sum = torch.sum(token_embeddings * mask, 1)
        mask_total = torch.clamp(mask.sum(1), min=1e-9)
        return weighted_sum / mask_total
|
||||
|
||||
__all__ = ["GenericEmbeddingRunnable"]
|
||||
@@ -926,7 +926,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
|
||||
scheduling_strategy = CascadingResourceStrategy
|
||||
|
||||
generate_sig = ModelSignature.from_dict(t.cast("_ModelSignatureDict", ModelSignatureDict(batchable=False)))
|
||||
embeddings_sig = ModelSignature.from_dict(t.cast("_ModelSignatureDict", ModelSignatureDict(batchable=False)))
|
||||
embeddings_sig = ModelSignature.from_dict(t.cast("_ModelSignatureDict", ModelSignatureDict(batchable=True, batch_dim=0)))
|
||||
generate_iterator_sig = ModelSignature.from_dict(t.cast("_ModelSignatureDict", ModelSignatureDict(batchable=True)))
|
||||
|
||||
# NOTE: returning the two langchain API's to the runner
|
||||
@@ -1036,8 +1036,8 @@ def llm_runnable_class(self: LLM[M, T], embeddings_sig: ModelSignature, generate
|
||||
logger.info("Successfully apply LoRA layer %s", adapter_name)
|
||||
|
||||
@bentoml.Runnable.method(**method_signature(embeddings_sig))
|
||||
def embeddings(__self: _Runnable, prompt: str | list[str]) -> LLMEmbeddings:
|
||||
return self.embeddings([prompt] if isinstance(prompt, str) else prompt)
|
||||
def embeddings(__self: _Runnable, prompt: str | list[str]) -> t.Sequence[LLMEmbeddings]:
|
||||
return [self.embeddings([prompt] if isinstance(prompt, str) else prompt)]
|
||||
|
||||
@bentoml.Runnable.method(**method_signature(generate_sig))
|
||||
def __call__(__self: _Runnable, prompt: str, **attrs: t.Any) -> list[t.Any]:
|
||||
|
||||
@@ -1,9 +1,3 @@
|
||||
# mypy: disable-error-code="arg-type,misc"
|
||||
"""The service definition for running any LLMService.
|
||||
|
||||
For line with comment '# openllm: ...', it must not be modified as it is managed internally by OpenLLM.
|
||||
Codegen can be found under 'openllm.utils.codegen'
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import os, warnings, orjson, bentoml, openllm, typing as t
|
||||
from starlette.applications import Starlette
|
||||
@@ -12,6 +6,7 @@ from starlette.routing import Route
|
||||
if t.TYPE_CHECKING:
|
||||
from starlette.requests import Request
|
||||
from starlette.responses import Response
|
||||
from bentoml._internal.runner.runner import RunnerMethod
|
||||
# The following warnings come from bitsandbytes and are probably not important for users to see
|
||||
warnings.filterwarnings("ignore", message="MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization")
|
||||
warnings.filterwarnings("ignore", message="MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization")
|
||||
@@ -20,7 +15,10 @@ model = os.environ.get("OPENLLM_MODEL", "{__model_name__}") # openllm: model na
|
||||
adapter_map = os.environ.get("OPENLLM_ADAPTER_MAP", """{__model_adapter_map__}""") # openllm: model adapter map
|
||||
llm_config = openllm.AutoConfig.for_model(model)
|
||||
runner = openllm.Runner(model, llm_config=llm_config, ensure_available=False, adapter_map=orjson.loads(adapter_map))
|
||||
svc = bentoml.Service(name=f"llm-{llm_config['start_name']}-service", runners=[runner])
|
||||
generic_embedding_runner = bentoml.Runner(openllm.GenericEmbeddingRunnable, name="llm-generic-embedding", scheduling_strategy=openllm.CascadingResourceStrategy, max_batch_size=32, max_latency_ms=300)
|
||||
runners: t.Sequence[bentoml.Runner] = [runner]
|
||||
if not runner.supports_embeddings: runners.append(generic_embedding_runner)
|
||||
svc = bentoml.Service(name=f"llm-{llm_config['start_name']}-service", runners=runners)
|
||||
|
||||
@svc.api(route="/v1/generate", input=bentoml.io.JSON.from_sample({"prompt": "", "llm_config": llm_config.model_dump(flatten=True)}), output=bentoml.io.JSON.from_sample({"responses": [], "configuration": llm_config.model_dump(flatten=True)}))
|
||||
async def generate_v1(input_dict: dict[str, t.Any]) -> openllm.GenerationOutput:
|
||||
@@ -33,11 +31,11 @@ async def generate_v1(input_dict: dict[str, t.Any]) -> openllm.GenerationOutput:
|
||||
def metadata_v1(_: str) -> openllm.MetadataOutput:
|
||||
return openllm.MetadataOutput(timeout=llm_config["timeout"], model_name=llm_config["model_name"], framework=llm_config["env"]["framework_value"], model_id=runner.llm.model_id, configuration=llm_config.model_dump_json().decode(), supports_embeddings=runner.supports_embeddings, supports_hf_agent=runner.supports_hf_agent)
|
||||
|
||||
if runner.supports_embeddings:
|
||||
@svc.api(route="/v1/embeddings", input=bentoml.io.JSON.from_sample(["Hey Jude, welcome to the jungle!", "What is the meaning of life?"]), output=bentoml.io.JSON.from_sample({"embeddings": [0.007917795330286026, -0.014421648345887661, 0.00481307040899992, 0.007331526838243008, -0.0066398633643984795, 0.00945580005645752, 0.0087016262114048, -0.010709521360695362, 0.012635177001357079, 0.010541186667978764, -0.00730888033285737, -0.001783102168701589, 0.02339819073677063, -0.010825827717781067, -0.015888236463069916, 0.01876218430697918, 0.0076906150206923485, 0.0009032754460349679, -0.010024012066423893, 0.01090280432254076, -0.008668390102684498, 0.02070549875497818, 0.0014594447566196322, -0.018775740638375282, -0.014814382418990135, 0.01796768605709076], "num_tokens": 20}))
|
||||
async def embeddings_v1(phrases: list[str]) -> openllm.EmbeddingsOutput:
|
||||
responses = await runner.embeddings.async_run(phrases)
|
||||
return openllm.EmbeddingsOutput(embeddings=responses["embeddings"], num_tokens=responses["num_tokens"])
|
||||
@svc.api(route="/v1/embeddings", input=bentoml.io.JSON.from_sample(["Hey Jude, welcome to the jungle!", "What is the meaning of life?"]), output=bentoml.io.JSON.from_sample({"embeddings": [0.007917795330286026, -0.014421648345887661, 0.00481307040899992, 0.007331526838243008, -0.0066398633643984795, 0.00945580005645752, 0.0087016262114048, -0.010709521360695362, 0.012635177001357079, 0.010541186667978764, -0.00730888033285737, -0.001783102168701589, 0.02339819073677063, -0.010825827717781067, -0.015888236463069916, 0.01876218430697918, 0.0076906150206923485, 0.0009032754460349679, -0.010024012066423893, 0.01090280432254076, -0.008668390102684498, 0.02070549875497818, 0.0014594447566196322, -0.018775740638375282, -0.014814382418990135, 0.01796768605709076], "num_tokens": 20}))
|
||||
async def embeddings_v1(phrases: list[str]) -> openllm.EmbeddingsOutput:
|
||||
embed_call: RunnerMethod[bentoml.Runnable | openllm.LLMRunnable[t.Any, t.Any], [list[str]], t.Sequence[openllm.LLMEmbeddings]] = runner.embeddings if runner.supports_embeddings else generic_embedding_runner.encode
|
||||
responses = (await embed_call.async_run(phrases))[0]
|
||||
return openllm.EmbeddingsOutput(embeddings=responses["embeddings"], num_tokens=responses["num_tokens"])
|
||||
|
||||
if runner.supports_hf_agent and openllm.utils.is_transformers_supports_agent():
|
||||
async def hf_agent(request: Request) -> Response:
|
||||
|
||||
@@ -330,3 +330,5 @@ class CascadingResourceStrategy(bentoml.Strategy, ReprMixin):
|
||||
if idx >= len(gpus): raise ValueError(f"Number of available GPU ({gpus}) preceeds the given workers_per_resource {workers_per_resource}")
|
||||
dev = str(gpus[idx])
|
||||
return dev
|
||||
|
||||
__all__=["CascadingResourceStrategy", "get_resource"]
|
||||
|
||||
@@ -82,7 +82,7 @@ class LLMRunner(bentoml.Runner, t.Generic[M, T]):
|
||||
supports_embeddings: bool
|
||||
supports_hf_agent: bool
|
||||
has_adapters: bool
|
||||
embeddings: RunnerMethod[LLMRunnable[M, T], [list[str]], LLMEmbeddings]
|
||||
embeddings: RunnerMethod[LLMRunnable[M, T], [list[str]], t.Sequence[LLMEmbeddings]]
|
||||
generate: RunnerMethod[LLMRunnable[M, T], [str], list[t.Any]]
|
||||
generate_one: RunnerMethod[LLMRunnable[M, T], [str, list[str]], t.Sequence[dict[t.Literal["generated_text"], str]]]
|
||||
generate_iterator: RunnerMethod[LLMRunnable[M, T], [str], t.Generator[t.Any, None, None]]
|
||||
|
||||
@@ -78,16 +78,15 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any], llm_fs: FS, extra_d
|
||||
return PythonOptions(packages=packages, wheels=wheels, lock_packages=False, extra_index_url=["https://download.pytorch.org/whl/cu118"])
|
||||
|
||||
def construct_docker_options(llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_resource: float, quantize: LiteralString | None, bettertransformer: bool | None, adapter_map: dict[str, str | None] | None, dockerfile_template: str | None, runtime: t.Literal["ggml", "transformers"], serialisation_format: t.Literal["safetensors", "legacy"], container_registry: LiteralContainerRegistry, container_version_strategy: LiteralContainerVersionStrategy) -> DockerOptions:
|
||||
_bentoml_config_options = os.environ.pop("BENTOML_CONFIG_OPTIONS", "")
|
||||
_bentoml_config_options_opts = ["tracing.sample_rate=1.0", f'runners."llm-{llm.config["start_name"]}-runner".traffic.timeout={llm.config["timeout"]}', f'api_server.traffic.timeout={llm.config["timeout"]}', f'runners."llm-{llm.config["start_name"]}-runner".traffic.timeout={llm.config["timeout"]}', f'runners."llm-{llm.config["start_name"]}-runner".workers_per_resource={workers_per_resource}']
|
||||
_bentoml_config_options += " " if _bentoml_config_options else "" + " ".join(_bentoml_config_options_opts)
|
||||
from openllm.cli._factory import parse_config_options
|
||||
environ = parse_config_options(llm.config, llm.config["timeout"], workers_per_resource, None, True, os.environ.copy())
|
||||
env: openllm.utils.EnvVarMixin = llm.config["env"]
|
||||
if env["framework_value"] == "vllm": serialisation_format = "legacy"
|
||||
env_dict = {
|
||||
env.framework: env["framework_value"], env.config: f"'{llm.config.model_dump_json().decode()}'",
|
||||
env.model_id: f"/home/bentoml/bento/models/{llm.tag.path()}",
|
||||
"OPENLLM_MODEL": llm.config["model_name"], "OPENLLM_SERIALIZATION": serialisation_format,
|
||||
"OPENLLM_ADAPTER_MAP": f"'{orjson.dumps(adapter_map).decode()}'", "BENTOML_DEBUG": str(True), "BENTOML_QUIET": str(False), "BENTOML_CONFIG_OPTIONS": f"'{_bentoml_config_options}'",
|
||||
"OPENLLM_ADAPTER_MAP": f"'{orjson.dumps(adapter_map).decode()}'", "BENTOML_DEBUG": str(True), "BENTOML_QUIET": str(False), "BENTOML_CONFIG_OPTIONS": f"'{environ['BENTOML_CONFIG_OPTIONS']}'",
|
||||
}
|
||||
if adapter_map: env_dict["BITSANDBYTES_NOWELCOME"] = os.environ.get("BITSANDBYTES_NOWELCOME", "1")
|
||||
|
||||
@@ -125,8 +124,7 @@ def create_bento(bento_tag: bentoml.Tag, llm_fs: FS, llm: openllm.LLM[t.Any, t.A
|
||||
)
|
||||
|
||||
bento = bentoml.Bento.create(build_config=build_config, version=bento_tag.version, build_ctx=llm_fs.getsyspath("/"))
|
||||
# NOTE: the model_id_path here are only used for setting this environment variable within the container
|
||||
# built with for BentoLLM.
|
||||
# NOTE: the model_id_path here is only used for setting this environment variable within the container built for BentoLLM.
|
||||
service_fs_path = fs.path.join("src", llm.config["service_name"])
|
||||
service_path = bento._fs.getsyspath(service_fs_path)
|
||||
with open(service_path, "r") as f:
|
||||
|
||||
@@ -27,6 +27,7 @@ def parse_config_options(config: LLMConfig, server_timeout: int, workers_per_res
|
||||
if device:
|
||||
if len(device) > 1: _bentoml_config_options_opts.extend([f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"[{idx}]={dev}' for idx, dev in enumerate(device)])
|
||||
else: _bentoml_config_options_opts.append(f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"=[{device[0]}]')
|
||||
_bentoml_config_options_opts.append(f'runners."llm-generic-embedding".resources.cpu={openllm.get_resource({"cpu":"system"},"cpu")}')
|
||||
if cors:
|
||||
_bentoml_config_options_opts.extend(["api_server.http.cors.enabled=true", 'api_server.http.cors.access_control_allow_origins="*"'])
|
||||
_bentoml_config_options_opts.extend([f'api_server.http.cors.access_control_allow_methods[{idx}]="{it}"' for idx, it in enumerate(["GET", "OPTIONS", "POST", "HEAD", "PUT"])])
|
||||
|
||||
@@ -77,7 +77,7 @@ def _start(model_name: str, /, *, model_id: str | None = None, timeout: int = 30
|
||||
return start_command_factory(start_command if not _serve_grpc else start_grpc_command, model_name, _context_settings=termui.CONTEXT_SETTINGS, _serve_grpc=_serve_grpc).main(args=args if len(args) > 0 else None, standalone_mode=False)
|
||||
|
||||
@inject
|
||||
def _build(model_name: str, /, *, model_id: str | None = None, model_version: str | None = None, quantize: t.Literal["int8", "int4", "gptq"] | None = None, bettertransformer: bool | None = None, adapter_map: dict[str, str | None] | None = None, build_ctx: str | None = None, enable_features: tuple[str, ...] | None = None, workers_per_resource: float | None = None, runtime: t.Literal["ggml", "transformers"] = "transformers", dockerfile_template: str | None = None, overwrite: bool = False, container_registry: LiteralContainerRegistry | None = None, container_version_strategy: LiteralContainerVersionStrategy | None = None, push: bool = False, containerize: bool = False, serialisation_format: t.Literal["safetensors", "legacy"] = "safetensors", additional_args: list[str] | None = None, bento_store: BentoStore = Provide[BentoMLContainer.bento_store]) -> bentoml.Bento:
|
||||
def _build(model_name: str, /, *, model_id: str | None = None, model_version: str | None = None, bento_version: str | None = None, quantize: t.Literal["int8", "int4", "gptq"] | None = None, bettertransformer: bool | None = None, adapter_map: dict[str, str | None] | None = None, build_ctx: str | None = None, enable_features: tuple[str, ...] | None = None, workers_per_resource: float | None = None, runtime: t.Literal["ggml", "transformers"] = "transformers", dockerfile_template: str | None = None, overwrite: bool = False, container_registry: LiteralContainerRegistry | None = None, container_version_strategy: LiteralContainerVersionStrategy | None = None, push: bool = False, containerize: bool = False, serialisation_format: t.Literal["safetensors", "legacy"] = "safetensors", additional_args: list[str] | None = None, bento_store: BentoStore = Provide[BentoMLContainer.bento_store]) -> bentoml.Bento:
|
||||
"""Package a LLM into a Bento.
|
||||
|
||||
The LLM will be built into a BentoService with the following structure:
|
||||
@@ -92,6 +92,7 @@ def _build(model_name: str, /, *, model_id: str | None = None, model_version: st
|
||||
model_name: The model name to start this LLM
|
||||
model_id: Optional model id for this given LLM
|
||||
model_version: Optional model version for this given LLM
|
||||
bento_version: Optional bento version for this given BentoLLM
|
||||
quantize: Quantize the model weights. This is only applicable for PyTorch models.
|
||||
Possible quantisation strategies:
|
||||
- int8: Quantize the model with 8bit (bitsandbytes required)
|
||||
@@ -126,7 +127,7 @@ def _build(model_name: str, /, *, model_id: str | None = None, model_version: st
|
||||
Returns:
|
||||
``bentoml.Bento | str``: BentoLLM instance. This can be used to serve the LLM or can be pushed to BentoCloud.
|
||||
"""
|
||||
args: list[str] = [sys.executable, "-m", "openllm", "build", model_name, "--machine", "--runtime", runtime, "--serialisation", serialisation_format,]
|
||||
args: list[str] = [sys.executable, "-m", "openllm", "build", model_name, "--machine", "--runtime", runtime, "--serialisation", serialisation_format]
|
||||
if quantize and bettertransformer: raise OpenLLMException("'quantize' and 'bettertransformer' are currently mutually exclusive.")
|
||||
if quantize: args.extend(["--quantize", quantize])
|
||||
if bettertransformer: args.append("--bettertransformer")
|
||||
@@ -140,6 +141,7 @@ def _build(model_name: str, /, *, model_id: str | None = None, model_version: st
|
||||
if overwrite: args.append("--overwrite")
|
||||
if adapter_map: args.extend([f"--adapter-id={k}{':'+v if v is not None else ''}" for k, v in adapter_map.items()])
|
||||
if model_version: args.extend(["--model-version", model_version])
|
||||
if bento_version: args.extend(["--bento-version", bento_version])
|
||||
if dockerfile_template: args.extend(["--dockerfile-template", dockerfile_template])
|
||||
if container_registry is None: container_registry = "ecr"
|
||||
if container_version_strategy is None: container_version_strategy = "release"
|
||||
|
||||
@@ -20,7 +20,6 @@ class HTTPClient(BaseClient[DictStrAny]):
|
||||
|
||||
def health(self) -> t.Any: return self._cached.health()
|
||||
def embed(self, prompt: t.Sequence[str] | str) -> openllm.EmbeddingsOutput:
|
||||
if not self.supports_embeddings: raise ValueError("This model does not support embeddings.")
|
||||
if isinstance(prompt, str): prompt = [prompt]
|
||||
result = httpx.post(urljoin(self._address, f"/{self._api_version}/embeddings"), json=list(prompt), timeout=self.timeout).json() if in_async_context() else self.call("embeddings", list(prompt))
|
||||
return openllm.EmbeddingsOutput(**result)
|
||||
@@ -62,7 +61,6 @@ class AsyncHTTPClient(BaseAsyncClient[DictStrAny]):
|
||||
|
||||
async def health(self) -> t.Any: return await self._cached.async_health()
|
||||
async def embed(self, prompt: t.Sequence[str] | str) -> openllm.EmbeddingsOutput:
|
||||
if not self.supports_embeddings: raise ValueError("This model does not support embeddings.")
|
||||
if isinstance(prompt, str): prompt = [prompt]
|
||||
res = await self.acall("embeddings", list(prompt))
|
||||
return openllm.EmbeddingsOutput(**res)
|
||||
|
||||
@@ -106,8 +106,7 @@ def import_model(llm: openllm.LLM[M, T], *decls: t.Any, trust_remote_code: bool,
|
||||
else:
|
||||
# we will clone all things into the bentomodel path without loading the model into memory
|
||||
snapshot_download(llm.model_id, local_dir=bentomodel.path, local_dir_use_symlinks=False, ignore_patterns=HfIgnore.ignore_patterns(llm))
|
||||
except Exception:
|
||||
raise
|
||||
except Exception: raise
|
||||
else:
|
||||
bentomodel.flush() # type: ignore[no-untyped-call]
|
||||
bentomodel.save(_model_store)
|
||||
@@ -117,7 +116,6 @@ def import_model(llm: openllm.LLM[M, T], *decls: t.Any, trust_remote_code: bool,
|
||||
# NOTE: We need to free up the cache after importing the model
|
||||
# in the case where users first run openllm start without the model available locally.
|
||||
if openllm.utils.is_torch_available() and torch.cuda.is_available(): torch.cuda.empty_cache()
|
||||
|
||||
return bentomodel
|
||||
|
||||
def get(llm: openllm.LLM[M, T], auto_import: bool = False) -> bentoml.Model:
|
||||
|
||||
@@ -32,7 +32,7 @@ class ModelIdFormatter(ModelNameFormatter):
|
||||
class ModelAdapterMapFormatter(ModelNameFormatter):
|
||||
model_keyword: LiteralString = "__model_adapter_map__"
|
||||
|
||||
_service_file = Path(os.path.abspath("__file__")).parent.parent/"_service.py"
|
||||
_service_file = Path(os.path.abspath(__file__)).parent.parent/"_service.py"
|
||||
def write_service(llm: openllm.LLM[t.Any, t.Any], adapter_map: dict[str, str | None] | None, llm_fs: FS) -> None:
|
||||
from openllm.utils import DEBUG
|
||||
model_name = llm.config["model_name"]
|
||||
|
||||
Reference in New Issue
Block a user