feat(service): provisional API (#133)

This commit is contained in:
Aaron Pham
2023-07-23 02:15:39 -04:00
committed by GitHub
parent d88b069160
commit 693631958a
59 changed files with 683 additions and 2085 deletions

View File

@@ -20,7 +20,7 @@ ci:
exclude: '.*\.(css|js|svg)$'
repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: 'v0.0.278'
rev: 'v0.0.280'
hooks:
- id: ruff
args: [--exit-non-zero-on-fix, --show-fixes]
@@ -28,6 +28,8 @@ repos:
rev: 23.7.0
hooks:
- id: black-jupyter
args: [--config=pyproject.toml]
exclude: (?x)^(src/openllm/models/.*)$
- repo: https://github.com/econchick/interrogate
rev: 1.5.0
hooks:
@@ -50,7 +52,6 @@ repos:
tools/.*|
tests/.*|
src/openllm/playground/.*|
src/openllm/models/.*|
.github/.*
)$
additional_dependencies: ["mypy==1.4.1", "types-tabulate", "types-Deprecated", "types-PyYAML", "types-decorator", "types-protobuf", "types-python-dateutil", "types-requests", "types-setuptools", "types-six", "types-ujson", "pandas-stubs", "types-Pillow", "types-Pygments", "types-appdirs", "types-colorama", "types-google-cloud-ndb", "types-jsonschema", "types-psutil", "types-pywin32", "types-tqdm", "types-openpyxl"]

View File

@@ -299,7 +299,7 @@ pip install "openllm[mpt]"
<tr>
<td><a href=https://huggingface.co/docs/transformers/model_doc/opt>opt</a></td>
<td><a href=https://huggingface.co/docs/transformers/main/model_doc/opt#transformers.MPTForCausalLM><code>MPTForCausalLM</code></a></td>
<td><a href=https://huggingface.co/docs/transformers/main/model_doc/opt#transformers.OPTForCausalLM><code>OPTForCausalLM</code></a></td>
<td>✅</td>
<td>✅</td>
<td>

View File

@@ -0,0 +1,14 @@
APIs for LLMService are now provisional based on the capabilities of the LLM.
The following APIs are considered provisional:
- `/v1/embeddings`: This will be available if the LLM supports embeddings (i.e. ``LLM.embeddings`` is implemented. Example models are ``llama``)
- `/hf/agent`: This will be available if the LLM supports running HF agents (i.e. ``LLM.generate_one`` is implemented. Example models are ``starcoder``, ``falcon``.)
- `POST /v1/adapters` and `GET /v1/adapters`: This will be available if the server is running with LoRA weights
``openllm.LLMRunner`` now includes three additional boolean attributes:
- `runner.supports_embeddings`: Whether this runner supports embeddings
- `runner.supports_hf_agent`: Whether this runner supports HF agents
- `runner.has_adapters`: Whether this runner is loaded with LoRA adapters.
Optimized ``openllm.models``'s bytecode performance

View File

@@ -157,7 +157,7 @@ python_files = ["test_*.py", "*_test.py"]
testpaths = ["tests"]
[tool.black]
exclude = '''
extend-exclude = '''
(
/(
\.eggs
@@ -174,14 +174,15 @@ exclude = '''
| tools
)/
| src/openllm/__about__.py
| src/openllm/models
)
'''
line-length = 119
target-version = ["py38", "py39", "py310", "py311"]
[tool.ruff]
exclude = ["tools", "src/openllm/playground"]
extend-include = ["*.py", "*.pyi", "**/pyproject.toml", "*.ipynb"]
extend-exclude = ["tools", "src/openllm/playground", "src/openllm/models", "src/openllm/_types.py"]
extend-include = ["*.ipynb"]
extend-select = [
"B", # flake8-bugbear
"I", # isort
@@ -223,12 +224,14 @@ ignore = [
"TCH004", # don't move runtime import out, just warn about it
"RUF012", # mutable attributes to be used with ClassVar
"B905", # zip warning about strict, only applicable for 3.10+
"D105", # magic docstring
]
line-length = 119
target-version = "py312"
unfixable = [
"F401", # Don't touch unused imports, just warn about it.
"TCH004", # Don't touch import outside of TYPE_CHECKING block
"RUF100", # unused noqa, just warn about it
]
[tool.ruff.flake8-type-checking]
exempt-modules = ["typing", "typing_extensions", "."]
@@ -255,17 +258,9 @@ avoid-escape = false
# Tests can use magic values, assertions, and relative imports
"__init__.py" = ["E402", "F401", "F403", "F811"]
"examples/**/*" = ["D"]
"src/openllm/_llm.py" = ["B010", "B009"]
"src/openllm/_strategies.py" = ["B904"]
"src/openllm/_types.py" = ["E402"]
"src/openllm/cli.py" = ["D301", "S101"]
"src/openllm/models/**/*" = ["D106", "S101", "D104"]
"src/openllm/playground/**/*" = ["E402", "F401", "PLR", "D"]
"src/openllm/utils/dummy_*" = ["D107"]
"src/openllm/utils/import_utils.py" = [
"PLW0603", # OK to ignore global access here
"D105", # magic docstring
]
"src/openllm/utils/import_utils.py" = ["PLW0603"]
"src/openllm_client/runtimes/*" = ["D107"]
"tests/**/*" = [
"S101",

View File

@@ -28,6 +28,7 @@ from abc import abstractmethod
from pathlib import Path
import attr
import inflection
import orjson
from huggingface_hub import hf_hub_download
@@ -82,6 +83,7 @@ if t.TYPE_CHECKING:
from ._configuration import PeftType
from ._types import AdaptersMapping
from ._types import AdaptersTuple
from ._types import AnyCallable
from ._types import DictStrAny
from ._types import ListStr
from ._types import LiteralRuntime
@@ -161,13 +163,12 @@ def make_tag(
model_version = tag.version
model_name = tag.name
else:
if model_version is None: # noqa: PLR5501
if not quiet:
logger.warning(
"Given 'model_id=%s' is a path, and 'model_version' is not passed. OpenLLM will generate the version based on the last modified time of this given directory.",
model_id,
)
model_version = generate_hash_from_file(model_id)
if not quiet and model_version is None:
logger.warning(
"Given 'model_id=%s' is a path, and 'model_version' is not passed. OpenLLM will generate the version based on the last modified time of this given directory.",
model_id,
)
model_version = first_not_none(model_version, default=generate_hash_from_file(model_id))
else:
config = t.cast(
"transformers.PretrainedConfig",
@@ -418,6 +419,15 @@ class LLMInterface(ABC, t.Generic[M, T]):
__llm_adapter_map__: dict[AdapterType, dict[str | t.Literal["default"], tuple[peft.PeftConfig, str]]] | None
"""A reference to the the cached LoRA adapter mapping."""
__llm_supports_embeddings__: bool
"""A boolean to determine whether models does implement ``LLM.embeddings``."""
__llm_supports_generate__: bool
"""A boolean to determine whether models does implement ``LLM.generate``."""
__llm_supports_generate_one__: bool
"""A boolean to determine whether models does implement ``LLM.generate_one``."""
__llm_supports_generate_iterator__: bool
"""A boolean to determine whether models does implement ``LLM.generate_iterator``."""
if t.TYPE_CHECKING and not MYPY:
def __attrs_init__(
@@ -528,6 +538,21 @@ def _wrapped_save_pretrained(f: _save_pretrained_wrapper[M, T]):
return wrapper
def _update_docstring(cls: LLM[M, T], fn: str) -> AnyCallable:
# update docstring for given entrypoint
original_fn = getattr(cls, fn, getattr(LLMInterface, fn))
original_fn.__doc__ = (
original_fn.__doc__
or f"""\
{cls.__name__}'s implementation for {fn}.
Note that if LoRA is enabled (via either SDK or CLI), `self.model` will become a `peft.PeftModel`
The original model can then be accessed with 'self.model.get_base_model()'.
"""
)
setattr(cls, fn, original_fn)
def _make_assignment_script(cls: type[LLM[M, T]]) -> t.Callable[[type[LLM[M, T]]], None]:
attributes = {
"import_model": _wrapped_import_model,
@@ -539,7 +564,11 @@ def _make_assignment_script(cls: type[LLM[M, T]]) -> t.Callable[[type[LLM[M, T]]
args: ListStr = []
anns: DictStrAny = {}
lines: ListStr = []
globs: DictStrAny = {"cls": cls, "_cached_LLMInterface_get": _object_getattribute.__get__(LLMInterface)}
globs: DictStrAny = {
"cls": cls,
"_cached_LLMInterface_get": _object_getattribute.__get__(LLMInterface),
"__gen_docstring": _update_docstring,
}
# function initialisation
for func, impl in attributes.items():
impl_name = f"__wrapped_{func}"
@@ -561,9 +590,22 @@ def _make_assignment_script(cls: type[LLM[M, T]]) -> t.Callable[[type[LLM[M, T]]
interface_anns = codegen.get_annotations(LLMInterface)
for v in {"bentomodel", "model", "tokenizer", "adapter_map"}:
lines.append(_setattr_class(f"__llm_{v}__", None))
anns[f"__llm_{v}__"] = interface_anns.get("__llm_{v}__")
anns[f"__llm_{v}__"] = interface_anns.get(f"__llm_{v}__")
return codegen.generate_function(cls, "__assign_attr", lines, args=("cls", *args), globs=globs, annotations=anns)
# boolean to determine whether LLM has defined an implementation for a function
for fn in {"generate", "generate_one", "generate_iterator", "embeddings"}:
key = f"__llm_supports_{fn}__"
lines.extend(
[
_setattr_class(key, f"cls.{fn} is not _cached_LLMInterface_get('{fn}')"),
f"__gen_docstring(cls, '{fn}')",
]
)
anns[key] = interface_anns.get(key)
return codegen.generate_function(
cls, "__assign_llm_attr", lines, args=("cls", *args), globs=globs, annotations=anns
)
_AdaptersTuple: type[AdaptersTuple] = codegen.make_attr_tuple_class("AdaptersTuple", ["adapter_id", "name", "config"])
@@ -607,28 +649,24 @@ class LLM(LLMInterface[M, T], ReprMixin):
implementation, config_class_name = cls._infer_implementation_from_name(cls.__name__)
cls.__llm_implementation__ = implementation
config_class = openllm.AutoConfig.infer_class_from_name(config_class_name)
if "__openllm_internal__" in cd:
if "config_class" not in cd:
cls.config_class = config_class
elif "config_class" not in cd:
raise RuntimeError("Missing required key 'config_class'. Make sure to define it within the LLM subclass.")
_make_assignment_script(cls)(cls)
# update docstring for given entrypoint
for fn in {"generate", "generate_one", "generate_iterator"}:
original_fn = getattr(cls, fn, getattr(LLMInterface, fn))
original_fn.__doc__ = (
original_fn.__doc__
or f"""\
'{fn}' implementation {cls.__name__}.
Note that if LoRA is enabled (via either SDK or CLI), `self.model` will become a `peft.PeftModel`
The original can then be accessed with 'self.model.get_base_model()'.
"""
)
setattr(cls, fn, original_fn)
def __getitem__(self, item: t.LiteralString | t.Any) -> t.Any:
if item is None:
raise TypeError(f"{self} doesn't understand how to index None.")
item = inflection.underscore(item)
internal_attributes = f"__llm_{item}__"
if hasattr(self, internal_attributes):
return getattr(self, internal_attributes)
elif hasattr(self, item):
return getattr(self, item)
else:
raise KeyError(item)
@classmethod
@overload
@@ -1667,6 +1705,9 @@ def llm_runner_class(self: openllm.LLM[M, T]) -> type[LLMRunner]:
"__repr__": ReprMixin.__repr__,
"__repr_keys__": property(_wrapped_repr_keys),
"__repr_args__": _wrapped_repr_args,
"supports_embeddings": self["supports-embeddings"],
"supports_hf_agent": self["supports-generate-one"],
"has_adapters": self._adapters_mapping is not None,
}
),
)

View File

@@ -94,6 +94,8 @@ class MetadataOutput:
model_name: str
framework: str
configuration: str
supports_embeddings: bool
supports_hf_agent: bool
@attr.frozen(slots=True)

View File

@@ -89,48 +89,6 @@ async def generate_v1(input_dict: dict[str, t.Any]) -> openllm.GenerationOutput:
return openllm.GenerationOutput(responses=responses, configuration=config)
@svc.api(
input=bentoml.io.JSON.from_sample(sample=["Hey Jude, welcome to the jumgle!", "What is the meaning of life?"]),
output=bentoml.io.JSON.from_sample(
sample={
"embeddings": [
0.007917795330286026,
-0.014421648345887661,
0.00481307040899992,
0.007331526838243008,
-0.0066398633643984795,
0.00945580005645752,
0.0087016262114048,
-0.010709521360695362,
0.012635177001357079,
0.010541186667978764,
-0.00730888033285737,
-0.001783102168701589,
0.02339819073677063,
-0.010825827717781067,
-0.015888236463069916,
0.01876218430697918,
0.0076906150206923485,
0.0009032754460349679,
-0.010024012066423893,
0.01090280432254076,
-0.008668390102684498,
0.02070549875497818,
0.0014594447566196322,
-0.018775740638375282,
-0.014814382418990135,
0.01796768605709076,
],
"num_tokens": 20,
}
),
route="/v1/embeddings",
)
async def embeddings_v1(phrases: list[str]) -> openllm.EmbeddingsOutput:
responses = await runner.embeddings.async_run(phrases)
return openllm.EmbeddingsOutput(embeddings=responses["embeddings"].tolist()[0], num_tokens=responses["num_tokens"])
@svc.api(
input=bentoml.io.Text(),
output=bentoml.io.JSON.from_sample(
@@ -151,42 +109,96 @@ def metadata_v1(_: str) -> openllm.MetadataOutput:
model_name=llm_config["model_name"],
framework=llm_config["env"]["framework_value"],
configuration=llm_config.model_dump_json().decode(),
supports_embeddings=runner.supports_embeddings,
supports_hf_agent=runner.supports_hf_agent,
)
@svc.api(
input=bentoml.io.Text.from_sample(sample="default"),
output=bentoml.io.JSON.from_sample(sample={"success": True, "error_msg": "some error message"}),
route="/v1/adapters",
)
async def adapters_v1(adapter_name: str) -> dict[str, bool | str]:
return await runner.set_adapter.async_run(adapter_name)
if runner.supports_embeddings:
@svc.api(
input=bentoml.io.JSON.from_sample(sample=["Hey Jude, welcome to the jumgle!", "What is the meaning of life?"]),
output=bentoml.io.JSON.from_sample(
sample={
"embeddings": [
0.007917795330286026,
-0.014421648345887661,
0.00481307040899992,
0.007331526838243008,
-0.0066398633643984795,
0.00945580005645752,
0.0087016262114048,
-0.010709521360695362,
0.012635177001357079,
0.010541186667978764,
-0.00730888033285737,
-0.001783102168701589,
0.02339819073677063,
-0.010825827717781067,
-0.015888236463069916,
0.01876218430697918,
0.0076906150206923485,
0.0009032754460349679,
-0.010024012066423893,
0.01090280432254076,
-0.008668390102684498,
0.02070549875497818,
0.0014594447566196322,
-0.018775740638375282,
-0.014814382418990135,
0.01796768605709076,
],
"num_tokens": 20,
}
),
route="/v1/embeddings",
)
async def embeddings_v1(phrases: list[str]) -> openllm.EmbeddingsOutput:
responses = await runner.embeddings.async_run(phrases)
return openllm.EmbeddingsOutput(
embeddings=responses["embeddings"].tolist()[0], num_tokens=responses["num_tokens"]
)
@attr.define
class HfAgentInput:
inputs: str
parameters: t.Dict[str, t.Any]
if runner.supports_hf_agent and openllm.utils.is_transformers_supports_agent():
@attr.define
class HfAgentInput:
inputs: str
parameters: t.Dict[str, t.Any]
async def hf_agent(request: Request) -> Response:
json_str = await request.body()
try:
input_data = openllm.utils.bentoml_cattr.structure(orjson.loads(json_str), HfAgentInput)
except orjson.JSONDecodeError as err:
raise openllm.exceptions.OpenLLMException(f"Invalid JSON input received: {err}") from None
async def hf_agent(request: Request) -> Response:
json_str = await request.body()
try:
input_data = openllm.utils.bentoml_cattr.structure(orjson.loads(json_str), HfAgentInput)
except orjson.JSONDecodeError as err:
raise openllm.exceptions.OpenLLMException(f"Invalid JSON input received: {err}") from None
stop = input_data.parameters.pop("stop", ["\n"])
try:
resp = await runner.generate_one.async_run(input_data.inputs, stop, **input_data.parameters)
return JSONResponse(resp, status_code=200)
except NotImplementedError:
return JSONResponse(f"'{model}' is currently not supported with HuggingFace agents.", status_code=500)
stop = input_data.parameters.pop("stop", ["\n"])
try:
resp = await runner.generate_one.async_run(input_data.inputs, stop, **input_data.parameters)
return JSONResponse(resp, status_code=200)
except NotImplementedError:
return JSONResponse(f"'{model}' is currently not supported with HuggingFace agents.", status_code=500)
hf_app = Starlette(debug=True, routes=[Route("/agent", hf_agent, methods=["POST"])])
hf_app = Starlette(debug=True, routes=[Route("/agent", hf_agent, methods=["POST"])])
svc.mount_asgi_app(hf_app, path="/hf")
svc.mount_asgi_app(hf_app, path="/hf")
if runner.has_adapters:
@svc.api(
input=bentoml.io.Text.from_sample(sample="default"),
output=bentoml.io.JSON.from_sample(sample={"success": True, "error_msg": "some error message"}),
route="/v1/adapters",
)
async def adapters_v1(adapter_name: str) -> dict[str, bool | str]:
return await runner.set_adapter.async_run(adapter_name)
else:
async def adapters_v1(_: Request) -> Response:
return JSONResponse({"success": False, "message": "No available adapters for current running server"})
async def list_adapter_v1(_: Request) -> Response:
@@ -198,5 +210,8 @@ async def list_adapter_v1(_: Request) -> Response:
return JSONResponse(res, status_code=200)
metadata_app = Starlette(debug=True, routes=[Route("/adapters", list_adapter_v1, methods=["GET"])])
svc.mount_asgi_app(metadata_app, path="/v1")
adapters_routes_v1 = [Route("/adapters", list_adapter_v1, methods=["GET"])]
if not runner.has_adapters:
adapters_routes_v1.append(Route("/adapters", adapters_v1, methods=["POST"]))
adapters_app_v1 = Starlette(debug=True, routes=adapters_routes_v1)
svc.mount_asgi_app(adapters_app_v1, path="/v1")

View File

@@ -145,6 +145,10 @@ class LLMRunner(bentoml.Runner):
generate_one: RunnerMethod[LLMRunnable, [str, list[str]], t.Sequence[dict[t.Literal["generated_text"], str]]]
generate_iterator: RunnerMethod[LLMRunnable, [str], t.Generator[t.Any, None, None]]
supports_embeddings: bool
supports_hf_agent: bool
has_adapters: bool
def __init__(
self,
runnable_class: type[LLMRunnable],

View File

@@ -34,6 +34,7 @@ bentomodel = openllm.import_model("falcon", model_id='tiiuae/falcon-7b-instruct'
"""
from __future__ import annotations
import functools
import http.client
import importlib.machinery
import importlib.util
import inspect
@@ -470,9 +471,8 @@ class OpenLLMCommandGroup(BentoMLCommandGroup):
return super().get_command(ctx, cmd_name)
def list_commands(self, ctx: click.Context) -> list[str]:
if ctx.command.name == "start" or ctx.command.name == "start-grpc":
if ctx.command.name in {"start", "start-grpc"}:
return list(openllm.CONFIG_MAPPING.keys())
return super().list_commands(ctx)
@override
@@ -883,7 +883,7 @@ def prerequisite_check(
requirements = llm_config["requirements"]
if requirements is not None and len(requirements) > 0:
missing_requirements = [i for i in requirements if importlib.util.find_spec(i) is None]
missing_requirements = [i for i in requirements if importlib.util.find_spec(inflection.underscore(i)) is None]
if len(missing_requirements) > 0:
_echo(
f"Make sure to have the following dependencies available: {missing_requirements}",
@@ -2339,6 +2339,11 @@ def instruct(
"""
client = openllm.client.HTTPClient(endpoint, timeout=timeout)
try:
client.call("metadata")
except http.client.BadStatusLine:
raise click.ClickException(f"{endpoint} is neither a HTTP server nor reachable.") from None
if agent == "hf":
if not is_transformers_supports_agent():
raise click.UsageError(

View File

@@ -11,41 +11,25 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
import sys
import typing as t
from ...exceptions import MissingDependencyError
from ...utils import LazyModule
from ...utils import is_cpm_kernels_available
from ...utils import is_torch_available
_import_structure: dict[str, list[str]] = {
"configuration_baichuan": ["BaichuanConfig", "START_BAICHUAN_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"],
}
_import_structure: dict[str, list[str]] = {"configuration_baichuan": ["BaichuanConfig", "START_BAICHUAN_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"]}
try:
if not is_torch_available() or not is_cpm_kernels_available():
raise MissingDependencyError
except MissingDependencyError:
pass
else:
_import_structure["modeling_baichuan"] = ["Baichuan"]
if not is_torch_available() or not is_cpm_kernels_available(): raise MissingDependencyError
except MissingDependencyError: pass
else: _import_structure["modeling_baichuan"] = ["Baichuan"]
if t.TYPE_CHECKING:
from .configuration_baichuan import DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE
from .configuration_baichuan import START_BAICHUAN_COMMAND_DOCSTRING as START_BAICHUAN_COMMAND_DOCSTRING
from .configuration_baichuan import BaichuanConfig as BaichuanConfig
try:
if not is_torch_available() or not is_cpm_kernels_available():
raise MissingDependencyError
except MissingDependencyError:
pass
else:
from .modeling_baichuan import Baichuan as Baichuan
else:
import sys
sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
if not is_torch_available() or not is_cpm_kernels_available(): raise MissingDependencyError
except MissingDependencyError: pass
else: from .modeling_baichuan import Baichuan as Baichuan
else: sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

View File

@@ -12,10 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
import openllm
class BaichuanConfig(openllm.LLMConfig):
"""Baichuan-7B is an open-source, large-scale pre-trained language model developed by Baichuan Intelligent Technology.
@@ -26,7 +23,6 @@ class BaichuanConfig(openllm.LLMConfig):
and English benchmarks (C-Eval, MMLU, etc).
Refer to [Baichuan-7B's GitHub page](https://github.com/baichuan-inc/Baichuan-7B) for more information.
"""
__config__ = {
"name_type": "lowercase",
"trust_remote_code": True,
@@ -45,13 +41,10 @@ class BaichuanConfig(openllm.LLMConfig):
"hiyouga/baichuan-7b-sft",
],
}
class GenerationConfig:
max_new_tokens: int = 2048
top_p: float = 0.7
temperature: float = 0.95
START_BAICHUAN_COMMAND_DOCSTRING = """\
Run a LLMServer for Baichuan model.
@@ -71,5 +64,4 @@ or provide `--model-id` flag when running ``openllm start baichuan``:
\b
$ openllm start baichuan --model-id='fireballoon/baichuan-vicuna-chinese-7b'
"""
DEFAULT_PROMPT_TEMPLATE = """{instruction}"""

View File

@@ -13,69 +13,31 @@
# limitations under the License.
from __future__ import annotations
import typing as t
import openllm
from .configuration_baichuan import DEFAULT_PROMPT_TEMPLATE
from ..._prompt import default_formatter
if t.TYPE_CHECKING:
import torch
import transformers
else:
torch = openllm.utils.LazyLoader("torch", globals(), "torch")
transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
class Baichuan(openllm.LLM["transformers.PreTrainedModel", "transformers.PreTrainedTokenizerBase"]):
__openllm_internal__ = True
def sanitize_parameters(
self,
prompt: str,
max_new_tokens: int | None = None,
top_p: float | None = None,
temperature: float | None = None,
use_default_prompt_template: bool = False,
**attrs: t.Any,
) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, top_p: float | None = None, temperature: float | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
if use_default_prompt_template:
template_variables = default_formatter.extract_template_variables(DEFAULT_PROMPT_TEMPLATE)
prompt_variables = {k: v for k, v in attrs.items() if k in template_variables}
if "instruction" in prompt_variables:
raise RuntimeError(
"'instruction' should be passed as the first argument "
"instead of kwargs when 'use_default_prompt_template=True'"
)
try:
prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
except KeyError as e:
raise RuntimeError(
f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. "
"Use 'use_default_prompt_template=False' to disable the default prompt template."
) from None
else:
prompt_text = prompt
if "instruction" in prompt_variables: raise RuntimeError("'instruction' should be passed as the first argument instead of kwargs when 'use_default_prompt_template=True'")
try: prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
except KeyError as e: raise RuntimeError(f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. Use 'use_default_prompt_template=False' to disable the default prompt template.") from None
else: prompt_text = prompt
# NOTE: The rest of attrs should be kwargs for GenerationConfig
generate_kwargs = {
"max_new_tokens": max_new_tokens,
"top_p": top_p,
"temperature": temperature,
**attrs,
}
generate_kwargs = {"max_new_tokens": max_new_tokens, "top_p": top_p, "temperature": temperature, **attrs}
return prompt_text, generate_kwargs, {}
def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str:
return generation_result[0]
def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str: return generation_result[0]
def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
with torch.inference_mode(), torch.autocast("cuda", dtype=torch.float16):
outputs = self.model.generate(
**inputs,
generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
)
outputs = self.model.generate(**inputs, generation_config=self.config.model_construct_env(**attrs).to_generation_config())
return self.tokenizer.batch_decode(outputs, skip_special_tokens=True)

View File

@@ -11,41 +11,24 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
import sys
import typing as t
from ...exceptions import MissingDependencyError
from ...utils import LazyModule
from ...utils import is_cpm_kernels_available
from ...utils import is_torch_available
_import_structure: dict[str, list[str]] = {
"configuration_chatglm": ["ChatGLMConfig", "START_CHATGLM_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"],
}
_import_structure: dict[str, list[str]] = {"configuration_chatglm": ["ChatGLMConfig", "START_CHATGLM_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"]}
try:
if not is_torch_available() or not is_cpm_kernels_available():
raise MissingDependencyError
except MissingDependencyError:
pass
else:
_import_structure["modeling_chatglm"] = ["ChatGLM"]
if not is_torch_available() or not is_cpm_kernels_available(): raise MissingDependencyError
except MissingDependencyError: pass
else: _import_structure["modeling_chatglm"] = ["ChatGLM"]
if t.TYPE_CHECKING:
from .configuration_chatglm import DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE
from .configuration_chatglm import START_CHATGLM_COMMAND_DOCSTRING as START_CHATGLM_COMMAND_DOCSTRING
from .configuration_chatglm import ChatGLMConfig as ChatGLMConfig
try:
if not is_torch_available() or not is_cpm_kernels_available():
raise MissingDependencyError
except MissingDependencyError:
pass
else:
from .modeling_chatglm import ChatGLM as ChatGLM
else:
import sys
sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
if not is_torch_available() or not is_cpm_kernels_available(): raise MissingDependencyError
except MissingDependencyError: pass
else: from .modeling_chatglm import ChatGLM as ChatGLM
else: sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

View File

@@ -12,10 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
import openllm
class ChatGLMConfig(openllm.LLMConfig):
"""ChatGLM is an open bilingual language model based on [General Language Model (GLM)](https://github.com/THUDM/GLM) framework.
@@ -30,7 +27,6 @@ class ChatGLMConfig(openllm.LLMConfig):
Refer to [ChatGLM's GitHub page](https://github.com/THUDM/ChatGLM-6B) for more information.
"""
__config__ = {
"name_type": "lowercase",
"trust_remote_code": True,
@@ -48,22 +44,17 @@ class ChatGLMConfig(openllm.LLMConfig):
"thudm/chatglm2-6b-int4",
],
}
retain_history: bool = openllm.LLMConfig.Field(
False,
description="""Whether to retain history given to the model.
If set to True, then the model will retain given history.""",
)
use_half_precision: bool = openllm.LLMConfig.Field(True, description="Whether to use half precision for model.")
class GenerationConfig:
max_new_tokens: int = 2048
num_beams: int = 1
top_p: float = 0.7
temperature: float = 0.95
START_CHATGLM_COMMAND_DOCSTRING = """\
Run a LLMServer for ChatGLM model.
@@ -83,5 +74,4 @@ or provide `--model-id` flag when running ``openllm start chatglm``:
\b
$ openllm start chatglm --model-id='thudm/chatglm-6b-int8'
"""
DEFAULT_PROMPT_TEMPLATE = """{instruction}"""

View File

@@ -13,94 +13,34 @@
# limitations under the License.
from __future__ import annotations
import typing as t
import bentoml
import openllm
from ...utils import generate_labels
if t.TYPE_CHECKING:
import torch
import transformers
else:
torch = openllm.utils.LazyLoader("torch", globals(), "torch")
transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
class ChatGLM(openllm.LLM["transformers.PreTrainedModel", "transformers.PreTrainedTokenizerFast"]):
__openllm_internal__ = True
def import_model(self, *args: t.Any, trust_remote_code: bool = True, **attrs: t.Any) -> bentoml.Model:
_, tokenizer_attrs = self.llm_parameters
return bentoml.transformers.save_model(
self.tag,
transformers.AutoModel.from_pretrained(self.model_id, trust_remote_code=trust_remote_code),
labels=generate_labels(self),
custom_objects={
"tokenizer": transformers.AutoTokenizer.from_pretrained(
self.model_id, trust_remote_code=trust_remote_code, **tokenizer_attrs
)
},
)
def sanitize_parameters(
self,
prompt: str,
max_new_tokens: int | None = None,
num_beams: int | None = None,
top_p: float | None = None,
temperature: float | None = None,
chat_history: list[str] | None = None,
use_default_prompt_template: bool = False,
**attrs: t.Any,
) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, num_beams: int | None = None, top_p: float | None = None, temperature: float | None = None, chat_history: list[str] | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
prompt_text = ""
if use_default_prompt_template and chat_history is not None:
for i, (old_query, response) in enumerate(chat_history):
prompt_text += f"[Round {i}]\n问:{old_query}\n答:{response}\n" # noqa: RUF001
for i, (old_query, response) in enumerate(chat_history): prompt_text += f"[Round {i}]\n问:{old_query}\n答:{response}\n" # noqa: RUF001
prompt_text += f"[Round {len(chat_history)}]\n问:{prompt}\n答:" # noqa: RUF001
else:
prompt_text = prompt
else: prompt_text = prompt
postprocess_generate_kwargs = {"chat_history": chat_history if chat_history is not None else None}
# NOTE: The rest of attrs should be kwargs for GenerationConfig
generate_kwargs = {
"max_new_tokens": max_new_tokens,
"num_beams": num_beams,
"top_p": top_p,
"temperature": temperature,
**attrs,
}
generate_kwargs = {"max_new_tokens": max_new_tokens, "num_beams": num_beams, "top_p": top_p, "temperature": temperature, **attrs}
return prompt_text, generate_kwargs, postprocess_generate_kwargs
def postprocess_generate(
self,
prompt: str,
generation_result: tuple[str, list[tuple[str, str]]],
*,
chat_history: list[tuple[str, str]] | None = None,
**attrs: t.Any,
):
def postprocess_generate(self, prompt: str, generation_result: tuple[str, list[tuple[str, str]]], *, chat_history: list[tuple[str, str]] | None = None, **attrs: t.Any):
generated, history = generation_result
if self.config.retain_history:
assert chat_history is not None, "'retain_history' is True while there is no history provided."
chat_history.extend(history)
return generated
def generate(self, prompt: str, **attrs: t.Any) -> tuple[str, list[tuple[str, str]]]:
with torch.inference_mode():
self.model.eval()
# Only use half precision if the model is not yet quantized
if self.config.use_half_precision:
self.model.half()
return self.model.chat(
self.tokenizer,
prompt,
generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
)
if self.config.use_half_precision: self.model.half()
return self.model.chat(self.tokenizer, prompt, generation_config=self.config.model_construct_env(**attrs).to_generation_config())

View File

@@ -11,40 +11,23 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
import sys
import typing as t
from ...exceptions import MissingDependencyError
from ...utils import LazyModule
from ...utils import is_torch_available
_import_structure: dict[str, list[str]] = {
"configuration_dolly_v2": ["DollyV2Config", "START_DOLLY_V2_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"],
}
_import_structure: dict[str, list[str]] = {"configuration_dolly_v2": ["DollyV2Config", "START_DOLLY_V2_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"]}
try:
if not is_torch_available():
raise MissingDependencyError
except MissingDependencyError:
pass
else:
_import_structure["modeling_dolly_v2"] = ["DollyV2"]
if not is_torch_available(): raise MissingDependencyError
except MissingDependencyError: pass
else: _import_structure["modeling_dolly_v2"] = ["DollyV2"]
if t.TYPE_CHECKING:
from .configuration_dolly_v2 import DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE
from .configuration_dolly_v2 import START_DOLLY_V2_COMMAND_DOCSTRING as START_DOLLY_V2_COMMAND_DOCSTRING
from .configuration_dolly_v2 import DollyV2Config as DollyV2Config
try:
if not is_torch_available():
raise MissingDependencyError
except MissingDependencyError:
pass
else:
from .modeling_dolly_v2 import DollyV2 as DollyV2
else:
import sys
sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
if not is_torch_available(): raise MissingDependencyError
except MissingDependencyError: pass
else: from .modeling_dolly_v2 import DollyV2 as DollyV2
else: sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

View File

@@ -13,14 +13,8 @@
# limitations under the License.
from __future__ import annotations
import typing as t
import openllm
if t.TYPE_CHECKING:
from transformers import PreTrainedTokenizer
if t.TYPE_CHECKING: import transformers
class DollyV2Config(openllm.LLMConfig):
"""Databricks` Dolly is an instruction-following large language model trained on the Databricks machine learning platform that is licensed for commercial use.
@@ -33,7 +27,6 @@ class DollyV2Config(openllm.LLMConfig):
Refer to [Databricks's Dolly page](https://github.com/databrickslabs/dolly) for more information.
"""
__config__ = {
"timeout": 3600000,
"url": "https://github.com/databrickslabs/dolly",
@@ -41,19 +34,15 @@ class DollyV2Config(openllm.LLMConfig):
"default_id": "databricks/dolly-v2-3b",
"model_ids": ["databricks/dolly-v2-3b", "databricks/dolly-v2-7b", "databricks/dolly-v2-12b"],
}
return_full_text: bool = openllm.LLMConfig.Field(
False, description="Whether to return the full prompt to the users."
)
class GenerationConfig:
temperature: float = 0.9
top_p: float = 0.92
top_k: int = 5
max_new_tokens: int = 256
eos_token_id: int = 50277 # NOTE: from get_special_token_id(self.tokenizer, END_KEY)
START_DOLLY_V2_COMMAND_DOCSTRING = """\
Run a LLMServer for dolly-v2 model.
@@ -73,14 +62,10 @@ or provide `--model-id` flag when running ``openllm start dolly-v2``:
\b
$ openllm start dolly-v2 --model-id databricks/dolly-v2-7b
"""
INSTRUCTION_KEY = "### Instruction:"
RESPONSE_KEY = "### Response:"
END_KEY = "### End"
INTRO_BLURB = (
"Below is an instruction that describes a task. Write a response that appropriately completes the request."
)
INTRO_BLURB = "Below is an instruction that describes a task. Write a response that appropriately completes the request."
# NOTE: This is the prompt that is used for generating responses using an already
# trained model. It ends with the response key, where the job of the model is to provide
# the completion that follows it (i.e. the response itself).
@@ -88,15 +73,8 @@ DEFAULT_PROMPT_TEMPLATE = """{intro}
{instruction_key}
{instruction}
{response_key}
""".format(
intro=INTRO_BLURB,
instruction_key=INSTRUCTION_KEY,
instruction="{instruction}",
response_key=RESPONSE_KEY,
)
def get_special_token_id(tokenizer: PreTrainedTokenizer, key: str) -> int:
""".format(intro=INTRO_BLURB, instruction_key=INSTRUCTION_KEY, instruction="{instruction}", response_key=RESPONSE_KEY)
def get_special_token_id(tokenizer: transformers.PreTrainedTokenizer, key: str) -> int:
"""Gets the token ID for a given string that has been added to the tokenizer as a special token.
When training, we configure the tokenizer so that the sequences like "### Instruction:" and "### End" are
@@ -113,6 +91,5 @@ def get_special_token_id(tokenizer: PreTrainedTokenizer, key: str) -> int:
int: the token ID for the given key.
"""
token_ids = tokenizer.encode(key)
if len(token_ids) > 1:
raise ValueError(f"Expected only a single token for '{key}' but found {token_ids}")
if len(token_ids) > 1: raise ValueError(f"Expected only a single token for '{key}' but found {token_ids}")
return token_ids[0]

View File

@@ -15,288 +15,118 @@ from __future__ import annotations
import logging
import re
import typing as t
import openllm
from .configuration_dolly_v2 import DEFAULT_PROMPT_TEMPLATE
from .configuration_dolly_v2 import END_KEY
from .configuration_dolly_v2 import RESPONSE_KEY
from .configuration_dolly_v2 import get_special_token_id
if t.TYPE_CHECKING:
import tensorflow as tf
import torch
import transformers
import tensorflow as tf
else:
tf = openllm.utils.LazyLoader("tf", globals(), "tensorflow")
torch = openllm.utils.LazyLoader("torch", globals(), "torch")
transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
logger = logging.getLogger(__name__)
@t.overload
def get_pipeline(
model: transformers.PreTrainedModel,
tokenizer: transformers.PreTrainedTokenizer,
_init: t.Literal[True] = True,
**attrs: t.Any,
) -> transformers.Pipeline:
...
def get_pipeline(model: transformers.PreTrainedModel, tokenizer: transformers.PreTrainedTokenizer, _init: t.Literal[True] = True, **attrs: t.Any) -> transformers.Pipeline: ...
@t.overload
def get_pipeline(
model: transformers.PreTrainedModel,
tokenizer: transformers.PreTrainedTokenizer,
_init: t.Literal[False] = ...,
**attrs: t.Any,
) -> type[transformers.Pipeline]:
...
def get_pipeline(
model: transformers.PreTrainedModel,
tokenizer: transformers.PreTrainedTokenizer,
_init: bool = False,
**attrs: t.Any,
) -> type[transformers.Pipeline] | transformers.Pipeline:
def get_pipeline(model: transformers.PreTrainedModel, tokenizer: transformers.PreTrainedTokenizer, _init: t.Literal[False] = ..., **attrs: t.Any) -> type[transformers.Pipeline]: ...
def get_pipeline(model: transformers.PreTrainedModel, tokenizer: transformers.PreTrainedTokenizer, _init: bool = False, **attrs: t.Any) -> type[transformers.Pipeline] | transformers.Pipeline:
class InstructionTextGenerationPipeline(transformers.Pipeline):
def __init__(
self,
*args: t.Any,
do_sample: bool = True,
max_new_tokens: int = 256,
top_p: float = 0.92,
top_k: int = 0,
**kwargs: t.Any,
):
"""Initialize the pipeline.
Args:
do_sample: Whether or not to use sampling. Defaults to True.
max_new_tokens: Max new tokens after the prompt to generate. Defaults to 128.
top_p: If set to float < 1, only the smallest set of most probable tokens with
probabilities that add up to top_p or higher are kept for generation. Defaults to 0.92.
top_k: The number of highest probability vocabulary tokens to keep for top-k-filtering. Defaults to 0.
*args: Additional positional arguments to be passed to ``transformers.Pipeline``.
**kwargs: Additional keyword arguments to be passed to ``transformers.Pipeline``.
"""
super().__init__(
*args,
model=model,
tokenizer=tokenizer,
do_sample=do_sample,
max_new_tokens=max_new_tokens,
top_p=top_p,
top_k=top_k,
**kwargs,
)
def __init__(self, *args: t.Any, do_sample: bool = True, max_new_tokens: int = 256, top_p: float = 0.92, top_k: int = 0, **kwargs: t.Any): super().__init__(*args, model=model, tokenizer=tokenizer, do_sample=do_sample, max_new_tokens=max_new_tokens, top_p=top_p, top_k=top_k, **kwargs)
def _sanitize_parameters(self, return_full_text: bool | None = None, **generate_kwargs: t.Any):
if t.TYPE_CHECKING:
assert self.tokenizer is not None
if t.TYPE_CHECKING: assert self.tokenizer is not None
preprocess_params: dict[str, t.Any] = {}
# newer versions of the tokenizer configure the response key as a special token. newer versions still may
# append a newline to yield a single token. find whatever token is configured for the response key.
tokenizer_response_key = next(
(token for token in self.tokenizer.additional_special_tokens if token.startswith(RESPONSE_KEY)), None
)
tokenizer_response_key = next((token for token in self.tokenizer.additional_special_tokens if token.startswith(RESPONSE_KEY)), None)
response_key_token_id = None
end_key_token_id = None
if tokenizer_response_key:
try:
response_key_token_id = get_special_token_id(self.tokenizer, tokenizer_response_key)
end_key_token_id = get_special_token_id(self.tokenizer, END_KEY)
# Ensure generation stops once it generates "### End"
generate_kwargs["eos_token_id"] = end_key_token_id
except ValueError:
pass
except ValueError: pass
forward_params = generate_kwargs
postprocess_params = {"response_key_token_id": response_key_token_id, "end_key_token_id": end_key_token_id}
if return_full_text is not None:
postprocess_params["return_full_text"] = return_full_text
if return_full_text is not None: postprocess_params["return_full_text"] = return_full_text
return preprocess_params, forward_params, postprocess_params
def preprocess(self, input_: str, **generate_kwargs: t.Any):
if t.TYPE_CHECKING:
assert self.tokenizer is not None
if t.TYPE_CHECKING: assert self.tokenizer is not None
prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=input_)
inputs = self.tokenizer(prompt_text, return_tensors="pt")
inputs["prompt_text"] = prompt_text
inputs["instruction_text"] = input_
return inputs
def _forward(self, model_inputs: dict[str, t.Any], **generate_kwargs: t.Any):
if t.TYPE_CHECKING:
assert self.tokenizer is not None
input_ids = model_inputs["input_ids"]
attention_mask = model_inputs.get("attention_mask", None)
if input_ids.shape[1] == 0:
input_ids = None
attention_mask = None
in_b = 1
else:
in_b = input_ids.shape[0]
generated_sequence = self.model.generate(
input_ids=input_ids.to(self.model.device) if input_ids is not None else None,
attention_mask=attention_mask.to(self.model.device) if attention_mask is not None else None,
pad_token_id=self.tokenizer.pad_token_id,
**generate_kwargs,
)
if t.TYPE_CHECKING: assert self.tokenizer is not None
input_ids, attention_mask = model_inputs["input_ids"], model_inputs.get("attention_mask", None)
if input_ids.shape[1] == 0: input_ids, attention_mask, in_b = None, None, 1
else: in_b = input_ids.shape[0]
generated_sequence = self.model.generate(input_ids=input_ids.to(self.model.device) if input_ids is not None else None, attention_mask=attention_mask.to(self.model.device) if attention_mask is not None else None, pad_token_id=self.tokenizer.pad_token_id, **generate_kwargs)
out_b = generated_sequence.shape[0]
if self.framework == "pt":
generated_sequence = generated_sequence.reshape(in_b, out_b // in_b, *generated_sequence.shape[1:])
elif self.framework == "tf":
generated_sequence = tf.reshape(
generated_sequence, (in_b, out_b // in_b, *generated_sequence.shape[1:])
)
if self.framework == "pt": generated_sequence = generated_sequence.reshape(in_b, out_b // in_b, *generated_sequence.shape[1:])
elif self.framework == "tf": generated_sequence = tf.reshape(generated_sequence, (in_b, out_b // in_b, *generated_sequence.shape[1:]))
instruction_text = model_inputs.pop("instruction_text")
return {
"generated_sequence": generated_sequence,
"input_ids": input_ids,
"instruction_text": instruction_text,
}
def postprocess(
self,
model_outputs: dict[str, t.Any],
response_key_token_id: int,
end_key_token_id: int,
return_full_text: bool = False,
):
if t.TYPE_CHECKING:
assert self.tokenizer is not None
generated_sequence = model_outputs["generated_sequence"][0]
instruction_text = model_outputs["instruction_text"]
return {"generated_sequence": generated_sequence, "input_ids": input_ids, "instruction_text": instruction_text}
def postprocess(self, model_outputs: dict[str, t.Any], response_key_token_id: int, end_key_token_id: int, return_full_text: bool = False):
if t.TYPE_CHECKING: assert self.tokenizer is not None
generated_sequence, instruction_text = model_outputs["generated_sequence"][0], model_outputs["instruction_text"]
generated_sequence: list[list[int]] = generated_sequence.numpy().tolist()
records: list[dict[t.Literal["generated_text"], str]] = []
for sequence in generated_sequence:
# The response will be set to this variable if we can identify it.
decoded = None
# If we have token IDs for the response and end, then we can find the tokens and only decode between them.
if response_key_token_id and end_key_token_id:
# Find where "### Response:" is first found in the generated tokens. Considering this is part of the
# prompt, we should definitely find it. We will return the tokens found after this token.
try:
response_pos = sequence.index(response_key_token_id)
except ValueError:
logger.warning("Could not find response key %s in: %s", response_key_token_id, sequence)
response_pos = None
try: response_pos = sequence.index(response_key_token_id)
except ValueError: response_pos = None
if response_pos is None: logger.warning("Could not find response key %s in: %s", response_key_token_id, sequence)
if response_pos:
# Next find where "### End" is located. The model has been trained to end its responses with this
# sequence (or actually, the token ID it maps to, since it is a special token). We may not find
# this token, as the response could be truncated. If we don't find it then just return everything
# to the end. Note that even though we set eos_token_id, we still see the this token at the end.
try:
end_pos = sequence.index(end_key_token_id)
except ValueError:
end_pos = None
try: end_pos = sequence.index(end_key_token_id)
except ValueError: end_pos = None
decoded = self.tokenizer.decode(sequence[response_pos + 1 : end_pos]).strip()
if not decoded:
# Otherwise we'll decode everything and use a regex to find the response and end.
fully_decoded = self.tokenizer.decode(sequence)
# The response appears after "### Response:". The model has been trained to append "### End" at the
# end.
m = re.search(r"#+\s*Response:\s*(.+?)#+\s*End", fully_decoded, flags=re.DOTALL)
if m:
decoded = m.group(1).strip()
if m: decoded = m.group(1).strip()
else:
# The model might not generate the "### End" sequence before reaching the max tokens. In this case,
# return everything after "### Response:".
m = re.search(r"#+\s*Response:\s*(.+)", fully_decoded, flags=re.DOTALL)
if m:
decoded = m.group(1).strip()
else:
logger.warning("Failed to find response in:\n%s", fully_decoded)
if m: decoded = m.group(1).strip()
else: logger.warning("Failed to find response in:\n%s", fully_decoded)
# If the full text is requested, then append the decoded text to the original instruction.
# This technically isn't the full text, as we format the instruction in the prompt the model has been
# trained on, but to the client it will appear to be the full text.
if return_full_text:
decoded = f"{instruction_text}\n{decoded}"
if return_full_text: decoded = f"{instruction_text}\n{decoded}"
rec = {"generated_text": decoded}
records.append(rec)
return records
if _init:
return InstructionTextGenerationPipeline()
if _init: return InstructionTextGenerationPipeline()
return InstructionTextGenerationPipeline
class DollyV2(openllm.LLM["transformers.Pipeline", "transformers.PreTrainedTokenizer"]):
__openllm_internal__ = True
@property
def import_kwargs(self):
model_kwds = {
"device_map": "auto" if torch.cuda.is_available() else None,
"torch_dtype": torch.bfloat16,
}
tokenizer_kwds = {"padding_side": "left"}
return model_kwds, tokenizer_kwds
def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.Pipeline:
return get_pipeline(
model=transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, *args, **attrs),
tokenizer=self.tokenizer,
_init=True,
return_full_text=self.config.return_full_text,
)
def sanitize_parameters(
self,
prompt: str,
max_new_tokens: int | None = None,
temperature: float | None = None,
top_k: int | None = None,
top_p: float | None = None,
**attrs: t.Any,
) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
# NOTE: The rest of attrs should be kwargs for GenerationConfig
generate_kwargs = {
"max_new_tokens": max_new_tokens,
"top_k": top_k,
"top_p": top_p,
"temperature": temperature,
**attrs,
}
return prompt, generate_kwargs, {}
def postprocess_generate(
self, prompt: str, generation_result: list[dict[t.Literal["generated_text"], str]], **_: t.Any
) -> str:
return generation_result[0]["generated_text"]
def import_kwargs(self): return {"device_map": "auto" if torch.cuda.is_available() else None, "torch_dtype": torch.bfloat16}, {"padding_side": "left"}
def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.Pipeline: return get_pipeline(model=transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, *args, **attrs), tokenizer=self.tokenizer, _init=True, return_full_text=self.config.return_full_text)
def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_k: int | None = None, top_p: float | None = None, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: return prompt, {"max_new_tokens": max_new_tokens, "top_k": top_k, "top_p": top_p, "temperature": temperature, **attrs}, {}
def postprocess_generate(self, prompt: str, generation_result: list[dict[t.Literal["generated_text"], str]], **_: t.Any) -> str: return generation_result[0]["generated_text"]
def generate(self, prompt: str, **attrs: t.Any) -> list[dict[t.Literal["generated_text"], str]]:
with torch.inference_mode():
llm_config = self.config.model_construct_env(**attrs)
return self.model(
prompt,
return_full_text=llm_config.return_full_text,
generation_config=llm_config.to_generation_config(),
)
llm_config = self.config.model_construct_env(**attrs)
with torch.inference_mode(): return self.model(prompt, return_full_text=llm_config.return_full_text, generation_config=llm_config.to_generation_config())

View File

@@ -11,40 +11,23 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
import sys
import typing as t
from ...exceptions import MissingDependencyError
from ...utils import LazyModule
from ...utils import is_torch_available
_import_structure: dict[str, list[str]] = {
"configuration_falcon": ["FalconConfig", "START_FALCON_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"],
}
_import_structure: dict[str, list[str]] = {"configuration_falcon": ["FalconConfig", "START_FALCON_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"]}
try:
if not is_torch_available():
raise MissingDependencyError
except MissingDependencyError:
pass
else:
_import_structure["modeling_falcon"] = ["Falcon"]
if not is_torch_available(): raise MissingDependencyError
except MissingDependencyError: pass
else: _import_structure["modeling_falcon"] = ["Falcon"]
if t.TYPE_CHECKING:
from .configuration_falcon import DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE
from .configuration_falcon import START_FALCON_COMMAND_DOCSTRING as START_FALCON_COMMAND_DOCSTRING
from .configuration_falcon import FalconConfig as FalconConfig
try:
if not is_torch_available():
raise MissingDependencyError
except MissingDependencyError:
pass
else:
from .modeling_falcon import Falcon as Falcon
else:
import sys
sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
if not is_torch_available(): raise MissingDependencyError
except MissingDependencyError: pass
else: from .modeling_falcon import Falcon as Falcon
else: sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

View File

@@ -12,10 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
import openllm
class FalconConfig(openllm.LLMConfig):
"""Falcon-7B is a 7B parameters causal decoder-only model built by TII and trained on 1,500B tokens of [RefinedWeb](https://huggingface.co/datasets/tiiuae/falcon-refinedweb) enhanced with curated corpora.
@@ -23,7 +20,6 @@ class FalconConfig(openllm.LLMConfig):
Refer to [Falcon's HuggingFace page](https://huggingface.co/tiiuae/falcon-7b) for more information.
"""
__config__ = {
"name_type": "lowercase",
"trust_remote_code": True,
@@ -50,15 +46,12 @@ class FalconConfig(openllm.LLMConfig):
},
),
}
class GenerationConfig:
max_new_tokens: int = 200
top_k: int = 10
num_return_sequences: int = 1
num_beams: int = 4
early_stopping: bool = True
START_FALCON_COMMAND_DOCSTRING = """\
Run a LLMServer for FalconLM model.
@@ -78,7 +71,6 @@ or provide `--model-id` flag when running ``openllm start falcon``:
\b
$ openllm start falcon --model-id tiiuae/falcon-7b-instruct
"""
DEFAULT_PROMPT_TEMPLATE = """{context}
{user_name}: {instruction}
{agent}:

View File

@@ -11,105 +11,41 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
import typing as t
import openllm
from .configuration_falcon import DEFAULT_PROMPT_TEMPLATE
from ..._prompt import default_formatter
if t.TYPE_CHECKING:
import torch
import transformers
else:
torch = openllm.utils.LazyLoader("torch", globals(), "torch")
transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
class Falcon(openllm.LLM["transformers.PreTrainedModel", "transformers.PreTrainedTokenizerBase"]):
__openllm_internal__ = True
@property
def import_kwargs(self):
model_kwds = {"torch_dtype": torch.bfloat16, "device_map": "auto" if torch.cuda.is_available() else None}
tokenizer_kwds: dict[str, t.Any] = {}
return model_kwds, tokenizer_kwds
def sanitize_parameters(
self,
prompt: str,
max_new_tokens: int | None = None,
top_k: int | None = None,
num_return_sequences: int | None = None,
eos_token_id: int | None = None,
use_default_prompt_template: bool = False,
**attrs: t.Any,
) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: return {"torch_dtype": torch.bfloat16, "device_map": "auto" if torch.cuda.is_available() else None}, {}
def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, top_k: int | None = None, num_return_sequences: int | None = None, eos_token_id: int | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
if use_default_prompt_template:
template_variables = default_formatter.extract_template_variables(DEFAULT_PROMPT_TEMPLATE)
prompt_variables = {k: v for k, v in attrs.items() if k in template_variables}
if "instruction" in prompt_variables:
raise RuntimeError(
"'instruction' should be passed as the first argument instead of "
"kwargs when 'use_default_prompt_template=True'"
)
try:
prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
except KeyError as e:
raise RuntimeError(
f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. "
"Use 'use_default_prompt_template=False' to disable the default prompt template."
) from None
else:
prompt_text = prompt
generation_config = {
"max_new_tokens": max_new_tokens,
"top_k": top_k,
"num_return_sequences": num_return_sequences,
"eos_token_id": eos_token_id,
**attrs,
}
return prompt_text, generation_config, {}
def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str:
return generation_result[0]
if "instruction" in prompt_variables: raise RuntimeError("'instruction' should be passed as the first argument instead of kwargs when 'use_default_prompt_template=True'")
try: prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
except KeyError as e: raise RuntimeError(f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. Use 'use_default_prompt_template=False' to disable the default prompt template.") from None
else: prompt_text = prompt
return prompt_text, {"max_new_tokens": max_new_tokens, "top_k": top_k, "num_return_sequences": num_return_sequences, "eos_token_id": eos_token_id, **attrs}, {}
def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str: return generation_result[0]
def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
eos_token_id = attrs.pop("eos_token_id", self.tokenizer.eos_token_id)
inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
with torch.inference_mode(), torch.autocast("cuda", dtype=torch.float16):
outputs = self.model.generate(
input_ids=inputs["input_ids"],
attention_mask=inputs["attention_mask"],
generation_config=self.config.model_construct_env(
eos_token_id=eos_token_id, **attrs
).to_generation_config(),
)
return self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
def generate_one(
self, prompt: str, stop: list[str], **preprocess_generate_kwds: t.Any
) -> list[dict[t.Literal["generated_text"], str]]:
eos_token_id, inputs = attrs.pop("eos_token_id", self.tokenizer.eos_token_id), self.tokenizer(prompt, return_tensors="pt").to(self.device)
with torch.inference_mode(), torch.autocast("cuda", dtype=torch.float16): return self.tokenizer.batch_decode(self.model.generate(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], generation_config=self.config.model_construct_env( eos_token_id=eos_token_id, **attrs).to_generation_config()), skip_special_tokens=True)
def generate_one(self, prompt: str, stop: list[str], **preprocess_generate_kwds: t.Any) -> list[dict[t.Literal["generated_text"], str]]:
from ..._generation import StopSequenceCriteria
max_new_tokens = preprocess_generate_kwds.pop("max_new_tokens", 200)
encoded_inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
src_len = encoded_inputs["input_ids"].shape[1]
stopping_criteria = preprocess_generate_kwds.pop("stopping_criteria", transformers.StoppingCriteriaList([]))
max_new_tokens, encoded_inputs = preprocess_generate_kwds.pop("max_new_tokens", 200), self.tokenizer(prompt, return_tensors="pt").to(self.device)
src_len, stopping_criteria = encoded_inputs["input_ids"].shape[1], preprocess_generate_kwds.pop("stopping_criteria", transformers.StoppingCriteriaList([]))
stopping_criteria.append(StopSequenceCriteria(stop, self.tokenizer))
outputs = self.model.generate(
encoded_inputs["input_ids"], max_new_tokens=max_new_tokens, stopping_criteria=stopping_criteria
)
result = self.tokenizer.decode(outputs[0].tolist()[src_len:])
result = self.tokenizer.decode(self.model.generate(encoded_inputs["input_ids"], max_new_tokens=max_new_tokens, stopping_criteria=stopping_criteria)[0].tolist()[src_len:])
# Inference API returns the stop sequence
for stop_seq in stop:
if result.endswith(stop_seq):
result = result[: -len(stop_seq)]
if result.endswith(stop_seq): result = result[: -len(stop_seq)]
return [{"generated_text": result}]

View File

@@ -13,73 +13,40 @@
# limitations under the License.
from __future__ import annotations
import sys
import typing as t
from ...exceptions import MissingDependencyError
from ...utils import LazyModule
from ...utils import is_flax_available
from ...utils import is_tf_available
from ...utils import is_torch_available
_import_structure: dict[str, list[str]] = {
"configuration_flan_t5": ["FlanT5Config", "START_FLAN_T5_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"],
}
_import_structure: dict[str, list[str]] = {"configuration_flan_t5": ["FlanT5Config", "START_FLAN_T5_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"]}
try:
if not is_torch_available():
raise MissingDependencyError
except MissingDependencyError:
pass
else:
_import_structure["modeling_flan_t5"] = ["FlanT5"]
if not is_torch_available(): raise MissingDependencyError
except MissingDependencyError: pass
else: _import_structure["modeling_flan_t5"] = ["FlanT5"]
try:
if not is_flax_available():
raise MissingDependencyError
except MissingDependencyError:
pass
else:
_import_structure["modeling_flax_flan_t5"] = ["FlaxFlanT5"]
if not is_flax_available(): raise MissingDependencyError
except MissingDependencyError: pass
else: _import_structure["modeling_flax_flan_t5"] = ["FlaxFlanT5"]
try:
if not is_tf_available():
raise MissingDependencyError
except MissingDependencyError:
pass
else:
_import_structure["modeling_tf_flan_t5"] = ["TFFlanT5"]
if not is_tf_available(): raise MissingDependencyError
except MissingDependencyError: pass
else: _import_structure["modeling_tf_flan_t5"] = ["TFFlanT5"]
if t.TYPE_CHECKING:
from .configuration_flan_t5 import DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE
from .configuration_flan_t5 import START_FLAN_T5_COMMAND_DOCSTRING as START_FLAN_T5_COMMAND_DOCSTRING
from .configuration_flan_t5 import FlanT5Config as FlanT5Config
try:
if not is_torch_available():
raise MissingDependencyError
except MissingDependencyError:
pass
else:
from .modeling_flan_t5 import FlanT5 as FlanT5
if not is_torch_available(): raise MissingDependencyError
except MissingDependencyError: pass
else: from .modeling_flan_t5 import FlanT5 as FlanT5
try:
if not is_flax_available():
raise MissingDependencyError
except MissingDependencyError:
pass
else:
from .modeling_flax_flan_t5 import FlaxFlanT5 as FlaxFlanT5
if not is_flax_available(): raise MissingDependencyError
except MissingDependencyError: pass
else: from .modeling_flax_flan_t5 import FlaxFlanT5 as FlaxFlanT5
try:
if not is_tf_available():
raise MissingDependencyError
except MissingDependencyError:
pass
else:
from .modeling_tf_flan_t5 import TFFlanT5 as TFFlanT5
else:
import sys
sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
if not is_tf_available(): raise MissingDependencyError
except MissingDependencyError: pass
else: from .modeling_tf_flan_t5 import TFFlanT5 as TFFlanT5
else: sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

View File

@@ -12,10 +12,33 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
import openllm
class FlanT5Config(openllm.LLMConfig):
"""FLAN-T5 was released in the paper [Scaling Instruction-Finetuned Language Models](https://arxiv.org/pdf/2210.11416.pdf).
It is an enhanced version of T5 that has been finetuned in a mixture of tasks.
Refer to [FLAN-T5's page](https://huggingface.co/docs/transformers/model_doc/flan-t5) for more information.
"""
__config__ = {
"url": "https://huggingface.co/docs/transformers/model_doc/flan-t5",
"default_id": "google/flan-t5-large",
"architecture": "T5ForConditionalGeneration",
"model_ids": [
"google/flan-t5-small",
"google/flan-t5-base",
"google/flan-t5-large",
"google/flan-t5-xl",
"google/flan-t5-xxl",
],
"model_type": "seq2seq_lm",
}
class GenerationConfig:
temperature: float = 0.9
max_new_tokens: int = 2048
top_k: int = 50
top_p: float = 0.4
repetition_penalty = 1.0
START_FLAN_T5_COMMAND_DOCSTRING = """\
Run a LLMServer for FLAN-T5 model.
@@ -41,35 +64,4 @@ or provide `--model-id` flag when running ``openllm start flan-t5``:
\b
$ openllm start flan-t5 --model-id google/flan-t5-xxl
"""
DEFAULT_PROMPT_TEMPLATE = """Answer the following question:\nQuestion: {instruction}\nAnswer:"""
class FlanT5Config(openllm.LLMConfig):
"""FLAN-T5 was released in the paper [Scaling Instruction-Finetuned Language Models](https://arxiv.org/pdf/2210.11416.pdf).
It is an enhanced version of T5 that has been finetuned in a mixture of tasks.
Refer to [FLAN-T5's page](https://huggingface.co/docs/transformers/model_doc/flan-t5) for more information.
"""
__config__ = {
"url": "https://huggingface.co/docs/transformers/model_doc/flan-t5",
"default_id": "google/flan-t5-large",
"architecture": "T5ForConditionalGeneration",
"model_ids": [
"google/flan-t5-small",
"google/flan-t5-base",
"google/flan-t5-large",
"google/flan-t5-xl",
"google/flan-t5-xxl",
],
"model_type": "seq2seq_lm",
}
class GenerationConfig:
temperature: float = 0.9
max_new_tokens: int = 2048
top_k: int = 50
top_p: float = 0.4
repetition_penalty = 1.0

View File

@@ -13,71 +13,25 @@
# limitations under the License.
from __future__ import annotations
import typing as t
import openllm
from .configuration_flan_t5 import DEFAULT_PROMPT_TEMPLATE
from ..._prompt import default_formatter
if t.TYPE_CHECKING:
import torch
import transformers # noqa: F401
else:
torch = openllm.utils.LazyLoader("torch", globals(), "torch")
class FlanT5(openllm.LLM["transformers.T5ForConditionalGeneration", "transformers.T5TokenizerFast"]):
__openllm_internal__ = True
def sanitize_parameters(
self,
prompt: str,
max_new_tokens: int | None = None,
temperature: float | None = None,
top_k: int | None = None,
top_p: float | None = None,
repetition_penalty: float | None = None,
use_default_prompt_template: bool = True,
**attrs: t.Any,
) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_k: int | None = None, top_p: float | None = None, repetition_penalty: float | None = None, use_default_prompt_template: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
if use_default_prompt_template:
template_variables = default_formatter.extract_template_variables(DEFAULT_PROMPT_TEMPLATE)
prompt_variables = {k: v for k, v in attrs.items() if k in template_variables}
if "instruction" in prompt_variables:
raise RuntimeError(
"'instruction' should be passed as the first argument "
"instead of kwargs when 'use_default_prompt_template=True'"
)
try:
prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
except KeyError as e:
raise RuntimeError(
f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. "
"Use 'use_default_prompt_template=False' to disable the default prompt template."
) from None
else:
prompt_text = prompt
generation_config = {
"max_new_tokens": max_new_tokens,
"temperature": temperature,
"top_k": top_k,
"top_p": top_p,
"repetition_penalty": repetition_penalty,
}
return prompt_text, generation_config, {}
def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str:
return generation_result[0]
if "instruction" in prompt_variables: raise RuntimeError("'instruction' should be passed as the first argument instead of kwargs when 'use_default_prompt_template=True'")
try: prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
except KeyError as e: raise RuntimeError(f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. Use 'use_default_prompt_template=False' to disable the default prompt template.") from None
else: prompt_text = prompt
return prompt_text, {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_k": top_k, "top_p": top_p, "repetition_penalty": repetition_penalty}, {}
def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str: return generation_result[0]
def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
with torch.inference_mode():
input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids.to(self.device)
result_tensor = self.model.generate(
input_ids,
do_sample=True,
generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
)
return self.tokenizer.batch_decode(result_tensor, skip_special_tokens=True)
with torch.inference_mode(): return self.tokenizer.batch_decode(self.model.generate(**self.tokenizer(prompt, return_tensors="pt").to(self.device), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()), skip_special_tokens=True)

View File

@@ -26,64 +26,18 @@ if t.TYPE_CHECKING:
class FlaxFlanT5(openllm.LLM["transformers.FlaxT5ForConditionalGeneration", "transformers.T5TokenizerFast"]):
__openllm_internal__ = True
def sanitize_parameters(
self,
prompt: str,
max_new_tokens: int | None = None,
temperature: float | None = None,
top_k: int | None = None,
top_p: float | None = None,
repetition_penalty: float | None = None,
decoder_start_token_id: int | None = None,
use_default_prompt_template: bool = True,
**attrs: t.Any,
) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_k: int | None = None, top_p: float | None = None, repetition_penalty: float | None = None, decoder_start_token_id: int | None = None, use_default_prompt_template: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
if use_default_prompt_template:
template_variables = default_formatter.extract_template_variables(DEFAULT_PROMPT_TEMPLATE)
prompt_variables = {k: v for k, v in attrs.items() if k in template_variables}
if "instruction" in prompt_variables:
raise RuntimeError(
"'instruction' should be passed as the first argument "
"instead of kwargs when 'use_default_prompt_template=True'"
)
try:
prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
except KeyError as e:
raise RuntimeError(
f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. "
"Use 'use_default_prompt_template=False' to disable the default prompt template."
) from None
else:
prompt_text = prompt
if decoder_start_token_id is None:
decoder_start_token_id = 0
generation_config = {
"max_new_tokens": max_new_tokens,
"temperature": temperature,
"top_k": top_k,
"top_p": top_p,
"repetition_penalty": repetition_penalty,
"decoder_start_token_id": decoder_start_token_id,
}
return prompt_text, generation_config, {}
def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str:
return generation_result[0]
if "instruction" in prompt_variables: raise RuntimeError("'instruction' should be passed as the first argument instead of kwargs when 'use_default_prompt_template=True'")
try: prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
except KeyError as e: raise RuntimeError(f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. Use 'use_default_prompt_template=False' to disable the default prompt template.") from None
else: prompt_text = prompt
if decoder_start_token_id is None: decoder_start_token_id = 0
return prompt_text, {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_k": top_k, "top_p": top_p, "repetition_penalty": repetition_penalty, "decoder_start_token_id": decoder_start_token_id}, {}
def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str: return generation_result[0]
def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
# XXX: decoder_start_token_id is extracted from https://huggingface.co/google/flan-t5-small/tree/main
# as it is required for encoder-decoder generation.
# NOTE: decoder_start_token_id is extracted from https://huggingface.co/google/flan-t5-small/tree/main as it is required for encoder-decoder generation.
decoder_start_token_id = attrs.pop("decoder_start_token_id", 0)
input_ids = self.tokenizer(prompt, return_tensors="np")["input_ids"]
result_tensor = self.model.generate(
input_ids,
do_sample=True,
generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
decoder_start_token_id=decoder_start_token_id,
)
return self.tokenizer.batch_decode(
result_tensor.sequences, skip_special_tokens=True, clean_up_tokenization_spaces=True
)
return self.tokenizer.batch_decode(self.model.generate(self.tokenizer(prompt, return_tensors="np")["input_ids"], do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config(), decoder_start_token_id=decoder_start_token_id).sequences, skip_special_tokens=True, clean_up_tokenization_spaces=True)

View File

@@ -13,66 +13,20 @@
# limitations under the License.
from __future__ import annotations
import typing as t
import openllm
from .configuration_flan_t5 import DEFAULT_PROMPT_TEMPLATE
from ..._prompt import default_formatter
if t.TYPE_CHECKING:
import transformers # noqa: F401
if t.TYPE_CHECKING: import transformers # noqa: F401
class TFFlanT5(openllm.LLM["transformers.TFT5ForConditionalGeneration", "transformers.T5TokenizerFast"]):
__openllm_internal__ = True
def sanitize_parameters(
self,
prompt: str,
max_new_tokens: int | None = None,
temperature: float | None = None,
top_k: int | None = None,
top_p: float | None = None,
repetition_penalty: float | None = None,
use_default_prompt_template: bool = True,
**attrs: t.Any,
) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_k: int | None = None, top_p: float | None = None, repetition_penalty: float | None = None, use_default_prompt_template: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
if use_default_prompt_template:
template_variables = default_formatter.extract_template_variables(DEFAULT_PROMPT_TEMPLATE)
prompt_variables = {k: v for k, v in attrs.items() if k in template_variables}
if "instruction" in prompt_variables:
raise RuntimeError(
"'instruction' should be passed as the first argument "
"instead of kwargs when 'use_default_prompt_template=True'"
)
try:
prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
except KeyError as e:
raise RuntimeError(
f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. "
"Use 'use_default_prompt_template=False' to disable the default prompt template."
) from None
else:
prompt_text = prompt
generation_config = {
"max_new_tokens": max_new_tokens,
"temperature": temperature,
"top_k": top_k,
"top_p": top_p,
"repetition_penalty": repetition_penalty,
}
return prompt_text, generation_config, {}
def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str:
return generation_result[0]
def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
input_ids = self.tokenizer(prompt, return_tensors="tf").input_ids
outputs = self.model.generate(
input_ids,
do_sample=True,
generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
)
return self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
if "instruction" in prompt_variables: raise RuntimeError("'instruction' should be passed as the first argument instead of kwargs when 'use_default_prompt_template=True'")
try: prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
except KeyError as e: raise RuntimeError(f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. Use 'use_default_prompt_template=False' to disable the default prompt template.") from None
else: prompt_text = prompt
return prompt_text, {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_k": top_k, "top_p": top_p, "repetition_penalty": repetition_penalty}, {}
def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str: return generation_result[0]
def generate(self, prompt: str, **attrs: t.Any) -> list[str]: return self.tokenizer.batch_decode(self.model.generate(self.tokenizer(prompt, return_tensors="tf").input_ids, do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()), skip_special_tokens=True)

View File

@@ -11,40 +11,23 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
import sys
import typing as t
from ...exceptions import MissingDependencyError
from ...utils import LazyModule
from ...utils import is_torch_available
_import_structure: dict[str, list[str]] = {
"configuration_gpt_neox": ["GPTNeoXConfig", "START_GPT_NEOX_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"],
}
_import_structure: dict[str, list[str]] = {"configuration_gpt_neox": ["GPTNeoXConfig", "START_GPT_NEOX_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"]}
try:
if not is_torch_available():
raise MissingDependencyError
except MissingDependencyError:
pass
else:
_import_structure["modeling_gpt_neox"] = ["GPTNeoX"]
if not is_torch_available(): raise MissingDependencyError
except MissingDependencyError: pass
else: _import_structure["modeling_gpt_neox"] = ["GPTNeoX"]
if t.TYPE_CHECKING:
from .configuration_gpt_neox import DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE
from .configuration_gpt_neox import START_GPT_NEOX_COMMAND_DOCSTRING as START_GPT_NEOX_COMMAND_DOCSTRING
from .configuration_gpt_neox import GPTNeoXConfig as GPTNeoXConfig
try:
if not is_torch_available():
raise MissingDependencyError
except MissingDependencyError:
pass
else:
from .modeling_gpt_neox import GPTNeoX as GPTNeoX
else:
import sys
sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
if not is_torch_available(): raise MissingDependencyError
except MissingDependencyError: pass
else: from .modeling_gpt_neox import GPTNeoX as GPTNeoX
else: sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

View File

@@ -11,12 +11,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
import openllm
class GPTNeoXConfig(openllm.LLMConfig):
"""GPTNeoX is an autoregressive language model trained on the Pile, whose weights will be made freely and openly available to the public through a permissive license.
@@ -32,7 +28,6 @@ class GPTNeoXConfig(openllm.LLMConfig):
Refer to [GPTNeoX's model card](https://huggingface.co/docs/transformers/model_doc/gpt_neox)
for more information.
"""
__config__ = {
"model_name": "gpt_neox",
"start_name": "gpt-neox",
@@ -42,14 +37,10 @@ class GPTNeoXConfig(openllm.LLMConfig):
"default_id": "eleutherai/gpt-neox-20b",
"model_ids": ["eleutherai/gpt-neox-20b"],
}
use_half_precision: bool = openllm.LLMConfig.Field(True, description="Whether to use half precision for model.")
class GenerationConfig:
temperature: float = 0.9
max_new_tokens: int = 100
START_GPT_NEOX_COMMAND_DOCSTRING = """\
Run a LLMServer for GPTNeoX model.
@@ -69,6 +60,4 @@ or provide `--model-id` flag when running ``openllm start gpt-neox``:
\b
$ openllm start gpt-neox --model-id 'stabilityai/stablelm-tuned-alpha-3b'
"""
DEFAULT_PROMPT_TEMPLATE = """{instruction}"""

View File

@@ -11,88 +11,34 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
import logging
import typing as t
import openllm
from .configuration_gpt_neox import DEFAULT_PROMPT_TEMPLATE
from ..._prompt import default_formatter
if t.TYPE_CHECKING:
import torch
import transformers
else:
transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
torch = openllm.utils.LazyLoader("torch", globals(), "torch")
if t.TYPE_CHECKING: import torch, transformers
else: torch, transformers = openllm.utils.LazyLoader("torch", globals(), "torch"), openllm.utils.LazyLoader("transformers", globals(), "transformers")
logger = logging.getLogger(__name__)
class GPTNeoX(openllm.LLM["transformers.GPTNeoXForCausalLM", "transformers.GPTNeoXTokenizerFast"]):
__openllm_internal__ = True
def sanitize_parameters(
self,
prompt: str,
temperature: float | None = None,
max_new_tokens: int | None = None,
use_default_prompt_template: bool = True,
**attrs: t.Any,
) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
def sanitize_parameters(self, prompt: str, temperature: float | None = None, max_new_tokens: int | None = None, use_default_prompt_template: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
if use_default_prompt_template:
template_variables = default_formatter.extract_template_variables(DEFAULT_PROMPT_TEMPLATE)
prompt_variables = {k: v for k, v in attrs.items() if k in template_variables}
if "instruction" in prompt_variables:
raise RuntimeError(
"'instruction' should be passed as the first argument "
"instead of kwargs when 'use_default_prompt_template=True'"
)
try:
prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
except KeyError as e:
raise RuntimeError(
f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. "
"Use 'use_default_prompt_template=False' to disable the default prompt template."
) from None
else:
prompt_text = prompt
generation_config = {"max_new_tokens": max_new_tokens, "temperature": temperature}
return prompt_text, generation_config, {}
if "instruction" in prompt_variables: raise RuntimeError("'instruction' should be passed as the first argument instead of kwargs when 'use_default_prompt_template=True'")
try: prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
except KeyError as e: raise RuntimeError(f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. Use 'use_default_prompt_template=False' to disable the default prompt template.") from None
else: prompt_text = prompt
return prompt_text, {"max_new_tokens": max_new_tokens, "temperature": temperature}, {}
@property
def import_kwargs(self):
model_kwds = {"device_map": "auto" if torch.cuda.device_count() > 1 else None}
tokenizer_kwds: dict[str, t.Any] = {}
return model_kwds, tokenizer_kwds
def postprocess_generate(self, prompt: str, generation_result: list[str], **_: t.Any) -> str:
return generation_result[0]
def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: return {"device_map": "auto" if torch.cuda.device_count() > 1 else None}, {}
def postprocess_generate(self, prompt: str, generation_result: list[str], **_: t.Any) -> str: return generation_result[0]
def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.GPTNeoXForCausalLM:
model = transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, *args, **attrs)
if self.config.use_half_precision:
model.half()
if self.config.use_half_precision: model.half()
return model
def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
from ..._generation import StopOnTokens
generation_kwargs = {
"do_sample": True,
"generation_config": self.config.model_construct_env(**attrs).to_generation_config(),
"pad_token_id": self.tokenizer.eos_token_id,
"stopping_criteria": transformers.StoppingCriteriaList([StopOnTokens()]),
}
inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
with torch.inference_mode():
gen_tokens = self.model.generate(inputs.input_ids, **generation_kwargs)
return self.tokenizer.batch_decode(gen_tokens)
generation_kwargs = {"do_sample": True, "generation_config": self.config.model_construct_env(**attrs).to_generation_config(), "pad_token_id": self.tokenizer.eos_token_id, "stopping_criteria": transformers.StoppingCriteriaList([StopOnTokens()])}
with torch.inference_mode(): return self.tokenizer.batch_decode(self.model.generate(self.tokenizer(prompt, return_tensors="pt").to(self.device).input_ids, **generation_kwargs))

View File

@@ -11,64 +11,33 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
import sys
import typing as t
from ...exceptions import MissingDependencyError
from ...utils import LazyModule
from ...utils import is_torch_available
from ...utils import is_vllm_available
_import_structure: dict[str, list[str]] = {
"configuration_llama": [
"LlaMAConfig",
"START_LLAMA_COMMAND_DOCSTRING",
"DEFAULT_PROMPT_TEMPLATE",
"PROMPT_MAPPING",
],
}
_import_structure: dict[str, list[str]] = {"configuration_llama": ["LlaMAConfig", "START_LLAMA_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE", "PROMPT_MAPPING"]}
try:
if not is_vllm_available():
raise MissingDependencyError
except MissingDependencyError:
pass
else:
_import_structure["modeling_vllm_llama"] = ["VLLMLlaMA"]
if not is_vllm_available(): raise MissingDependencyError
except MissingDependencyError: pass
else: _import_structure["modeling_vllm_llama"] = ["VLLMLlaMA"]
try:
if not is_torch_available():
raise MissingDependencyError
except MissingDependencyError:
pass
else:
_import_structure["modeling_llama"] = ["LlaMA"]
if not is_torch_available(): raise MissingDependencyError
except MissingDependencyError: pass
else: _import_structure["modeling_llama"] = ["LlaMA"]
if t.TYPE_CHECKING:
from .configuration_llama import DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE
from .configuration_llama import PROMPT_MAPPING as PROMPT_MAPPING
from .configuration_llama import START_LLAMA_COMMAND_DOCSTRING as START_LLAMA_COMMAND_DOCSTRING
from .configuration_llama import LlaMAConfig as LlaMAConfig
try:
if not is_vllm_available():
raise MissingDependencyError
except MissingDependencyError:
pass
else:
from .modeling_vllm_llama import VLLMLlaMA as VLLMLlaMA
if not is_vllm_available(): raise MissingDependencyError
except MissingDependencyError: pass
else: from .modeling_vllm_llama import VLLMLlaMA as VLLMLlaMA
try:
if not is_torch_available():
raise MissingDependencyError
except MissingDependencyError:
pass
else:
from .modeling_llama import LlaMA as LlaMA
else:
import sys
sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
if not is_torch_available(): raise MissingDependencyError
except MissingDependencyError: pass
else: from .modeling_llama import LlaMA as LlaMA
else: sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

View File

@@ -11,13 +11,9 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
import typing as t
import openllm
class LlaMAConfig(openllm.LLMConfig):
"""LLaMA model was proposed in [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) by Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample.
@@ -30,11 +26,7 @@ class LlaMAConfig(openllm.LLMConfig):
Refer to [LlaMA's model card](https://huggingface.co/docs/transformers/main/model_doc/llama)
for more information.
"""
use_llama2_prompt: bool = openllm.LLMConfig.Field(
True, description="Whether to use the prompt format for LlaMA 2. Disable this when working with LlaMA 1."
)
use_llama2_prompt: bool = openllm.LLMConfig.Field(True, description="Whether to use the prompt format for LlaMA 2. Disable this when working with LlaMA 1.")
__config__ = {
"model_name": "llama",
"start_name": "llama",
@@ -69,18 +61,14 @@ class LlaMAConfig(openllm.LLMConfig):
},
),
}
class GenerationConfig:
max_new_tokens: int = 256
temperature: float = 0.45
top_p: float = 0.95
top_k: int = 12
class SamplingParams:
best_of: int = 1
presence_penalty: float = 0.5
START_LLAMA_COMMAND_DOCSTRING = """\
Run a LLMServer for LlaMA model.
@@ -110,39 +98,14 @@ OpenLLM also supports running LlaMA-2 and its fine-tune and variants. To import
\b
$ CONVERTER=hf-llama2 openllm import llama /path/to/llama-2
"""
SYSTEM_MESSAGE = """
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.
"""
SINST_KEY = "[INST]"
EINST_KEY = "[/INST]"
SYS_KEY = "<<SYS>>"
EOS_TOKEN = "</s>"
BOS_TOKEN = "<s>"
# TODO: support history
_v2_prompt = """{start_key} {sys_key}\n{system_message}\n{sys_key}\n\n{instruction}\n{end_key} """.format(
start_key=SINST_KEY,
sys_key=SYS_KEY,
system_message=SYSTEM_MESSAGE,
instruction="{instruction}",
end_key=EINST_KEY,
)
# XXX: implement me
_v1_prompt = """{instruction}"""
PROMPT_MAPPING = {
"v1": _v1_prompt,
"v2": _v2_prompt,
}
def _get_prompt(model_type: t.Literal["v1", "v2"]) -> str:
return PROMPT_MAPPING[model_type]
SINST_KEY, EINST_KEY, SYS_KEY, EOS_TOKEN, BOS_TOKEN = "[INST]", "[/INST]", "<<SYS>>", "</s>", "<s>"
# TODO: support history and v1 prompt implementation
_v1_prompt, _v2_prompt = """{instruction}""", """{start_key} {sys_key}\n{system_message}\n{sys_key}\n\n{instruction}\n{end_key} """.format(start_key=SINST_KEY, sys_key=SYS_KEY, system_message=SYSTEM_MESSAGE, instruction="{instruction}", end_key=EINST_KEY)
PROMPT_MAPPING = {"v1": _v1_prompt, "v2": _v2_prompt}
def _get_prompt(model_type: t.Literal["v1", "v2"]) -> str: return PROMPT_MAPPING[model_type]
DEFAULT_PROMPT_TEMPLATE = _get_prompt

View File

@@ -11,110 +11,41 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
import logging
import typing as t
import openllm
from .configuration_llama import DEFAULT_PROMPT_TEMPLATE
from ..._llm import LLMEmbeddings
from ..._prompt import default_formatter
if t.TYPE_CHECKING:
import torch
import torch.nn.functional as F
import transformers
else:
transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
torch = openllm.utils.LazyLoader("torch", globals(), "torch")
F = openllm.utils.LazyLoader("F", globals(), "torch.nn.functional")
if t.TYPE_CHECKING: import torch, transformers, torch.nn.functional as F
else: torch, transformers, F = openllm.utils.LazyLoader("torch", globals(), "torch"), openllm.utils.LazyLoader("transformers", globals(), "transformers"), openllm.utils.LazyLoader("F", globals(), "torch.nn.functional")
logger = logging.getLogger(__name__)
class LlaMA(openllm.LLM["transformers.LlamaForCausalLM", "transformers.LlamaTokenizerFast"]):
__openllm_internal__ = True
def sanitize_parameters(
self,
prompt: str,
top_k: int | None = None,
top_p: float | None = None,
temperature: float | None = None,
max_new_tokens: int | None = None,
use_default_prompt_template: bool = True,
use_llama2_prompt: bool = True,
**attrs: t.Any,
) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
def sanitize_parameters(self, prompt: str, top_k: int | None = None, top_p: float | None = None, temperature: float | None = None, max_new_tokens: int | None = None, use_default_prompt_template: bool = True, use_llama2_prompt: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
if use_default_prompt_template:
_PROMPT = DEFAULT_PROMPT_TEMPLATE("v2" if use_llama2_prompt else "v1")
template_variables = default_formatter.extract_template_variables(_PROMPT)
prompt_variables = {k: v for k, v in attrs.items() if k in template_variables}
if "instruction" in prompt_variables:
raise RuntimeError(
"'instruction' should be passed as the first argument "
"instead of kwargs when 'use_default_prompt_template=True'"
)
try:
prompt_text = _PROMPT.format(instruction=prompt, **prompt_variables)
except KeyError as e:
raise RuntimeError(
f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. "
"Use 'use_default_prompt_template=False' to disable the default prompt template."
) from None
else:
prompt_text = prompt
generation_config = {
"max_new_tokens": max_new_tokens,
"temperature": temperature,
"top_p": top_p,
"top_k": top_k,
}
return prompt_text, generation_config, {}
if "instruction" in prompt_variables: raise RuntimeError("'instruction' should be passed as the first argument instead of kwargs when 'use_default_prompt_template=True'")
try: prompt_text = _PROMPT.format(instruction=prompt, **prompt_variables)
except KeyError as e: raise RuntimeError(f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. Use 'use_default_prompt_template=False' to disable the default prompt template.") from None
else: prompt_text = prompt
return prompt_text, {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_p": top_p, "top_k": top_k}, {}
@property
def import_kwargs(self):
model_kwds = {"device_map": "auto" if torch.cuda.device_count() > 1 else None}
tokenizer_kwds: dict[str, t.Any] = {}
return model_kwds, tokenizer_kwds
def postprocess_generate(self, prompt: str, generation_result: list[str], **_: t.Any) -> str:
return generation_result[0]
def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: return {"device_map": "auto" if torch.cuda.device_count() > 1 else None}, {}
def postprocess_generate(self, prompt: str, generation_result: list[str], **_: t.Any) -> str: return generation_result[0]
def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
from ..._generation import StopOnTokens
generation_kwargs = {
"generation_config": self.config.model_construct_env(**attrs).to_generation_config(),
"stopping_criteria": transformers.StoppingCriteriaList([StopOnTokens()]),
}
inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
with torch.inference_mode():
gen_tokens = self.model.generate(**inputs, **generation_kwargs)
return self.tokenizer.batch_decode(gen_tokens, skip_special_tokens=True, clean_up_tokenization_spaces=True)
generation_kwargs = {"generation_config": self.config.model_construct_env(**attrs).to_generation_config(), "stopping_criteria": transformers.StoppingCriteriaList([StopOnTokens()])}
with torch.inference_mode(): return self.tokenizer.batch_decode(self.model.generate(**self.tokenizer(prompt, return_tensors="pt").to(self.device), **generation_kwargs), skip_special_tokens=True, clean_up_tokenization_spaces=True)
def embeddings(self, prompts: list[str]) -> LLMEmbeddings:
encoding = self.tokenizer(prompts, padding=True, return_tensors="pt").to(self.device)
input_ids = encoding["input_ids"]
attention_mask = encoding["attention_mask"]
input_ids, attention_mask = encoding["input_ids"], encoding["attention_mask"]
with torch.inference_mode():
model_outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True)
data = model_outputs.hidden_states[-1]
data = self.model(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True).hidden_states[-1]
mask = attention_mask.unsqueeze(-1).expand(data.size()).float()
masked_embeddings = data * mask
sum_embeddings = torch.sum(masked_embeddings, dim=1)
seq_length = torch.sum(mask, dim=1)
embedding = sum_embeddings / seq_length
normalized_embeddings = F.normalize(embedding, p=2, dim=1)
return {
"embeddings": normalized_embeddings,
"num_tokens": torch.sum(attention_mask).item(),
}
sum_embeddings, seq_length = torch.sum(masked_embeddings, dim=1), torch.sum(mask, dim=1)
return {"embeddings": F.normalize(sum_embeddings / seq_length, p=2, dim=1), "num_tokens": torch.sum(attention_mask).item()}

View File

@@ -11,42 +11,24 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
import sys
import typing as t
from ...exceptions import MissingDependencyError
from ...utils import LazyModule
from ...utils import is_torch_available
_import_structure: dict[str, list[str]] = {
"configuration_mpt": ["MPTConfig", "START_MPT_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE", "PROMPT_MAPPING"],
}
_import_structure: dict[str, list[str]] = {"configuration_mpt": ["MPTConfig", "START_MPT_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE", "PROMPT_MAPPING"]}
try:
if not is_torch_available():
raise MissingDependencyError
except MissingDependencyError:
pass
else:
_import_structure["modeling_mpt"] = ["MPT"]
if not is_torch_available(): raise MissingDependencyError
except MissingDependencyError: pass
else: _import_structure["modeling_mpt"] = ["MPT"]
if t.TYPE_CHECKING:
from .configuration_mpt import DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE
from .configuration_mpt import PROMPT_MAPPING as PROMPT_MAPPING
from .configuration_mpt import START_MPT_COMMAND_DOCSTRING as START_MPT_COMMAND_DOCSTRING
from .configuration_mpt import MPTConfig as MPTConfig
try:
if not is_torch_available():
raise MissingDependencyError
except MissingDependencyError:
pass
else:
from .modeling_mpt import MPT as MPT
else:
import sys
sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
if not is_torch_available(): raise MissingDependencyError
except MissingDependencyError: pass
else: from .modeling_mpt import MPT as MPT
else: sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

View File

@@ -11,20 +11,11 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
import typing as t
import openllm
if t.TYPE_CHECKING:
MPTPromptType = t.Literal["default", "instruct", "chat", "storywriter"]
else:
# TODO: Support Literal string for LLMConfig
MPTPromptType = str
if t.TYPE_CHECKING: MPTPromptType = t.Literal["default", "instruct", "chat", "storywriter"]
else: MPTPromptType = str
class MPTConfig(openllm.LLMConfig):
"""MPT is a decoder-style transformer pretrained from scratch on English text and code.
@@ -34,7 +25,6 @@ class MPTConfig(openllm.LLMConfig):
on HuggingFace. Refers [HuggingFace's MosaicML page](https://huggingface.co/mosaicml)
for more details on specific models.
"""
__config__ = {
"name_type": "lowercase",
"trust_remote_code": True,
@@ -53,27 +43,12 @@ class MPTConfig(openllm.LLMConfig):
"mosaicml/mpt-30b-chat",
],
}
prompt_type: MPTPromptType = openllm.LLMConfig.Field(
'"default"',
description="""Given prompt type for running MPT. Default will be inferred from model name if pretrained.""",
)
max_sequence_length: int = openllm.LLMConfig.Field(
2048,
description="""\
Max sequence length to run MPT with. Note that MPT is trained ith sequence length
of 2048, but with [ALiBi](https://arxiv.org/abs/2108.12409) it can set up to 4096
(for 7b models) and 16384 (for 30b models)
""",
)
prompt_type: MPTPromptType = openllm.LLMConfig.Field('"default"', description="""Given prompt type for running MPT. Default will be inferred from model name if pretrained.""")
max_sequence_length: int = openllm.LLMConfig.Field(2048, description="Max sequence length to run MPT with. Note that MPT is trained ith sequence length of 2048, but with [ALiBi](https://arxiv.org/abs/2108.12409) it can set up to 4096 (for 7b models) and 16384 (for 30b models)")
class GenerationConfig:
max_new_tokens: int = 128
temperature: float = 0
top_p: float = 0.8
START_MPT_COMMAND_DOCSTRING = """\
Run a LLMServer for MPT model.
@@ -100,43 +75,16 @@ or provide `--model-id` flag when running ``openllm start mpt``:
\b
$ openllm start mpt --model-id mosaicml/mpt-30b
"""
INSTRUCTION_KEY = "### Instruction:"
RESPONSE_KEY = "### Response:"
END_KEY = "### End"
INTRO_BLURB = (
"Below is an instruction that describes a task. Write a response that appropriately completes the request."
)
INSTRUCTION_KEY, RESPONSE_KEY, END_KEY = "### Instruction:", "### Response:", "### End"
INTRO_BLURB = "Below is an instruction that describes a task. Write a response that appropriately completes the request."
# NOTE: This is the prompt that is used for generating responses using an already
# trained model. It ends with the response key, where the job of the model is to provide
# the completion that follows it (i.e. the response itself).
_instruct_prompt = """{intro}
_chat_prompt, _default_prompt, _instruct_prompt = """{instruction}""", """{instruction}""", """{intro}
{instruction_key}
{instruction}
{response_key}
""".format(
intro=INTRO_BLURB,
instruction_key=INSTRUCTION_KEY,
instruction="{instruction}",
response_key=RESPONSE_KEY,
)
_default_prompt = """{instruction}"""
# TODO: XXX implement me
_chat_prompt = """{instruction}"""
PROMPT_MAPPING = {
"default": _default_prompt,
"instruct": _instruct_prompt,
"storywriter": _default_prompt,
"chat": _chat_prompt,
}
def _get_prompt(model_type: str) -> str:
return PROMPT_MAPPING[model_type]
""".format(intro=INTRO_BLURB, instruction_key=INSTRUCTION_KEY, instruction="{instruction}", response_key=RESPONSE_KEY)
PROMPT_MAPPING = {"default": _default_prompt, "instruct": _instruct_prompt, "storywriter": _default_prompt, "chat": _chat_prompt}
def _get_prompt(model_type: str) -> str: return PROMPT_MAPPING[model_type]
DEFAULT_PROMPT_TEMPLATE = _get_prompt

View File

@@ -15,189 +15,70 @@
from __future__ import annotations
import logging
import typing as t
import bentoml
import openllm
from .configuration_mpt import DEFAULT_PROMPT_TEMPLATE
from .configuration_mpt import DEFAULT_PROMPT_TEMPLATE, MPTPromptType
from ..._prompt import default_formatter
from ...utils import generate_labels
from ...utils import is_triton_available
if t.TYPE_CHECKING:
import torch
import transformers
from .configuration_mpt import MPTPromptType
else:
torch = openllm.utils.LazyLoader("torch", globals(), "torch")
transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
from ...utils import generate_labels, is_triton_available
if t.TYPE_CHECKING: import transformers, torch
else: transformers, torch = openllm.utils.LazyLoader("transformers", globals(), "transformers"), openllm.utils.LazyLoader("torch", globals(), "torch")
logger = logging.getLogger(__name__)
def get_mpt_config(
model_id_or_path: str,
max_sequence_length: int,
device: torch.device | str | int | None,
device_map: str | None = None,
trust_remote_code: bool = True,
) -> transformers.PretrainedConfig:
def get_mpt_config(model_id_or_path: str, max_sequence_length: int, device: torch.device | str | int | None, device_map: str | None = None, trust_remote_code: bool = True) -> transformers.PretrainedConfig:
config = transformers.AutoConfig.from_pretrained(model_id_or_path, trust_remote_code=trust_remote_code)
if hasattr(config, "init_device") and device_map is None and isinstance(device, (str, torch.device)):
config.init_device = str(device)
if hasattr(config, "attn_config") and is_triton_available():
config.attn_config["attn_impl"] = "triton"
else:
logger.debug(
"'triton' is not available, Flash Attention will use the default Torch implementation. For faster inference, make sure to install triton with 'pip install \"git+https://github.com/openai/triton.git#egg=triton&subdirectory=python\"'"
)
if hasattr(config, "init_device") and device_map is None and isinstance(device, (str, torch.device)): config.init_device = str(device)
if hasattr(config, "attn_config") and is_triton_available(): config.attn_config["attn_impl"] = "triton"
else: logger.debug("'triton' is not available, Flash Attention will use the default Torch implementation. For faster inference, make sure to install triton with 'pip install \"git+https://github.com/openai/triton.git#egg=triton&subdirectory=python\"'")
# setting max_seq_len
config.max_seq_len = max_sequence_length
return config
class MPT(openllm.LLM["transformers.PreTrainedModel", "transformers.GPTNeoXTokenizerFast"]):
__openllm_internal__ = True
def llm_post_init(self):
self.dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
def llm_post_init(self): self.dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
@property
def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]] | None:
model_kwds = {"torch_dtype": torch.bfloat16 if torch.cuda.is_available() else torch.float32}
tokenizer_kwds = {"padding_side": "left"}
return model_kwds, tokenizer_kwds
def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: return {"torch_dtype": torch.bfloat16 if torch.cuda.is_available() else torch.float32}, {"padding_side": "left"}
def import_model(self, *args: t.Any, trust_remote_code: bool = True, **attrs: t.Any) -> bentoml.Model:
_, tokenizer_attrs = self.llm_parameters
torch_dtype = attrs.pop("torch_dtype", self.dtype)
device_map = attrs.pop("device_map", None)
attrs.pop("low_cpu_mem_usage", None)
config = get_mpt_config(
self.model_id,
self.config.max_sequence_length,
self.device,
device_map=device_map,
trust_remote_code=trust_remote_code,
)
config = get_mpt_config(self.model_id, self.config.max_sequence_length, self.device, device_map=device_map, trust_remote_code=trust_remote_code)
tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_id, **tokenizer_attrs)
if tokenizer.pad_token_id is None:
logger.warning("pad_token_id is not set. Setting it to eos_token")
tokenizer.pad_token = tokenizer.eos_token
model = transformers.AutoModelForCausalLM.from_pretrained(
self.model_id,
config=config,
torch_dtype=torch_dtype,
trust_remote_code=trust_remote_code,
device_map=device_map,
**attrs,
)
try:
return bentoml.transformers.save_model(
self.tag,
model,
custom_objects={"tokenizer": tokenizer},
labels=generate_labels(self),
)
finally:
torch.cuda.empty_cache()
if tokenizer.pad_token_id is None: tokenizer.pad_token = tokenizer.eos_token
model = transformers.AutoModelForCausalLM.from_pretrained(self.model_id, config=config, torch_dtype=torch_dtype, trust_remote_code=trust_remote_code, device_map=device_map, **attrs)
try: return bentoml.transformers.save_model( self.tag, model, custom_objects={"tokenizer": tokenizer}, labels=generate_labels(self))
finally: torch.cuda.empty_cache()
def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.PreTrainedModel:
torch_dtype = attrs.pop("torch_dtype", self.dtype)
device_map = attrs.pop("device_map", None)
trust_remote_code = attrs.pop("trust_remote_code", True)
_ref = bentoml.transformers.get(self.tag)
config = get_mpt_config(
_ref.path,
self.config.max_sequence_length,
self.device,
device_map=device_map,
trust_remote_code=trust_remote_code,
)
model = transformers.AutoModelForCausalLM.from_pretrained(
_ref.path,
config=config,
trust_remote_code=trust_remote_code,
torch_dtype=torch_dtype,
device_map=device_map,
**attrs,
)
config = get_mpt_config(self._bentomodel.path, self.config.max_sequence_length, self.device, device_map=device_map, trust_remote_code=trust_remote_code,)
model = transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, config=config, trust_remote_code=trust_remote_code, torch_dtype=torch_dtype, device_map=device_map, **attrs)
model.tie_weights()
return model
def sanitize_parameters(
self,
prompt: str,
max_new_tokens: int | None = None,
temperature: float | None = None,
top_p: float | None = None,
prompt_type: MPTPromptType | None = None,
use_default_prompt_template: bool = True,
**attrs: t.Any,
) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
def sanitize_parameters( self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_p: float | None = None, prompt_type: MPTPromptType | None = None, use_default_prompt_template: bool = True, **attrs: t.Any,) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
if use_default_prompt_template:
if prompt_type is None:
if "instruct" in self.model_id:
prompt_type = "instruct"
elif "storywriter" in self.model_id:
prompt_type = "storywriter"
elif "chat" in self.model_id:
prompt_type = "chat"
else:
prompt_type = "default"
if "instruct" in self.model_id: prompt_type = "instruct"
elif "storywriter" in self.model_id: prompt_type = "storywriter"
elif "chat" in self.model_id: prompt_type = "chat"
else: prompt_type = "default"
_PROMPT = DEFAULT_PROMPT_TEMPLATE(prompt_type)
template_variables = default_formatter.extract_template_variables(_PROMPT)
prompt_variables = {k: v for k, v in attrs.items() if k in template_variables}
if "instruction" in prompt_variables:
raise RuntimeError(
"'instruction' should be passed as the first argument "
"instead of kwargs when 'use_default_prompt_template=True'"
)
try:
prompt_text = _PROMPT.format(instruction=prompt, **prompt_variables)
except KeyError as e:
raise RuntimeError(
f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. "
"Use 'use_default_prompt_template=False' to disable the default prompt template."
) from None
else:
prompt_text = prompt
generation_config = {
"max_new_tokens": max_new_tokens,
"temperature": temperature,
"top_p": top_p,
}
if "instruction" in prompt_variables: raise RuntimeError("'instruction' should be passed as the first argument instead of kwargs when 'use_default_prompt_template=True'")
try: prompt_text = _PROMPT.format(instruction=prompt, **prompt_variables)
except KeyError as e: raise RuntimeError(f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. Use 'use_default_prompt_template=False' to disable the default prompt template.") from None
else: prompt_text = prompt
generation_config = {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_p": top_p}
return prompt_text, generation_config, {}
def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **attrs: t.Any) -> str:
return generation_result[0]
def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **attrs: t.Any) -> str: return generation_result[0]
def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
llm_config = self.config.model_construct_env(**attrs)
inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
attrs = {
"do_sample": False if llm_config["temperature"] == 0 else True,
"eos_token_id": self.tokenizer.eos_token_id,
"pad_token_id": self.tokenizer.pad_token_id,
"generation_config": llm_config.to_generation_config(),
}
attrs = {"do_sample": False if llm_config["temperature"] == 0 else True, "eos_token_id": self.tokenizer.eos_token_id, "pad_token_id": self.tokenizer.pad_token_id, "generation_config": llm_config.to_generation_config()}
with torch.inference_mode():
if torch.cuda.is_available():
with torch.autocast("cuda", torch.float16):
generated_tensors = self.model.generate(**inputs, **attrs)
else:
generated_tensors = self.model.generate(**inputs, **attrs)
else: generated_tensors = self.model.generate(**inputs, **attrs)
return self.tokenizer.batch_decode(generated_tensors, skip_special_tokens=True)

View File

@@ -11,75 +11,41 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
import sys
import typing as t
from ...exceptions import MissingDependencyError
from ...utils import LazyModule
from ...utils import is_flax_available
from ...utils import is_tf_available
from ...utils import is_torch_available
_import_structure: dict[str, list[str]] = {
"configuration_opt": ["OPTConfig", "START_OPT_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"],
}
_import_structure: dict[str, list[str]] = {"configuration_opt": ["OPTConfig", "START_OPT_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"]}
try:
if not is_torch_available():
raise MissingDependencyError
except MissingDependencyError:
pass
else:
_import_structure["modeling_opt"] = ["OPT"]
if not is_torch_available(): raise MissingDependencyError
except MissingDependencyError: pass
else: _import_structure["modeling_opt"] = ["OPT"]
try:
if not is_flax_available():
raise MissingDependencyError
except MissingDependencyError:
pass
else:
_import_structure["modeling_flax_opt"] = ["FlaxOPT"]
if not is_flax_available(): raise MissingDependencyError
except MissingDependencyError: pass
else: _import_structure["modeling_flax_opt"] = ["FlaxOPT"]
try:
if not is_tf_available():
raise MissingDependencyError
except MissingDependencyError:
pass
else:
_import_structure["modeling_tf_opt"] = ["TFOPT"]
if not is_tf_available(): raise MissingDependencyError
except MissingDependencyError: pass
else: _import_structure["modeling_tf_opt"] = ["TFOPT"]
if t.TYPE_CHECKING:
from .configuration_opt import DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE
from .configuration_opt import START_OPT_COMMAND_DOCSTRING as START_OPT_COMMAND_DOCSTRING
from .configuration_opt import OPTConfig as OPTConfig
try:
if not is_torch_available():
raise MissingDependencyError
except MissingDependencyError:
pass
else:
from .modeling_opt import OPT as OPT
if not is_torch_available(): raise MissingDependencyError
except MissingDependencyError: pass
else: from .modeling_opt import OPT as OPT
try:
if not is_flax_available():
raise MissingDependencyError
except MissingDependencyError:
pass
else:
from .modeling_flax_opt import FlaxOPT as FlaxOPT
if not is_flax_available(): raise MissingDependencyError
except MissingDependencyError: pass
else: from .modeling_flax_opt import FlaxOPT as FlaxOPT
try:
if not is_tf_available():
raise MissingDependencyError
except MissingDependencyError:
pass
else:
from .modeling_tf_opt import TFOPT as TFOPT
else:
import sys
sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
if not is_tf_available(): raise MissingDependencyError
except MissingDependencyError: pass
else: from .modeling_tf_opt import TFOPT as TFOPT
else: sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

View File

@@ -11,12 +11,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
import openllm
class OPTConfig(openllm.LLMConfig):
"""OPT was first introduced in [Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) and first released in [metaseq's repository](https://github.com/facebookresearch/metaseq) on May 3rd 2022 by Meta AI.
@@ -27,13 +23,12 @@ class OPTConfig(openllm.LLMConfig):
Refer to [OPT's HuggingFace page](https://huggingface.co/docs/transformers/model_doc/opt) for more information.
"""
__config__ = {
"name_type": "lowercase",
"trust_remote_code": False,
"url": "https://huggingface.co/docs/transformers/model_doc/opt",
"default_id": "facebook/opt-1.3b",
"architecture": "MPTForCausalLM",
"architecture": "OPTForCausalLM",
"model_ids": [
"facebook/opt-125m",
"facebook/opt-350m",
@@ -53,20 +48,12 @@ class OPTConfig(openllm.LLMConfig):
},
),
}
format_outputs: bool = openllm.LLMConfig.Field(
False,
description="""Whether to format the outputs. This
can be used when num_return_sequences > 1.""",
)
format_outputs: bool = openllm.LLMConfig.Field(False, description="""Whether to format the outputs. This can be used when num_return_sequences > 1.""")
class GenerationConfig:
top_k: int = 15
temperature: float = 0.75
max_new_tokens: int = 1024
num_return_sequences: int = 1
START_OPT_COMMAND_DOCSTRING = """\
Run a LLMServer for OPT model.
@@ -92,5 +79,4 @@ or provide `--model-id` flag when running ``openllm start opt``:
\b
$ openllm start opt --model-id facebook/opt-6.7b
"""
DEFAULT_PROMPT_TEMPLATE = """{instruction}"""

View File

@@ -11,109 +11,37 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
import logging
import typing as t
import bentoml
import openllm
from .configuration_opt import DEFAULT_PROMPT_TEMPLATE
from ..._prompt import default_formatter
from ...utils import generate_labels
if t.TYPE_CHECKING:
import transformers
else:
transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
if t.TYPE_CHECKING: import transformers
else: transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
logger = logging.getLogger(__name__)
class FlaxOPT(openllm.LLM["transformers.TFOPTForCausalLM", "transformers.GPT2Tokenizer"]):
__openllm_internal__ = True
@property
def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]] | None:
tokenizer_kwds = {
"padding_side": "left",
"truncation_side": "left",
}
return {}, tokenizer_kwds
def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: return {}, {"padding_side": "left", "truncation_side": "left"}
def import_model(self, *args: t.Any, trust_remote_code: bool = False, **attrs: t.Any) -> bentoml.Model:
_, tokenizer_attrs = self.llm_parameters
config = transformers.AutoConfig.from_pretrained(self.model_id)
tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_id, **tokenizer_attrs)
config, tokenizer = transformers.AutoConfig.from_pretrained(self.model_id), transformers.AutoTokenizer.from_pretrained(self.model_id, **self.llm_parameters[-1])
tokenizer.pad_token_id = config.pad_token_id
model = t.cast(
"transformers.FlaxOPTForCausalLM",
transformers.FlaxAutoModelForCausalLM.from_pretrained(self.model_id, **attrs),
)
return bentoml.transformers.save_model(
self.tag,
model,
custom_objects={"tokenizer": tokenizer},
labels=generate_labels(self),
)
def sanitize_parameters(
self,
prompt: str,
max_new_tokens: int | None = None,
temperature: float | None = None,
top_k: int | None = None,
num_return_sequences: int | None = None,
repetition_penalty: float | None = None,
use_default_prompt_template: bool = False,
**attrs: t.Any,
) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
return bentoml.transformers.save_model(self.tag, transformers.FlaxAutoModelForCausalLM.from_pretrained(self.model_id, **attrs), custom_objects={"tokenizer": tokenizer}, labels=generate_labels(self))
def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_k: int | None = None, num_return_sequences: int | None = None, repetition_penalty: float | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
if use_default_prompt_template:
template_variables = default_formatter.extract_template_variables(DEFAULT_PROMPT_TEMPLATE)
prompt_variables = {k: v for k, v in attrs.items() if k in template_variables}
if "instruction" in prompt_variables:
raise RuntimeError(
"'instruction' should be passed as the first argument "
"instead of kwargs when 'use_default_prompt_template=True'"
)
try:
prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
except KeyError as e:
raise RuntimeError(
f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. "
"Use 'use_default_prompt_template=False' to disable the default prompt template."
) from None
else:
prompt_text = prompt
generation_config = {
"max_new_tokens": max_new_tokens,
"temperature": temperature,
"top_k": top_k,
"num_return_sequences": num_return_sequences,
"repetition_penalty": repetition_penalty,
}
return prompt_text, generation_config, {}
if "instruction" in prompt_variables: raise RuntimeError("'instruction' should be passed as the first argument instead of kwargs when 'use_default_prompt_template=True'")
try: prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
except KeyError as e: raise RuntimeError(f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. Use 'use_default_prompt_template=False' to disable the default prompt template.") from None
else: prompt_text = prompt
return prompt_text, {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_k": top_k, "num_return_sequences": num_return_sequences, "repetition_penalty": repetition_penalty}, {}
def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **attrs: t.Any) -> str:
if len(generation_result) == 1:
if self.config.format_outputs:
logger.warning("'format_outputs' doesn't have any effect when 'num_return_sequences=1'")
return generation_result[0]
if self.config.format_outputs:
return "Generated result:\n" + "\n -".join(generation_result)
else:
return "\n".join(generation_result)
def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
input_ids = self.tokenizer(prompt, return_tensors="np")
generated_tensors = self.model.generate(
**input_ids,
do_sample=True,
generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
)
return self.tokenizer.batch_decode(generated_tensors.sequences, skip_special_tokens=True)
if len(generation_result) == 1: return generation_result[0]
if self.config.format_outputs: return "Generated result:\n" + "\n -".join(generation_result)
else: return "\n".join(generation_result)
def generate(self, prompt: str, **attrs: t.Any) -> list[str]: return self.tokenizer.batch_decode( self.model.generate(**self.tokenizer(prompt, return_tensors="np"), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()).sequences, skip_special_tokens=True)

View File

@@ -11,129 +11,38 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
import logging
import typing as t
import bentoml
import openllm
from .configuration_opt import DEFAULT_PROMPT_TEMPLATE
from ..._prompt import default_formatter
from ...utils import generate_labels
if t.TYPE_CHECKING:
import torch
import transformers
import torch, transformers
else:
torch = openllm.utils.LazyLoader("torch", globals(), "torch")
transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
torch, transformers = openllm.utils.LazyLoader("torch", globals(), "torch"), openllm.utils.LazyLoader("transformers", globals(), "transformers")
logger = logging.getLogger(__name__)
class OPT(openllm.LLM["transformers.OPTForCausalLM", "transformers.GPT2Tokenizer"]):
__openllm_internal__ = True
def llm_post_init(self):
self.dtype = torch.float16 if torch.cuda.is_available() else torch.float32
def llm_post_init(self): self.dtype = torch.float16 if torch.cuda.is_available() else torch.float32
@property
def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]] | None:
model_kwds = {
"device_map": "auto" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None,
"torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32,
}
tokenizer_kwds = {
"padding_side": "left",
"truncation_side": "left",
}
return model_kwds, tokenizer_kwds
def import_model(self, *args: t.Any, trust_remote_code: bool = False, **attrs: t.Any) -> bentoml.Model:
_, tokenizer_attrs = self.llm_parameters
torch_dtype = attrs.pop("torch_dtype", self.dtype)
config = transformers.AutoConfig.from_pretrained(self.model_id)
tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_id, **tokenizer_attrs)
tokenizer.pad_token_id = config.pad_token_id
model = t.cast(
"transformers.OPTForCausalLM",
transformers.AutoModelForCausalLM.from_pretrained(
self.model_id, torch_dtype=torch_dtype, trust_remote_code=trust_remote_code, **attrs
),
)
return bentoml.transformers.save_model(
self.tag,
model,
custom_objects={"tokenizer": tokenizer},
labels=generate_labels(self),
)
def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: return {"device_map": "auto" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None, "torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32}, {"padding_side": "left", "truncation_side": "left"}
def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.OPTForCausalLM:
torch_dtype = attrs.pop("torch_dtype", self.dtype)
model: transformers.OPTForCausalLM = transformers.AutoModelForCausalLM.from_pretrained(
bentoml.transformers.get(self.tag).path, *args, torch_dtype=torch_dtype, **attrs
)
model = transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, *args, torch_dtype=torch_dtype, **attrs)
return model
def sanitize_parameters(
self,
prompt: str,
max_new_tokens: int | None = None,
temperature: float | None = None,
top_k: int | None = None,
num_return_sequences: int | None = None,
use_default_prompt_template: bool = False,
**attrs: t.Any,
) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_k: int | None = None, num_return_sequences: int | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
if use_default_prompt_template:
template_variables = default_formatter.extract_template_variables(DEFAULT_PROMPT_TEMPLATE)
prompt_variables = {k: v for k, v in attrs.items() if k in template_variables}
if "instruction" in prompt_variables:
raise RuntimeError(
"'instruction' should be passed as the first argument "
"instead of kwargs when 'use_default_prompt_template=True'"
)
try:
prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
except KeyError as e:
raise RuntimeError(
f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. "
"Use 'use_default_prompt_template=False' to disable the default prompt template."
) from None
else:
prompt_text = prompt
generation_config = {
"max_new_tokens": max_new_tokens,
"temperature": temperature,
"top_k": top_k,
"num_return_sequences": num_return_sequences,
}
return prompt_text, generation_config, {}
if "instruction" in prompt_variables: raise RuntimeError("'instruction' should be passed as the first argument instead of kwargs when 'use_default_prompt_template=True'")
try: prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
except KeyError as e: raise RuntimeError(f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. Use 'use_default_prompt_template=False' to disable the default prompt template.") from None
else: prompt_text = prompt
return prompt_text, {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_k": top_k, "num_return_sequences": num_return_sequences}, {}
def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **attrs: t.Any) -> str:
if len(generation_result) == 1:
if self.config.format_outputs:
logger.warning("'format_outputs' doesn't have any effect when 'num_return_sequences=1'")
return generation_result[0]
if self.config.format_outputs:
return "Generated result:\n" + "\n -".join(generation_result)
else:
return "\n".join(generation_result)
if len(generation_result) == 1: return generation_result[0]
if self.config.format_outputs: return "Generated result:\n" + "\n -".join(generation_result)
else: return "\n".join(generation_result)
def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
with torch.inference_mode():
inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
generated_tensors = self.model.generate(
**inputs,
do_sample=True,
generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
)
return self.tokenizer.batch_decode(generated_tensors, skip_special_tokens=True)
with torch.inference_mode(): return self.tokenizer.batch_decode(self.model.generate(**self.tokenizer(prompt, return_tensors="pt").to(self.device), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()), skip_special_tokens=True)

View File

@@ -11,107 +11,36 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
import logging
import typing as t
import bentoml
import openllm
from .configuration_opt import DEFAULT_PROMPT_TEMPLATE
from ..._prompt import default_formatter
from ...utils import generate_labels
if t.TYPE_CHECKING:
import transformers
else:
transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
if t.TYPE_CHECKING: import transformers
else: transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
logger = logging.getLogger(__name__)
class TFOPT(openllm.LLM["transformers.TFOPTForCausalLM", "transformers.GPT2Tokenizer"]):
__openllm_internal__ = True
@property
def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]] | None:
tokenizer_kwds = {
"padding_side": "left",
"truncation_side": "left",
}
return {}, tokenizer_kwds
def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: return {}, {"padding_side": "left", "truncation_side": "left"}
def import_model(self, *args: t.Any, trust_remote_code: bool = False, **attrs: t.Any) -> bentoml.Model:
_, tokenizer_attrs = self.llm_parameters
config = transformers.AutoConfig.from_pretrained(self.model_id)
tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_id, **tokenizer_attrs)
config, tokenizer = transformers.AutoConfig.from_pretrained(self.model_id), transformers.AutoTokenizer.from_pretrained(self.model_id, **self.llm_parameters[-1])
tokenizer.pad_token_id = config.pad_token_id
model: transformers.TFOPTForCausalLM = transformers.TFOPTForCausalLM.from_pretrained(
self.model_id, trust_remote_code=trust_remote_code, **attrs
)
return bentoml.transformers.save_model(
self.tag,
model,
custom_objects={"tokenizer": tokenizer},
labels=generate_labels(self),
)
def sanitize_parameters(
self,
prompt: str,
max_new_tokens: int | None = None,
temperature: float | None = None,
top_k: int | None = None,
num_return_sequences: int | None = None,
use_default_prompt_template: bool = False,
**attrs: t.Any,
) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
return bentoml.transformers.save_model(self.tag, transformers.TFOPTForCausalLM.from_pretrained(self.model_id, trust_remote_code=trust_remote_code, **attrs), custom_objects={"tokenizer": tokenizer}, labels=generate_labels(self))
def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_k: int | None = None, num_return_sequences: int | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
if use_default_prompt_template:
template_variables = default_formatter.extract_template_variables(DEFAULT_PROMPT_TEMPLATE)
prompt_variables = {k: v for k, v in attrs.items() if k in template_variables}
if "instruction" in prompt_variables:
raise RuntimeError(
"'instruction' should be passed as the first argument "
"instead of kwargs when 'use_default_prompt_template=True'"
)
try:
prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
except KeyError as e:
raise RuntimeError(
f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. "
"Use 'use_default_prompt_template=False' to disable the default prompt template."
) from None
else:
prompt_text = prompt
generation_config = {
"max_new_tokens": max_new_tokens,
"temperature": temperature,
"top_k": top_k,
"num_return_sequences": num_return_sequences,
}
return prompt_text, generation_config, {}
if "instruction" in prompt_variables: raise RuntimeError("'instruction' should be passed as the first argument instead of kwargs when 'use_default_prompt_template=True'")
try: prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
except KeyError as e: raise RuntimeError(f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. Use 'use_default_prompt_template=False' to disable the default prompt template.") from None
else: prompt_text = prompt
return prompt_text, {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_k": top_k, "num_return_sequences": num_return_sequences}, {}
def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **attrs: t.Any) -> str:
if len(generation_result) == 1:
if self.config.format_outputs:
logger.warning("'format_outputs' doesn't have any effect when 'num_return_sequences=1'")
return generation_result[0]
if self.config.format_outputs:
return "Generated result:\n" + "\n -".join(generation_result)
else:
return "\n".join(generation_result)
def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
input_ids = self.tokenizer(prompt, return_tensors="tf")
generated_tensors = self.model.generate(
**input_ids,
do_sample=True,
generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
)
return self.tokenizer.batch_decode(generated_tensors, skip_special_tokens=True)
if len(generation_result) == 1: return generation_result[0]
if self.config.format_outputs: return "Generated result:\n" + "\n -".join(generation_result)
else: return "\n".join(generation_result)
def generate(self, prompt: str, **attrs: t.Any) -> list[str]: return self.tokenizer.batch_decode(self.model.generate(**self.tokenizer(prompt, return_tensors="tf"), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()), skip_special_tokens=True)

View File

@@ -11,40 +11,23 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
import sys
import typing as t
from ...exceptions import MissingDependencyError
from ...utils import LazyModule
from ...utils import is_torch_available
_import_structure: dict[str, list[str]] = {
"configuration_stablelm": ["StableLMConfig", "START_STABLELM_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"],
}
_import_structure: dict[str, list[str]] = {"configuration_stablelm": ["StableLMConfig", "START_STABLELM_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"]}
try:
if not is_torch_available():
raise MissingDependencyError
except MissingDependencyError:
pass
else:
_import_structure["modeling_stablelm"] = ["StableLM"]
if not is_torch_available(): raise MissingDependencyError
except MissingDependencyError: pass
else: _import_structure["modeling_stablelm"] = ["StableLM"]
if t.TYPE_CHECKING:
from .configuration_stablelm import DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE
from .configuration_stablelm import START_STABLELM_COMMAND_DOCSTRING as START_STABLELM_COMMAND_DOCSTRING
from .configuration_stablelm import StableLMConfig as StableLMConfig
try:
if not is_torch_available():
raise MissingDependencyError
except MissingDependencyError:
pass
else:
from .modeling_stablelm import StableLM as StableLM
else:
import sys
sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
if not is_torch_available(): raise MissingDependencyError
except MissingDependencyError: pass
else: from .modeling_stablelm import StableLM as StableLM
else: sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

View File

@@ -12,10 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
import openllm
class StableLMConfig(openllm.LLMConfig):
"""StableLM-Base-Alpha is a suite of 3B and 7B parameter decoder-only language models.
@@ -30,7 +27,6 @@ class StableLMConfig(openllm.LLMConfig):
and [StableLM-base's model card](https://huggingface.co/stabilityai/stablelm-base-alpha-7b)
for more information.
"""
__config__ = {
"name_type": "lowercase",
"url": "https://github.com/Stability-AI/StableLM",
@@ -43,14 +39,11 @@ class StableLMConfig(openllm.LLMConfig):
"stabilityai/stablelm-base-alpha-7b",
],
}
class GenerationConfig:
temperature: float = 0.9
max_new_tokens: int = 128
top_k: int = 0
top_p: float = 0.9
START_STABLELM_COMMAND_DOCSTRING = """\
Run a LLMServer for StableLM model.
@@ -70,12 +63,10 @@ or provide `--model-id` flag when running ``openllm start stablelm``:
\b
$ openllm start stablelm --model-id 'stabilityai/stablelm-tuned-alpha-3b'
"""
SYSTEM_PROMPT = """<|SYSTEM|># StableLM Tuned (Alpha version)
- StableLM is a helpful and harmless open-source AI language model developed by StabilityAI.
- StableLM is excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user.
- StableLM is more than just an information source, StableLM is also able to write poetry, short stories, and make jokes.
- StableLM will refuse to participate in anything that could harm a human.
"""
DEFAULT_PROMPT_TEMPLATE = """{system_prompt}<|USER|>{instruction}<|ASSISTANT|>"""

View File

@@ -14,91 +14,27 @@
from __future__ import annotations
import logging
import typing as t
import openllm
from .configuration_stablelm import DEFAULT_PROMPT_TEMPLATE
from .configuration_stablelm import SYSTEM_PROMPT
from ..._prompt import default_formatter
if t.TYPE_CHECKING:
import transformers # noqa
import torch
else:
transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
torch = openllm.utils.LazyLoader("torch", globals(), "torch")
if t.TYPE_CHECKING: import transformers, torch
else: transformers, torch = openllm.utils.LazyLoader("transformers", globals(), "transformers"), openllm.utils.LazyLoader("torch", globals(), "torch")
logger = logging.getLogger(__name__)
class StableLM(openllm.LLM["transformers.GPTNeoXForCausalLM", "transformers.GPTNeoXTokenizerFast"]):
__openllm_internal__ = True
def llm_post_init(self):
self.bettertransformer = True if not torch.cuda.is_available() else False
def llm_post_init(self): self.bettertransformer = True if not torch.cuda.is_available() else False
@property
def import_kwargs(self):
model_kwds = {"torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32}
tokenizer_kwds: dict[str, t.Any] = {}
return model_kwds, tokenizer_kwds
def sanitize_parameters(
self,
prompt: str,
temperature: float | None = None,
max_new_tokens: int | None = None,
top_k: int | None = None,
top_p: float | None = None,
use_default_prompt_template: bool = False,
**attrs: t.Any,
) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: return {"torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32}, {}
def sanitize_parameters(self, prompt: str, temperature: float | None = None, max_new_tokens: int | None = None, top_k: int | None = None, top_p: float | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
if "tuned" in self._model_id and use_default_prompt_template:
prompt_variables = {
k: v
for k, v in attrs.items()
if k in default_formatter.extract_template_variables(DEFAULT_PROMPT_TEMPLATE)
}
if "instruction" in prompt_variables:
raise RuntimeError(
"'instruction' should be passed as the first argument "
"instead of kwargs when 'use_default_prompt_template=True'"
)
prompt_variables = {k: v for k, v in attrs.items() if k in default_formatter.extract_template_variables(DEFAULT_PROMPT_TEMPLATE)}
if "instruction" in prompt_variables: raise RuntimeError("'instruction' should be passed as the first argument instead of kwargs when 'use_default_prompt_template=True'")
system_prompt = prompt_variables.pop("system_prompt", SYSTEM_PROMPT)
prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, system_prompt=system_prompt)
else:
prompt_text = prompt
generation_config = {
"max_new_tokens": max_new_tokens,
"temperature": temperature,
"top_k": top_k,
"top_p": top_p,
}
return prompt_text, generation_config, {}
def postprocess_generate(self, prompt: str, generation_result: list[str], **_: t.Any) -> str:
return generation_result[0]
else: prompt_text = prompt
return prompt_text, {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_k": top_k, "top_p": top_p}, {}
def postprocess_generate(self, prompt: str, generation_result: list[str], **_: t.Any) -> str: return generation_result[0]
def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
from ..._generation import StopOnTokens
generation_kwargs = {
"do_sample": True,
"generation_config": self.config.model_construct_env(**attrs).to_generation_config(),
"pad_token_id": self.tokenizer.eos_token_id,
"stopping_criteria": transformers.StoppingCriteriaList([StopOnTokens()]),
}
inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
with torch.inference_mode():
if torch.cuda.is_available():
with torch.autocast("cuda", torch.float16):
tokens = self.model.generate(**inputs, **generation_kwargs)
else:
tokens = self.model.generate(**inputs, **generation_kwargs)
return [self.tokenizer.decode(tokens[0], skip_special_tokens=True)]
with torch.inference_mode(): return [self.tokenizer.decode(self.model.generate(**self.tokenizer(prompt, return_tensors="pt").to(self.device), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config(), pad_token_id=self.tokenizer.eos_token_id, stopping_criteria=transformers.StoppingCriteriaList([StopOnTokens()]))[0], skip_special_tokens=True)]

View File

@@ -11,40 +11,23 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
import sys
import typing as t
from ...exceptions import MissingDependencyError
from ...utils import LazyModule
from ...utils import is_torch_available
_import_structure: dict[str, list[str]] = {
"configuration_starcoder": ["StarCoderConfig", "START_STARCODER_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"],
}
_import_structure: dict[str, list[str]] = {"configuration_starcoder": ["StarCoderConfig", "START_STARCODER_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"]}
try:
if not is_torch_available():
raise MissingDependencyError
except MissingDependencyError:
pass
else:
_import_structure["modeling_starcoder"] = ["StarCoder"]
if not is_torch_available(): raise MissingDependencyError
except MissingDependencyError: pass
else: _import_structure["modeling_starcoder"] = ["StarCoder"]
if t.TYPE_CHECKING:
from .configuration_starcoder import DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE
from .configuration_starcoder import START_STARCODER_COMMAND_DOCSTRING as START_STARCODER_COMMAND_DOCSTRING
from .configuration_starcoder import StarCoderConfig as StarCoderConfig
try:
if not is_torch_available():
raise MissingDependencyError
except MissingDependencyError:
pass
else:
from .modeling_starcoder import StarCoder as StarCoder
else:
import sys
sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
if not is_torch_available(): raise MissingDependencyError
except MissingDependencyError: pass
else: from .modeling_starcoder import StarCoder as StarCoder
else: sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

View File

@@ -12,10 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
import openllm
class StarCoderConfig(openllm.LLMConfig):
"""The StarCoder models are 15.5B parameter models trained on 80+ programming languages from [The Stack (v1.2)](https://huggingface.co/datasets/bigcode/the-stack), with opt-out requests excluded.
@@ -25,7 +22,6 @@ class StarCoderConfig(openllm.LLMConfig):
Refer to [StarCoder's model card](https://huggingface.co/bigcode/starcoder) for more information.
"""
__config__ = {
"name_type": "lowercase",
"requires_gpu": True,
@@ -36,7 +32,6 @@ class StarCoderConfig(openllm.LLMConfig):
"default_id": "bigcode/starcoder",
"model_ids": ["bigcode/starcoder", "bigcode/starcoderbase"],
}
class GenerationConfig:
temperature: float = 0.2
max_new_tokens: int = 256
@@ -45,8 +40,6 @@ class StarCoderConfig(openllm.LLMConfig):
top_p: float = 0.95
pad_token_id: int = 49152
repetition_penalty: float = 1.2
START_STARCODER_COMMAND_DOCSTRING = """\
Run a LLMServer for StarCoder model.
@@ -66,5 +59,4 @@ or provide `--model-id` flag when running ``openllm start starcoder``:
\b
$ openllm start starcoder --model-id 'bigcode/starcoder'
"""
DEFAULT_PROMPT_TEMPLATE = """{instruction}"""

View File

@@ -14,143 +14,53 @@
from __future__ import annotations
import logging
import typing as t
import bentoml
import openllm
from ...utils import generate_labels
if t.TYPE_CHECKING:
import torch
import transformers
import torch, transformers
else:
torch = openllm.utils.LazyLoader("torch", globals(), "torch")
transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
torch, transformers = openllm.utils.LazyLoader("torch", globals(), "torch"), openllm.utils.LazyLoader("transformers", globals(), "transformers")
logger = logging.getLogger(__name__)
FIM_PREFIX = "<fim-prefix>"
FIM_MIDDLE = "<fim-middle>"
FIM_SUFFIX = "<fim-suffix>"
FIM_PAD = "<fim-pad>"
EOD = "<|endoftext|>"
FIM_INDICATOR = "<FILL_HERE>"
FIM_PREFIX, FIM_MIDDLE, FIM_SUFFIX, FIM_PAD, EOD, FIM_INDICATOR = "<fim-prefix>", "<fim-middle>", "<fim-suffix>", "<fim-pad>", "<|endoftext|>", "<FILL_HERE>"
class StarCoder(openllm.LLM["transformers.GPTBigCodeForCausalLM", "transformers.GPT2TokenizerFast"]):
__openllm_internal__ = True
@property
def import_kwargs(self):
model_kwds = {
"device_map": "auto" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None,
"torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32,
}
tokenizer_kwds = {"padding_side": "left"}
return model_kwds, tokenizer_kwds
def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: return {"device_map": "auto" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None, "torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32}, {"padding_side": "left"}
def import_model(self, *args: t.Any, trust_remote_code: bool = False, **attrs: t.Any) -> bentoml.Model:
_, tokenizer_attrs = self.llm_parameters
torch_dtype = attrs.pop("torch_dtype", torch.float16)
device_map = attrs.pop("device_map", "auto")
tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_id, **tokenizer_attrs)
tokenizer.add_special_tokens(
{
"additional_special_tokens": [EOD, FIM_PREFIX, FIM_MIDDLE, FIM_SUFFIX, FIM_PAD],
"pad_token": EOD,
}
)
model = transformers.AutoModelForCausalLM.from_pretrained(
self.model_id, torch_dtype=torch_dtype, device_map=device_map, **attrs
)
try:
return bentoml.transformers.save_model(
self.tag,
model,
custom_objects={"tokenizer": tokenizer},
labels=generate_labels(self),
)
finally:
# NOTE: We need to free the cache after saving here so that we can load it back later on.
torch.cuda.empty_cache()
def sanitize_parameters(
self,
prompt: str,
temperature: float | None = None,
top_p: float | None = None,
max_new_tokens: int | None = None,
repetition_penalty: float | None = None,
**attrs: t.Any,
) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
fim_mode = FIM_INDICATOR in prompt
prefix, suffix = None, None
torch_dtype, device_map = attrs.pop("torch_dtype", torch.float16), attrs.pop("device_map", "auto")
tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_id, **self.llm_parameters[-1])
tokenizer.add_special_tokens({"additional_special_tokens": [EOD, FIM_PREFIX, FIM_MIDDLE, FIM_SUFFIX, FIM_PAD], "pad_token": EOD})
model = transformers.AutoModelForCausalLM.from_pretrained(self.model_id, torch_dtype=torch_dtype, device_map=device_map, **attrs)
try: return bentoml.transformers.save_model(self.tag, model, custom_objects={"tokenizer": tokenizer}, labels=generate_labels(self))
finally: torch.cuda.empty_cache()
def sanitize_parameters(self, prompt: str, temperature: float | None = None, top_p: float | None = None, max_new_tokens: int | None = None, repetition_penalty: float | None = None, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
fim_mode, prefix, suffix = FIM_INDICATOR in prompt, None, None
if fim_mode:
try:
prefix, suffix = prompt.split(FIM_INDICATOR)
except Exception as err:
logger.error("Error while processing prompt with FIM mode:\n", exc_info=err)
raise ValueError(f"Only one {FIM_INDICATOR} allowed in prompt") from err
try: prefix, suffix = prompt.split(FIM_INDICATOR)
except Exception as err: raise ValueError(f"Only one {FIM_INDICATOR} allowed in prompt") from err
prompt_text = f"{FIM_PREFIX}{prefix}{FIM_SUFFIX}{suffix}{FIM_MIDDLE}"
else:
prompt_text = prompt
generation_config = {
"temperature": temperature,
"top_p": top_p,
"max_new_tokens": max_new_tokens,
"repetition_penalty": repetition_penalty,
# XXX: This value is currently a hack, need more investigate why the
# default starcoder doesn't include the same value as santacoder EOD
"pad_token_id": 49152,
**attrs,
}
return prompt_text, generation_config, {}
def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str:
return generation_result[0]
else: prompt_text = prompt
# XXX: This value for pad_token_id is currently a hack, need more investigate why the
# default starcoder doesn't include the same value as santacoder EOD
return prompt_text, {"temperature": temperature, "top_p": top_p, "max_new_tokens": max_new_tokens, "repetition_penalty": repetition_penalty, "pad_token_id": 49152, **attrs}, {}
def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str: return generation_result[0]
def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
with torch.inference_mode():
inputs = t.cast("torch.Tensor", self.tokenizer.encode(prompt, return_tensors="pt")).to(self.device)
result_tensor = self.model.generate(
inputs,
do_sample=True,
pad_token_id=self.tokenizer.eos_token_id,
# eos_token_id=self.tokenizer.convert_tokens_to_ids("<|end|>"), # NOTE: this is for finetuning starcoder
generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
)
# eos_token_id=self.tokenizer.convert_tokens_to_ids("<|end|>"), # NOTE: this is for finetuning starcoder
# NOTE: support fine-tuning starcoder
result_tensor = self.model.generate(self.tokenizer.encode(prompt, return_tensors="pt").to(self.device), do_sample=True, pad_token_id=self.tokenizer.eos_token_id, generation_config=self.config.model_construct_env(**attrs).to_generation_config())
# TODO: We will probably want to return the tokenizer here so that we can manually process this
# return (skip_special_tokens=False, clean_up_tokenization_spaces=False))
return self.tokenizer.batch_decode(
result_tensor[0],
skip_special_tokens=True,
clean_up_tokenization_spaces=True,
)
def generate_one(
self, prompt: str, stop: list[str], **preprocess_generate_kwds: t.Any
) -> list[dict[t.Literal["generated_text"], str]]:
return self.tokenizer.batch_decode(result_tensor[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
def generate_one(self, prompt: str, stop: list[str], **preprocess_generate_kwds: t.Any) -> list[dict[t.Literal["generated_text"], str]]:
from ..._generation import StopSequenceCriteria
max_new_tokens = preprocess_generate_kwds.pop("max_new_tokens", 200)
encoded_inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
src_len = encoded_inputs["input_ids"].shape[1]
stopping_criteria = preprocess_generate_kwds.pop("stopping_criteria", transformers.StoppingCriteriaList([]))
max_new_tokens, encoded_inputs = preprocess_generate_kwds.pop("max_new_tokens", 200), self.tokenizer(prompt, return_tensors="pt").to(self.device)
src_len, stopping_criteria = encoded_inputs["input_ids"].shape[1], preprocess_generate_kwds.pop("stopping_criteria", transformers.StoppingCriteriaList([]))
stopping_criteria.append(StopSequenceCriteria(stop, self.tokenizer))
outputs = self.model.generate(
encoded_inputs["input_ids"], max_new_tokens=max_new_tokens, stopping_criteria=stopping_criteria
)
result = self.tokenizer.decode(outputs[0].tolist()[src_len:])
result = self.tokenizer.decode(self.model.generate(encoded_inputs["input_ids"], max_new_tokens=max_new_tokens, stopping_criteria=stopping_criteria)[0].tolist()[src_len:])
# Inference API returns the stop sequence
for stop_seq in stop:
if result.endswith(stop_seq):
result = result[: -len(stop_seq)]
if result.endswith(stop_seq): result = result[: -len(stop_seq)]
return [{"generated_text": result}]

View File

@@ -193,6 +193,10 @@ def import_model(
if _tokenizer.pad_token is None:
_tokenizer.pad_token = _tokenizer.eos_token
# NOTE: quick hack to set the loaded into llm object
object.__setattr__(llm, "__llm_model__", model)
object.__setattr__(llm, "__llm_tokenizer__", _tokenizer)
try:
with bentoml.models.create(
llm.tag,
@@ -210,9 +214,7 @@ def import_model(
else None,
metadata=metadata,
) as bentomodel:
save_pretrained(
llm, bentomodel.path, model=model, tokenizer=_tokenizer, safe_serialization=safe_serialisation
)
save_pretrained(llm, bentomodel.path, safe_serialization=safe_serialisation)
return bentomodel
finally:
# NOTE: We need to free up the cache after importing the model
@@ -296,12 +298,12 @@ def load_model(llm: openllm.LLM[M, t.Any], *decls: t.Any, **attrs: t.Any) -> M:
or getattr(model, "is_loaded_in_4bit", False)
or getattr(model, "is_quantized", False)
)
if torch.cuda.is_available() and torch.cuda.device_count() == 1 and not loaded_in_kbit:
if torch.cuda.is_available() and not loaded_in_kbit:
try:
model = model.to("cuda")
except torch.cuda.OutOfMemoryError as err:
raise RuntimeError(
f"Failed to fit {llm.config['model_name']} with model_id '{llm.model_id}' to CUDA.\nNote: You can try out '--quantize int8 | int4' for dynamic quantization."
f"Failed to convert {llm.config['model_name']} with model_id '{llm.model_id}' to CUDA.\nNote: You can try out '--quantize int8 | int4' for dynamic quantization."
) from err
if llm.bettertransformer and llm.__llm_implementation__ == "pt" and not isinstance(model, _transformers.Pipeline):
# BetterTransformer is currently only supported on PyTorch.
@@ -314,27 +316,19 @@ def load_model(llm: openllm.LLM[M, t.Any], *decls: t.Any, **attrs: t.Any) -> M:
def save_pretrained(
llm: openllm.LLM[M, T],
save_directory: str,
model: M | None = None,
tokenizer: T | None = None,
is_main_process: bool = True,
state_dict: DictStrAny | None = None,
save_function: t.Callable[..., None] | None = None,
push_to_hub: bool = False,
max_shard_size: int | str = "10GB",
max_shard_size: int | str = "2GB",
safe_serialization: bool = False,
variant: str | None = None,
**attrs: t.Any,
) -> None:
"""Light wrapper around ``transformers.PreTrainedTokenizer.save_pretrained`` and ``transformers.PreTrainedModel.save_pretrained``."""
model = first_not_none(model, default=llm.__llm_model__)
tokenizer = first_not_none(tokenizer, default=llm.__llm_tokenizer__)
save_function = first_not_none(save_function, default=torch.save)
model_save_attrs, tokenizer_save_attrs = normalize_attrs_to_model_tokenizer_pair(**attrs)
safe_serialization = safe_serialization or llm._serialisation_format == "safetensors"
if model is None or tokenizer is None:
raise RuntimeError("Failed to find loaded model or tokenizer to save to local store.")
if llm._quantize_method == "gptq":
if not is_autogptq_available():
raise OpenLLMException(
@@ -342,11 +336,11 @@ def save_pretrained(
)
if llm.config["model_type"] != "causal_lm":
raise OpenLLMException(f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})")
model.save_quantized(save_directory, use_safetensors=safe_serialization)
elif isinstance(model, _transformers.Pipeline):
model.save_pretrained(save_directory, safe_serialization=safe_serialization)
llm.model.save_quantized(save_directory, use_safetensors=safe_serialization)
elif isinstance(llm.model, _transformers.Pipeline):
llm.model.save_pretrained(save_directory, safe_serialization=safe_serialization)
else:
model.save_pretrained(
llm.model.save_pretrained(
save_directory,
is_main_process=is_main_process,
state_dict=state_dict,
@@ -357,4 +351,4 @@ def save_pretrained(
variant=variant,
**model_save_attrs,
)
tokenizer.save_pretrained(save_directory, push_to_hub=push_to_hub, **tokenizer_save_attrs)
llm.tokenizer.save_pretrained(save_directory, push_to_hub=push_to_hub, **tokenizer_save_attrs)

View File

@@ -90,6 +90,10 @@ class ClientMeta(t.Generic[T]):
@property
def _hf_agent(self) -> transformers.HfAgent:
if not self.supports_hf_agent:
raise openllm.exceptions.OpenLLMException(
f"{self.model_name} ({self.framework}) does not support running HF agent."
)
if self.__agent__ is None:
if not openllm.utils.is_transformers_supports_agent():
raise RuntimeError(
@@ -130,6 +134,16 @@ class ClientMeta(t.Generic[T]):
def configuration(self) -> dict[str, t.Any]:
raise NotImplementedError
@property
@abstractmethod
def supports_embeddings(self) -> bool:
raise NotImplementedError
@property
@abstractmethod
def supports_hf_agent(self) -> bool:
raise NotImplementedError
@property
def llm(self) -> openllm.LLM[t.Any, t.Any]:
if self.__llm__ is None:

View File

@@ -80,6 +80,20 @@ class GrpcClientMixin:
except KeyError:
raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None
@property
def supports_embeddings(self) -> bool:
try:
return self._metadata.json.struct_value.fields["supports_embeddings"].bool_value
except KeyError:
raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None
@property
def supports_hf_agent(self) -> bool:
try:
return self._metadata.json.struct_value.fields["supports_hf_agent"].bool_value
except KeyError:
raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None
def postprocess(self, result: Response | dict[str, t.Any]) -> openllm.GenerationOutput:
if isinstance(result, dict):
return openllm.GenerationOutput(**result)

View File

@@ -77,6 +77,20 @@ class HTTPClientMixin:
except KeyError:
raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None
@property
def supports_embeddings(self) -> bool:
try:
return self._metadata.get("supports_embeddings", False)
except KeyError:
raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None
@property
def supports_hf_agent(self) -> bool:
try:
return self._metadata.get("supports_hf_agent", False)
except KeyError:
raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None
def postprocess(self, result: dict[str, t.Any]) -> openllm.GenerationOutput:
return openllm.GenerationOutput(**result)

View File

@@ -12,6 +12,7 @@ from typing import Protocol
from typing import Sequence
from typing import Tuple
from typing import Type
from typing import TypeAlias
from typing import TypeGuard
from typing import TypeVar
from typing import Union
@@ -40,16 +41,16 @@ __copyright__: str
_T = TypeVar("_T")
_C = TypeVar("_C", bound=type)
_P = ParamSpec("_P")
_EqOrderType = Union[bool, Callable[[Any], Any]]
_ValidatorType = Callable[[Any, Attribute[_T], _T], Any]
_ConverterType = Callable[[Any], Any]
_FilterType = Callable[[Attribute[_T], _T], bool]
_ReprType = Callable[[Any], str]
_ReprArgType = Union[bool, _ReprType]
_OnSetAttrType = Callable[[Any, Attribute[Any], Any], Any]
_OnSetAttrArgType = Union[_OnSetAttrType, List[_OnSetAttrType], setters._NoOpType]
_FieldTransformer = Callable[[type, List[Attribute[Any]]], List[Attribute[Any]]]
_ValidatorArgType = Union[_ValidatorType[_T], Sequence[_ValidatorType[_T]]]
_EqOrderType: TypeAlias = Union[bool, Callable[[Any], Any]]
_ValidatorType: TypeAlias = Callable[[Any, Attribute[_T], _T], Any]
_ConverterType: TypeAlias = Callable[[Any], Any]
_FilterType: TypeAlias = Callable[[Attribute[_T], _T], bool]
_ReprType: TypeAlias = Callable[[Any], str]
_ReprArgType: TypeAlias = Union[bool, _ReprType]
_OnSetAttrType: TypeAlias = Callable[[Any, Attribute[Any], Any], Any]
_OnSetAttrArgType: TypeAlias = Union[_OnSetAttrType, List[_OnSetAttrType], setters._NoOpType]
_FieldTransformer: TypeAlias = Callable[[type, List[Attribute[Any]]], List[Attribute[Any]]]
_ValidatorArgType: TypeAlias = Union[_ValidatorType[_T], Sequence[_ValidatorType[_T]]]
class AttrsInstance(AttrsInstance_, Protocol): ...
@@ -535,8 +536,10 @@ def get_run_validators() -> bool: ...
# aliases --
s = attributes = attrs
ib = attr = attrib
s = attrs
attributes = attrs
ib = attrib
attr = attrib
dataclass = attrs # Technically, partial(attrs, auto_attribs=True) ;)
class ReprProtocol(Protocol):

3
typings/attr/_cmp.pyi generated
View File

@@ -1,8 +1,9 @@
from typing import Any
from typing import Callable
from typing import Optional
from typing import TypeAlias
_CompareWithType = Callable[[Any, Any], bool]
_CompareWithType: TypeAlias = Callable[[Any, Any], bool]
def cmp_using(
eq: Optional[_CompareWithType] = ...,

View File

@@ -1,5 +1,5 @@
from typing import Any
import threading
from typing import Any
def set_closure_cell(cell: Any, value: Any) -> None: ...

View File

@@ -1,4 +1,4 @@
from . import _CountingAttr as _CountingAttr
from . import _make_repr as _make_repr
from . import _make_init as _make_init
from . import _make_repr as _make_repr
from . import _transform_attrs as _transform_attrs

View File

@@ -15,7 +15,7 @@ import click
_R = TypeVar("_R")
_T = TypeVar("_T")
AnyCallable = Callable[..., Any]
AnyCallable: TypeAlias = Callable[..., Any]
Decorator: TypeAlias = Callable[[_T], _T]
_FC = TypeVar("_FC", bound=Union[AnyCallable, click.Command])

View File

@@ -2,6 +2,7 @@ from typing import Any
from typing import Dict
from typing import List
from typing import Tuple
from typing import TypeAlias
from typing import Union
from .strategy.core import StrategyList
@@ -9,7 +10,7 @@ from .strategy.dict import DictStrategies
from .strategy.list import ListStrategies
from .strategy.set import SetStrategies
ConfigDictType = Dict[str, Any]
ConfigDictType: TypeAlias = Dict[str, Any]
class Merger:
PROVIDED_TYPE_STRATEGIES: Dict[type, Union[ListStrategies, DictStrategies, SetStrategies]] = ...

View File

@@ -2,9 +2,10 @@ from typing import Any
from typing import Callable
from typing import List
from typing import Optional
from typing import TypeAlias
from typing import Union
_StringOrFunction = Union[str, Callable[..., Any]]
_StringOrFunction: TypeAlias = Union[str, Callable[..., Any]]
STRATEGY_END: object = ...
class StrategyList:

View File

@@ -1,6 +1,6 @@
from collections.abc import Generator
from typing import Any
from typing import Dict
from collections.abc import Generator
from _typeshed import Incomplete

View File

@@ -1,4 +1,5 @@
from typing import Any
from _typeshed import Incomplete
class JupytextFormatError(ValueError): ...