mirror of
https://github.com/bentoml/OpenLLM.git
synced 2026-03-10 19:22:10 -04:00
feat(service): provisional API (#133)
This commit is contained in:
@@ -20,7 +20,7 @@ ci:
|
||||
exclude: '.*\.(css|js|svg)$'
|
||||
repos:
|
||||
- repo: https://github.com/astral-sh/ruff-pre-commit
|
||||
rev: 'v0.0.278'
|
||||
rev: 'v0.0.280'
|
||||
hooks:
|
||||
- id: ruff
|
||||
args: [--exit-non-zero-on-fix, --show-fixes]
|
||||
@@ -28,6 +28,8 @@ repos:
|
||||
rev: 23.7.0
|
||||
hooks:
|
||||
- id: black-jupyter
|
||||
args: [--config=pyproject.toml]
|
||||
exclude: (?x)^(src/openllm/models/.*)$
|
||||
- repo: https://github.com/econchick/interrogate
|
||||
rev: 1.5.0
|
||||
hooks:
|
||||
@@ -50,7 +52,6 @@ repos:
|
||||
tools/.*|
|
||||
tests/.*|
|
||||
src/openllm/playground/.*|
|
||||
src/openllm/models/.*|
|
||||
.github/.*
|
||||
)$
|
||||
additional_dependencies: ["mypy==1.4.1", "types-tabulate", "types-Deprecated", "types-PyYAML", "types-decorator", "types-protobuf", "types-python-dateutil", "types-requests", "types-setuptools", "types-six", "types-ujson", "pandas-stubs", "types-Pillow", "types-Pygments", "types-appdirs", "types-colorama", "types-google-cloud-ndb", "types-jsonschema", "types-psutil", "types-pywin32", "types-tqdm", "types-openpyxl"]
|
||||
|
||||
@@ -299,7 +299,7 @@ pip install "openllm[mpt]"
|
||||
<tr>
|
||||
|
||||
<td><a href=https://huggingface.co/docs/transformers/model_doc/opt>opt</a></td>
|
||||
<td><a href=https://huggingface.co/docs/transformers/main/model_doc/opt#transformers.MPTForCausalLM><code>MPTForCausalLM</code></a></td>
|
||||
<td><a href=https://huggingface.co/docs/transformers/main/model_doc/opt#transformers.OPTForCausalLM><code>OPTForCausalLM</code></a></td>
|
||||
<td>✅</td>
|
||||
<td>✅</td>
|
||||
<td>
|
||||
|
||||
14
changelog.d/133.feature.md
Normal file
14
changelog.d/133.feature.md
Normal file
@@ -0,0 +1,14 @@
|
||||
APIs for LLMService are now provisional based on the capabilities of the LLM.
|
||||
|
||||
The following APIs are considered provisional:
|
||||
|
||||
- `/v1/embeddings`: This will be available if the LLM supports embeddings (i.e., ``LLM.embeddings`` is implemented. Example models include ``llama``.)
|
||||
- `/hf/agent`: This will be available if the LLM supports running HF agents (i.e., ``LLM.generate_one`` is implemented. Example models include ``starcoder`` and ``falcon``.)
|
||||
- `POST /v1/adapters` and `GET /v1/adapters`: This will be available if the server is running with LoRA weights
|
||||
|
||||
``openllm.LLMRunner`` now includes three additional boolean attributes:
|
||||
- `runner.supports_embeddings`: Whether this runner supports embeddings
|
||||
- `runner.supports_hf_agent`: Whether this runner supports HF agents
|
||||
- `runner.has_adapters`: Whether this runner is loaded with LoRA adapters.
|
||||
|
||||
Optimized ``openllm.models``'s bytecode performance
|
||||
@@ -157,7 +157,7 @@ python_files = ["test_*.py", "*_test.py"]
|
||||
testpaths = ["tests"]
|
||||
|
||||
[tool.black]
|
||||
exclude = '''
|
||||
extend-exclude = '''
|
||||
(
|
||||
/(
|
||||
\.eggs
|
||||
@@ -174,14 +174,15 @@ exclude = '''
|
||||
| tools
|
||||
)/
|
||||
| src/openllm/__about__.py
|
||||
| src/openllm/models
|
||||
)
|
||||
'''
|
||||
line-length = 119
|
||||
target-version = ["py38", "py39", "py310", "py311"]
|
||||
|
||||
[tool.ruff]
|
||||
exclude = ["tools", "src/openllm/playground"]
|
||||
extend-include = ["*.py", "*.pyi", "**/pyproject.toml", "*.ipynb"]
|
||||
extend-exclude = ["tools", "src/openllm/playground", "src/openllm/models", "src/openllm/_types.py"]
|
||||
extend-include = ["*.ipynb"]
|
||||
extend-select = [
|
||||
"B", # flake8-bugbear
|
||||
"I", # isort
|
||||
@@ -223,12 +224,14 @@ ignore = [
|
||||
"TCH004", # don't move runtime import out, just warn about it
|
||||
"RUF012", # mutable attributes to be used with ClassVar
|
||||
"B905", # zip warning about strict, only applicable for 3.10+
|
||||
"D105", # magic docstring
|
||||
]
|
||||
line-length = 119
|
||||
target-version = "py312"
|
||||
unfixable = [
|
||||
"F401", # Don't touch unused imports, just warn about it.
|
||||
"TCH004", # Don't touch import outside of TYPE_CHECKING block
|
||||
"RUF100", # unused noqa, just warn about it
|
||||
]
|
||||
[tool.ruff.flake8-type-checking]
|
||||
exempt-modules = ["typing", "typing_extensions", "."]
|
||||
@@ -255,17 +258,9 @@ avoid-escape = false
|
||||
# Tests can use magic values, assertions, and relative imports
|
||||
"__init__.py" = ["E402", "F401", "F403", "F811"]
|
||||
"examples/**/*" = ["D"]
|
||||
"src/openllm/_llm.py" = ["B010", "B009"]
|
||||
"src/openllm/_strategies.py" = ["B904"]
|
||||
"src/openllm/_types.py" = ["E402"]
|
||||
"src/openllm/cli.py" = ["D301", "S101"]
|
||||
"src/openllm/models/**/*" = ["D106", "S101", "D104"]
|
||||
"src/openllm/playground/**/*" = ["E402", "F401", "PLR", "D"]
|
||||
"src/openllm/utils/dummy_*" = ["D107"]
|
||||
"src/openllm/utils/import_utils.py" = [
|
||||
"PLW0603", # OK to ignore global access here
|
||||
"D105", # magic docstring
|
||||
]
|
||||
"src/openllm/utils/import_utils.py" = ["PLW0603"]
|
||||
"src/openllm_client/runtimes/*" = ["D107"]
|
||||
"tests/**/*" = [
|
||||
"S101",
|
||||
|
||||
@@ -28,6 +28,7 @@ from abc import abstractmethod
|
||||
from pathlib import Path
|
||||
|
||||
import attr
|
||||
import inflection
|
||||
import orjson
|
||||
from huggingface_hub import hf_hub_download
|
||||
|
||||
@@ -82,6 +83,7 @@ if t.TYPE_CHECKING:
|
||||
from ._configuration import PeftType
|
||||
from ._types import AdaptersMapping
|
||||
from ._types import AdaptersTuple
|
||||
from ._types import AnyCallable
|
||||
from ._types import DictStrAny
|
||||
from ._types import ListStr
|
||||
from ._types import LiteralRuntime
|
||||
@@ -161,13 +163,12 @@ def make_tag(
|
||||
model_version = tag.version
|
||||
model_name = tag.name
|
||||
else:
|
||||
if model_version is None: # noqa: PLR5501
|
||||
if not quiet:
|
||||
logger.warning(
|
||||
"Given 'model_id=%s' is a path, and 'model_version' is not passed. OpenLLM will generate the version based on the last modified time of this given directory.",
|
||||
model_id,
|
||||
)
|
||||
model_version = generate_hash_from_file(model_id)
|
||||
if not quiet and model_version is None:
|
||||
logger.warning(
|
||||
"Given 'model_id=%s' is a path, and 'model_version' is not passed. OpenLLM will generate the version based on the last modified time of this given directory.",
|
||||
model_id,
|
||||
)
|
||||
model_version = first_not_none(model_version, default=generate_hash_from_file(model_id))
|
||||
else:
|
||||
config = t.cast(
|
||||
"transformers.PretrainedConfig",
|
||||
@@ -418,6 +419,15 @@ class LLMInterface(ABC, t.Generic[M, T]):
|
||||
__llm_adapter_map__: dict[AdapterType, dict[str | t.Literal["default"], tuple[peft.PeftConfig, str]]] | None
|
||||
"""A reference to the the cached LoRA adapter mapping."""
|
||||
|
||||
__llm_supports_embeddings__: bool
|
||||
"""A boolean to determine whether models does implement ``LLM.embeddings``."""
|
||||
__llm_supports_generate__: bool
|
||||
"""A boolean to determine whether models does implement ``LLM.generate``."""
|
||||
__llm_supports_generate_one__: bool
|
||||
"""A boolean to determine whether models does implement ``LLM.generate_one``."""
|
||||
__llm_supports_generate_iterator__: bool
|
||||
"""A boolean to determine whether models does implement ``LLM.generate_iterator``."""
|
||||
|
||||
if t.TYPE_CHECKING and not MYPY:
|
||||
|
||||
def __attrs_init__(
|
||||
@@ -528,6 +538,21 @@ def _wrapped_save_pretrained(f: _save_pretrained_wrapper[M, T]):
|
||||
return wrapper
|
||||
|
||||
|
||||
def _update_docstring(cls: type[LLM[M, T]], fn: str) -> None:
    """Make sure the entrypoint ``fn`` exposed on ``cls`` carries a docstring.

    The callable is looked up on ``cls`` first and falls back to the attribute of
    the same name on ``LLMInterface``. An existing docstring is kept untouched;
    otherwise a generic one mentioning ``cls.__name__`` and ``fn`` is written.
    The callable is then assigned back onto ``cls`` under the name ``fn``.

    Args:
        cls: The LLM class whose entrypoint should be documented.
        fn: Name of the entrypoint attribute (for example ``"generate"``).

    Returns:
        None. The class is updated in place.

    Raises:
        AttributeError: If ``LLMInterface`` has no attribute named ``fn`` (the
            fallback is evaluated eagerly, even when ``cls`` defines ``fn``).
    """
    original_fn = getattr(cls, fn, getattr(LLMInterface, fn))
    # Only fill in the generic text when the implementation has no docstring of its own.
    if not original_fn.__doc__:
        original_fn.__doc__ = f"""\
{cls.__name__}'s implementation for {fn}.

Note that if LoRA is enabled (via either SDK or CLI), `self.model` will become a `peft.PeftModel`
The original model can then be accessed with 'self.model.get_base_model()'.
"""
    setattr(cls, fn, original_fn)
|
||||
|
||||
|
||||
def _make_assignment_script(cls: type[LLM[M, T]]) -> t.Callable[[type[LLM[M, T]]], None]:
|
||||
attributes = {
|
||||
"import_model": _wrapped_import_model,
|
||||
@@ -539,7 +564,11 @@ def _make_assignment_script(cls: type[LLM[M, T]]) -> t.Callable[[type[LLM[M, T]]
|
||||
args: ListStr = []
|
||||
anns: DictStrAny = {}
|
||||
lines: ListStr = []
|
||||
globs: DictStrAny = {"cls": cls, "_cached_LLMInterface_get": _object_getattribute.__get__(LLMInterface)}
|
||||
globs: DictStrAny = {
|
||||
"cls": cls,
|
||||
"_cached_LLMInterface_get": _object_getattribute.__get__(LLMInterface),
|
||||
"__gen_docstring": _update_docstring,
|
||||
}
|
||||
# function initialisation
|
||||
for func, impl in attributes.items():
|
||||
impl_name = f"__wrapped_{func}"
|
||||
@@ -561,9 +590,22 @@ def _make_assignment_script(cls: type[LLM[M, T]]) -> t.Callable[[type[LLM[M, T]]
|
||||
interface_anns = codegen.get_annotations(LLMInterface)
|
||||
for v in {"bentomodel", "model", "tokenizer", "adapter_map"}:
|
||||
lines.append(_setattr_class(f"__llm_{v}__", None))
|
||||
anns[f"__llm_{v}__"] = interface_anns.get("__llm_{v}__")
|
||||
anns[f"__llm_{v}__"] = interface_anns.get(f"__llm_{v}__")
|
||||
|
||||
return codegen.generate_function(cls, "__assign_attr", lines, args=("cls", *args), globs=globs, annotations=anns)
|
||||
# boolean to determine whether LLM has defined an implementation for a function
|
||||
for fn in {"generate", "generate_one", "generate_iterator", "embeddings"}:
|
||||
key = f"__llm_supports_{fn}__"
|
||||
lines.extend(
|
||||
[
|
||||
_setattr_class(key, f"cls.{fn} is not _cached_LLMInterface_get('{fn}')"),
|
||||
f"__gen_docstring(cls, '{fn}')",
|
||||
]
|
||||
)
|
||||
anns[key] = interface_anns.get(key)
|
||||
|
||||
return codegen.generate_function(
|
||||
cls, "__assign_llm_attr", lines, args=("cls", *args), globs=globs, annotations=anns
|
||||
)
|
||||
|
||||
|
||||
_AdaptersTuple: type[AdaptersTuple] = codegen.make_attr_tuple_class("AdaptersTuple", ["adapter_id", "name", "config"])
|
||||
@@ -607,28 +649,24 @@ class LLM(LLMInterface[M, T], ReprMixin):
|
||||
implementation, config_class_name = cls._infer_implementation_from_name(cls.__name__)
|
||||
cls.__llm_implementation__ = implementation
|
||||
config_class = openllm.AutoConfig.infer_class_from_name(config_class_name)
|
||||
|
||||
if "__openllm_internal__" in cd:
|
||||
if "config_class" not in cd:
|
||||
cls.config_class = config_class
|
||||
elif "config_class" not in cd:
|
||||
raise RuntimeError("Missing required key 'config_class'. Make sure to define it within the LLM subclass.")
|
||||
|
||||
_make_assignment_script(cls)(cls)
|
||||
|
||||
# update docstring for given entrypoint
|
||||
for fn in {"generate", "generate_one", "generate_iterator"}:
|
||||
original_fn = getattr(cls, fn, getattr(LLMInterface, fn))
|
||||
original_fn.__doc__ = (
|
||||
original_fn.__doc__
|
||||
or f"""\
|
||||
'{fn}' implementation {cls.__name__}.
|
||||
|
||||
Note that if LoRA is enabled (via either SDK or CLI), `self.model` will become a `peft.PeftModel`
|
||||
The original can then be accessed with 'self.model.get_base_model()'.
|
||||
"""
|
||||
)
|
||||
setattr(cls, fn, original_fn)
|
||||
def __getitem__(self, item: t.LiteralString | t.Any) -> t.Any:
    """Resolve ``self[item]`` to an attribute of this instance.

    The key is first passed through ``inflection.underscore``. The internal
    attribute ``__llm_<key>__`` is tried first, then the plain attribute ``<key>``.

    Raises:
        TypeError: If ``item`` is ``None``.
        KeyError: If neither attribute exists on the instance.
    """
    if item is None:
        raise TypeError(f"{self} doesn't understand how to index None.")
    key = inflection.underscore(item)
    # Internal dunder-style attribute takes precedence over the plain one.
    for candidate in (f"__llm_{key}__", key):
        if hasattr(self, candidate):
            return getattr(self, candidate)
    raise KeyError(key)
|
||||
|
||||
@classmethod
|
||||
@overload
|
||||
@@ -1667,6 +1705,9 @@ def llm_runner_class(self: openllm.LLM[M, T]) -> type[LLMRunner]:
|
||||
"__repr__": ReprMixin.__repr__,
|
||||
"__repr_keys__": property(_wrapped_repr_keys),
|
||||
"__repr_args__": _wrapped_repr_args,
|
||||
"supports_embeddings": self["supports-embeddings"],
|
||||
"supports_hf_agent": self["supports-generate-one"],
|
||||
"has_adapters": self._adapters_mapping is not None,
|
||||
}
|
||||
),
|
||||
)
|
||||
|
||||
@@ -94,6 +94,8 @@ class MetadataOutput:
|
||||
model_name: str
|
||||
framework: str
|
||||
configuration: str
|
||||
supports_embeddings: bool
|
||||
supports_hf_agent: bool
|
||||
|
||||
|
||||
@attr.frozen(slots=True)
|
||||
|
||||
@@ -89,48 +89,6 @@ async def generate_v1(input_dict: dict[str, t.Any]) -> openllm.GenerationOutput:
|
||||
return openllm.GenerationOutput(responses=responses, configuration=config)
|
||||
|
||||
|
||||
@svc.api(
|
||||
input=bentoml.io.JSON.from_sample(sample=["Hey Jude, welcome to the jumgle!", "What is the meaning of life?"]),
|
||||
output=bentoml.io.JSON.from_sample(
|
||||
sample={
|
||||
"embeddings": [
|
||||
0.007917795330286026,
|
||||
-0.014421648345887661,
|
||||
0.00481307040899992,
|
||||
0.007331526838243008,
|
||||
-0.0066398633643984795,
|
||||
0.00945580005645752,
|
||||
0.0087016262114048,
|
||||
-0.010709521360695362,
|
||||
0.012635177001357079,
|
||||
0.010541186667978764,
|
||||
-0.00730888033285737,
|
||||
-0.001783102168701589,
|
||||
0.02339819073677063,
|
||||
-0.010825827717781067,
|
||||
-0.015888236463069916,
|
||||
0.01876218430697918,
|
||||
0.0076906150206923485,
|
||||
0.0009032754460349679,
|
||||
-0.010024012066423893,
|
||||
0.01090280432254076,
|
||||
-0.008668390102684498,
|
||||
0.02070549875497818,
|
||||
0.0014594447566196322,
|
||||
-0.018775740638375282,
|
||||
-0.014814382418990135,
|
||||
0.01796768605709076,
|
||||
],
|
||||
"num_tokens": 20,
|
||||
}
|
||||
),
|
||||
route="/v1/embeddings",
|
||||
)
|
||||
async def embeddings_v1(phrases: list[str]) -> openllm.EmbeddingsOutput:
|
||||
responses = await runner.embeddings.async_run(phrases)
|
||||
return openllm.EmbeddingsOutput(embeddings=responses["embeddings"].tolist()[0], num_tokens=responses["num_tokens"])
|
||||
|
||||
|
||||
@svc.api(
|
||||
input=bentoml.io.Text(),
|
||||
output=bentoml.io.JSON.from_sample(
|
||||
@@ -151,42 +109,96 @@ def metadata_v1(_: str) -> openllm.MetadataOutput:
|
||||
model_name=llm_config["model_name"],
|
||||
framework=llm_config["env"]["framework_value"],
|
||||
configuration=llm_config.model_dump_json().decode(),
|
||||
supports_embeddings=runner.supports_embeddings,
|
||||
supports_hf_agent=runner.supports_hf_agent,
|
||||
)
|
||||
|
||||
|
||||
@svc.api(
|
||||
input=bentoml.io.Text.from_sample(sample="default"),
|
||||
output=bentoml.io.JSON.from_sample(sample={"success": True, "error_msg": "some error message"}),
|
||||
route="/v1/adapters",
|
||||
)
|
||||
async def adapters_v1(adapter_name: str) -> dict[str, bool | str]:
|
||||
return await runner.set_adapter.async_run(adapter_name)
|
||||
if runner.supports_embeddings:
|
||||
|
||||
@svc.api(
|
||||
input=bentoml.io.JSON.from_sample(sample=["Hey Jude, welcome to the jumgle!", "What is the meaning of life?"]),
|
||||
output=bentoml.io.JSON.from_sample(
|
||||
sample={
|
||||
"embeddings": [
|
||||
0.007917795330286026,
|
||||
-0.014421648345887661,
|
||||
0.00481307040899992,
|
||||
0.007331526838243008,
|
||||
-0.0066398633643984795,
|
||||
0.00945580005645752,
|
||||
0.0087016262114048,
|
||||
-0.010709521360695362,
|
||||
0.012635177001357079,
|
||||
0.010541186667978764,
|
||||
-0.00730888033285737,
|
||||
-0.001783102168701589,
|
||||
0.02339819073677063,
|
||||
-0.010825827717781067,
|
||||
-0.015888236463069916,
|
||||
0.01876218430697918,
|
||||
0.0076906150206923485,
|
||||
0.0009032754460349679,
|
||||
-0.010024012066423893,
|
||||
0.01090280432254076,
|
||||
-0.008668390102684498,
|
||||
0.02070549875497818,
|
||||
0.0014594447566196322,
|
||||
-0.018775740638375282,
|
||||
-0.014814382418990135,
|
||||
0.01796768605709076,
|
||||
],
|
||||
"num_tokens": 20,
|
||||
}
|
||||
),
|
||||
route="/v1/embeddings",
|
||||
)
|
||||
async def embeddings_v1(phrases: list[str]) -> openllm.EmbeddingsOutput:
|
||||
responses = await runner.embeddings.async_run(phrases)
|
||||
return openllm.EmbeddingsOutput(
|
||||
embeddings=responses["embeddings"].tolist()[0], num_tokens=responses["num_tokens"]
|
||||
)
|
||||
|
||||
|
||||
@attr.define
|
||||
class HfAgentInput:
|
||||
inputs: str
|
||||
parameters: t.Dict[str, t.Any]
|
||||
if runner.supports_hf_agent and openllm.utils.is_transformers_supports_agent():
|
||||
|
||||
@attr.define
|
||||
class HfAgentInput:
|
||||
inputs: str
|
||||
parameters: t.Dict[str, t.Any]
|
||||
|
||||
async def hf_agent(request: Request) -> Response:
|
||||
json_str = await request.body()
|
||||
try:
|
||||
input_data = openllm.utils.bentoml_cattr.structure(orjson.loads(json_str), HfAgentInput)
|
||||
except orjson.JSONDecodeError as err:
|
||||
raise openllm.exceptions.OpenLLMException(f"Invalid JSON input received: {err}") from None
|
||||
async def hf_agent(request: Request) -> Response:
|
||||
json_str = await request.body()
|
||||
try:
|
||||
input_data = openllm.utils.bentoml_cattr.structure(orjson.loads(json_str), HfAgentInput)
|
||||
except orjson.JSONDecodeError as err:
|
||||
raise openllm.exceptions.OpenLLMException(f"Invalid JSON input received: {err}") from None
|
||||
|
||||
stop = input_data.parameters.pop("stop", ["\n"])
|
||||
try:
|
||||
resp = await runner.generate_one.async_run(input_data.inputs, stop, **input_data.parameters)
|
||||
return JSONResponse(resp, status_code=200)
|
||||
except NotImplementedError:
|
||||
return JSONResponse(f"'{model}' is currently not supported with HuggingFace agents.", status_code=500)
|
||||
stop = input_data.parameters.pop("stop", ["\n"])
|
||||
try:
|
||||
resp = await runner.generate_one.async_run(input_data.inputs, stop, **input_data.parameters)
|
||||
return JSONResponse(resp, status_code=200)
|
||||
except NotImplementedError:
|
||||
return JSONResponse(f"'{model}' is currently not supported with HuggingFace agents.", status_code=500)
|
||||
|
||||
hf_app = Starlette(debug=True, routes=[Route("/agent", hf_agent, methods=["POST"])])
|
||||
|
||||
hf_app = Starlette(debug=True, routes=[Route("/agent", hf_agent, methods=["POST"])])
|
||||
svc.mount_asgi_app(hf_app, path="/hf")
|
||||
|
||||
svc.mount_asgi_app(hf_app, path="/hf")
|
||||
if runner.has_adapters:
|
||||
|
||||
@svc.api(
|
||||
input=bentoml.io.Text.from_sample(sample="default"),
|
||||
output=bentoml.io.JSON.from_sample(sample={"success": True, "error_msg": "some error message"}),
|
||||
route="/v1/adapters",
|
||||
)
|
||||
async def adapters_v1(adapter_name: str) -> dict[str, bool | str]:
|
||||
return await runner.set_adapter.async_run(adapter_name)
|
||||
|
||||
else:
|
||||
|
||||
async def adapters_v1(_: Request) -> Response:
|
||||
return JSONResponse({"success": False, "message": "No available adapters for current running server"})
|
||||
|
||||
|
||||
async def list_adapter_v1(_: Request) -> Response:
|
||||
@@ -198,5 +210,8 @@ async def list_adapter_v1(_: Request) -> Response:
|
||||
return JSONResponse(res, status_code=200)
|
||||
|
||||
|
||||
metadata_app = Starlette(debug=True, routes=[Route("/adapters", list_adapter_v1, methods=["GET"])])
|
||||
svc.mount_asgi_app(metadata_app, path="/v1")
|
||||
adapters_routes_v1 = [Route("/adapters", list_adapter_v1, methods=["GET"])]
|
||||
if not runner.has_adapters:
|
||||
adapters_routes_v1.append(Route("/adapters", adapters_v1, methods=["POST"]))
|
||||
adapters_app_v1 = Starlette(debug=True, routes=adapters_routes_v1)
|
||||
svc.mount_asgi_app(adapters_app_v1, path="/v1")
|
||||
|
||||
@@ -145,6 +145,10 @@ class LLMRunner(bentoml.Runner):
|
||||
generate_one: RunnerMethod[LLMRunnable, [str, list[str]], t.Sequence[dict[t.Literal["generated_text"], str]]]
|
||||
generate_iterator: RunnerMethod[LLMRunnable, [str], t.Generator[t.Any, None, None]]
|
||||
|
||||
supports_embeddings: bool
|
||||
supports_hf_agent: bool
|
||||
has_adapters: bool
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
runnable_class: type[LLMRunnable],
|
||||
|
||||
@@ -34,6 +34,7 @@ bentomodel = openllm.import_model("falcon", model_id='tiiuae/falcon-7b-instruct'
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import functools
|
||||
import http.client
|
||||
import importlib.machinery
|
||||
import importlib.util
|
||||
import inspect
|
||||
@@ -470,9 +471,8 @@ class OpenLLMCommandGroup(BentoMLCommandGroup):
|
||||
return super().get_command(ctx, cmd_name)
|
||||
|
||||
def list_commands(self, ctx: click.Context) -> list[str]:
|
||||
if ctx.command.name == "start" or ctx.command.name == "start-grpc":
|
||||
if ctx.command.name in {"start", "start-grpc"}:
|
||||
return list(openllm.CONFIG_MAPPING.keys())
|
||||
|
||||
return super().list_commands(ctx)
|
||||
|
||||
@override
|
||||
@@ -883,7 +883,7 @@ def prerequisite_check(
|
||||
|
||||
requirements = llm_config["requirements"]
|
||||
if requirements is not None and len(requirements) > 0:
|
||||
missing_requirements = [i for i in requirements if importlib.util.find_spec(i) is None]
|
||||
missing_requirements = [i for i in requirements if importlib.util.find_spec(inflection.underscore(i)) is None]
|
||||
if len(missing_requirements) > 0:
|
||||
_echo(
|
||||
f"Make sure to have the following dependencies available: {missing_requirements}",
|
||||
@@ -2339,6 +2339,11 @@ def instruct(
|
||||
"""
|
||||
client = openllm.client.HTTPClient(endpoint, timeout=timeout)
|
||||
|
||||
try:
|
||||
client.call("metadata")
|
||||
except http.client.BadStatusLine:
|
||||
raise click.ClickException(f"{endpoint} is neither a HTTP server nor reachable.") from None
|
||||
|
||||
if agent == "hf":
|
||||
if not is_transformers_supports_agent():
|
||||
raise click.UsageError(
|
||||
|
||||
@@ -11,41 +11,25 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import annotations
|
||||
import sys
|
||||
import typing as t
|
||||
|
||||
from ...exceptions import MissingDependencyError
|
||||
from ...utils import LazyModule
|
||||
from ...utils import is_cpm_kernels_available
|
||||
from ...utils import is_torch_available
|
||||
|
||||
|
||||
_import_structure: dict[str, list[str]] = {
|
||||
"configuration_baichuan": ["BaichuanConfig", "START_BAICHUAN_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"],
|
||||
}
|
||||
|
||||
_import_structure: dict[str, list[str]] = {"configuration_baichuan": ["BaichuanConfig", "START_BAICHUAN_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"]}
|
||||
try:
|
||||
if not is_torch_available() or not is_cpm_kernels_available():
|
||||
raise MissingDependencyError
|
||||
except MissingDependencyError:
|
||||
pass
|
||||
else:
|
||||
_import_structure["modeling_baichuan"] = ["Baichuan"]
|
||||
|
||||
if not is_torch_available() or not is_cpm_kernels_available(): raise MissingDependencyError
|
||||
except MissingDependencyError: pass
|
||||
else: _import_structure["modeling_baichuan"] = ["Baichuan"]
|
||||
if t.TYPE_CHECKING:
|
||||
from .configuration_baichuan import DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE
|
||||
from .configuration_baichuan import START_BAICHUAN_COMMAND_DOCSTRING as START_BAICHUAN_COMMAND_DOCSTRING
|
||||
from .configuration_baichuan import BaichuanConfig as BaichuanConfig
|
||||
|
||||
try:
|
||||
if not is_torch_available() or not is_cpm_kernels_available():
|
||||
raise MissingDependencyError
|
||||
except MissingDependencyError:
|
||||
pass
|
||||
else:
|
||||
from .modeling_baichuan import Baichuan as Baichuan
|
||||
else:
|
||||
import sys
|
||||
|
||||
sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
|
||||
if not is_torch_available() or not is_cpm_kernels_available(): raise MissingDependencyError
|
||||
except MissingDependencyError: pass
|
||||
else: from .modeling_baichuan import Baichuan as Baichuan
|
||||
else: sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
|
||||
|
||||
@@ -12,10 +12,7 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from __future__ import annotations
|
||||
|
||||
import openllm
|
||||
|
||||
|
||||
class BaichuanConfig(openllm.LLMConfig):
|
||||
"""Baichuan-7B is an open-source, large-scale pre-trained language model developed by Baichuan Intelligent Technology.
|
||||
|
||||
@@ -26,7 +23,6 @@ class BaichuanConfig(openllm.LLMConfig):
|
||||
and English benchmarks (C-Eval, MMLU, etc).
|
||||
Refer to [Baichuan-7B's GitHub page](https://github.com/baichuan-inc/Baichuan-7B) for more information.
|
||||
"""
|
||||
|
||||
__config__ = {
|
||||
"name_type": "lowercase",
|
||||
"trust_remote_code": True,
|
||||
@@ -45,13 +41,10 @@ class BaichuanConfig(openllm.LLMConfig):
|
||||
"hiyouga/baichuan-7b-sft",
|
||||
],
|
||||
}
|
||||
|
||||
class GenerationConfig:
|
||||
max_new_tokens: int = 2048
|
||||
top_p: float = 0.7
|
||||
temperature: float = 0.95
|
||||
|
||||
|
||||
START_BAICHUAN_COMMAND_DOCSTRING = """\
|
||||
Run a LLMServer for Baichuan model.
|
||||
|
||||
@@ -71,5 +64,4 @@ or provide `--model-id` flag when running ``openllm start baichuan``:
|
||||
\b
|
||||
$ openllm start baichuan --model-id='fireballoon/baichuan-vicuna-chinese-7b'
|
||||
"""
|
||||
|
||||
DEFAULT_PROMPT_TEMPLATE = """{instruction}"""
|
||||
|
||||
@@ -13,69 +13,31 @@
|
||||
# limitations under the License.
|
||||
from __future__ import annotations
|
||||
import typing as t
|
||||
|
||||
import openllm
|
||||
|
||||
from .configuration_baichuan import DEFAULT_PROMPT_TEMPLATE
|
||||
from ..._prompt import default_formatter
|
||||
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
import torch
|
||||
|
||||
import transformers
|
||||
else:
|
||||
torch = openllm.utils.LazyLoader("torch", globals(), "torch")
|
||||
transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
|
||||
|
||||
|
||||
class Baichuan(openllm.LLM["transformers.PreTrainedModel", "transformers.PreTrainedTokenizerBase"]):
|
||||
__openllm_internal__ = True
|
||||
|
||||
def sanitize_parameters(
|
||||
self,
|
||||
prompt: str,
|
||||
max_new_tokens: int | None = None,
|
||||
top_p: float | None = None,
|
||||
temperature: float | None = None,
|
||||
use_default_prompt_template: bool = False,
|
||||
**attrs: t.Any,
|
||||
) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
|
||||
def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, top_p: float | None = None, temperature: float | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
|
||||
if use_default_prompt_template:
|
||||
template_variables = default_formatter.extract_template_variables(DEFAULT_PROMPT_TEMPLATE)
|
||||
prompt_variables = {k: v for k, v in attrs.items() if k in template_variables}
|
||||
if "instruction" in prompt_variables:
|
||||
raise RuntimeError(
|
||||
"'instruction' should be passed as the first argument "
|
||||
"instead of kwargs when 'use_default_prompt_template=True'"
|
||||
)
|
||||
try:
|
||||
prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
|
||||
except KeyError as e:
|
||||
raise RuntimeError(
|
||||
f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. "
|
||||
"Use 'use_default_prompt_template=False' to disable the default prompt template."
|
||||
) from None
|
||||
else:
|
||||
prompt_text = prompt
|
||||
if "instruction" in prompt_variables: raise RuntimeError("'instruction' should be passed as the first argument instead of kwargs when 'use_default_prompt_template=True'")
|
||||
try: prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
|
||||
except KeyError as e: raise RuntimeError(f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. Use 'use_default_prompt_template=False' to disable the default prompt template.") from None
|
||||
else: prompt_text = prompt
|
||||
# NOTE: The rest of attrs should be kwargs for GenerationConfig
|
||||
generate_kwargs = {
|
||||
"max_new_tokens": max_new_tokens,
|
||||
"top_p": top_p,
|
||||
"temperature": temperature,
|
||||
**attrs,
|
||||
}
|
||||
|
||||
generate_kwargs = {"max_new_tokens": max_new_tokens, "top_p": top_p, "temperature": temperature, **attrs}
|
||||
return prompt_text, generate_kwargs, {}
|
||||
|
||||
def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str:
|
||||
return generation_result[0]
|
||||
|
||||
def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str: return generation_result[0]
|
||||
def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
|
||||
inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
|
||||
with torch.inference_mode(), torch.autocast("cuda", dtype=torch.float16):
|
||||
outputs = self.model.generate(
|
||||
**inputs,
|
||||
generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
|
||||
)
|
||||
outputs = self.model.generate(**inputs, generation_config=self.config.model_construct_env(**attrs).to_generation_config())
|
||||
return self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
|
||||
|
||||
@@ -11,41 +11,24 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import annotations
|
||||
import sys
|
||||
import typing as t
|
||||
|
||||
from ...exceptions import MissingDependencyError
|
||||
from ...utils import LazyModule
|
||||
from ...utils import is_cpm_kernels_available
|
||||
from ...utils import is_torch_available
|
||||
|
||||
|
||||
_import_structure: dict[str, list[str]] = {
|
||||
"configuration_chatglm": ["ChatGLMConfig", "START_CHATGLM_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"],
|
||||
}
|
||||
|
||||
_import_structure: dict[str, list[str]] = {"configuration_chatglm": ["ChatGLMConfig", "START_CHATGLM_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"]}
|
||||
try:
|
||||
if not is_torch_available() or not is_cpm_kernels_available():
|
||||
raise MissingDependencyError
|
||||
except MissingDependencyError:
|
||||
pass
|
||||
else:
|
||||
_import_structure["modeling_chatglm"] = ["ChatGLM"]
|
||||
|
||||
if not is_torch_available() or not is_cpm_kernels_available(): raise MissingDependencyError
|
||||
except MissingDependencyError: pass
|
||||
else: _import_structure["modeling_chatglm"] = ["ChatGLM"]
|
||||
if t.TYPE_CHECKING:
|
||||
from .configuration_chatglm import DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE
|
||||
from .configuration_chatglm import START_CHATGLM_COMMAND_DOCSTRING as START_CHATGLM_COMMAND_DOCSTRING
|
||||
from .configuration_chatglm import ChatGLMConfig as ChatGLMConfig
|
||||
|
||||
try:
|
||||
if not is_torch_available() or not is_cpm_kernels_available():
|
||||
raise MissingDependencyError
|
||||
except MissingDependencyError:
|
||||
pass
|
||||
else:
|
||||
from .modeling_chatglm import ChatGLM as ChatGLM
|
||||
else:
|
||||
import sys
|
||||
|
||||
sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
|
||||
if not is_torch_available() or not is_cpm_kernels_available(): raise MissingDependencyError
|
||||
except MissingDependencyError: pass
|
||||
else: from .modeling_chatglm import ChatGLM as ChatGLM
|
||||
else: sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
|
||||
|
||||
@@ -12,10 +12,7 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from __future__ import annotations
|
||||
|
||||
import openllm
|
||||
|
||||
|
||||
class ChatGLMConfig(openllm.LLMConfig):
|
||||
"""ChatGLM is an open bilingual language model based on [General Language Model (GLM)](https://github.com/THUDM/GLM) framework.
|
||||
|
||||
@@ -30,7 +27,6 @@ class ChatGLMConfig(openllm.LLMConfig):
|
||||
|
||||
Refer to [ChatGLM's GitHub page](https://github.com/THUDM/ChatGLM-6B) for more information.
|
||||
"""
|
||||
|
||||
__config__ = {
|
||||
"name_type": "lowercase",
|
||||
"trust_remote_code": True,
|
||||
@@ -48,22 +44,17 @@ class ChatGLMConfig(openllm.LLMConfig):
|
||||
"thudm/chatglm2-6b-int4",
|
||||
],
|
||||
}
|
||||
|
||||
retain_history: bool = openllm.LLMConfig.Field(
|
||||
False,
|
||||
description="""Whether to retain history given to the model.
|
||||
If set to True, then the model will retain given history.""",
|
||||
)
|
||||
|
||||
use_half_precision: bool = openllm.LLMConfig.Field(True, description="Whether to use half precision for model.")
|
||||
|
||||
class GenerationConfig:
|
||||
max_new_tokens: int = 2048
|
||||
num_beams: int = 1
|
||||
top_p: float = 0.7
|
||||
temperature: float = 0.95
|
||||
|
||||
|
||||
START_CHATGLM_COMMAND_DOCSTRING = """\
|
||||
Run a LLMServer for ChatGLM model.
|
||||
|
||||
@@ -83,5 +74,4 @@ or provide `--model-id` flag when running ``openllm start chatglm``:
|
||||
\b
|
||||
$ openllm start chatglm --model-id='thudm/chatglm-6b-int8'
|
||||
"""
|
||||
|
||||
DEFAULT_PROMPT_TEMPLATE = """{instruction}"""
|
||||
|
||||
@@ -13,94 +13,34 @@
|
||||
# limitations under the License.
|
||||
from __future__ import annotations
|
||||
import typing as t
|
||||
|
||||
import bentoml
|
||||
import openllm
|
||||
|
||||
from ...utils import generate_labels
|
||||
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
import torch
|
||||
|
||||
import transformers
|
||||
else:
|
||||
torch = openllm.utils.LazyLoader("torch", globals(), "torch")
|
||||
transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
|
||||
|
||||
|
||||
class ChatGLM(openllm.LLM["transformers.PreTrainedModel", "transformers.PreTrainedTokenizerFast"]):
    """OpenLLM integration for ChatGLM models, driven through the model's own ``chat`` method."""

    __openllm_internal__ = True

    def import_model(self, *args: t.Any, trust_remote_code: bool = True, **attrs: t.Any) -> bentoml.Model:
        """Load the model and tokenizer for ``self.model_id`` and save them under ``self.tag``.

        Args:
            *args: Accepted for interface compatibility; not used here.
            trust_remote_code: Forwarded to both ``from_pretrained`` calls.
            **attrs: Accepted for interface compatibility; not used here.

        Returns:
            The ``bentoml.Model`` returned by ``bentoml.transformers.save_model``.
        """
        # Only the tokenizer half of ``llm_parameters`` is used by this method.
        _, tokenizer_attrs = self.llm_parameters

        tokenizer = transformers.AutoTokenizer.from_pretrained(
            self.model_id, trust_remote_code=trust_remote_code, **tokenizer_attrs
        )
        return bentoml.transformers.save_model(
            self.tag,
            transformers.AutoModel.from_pretrained(self.model_id, trust_remote_code=trust_remote_code),
            labels=generate_labels(self),
            custom_objects={"tokenizer": tokenizer},
        )

    def sanitize_parameters(
        self,
        prompt: str,
        max_new_tokens: int | None = None,
        num_beams: int | None = None,
        top_p: float | None = None,
        temperature: float | None = None,
        chat_history: list[tuple[str, str]] | None = None,
        use_default_prompt_template: bool = False,
        **attrs: t.Any,
    ) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
        """Build the prompt text and split keyword arguments for generation and postprocessing.

        Args:
            prompt: The current user query.
            max_new_tokens: Generation option, passed through unchanged.
            num_beams: Generation option, passed through unchanged.
            top_p: Generation option, passed through unchanged.
            temperature: Generation option, passed through unchanged.
            chat_history: Earlier ``(query, response)`` pairs. They are rendered into the
                prompt only when ``use_default_prompt_template`` is true.
            use_default_prompt_template: Whether to render the ``[Round N]`` chat template.
            **attrs: Remaining keyword arguments, merged into the generation kwargs.

        Returns:
            ``(prompt_text, generate_kwargs, postprocess_generate_kwargs)``.
        """
        if use_default_prompt_template and chat_history is not None:
            # One "[Round i]" section per past exchange, then an open section for the new query.
            rounds = [
                f"[Round {i}]\n问:{old_query}\n答:{response}\n"  # noqa: RUF001
                for i, (old_query, response) in enumerate(chat_history)
            ]
            rounds.append(f"[Round {len(chat_history)}]\n问:{prompt}\n答:")  # noqa: RUF001
            prompt_text = "".join(rounds)
        else:
            prompt_text = prompt

        postprocess_generate_kwargs = {"chat_history": chat_history}

        # NOTE: The rest of attrs should be kwargs for GenerationConfig
        generate_kwargs = {
            "max_new_tokens": max_new_tokens,
            "num_beams": num_beams,
            "top_p": top_p,
            "temperature": temperature,
            **attrs,
        }
        return prompt_text, generate_kwargs, postprocess_generate_kwargs

    def postprocess_generate(
        self,
        prompt: str,
        generation_result: tuple[str, list[tuple[str, str]]],
        *,
        chat_history: list[tuple[str, str]] | None = None,
        **attrs: t.Any,
    ):
        """Return the generated text, optionally extending the caller's history in place.

        Args:
            prompt: The prompt that was sent to the model (unused here).
            generation_result: The ``(generated, history)`` pair returned by ``generate``.
            chat_history: The caller's history list; extended in place with ``history``
                when ``self.config.retain_history`` is true.
            **attrs: Ignored.

        Returns:
            The generated text.
        """
        generated, history = generation_result
        if self.config.retain_history:
            assert chat_history is not None, "'retain_history' is True while there is no history provided."
            chat_history.extend(history)
        return generated

    def generate(self, prompt: str, **attrs: t.Any) -> tuple[str, list[tuple[str, str]]]:
        """Run ``self.model.chat`` on ``prompt`` under ``torch.inference_mode()``.

        Args:
            prompt: The text passed to ``self.model.chat``.
            **attrs: Overrides applied through ``self.config.model_construct_env`` before
                the generation config is built.

        Returns:
            Whatever ``self.model.chat`` returns; annotated as ``(response, history)``.
        """
        with torch.inference_mode():
            self.model.eval()
            # Only use half precision if the model is not yet quantized
            if self.config.use_half_precision:
                self.model.half()
            return self.model.chat(
                self.tokenizer,
                prompt,
                generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
            )
|
||||
|
||||
@@ -11,40 +11,23 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import annotations
|
||||
import sys
|
||||
import typing as t
|
||||
|
||||
from ...exceptions import MissingDependencyError
|
||||
from ...utils import LazyModule
|
||||
from ...utils import is_torch_available
|
||||
|
||||
|
||||
# Public names of this subpackage, keyed by the submodule that defines them.
# The mapping is handed to ``LazyModule`` at the bottom of this section.
_import_structure: dict[str, list[str]] = {
    "configuration_dolly_v2": [
        "DollyV2Config",
        "START_DOLLY_V2_COMMAND_DOCSTRING",
        "DEFAULT_PROMPT_TEMPLATE",
    ],
}

# The modeling submodule is only advertised when torch is reported as available.
if is_torch_available():
    _import_structure["modeling_dolly_v2"] = ["DollyV2"]

if t.TYPE_CHECKING:
    # Eager, explicit re-exports so static type checkers can see the names.
    from .configuration_dolly_v2 import DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE
    from .configuration_dolly_v2 import START_DOLLY_V2_COMMAND_DOCSTRING as START_DOLLY_V2_COMMAND_DOCSTRING
    from .configuration_dolly_v2 import DollyV2Config as DollyV2Config

    if is_torch_available():
        from .modeling_dolly_v2 import DollyV2 as DollyV2
else:
    import sys

    # At runtime, swap this module object for a LazyModule built from the
    # import structure above (presumably resolving names on first access —
    # see LazyModule in ...utils to confirm).
    sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
|
||||
|
||||
@@ -13,14 +13,8 @@
|
||||
# limitations under the License.
|
||||
from __future__ import annotations
|
||||
import typing as t
|
||||
|
||||
import openllm
|
||||
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
from transformers import PreTrainedTokenizer
|
||||
|
||||
|
||||
if t.TYPE_CHECKING: import transformers
|
||||
class DollyV2Config(openllm.LLMConfig):
|
||||
"""Databricks` Dolly is an instruction-following large language model trained on the Databricks machine learning platform that is licensed for commercial use.
|
||||
|
||||
@@ -33,7 +27,6 @@ class DollyV2Config(openllm.LLMConfig):
|
||||
|
||||
Refer to [Databricks's Dolly page](https://github.com/databrickslabs/dolly) for more information.
|
||||
"""
|
||||
|
||||
__config__ = {
|
||||
"timeout": 3600000,
|
||||
"url": "https://github.com/databrickslabs/dolly",
|
||||
@@ -41,19 +34,15 @@ class DollyV2Config(openllm.LLMConfig):
|
||||
"default_id": "databricks/dolly-v2-3b",
|
||||
"model_ids": ["databricks/dolly-v2-3b", "databricks/dolly-v2-7b", "databricks/dolly-v2-12b"],
|
||||
}
|
||||
|
||||
return_full_text: bool = openllm.LLMConfig.Field(
|
||||
False, description="Whether to return the full prompt to the users."
|
||||
)
|
||||
|
||||
class GenerationConfig:
|
||||
temperature: float = 0.9
|
||||
top_p: float = 0.92
|
||||
top_k: int = 5
|
||||
max_new_tokens: int = 256
|
||||
eos_token_id: int = 50277 # NOTE: from get_special_token_id(self.tokenizer, END_KEY)
|
||||
|
||||
|
||||
START_DOLLY_V2_COMMAND_DOCSTRING = """\
|
||||
Run a LLMServer for dolly-v2 model.
|
||||
|
||||
@@ -73,14 +62,10 @@ or provide `--model-id` flag when running ``openllm start dolly-v2``:
|
||||
\b
|
||||
$ openllm start dolly-v2 --model-id databricks/dolly-v2-7b
|
||||
"""
|
||||
|
||||
INSTRUCTION_KEY = "### Instruction:"
|
||||
RESPONSE_KEY = "### Response:"
|
||||
END_KEY = "### End"
|
||||
INTRO_BLURB = (
|
||||
"Below is an instruction that describes a task. Write a response that appropriately completes the request."
|
||||
)
|
||||
|
||||
INTRO_BLURB = "Below is an instruction that describes a task. Write a response that appropriately completes the request."
|
||||
# NOTE: This is the prompt that is used for generating responses using an already
|
||||
# trained model. It ends with the response key, where the job of the model is to provide
|
||||
# the completion that follows it (i.e. the response itself).
|
||||
@@ -88,15 +73,8 @@ DEFAULT_PROMPT_TEMPLATE = """{intro}
|
||||
{instruction_key}
|
||||
{instruction}
|
||||
{response_key}
|
||||
""".format(
|
||||
intro=INTRO_BLURB,
|
||||
instruction_key=INSTRUCTION_KEY,
|
||||
instruction="{instruction}",
|
||||
response_key=RESPONSE_KEY,
|
||||
)
|
||||
|
||||
|
||||
def get_special_token_id(tokenizer: PreTrainedTokenizer, key: str) -> int:
|
||||
""".format(intro=INTRO_BLURB, instruction_key=INSTRUCTION_KEY, instruction="{instruction}", response_key=RESPONSE_KEY)
|
||||
def get_special_token_id(tokenizer: transformers.PreTrainedTokenizer, key: str) -> int:
|
||||
"""Gets the token ID for a given string that has been added to the tokenizer as a special token.
|
||||
|
||||
When training, we configure the tokenizer so that the sequences like "### Instruction:" and "### End" are
|
||||
@@ -113,6 +91,5 @@ def get_special_token_id(tokenizer: PreTrainedTokenizer, key: str) -> int:
|
||||
int: the token ID for the given key.
|
||||
"""
|
||||
token_ids = tokenizer.encode(key)
|
||||
if len(token_ids) > 1:
|
||||
raise ValueError(f"Expected only a single token for '{key}' but found {token_ids}")
|
||||
if len(token_ids) > 1: raise ValueError(f"Expected only a single token for '{key}' but found {token_ids}")
|
||||
return token_ids[0]
|
||||
|
||||
@@ -15,288 +15,118 @@ from __future__ import annotations
|
||||
import logging
|
||||
import re
|
||||
import typing as t
|
||||
|
||||
import openllm
|
||||
|
||||
from .configuration_dolly_v2 import DEFAULT_PROMPT_TEMPLATE
|
||||
from .configuration_dolly_v2 import END_KEY
|
||||
from .configuration_dolly_v2 import RESPONSE_KEY
|
||||
from .configuration_dolly_v2 import get_special_token_id
|
||||
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
import tensorflow as tf
|
||||
import torch
|
||||
|
||||
import transformers
|
||||
import tensorflow as tf
|
||||
else:
|
||||
tf = openllm.utils.LazyLoader("tf", globals(), "tensorflow")
|
||||
torch = openllm.utils.LazyLoader("torch", globals(), "torch")
|
||||
transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@t.overload
def get_pipeline(
    model: transformers.PreTrainedModel,
    tokenizer: transformers.PreTrainedTokenizer,
    _init: t.Literal[True],
    **attrs: t.Any,
) -> transformers.Pipeline:
    ...


@t.overload
def get_pipeline(
    model: transformers.PreTrainedModel,
    tokenizer: transformers.PreTrainedTokenizer,
    _init: t.Literal[False] = ...,
    **attrs: t.Any,
) -> type[transformers.Pipeline]:
    ...


def get_pipeline(
    model: transformers.PreTrainedModel,
    tokenizer: transformers.PreTrainedTokenizer,
    _init: bool = False,
    **attrs: t.Any,
) -> type[transformers.Pipeline] | transformers.Pipeline:
    """Create the Dolly instruction-following pipeline class bound to ``model`` and ``tokenizer``.

    The pipeline class is defined inside this function so that it closes over ``model``
    and ``tokenizer``; its ``__init__`` passes both to ``transformers.Pipeline``.

    Args:
        model: The model the pipeline generates with.
        tokenizer: The tokenizer used to encode prompts and decode generations.
        _init: When true, return an instance of the pipeline; otherwise return the class.
        **attrs: Accepted but currently not used.

    Returns:
        The pipeline class, or an instance of it when ``_init`` is true.
    """

    class InstructionTextGenerationPipeline(transformers.Pipeline):
        def __init__(
            self,
            *args: t.Any,
            do_sample: bool = True,
            max_new_tokens: int = 256,
            top_p: float = 0.92,
            top_k: int = 0,
            **kwargs: t.Any,
        ):
            """Initialize the pipeline.

            Args:
                do_sample: Whether or not to use sampling. Defaults to True.
                max_new_tokens: Max new tokens after the prompt to generate. Defaults to 256.
                top_p: If set to float < 1, only the smallest set of most probable tokens with
                    probabilities that add up to top_p or higher are kept for generation. Defaults to 0.92.
                top_k: The number of highest probability vocabulary tokens to keep for top-k-filtering. Defaults to 0.
                *args: Additional positional arguments to be passed to ``transformers.Pipeline``.
                **kwargs: Additional keyword arguments to be passed to ``transformers.Pipeline``.
            """
            super().__init__(
                *args,
                model=model,
                tokenizer=tokenizer,
                do_sample=do_sample,
                max_new_tokens=max_new_tokens,
                top_p=top_p,
                top_k=top_k,
                **kwargs,
            )

        def _sanitize_parameters(self, return_full_text: bool | None = None, **generate_kwargs: t.Any):
            """Split call-time kwargs into ``(preprocess, forward, postprocess)`` parameter dicts."""
            if t.TYPE_CHECKING:
                assert self.tokenizer is not None
            preprocess_params: dict[str, t.Any] = {}

            # Newer versions of the tokenizer configure the response key as a special token. Newer versions
            # still may append a newline to yield a single token. Find whatever token is configured for the
            # response key.
            tokenizer_response_key = next(
                (token for token in self.tokenizer.additional_special_tokens if token.startswith(RESPONSE_KEY)),
                None,
            )

            response_key_token_id = None
            end_key_token_id = None
            if tokenizer_response_key:
                try:
                    response_key_token_id = get_special_token_id(self.tokenizer, tokenizer_response_key)
                    end_key_token_id = get_special_token_id(self.tokenizer, END_KEY)
                    # Ensure generation stops once it generates "### End"
                    generate_kwargs["eos_token_id"] = end_key_token_id
                except ValueError:
                    # Best effort: if a key is not a single token, postprocess falls back
                    # to the regex-based extraction instead of token positions.
                    pass

            forward_params = generate_kwargs
            postprocess_params = {
                "response_key_token_id": response_key_token_id,
                "end_key_token_id": end_key_token_id,
            }
            if return_full_text is not None:
                postprocess_params["return_full_text"] = return_full_text
            return preprocess_params, forward_params, postprocess_params

        def preprocess(self, input_: str, **generate_kwargs: t.Any):
            """Render ``input_`` into the default prompt template and tokenize it."""
            if t.TYPE_CHECKING:
                assert self.tokenizer is not None
            prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=input_)
            inputs = self.tokenizer(prompt_text, return_tensors="pt")
            # Carry the raw texts along so later stages can reference them.
            inputs["prompt_text"] = prompt_text
            inputs["instruction_text"] = input_
            return inputs

        def _forward(self, model_inputs: dict[str, t.Any], **generate_kwargs: t.Any):
            """Run ``self.model.generate`` and regroup the output per input row."""
            if t.TYPE_CHECKING:
                assert self.tokenizer is not None
            input_ids = model_inputs["input_ids"]
            attention_mask = model_inputs.get("attention_mask", None)

            if input_ids.shape[1] == 0:
                # Empty prompt: generate unconditionally and treat it as a batch of one.
                input_ids = None
                attention_mask = None
                in_b = 1
            else:
                in_b = input_ids.shape[0]

            generated_sequence = self.model.generate(
                input_ids=input_ids.to(self.model.device) if input_ids is not None else None,
                attention_mask=attention_mask.to(self.model.device) if attention_mask is not None else None,
                pad_token_id=self.tokenizer.pad_token_id,
                **generate_kwargs,
            )

            # Reshape (out_b, ...) into (in_b, out_b // in_b, ...).
            out_b = generated_sequence.shape[0]
            if self.framework == "pt":
                generated_sequence = generated_sequence.reshape(in_b, out_b // in_b, *generated_sequence.shape[1:])
            elif self.framework == "tf":
                generated_sequence = tf.reshape(
                    generated_sequence, (in_b, out_b // in_b, *generated_sequence.shape[1:])
                )

            instruction_text = model_inputs.pop("instruction_text")
            return {
                "generated_sequence": generated_sequence,
                "input_ids": input_ids,
                "instruction_text": instruction_text,
            }

        def postprocess(
            self,
            model_outputs: dict[str, t.Any],
            response_key_token_id: int | None,
            end_key_token_id: int | None,
            return_full_text: bool = False,
        ):
            """Decode generated sequences into ``{"generated_text": ...}`` records.

            Args:
                model_outputs: The dict produced by ``_forward``.
                response_key_token_id: Token id of the response key, or ``None`` if unknown.
                end_key_token_id: Token id of the end key, or ``None`` if unknown.
                return_full_text: Whether to prefix each response with the instruction text.

            Returns:
                One record per generated sequence of the first input row.
            """
            if t.TYPE_CHECKING:
                assert self.tokenizer is not None
            instruction_text = model_outputs["instruction_text"]
            generated_sequence: list[list[int]] = model_outputs["generated_sequence"][0].numpy().tolist()

            records: list[dict[t.Literal["generated_text"], str]] = []
            for sequence in generated_sequence:
                # The response will be set to this variable if we can identify it.
                decoded = None

                # If we have token IDs for the response and end, then we can find the tokens and only decode
                # between them. Compare against None explicitly: 0 is a valid token id.
                if response_key_token_id is not None and end_key_token_id is not None:
                    # Find where "### Response:" is first found in the generated tokens. Considering this is
                    # part of the prompt, we should definitely find it. We will return the tokens found after
                    # this token.
                    try:
                        response_pos = sequence.index(response_key_token_id)
                    except ValueError:
                        logger.warning("Could not find response key %s in: %s", response_key_token_id, sequence)
                        response_pos = None

                    # Compare against None explicitly: index 0 is a valid position.
                    if response_pos is not None:
                        # Next find where "### End" is located. The model has been trained to end its responses
                        # with this sequence (or actually, the token ID it maps to, since it is a special
                        # token). We may not find this token, as the response could be truncated. If we don't
                        # find it then just return everything to the end. Note that even though we set
                        # eos_token_id, we still see this token at the end.
                        try:
                            end_pos = sequence.index(end_key_token_id)
                        except ValueError:
                            end_pos = None
                        decoded = self.tokenizer.decode(sequence[response_pos + 1 : end_pos]).strip()

                if not decoded:
                    # Otherwise we'll decode everything and use a regex to find the response and end.
                    fully_decoded = self.tokenizer.decode(sequence)

                    # The response appears after "### Response:". The model has been trained to append
                    # "### End" at the end.
                    m = re.search(r"#+\s*Response:\s*(.+?)#+\s*End", fully_decoded, flags=re.DOTALL)
                    if m is None:
                        # The model might not generate the "### End" sequence before reaching the max tokens.
                        # In this case, return everything after "### Response:".
                        m = re.search(r"#+\s*Response:\s*(.+)", fully_decoded, flags=re.DOTALL)

                    if m:
                        decoded = m.group(1).strip()
                    else:
                        logger.warning("Failed to find response in:\n%s", fully_decoded)

                # If the full text is requested, then append the decoded text to the original instruction.
                # This technically isn't the full text, as we format the instruction in the prompt the model
                # has been trained on, but to the client it will appear to be the full text.
                if return_full_text:
                    # Avoid emitting the literal text "None" when no response could be extracted.
                    decoded = instruction_text if decoded is None else f"{instruction_text}\n{decoded}"

                records.append({"generated_text": decoded})

            return records

    # NOTE(review): ``attrs`` is not forwarded to the pipeline constructor, so keyword
    # arguments such as ``return_full_text`` passed here have no effect — confirm whether
    # that is intended before changing it.
    if _init:
        return InstructionTextGenerationPipeline()
    return InstructionTextGenerationPipeline
|
||||
|
||||
|
||||
class DollyV2(openllm.LLM["transformers.Pipeline", "transformers.PreTrainedTokenizer"]):
    """OpenLLM integration for Dolly v2; the loaded "model" is the pipeline from ``get_pipeline``."""

    __openllm_internal__ = True

    @property
    def import_kwargs(self):
        """Return ``(model_kwargs, tokenizer_kwargs)`` for importing the model and tokenizer."""
        device_map = "auto" if torch.cuda.is_available() else None
        model_kwargs = {"device_map": device_map, "torch_dtype": torch.bfloat16}
        tokenizer_kwargs = {"padding_side": "left"}
        return model_kwargs, tokenizer_kwargs

    def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.Pipeline:
        """Load the causal LM from the stored model path and wrap it in the Dolly pipeline."""
        causal_lm = transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, *args, **attrs)
        return get_pipeline(
            model=causal_lm,
            tokenizer=self.tokenizer,
            _init=True,
            return_full_text=self.config.return_full_text,
        )

    def sanitize_parameters(
        self,
        prompt: str,
        max_new_tokens: int | None = None,
        temperature: float | None = None,
        top_k: int | None = None,
        top_p: float | None = None,
        **attrs: t.Any,
    ) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
        """Return the prompt unchanged, the generation kwargs, and empty postprocess kwargs."""
        # NOTE: The rest of attrs should be kwargs for GenerationConfig
        generation_kwargs = {
            "max_new_tokens": max_new_tokens,
            "top_k": top_k,
            "top_p": top_p,
            "temperature": temperature,
            **attrs,
        }
        return prompt, generation_kwargs, {}

    def postprocess_generate(
        self, prompt: str, generation_result: list[dict[t.Literal["generated_text"], str]], **_: t.Any
    ) -> str:
        """Return the ``generated_text`` of the first record in ``generation_result``."""
        first_record = generation_result[0]
        return first_record["generated_text"]

    def generate(self, prompt: str, **attrs: t.Any) -> list[dict[t.Literal["generated_text"], str]]:
        """Call the pipeline on ``prompt`` under ``torch.inference_mode()``.

        ``attrs`` are applied through ``self.config.model_construct_env`` before the
        generation config and ``return_full_text`` flag are read from the result.
        """
        llm_config = self.config.model_construct_env(**attrs)
        with torch.inference_mode():
            return self.model(
                prompt,
                return_full_text=llm_config.return_full_text,
                generation_config=llm_config.to_generation_config(),
            )
|
||||
|
||||
@@ -11,40 +11,23 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import annotations
|
||||
import sys
|
||||
import typing as t
|
||||
|
||||
from ...exceptions import MissingDependencyError
|
||||
from ...utils import LazyModule
|
||||
from ...utils import is_torch_available
|
||||
|
||||
|
||||
# Public names of this subpackage, keyed by the submodule that defines them.
# The mapping is handed to ``LazyModule`` at the bottom of this section.
_import_structure: dict[str, list[str]] = {
    "configuration_falcon": [
        "FalconConfig",
        "START_FALCON_COMMAND_DOCSTRING",
        "DEFAULT_PROMPT_TEMPLATE",
    ],
}

# The modeling submodule is only advertised when torch is reported as available.
if is_torch_available():
    _import_structure["modeling_falcon"] = ["Falcon"]

if t.TYPE_CHECKING:
    # Eager, explicit re-exports so static type checkers can see the names.
    from .configuration_falcon import DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE
    from .configuration_falcon import START_FALCON_COMMAND_DOCSTRING as START_FALCON_COMMAND_DOCSTRING
    from .configuration_falcon import FalconConfig as FalconConfig

    if is_torch_available():
        from .modeling_falcon import Falcon as Falcon
else:
    import sys

    # At runtime, swap this module object for a LazyModule built from the
    # import structure above (presumably resolving names on first access —
    # see LazyModule in ...utils to confirm).
    sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
|
||||
|
||||
@@ -12,10 +12,7 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from __future__ import annotations
|
||||
|
||||
import openllm
|
||||
|
||||
|
||||
class FalconConfig(openllm.LLMConfig):
|
||||
"""Falcon-7B is a 7B parameters causal decoder-only model built by TII and trained on 1,500B tokens of [RefinedWeb](https://huggingface.co/datasets/tiiuae/falcon-refinedweb) enhanced with curated corpora.
|
||||
|
||||
@@ -23,7 +20,6 @@ class FalconConfig(openllm.LLMConfig):
|
||||
|
||||
Refer to [Falcon's HuggingFace page](https://huggingface.co/tiiuae/falcon-7b) for more information.
|
||||
"""
|
||||
|
||||
__config__ = {
|
||||
"name_type": "lowercase",
|
||||
"trust_remote_code": True,
|
||||
@@ -50,15 +46,12 @@ class FalconConfig(openllm.LLMConfig):
|
||||
},
|
||||
),
|
||||
}
|
||||
|
||||
class GenerationConfig:
|
||||
max_new_tokens: int = 200
|
||||
top_k: int = 10
|
||||
num_return_sequences: int = 1
|
||||
num_beams: int = 4
|
||||
early_stopping: bool = True
|
||||
|
||||
|
||||
START_FALCON_COMMAND_DOCSTRING = """\
|
||||
Run a LLMServer for FalconLM model.
|
||||
|
||||
@@ -78,7 +71,6 @@ or provide `--model-id` flag when running ``openllm start falcon``:
|
||||
\b
|
||||
$ openllm start falcon --model-id tiiuae/falcon-7b-instruct
|
||||
"""
|
||||
|
||||
DEFAULT_PROMPT_TEMPLATE = """{context}
|
||||
{user_name}: {instruction}
|
||||
{agent}:
|
||||
|
||||
@@ -11,105 +11,41 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import annotations
|
||||
import typing as t
|
||||
|
||||
import openllm
|
||||
|
||||
from .configuration_falcon import DEFAULT_PROMPT_TEMPLATE
|
||||
from ..._prompt import default_formatter
|
||||
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
import torch
|
||||
|
||||
import transformers
|
||||
else:
|
||||
torch = openllm.utils.LazyLoader("torch", globals(), "torch")
|
||||
transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
|
||||
|
||||
|
||||
class Falcon(openllm.LLM["transformers.PreTrainedModel", "transformers.PreTrainedTokenizerBase"]):
|
||||
__openllm_internal__ = True
|
||||
|
||||
@property
|
||||
def import_kwargs(self):
|
||||
model_kwds = {"torch_dtype": torch.bfloat16, "device_map": "auto" if torch.cuda.is_available() else None}
|
||||
tokenizer_kwds: dict[str, t.Any] = {}
|
||||
return model_kwds, tokenizer_kwds
|
||||
|
||||
def sanitize_parameters(
|
||||
self,
|
||||
prompt: str,
|
||||
max_new_tokens: int | None = None,
|
||||
top_k: int | None = None,
|
||||
num_return_sequences: int | None = None,
|
||||
eos_token_id: int | None = None,
|
||||
use_default_prompt_template: bool = False,
|
||||
**attrs: t.Any,
|
||||
) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
|
||||
def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: return {"torch_dtype": torch.bfloat16, "device_map": "auto" if torch.cuda.is_available() else None}, {}
|
||||
def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, top_k: int | None = None, num_return_sequences: int | None = None, eos_token_id: int | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
|
||||
if use_default_prompt_template:
|
||||
template_variables = default_formatter.extract_template_variables(DEFAULT_PROMPT_TEMPLATE)
|
||||
prompt_variables = {k: v for k, v in attrs.items() if k in template_variables}
|
||||
if "instruction" in prompt_variables:
|
||||
raise RuntimeError(
|
||||
"'instruction' should be passed as the first argument instead of "
|
||||
"kwargs when 'use_default_prompt_template=True'"
|
||||
)
|
||||
try:
|
||||
prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
|
||||
except KeyError as e:
|
||||
raise RuntimeError(
|
||||
f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. "
|
||||
"Use 'use_default_prompt_template=False' to disable the default prompt template."
|
||||
) from None
|
||||
else:
|
||||
prompt_text = prompt
|
||||
|
||||
generation_config = {
|
||||
"max_new_tokens": max_new_tokens,
|
||||
"top_k": top_k,
|
||||
"num_return_sequences": num_return_sequences,
|
||||
"eos_token_id": eos_token_id,
|
||||
**attrs,
|
||||
}
|
||||
|
||||
return prompt_text, generation_config, {}
|
||||
|
||||
def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str:
|
||||
return generation_result[0]
|
||||
|
||||
if "instruction" in prompt_variables: raise RuntimeError("'instruction' should be passed as the first argument instead of kwargs when 'use_default_prompt_template=True'")
|
||||
try: prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
|
||||
except KeyError as e: raise RuntimeError(f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. Use 'use_default_prompt_template=False' to disable the default prompt template.") from None
|
||||
else: prompt_text = prompt
|
||||
return prompt_text, {"max_new_tokens": max_new_tokens, "top_k": top_k, "num_return_sequences": num_return_sequences, "eos_token_id": eos_token_id, **attrs}, {}
|
||||
def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str: return generation_result[0]
|
||||
def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
|
||||
eos_token_id = attrs.pop("eos_token_id", self.tokenizer.eos_token_id)
|
||||
inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
|
||||
with torch.inference_mode(), torch.autocast("cuda", dtype=torch.float16):
|
||||
outputs = self.model.generate(
|
||||
input_ids=inputs["input_ids"],
|
||||
attention_mask=inputs["attention_mask"],
|
||||
generation_config=self.config.model_construct_env(
|
||||
eos_token_id=eos_token_id, **attrs
|
||||
).to_generation_config(),
|
||||
)
|
||||
return self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
|
||||
|
||||
def generate_one(
|
||||
self, prompt: str, stop: list[str], **preprocess_generate_kwds: t.Any
|
||||
) -> list[dict[t.Literal["generated_text"], str]]:
|
||||
eos_token_id, inputs = attrs.pop("eos_token_id", self.tokenizer.eos_token_id), self.tokenizer(prompt, return_tensors="pt").to(self.device)
|
||||
with torch.inference_mode(), torch.autocast("cuda", dtype=torch.float16): return self.tokenizer.batch_decode(self.model.generate(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], generation_config=self.config.model_construct_env( eos_token_id=eos_token_id, **attrs).to_generation_config()), skip_special_tokens=True)
|
||||
def generate_one(self, prompt: str, stop: list[str], **preprocess_generate_kwds: t.Any) -> list[dict[t.Literal["generated_text"], str]]:
|
||||
from ..._generation import StopSequenceCriteria
|
||||
|
||||
max_new_tokens = preprocess_generate_kwds.pop("max_new_tokens", 200)
|
||||
encoded_inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
|
||||
src_len = encoded_inputs["input_ids"].shape[1]
|
||||
stopping_criteria = preprocess_generate_kwds.pop("stopping_criteria", transformers.StoppingCriteriaList([]))
|
||||
max_new_tokens, encoded_inputs = preprocess_generate_kwds.pop("max_new_tokens", 200), self.tokenizer(prompt, return_tensors="pt").to(self.device)
|
||||
src_len, stopping_criteria = encoded_inputs["input_ids"].shape[1], preprocess_generate_kwds.pop("stopping_criteria", transformers.StoppingCriteriaList([]))
|
||||
stopping_criteria.append(StopSequenceCriteria(stop, self.tokenizer))
|
||||
outputs = self.model.generate(
|
||||
encoded_inputs["input_ids"], max_new_tokens=max_new_tokens, stopping_criteria=stopping_criteria
|
||||
)
|
||||
|
||||
result = self.tokenizer.decode(outputs[0].tolist()[src_len:])
|
||||
result = self.tokenizer.decode(self.model.generate(encoded_inputs["input_ids"], max_new_tokens=max_new_tokens, stopping_criteria=stopping_criteria)[0].tolist()[src_len:])
|
||||
# Inference API returns the stop sequence
|
||||
for stop_seq in stop:
|
||||
if result.endswith(stop_seq):
|
||||
result = result[: -len(stop_seq)]
|
||||
if result.endswith(stop_seq): result = result[: -len(stop_seq)]
|
||||
return [{"generated_text": result}]
|
||||
|
||||
@@ -13,73 +13,40 @@
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import annotations
|
||||
import sys
|
||||
import typing as t
|
||||
|
||||
from ...exceptions import MissingDependencyError
|
||||
from ...utils import LazyModule
|
||||
from ...utils import is_flax_available
|
||||
from ...utils import is_tf_available
|
||||
from ...utils import is_torch_available
|
||||
|
||||
|
||||
_import_structure: dict[str, list[str]] = {
|
||||
"configuration_flan_t5": ["FlanT5Config", "START_FLAN_T5_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"],
|
||||
}
|
||||
|
||||
_import_structure: dict[str, list[str]] = {"configuration_flan_t5": ["FlanT5Config", "START_FLAN_T5_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"]}
|
||||
try:
|
||||
if not is_torch_available():
|
||||
raise MissingDependencyError
|
||||
except MissingDependencyError:
|
||||
pass
|
||||
else:
|
||||
_import_structure["modeling_flan_t5"] = ["FlanT5"]
|
||||
|
||||
if not is_torch_available(): raise MissingDependencyError
|
||||
except MissingDependencyError: pass
|
||||
else: _import_structure["modeling_flan_t5"] = ["FlanT5"]
|
||||
try:
|
||||
if not is_flax_available():
|
||||
raise MissingDependencyError
|
||||
except MissingDependencyError:
|
||||
pass
|
||||
else:
|
||||
_import_structure["modeling_flax_flan_t5"] = ["FlaxFlanT5"]
|
||||
|
||||
if not is_flax_available(): raise MissingDependencyError
|
||||
except MissingDependencyError: pass
|
||||
else: _import_structure["modeling_flax_flan_t5"] = ["FlaxFlanT5"]
|
||||
try:
|
||||
if not is_tf_available():
|
||||
raise MissingDependencyError
|
||||
except MissingDependencyError:
|
||||
pass
|
||||
else:
|
||||
_import_structure["modeling_tf_flan_t5"] = ["TFFlanT5"]
|
||||
|
||||
|
||||
if not is_tf_available(): raise MissingDependencyError
|
||||
except MissingDependencyError: pass
|
||||
else: _import_structure["modeling_tf_flan_t5"] = ["TFFlanT5"]
|
||||
if t.TYPE_CHECKING:
|
||||
from .configuration_flan_t5 import DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE
|
||||
from .configuration_flan_t5 import START_FLAN_T5_COMMAND_DOCSTRING as START_FLAN_T5_COMMAND_DOCSTRING
|
||||
from .configuration_flan_t5 import FlanT5Config as FlanT5Config
|
||||
|
||||
try:
|
||||
if not is_torch_available():
|
||||
raise MissingDependencyError
|
||||
except MissingDependencyError:
|
||||
pass
|
||||
else:
|
||||
from .modeling_flan_t5 import FlanT5 as FlanT5
|
||||
|
||||
if not is_torch_available(): raise MissingDependencyError
|
||||
except MissingDependencyError: pass
|
||||
else: from .modeling_flan_t5 import FlanT5 as FlanT5
|
||||
try:
|
||||
if not is_flax_available():
|
||||
raise MissingDependencyError
|
||||
except MissingDependencyError:
|
||||
pass
|
||||
else:
|
||||
from .modeling_flax_flan_t5 import FlaxFlanT5 as FlaxFlanT5
|
||||
|
||||
if not is_flax_available(): raise MissingDependencyError
|
||||
except MissingDependencyError: pass
|
||||
else: from .modeling_flax_flan_t5 import FlaxFlanT5 as FlaxFlanT5
|
||||
try:
|
||||
if not is_tf_available():
|
||||
raise MissingDependencyError
|
||||
except MissingDependencyError:
|
||||
pass
|
||||
else:
|
||||
from .modeling_tf_flan_t5 import TFFlanT5 as TFFlanT5
|
||||
else:
|
||||
import sys
|
||||
|
||||
sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
|
||||
if not is_tf_available(): raise MissingDependencyError
|
||||
except MissingDependencyError: pass
|
||||
else: from .modeling_tf_flan_t5 import TFFlanT5 as TFFlanT5
|
||||
else: sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
|
||||
|
||||
@@ -12,10 +12,33 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from __future__ import annotations
|
||||
|
||||
import openllm
|
||||
class FlanT5Config(openllm.LLMConfig):
|
||||
"""FLAN-T5 was released in the paper [Scaling Instruction-Finetuned Language Models](https://arxiv.org/pdf/2210.11416.pdf).
|
||||
|
||||
It is an enhanced version of T5 that has been finetuned in a mixture of tasks.
|
||||
|
||||
Refer to [FLAN-T5's page](https://huggingface.co/docs/transformers/model_doc/flan-t5) for more information.
|
||||
"""
|
||||
__config__ = {
|
||||
"url": "https://huggingface.co/docs/transformers/model_doc/flan-t5",
|
||||
"default_id": "google/flan-t5-large",
|
||||
"architecture": "T5ForConditionalGeneration",
|
||||
"model_ids": [
|
||||
"google/flan-t5-small",
|
||||
"google/flan-t5-base",
|
||||
"google/flan-t5-large",
|
||||
"google/flan-t5-xl",
|
||||
"google/flan-t5-xxl",
|
||||
],
|
||||
"model_type": "seq2seq_lm",
|
||||
}
|
||||
class GenerationConfig:
|
||||
temperature: float = 0.9
|
||||
max_new_tokens: int = 2048
|
||||
top_k: int = 50
|
||||
top_p: float = 0.4
|
||||
repetition_penalty = 1.0
|
||||
START_FLAN_T5_COMMAND_DOCSTRING = """\
|
||||
Run a LLMServer for FLAN-T5 model.
|
||||
|
||||
@@ -41,35 +64,4 @@ or provide `--model-id` flag when running ``openllm start flan-t5``:
|
||||
\b
|
||||
$ openllm start flan-t5 --model-id google/flan-t5-xxl
|
||||
"""
|
||||
|
||||
DEFAULT_PROMPT_TEMPLATE = """Answer the following question:\nQuestion: {instruction}\nAnswer:"""
|
||||
|
||||
|
||||
class FlanT5Config(openllm.LLMConfig):
|
||||
"""FLAN-T5 was released in the paper [Scaling Instruction-Finetuned Language Models](https://arxiv.org/pdf/2210.11416.pdf).
|
||||
|
||||
It is an enhanced version of T5 that has been finetuned in a mixture of tasks.
|
||||
|
||||
Refer to [FLAN-T5's page](https://huggingface.co/docs/transformers/model_doc/flan-t5) for more information.
|
||||
"""
|
||||
|
||||
__config__ = {
|
||||
"url": "https://huggingface.co/docs/transformers/model_doc/flan-t5",
|
||||
"default_id": "google/flan-t5-large",
|
||||
"architecture": "T5ForConditionalGeneration",
|
||||
"model_ids": [
|
||||
"google/flan-t5-small",
|
||||
"google/flan-t5-base",
|
||||
"google/flan-t5-large",
|
||||
"google/flan-t5-xl",
|
||||
"google/flan-t5-xxl",
|
||||
],
|
||||
"model_type": "seq2seq_lm",
|
||||
}
|
||||
|
||||
class GenerationConfig:
|
||||
temperature: float = 0.9
|
||||
max_new_tokens: int = 2048
|
||||
top_k: int = 50
|
||||
top_p: float = 0.4
|
||||
repetition_penalty = 1.0
|
||||
|
||||
@@ -13,71 +13,25 @@
|
||||
# limitations under the License.
|
||||
from __future__ import annotations
|
||||
import typing as t
|
||||
|
||||
import openllm
|
||||
|
||||
from .configuration_flan_t5 import DEFAULT_PROMPT_TEMPLATE
|
||||
from ..._prompt import default_formatter
|
||||
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
import torch
|
||||
|
||||
import transformers # noqa: F401
|
||||
else:
|
||||
torch = openllm.utils.LazyLoader("torch", globals(), "torch")
|
||||
|
||||
|
||||
class FlanT5(openllm.LLM["transformers.T5ForConditionalGeneration", "transformers.T5TokenizerFast"]):
|
||||
__openllm_internal__ = True
|
||||
|
||||
def sanitize_parameters(
|
||||
self,
|
||||
prompt: str,
|
||||
max_new_tokens: int | None = None,
|
||||
temperature: float | None = None,
|
||||
top_k: int | None = None,
|
||||
top_p: float | None = None,
|
||||
repetition_penalty: float | None = None,
|
||||
use_default_prompt_template: bool = True,
|
||||
**attrs: t.Any,
|
||||
) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
|
||||
def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_k: int | None = None, top_p: float | None = None, repetition_penalty: float | None = None, use_default_prompt_template: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
|
||||
if use_default_prompt_template:
|
||||
template_variables = default_formatter.extract_template_variables(DEFAULT_PROMPT_TEMPLATE)
|
||||
prompt_variables = {k: v for k, v in attrs.items() if k in template_variables}
|
||||
if "instruction" in prompt_variables:
|
||||
raise RuntimeError(
|
||||
"'instruction' should be passed as the first argument "
|
||||
"instead of kwargs when 'use_default_prompt_template=True'"
|
||||
)
|
||||
try:
|
||||
prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
|
||||
except KeyError as e:
|
||||
raise RuntimeError(
|
||||
f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. "
|
||||
"Use 'use_default_prompt_template=False' to disable the default prompt template."
|
||||
) from None
|
||||
else:
|
||||
prompt_text = prompt
|
||||
|
||||
generation_config = {
|
||||
"max_new_tokens": max_new_tokens,
|
||||
"temperature": temperature,
|
||||
"top_k": top_k,
|
||||
"top_p": top_p,
|
||||
"repetition_penalty": repetition_penalty,
|
||||
}
|
||||
return prompt_text, generation_config, {}
|
||||
|
||||
def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str:
|
||||
return generation_result[0]
|
||||
|
||||
if "instruction" in prompt_variables: raise RuntimeError("'instruction' should be passed as the first argument instead of kwargs when 'use_default_prompt_template=True'")
|
||||
try: prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
|
||||
except KeyError as e: raise RuntimeError(f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. Use 'use_default_prompt_template=False' to disable the default prompt template.") from None
|
||||
else: prompt_text = prompt
|
||||
return prompt_text, {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_k": top_k, "top_p": top_p, "repetition_penalty": repetition_penalty}, {}
|
||||
def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str: return generation_result[0]
|
||||
def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
|
||||
with torch.inference_mode():
|
||||
input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids.to(self.device)
|
||||
result_tensor = self.model.generate(
|
||||
input_ids,
|
||||
do_sample=True,
|
||||
generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
|
||||
)
|
||||
return self.tokenizer.batch_decode(result_tensor, skip_special_tokens=True)
|
||||
with torch.inference_mode(): return self.tokenizer.batch_decode(self.model.generate(**self.tokenizer(prompt, return_tensors="pt").to(self.device), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()), skip_special_tokens=True)
|
||||
|
||||
@@ -26,64 +26,18 @@ if t.TYPE_CHECKING:
|
||||
|
||||
class FlaxFlanT5(openllm.LLM["transformers.FlaxT5ForConditionalGeneration", "transformers.T5TokenizerFast"]):
|
||||
__openllm_internal__ = True
|
||||
|
||||
def sanitize_parameters(
|
||||
self,
|
||||
prompt: str,
|
||||
max_new_tokens: int | None = None,
|
||||
temperature: float | None = None,
|
||||
top_k: int | None = None,
|
||||
top_p: float | None = None,
|
||||
repetition_penalty: float | None = None,
|
||||
decoder_start_token_id: int | None = None,
|
||||
use_default_prompt_template: bool = True,
|
||||
**attrs: t.Any,
|
||||
) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
|
||||
def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_k: int | None = None, top_p: float | None = None, repetition_penalty: float | None = None, decoder_start_token_id: int | None = None, use_default_prompt_template: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
|
||||
if use_default_prompt_template:
|
||||
template_variables = default_formatter.extract_template_variables(DEFAULT_PROMPT_TEMPLATE)
|
||||
prompt_variables = {k: v for k, v in attrs.items() if k in template_variables}
|
||||
if "instruction" in prompt_variables:
|
||||
raise RuntimeError(
|
||||
"'instruction' should be passed as the first argument "
|
||||
"instead of kwargs when 'use_default_prompt_template=True'"
|
||||
)
|
||||
try:
|
||||
prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
|
||||
except KeyError as e:
|
||||
raise RuntimeError(
|
||||
f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. "
|
||||
"Use 'use_default_prompt_template=False' to disable the default prompt template."
|
||||
) from None
|
||||
else:
|
||||
prompt_text = prompt
|
||||
|
||||
if decoder_start_token_id is None:
|
||||
decoder_start_token_id = 0
|
||||
|
||||
generation_config = {
|
||||
"max_new_tokens": max_new_tokens,
|
||||
"temperature": temperature,
|
||||
"top_k": top_k,
|
||||
"top_p": top_p,
|
||||
"repetition_penalty": repetition_penalty,
|
||||
"decoder_start_token_id": decoder_start_token_id,
|
||||
}
|
||||
return prompt_text, generation_config, {}
|
||||
|
||||
def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str:
|
||||
return generation_result[0]
|
||||
|
||||
if "instruction" in prompt_variables: raise RuntimeError("'instruction' should be passed as the first argument instead of kwargs when 'use_default_prompt_template=True'")
|
||||
try: prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
|
||||
except KeyError as e: raise RuntimeError(f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. Use 'use_default_prompt_template=False' to disable the default prompt template.") from None
|
||||
else: prompt_text = prompt
|
||||
if decoder_start_token_id is None: decoder_start_token_id = 0
|
||||
return prompt_text, {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_k": top_k, "top_p": top_p, "repetition_penalty": repetition_penalty, "decoder_start_token_id": decoder_start_token_id}, {}
|
||||
def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str: return generation_result[0]
|
||||
def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
|
||||
# XXX: decoder_start_token_id is extracted from https://huggingface.co/google/flan-t5-small/tree/main
|
||||
# as it is required for encoder-decoder generation.
|
||||
# NOTE: decoder_start_token_id is extracted from https://huggingface.co/google/flan-t5-small/tree/main as it is required for encoder-decoder generation.
|
||||
decoder_start_token_id = attrs.pop("decoder_start_token_id", 0)
|
||||
input_ids = self.tokenizer(prompt, return_tensors="np")["input_ids"]
|
||||
result_tensor = self.model.generate(
|
||||
input_ids,
|
||||
do_sample=True,
|
||||
generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
|
||||
decoder_start_token_id=decoder_start_token_id,
|
||||
)
|
||||
return self.tokenizer.batch_decode(
|
||||
result_tensor.sequences, skip_special_tokens=True, clean_up_tokenization_spaces=True
|
||||
)
|
||||
return self.tokenizer.batch_decode(self.model.generate(self.tokenizer(prompt, return_tensors="np")["input_ids"], do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config(), decoder_start_token_id=decoder_start_token_id).sequences, skip_special_tokens=True, clean_up_tokenization_spaces=True)
|
||||
|
||||
@@ -13,66 +13,20 @@
|
||||
# limitations under the License.
|
||||
from __future__ import annotations
|
||||
import typing as t
|
||||
|
||||
import openllm
|
||||
|
||||
from .configuration_flan_t5 import DEFAULT_PROMPT_TEMPLATE
|
||||
from ..._prompt import default_formatter
|
||||
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
import transformers # noqa: F401
|
||||
|
||||
|
||||
if t.TYPE_CHECKING: import transformers # noqa: F401
|
||||
class TFFlanT5(openllm.LLM["transformers.TFT5ForConditionalGeneration", "transformers.T5TokenizerFast"]):
|
||||
__openllm_internal__ = True
|
||||
|
||||
def sanitize_parameters(
|
||||
self,
|
||||
prompt: str,
|
||||
max_new_tokens: int | None = None,
|
||||
temperature: float | None = None,
|
||||
top_k: int | None = None,
|
||||
top_p: float | None = None,
|
||||
repetition_penalty: float | None = None,
|
||||
use_default_prompt_template: bool = True,
|
||||
**attrs: t.Any,
|
||||
) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
|
||||
def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_k: int | None = None, top_p: float | None = None, repetition_penalty: float | None = None, use_default_prompt_template: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
|
||||
if use_default_prompt_template:
|
||||
template_variables = default_formatter.extract_template_variables(DEFAULT_PROMPT_TEMPLATE)
|
||||
prompt_variables = {k: v for k, v in attrs.items() if k in template_variables}
|
||||
if "instruction" in prompt_variables:
|
||||
raise RuntimeError(
|
||||
"'instruction' should be passed as the first argument "
|
||||
"instead of kwargs when 'use_default_prompt_template=True'"
|
||||
)
|
||||
try:
|
||||
prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
|
||||
except KeyError as e:
|
||||
raise RuntimeError(
|
||||
f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. "
|
||||
"Use 'use_default_prompt_template=False' to disable the default prompt template."
|
||||
) from None
|
||||
else:
|
||||
prompt_text = prompt
|
||||
|
||||
generation_config = {
|
||||
"max_new_tokens": max_new_tokens,
|
||||
"temperature": temperature,
|
||||
"top_k": top_k,
|
||||
"top_p": top_p,
|
||||
"repetition_penalty": repetition_penalty,
|
||||
}
|
||||
return prompt_text, generation_config, {}
|
||||
|
||||
def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str:
|
||||
return generation_result[0]
|
||||
|
||||
def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
|
||||
input_ids = self.tokenizer(prompt, return_tensors="tf").input_ids
|
||||
outputs = self.model.generate(
|
||||
input_ids,
|
||||
do_sample=True,
|
||||
generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
|
||||
)
|
||||
return self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
|
||||
if "instruction" in prompt_variables: raise RuntimeError("'instruction' should be passed as the first argument instead of kwargs when 'use_default_prompt_template=True'")
|
||||
try: prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
|
||||
except KeyError as e: raise RuntimeError(f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. Use 'use_default_prompt_template=False' to disable the default prompt template.") from None
|
||||
else: prompt_text = prompt
|
||||
return prompt_text, {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_k": top_k, "top_p": top_p, "repetition_penalty": repetition_penalty}, {}
|
||||
def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str: return generation_result[0]
|
||||
def generate(self, prompt: str, **attrs: t.Any) -> list[str]: return self.tokenizer.batch_decode(self.model.generate(self.tokenizer(prompt, return_tensors="tf").input_ids, do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()), skip_special_tokens=True)
|
||||
|
||||
@@ -11,40 +11,23 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import annotations
|
||||
import sys
|
||||
import typing as t
|
||||
|
||||
from ...exceptions import MissingDependencyError
|
||||
from ...utils import LazyModule
|
||||
from ...utils import is_torch_available
|
||||
|
||||
|
||||
_import_structure: dict[str, list[str]] = {
|
||||
"configuration_gpt_neox": ["GPTNeoXConfig", "START_GPT_NEOX_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"],
|
||||
}
|
||||
|
||||
_import_structure: dict[str, list[str]] = {"configuration_gpt_neox": ["GPTNeoXConfig", "START_GPT_NEOX_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"]}
|
||||
try:
|
||||
if not is_torch_available():
|
||||
raise MissingDependencyError
|
||||
except MissingDependencyError:
|
||||
pass
|
||||
else:
|
||||
_import_structure["modeling_gpt_neox"] = ["GPTNeoX"]
|
||||
|
||||
if not is_torch_available(): raise MissingDependencyError
|
||||
except MissingDependencyError: pass
|
||||
else: _import_structure["modeling_gpt_neox"] = ["GPTNeoX"]
|
||||
if t.TYPE_CHECKING:
|
||||
from .configuration_gpt_neox import DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE
|
||||
from .configuration_gpt_neox import START_GPT_NEOX_COMMAND_DOCSTRING as START_GPT_NEOX_COMMAND_DOCSTRING
|
||||
from .configuration_gpt_neox import GPTNeoXConfig as GPTNeoXConfig
|
||||
|
||||
try:
|
||||
if not is_torch_available():
|
||||
raise MissingDependencyError
|
||||
except MissingDependencyError:
|
||||
pass
|
||||
else:
|
||||
from .modeling_gpt_neox import GPTNeoX as GPTNeoX
|
||||
else:
|
||||
import sys
|
||||
|
||||
sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
|
||||
if not is_torch_available(): raise MissingDependencyError
|
||||
except MissingDependencyError: pass
|
||||
else: from .modeling_gpt_neox import GPTNeoX as GPTNeoX
|
||||
else: sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
|
||||
|
||||
@@ -11,12 +11,8 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import openllm
|
||||
|
||||
|
||||
class GPTNeoXConfig(openllm.LLMConfig):
|
||||
"""GPTNeoX is an autoregressive language model trained on the Pile, whose weights will be made freely and openly available to the public through a permissive license.
|
||||
|
||||
@@ -32,7 +28,6 @@ class GPTNeoXConfig(openllm.LLMConfig):
|
||||
Refer to [GPTNeoX's model card](https://huggingface.co/docs/transformers/model_doc/gpt_neox)
|
||||
for more information.
|
||||
"""
|
||||
|
||||
__config__ = {
|
||||
"model_name": "gpt_neox",
|
||||
"start_name": "gpt-neox",
|
||||
@@ -42,14 +37,10 @@ class GPTNeoXConfig(openllm.LLMConfig):
|
||||
"default_id": "eleutherai/gpt-neox-20b",
|
||||
"model_ids": ["eleutherai/gpt-neox-20b"],
|
||||
}
|
||||
|
||||
use_half_precision: bool = openllm.LLMConfig.Field(True, description="Whether to use half precision for model.")
|
||||
|
||||
class GenerationConfig:
|
||||
temperature: float = 0.9
|
||||
max_new_tokens: int = 100
|
||||
|
||||
|
||||
START_GPT_NEOX_COMMAND_DOCSTRING = """\
|
||||
Run a LLMServer for GPTNeoX model.
|
||||
|
||||
@@ -69,6 +60,4 @@ or provide `--model-id` flag when running ``openllm start gpt-neox``:
|
||||
\b
|
||||
$ openllm start gpt-neox --model-id 'stabilityai/stablelm-tuned-alpha-3b'
|
||||
"""
|
||||
|
||||
|
||||
DEFAULT_PROMPT_TEMPLATE = """{instruction}"""
|
||||
|
||||
@@ -11,88 +11,34 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import annotations
|
||||
import logging
|
||||
import typing as t
|
||||
|
||||
import openllm
|
||||
|
||||
from .configuration_gpt_neox import DEFAULT_PROMPT_TEMPLATE
|
||||
from ..._prompt import default_formatter
|
||||
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
import torch
|
||||
|
||||
import transformers
|
||||
else:
|
||||
transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
|
||||
torch = openllm.utils.LazyLoader("torch", globals(), "torch")
|
||||
|
||||
|
||||
if t.TYPE_CHECKING: import torch, transformers
|
||||
else: torch, transformers = openllm.utils.LazyLoader("torch", globals(), "torch"), openllm.utils.LazyLoader("transformers", globals(), "transformers")
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class GPTNeoX(openllm.LLM["transformers.GPTNeoXForCausalLM", "transformers.GPTNeoXTokenizerFast"]):
|
||||
__openllm_internal__ = True
|
||||
|
||||
def sanitize_parameters(
|
||||
self,
|
||||
prompt: str,
|
||||
temperature: float | None = None,
|
||||
max_new_tokens: int | None = None,
|
||||
use_default_prompt_template: bool = True,
|
||||
**attrs: t.Any,
|
||||
) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
|
||||
def sanitize_parameters(self, prompt: str, temperature: float | None = None, max_new_tokens: int | None = None, use_default_prompt_template: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
|
||||
if use_default_prompt_template:
|
||||
template_variables = default_formatter.extract_template_variables(DEFAULT_PROMPT_TEMPLATE)
|
||||
prompt_variables = {k: v for k, v in attrs.items() if k in template_variables}
|
||||
if "instruction" in prompt_variables:
|
||||
raise RuntimeError(
|
||||
"'instruction' should be passed as the first argument "
|
||||
"instead of kwargs when 'use_default_prompt_template=True'"
|
||||
)
|
||||
try:
|
||||
prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
|
||||
except KeyError as e:
|
||||
raise RuntimeError(
|
||||
f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. "
|
||||
"Use 'use_default_prompt_template=False' to disable the default prompt template."
|
||||
) from None
|
||||
else:
|
||||
prompt_text = prompt
|
||||
|
||||
generation_config = {"max_new_tokens": max_new_tokens, "temperature": temperature}
|
||||
|
||||
return prompt_text, generation_config, {}
|
||||
|
||||
if "instruction" in prompt_variables: raise RuntimeError("'instruction' should be passed as the first argument instead of kwargs when 'use_default_prompt_template=True'")
|
||||
try: prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
|
||||
except KeyError as e: raise RuntimeError(f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. Use 'use_default_prompt_template=False' to disable the default prompt template.") from None
|
||||
else: prompt_text = prompt
|
||||
return prompt_text, {"max_new_tokens": max_new_tokens, "temperature": temperature}, {}
|
||||
@property
|
||||
def import_kwargs(self):
|
||||
model_kwds = {"device_map": "auto" if torch.cuda.device_count() > 1 else None}
|
||||
tokenizer_kwds: dict[str, t.Any] = {}
|
||||
return model_kwds, tokenizer_kwds
|
||||
|
||||
def postprocess_generate(self, prompt: str, generation_result: list[str], **_: t.Any) -> str:
|
||||
return generation_result[0]
|
||||
|
||||
def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]:
    """Return a ``(model kwargs, tokenizer kwargs)`` pair.

    The model kwargs carry a single ``device_map`` entry: ``"auto"`` when more
    than one CUDA device is reported by torch, otherwise ``None``. The
    tokenizer kwargs are always empty.
    """
    has_multiple_gpus = torch.cuda.device_count() > 1
    model_kwds: dict[str, t.Any] = {"device_map": "auto" if has_multiple_gpus else None}
    tokenizer_kwds: dict[str, t.Any] = {}
    return model_kwds, tokenizer_kwds
|
||||
def postprocess_generate(self, prompt: str, generation_result: list[str], **_: t.Any) -> str:
    """Return the first entry of ``generation_result``.

    ``prompt`` and any extra keyword arguments are accepted but not used.
    """
    first_result = generation_result[0]
    return first_result
|
||||
def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.GPTNeoXForCausalLM:
|
||||
model = transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, *args, **attrs)
|
||||
if self.config.use_half_precision:
|
||||
model.half()
|
||||
if self.config.use_half_precision: model.half()
|
||||
return model
|
||||
|
||||
def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
|
||||
from ..._generation import StopOnTokens
|
||||
|
||||
generation_kwargs = {
|
||||
"do_sample": True,
|
||||
"generation_config": self.config.model_construct_env(**attrs).to_generation_config(),
|
||||
"pad_token_id": self.tokenizer.eos_token_id,
|
||||
"stopping_criteria": transformers.StoppingCriteriaList([StopOnTokens()]),
|
||||
}
|
||||
|
||||
inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
|
||||
with torch.inference_mode():
|
||||
gen_tokens = self.model.generate(inputs.input_ids, **generation_kwargs)
|
||||
return self.tokenizer.batch_decode(gen_tokens)
|
||||
generation_kwargs = {"do_sample": True, "generation_config": self.config.model_construct_env(**attrs).to_generation_config(), "pad_token_id": self.tokenizer.eos_token_id, "stopping_criteria": transformers.StoppingCriteriaList([StopOnTokens()])}
|
||||
with torch.inference_mode(): return self.tokenizer.batch_decode(self.model.generate(self.tokenizer(prompt, return_tensors="pt").to(self.device).input_ids, **generation_kwargs))
|
||||
|
||||
@@ -11,64 +11,33 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import annotations
|
||||
import sys
|
||||
import typing as t
|
||||
|
||||
from ...exceptions import MissingDependencyError
|
||||
from ...utils import LazyModule
|
||||
from ...utils import is_torch_available
|
||||
from ...utils import is_vllm_available
|
||||
|
||||
|
||||
_import_structure: dict[str, list[str]] = {
|
||||
"configuration_llama": [
|
||||
"LlaMAConfig",
|
||||
"START_LLAMA_COMMAND_DOCSTRING",
|
||||
"DEFAULT_PROMPT_TEMPLATE",
|
||||
"PROMPT_MAPPING",
|
||||
],
|
||||
}
|
||||
|
||||
_import_structure: dict[str, list[str]] = {"configuration_llama": ["LlaMAConfig", "START_LLAMA_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE", "PROMPT_MAPPING"]}
|
||||
try:
|
||||
if not is_vllm_available():
|
||||
raise MissingDependencyError
|
||||
except MissingDependencyError:
|
||||
pass
|
||||
else:
|
||||
_import_structure["modeling_vllm_llama"] = ["VLLMLlaMA"]
|
||||
|
||||
if not is_vllm_available(): raise MissingDependencyError
|
||||
except MissingDependencyError: pass
|
||||
else: _import_structure["modeling_vllm_llama"] = ["VLLMLlaMA"]
|
||||
try:
|
||||
if not is_torch_available():
|
||||
raise MissingDependencyError
|
||||
except MissingDependencyError:
|
||||
pass
|
||||
else:
|
||||
_import_structure["modeling_llama"] = ["LlaMA"]
|
||||
|
||||
|
||||
if not is_torch_available(): raise MissingDependencyError
|
||||
except MissingDependencyError: pass
|
||||
else: _import_structure["modeling_llama"] = ["LlaMA"]
|
||||
if t.TYPE_CHECKING:
|
||||
from .configuration_llama import DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE
|
||||
from .configuration_llama import PROMPT_MAPPING as PROMPT_MAPPING
|
||||
from .configuration_llama import START_LLAMA_COMMAND_DOCSTRING as START_LLAMA_COMMAND_DOCSTRING
|
||||
from .configuration_llama import LlaMAConfig as LlaMAConfig
|
||||
|
||||
try:
|
||||
if not is_vllm_available():
|
||||
raise MissingDependencyError
|
||||
except MissingDependencyError:
|
||||
pass
|
||||
else:
|
||||
from .modeling_vllm_llama import VLLMLlaMA as VLLMLlaMA
|
||||
|
||||
if not is_vllm_available(): raise MissingDependencyError
|
||||
except MissingDependencyError: pass
|
||||
else: from .modeling_vllm_llama import VLLMLlaMA as VLLMLlaMA
|
||||
try:
|
||||
if not is_torch_available():
|
||||
raise MissingDependencyError
|
||||
except MissingDependencyError:
|
||||
pass
|
||||
else:
|
||||
from .modeling_llama import LlaMA as LlaMA
|
||||
else:
|
||||
import sys
|
||||
|
||||
sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
|
||||
if not is_torch_available(): raise MissingDependencyError
|
||||
except MissingDependencyError: pass
|
||||
else: from .modeling_llama import LlaMA as LlaMA
|
||||
else: sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
|
||||
|
||||
@@ -11,13 +11,9 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import annotations
|
||||
import typing as t
|
||||
|
||||
import openllm
|
||||
|
||||
|
||||
class LlaMAConfig(openllm.LLMConfig):
|
||||
"""LLaMA model was proposed in [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) by Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample.
|
||||
|
||||
@@ -30,11 +26,7 @@ class LlaMAConfig(openllm.LLMConfig):
|
||||
Refer to [LlaMA's model card](https://huggingface.co/docs/transformers/main/model_doc/llama)
|
||||
for more information.
|
||||
"""
|
||||
|
||||
use_llama2_prompt: bool = openllm.LLMConfig.Field(
|
||||
True, description="Whether to use the prompt format for LlaMA 2. Disable this when working with LlaMA 1."
|
||||
)
|
||||
|
||||
use_llama2_prompt: bool = openllm.LLMConfig.Field(True, description="Whether to use the prompt format for LlaMA 2. Disable this when working with LlaMA 1.")
|
||||
__config__ = {
|
||||
"model_name": "llama",
|
||||
"start_name": "llama",
|
||||
@@ -69,18 +61,14 @@ class LlaMAConfig(openllm.LLMConfig):
|
||||
},
|
||||
),
|
||||
}
|
||||
|
||||
class GenerationConfig:
|
||||
max_new_tokens: int = 256
|
||||
temperature: float = 0.45
|
||||
top_p: float = 0.95
|
||||
top_k: int = 12
|
||||
|
||||
class SamplingParams:
|
||||
best_of: int = 1
|
||||
presence_penalty: float = 0.5
|
||||
|
||||
|
||||
START_LLAMA_COMMAND_DOCSTRING = """\
|
||||
Run a LLMServer for LlaMA model.
|
||||
|
||||
@@ -110,39 +98,14 @@ OpenLLM also supports running LlaMA-2 and its fine-tune and variants. To import
|
||||
\b
|
||||
$ CONVERTER=hf-llama2 openllm import llama /path/to/llama-2
|
||||
"""
|
||||
|
||||
SYSTEM_MESSAGE = """
|
||||
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
|
||||
|
||||
If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.
|
||||
"""
|
||||
|
||||
SINST_KEY = "[INST]"
|
||||
EINST_KEY = "[/INST]"
|
||||
SYS_KEY = "<<SYS>>"
|
||||
EOS_TOKEN = "</s>"
|
||||
BOS_TOKEN = "<s>"
|
||||
|
||||
# TODO: support history
|
||||
_v2_prompt = """{start_key} {sys_key}\n{system_message}\n{sys_key}\n\n{instruction}\n{end_key} """.format(
|
||||
start_key=SINST_KEY,
|
||||
sys_key=SYS_KEY,
|
||||
system_message=SYSTEM_MESSAGE,
|
||||
instruction="{instruction}",
|
||||
end_key=EINST_KEY,
|
||||
)
|
||||
|
||||
# XXX: implement me
|
||||
_v1_prompt = """{instruction}"""
|
||||
|
||||
PROMPT_MAPPING = {
|
||||
"v1": _v1_prompt,
|
||||
"v2": _v2_prompt,
|
||||
}
|
||||
|
||||
|
||||
def _get_prompt(model_type: t.Literal["v1", "v2"]) -> str:
|
||||
return PROMPT_MAPPING[model_type]
|
||||
|
||||
|
||||
SINST_KEY, EINST_KEY, SYS_KEY, EOS_TOKEN, BOS_TOKEN = "[INST]", "[/INST]", "<<SYS>>", "</s>", "<s>"
|
||||
# TODO: support history and v1 prompt implementation
|
||||
_v1_prompt, _v2_prompt = """{instruction}""", """{start_key} {sys_key}\n{system_message}\n{sys_key}\n\n{instruction}\n{end_key} """.format(start_key=SINST_KEY, sys_key=SYS_KEY, system_message=SYSTEM_MESSAGE, instruction="{instruction}", end_key=EINST_KEY)
|
||||
PROMPT_MAPPING = {"v1": _v1_prompt, "v2": _v2_prompt}
|
||||
def _get_prompt(model_type: t.Literal["v1", "v2"]) -> str:
    """Look up the prompt template stored under ``model_type`` in ``PROMPT_MAPPING``.

    Raises:
        KeyError: if ``model_type`` is not a key of ``PROMPT_MAPPING``.
    """
    template = PROMPT_MAPPING[model_type]
    return template
|
||||
DEFAULT_PROMPT_TEMPLATE = _get_prompt
|
||||
|
||||
@@ -11,110 +11,41 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import annotations
|
||||
import logging
|
||||
import typing as t
|
||||
|
||||
import openllm
|
||||
|
||||
from .configuration_llama import DEFAULT_PROMPT_TEMPLATE
|
||||
from ..._llm import LLMEmbeddings
|
||||
from ..._prompt import default_formatter
|
||||
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
|
||||
import transformers
|
||||
else:
|
||||
transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
|
||||
torch = openllm.utils.LazyLoader("torch", globals(), "torch")
|
||||
F = openllm.utils.LazyLoader("F", globals(), "torch.nn.functional")
|
||||
|
||||
|
||||
if t.TYPE_CHECKING: import torch, transformers, torch.nn.functional as F
|
||||
else: torch, transformers, F = openllm.utils.LazyLoader("torch", globals(), "torch"), openllm.utils.LazyLoader("transformers", globals(), "transformers"), openllm.utils.LazyLoader("F", globals(), "torch.nn.functional")
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class LlaMA(openllm.LLM["transformers.LlamaForCausalLM", "transformers.LlamaTokenizerFast"]):
|
||||
__openllm_internal__ = True
|
||||
|
||||
def sanitize_parameters(
|
||||
self,
|
||||
prompt: str,
|
||||
top_k: int | None = None,
|
||||
top_p: float | None = None,
|
||||
temperature: float | None = None,
|
||||
max_new_tokens: int | None = None,
|
||||
use_default_prompt_template: bool = True,
|
||||
use_llama2_prompt: bool = True,
|
||||
**attrs: t.Any,
|
||||
) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
|
||||
def sanitize_parameters(self, prompt: str, top_k: int | None = None, top_p: float | None = None, temperature: float | None = None, max_new_tokens: int | None = None, use_default_prompt_template: bool = True, use_llama2_prompt: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
|
||||
if use_default_prompt_template:
|
||||
_PROMPT = DEFAULT_PROMPT_TEMPLATE("v2" if use_llama2_prompt else "v1")
|
||||
template_variables = default_formatter.extract_template_variables(_PROMPT)
|
||||
prompt_variables = {k: v for k, v in attrs.items() if k in template_variables}
|
||||
if "instruction" in prompt_variables:
|
||||
raise RuntimeError(
|
||||
"'instruction' should be passed as the first argument "
|
||||
"instead of kwargs when 'use_default_prompt_template=True'"
|
||||
)
|
||||
try:
|
||||
prompt_text = _PROMPT.format(instruction=prompt, **prompt_variables)
|
||||
except KeyError as e:
|
||||
raise RuntimeError(
|
||||
f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. "
|
||||
"Use 'use_default_prompt_template=False' to disable the default prompt template."
|
||||
) from None
|
||||
else:
|
||||
prompt_text = prompt
|
||||
|
||||
generation_config = {
|
||||
"max_new_tokens": max_new_tokens,
|
||||
"temperature": temperature,
|
||||
"top_p": top_p,
|
||||
"top_k": top_k,
|
||||
}
|
||||
|
||||
return prompt_text, generation_config, {}
|
||||
|
||||
if "instruction" in prompt_variables: raise RuntimeError("'instruction' should be passed as the first argument instead of kwargs when 'use_default_prompt_template=True'")
|
||||
try: prompt_text = _PROMPT.format(instruction=prompt, **prompt_variables)
|
||||
except KeyError as e: raise RuntimeError(f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. Use 'use_default_prompt_template=False' to disable the default prompt template.") from None
|
||||
else: prompt_text = prompt
|
||||
return prompt_text, {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_p": top_p, "top_k": top_k}, {}
|
||||
@property
|
||||
def import_kwargs(self):
|
||||
model_kwds = {"device_map": "auto" if torch.cuda.device_count() > 1 else None}
|
||||
tokenizer_kwds: dict[str, t.Any] = {}
|
||||
return model_kwds, tokenizer_kwds
|
||||
|
||||
def postprocess_generate(self, prompt: str, generation_result: list[str], **_: t.Any) -> str:
|
||||
return generation_result[0]
|
||||
|
||||
def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]:
    """Build the ``(model kwargs, tokenizer kwargs)`` tuple.

    ``device_map`` is set to ``"auto"`` only if torch reports more than one
    CUDA device; with zero or one device it is ``None``. No tokenizer kwargs
    are provided.
    """
    if torch.cuda.device_count() > 1:
        device_map = "auto"
    else:
        device_map = None
    return {"device_map": device_map}, {}
|
||||
def postprocess_generate(self, prompt: str, generation_result: list[str], **_: t.Any) -> str:
    """Select the first generated string as the result.

    Neither ``prompt`` nor the extra keyword arguments influence the output.
    """
    selected = generation_result[0]
    return selected
|
||||
def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
|
||||
from ..._generation import StopOnTokens
|
||||
|
||||
generation_kwargs = {
|
||||
"generation_config": self.config.model_construct_env(**attrs).to_generation_config(),
|
||||
"stopping_criteria": transformers.StoppingCriteriaList([StopOnTokens()]),
|
||||
}
|
||||
|
||||
inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
|
||||
with torch.inference_mode():
|
||||
gen_tokens = self.model.generate(**inputs, **generation_kwargs)
|
||||
return self.tokenizer.batch_decode(gen_tokens, skip_special_tokens=True, clean_up_tokenization_spaces=True)
|
||||
|
||||
generation_kwargs = {"generation_config": self.config.model_construct_env(**attrs).to_generation_config(), "stopping_criteria": transformers.StoppingCriteriaList([StopOnTokens()])}
|
||||
with torch.inference_mode(): return self.tokenizer.batch_decode(self.model.generate(**self.tokenizer(prompt, return_tensors="pt").to(self.device), **generation_kwargs), skip_special_tokens=True, clean_up_tokenization_spaces=True)
|
||||
def embeddings(self, prompts: list[str]) -> LLMEmbeddings:
|
||||
encoding = self.tokenizer(prompts, padding=True, return_tensors="pt").to(self.device)
|
||||
input_ids = encoding["input_ids"]
|
||||
attention_mask = encoding["attention_mask"]
|
||||
input_ids, attention_mask = encoding["input_ids"], encoding["attention_mask"]
|
||||
with torch.inference_mode():
|
||||
model_outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True)
|
||||
data = model_outputs.hidden_states[-1]
|
||||
data = self.model(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True).hidden_states[-1]
|
||||
mask = attention_mask.unsqueeze(-1).expand(data.size()).float()
|
||||
masked_embeddings = data * mask
|
||||
sum_embeddings = torch.sum(masked_embeddings, dim=1)
|
||||
seq_length = torch.sum(mask, dim=1)
|
||||
embedding = sum_embeddings / seq_length
|
||||
normalized_embeddings = F.normalize(embedding, p=2, dim=1)
|
||||
return {
|
||||
"embeddings": normalized_embeddings,
|
||||
"num_tokens": torch.sum(attention_mask).item(),
|
||||
}
|
||||
sum_embeddings, seq_length = torch.sum(masked_embeddings, dim=1), torch.sum(mask, dim=1)
|
||||
return {"embeddings": F.normalize(sum_embeddings / seq_length, p=2, dim=1), "num_tokens": torch.sum(attention_mask).item()}
|
||||
|
||||
@@ -11,42 +11,24 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import annotations
|
||||
import sys
|
||||
import typing as t
|
||||
|
||||
from ...exceptions import MissingDependencyError
|
||||
from ...utils import LazyModule
|
||||
from ...utils import is_torch_available
|
||||
|
||||
|
||||
_import_structure: dict[str, list[str]] = {
|
||||
"configuration_mpt": ["MPTConfig", "START_MPT_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE", "PROMPT_MAPPING"],
|
||||
}
|
||||
|
||||
_import_structure: dict[str, list[str]] = {"configuration_mpt": ["MPTConfig", "START_MPT_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE", "PROMPT_MAPPING"]}
|
||||
try:
|
||||
if not is_torch_available():
|
||||
raise MissingDependencyError
|
||||
except MissingDependencyError:
|
||||
pass
|
||||
else:
|
||||
_import_structure["modeling_mpt"] = ["MPT"]
|
||||
|
||||
|
||||
if not is_torch_available(): raise MissingDependencyError
|
||||
except MissingDependencyError: pass
|
||||
else: _import_structure["modeling_mpt"] = ["MPT"]
|
||||
if t.TYPE_CHECKING:
|
||||
from .configuration_mpt import DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE
|
||||
from .configuration_mpt import PROMPT_MAPPING as PROMPT_MAPPING
|
||||
from .configuration_mpt import START_MPT_COMMAND_DOCSTRING as START_MPT_COMMAND_DOCSTRING
|
||||
from .configuration_mpt import MPTConfig as MPTConfig
|
||||
|
||||
try:
|
||||
if not is_torch_available():
|
||||
raise MissingDependencyError
|
||||
except MissingDependencyError:
|
||||
pass
|
||||
else:
|
||||
from .modeling_mpt import MPT as MPT
|
||||
else:
|
||||
import sys
|
||||
|
||||
sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
|
||||
if not is_torch_available(): raise MissingDependencyError
|
||||
except MissingDependencyError: pass
|
||||
else: from .modeling_mpt import MPT as MPT
|
||||
else: sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
|
||||
|
||||
@@ -11,20 +11,11 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import annotations
|
||||
import typing as t
|
||||
|
||||
import openllm
|
||||
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
MPTPromptType = t.Literal["default", "instruct", "chat", "storywriter"]
|
||||
else:
|
||||
# TODO: Support Literal string for LLMConfig
|
||||
MPTPromptType = str
|
||||
|
||||
|
||||
if t.TYPE_CHECKING: MPTPromptType = t.Literal["default", "instruct", "chat", "storywriter"]
|
||||
else: MPTPromptType = str
|
||||
class MPTConfig(openllm.LLMConfig):
|
||||
"""MPT is a decoder-style transformer pretrained from scratch on English text and code.
|
||||
|
||||
@@ -34,7 +25,6 @@ class MPTConfig(openllm.LLMConfig):
|
||||
on HuggingFace. Refer to [HuggingFace's MosaicML page](https://huggingface.co/mosaicml)
|
||||
for more details on specific models.
|
||||
"""
|
||||
|
||||
__config__ = {
|
||||
"name_type": "lowercase",
|
||||
"trust_remote_code": True,
|
||||
@@ -53,27 +43,12 @@ class MPTConfig(openllm.LLMConfig):
|
||||
"mosaicml/mpt-30b-chat",
|
||||
],
|
||||
}
|
||||
|
||||
prompt_type: MPTPromptType = openllm.LLMConfig.Field(
|
||||
'"default"',
|
||||
description="""Given prompt type for running MPT. Default will be inferred from model name if pretrained.""",
|
||||
)
|
||||
|
||||
max_sequence_length: int = openllm.LLMConfig.Field(
|
||||
2048,
|
||||
description="""\
|
||||
Max sequence length to run MPT with. Note that MPT is trained ith sequence length
|
||||
of 2048, but with [ALiBi](https://arxiv.org/abs/2108.12409) it can set up to 4096
|
||||
(for 7b models) and 16384 (for 30b models)
|
||||
""",
|
||||
)
|
||||
|
||||
prompt_type: MPTPromptType = openllm.LLMConfig.Field('"default"', description="""Given prompt type for running MPT. Default will be inferred from model name if pretrained.""")
|
||||
# Maximum sequence length MPT is run with; defaults to 2048.
# The description is user-facing help text, so its wording matters.
max_sequence_length: int = openllm.LLMConfig.Field(2048, description="Max sequence length to run MPT with. Note that MPT is trained with a sequence length of 2048, but with [ALiBi](https://arxiv.org/abs/2108.12409) it can be set up to 4096 (for 7b models) and 16384 (for 30b models)")
|
||||
class GenerationConfig:
|
||||
max_new_tokens: int = 128
|
||||
temperature: float = 0
|
||||
top_p: float = 0.8
|
||||
|
||||
|
||||
START_MPT_COMMAND_DOCSTRING = """\
|
||||
Run a LLMServer for MPT model.
|
||||
|
||||
@@ -100,43 +75,16 @@ or provide `--model-id` flag when running ``openllm start mpt``:
|
||||
\b
|
||||
$ openllm start mpt --model-id mosaicml/mpt-30b
|
||||
"""
|
||||
|
||||
INSTRUCTION_KEY = "### Instruction:"
|
||||
RESPONSE_KEY = "### Response:"
|
||||
END_KEY = "### End"
|
||||
INTRO_BLURB = (
|
||||
"Below is an instruction that describes a task. Write a response that appropriately completes the request."
|
||||
)
|
||||
|
||||
INSTRUCTION_KEY, RESPONSE_KEY, END_KEY = "### Instruction:", "### Response:", "### End"
|
||||
INTRO_BLURB = "Below is an instruction that describes a task. Write a response that appropriately completes the request."
|
||||
# NOTE: This is the prompt that is used for generating responses using an already
|
||||
# trained model. It ends with the response key, where the job of the model is to provide
|
||||
# the completion that follows it (i.e. the response itself).
|
||||
_instruct_prompt = """{intro}
|
||||
_chat_prompt, _default_prompt, _instruct_prompt = """{instruction}""", """{instruction}""", """{intro}
|
||||
{instruction_key}
|
||||
{instruction}
|
||||
{response_key}
|
||||
""".format(
|
||||
intro=INTRO_BLURB,
|
||||
instruction_key=INSTRUCTION_KEY,
|
||||
instruction="{instruction}",
|
||||
response_key=RESPONSE_KEY,
|
||||
)
|
||||
|
||||
_default_prompt = """{instruction}"""
|
||||
|
||||
# TODO: XXX implement me
|
||||
_chat_prompt = """{instruction}"""
|
||||
|
||||
PROMPT_MAPPING = {
|
||||
"default": _default_prompt,
|
||||
"instruct": _instruct_prompt,
|
||||
"storywriter": _default_prompt,
|
||||
"chat": _chat_prompt,
|
||||
}
|
||||
|
||||
|
||||
def _get_prompt(model_type: str) -> str:
|
||||
return PROMPT_MAPPING[model_type]
|
||||
|
||||
|
||||
""".format(intro=INTRO_BLURB, instruction_key=INSTRUCTION_KEY, instruction="{instruction}", response_key=RESPONSE_KEY)
|
||||
PROMPT_MAPPING = {"default": _default_prompt, "instruct": _instruct_prompt, "storywriter": _default_prompt, "chat": _chat_prompt}
|
||||
def _get_prompt(model_type: str) -> str:
    """Return the prompt template that ``PROMPT_MAPPING`` holds for ``model_type``.

    Raises:
        KeyError: if ``model_type`` is not a key of ``PROMPT_MAPPING``.
    """
    template = PROMPT_MAPPING[model_type]
    return template
|
||||
DEFAULT_PROMPT_TEMPLATE = _get_prompt
|
||||
|
||||
@@ -15,189 +15,70 @@
|
||||
from __future__ import annotations
|
||||
import logging
|
||||
import typing as t
|
||||
|
||||
import bentoml
|
||||
import openllm
|
||||
|
||||
from .configuration_mpt import DEFAULT_PROMPT_TEMPLATE
|
||||
from .configuration_mpt import DEFAULT_PROMPT_TEMPLATE, MPTPromptType
|
||||
from ..._prompt import default_formatter
|
||||
from ...utils import generate_labels
|
||||
from ...utils import is_triton_available
|
||||
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
import torch
|
||||
|
||||
import transformers
|
||||
|
||||
from .configuration_mpt import MPTPromptType
|
||||
else:
|
||||
torch = openllm.utils.LazyLoader("torch", globals(), "torch")
|
||||
transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
|
||||
|
||||
from ...utils import generate_labels, is_triton_available
|
||||
if t.TYPE_CHECKING: import transformers, torch
|
||||
else: transformers, torch = openllm.utils.LazyLoader("transformers", globals(), "transformers"), openllm.utils.LazyLoader("torch", globals(), "torch")
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def get_mpt_config(
|
||||
model_id_or_path: str,
|
||||
max_sequence_length: int,
|
||||
device: torch.device | str | int | None,
|
||||
device_map: str | None = None,
|
||||
trust_remote_code: bool = True,
|
||||
) -> transformers.PretrainedConfig:
|
||||
def get_mpt_config(model_id_or_path: str, max_sequence_length: int, device: torch.device | str | int | None, device_map: str | None = None, trust_remote_code: bool = True) -> transformers.PretrainedConfig:
|
||||
config = transformers.AutoConfig.from_pretrained(model_id_or_path, trust_remote_code=trust_remote_code)
|
||||
if hasattr(config, "init_device") and device_map is None and isinstance(device, (str, torch.device)):
|
||||
config.init_device = str(device)
|
||||
if hasattr(config, "attn_config") and is_triton_available():
|
||||
config.attn_config["attn_impl"] = "triton"
|
||||
else:
|
||||
logger.debug(
|
||||
"'triton' is not available, Flash Attention will use the default Torch implementation. For faster inference, make sure to install triton with 'pip install \"git+https://github.com/openai/triton.git#egg=triton&subdirectory=python\"'"
|
||||
)
|
||||
if hasattr(config, "init_device") and device_map is None and isinstance(device, (str, torch.device)): config.init_device = str(device)
|
||||
if hasattr(config, "attn_config") and is_triton_available(): config.attn_config["attn_impl"] = "triton"
|
||||
else: logger.debug("'triton' is not available, Flash Attention will use the default Torch implementation. For faster inference, make sure to install triton with 'pip install \"git+https://github.com/openai/triton.git#egg=triton&subdirectory=python\"'")
|
||||
# setting max_seq_len
|
||||
config.max_seq_len = max_sequence_length
|
||||
return config
|
||||
|
||||
|
||||
class MPT(openllm.LLM["transformers.PreTrainedModel", "transformers.GPTNeoXTokenizerFast"]):
|
||||
__openllm_internal__ = True
|
||||
|
||||
def llm_post_init(self):
|
||||
self.dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
|
||||
|
||||
def llm_post_init(self):
    """Set ``self.dtype`` to ``torch.bfloat16`` when CUDA is available, else ``torch.float32``."""
    if torch.cuda.is_available():
        self.dtype = torch.bfloat16
    else:
        self.dtype = torch.float32
|
||||
@property
|
||||
def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]] | None:
|
||||
model_kwds = {"torch_dtype": torch.bfloat16 if torch.cuda.is_available() else torch.float32}
|
||||
tokenizer_kwds = {"padding_side": "left"}
|
||||
return model_kwds, tokenizer_kwds
|
||||
|
||||
def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]:
    """Return a ``(model kwargs, tokenizer kwargs)`` pair.

    The model kwargs set ``torch_dtype`` to ``torch.bfloat16`` when CUDA is
    available and to ``torch.float32`` otherwise. The tokenizer kwargs request
    left-side padding.
    """
    if torch.cuda.is_available():
        torch_dtype = torch.bfloat16
    else:
        torch_dtype = torch.float32
    model_kwds: dict[str, t.Any] = {"torch_dtype": torch_dtype}
    tokenizer_kwds: dict[str, t.Any] = {"padding_side": "left"}
    return model_kwds, tokenizer_kwds
|
||||
def import_model(self, *args: t.Any, trust_remote_code: bool = True, **attrs: t.Any) -> bentoml.Model:
|
||||
_, tokenizer_attrs = self.llm_parameters
|
||||
|
||||
torch_dtype = attrs.pop("torch_dtype", self.dtype)
|
||||
device_map = attrs.pop("device_map", None)
|
||||
attrs.pop("low_cpu_mem_usage", None)
|
||||
|
||||
config = get_mpt_config(
|
||||
self.model_id,
|
||||
self.config.max_sequence_length,
|
||||
self.device,
|
||||
device_map=device_map,
|
||||
trust_remote_code=trust_remote_code,
|
||||
)
|
||||
|
||||
config = get_mpt_config(self.model_id, self.config.max_sequence_length, self.device, device_map=device_map, trust_remote_code=trust_remote_code)
|
||||
tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_id, **tokenizer_attrs)
|
||||
if tokenizer.pad_token_id is None:
|
||||
logger.warning("pad_token_id is not set. Setting it to eos_token")
|
||||
tokenizer.pad_token = tokenizer.eos_token
|
||||
|
||||
model = transformers.AutoModelForCausalLM.from_pretrained(
|
||||
self.model_id,
|
||||
config=config,
|
||||
torch_dtype=torch_dtype,
|
||||
trust_remote_code=trust_remote_code,
|
||||
device_map=device_map,
|
||||
**attrs,
|
||||
)
|
||||
try:
|
||||
return bentoml.transformers.save_model(
|
||||
self.tag,
|
||||
model,
|
||||
custom_objects={"tokenizer": tokenizer},
|
||||
labels=generate_labels(self),
|
||||
)
|
||||
finally:
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
if tokenizer.pad_token_id is None: tokenizer.pad_token = tokenizer.eos_token
|
||||
model = transformers.AutoModelForCausalLM.from_pretrained(self.model_id, config=config, torch_dtype=torch_dtype, trust_remote_code=trust_remote_code, device_map=device_map, **attrs)
|
||||
try: return bentoml.transformers.save_model( self.tag, model, custom_objects={"tokenizer": tokenizer}, labels=generate_labels(self))
|
||||
finally: torch.cuda.empty_cache()
|
||||
def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.PreTrainedModel:
|
||||
torch_dtype = attrs.pop("torch_dtype", self.dtype)
|
||||
device_map = attrs.pop("device_map", None)
|
||||
trust_remote_code = attrs.pop("trust_remote_code", True)
|
||||
|
||||
_ref = bentoml.transformers.get(self.tag)
|
||||
config = get_mpt_config(
|
||||
_ref.path,
|
||||
self.config.max_sequence_length,
|
||||
self.device,
|
||||
device_map=device_map,
|
||||
trust_remote_code=trust_remote_code,
|
||||
)
|
||||
model = transformers.AutoModelForCausalLM.from_pretrained(
|
||||
_ref.path,
|
||||
config=config,
|
||||
trust_remote_code=trust_remote_code,
|
||||
torch_dtype=torch_dtype,
|
||||
device_map=device_map,
|
||||
**attrs,
|
||||
)
|
||||
config = get_mpt_config(self._bentomodel.path, self.config.max_sequence_length, self.device, device_map=device_map, trust_remote_code=trust_remote_code,)
|
||||
model = transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, config=config, trust_remote_code=trust_remote_code, torch_dtype=torch_dtype, device_map=device_map, **attrs)
|
||||
model.tie_weights()
|
||||
return model
|
||||
|
||||
def sanitize_parameters(
|
||||
self,
|
||||
prompt: str,
|
||||
max_new_tokens: int | None = None,
|
||||
temperature: float | None = None,
|
||||
top_p: float | None = None,
|
||||
prompt_type: MPTPromptType | None = None,
|
||||
use_default_prompt_template: bool = True,
|
||||
**attrs: t.Any,
|
||||
) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
|
||||
def sanitize_parameters( self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_p: float | None = None, prompt_type: MPTPromptType | None = None, use_default_prompt_template: bool = True, **attrs: t.Any,) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
|
||||
if use_default_prompt_template:
|
||||
if prompt_type is None:
|
||||
if "instruct" in self.model_id:
|
||||
prompt_type = "instruct"
|
||||
elif "storywriter" in self.model_id:
|
||||
prompt_type = "storywriter"
|
||||
elif "chat" in self.model_id:
|
||||
prompt_type = "chat"
|
||||
else:
|
||||
prompt_type = "default"
|
||||
if "instruct" in self.model_id: prompt_type = "instruct"
|
||||
elif "storywriter" in self.model_id: prompt_type = "storywriter"
|
||||
elif "chat" in self.model_id: prompt_type = "chat"
|
||||
else: prompt_type = "default"
|
||||
_PROMPT = DEFAULT_PROMPT_TEMPLATE(prompt_type)
|
||||
template_variables = default_formatter.extract_template_variables(_PROMPT)
|
||||
prompt_variables = {k: v for k, v in attrs.items() if k in template_variables}
|
||||
if "instruction" in prompt_variables:
|
||||
raise RuntimeError(
|
||||
"'instruction' should be passed as the first argument "
|
||||
"instead of kwargs when 'use_default_prompt_template=True'"
|
||||
)
|
||||
try:
|
||||
prompt_text = _PROMPT.format(instruction=prompt, **prompt_variables)
|
||||
except KeyError as e:
|
||||
raise RuntimeError(
|
||||
f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. "
|
||||
"Use 'use_default_prompt_template=False' to disable the default prompt template."
|
||||
) from None
|
||||
else:
|
||||
prompt_text = prompt
|
||||
|
||||
generation_config = {
|
||||
"max_new_tokens": max_new_tokens,
|
||||
"temperature": temperature,
|
||||
"top_p": top_p,
|
||||
}
|
||||
if "instruction" in prompt_variables: raise RuntimeError("'instruction' should be passed as the first argument instead of kwargs when 'use_default_prompt_template=True'")
|
||||
try: prompt_text = _PROMPT.format(instruction=prompt, **prompt_variables)
|
||||
except KeyError as e: raise RuntimeError(f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. Use 'use_default_prompt_template=False' to disable the default prompt template.") from None
|
||||
else: prompt_text = prompt
|
||||
generation_config = {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_p": top_p}
|
||||
return prompt_text, generation_config, {}
|
||||
|
||||
def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **attrs: t.Any) -> str:
    """Return the first generated sequence as the final output.

    Args:
        prompt: Accepted for interface compatibility; not used here.
        generation_result: Sequence of generated strings.
        **attrs: Extra keyword arguments; ignored.

    Returns:
        ``generation_result[0]``.

    Raises:
        IndexError: If ``generation_result`` is empty.
    """
    return generation_result[0]
|
||||
def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
    """Tokenize ``prompt``, run ``self.model.generate`` and decode the output.

    Args:
        prompt: Text passed to ``self.tokenizer``.
        **attrs: Overrides forwarded to ``self.config.model_construct_env``.

    Returns:
        The result of ``self.tokenizer.batch_decode`` with special tokens skipped.
    """
    llm_config = self.config.model_construct_env(**attrs)
    inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)

    # Built under its own name so the caller-supplied ``attrs`` is not shadowed.
    generate_kwargs = {
        # Sampling is disabled only when the configured temperature is exactly 0.
        "do_sample": llm_config["temperature"] != 0,
        "eos_token_id": self.tokenizer.eos_token_id,
        "pad_token_id": self.tokenizer.pad_token_id,
        "generation_config": llm_config.to_generation_config(),
    }

    with torch.inference_mode():
        if torch.cuda.is_available():
            # When CUDA is available, generation runs under float16 autocast.
            with torch.autocast("cuda", torch.float16):
                generated_tensors = self.model.generate(**inputs, **generate_kwargs)
        else:
            generated_tensors = self.model.generate(**inputs, **generate_kwargs)

    return self.tokenizer.batch_decode(generated_tensors, skip_special_tokens=True)
|
||||
|
||||
@@ -11,75 +11,41 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import annotations
|
||||
import sys
|
||||
import typing as t
|
||||
|
||||
from ...exceptions import MissingDependencyError
|
||||
from ...utils import LazyModule
|
||||
from ...utils import is_flax_available
|
||||
from ...utils import is_tf_available
|
||||
from ...utils import is_torch_available
|
||||
|
||||
|
||||
# Maps each submodule to the public names it provides; passed to ``LazyModule`` below.
_import_structure: dict[str, list[str]] = {
    "configuration_opt": ["OPTConfig", "START_OPT_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"],
}

# A backend-specific implementation is registered only when its framework is available.
try:
    if not is_torch_available():
        raise MissingDependencyError
except MissingDependencyError:
    pass
else:
    _import_structure["modeling_opt"] = ["OPT"]

try:
    if not is_flax_available():
        raise MissingDependencyError
except MissingDependencyError:
    pass
else:
    _import_structure["modeling_flax_opt"] = ["FlaxOPT"]

try:
    if not is_tf_available():
        raise MissingDependencyError
except MissingDependencyError:
    pass
else:
    _import_structure["modeling_tf_opt"] = ["TFOPT"]

if t.TYPE_CHECKING:
    # Explicit re-exports (``X as X``) for static type checkers.
    from .configuration_opt import DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE
    from .configuration_opt import START_OPT_COMMAND_DOCSTRING as START_OPT_COMMAND_DOCSTRING
    from .configuration_opt import OPTConfig as OPTConfig

    try:
        if not is_torch_available():
            raise MissingDependencyError
    except MissingDependencyError:
        pass
    else:
        from .modeling_opt import OPT as OPT

    try:
        if not is_flax_available():
            raise MissingDependencyError
    except MissingDependencyError:
        pass
    else:
        from .modeling_flax_opt import FlaxOPT as FlaxOPT

    try:
        if not is_tf_available():
            raise MissingDependencyError
    except MissingDependencyError:
        pass
    else:
        from .modeling_tf_opt import TFOPT as TFOPT
else:
    # At runtime this module's entry in ``sys.modules`` is replaced by a
    # ``LazyModule`` built from ``_import_structure``; ``sys`` comes from the
    # module-level import.
    sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
|
||||
|
||||
@@ -11,12 +11,8 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import openllm
|
||||
|
||||
|
||||
class OPTConfig(openllm.LLMConfig):
|
||||
"""OPT was first introduced in [Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) and first released in [metaseq's repository](https://github.com/facebookresearch/metaseq) on May 3rd 2022 by Meta AI.
|
||||
|
||||
@@ -27,13 +23,12 @@ class OPTConfig(openllm.LLMConfig):
|
||||
|
||||
Refer to [OPT's HuggingFace page](https://huggingface.co/docs/transformers/model_doc/opt) for more information.
|
||||
"""
|
||||
|
||||
__config__ = {
|
||||
"name_type": "lowercase",
|
||||
"trust_remote_code": False,
|
||||
"url": "https://huggingface.co/docs/transformers/model_doc/opt",
|
||||
"default_id": "facebook/opt-1.3b",
|
||||
"architecture": "MPTForCausalLM",
|
||||
"architecture": "OPTForCausalLM",
|
||||
"model_ids": [
|
||||
"facebook/opt-125m",
|
||||
"facebook/opt-350m",
|
||||
@@ -53,20 +48,12 @@ class OPTConfig(openllm.LLMConfig):
|
||||
},
|
||||
),
|
||||
}
|
||||
|
||||
format_outputs: bool = openllm.LLMConfig.Field(
|
||||
False,
|
||||
description="""Whether to format the outputs. This
|
||||
can be used when num_return_sequences > 1.""",
|
||||
)
|
||||
|
||||
format_outputs: bool = openllm.LLMConfig.Field(False, description="""Whether to format the outputs. This can be used when num_return_sequences > 1.""")
|
||||
class GenerationConfig:
|
||||
top_k: int = 15
|
||||
temperature: float = 0.75
|
||||
max_new_tokens: int = 1024
|
||||
num_return_sequences: int = 1
|
||||
|
||||
|
||||
START_OPT_COMMAND_DOCSTRING = """\
|
||||
Run a LLMServer for OPT model.
|
||||
|
||||
@@ -92,5 +79,4 @@ or provide `--model-id` flag when running ``openllm start opt``:
|
||||
\b
|
||||
$ openllm start opt --model-id facebook/opt-6.7b
|
||||
"""
|
||||
|
||||
DEFAULT_PROMPT_TEMPLATE = """{instruction}"""
|
||||
|
||||
@@ -11,109 +11,37 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import annotations
|
||||
import logging
|
||||
import typing as t
|
||||
|
||||
import bentoml
|
||||
import openllm
|
||||
|
||||
from .configuration_opt import DEFAULT_PROMPT_TEMPLATE
|
||||
from ..._prompt import default_formatter
|
||||
from ...utils import generate_labels
|
||||
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
import transformers
|
||||
else:
|
||||
transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
|
||||
|
||||
if t.TYPE_CHECKING: import transformers
|
||||
else: transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class FlaxOPT(openllm.LLM["transformers.FlaxOPTForCausalLM", "transformers.GPT2Tokenizer"]):
    """OPT implementation that loads its model through ``transformers.FlaxAutoModelForCausalLM``."""

    __openllm_internal__ = True

    @property
    def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]:
        """Return ``(model_kwargs, tokenizer_kwargs)``.

        No model kwargs are set; the tokenizer pads and truncates on the left.
        """
        return {}, {"padding_side": "left", "truncation_side": "left"}

    def import_model(self, *args: t.Any, trust_remote_code: bool = False, **attrs: t.Any) -> bentoml.Model:
        """Load config, tokenizer and Flax model for ``self.model_id`` and save them with BentoML.

        Args:
            *args: Accepted but not used here.
            trust_remote_code: Accepted but not forwarded to ``from_pretrained`` here.
            **attrs: Forwarded to ``FlaxAutoModelForCausalLM.from_pretrained``.

        Returns:
            The result of ``bentoml.transformers.save_model``.
        """
        config = transformers.AutoConfig.from_pretrained(self.model_id)
        # The last element of ``self.llm_parameters`` holds the tokenizer kwargs.
        tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_id, **self.llm_parameters[-1])
        # The tokenizer's pad token id is taken from the model config.
        tokenizer.pad_token_id = config.pad_token_id
        model = transformers.FlaxAutoModelForCausalLM.from_pretrained(self.model_id, **attrs)
        return bentoml.transformers.save_model(
            self.tag,
            model,
            custom_objects={"tokenizer": tokenizer},
            labels=generate_labels(self),
        )

    def sanitize_parameters(
        self,
        prompt: str,
        max_new_tokens: int | None = None,
        temperature: float | None = None,
        top_k: int | None = None,
        num_return_sequences: int | None = None,
        repetition_penalty: float | None = None,
        use_default_prompt_template: bool = False,
        **attrs: t.Any,
    ) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
        """Build the final prompt text and the generation keyword arguments.

        Args:
            prompt: Used as the ``instruction`` value of the template, or
                returned unchanged when the template is disabled.
            max_new_tokens, temperature, top_k, num_return_sequences, repetition_penalty:
                Copied as-is into the returned generation dict.
            use_default_prompt_template: Whether to format ``prompt`` with
                ``DEFAULT_PROMPT_TEMPLATE``.
            **attrs: Candidate template variables; only keys that appear in the
                template are used.

        Returns:
            ``(prompt_text, generation_kwargs, {})``.

        Raises:
            RuntimeError: If ``instruction`` is passed as a keyword, or a
                variable required by the template is missing.
        """
        if use_default_prompt_template:
            template_variables = default_formatter.extract_template_variables(DEFAULT_PROMPT_TEMPLATE)
            prompt_variables = {k: v for k, v in attrs.items() if k in template_variables}
            if "instruction" in prompt_variables:
                raise RuntimeError("'instruction' should be passed as the first argument instead of kwargs when 'use_default_prompt_template=True'")
            try:
                prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
            except KeyError as e:
                raise RuntimeError(f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. Use 'use_default_prompt_template=False' to disable the default prompt template.") from None
        else:
            prompt_text = prompt

        generation_config = {
            "max_new_tokens": max_new_tokens,
            "temperature": temperature,
            "top_k": top_k,
            "num_return_sequences": num_return_sequences,
            "repetition_penalty": repetition_penalty,
        }
        return prompt_text, generation_config, {}

    def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **attrs: t.Any) -> str:
        """Collapse the generated sequences into a single string.

        A single result is returned as-is. Multiple results are joined with
        newlines, or rendered under a ``Generated result:`` header when
        ``self.config.format_outputs`` is truthy.
        """
        if len(generation_result) == 1:
            return generation_result[0]
        if self.config.format_outputs:
            return "Generated result:\n" + "\n -".join(generation_result)
        return "\n".join(generation_result)

    def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
        """Tokenize ``prompt`` (``return_tensors="np"``), sample from the model and decode ``.sequences``."""
        inputs = self.tokenizer(prompt, return_tensors="np")
        generation_config = self.config.model_construct_env(**attrs).to_generation_config()
        outputs = self.model.generate(**inputs, do_sample=True, generation_config=generation_config)
        return self.tokenizer.batch_decode(outputs.sequences, skip_special_tokens=True)
|
||||
|
||||
@@ -11,129 +11,38 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import annotations
|
||||
import logging
|
||||
import typing as t
|
||||
|
||||
import bentoml
|
||||
import openllm
|
||||
|
||||
from .configuration_opt import DEFAULT_PROMPT_TEMPLATE
|
||||
from ..._prompt import default_formatter
|
||||
from ...utils import generate_labels
|
||||
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
import torch
|
||||
|
||||
import transformers
|
||||
import torch, transformers
|
||||
else:
|
||||
torch = openllm.utils.LazyLoader("torch", globals(), "torch")
|
||||
transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
|
||||
|
||||
torch, transformers = openllm.utils.LazyLoader("torch", globals(), "torch"), openllm.utils.LazyLoader("transformers", globals(), "transformers")
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class OPT(openllm.LLM["transformers.OPTForCausalLM", "transformers.GPT2Tokenizer"]):
    """OPT implementation that loads its model through ``transformers.AutoModelForCausalLM``."""

    __openllm_internal__ = True

    def llm_post_init(self) -> None:
        """Set ``self.dtype`` to float16 when CUDA is available, float32 otherwise."""
        self.dtype = torch.float16 if torch.cuda.is_available() else torch.float32

    @property
    def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]:
        """Return ``(model_kwargs, tokenizer_kwargs)``.

        ``device_map`` is ``"auto"`` only when CUDA is available with more than
        one device. The tokenizer pads and truncates on the left.
        """
        has_cuda = torch.cuda.is_available()
        model_kwds = {
            "device_map": "auto" if has_cuda and torch.cuda.device_count() > 1 else None,
            "torch_dtype": torch.float16 if has_cuda else torch.float32,
        }
        return model_kwds, {"padding_side": "left", "truncation_side": "left"}

    def import_model(self, *args: t.Any, trust_remote_code: bool = False, **attrs: t.Any) -> bentoml.Model:
        """Load config, tokenizer and model for ``self.model_id`` and save them with BentoML.

        Args:
            *args: Accepted but not used here.
            trust_remote_code: Forwarded to ``AutoModelForCausalLM.from_pretrained``.
            **attrs: Forwarded to ``from_pretrained``; ``torch_dtype`` is popped
                and defaults to ``self.dtype``.

        Returns:
            The result of ``bentoml.transformers.save_model``.
        """
        _, tokenizer_attrs = self.llm_parameters
        torch_dtype = attrs.pop("torch_dtype", self.dtype)

        config = transformers.AutoConfig.from_pretrained(self.model_id)
        tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_id, **tokenizer_attrs)
        # The tokenizer's pad token id is taken from the model config.
        tokenizer.pad_token_id = config.pad_token_id
        model = transformers.AutoModelForCausalLM.from_pretrained(
            self.model_id, torch_dtype=torch_dtype, trust_remote_code=trust_remote_code, **attrs
        )
        return bentoml.transformers.save_model(
            self.tag,
            model,
            custom_objects={"tokenizer": tokenizer},
            labels=generate_labels(self),
        )

    def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.OPTForCausalLM:
        """Load the model from ``self._bentomodel.path``.

        ``torch_dtype`` is popped from ``attrs`` and defaults to ``self.dtype``.
        """
        torch_dtype = attrs.pop("torch_dtype", self.dtype)
        return transformers.AutoModelForCausalLM.from_pretrained(
            self._bentomodel.path, *args, torch_dtype=torch_dtype, **attrs
        )

    def sanitize_parameters(
        self,
        prompt: str,
        max_new_tokens: int | None = None,
        temperature: float | None = None,
        top_k: int | None = None,
        num_return_sequences: int | None = None,
        use_default_prompt_template: bool = False,
        **attrs: t.Any,
    ) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
        """Build the final prompt text and the generation keyword arguments.

        Args:
            prompt: Used as the ``instruction`` value of the template, or
                returned unchanged when the template is disabled.
            max_new_tokens, temperature, top_k, num_return_sequences:
                Copied as-is into the returned generation dict.
            use_default_prompt_template: Whether to format ``prompt`` with
                ``DEFAULT_PROMPT_TEMPLATE``.
            **attrs: Candidate template variables; only keys that appear in the
                template are used.

        Returns:
            ``(prompt_text, generation_kwargs, {})``.

        Raises:
            RuntimeError: If ``instruction`` is passed as a keyword, or a
                variable required by the template is missing.
        """
        if use_default_prompt_template:
            template_variables = default_formatter.extract_template_variables(DEFAULT_PROMPT_TEMPLATE)
            prompt_variables = {k: v for k, v in attrs.items() if k in template_variables}
            if "instruction" in prompt_variables:
                raise RuntimeError("'instruction' should be passed as the first argument instead of kwargs when 'use_default_prompt_template=True'")
            try:
                prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
            except KeyError as e:
                raise RuntimeError(f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. Use 'use_default_prompt_template=False' to disable the default prompt template.") from None
        else:
            prompt_text = prompt

        generation_config = {
            "max_new_tokens": max_new_tokens,
            "temperature": temperature,
            "top_k": top_k,
            "num_return_sequences": num_return_sequences,
        }
        return prompt_text, generation_config, {}

    def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **attrs: t.Any) -> str:
        """Collapse the generated sequences into a single string.

        A single result is returned as-is. Multiple results are joined with
        newlines, or rendered under a ``Generated result:`` header when
        ``self.config.format_outputs`` is truthy.
        """
        if len(generation_result) == 1:
            return generation_result[0]
        if self.config.format_outputs:
            return "Generated result:\n" + "\n -".join(generation_result)
        return "\n".join(generation_result)

    def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
        """Tokenize ``prompt``, sample from the model under ``torch.inference_mode`` and decode."""
        with torch.inference_mode():
            inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
            generation_config = self.config.model_construct_env(**attrs).to_generation_config()
            generated_tensors = self.model.generate(**inputs, do_sample=True, generation_config=generation_config)
            return self.tokenizer.batch_decode(generated_tensors, skip_special_tokens=True)
|
||||
|
||||
@@ -11,107 +11,36 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import annotations
|
||||
import logging
|
||||
import typing as t
|
||||
|
||||
import bentoml
|
||||
import openllm
|
||||
|
||||
from .configuration_opt import DEFAULT_PROMPT_TEMPLATE
|
||||
from ..._prompt import default_formatter
|
||||
from ...utils import generate_labels
|
||||
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
import transformers
|
||||
else:
|
||||
transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
|
||||
|
||||
|
||||
if t.TYPE_CHECKING: import transformers
|
||||
else: transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class TFOPT(openllm.LLM["transformers.TFOPTForCausalLM", "transformers.GPT2Tokenizer"]):
    """OPT implementation that loads its model through ``transformers.TFOPTForCausalLM``."""

    __openllm_internal__ = True

    @property
    def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]:
        """Return ``(model_kwargs, tokenizer_kwargs)``.

        No model kwargs are set; the tokenizer pads and truncates on the left.
        """
        return {}, {"padding_side": "left", "truncation_side": "left"}

    def import_model(self, *args: t.Any, trust_remote_code: bool = False, **attrs: t.Any) -> bentoml.Model:
        """Load config, tokenizer and TensorFlow model for ``self.model_id`` and save them with BentoML.

        Args:
            *args: Accepted but not used here.
            trust_remote_code: Forwarded to ``TFOPTForCausalLM.from_pretrained``.
            **attrs: Forwarded to ``TFOPTForCausalLM.from_pretrained``.

        Returns:
            The result of ``bentoml.transformers.save_model``.
        """
        config = transformers.AutoConfig.from_pretrained(self.model_id)
        # The last element of ``self.llm_parameters`` holds the tokenizer kwargs.
        tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_id, **self.llm_parameters[-1])
        # The tokenizer's pad token id is taken from the model config.
        tokenizer.pad_token_id = config.pad_token_id
        model = transformers.TFOPTForCausalLM.from_pretrained(
            self.model_id, trust_remote_code=trust_remote_code, **attrs
        )
        return bentoml.transformers.save_model(
            self.tag,
            model,
            custom_objects={"tokenizer": tokenizer},
            labels=generate_labels(self),
        )

    def sanitize_parameters(
        self,
        prompt: str,
        max_new_tokens: int | None = None,
        temperature: float | None = None,
        top_k: int | None = None,
        num_return_sequences: int | None = None,
        use_default_prompt_template: bool = False,
        **attrs: t.Any,
    ) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
        """Build the final prompt text and the generation keyword arguments.

        Args:
            prompt: Used as the ``instruction`` value of the template, or
                returned unchanged when the template is disabled.
            max_new_tokens, temperature, top_k, num_return_sequences:
                Copied as-is into the returned generation dict.
            use_default_prompt_template: Whether to format ``prompt`` with
                ``DEFAULT_PROMPT_TEMPLATE``.
            **attrs: Candidate template variables; only keys that appear in the
                template are used.

        Returns:
            ``(prompt_text, generation_kwargs, {})``.

        Raises:
            RuntimeError: If ``instruction`` is passed as a keyword, or a
                variable required by the template is missing.
        """
        if use_default_prompt_template:
            template_variables = default_formatter.extract_template_variables(DEFAULT_PROMPT_TEMPLATE)
            prompt_variables = {k: v for k, v in attrs.items() if k in template_variables}
            if "instruction" in prompt_variables:
                raise RuntimeError("'instruction' should be passed as the first argument instead of kwargs when 'use_default_prompt_template=True'")
            try:
                prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
            except KeyError as e:
                raise RuntimeError(f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. Use 'use_default_prompt_template=False' to disable the default prompt template.") from None
        else:
            prompt_text = prompt

        generation_config = {
            "max_new_tokens": max_new_tokens,
            "temperature": temperature,
            "top_k": top_k,
            "num_return_sequences": num_return_sequences,
        }
        return prompt_text, generation_config, {}

    def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **attrs: t.Any) -> str:
        """Collapse the generated sequences into a single string.

        A single result is returned as-is. Multiple results are joined with
        newlines, or rendered under a ``Generated result:`` header when
        ``self.config.format_outputs`` is truthy.
        """
        if len(generation_result) == 1:
            return generation_result[0]
        if self.config.format_outputs:
            return "Generated result:\n" + "\n -".join(generation_result)
        return "\n".join(generation_result)

    def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
        """Tokenize ``prompt`` (``return_tensors="tf"``), sample from the model and decode."""
        inputs = self.tokenizer(prompt, return_tensors="tf")
        generation_config = self.config.model_construct_env(**attrs).to_generation_config()
        generated_tensors = self.model.generate(**inputs, do_sample=True, generation_config=generation_config)
        return self.tokenizer.batch_decode(generated_tensors, skip_special_tokens=True)
|
||||
|
||||
@@ -11,40 +11,23 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import annotations
|
||||
import sys
|
||||
import typing as t
|
||||
|
||||
from ...exceptions import MissingDependencyError
|
||||
from ...utils import LazyModule
|
||||
from ...utils import is_torch_available
|
||||
|
||||
|
||||
# Maps each submodule to the public names it provides; passed to ``LazyModule`` below.
_import_structure: dict[str, list[str]] = {
    "configuration_stablelm": ["StableLMConfig", "START_STABLELM_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"],
}

# The torch implementation is registered only when torch is available.
try:
    if not is_torch_available():
        raise MissingDependencyError
except MissingDependencyError:
    pass
else:
    _import_structure["modeling_stablelm"] = ["StableLM"]

if t.TYPE_CHECKING:
    # Explicit re-exports (``X as X``) for static type checkers.
    from .configuration_stablelm import DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE
    from .configuration_stablelm import START_STABLELM_COMMAND_DOCSTRING as START_STABLELM_COMMAND_DOCSTRING
    from .configuration_stablelm import StableLMConfig as StableLMConfig

    try:
        if not is_torch_available():
            raise MissingDependencyError
    except MissingDependencyError:
        pass
    else:
        from .modeling_stablelm import StableLM as StableLM
else:
    # At runtime this module's entry in ``sys.modules`` is replaced by a
    # ``LazyModule`` built from ``_import_structure``; ``sys`` comes from the
    # module-level import.
    sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
|
||||
|
||||
@@ -12,10 +12,7 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from __future__ import annotations
|
||||
|
||||
import openllm
|
||||
|
||||
|
||||
class StableLMConfig(openllm.LLMConfig):
|
||||
"""StableLM-Base-Alpha is a suite of 3B and 7B parameter decoder-only language models.
|
||||
|
||||
@@ -30,7 +27,6 @@ class StableLMConfig(openllm.LLMConfig):
|
||||
and [StableLM-base's model card](https://huggingface.co/stabilityai/stablelm-base-alpha-7b)
|
||||
for more information.
|
||||
"""
|
||||
|
||||
__config__ = {
|
||||
"name_type": "lowercase",
|
||||
"url": "https://github.com/Stability-AI/StableLM",
|
||||
@@ -43,14 +39,11 @@ class StableLMConfig(openllm.LLMConfig):
|
||||
"stabilityai/stablelm-base-alpha-7b",
|
||||
],
|
||||
}
|
||||
|
||||
class GenerationConfig:
|
||||
temperature: float = 0.9
|
||||
max_new_tokens: int = 128
|
||||
top_k: int = 0
|
||||
top_p: float = 0.9
|
||||
|
||||
|
||||
START_STABLELM_COMMAND_DOCSTRING = """\
|
||||
Run a LLMServer for StableLM model.
|
||||
|
||||
@@ -70,12 +63,10 @@ or provide `--model-id` flag when running ``openllm start stablelm``:
|
||||
\b
|
||||
$ openllm start stablelm --model-id 'stabilityai/stablelm-tuned-alpha-3b'
|
||||
"""
|
||||
|
||||
SYSTEM_PROMPT = """<|SYSTEM|># StableLM Tuned (Alpha version)
|
||||
- StableLM is a helpful and harmless open-source AI language model developed by StabilityAI.
|
||||
- StableLM is excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user.
|
||||
- StableLM is more than just an information source, StableLM is also able to write poetry, short stories, and make jokes.
|
||||
- StableLM will refuse to participate in anything that could harm a human.
|
||||
"""
|
||||
|
||||
DEFAULT_PROMPT_TEMPLATE = """{system_prompt}<|USER|>{instruction}<|ASSISTANT|>"""
|
||||
|
||||
@@ -14,91 +14,27 @@
|
||||
from __future__ import annotations
|
||||
import logging
|
||||
import typing as t
|
||||
|
||||
import openllm
|
||||
|
||||
from .configuration_stablelm import DEFAULT_PROMPT_TEMPLATE
|
||||
from .configuration_stablelm import SYSTEM_PROMPT
|
||||
from ..._prompt import default_formatter
|
||||
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
import transformers # noqa
|
||||
import torch
|
||||
else:
|
||||
transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
|
||||
torch = openllm.utils.LazyLoader("torch", globals(), "torch")
|
||||
|
||||
|
||||
if t.TYPE_CHECKING: import transformers, torch
|
||||
else: transformers, torch = openllm.utils.LazyLoader("transformers", globals(), "transformers"), openllm.utils.LazyLoader("torch", globals(), "torch")
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class StableLM(openllm.LLM["transformers.GPTNeoXForCausalLM", "transformers.GPTNeoXTokenizerFast"]):
|
||||
__openllm_internal__ = True
|
||||
|
||||
def llm_post_init(self):
|
||||
self.bettertransformer = True if not torch.cuda.is_available() else False
|
||||
|
||||
def llm_post_init(self): self.bettertransformer = True if not torch.cuda.is_available() else False
|
||||
@property
|
||||
def import_kwargs(self):
|
||||
model_kwds = {"torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32}
|
||||
tokenizer_kwds: dict[str, t.Any] = {}
|
||||
return model_kwds, tokenizer_kwds
|
||||
|
||||
def sanitize_parameters(
|
||||
self,
|
||||
prompt: str,
|
||||
temperature: float | None = None,
|
||||
max_new_tokens: int | None = None,
|
||||
top_k: int | None = None,
|
||||
top_p: float | None = None,
|
||||
use_default_prompt_template: bool = False,
|
||||
**attrs: t.Any,
|
||||
) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
|
||||
def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: return {"torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32}, {}
|
||||
def sanitize_parameters(self, prompt: str, temperature: float | None = None, max_new_tokens: int | None = None, top_k: int | None = None, top_p: float | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
|
||||
if "tuned" in self._model_id and use_default_prompt_template:
|
||||
prompt_variables = {
|
||||
k: v
|
||||
for k, v in attrs.items()
|
||||
if k in default_formatter.extract_template_variables(DEFAULT_PROMPT_TEMPLATE)
|
||||
}
|
||||
if "instruction" in prompt_variables:
|
||||
raise RuntimeError(
|
||||
"'instruction' should be passed as the first argument "
|
||||
"instead of kwargs when 'use_default_prompt_template=True'"
|
||||
)
|
||||
prompt_variables = {k: v for k, v in attrs.items() if k in default_formatter.extract_template_variables(DEFAULT_PROMPT_TEMPLATE)}
|
||||
if "instruction" in prompt_variables: raise RuntimeError("'instruction' should be passed as the first argument instead of kwargs when 'use_default_prompt_template=True'")
|
||||
system_prompt = prompt_variables.pop("system_prompt", SYSTEM_PROMPT)
|
||||
prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, system_prompt=system_prompt)
|
||||
else:
|
||||
prompt_text = prompt
|
||||
|
||||
generation_config = {
|
||||
"max_new_tokens": max_new_tokens,
|
||||
"temperature": temperature,
|
||||
"top_k": top_k,
|
||||
"top_p": top_p,
|
||||
}
|
||||
|
||||
return prompt_text, generation_config, {}
|
||||
|
||||
def postprocess_generate(self, prompt: str, generation_result: list[str], **_: t.Any) -> str:
|
||||
return generation_result[0]
|
||||
|
||||
else: prompt_text = prompt
|
||||
return prompt_text, {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_k": top_k, "top_p": top_p}, {}
|
||||
def postprocess_generate(self, prompt: str, generation_result: list[str], **_: t.Any) -> str: return generation_result[0]
|
||||
def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
|
||||
from ..._generation import StopOnTokens
|
||||
|
||||
generation_kwargs = {
|
||||
"do_sample": True,
|
||||
"generation_config": self.config.model_construct_env(**attrs).to_generation_config(),
|
||||
"pad_token_id": self.tokenizer.eos_token_id,
|
||||
"stopping_criteria": transformers.StoppingCriteriaList([StopOnTokens()]),
|
||||
}
|
||||
|
||||
inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
|
||||
|
||||
with torch.inference_mode():
|
||||
if torch.cuda.is_available():
|
||||
with torch.autocast("cuda", torch.float16):
|
||||
tokens = self.model.generate(**inputs, **generation_kwargs)
|
||||
else:
|
||||
tokens = self.model.generate(**inputs, **generation_kwargs)
|
||||
return [self.tokenizer.decode(tokens[0], skip_special_tokens=True)]
|
||||
with torch.inference_mode(): return [self.tokenizer.decode(self.model.generate(**self.tokenizer(prompt, return_tensors="pt").to(self.device), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config(), pad_token_id=self.tokenizer.eos_token_id, stopping_criteria=transformers.StoppingCriteriaList([StopOnTokens()]))[0], skip_special_tokens=True)]
|
||||
|
||||
@@ -11,40 +11,23 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import annotations
|
||||
import sys
|
||||
import typing as t
|
||||
|
||||
from ...exceptions import MissingDependencyError
|
||||
from ...utils import LazyModule
|
||||
from ...utils import is_torch_available
|
||||
|
||||
|
||||
_import_structure: dict[str, list[str]] = {
|
||||
"configuration_starcoder": ["StarCoderConfig", "START_STARCODER_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"],
|
||||
}
|
||||
|
||||
_import_structure: dict[str, list[str]] = {"configuration_starcoder": ["StarCoderConfig", "START_STARCODER_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"]}
|
||||
try:
|
||||
if not is_torch_available():
|
||||
raise MissingDependencyError
|
||||
except MissingDependencyError:
|
||||
pass
|
||||
else:
|
||||
_import_structure["modeling_starcoder"] = ["StarCoder"]
|
||||
|
||||
if not is_torch_available(): raise MissingDependencyError
|
||||
except MissingDependencyError: pass
|
||||
else: _import_structure["modeling_starcoder"] = ["StarCoder"]
|
||||
if t.TYPE_CHECKING:
|
||||
from .configuration_starcoder import DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE
|
||||
from .configuration_starcoder import START_STARCODER_COMMAND_DOCSTRING as START_STARCODER_COMMAND_DOCSTRING
|
||||
from .configuration_starcoder import StarCoderConfig as StarCoderConfig
|
||||
|
||||
try:
|
||||
if not is_torch_available():
|
||||
raise MissingDependencyError
|
||||
except MissingDependencyError:
|
||||
pass
|
||||
else:
|
||||
from .modeling_starcoder import StarCoder as StarCoder
|
||||
else:
|
||||
import sys
|
||||
|
||||
sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
|
||||
if not is_torch_available(): raise MissingDependencyError
|
||||
except MissingDependencyError: pass
|
||||
else: from .modeling_starcoder import StarCoder as StarCoder
|
||||
else: sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
|
||||
|
||||
@@ -12,10 +12,7 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from __future__ import annotations
|
||||
|
||||
import openllm
|
||||
|
||||
|
||||
class StarCoderConfig(openllm.LLMConfig):
|
||||
"""The StarCoder models are 15.5B parameter models trained on 80+ programming languages from [The Stack (v1.2)](https://huggingface.co/datasets/bigcode/the-stack), with opt-out requests excluded.
|
||||
|
||||
@@ -25,7 +22,6 @@ class StarCoderConfig(openllm.LLMConfig):
|
||||
|
||||
Refer to [StarCoder's model card](https://huggingface.co/bigcode/starcoder) for more information.
|
||||
"""
|
||||
|
||||
__config__ = {
|
||||
"name_type": "lowercase",
|
||||
"requires_gpu": True,
|
||||
@@ -36,7 +32,6 @@ class StarCoderConfig(openllm.LLMConfig):
|
||||
"default_id": "bigcode/starcoder",
|
||||
"model_ids": ["bigcode/starcoder", "bigcode/starcoderbase"],
|
||||
}
|
||||
|
||||
class GenerationConfig:
|
||||
temperature: float = 0.2
|
||||
max_new_tokens: int = 256
|
||||
@@ -45,8 +40,6 @@ class StarCoderConfig(openllm.LLMConfig):
|
||||
top_p: float = 0.95
|
||||
pad_token_id: int = 49152
|
||||
repetition_penalty: float = 1.2
|
||||
|
||||
|
||||
START_STARCODER_COMMAND_DOCSTRING = """\
|
||||
Run a LLMServer for StarCoder model.
|
||||
|
||||
@@ -66,5 +59,4 @@ or provide `--model-id` flag when running ``openllm start starcoder``:
|
||||
\b
|
||||
$ openllm start starcoder --model-id 'bigcode/starcoder'
|
||||
"""
|
||||
|
||||
DEFAULT_PROMPT_TEMPLATE = """{instruction}"""
|
||||
|
||||
@@ -14,143 +14,53 @@
|
||||
from __future__ import annotations
|
||||
import logging
|
||||
import typing as t
|
||||
|
||||
import bentoml
|
||||
import openllm
|
||||
|
||||
from ...utils import generate_labels
|
||||
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
import torch
|
||||
|
||||
import transformers
|
||||
import torch, transformers
|
||||
else:
|
||||
torch = openllm.utils.LazyLoader("torch", globals(), "torch")
|
||||
transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
|
||||
|
||||
torch, transformers = openllm.utils.LazyLoader("torch", globals(), "torch"), openllm.utils.LazyLoader("transformers", globals(), "transformers")
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
FIM_PREFIX = "<fim-prefix>"
|
||||
FIM_MIDDLE = "<fim-middle>"
|
||||
FIM_SUFFIX = "<fim-suffix>"
|
||||
FIM_PAD = "<fim-pad>"
|
||||
EOD = "<|endoftext|>"
|
||||
FIM_INDICATOR = "<FILL_HERE>"
|
||||
|
||||
|
||||
FIM_PREFIX, FIM_MIDDLE, FIM_SUFFIX, FIM_PAD, EOD, FIM_INDICATOR = "<fim-prefix>", "<fim-middle>", "<fim-suffix>", "<fim-pad>", "<|endoftext|>", "<FILL_HERE>"
|
||||
class StarCoder(openllm.LLM["transformers.GPTBigCodeForCausalLM", "transformers.GPT2TokenizerFast"]):
|
||||
__openllm_internal__ = True
|
||||
|
||||
@property
|
||||
def import_kwargs(self):
|
||||
model_kwds = {
|
||||
"device_map": "auto" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None,
|
||||
"torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32,
|
||||
}
|
||||
tokenizer_kwds = {"padding_side": "left"}
|
||||
return model_kwds, tokenizer_kwds
|
||||
|
||||
def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: return {"device_map": "auto" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None, "torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32}, {"padding_side": "left"}
|
||||
def import_model(self, *args: t.Any, trust_remote_code: bool = False, **attrs: t.Any) -> bentoml.Model:
|
||||
_, tokenizer_attrs = self.llm_parameters
|
||||
|
||||
torch_dtype = attrs.pop("torch_dtype", torch.float16)
|
||||
device_map = attrs.pop("device_map", "auto")
|
||||
|
||||
tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_id, **tokenizer_attrs)
|
||||
tokenizer.add_special_tokens(
|
||||
{
|
||||
"additional_special_tokens": [EOD, FIM_PREFIX, FIM_MIDDLE, FIM_SUFFIX, FIM_PAD],
|
||||
"pad_token": EOD,
|
||||
}
|
||||
)
|
||||
|
||||
model = transformers.AutoModelForCausalLM.from_pretrained(
|
||||
self.model_id, torch_dtype=torch_dtype, device_map=device_map, **attrs
|
||||
)
|
||||
try:
|
||||
return bentoml.transformers.save_model(
|
||||
self.tag,
|
||||
model,
|
||||
custom_objects={"tokenizer": tokenizer},
|
||||
labels=generate_labels(self),
|
||||
)
|
||||
finally:
|
||||
# NOTE: We need to free the cache after saving here so that we can load it back later on.
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
def sanitize_parameters(
|
||||
self,
|
||||
prompt: str,
|
||||
temperature: float | None = None,
|
||||
top_p: float | None = None,
|
||||
max_new_tokens: int | None = None,
|
||||
repetition_penalty: float | None = None,
|
||||
**attrs: t.Any,
|
||||
) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
|
||||
fim_mode = FIM_INDICATOR in prompt
|
||||
prefix, suffix = None, None
|
||||
torch_dtype, device_map = attrs.pop("torch_dtype", torch.float16), attrs.pop("device_map", "auto")
|
||||
tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_id, **self.llm_parameters[-1])
|
||||
tokenizer.add_special_tokens({"additional_special_tokens": [EOD, FIM_PREFIX, FIM_MIDDLE, FIM_SUFFIX, FIM_PAD], "pad_token": EOD})
|
||||
model = transformers.AutoModelForCausalLM.from_pretrained(self.model_id, torch_dtype=torch_dtype, device_map=device_map, **attrs)
|
||||
try: return bentoml.transformers.save_model(self.tag, model, custom_objects={"tokenizer": tokenizer}, labels=generate_labels(self))
|
||||
finally: torch.cuda.empty_cache()
|
||||
def sanitize_parameters(self, prompt: str, temperature: float | None = None, top_p: float | None = None, max_new_tokens: int | None = None, repetition_penalty: float | None = None, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
|
||||
fim_mode, prefix, suffix = FIM_INDICATOR in prompt, None, None
|
||||
if fim_mode:
|
||||
try:
|
||||
prefix, suffix = prompt.split(FIM_INDICATOR)
|
||||
except Exception as err:
|
||||
logger.error("Error while processing prompt with FIM mode:\n", exc_info=err)
|
||||
raise ValueError(f"Only one {FIM_INDICATOR} allowed in prompt") from err
|
||||
try: prefix, suffix = prompt.split(FIM_INDICATOR)
|
||||
except Exception as err: raise ValueError(f"Only one {FIM_INDICATOR} allowed in prompt") from err
|
||||
prompt_text = f"{FIM_PREFIX}{prefix}{FIM_SUFFIX}{suffix}{FIM_MIDDLE}"
|
||||
else:
|
||||
prompt_text = prompt
|
||||
|
||||
generation_config = {
|
||||
"temperature": temperature,
|
||||
"top_p": top_p,
|
||||
"max_new_tokens": max_new_tokens,
|
||||
"repetition_penalty": repetition_penalty,
|
||||
# XXX: This value is currently a hack, need more investigate why the
|
||||
# default starcoder doesn't include the same value as santacoder EOD
|
||||
"pad_token_id": 49152,
|
||||
**attrs,
|
||||
}
|
||||
|
||||
return prompt_text, generation_config, {}
|
||||
|
||||
def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str:
|
||||
return generation_result[0]
|
||||
else: prompt_text = prompt
|
||||
# XXX: This value for pad_token_id is currently a hack, need more investigate why the
|
||||
# default starcoder doesn't include the same value as santacoder EOD
|
||||
return prompt_text, {"temperature": temperature, "top_p": top_p, "max_new_tokens": max_new_tokens, "repetition_penalty": repetition_penalty, "pad_token_id": 49152, **attrs}, {}
|
||||
|
||||
def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str: return generation_result[0]
|
||||
def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
|
||||
with torch.inference_mode():
|
||||
inputs = t.cast("torch.Tensor", self.tokenizer.encode(prompt, return_tensors="pt")).to(self.device)
|
||||
result_tensor = self.model.generate(
|
||||
inputs,
|
||||
do_sample=True,
|
||||
pad_token_id=self.tokenizer.eos_token_id,
|
||||
# eos_token_id=self.tokenizer.convert_tokens_to_ids("<|end|>"), # NOTE: this is for finetuning starcoder
|
||||
generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
|
||||
)
|
||||
# eos_token_id=self.tokenizer.convert_tokens_to_ids("<|end|>"), # NOTE: this is for finetuning starcoder
|
||||
# NOTE: support fine-tuning starcoder
|
||||
result_tensor = self.model.generate(self.tokenizer.encode(prompt, return_tensors="pt").to(self.device), do_sample=True, pad_token_id=self.tokenizer.eos_token_id, generation_config=self.config.model_construct_env(**attrs).to_generation_config())
|
||||
# TODO: We will probably want to return the tokenizer here so that we can manually process this
|
||||
# return (skip_special_tokens=False, clean_up_tokenization_spaces=False))
|
||||
return self.tokenizer.batch_decode(
|
||||
result_tensor[0],
|
||||
skip_special_tokens=True,
|
||||
clean_up_tokenization_spaces=True,
|
||||
)
|
||||
|
||||
def generate_one(
|
||||
self, prompt: str, stop: list[str], **preprocess_generate_kwds: t.Any
|
||||
) -> list[dict[t.Literal["generated_text"], str]]:
|
||||
return self.tokenizer.batch_decode(result_tensor[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
|
||||
def generate_one(self, prompt: str, stop: list[str], **preprocess_generate_kwds: t.Any) -> list[dict[t.Literal["generated_text"], str]]:
|
||||
from ..._generation import StopSequenceCriteria
|
||||
|
||||
max_new_tokens = preprocess_generate_kwds.pop("max_new_tokens", 200)
|
||||
encoded_inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
|
||||
src_len = encoded_inputs["input_ids"].shape[1]
|
||||
stopping_criteria = preprocess_generate_kwds.pop("stopping_criteria", transformers.StoppingCriteriaList([]))
|
||||
max_new_tokens, encoded_inputs = preprocess_generate_kwds.pop("max_new_tokens", 200), self.tokenizer(prompt, return_tensors="pt").to(self.device)
|
||||
src_len, stopping_criteria = encoded_inputs["input_ids"].shape[1], preprocess_generate_kwds.pop("stopping_criteria", transformers.StoppingCriteriaList([]))
|
||||
stopping_criteria.append(StopSequenceCriteria(stop, self.tokenizer))
|
||||
outputs = self.model.generate(
|
||||
encoded_inputs["input_ids"], max_new_tokens=max_new_tokens, stopping_criteria=stopping_criteria
|
||||
)
|
||||
|
||||
result = self.tokenizer.decode(outputs[0].tolist()[src_len:])
|
||||
result = self.tokenizer.decode(self.model.generate(encoded_inputs["input_ids"], max_new_tokens=max_new_tokens, stopping_criteria=stopping_criteria)[0].tolist()[src_len:])
|
||||
# Inference API returns the stop sequence
|
||||
for stop_seq in stop:
|
||||
if result.endswith(stop_seq):
|
||||
result = result[: -len(stop_seq)]
|
||||
if result.endswith(stop_seq): result = result[: -len(stop_seq)]
|
||||
return [{"generated_text": result}]
|
||||
|
||||
@@ -193,6 +193,10 @@ def import_model(
|
||||
if _tokenizer.pad_token is None:
|
||||
_tokenizer.pad_token = _tokenizer.eos_token
|
||||
|
||||
# NOTE: quick hack to set the loaded into llm object
|
||||
object.__setattr__(llm, "__llm_model__", model)
|
||||
object.__setattr__(llm, "__llm_tokenizer__", _tokenizer)
|
||||
|
||||
try:
|
||||
with bentoml.models.create(
|
||||
llm.tag,
|
||||
@@ -210,9 +214,7 @@ def import_model(
|
||||
else None,
|
||||
metadata=metadata,
|
||||
) as bentomodel:
|
||||
save_pretrained(
|
||||
llm, bentomodel.path, model=model, tokenizer=_tokenizer, safe_serialization=safe_serialisation
|
||||
)
|
||||
save_pretrained(llm, bentomodel.path, safe_serialization=safe_serialisation)
|
||||
return bentomodel
|
||||
finally:
|
||||
# NOTE: We need to free up the cache after importing the model
|
||||
@@ -296,12 +298,12 @@ def load_model(llm: openllm.LLM[M, t.Any], *decls: t.Any, **attrs: t.Any) -> M:
|
||||
or getattr(model, "is_loaded_in_4bit", False)
|
||||
or getattr(model, "is_quantized", False)
|
||||
)
|
||||
if torch.cuda.is_available() and torch.cuda.device_count() == 1 and not loaded_in_kbit:
|
||||
if torch.cuda.is_available() and not loaded_in_kbit:
|
||||
try:
|
||||
model = model.to("cuda")
|
||||
except torch.cuda.OutOfMemoryError as err:
|
||||
raise RuntimeError(
|
||||
f"Failed to fit {llm.config['model_name']} with model_id '{llm.model_id}' to CUDA.\nNote: You can try out '--quantize int8 | int4' for dynamic quantization."
|
||||
f"Failed to convert {llm.config['model_name']} with model_id '{llm.model_id}' to CUDA.\nNote: You can try out '--quantize int8 | int4' for dynamic quantization."
|
||||
) from err
|
||||
if llm.bettertransformer and llm.__llm_implementation__ == "pt" and not isinstance(model, _transformers.Pipeline):
|
||||
# BetterTransformer is currently only supported on PyTorch.
|
||||
@@ -314,27 +316,19 @@ def load_model(llm: openllm.LLM[M, t.Any], *decls: t.Any, **attrs: t.Any) -> M:
|
||||
def save_pretrained(
|
||||
llm: openllm.LLM[M, T],
|
||||
save_directory: str,
|
||||
model: M | None = None,
|
||||
tokenizer: T | None = None,
|
||||
is_main_process: bool = True,
|
||||
state_dict: DictStrAny | None = None,
|
||||
save_function: t.Callable[..., None] | None = None,
|
||||
push_to_hub: bool = False,
|
||||
max_shard_size: int | str = "10GB",
|
||||
max_shard_size: int | str = "2GB",
|
||||
safe_serialization: bool = False,
|
||||
variant: str | None = None,
|
||||
**attrs: t.Any,
|
||||
) -> None:
|
||||
"""Light wrapper around ``transformers.PreTrainedTokenizer.save_pretrained`` and ``transformers.PreTrainedModel.save_pretrained``."""
|
||||
model = first_not_none(model, default=llm.__llm_model__)
|
||||
tokenizer = first_not_none(tokenizer, default=llm.__llm_tokenizer__)
|
||||
save_function = first_not_none(save_function, default=torch.save)
|
||||
model_save_attrs, tokenizer_save_attrs = normalize_attrs_to_model_tokenizer_pair(**attrs)
|
||||
safe_serialization = safe_serialization or llm._serialisation_format == "safetensors"
|
||||
|
||||
if model is None or tokenizer is None:
|
||||
raise RuntimeError("Failed to find loaded model or tokenizer to save to local store.")
|
||||
|
||||
if llm._quantize_method == "gptq":
|
||||
if not is_autogptq_available():
|
||||
raise OpenLLMException(
|
||||
@@ -342,11 +336,11 @@ def save_pretrained(
|
||||
)
|
||||
if llm.config["model_type"] != "causal_lm":
|
||||
raise OpenLLMException(f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})")
|
||||
model.save_quantized(save_directory, use_safetensors=safe_serialization)
|
||||
elif isinstance(model, _transformers.Pipeline):
|
||||
model.save_pretrained(save_directory, safe_serialization=safe_serialization)
|
||||
llm.model.save_quantized(save_directory, use_safetensors=safe_serialization)
|
||||
elif isinstance(llm.model, _transformers.Pipeline):
|
||||
llm.model.save_pretrained(save_directory, safe_serialization=safe_serialization)
|
||||
else:
|
||||
model.save_pretrained(
|
||||
llm.model.save_pretrained(
|
||||
save_directory,
|
||||
is_main_process=is_main_process,
|
||||
state_dict=state_dict,
|
||||
@@ -357,4 +351,4 @@ def save_pretrained(
|
||||
variant=variant,
|
||||
**model_save_attrs,
|
||||
)
|
||||
tokenizer.save_pretrained(save_directory, push_to_hub=push_to_hub, **tokenizer_save_attrs)
|
||||
llm.tokenizer.save_pretrained(save_directory, push_to_hub=push_to_hub, **tokenizer_save_attrs)
|
||||
|
||||
@@ -90,6 +90,10 @@ class ClientMeta(t.Generic[T]):
|
||||
|
||||
@property
|
||||
def _hf_agent(self) -> transformers.HfAgent:
|
||||
if not self.supports_hf_agent:
|
||||
raise openllm.exceptions.OpenLLMException(
|
||||
f"{self.model_name} ({self.framework}) does not support running HF agent."
|
||||
)
|
||||
if self.__agent__ is None:
|
||||
if not openllm.utils.is_transformers_supports_agent():
|
||||
raise RuntimeError(
|
||||
@@ -130,6 +134,16 @@ class ClientMeta(t.Generic[T]):
|
||||
def configuration(self) -> dict[str, t.Any]:
|
||||
raise NotImplementedError
|
||||
|
||||
@property
|
||||
@abstractmethod
|
||||
def supports_embeddings(self) -> bool:
|
||||
raise NotImplementedError
|
||||
|
||||
@property
|
||||
@abstractmethod
|
||||
def supports_hf_agent(self) -> bool:
|
||||
raise NotImplementedError
|
||||
|
||||
@property
|
||||
def llm(self) -> openllm.LLM[t.Any, t.Any]:
|
||||
if self.__llm__ is None:
|
||||
|
||||
@@ -80,6 +80,20 @@ class GrpcClientMixin:
|
||||
except KeyError:
|
||||
raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None
|
||||
|
||||
@property
|
||||
def supports_embeddings(self) -> bool:
|
||||
try:
|
||||
return self._metadata.json.struct_value.fields["supports_embeddings"].bool_value
|
||||
except KeyError:
|
||||
raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None
|
||||
|
||||
@property
|
||||
def supports_hf_agent(self) -> bool:
|
||||
try:
|
||||
return self._metadata.json.struct_value.fields["supports_hf_agent"].bool_value
|
||||
except KeyError:
|
||||
raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None
|
||||
|
||||
def postprocess(self, result: Response | dict[str, t.Any]) -> openllm.GenerationOutput:
|
||||
if isinstance(result, dict):
|
||||
return openllm.GenerationOutput(**result)
|
||||
|
||||
@@ -77,6 +77,20 @@ class HTTPClientMixin:
|
||||
except KeyError:
|
||||
raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None
|
||||
|
||||
@property
|
||||
def supports_embeddings(self) -> bool:
|
||||
try:
|
||||
return self._metadata.get("supports_embeddings", False)
|
||||
except KeyError:
|
||||
raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None
|
||||
|
||||
@property
|
||||
def supports_hf_agent(self) -> bool:
|
||||
try:
|
||||
return self._metadata.get("supports_hf_agent", False)
|
||||
except KeyError:
|
||||
raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None
|
||||
|
||||
def postprocess(self, result: dict[str, t.Any]) -> openllm.GenerationOutput:
|
||||
return openllm.GenerationOutput(**result)
|
||||
|
||||
|
||||
27
typings/attr/__init__.pyi
generated
27
typings/attr/__init__.pyi
generated
@@ -12,6 +12,7 @@ from typing import Protocol
|
||||
from typing import Sequence
|
||||
from typing import Tuple
|
||||
from typing import Type
|
||||
from typing import TypeAlias
|
||||
from typing import TypeGuard
|
||||
from typing import TypeVar
|
||||
from typing import Union
|
||||
@@ -40,16 +41,16 @@ __copyright__: str
|
||||
_T = TypeVar("_T")
|
||||
_C = TypeVar("_C", bound=type)
|
||||
_P = ParamSpec("_P")
|
||||
_EqOrderType = Union[bool, Callable[[Any], Any]]
|
||||
_ValidatorType = Callable[[Any, Attribute[_T], _T], Any]
|
||||
_ConverterType = Callable[[Any], Any]
|
||||
_FilterType = Callable[[Attribute[_T], _T], bool]
|
||||
_ReprType = Callable[[Any], str]
|
||||
_ReprArgType = Union[bool, _ReprType]
|
||||
_OnSetAttrType = Callable[[Any, Attribute[Any], Any], Any]
|
||||
_OnSetAttrArgType = Union[_OnSetAttrType, List[_OnSetAttrType], setters._NoOpType]
|
||||
_FieldTransformer = Callable[[type, List[Attribute[Any]]], List[Attribute[Any]]]
|
||||
_ValidatorArgType = Union[_ValidatorType[_T], Sequence[_ValidatorType[_T]]]
|
||||
_EqOrderType: TypeAlias = Union[bool, Callable[[Any], Any]]
|
||||
_ValidatorType: TypeAlias = Callable[[Any, Attribute[_T], _T], Any]
|
||||
_ConverterType: TypeAlias = Callable[[Any], Any]
|
||||
_FilterType: TypeAlias = Callable[[Attribute[_T], _T], bool]
|
||||
_ReprType: TypeAlias = Callable[[Any], str]
|
||||
_ReprArgType: TypeAlias = Union[bool, _ReprType]
|
||||
_OnSetAttrType: TypeAlias = Callable[[Any, Attribute[Any], Any], Any]
|
||||
_OnSetAttrArgType: TypeAlias = Union[_OnSetAttrType, List[_OnSetAttrType], setters._NoOpType]
|
||||
_FieldTransformer: TypeAlias = Callable[[type, List[Attribute[Any]]], List[Attribute[Any]]]
|
||||
_ValidatorArgType: TypeAlias = Union[_ValidatorType[_T], Sequence[_ValidatorType[_T]]]
|
||||
|
||||
class AttrsInstance(AttrsInstance_, Protocol): ...
|
||||
|
||||
@@ -535,8 +536,10 @@ def get_run_validators() -> bool: ...
|
||||
|
||||
# aliases --
|
||||
|
||||
s = attributes = attrs
|
||||
ib = attr = attrib
|
||||
s = attrs
|
||||
attributes = attrs
|
||||
ib = attrib
|
||||
attr = attrib
|
||||
dataclass = attrs # Technically, partial(attrs, auto_attribs=True) ;)
|
||||
|
||||
class ReprProtocol(Protocol):
|
||||
|
||||
3
typings/attr/_cmp.pyi
generated
3
typings/attr/_cmp.pyi
generated
@@ -1,8 +1,9 @@
|
||||
from typing import Any
|
||||
from typing import Callable
|
||||
from typing import Optional
|
||||
from typing import TypeAlias
|
||||
|
||||
_CompareWithType = Callable[[Any, Any], bool]
|
||||
_CompareWithType: TypeAlias = Callable[[Any, Any], bool]
|
||||
|
||||
def cmp_using(
|
||||
eq: Optional[_CompareWithType] = ...,
|
||||
|
||||
2
typings/attr/_compat.pyi
generated
2
typings/attr/_compat.pyi
generated
@@ -1,5 +1,5 @@
|
||||
from typing import Any
|
||||
import threading
|
||||
from typing import Any
|
||||
|
||||
def set_closure_cell(cell: Any, value: Any) -> None: ...
|
||||
|
||||
|
||||
2
typings/attr/_make.pyi
generated
2
typings/attr/_make.pyi
generated
@@ -1,4 +1,4 @@
|
||||
from . import _CountingAttr as _CountingAttr
|
||||
from . import _make_repr as _make_repr
|
||||
from . import _make_init as _make_init
|
||||
from . import _make_repr as _make_repr
|
||||
from . import _transform_attrs as _transform_attrs
|
||||
|
||||
2
typings/click_option_group/_core.pyi
generated
2
typings/click_option_group/_core.pyi
generated
@@ -15,7 +15,7 @@ import click
|
||||
|
||||
_R = TypeVar("_R")
|
||||
_T = TypeVar("_T")
|
||||
AnyCallable = Callable[..., Any]
|
||||
AnyCallable: TypeAlias = Callable[..., Any]
|
||||
Decorator: TypeAlias = Callable[[_T], _T]
|
||||
_FC = TypeVar("_FC", bound=Union[AnyCallable, click.Command])
|
||||
|
||||
|
||||
3
typings/deepmerge/merger.pyi
generated
3
typings/deepmerge/merger.pyi
generated
@@ -2,6 +2,7 @@ from typing import Any
|
||||
from typing import Dict
|
||||
from typing import List
|
||||
from typing import Tuple
|
||||
from typing import TypeAlias
|
||||
from typing import Union
|
||||
|
||||
from .strategy.core import StrategyList
|
||||
@@ -9,7 +10,7 @@ from .strategy.dict import DictStrategies
|
||||
from .strategy.list import ListStrategies
|
||||
from .strategy.set import SetStrategies
|
||||
|
||||
ConfigDictType = Dict[str, Any]
|
||||
ConfigDictType: TypeAlias = Dict[str, Any]
|
||||
|
||||
class Merger:
|
||||
PROVIDED_TYPE_STRATEGIES: Dict[type, Union[ListStrategies, DictStrategies, SetStrategies]] = ...
|
||||
|
||||
3
typings/deepmerge/strategy/core.pyi
generated
3
typings/deepmerge/strategy/core.pyi
generated
@@ -2,9 +2,10 @@ from typing import Any
|
||||
from typing import Callable
|
||||
from typing import List
|
||||
from typing import Optional
|
||||
from typing import TypeAlias
|
||||
from typing import Union
|
||||
|
||||
_StringOrFunction = Union[str, Callable[..., Any]]
|
||||
_StringOrFunction: TypeAlias = Union[str, Callable[..., Any]]
|
||||
STRATEGY_END: object = ...
|
||||
|
||||
class StrategyList:
|
||||
|
||||
2
typings/jupytext/config.pyi
generated
2
typings/jupytext/config.pyi
generated
@@ -1,6 +1,6 @@
|
||||
from collections.abc import Generator
|
||||
from typing import Any
|
||||
from typing import Dict
|
||||
from collections.abc import Generator
|
||||
|
||||
from _typeshed import Incomplete
|
||||
|
||||
|
||||
1
typings/jupytext/formats.pyi
generated
1
typings/jupytext/formats.pyi
generated
@@ -1,4 +1,5 @@
|
||||
from typing import Any
|
||||
|
||||
from _typeshed import Incomplete
|
||||
|
||||
class JupytextFormatError(ValueError): ...
|
||||
|
||||
Reference in New Issue
Block a user