Mirror of https://github.com/bentoml/OpenLLM.git
infra: add structure and cleanup separation of tokenizer
Since tokenizers are relatively light, every default LLM bundles its tokenizer with itself. We may move the tokenizer into its own runner in the future.

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
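For illustration, a minimal sketch of what a service looks like once the tokenizer is bundled with the runnable (the wiring mirrors the service_flan_t5.py hunk further down; treat it as a sketch, not the exact file):

    import bentoml
    import openllm

    # ATTACH_TOKENIZER = True means the runnable carries its own tokenizer,
    # so the service only needs the model runner:
    model_runner = openllm.AutoLLM.create_runner("flan-t5")
    svc = bentoml.Service(name=openllm.utils.generate_service_name(model_runner), runners=[model_runner])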
@@ -34,7 +34,7 @@ _import_structure = {
     "cli": [],
     "configuration_utils": ["LLMConfig"],
     "exceptions": [],
-    "runner_utils": ["LLMRunner", "LLMRunnable", "generate_tokenizer_runner"],
+    "runner_utils": ["LLMRunner", "LLMRunnable"],
     "schema": ["PromptTemplate"],
     "server_utils": ["start", "start_grpc"],
     "types": [],
@@ -42,13 +42,7 @@ _import_structure = {
     "models": [],
     "client": [],
     # NOTE: models
-    "models.auto": [
-        "AutoConfig",
-        "CONFIG_MAPPING",
-        "AutoTokenizer",
-        "TOKENIZER_MAPPING",
-        "TOKENIZER_MAPPING_NAMES",
-    ],
+    "models.auto": ["AutoConfig", "CONFIG_MAPPING"],
     "models.flan_t5": ["FlanT5Config", "START_FLAN_T5_COMMAND_DOCSTRING"],
 }
 
@@ -58,17 +52,8 @@ try:
 except MissingDependencyError:
     pass
 else:
-    _import_structure["models.flan_t5"].extend(["FlanT5", "FlanT5WithTokenizer", "FlanT5Tokenizer"])
-    _import_structure["models.auto"].extend(
-        [
-            "AutoLLM",
-            "AutoLLMWithTokenizer",
-            "MODEL_MAPPING_NAMES",
-            "MODEL_WITH_TOKENIZER_MAPPING_NAMES",
-            "MODEL_MAPPING",
-            "MODEL_WITH_TOKENIZER_MAPPING",
-        ]
-    )
+    _import_structure["models.flan_t5"].extend(["FlanT5"])
+    _import_structure["models.auto"].extend(["AutoLLM", "MODEL_MAPPING_NAMES", "MODEL_MAPPING"])
 
 try:
     if not imports.is_flax_available():
@@ -76,17 +61,8 @@ try:
 except MissingDependencyError:
     pass
 else:
-    _import_structure["models.flan_t5"].extend(["FlaxFlanT5", "FlaxFlanT5WithTokenizer"])
-    _import_structure["models.auto"].extend(
-        [
-            "AutoFlaxLLM",
-            "AutoFlaxLLMWithTokenizer",
-            "MODEL_FLAX_MAPPING_NAMES",
-            "MODEL_FLAX_WITH_TOKENIZER_MAPPING_NAMES",
-            "MODEL_FLAX_MAPPING",
-            "MODEL_FLAX_WITH_TOKENIZER_MAPPING",
-        ]
-    )
+    _import_structure["models.flan_t5"].extend(["FlaxFlanT5"])
+    _import_structure["models.auto"].extend(["AutoFlaxLLM", "MODEL_FLAX_MAPPING_NAMES", "MODEL_FLAX_MAPPING"])
 
 try:
     if not imports.is_tf_available():
@@ -94,17 +70,8 @@ try:
 except MissingDependencyError:
     pass
 else:
-    _import_structure["models.flan_t5"].extend(["TFFlanT5", "TFFlanT5WithTokenizer"])
-    _import_structure["models.auto"].extend(
-        [
-            "AutoTFLLM",
-            "AutoTFLLMWithTokenizer",
-            "MODEL_TF_MAPPING_NAMES",
-            "MODEL_TF_WITH_TOKENIZER_MAPPING_NAMES",
-            "MODEL_TF_MAPPING",
-            "MODEL_TF_WITH_TOKENIZER_MAPPING",
-        ]
-    )
+    _import_structure["models.flan_t5"].extend(["TFFlanT5"])
+    _import_structure["models.auto"].extend(["AutoTFLLM", "MODEL_TF_MAPPING_NAMES", "MODEL_TF_MAPPING"])
 
 
 # declaration for OpenLLM-related modules
@@ -123,17 +90,12 @@ if t.TYPE_CHECKING:
     # Specific types import
     from .configuration_utils import LLMConfig as LLMConfig
     from .models.auto import CONFIG_MAPPING as CONFIG_MAPPING
-    from .models.auto import TOKENIZER_MAPPING as TOKENIZER_MAPPING
-    from .models.auto import TOKENIZER_MAPPING_NAMES as TOKENIZER_MAPPING_NAMES
     from .models.auto import AutoConfig as AutoConfig
-    from .models.auto import AutoTokenizer as AutoTokenizer
     from .models.flan_t5 import \
         START_FLAN_T5_COMMAND_DOCSTRING as START_FLAN_T5_COMMAND_DOCSTRING
     from .models.flan_t5 import FlanT5Config as FlanT5Config
     from .runner_utils import LLMRunnable as LLMRunnable
     from .runner_utils import LLMRunner as LLMRunner
-    from .runner_utils import \
-        generate_tokenizer_runner as generate_tokenizer_runner
     from .schema import PromptTemplate as PromptTemplate
     from .server_utils import start as start
     from .server_utils import start_grpc as start_grpc
@@ -146,16 +108,8 @@ if t.TYPE_CHECKING:
     else:
         from .models.auto import MODEL_MAPPING as MODEL_MAPPING
         from .models.auto import MODEL_MAPPING_NAMES as MODEL_MAPPING_NAMES
-        from .models.auto import \
-            MODEL_WITH_TOKENIZER_MAPPING as MODEL_WITH_TOKENIZER_MAPPING
-        from .models.auto import \
-            MODEL_WITH_TOKENIZER_MAPPING_NAMES as \
-            MODEL_WITH_TOKENIZER_MAPPING_NAMES
         from .models.auto import AutoLLM as AutoLLM
-        from .models.auto import AutoLLMWithTokenizer as AutoLLMWithTokenizer
         from .models.flan_t5 import FlanT5 as FlanT5
-        from .models.flan_t5 import FlanT5Tokenizer as FlanT5Tokenizer
-        from .models.flan_t5 import FlanT5WithTokenizer as FlanT5WithTokenizer
 
     try:
         if not imports.is_flax_available():
@@ -166,18 +120,8 @@ if t.TYPE_CHECKING:
         from .models.auto import MODEL_FLAX_MAPPING as MODEL_FLAX_MAPPING
         from .models.auto import \
             MODEL_FLAX_MAPPING_NAMES as MODEL_FLAX_MAPPING_NAMES
-        from .models.auto import \
-            MODEL_FLAX_WITH_TOKENIZER_MAPPING as \
-            MODEL_FLAX_WITH_TOKENIZER_MAPPING
-        from .models.auto import \
-            MODEL_FLAX_WITH_TOKENIZER_MAPPING_NAMES as \
-            MODEL_FLAX_WITH_TOKENIZER_MAPPING_NAMES
         from .models.auto import AutoFlaxLLM as AutoFlaxLLM
-        from .models.auto import \
-            AutoFlaxLLMWithTokenizer as AutoFlaxLLMWithTokenizer
         from .models.flan_t5 import FlaxFlanT5 as FlaxFlanT5
-        from .models.flan_t5 import \
-            FlaxFlanT5WithTokenizer as FlaxFlanT5WithTokenizer
 
     try:
         if not imports.is_tf_available():
@@ -188,17 +132,8 @@ if t.TYPE_CHECKING:
         from .models.auto import MODEL_TF_MAPPING as MODEL_TF_MAPPING
         from .models.auto import \
             MODEL_TF_MAPPING_NAMES as MODEL_TF_MAPPING_NAMES
-        from .models.auto import \
-            MODEL_TF_WITH_TOKENIZER_MAPPING as MODEL_TF_WITH_TOKENIZER_MAPPING
-        from .models.auto import \
-            MODEL_TF_WITH_TOKENIZER_MAPPING_NAMES as \
-            MODEL_TF_WITH_TOKENIZER_MAPPING_NAMES
         from .models.auto import AutoTFLLM as AutoTFLLM
-        from .models.auto import \
-            AutoTFLLMWithTokenizer as AutoTFLLMWithTokenizer
         from .models.flan_t5 import TFFlanT5 as TFFlanT5
-        from .models.flan_t5 import \
-            TFFlanT5WithTokenizer as TFFlanT5WithTokenizer
 
 else:
     import sys
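All of the __init__.py hunks above maintain a HuggingFace-style lazy-import layout: public names are declared in _import_structure, imported only for type checkers, and resolved at runtime by swapping the module in sys.modules. A minimal sketch of that pattern, assuming a LazyModule helper like the one HuggingFace uses (the helper itself is not shown in this diff):

    import sys
    import typing as t

    _import_structure = {"models.auto": ["AutoConfig", "CONFIG_MAPPING"]}

    if t.TYPE_CHECKING:
        from .models.auto import AutoConfig as AutoConfig
    else:
        # replace this module with a proxy that imports names on first access
        sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure)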
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-"""This module is derived from HuggingFace's AutoConfig, Tokenizer, AutoModel, etc."""
+"""This module is derived from HuggingFace's AutoConfig, AutoModel, etc."""
 
 from __future__ import annotations
 
@@ -25,7 +25,6 @@ from ...utils import import_utils_shim as imports
 
 _import_structure = {
     "configuration_auto": ["AutoConfig", "CONFIG_MAPPING", "CONFIG_MAPPING_NAMES"],
-    "tokenization_auto": ["AutoTokenizer", "TOKENIZER_MAPPING", "TOKENIZER_MAPPING_NAMES"],
 }
 
 try:
@@ -34,14 +33,7 @@ try:
 except openllm.exceptions.MissingDependencyError:
     pass
 else:
-    _import_structure["modeling_auto"] = [
-        "AutoLLM",
-        "AutoLLMWithTokenizer",
-        "MODEL_MAPPING_NAMES",
-        "MODEL_WITH_TOKENIZER_MAPPING_NAMES",
-        "MODEL_MAPPING",
-        "MODEL_WITH_TOKENIZER_MAPPING",
-    ]
+    _import_structure["modeling_auto"] = ["AutoLLM", "MODEL_MAPPING_NAMES", "MODEL_MAPPING"]
 
 try:
     if not imports.is_flax_available():
@@ -49,14 +41,7 @@ try:
 except openllm.exceptions.MissingDependencyError:
     pass
 else:
-    _import_structure["modeling_flax_auto"] = [
-        "AutoFlaxLLM",
-        "AutoFlaxLLMWithTokenizer",
-        "MODEL_FLAX_MAPPING_NAMES",
-        "MODEL_FLAX_WITH_TOKENIZER_MAPPING_NAMES",
-        "MODEL_FLAX_MAPPING",
-        "MODEL_FLAX_WITH_TOKENIZER_MAPPING",
-    ]
+    _import_structure["modeling_flax_auto"] = ["AutoFlaxLLM", "MODEL_FLAX_MAPPING_NAMES", "MODEL_FLAX_MAPPING"]
 
 try:
     if not imports.is_tf_available():
@@ -64,24 +49,13 @@ try:
 except openllm.exceptions.MissingDependencyError:
     pass
 else:
-    _import_structure["modeling_tf_auto"] = [
-        "AutoTFLLM",
-        "AutoTFLLMWithTokenizer",
-        "MODEL_TF_MAPPING_NAMES",
-        "MODEL_TF_WITH_TOKENIZER_MAPPING_NAMES",
-        "MODEL_TF_MAPPING",
-        "MODEL_TF_WITH_TOKENIZER_MAPPING",
-    ]
+    _import_structure["modeling_tf_auto"] = ["AutoTFLLM", "MODEL_TF_MAPPING_NAMES", "MODEL_TF_MAPPING"]
 
 if t.TYPE_CHECKING:
     from .configuration_auto import CONFIG_MAPPING as CONFIG_MAPPING
     from .configuration_auto import \
         CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES
     from .configuration_auto import AutoConfig as AutoConfig
-    from .tokenization_auto import TOKENIZER_MAPPING as TOKENIZER_MAPPING
-    from .tokenization_auto import \
-        TOKENIZER_MAPPING_NAMES as TOKENIZER_MAPPING_NAMES
-    from .tokenization_auto import AutoTokenizer as AutoTokenizer
 
     try:
         if not imports.is_torch_available():
@@ -91,13 +65,7 @@ if t.TYPE_CHECKING:
     else:
         from .modeling_auto import MODEL_MAPPING as MODEL_MAPPING
         from .modeling_auto import MODEL_MAPPING_NAMES as MODEL_MAPPING_NAMES
-        from .modeling_auto import \
-            MODEL_WITH_TOKENIZER_MAPPING as MODEL_WITH_TOKENIZER_MAPPING
-        from .modeling_auto import \
-            MODEL_WITH_TOKENIZER_MAPPING_NAMES as \
-            MODEL_WITH_TOKENIZER_MAPPING_NAMES
         from .modeling_auto import AutoLLM as AutoLLM
-        from .modeling_auto import AutoLLMWithTokenizer as AutoLLMWithTokenizer
 
     try:
         if not imports.is_flax_available():
@@ -109,15 +77,7 @@ if t.TYPE_CHECKING:
             MODEL_FLAX_MAPPING as MODEL_FLAX_MAPPING
         from .modeling_flax_auto import \
             MODEL_FLAX_MAPPING_NAMES as MODEL_FLAX_MAPPING_NAMES
-        from .modeling_flax_auto import \
-            MODEL_FLAX_WITH_TOKENIZER_MAPPING as \
-            MODEL_FLAX_WITH_TOKENIZER_MAPPING
-        from .modeling_flax_auto import \
-            MODEL_FLAX_WITH_TOKENIZER_MAPPING_NAMES as \
-            MODEL_FLAX_WITH_TOKENIZER_MAPPING_NAMES
         from .modeling_flax_auto import AutoFlaxLLM as AutoFlaxLLM
-        from .modeling_flax_auto import \
-            AutoFlaxLLMWithTokenizer as AutoFlaxLLMWithTokenizer
 
     try:
         if not imports.is_tf_available():
@@ -128,14 +88,7 @@ if t.TYPE_CHECKING:
         from .modeling_tf_auto import MODEL_TF_MAPPING as MODEL_TF_MAPPING
         from .modeling_tf_auto import \
             MODEL_TF_MAPPING_NAMES as MODEL_TF_MAPPING_NAMES
-        from .modeling_tf_auto import \
-            MODEL_TF_WITH_TOKENIZER_MAPPING as MODEL_TF_WITH_TOKENIZER_MAPPING
-        from .modeling_tf_auto import \
-            MODEL_TF_WITH_TOKENIZER_MAPPING_NAMES as \
-            MODEL_TF_WITH_TOKENIZER_MAPPING_NAMES
         from .modeling_tf_auto import AutoTFLLM as AutoTFLLM
-        from .modeling_tf_auto import \
-            AutoTFLLMWithTokenizer as AutoTFLLMWithTokenizer
 else:
     import sys
 
@@ -23,7 +23,8 @@ import openllm
 
 from .configuration_auto import AutoConfig
 
-def _get_runnable_class(config: openllm.LLMConfig, runnable_mapping: _LazyAutoMapping) -> type[openllm.LLMRunnable[t.Any, t.Any]]:
+
+def _get_runnable_class(config: openllm.LLMConfig, runnable_mapping: _LazyAutoMapping) -> type[openllm.LLMRunnable]:
     supported_runnables = runnable_mapping[type(config)]
     if not isinstance(supported_runnables, (list, tuple)):
         return supported_runnables
@@ -72,7 +73,7 @@ class _BaseAutoRunnerFactory:
         )
 
     @classmethod
-    def register(cls, config_class: type[openllm.LLMConfig], runnable_class: type[openllm.LLMRunnable[t.Any, t.Any]]):
+    def register(cls, config_class: type[openllm.LLMConfig], runnable_class: type[openllm.LLMRunnable]):
         """
         Register a new model for this class.
 
@@ -21,16 +21,8 @@ from .factory import _BaseAutoRunnerFactory, _LazyAutoMapping
 
 MODEL_MAPPING_NAMES = OrderedDict([("flan_t5", "FlanT5")])
 
-MODEL_WITH_TOKENIZER_MAPPING_NAMES = OrderedDict([("flan_t5", "FlanT5WithTokenizer")])
-
 MODEL_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_MAPPING_NAMES)
 
-MODEL_WITH_TOKENIZER_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_WITH_TOKENIZER_MAPPING_NAMES)
-
 
 class AutoLLM(_BaseAutoRunnerFactory):
     _model_mapping = MODEL_MAPPING
-
-
-class AutoLLMWithTokenizer(_BaseAutoRunnerFactory):
-    _model_mapping = MODEL_WITH_TOKENIZER_MAPPING
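These OrderedDict mappings feed _LazyAutoMapping, which AutoLLM uses to resolve a runnable class from a config instance; roughly, per the _get_runnable_class hunk in factory.py above:

    # simplified from factory.py; error handling elided
    def resolve(config: openllm.LLMConfig) -> type[openllm.LLMRunnable]:
        supported = MODEL_MAPPING[type(config)]  # e.g. FlanT5Config -> FlanT5
        if not isinstance(supported, (list, tuple)):
            return supported
        return supported[0]  # assumption: first candidate wins when several are registered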
@@ -21,16 +21,8 @@ from .factory import _BaseAutoRunnerFactory, _LazyAutoMapping
 
 MODEL_FLAX_MAPPING_NAMES = OrderedDict([("flan_t5", "FlaxFlanT5")])
 
-MODEL_FLAX_WITH_TOKENIZER_MAPPING_NAMES = OrderedDict([("flan_t5", "FlaxFlanT5WithTokenizer")])
-
 MODEL_FLAX_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FLAX_MAPPING_NAMES)
 
-MODEL_FLAX_WITH_TOKENIZER_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FLAX_WITH_TOKENIZER_MAPPING_NAMES)
-
 
 class AutoFlaxLLM(_BaseAutoRunnerFactory):
     _model_mapping = MODEL_FLAX_MAPPING
-
-
-class AutoFlaxLLMWithTokenizer(_BaseAutoRunnerFactory):
-    _model_mapping = MODEL_FLAX_WITH_TOKENIZER_MAPPING
@@ -21,16 +21,8 @@ from .factory import _BaseAutoRunnerFactory, _LazyAutoMapping
 
 MODEL_TF_MAPPING_NAMES = OrderedDict([("flan_t5", "TFFlanT5")])
 
-MODEL_TF_WITH_TOKENIZER_MAPPING_NAMES = OrderedDict([("flan_t5", "FlaxFlanT5WithTokenizer")])
-
-MODEL_TF_MAPPING = _LazyAutoMapping( CONFIG_MAPPING_NAMES, MODEL_TF_MAPPING_NAMES)
-
-MODEL_TF_WITH_TOKENIZER_MAPPING = _LazyAutoMapping( CONFIG_MAPPING_NAMES, MODEL_TF_WITH_TOKENIZER_MAPPING_NAMES)
+MODEL_TF_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_TF_MAPPING_NAMES)
 
 
 class AutoTFLLM(_BaseAutoRunnerFactory):
     _model_mapping = MODEL_TF_MAPPING
-
-
-class AutoTFLLMWithTokenizer(_BaseAutoRunnerFactory):
-    _model_mapping = MODEL_TF_WITH_TOKENIZER_MAPPING
src/openllm/models/auto/tokenization_auto.py (deleted file)
@@ -1,46 +0,0 @@
-# Copyright 2023 BentoML Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import annotations
-
-import typing as t
-from collections import OrderedDict
-
-import openllm
-
-from .configuration_auto import _LazyConfigMapping
-
-TOKENIZER_MAPPING_NAMES = OrderedDict([("flan_t5", "FlanT5Tokenizer")])
-
-TOKENIZER_MAPPING = _LazyConfigMapping(TOKENIZER_MAPPING_NAMES)
-
-
-class AutoTokenizer:
-    def __init__(self):
-        raise EnvironmentError(
-            "This class should not be initialized directly. Instead use 'Tokenizer.create_runner' instead"
-        )
-
-    @classmethod
-    def create_runner(cls, model_name: str, pretrained_or_path: str | None = None, **kwargs: t.Any):
-        model_name = openllm.utils.kebab_to_snake_case(model_name)
-        if model_name in TOKENIZER_MAPPING:
-            tokenizer_class = TOKENIZER_MAPPING[model_name]
-            if pretrained_or_path is None:
-                pretrained_or_path = openllm.utils.get_pretrained_env(tokenizer_class.start_model_name)
-            return tokenizer_class(pretrained_or_path, **kwargs)
-        raise ValueError(
-            f"Unrecognized model {model_name} to build an Tokenizer.\n"
-            f"Model type should be one of {', '.join(TOKENIZER_MAPPING.keys())}."
-        )
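With tokenization_auto.py deleted, tokenizer access moves onto the runnable itself: the tokenizer saved next to the model is loaded lazily through a property. A sketch of that access path (the custom_objects lookup is an assumption based on import_model, which saves custom_objects={"tokenizer": tokenizer}):

    @property
    def tokenizer(self) -> t.Any:
        # load the tokenizer bundled with the saved bento model on first use
        if self._tokenizer is None:
            self._tokenizer = self._bentomodel.custom_objects["tokenizer"]  # assumed lookup
        return self._tokenizer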
src/openllm/models/chatglm/__init__.py (new file, 1 line)
@@ -0,0 +1 @@
+raise NotImplementedError("This module is not implemented yet.")
src/openllm/models/dolly_v2/__init__.py (new file, 1 line)
@@ -0,0 +1 @@
+raise NotImplementedError("This module is not implemented yet.")
@@ -21,7 +21,7 @@ from openllm.utils import import_utils_shim as imports
 
 _import_structure = {
     "configuration_flan_t5": ["FlanT5Config", "START_FLAN_T5_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"],
-    "service_flan_t5": ["svc", "model_runner", "tokenizer_runner", "generate"],
+    "service_flan_t5": ["svc", "model_runner", "generate"],
 }
 
 try:
@@ -30,7 +30,7 @@ try:
 except openllm.exceptions.MissingDependencyError:
     pass
 else:
-    _import_structure["modeling_flan_t5"] = ["FlanT5", "FlanT5WithTokenizer", "FlanT5Tokenizer"]
+    _import_structure["modeling_flan_t5"] = ["FlanT5"]
 
 try:
     if not imports.is_flax_available():
@@ -38,7 +38,7 @@ try:
 except openllm.exceptions.MissingDependencyError:
     pass
 else:
-    _import_structure["modeling_flax_flan_t5"] = ["FlaxFlanT5", "FlaxFlanT5WithTokenizer"]
+    _import_structure["modeling_flax_flan_t5"] = ["FlaxFlanT5"]
 
 try:
     if not imports.is_tf_available():
@@ -46,7 +46,7 @@ try:
 except openllm.exceptions.MissingDependencyError:
     pass
 else:
-    _import_structure["modeling_flax_flan_t5"] = ["TFFlanT5", "TFFlanT5WithTokenizer"]
+    _import_structure["modeling_flax_flan_t5"] = ["TFFlanT5"]
 
 
 if t.TYPE_CHECKING:
@@ -55,6 +55,7 @@ if t.TYPE_CHECKING:
     from .configuration_flan_t5 import \
         START_FLAN_T5_COMMAND_DOCSTRING as START_FLAN_T5_COMMAND_DOCSTRING
     from .configuration_flan_t5 import FlanT5Config as FlanT5Config
+    from .service_flan_t5 import svc as svc
 
     try:
         if not imports.is_torch_available():
@@ -63,9 +64,6 @@ if t.TYPE_CHECKING:
         pass
     else:
         from .modeling_flan_t5 import FlanT5 as FlanT5
-        from .modeling_flan_t5 import FlanT5Tokenizer as FlanT5Tokenizer
-        from .modeling_flan_t5 import \
-            FlanT5WithTokenizer as FlanT5WithTokenizer
 
     try:
         if not imports.is_flax_available():
@@ -74,8 +72,6 @@ if t.TYPE_CHECKING:
         pass
     else:
         from .modeling_flax_flan_t5 import FlaxFlanT5 as FlaxFlanT5
-        from .modeling_flax_flan_t5 import \
-            FlaxFlanT5WithTokenizer as FlaxFlanT5WithTokenizer
 
     try:
         if not imports.is_tf_available():
@@ -84,8 +80,6 @@ if t.TYPE_CHECKING:
         pass
     else:
         from .modeling_tf_flan_t5 import TFFlanT5 as TFFlanT5
-        from .modeling_tf_flan_t5 import \
-            TFFlanT5WithTokenizer as TFFlanT5WithTokenizer
 
 else:
     import sys
@@ -17,8 +17,7 @@ import typing as t
 
 import openllm
 
-from ...runner_utils import (LLMRunnable, assign_start_model_name,
-                             generate_tokenizer_runner)
+from ...runner_utils import LLMRunnable
 from .configuration_flan_t5 import FlanT5Config
 
 if t.TYPE_CHECKING:
@@ -62,37 +61,11 @@ def import_model(
     return bentoml.transformers.save_model(str(tag), model, custom_objects={"tokenizer": tokenizer})
 
 
-def _FlanT5Tokenizer(
-    pretrained_or_path: str | None = None, embedded: bool = True, **kwargs: t.Any
-) -> openllm.types.TokenizerRunner:
-    """Get the runner for the tokenizer.
-
-    Args:
-        model_name: The name of the FLAN-T5 model to import.
-        embedded: Whether to use the embedded runner or not.
-        **kwargs: Additional kwargs to pass to the ``transformers.AutoTokenizer`` constructors.
-
-    Returns:
-        The runner for the tokenizer.
-    """
-    if pretrained_or_path is None:
-        pretrained_or_path = FlanT5.default_model
-
-    return generate_tokenizer_runner(
-        import_model(pretrained_or_path, **kwargs).custom_objects["tokenizer"], embedded=embedded
-    )
-
-
-FlanT5Tokenizer = assign_start_model_name("flan-t5")(_FlanT5Tokenizer)
-
-
-class FlanT5(
-    LLMRunnable[transformers.T5ForConditionalGeneration, transformers.T5TokenizerFast], start_model_name="flan-t5"
-):
+class FlanT5(LLMRunnable, start_model_name="flan-t5"):
     default_model: str = "google/flan-t5-large"
     config_class = FlanT5Config
 
-    ATTACH_TOKENIZER = False
+    ATTACH_TOKENIZER = True
 
     _llm_config: FlanT5Config
@@ -106,9 +79,10 @@ class FlanT5(
         "google/flan-t5-xxl",
     ]
 
+    @torch.inference_mode()
     def _generate(
         self,
-        input_ids: torch.Tensor,
+        prompt: str,
         max_length: int | None = None,
         do_sample: bool = True,
         temperature: float | None = None,
@@ -117,7 +91,9 @@ class FlanT5(
         repetition_penalty: float | None = None,
         **kwargs: t.Any,
     ) -> torch.Tensor:
-        return self.model.generate(
+        input_ids: torch.Tensor = self.tokenizer(prompt, return_tensors="pt").input_ids
+        input_ids = input_ids.to(self.device)
+        outputs = self.model.generate(
             input_ids,
             max_length=max_length if max_length is not None else self._llm_config.max_length,
             do_sample=do_sample,
@@ -129,15 +105,4 @@ class FlanT5(
             else self._llm_config.repetition_penalty,
             **kwargs,
         )
-
-
-class FlanT5WithTokenizer(FlanT5, start_model_name="flan-t5"):
-    default_model: str = "google/flan-t5-large"
-
-    ATTACH_TOKENIZER = True
-
-    def _generate(self, prompt: str, **kwargs: t.Any) -> list[str]:
-        input_ids: torch.Tensor = self.tokenizer(prompt, return_tensors="pt").input_ids
-        input_ids = input_ids.to(self.device)
-        outputs = super()._generate(input_ids, **kwargs)
-        return self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
+        return self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
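Pieced together from the hunks above, the PyTorch FlanT5._generate now owns the whole prompt-to-text path that FlanT5WithTokenizer used to provide; a condensed sketch (generation kwargs elided):

    @torch.inference_mode()
    def _generate(self, prompt: str, **kwargs: t.Any) -> list[str]:
        # tokenize the prompt, generate, then decode back to text
        input_ids: torch.Tensor = self.tokenizer(prompt, return_tensors="pt").input_ids
        input_ids = input_ids.to(self.device)
        outputs = self.model.generate(input_ids, **kwargs)
        return self.tokenizer.batch_decode(outputs, skip_special_tokens=True)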
@@ -61,13 +61,11 @@ def import_model(
     return bentoml.transformers.save_model(str(tag), model, custom_objects={"tokenizer": tokenizer})
 
 
-class FlaxFlanT5(
-    LLMRunnable[transformers.FlaxT5ForConditionalGeneration, transformers.T5TokenizerFast], start_model_name="flan-t5"
-):
+class FlaxFlanT5(LLMRunnable, start_model_name="flan-t5"):
     default_model: str = "google/flan-t5-large"
     config_class = FlanT5Config
 
-    ATTACH_TOKENIZER = False
+    ATTACH_TOKENIZER = True
 
     _llm_config: FlanT5Config
@@ -81,7 +79,7 @@ class FlaxFlanT5(
 
     def _generate(
         self,
-        input_ids: jnp.ndarray,
+        prompt: str,
         max_length: int | None = None,
         do_sample: bool = True,
         temperature: float | None = None,
@@ -90,7 +88,8 @@ class FlaxFlanT5(
         repetition_penalty: float | None = None,
         **kwargs: t.Any,
     ) -> jnp.ndarray:
-        return self.model.generate(
+        input_ids = self.tokenizer(prompt, return_tensors="np")["input_ids"]
+        outputs = self.model.generate(
             input_ids,
             max_length=max_length if max_length is not None else self._llm_config.max_length,
             do_sample=do_sample,
@@ -102,16 +101,6 @@ class FlaxFlanT5(
             else self._llm_config.repetition_penalty,
             **kwargs,
         )
-
-
-class FlaxFlanT5WithTokenizer(FlaxFlanT5, start_model_name="flan-t5"):
-    default_model: str = "google/flan-t5-large"
-
-    ATTACH_TOKENIZER = True
-
-    def _generate(self, prompt: str, **kwargs: t.Any) -> list[str]:
-        input_ids = self.tokenizer(prompt, return_tensors="np")["input_ids"]
-        outputs = super()._generate(input_ids, **kwargs)
-        return self.tokenizer.batch_decode(
-            outputs.sequences, skip_special_tokens=True, clean_up_tokenization_spaces=True
-        )
+        return self.tokenizer.batch_decode(
+            outputs.sequences, skip_special_tokens=True, clean_up_tokenization_spaces=True
+        )
@@ -60,13 +60,11 @@ def import_model(
     return bentoml.transformers.save_model(str(tag), model, custom_objects={"tokenizer": tokenizer})
 
 
-class TFFlanT5(
-    LLMRunnable[transformers.TFT5ForConditionalGeneration, transformers.T5TokenizerFast], start_model_name="flan-t5"
-):
+class TFFlanT5(LLMRunnable, start_model_name="flan-t5"):
     default_model: str = "google/flan-t5-large"
     config_class = FlanT5Config
 
-    ATTACH_TOKENIZER = False
+    ATTACH_TOKENIZER = True
 
     _llm_config: FlanT5Config
@@ -80,7 +78,7 @@ class TFFlanT5(
 
     def _generate(
         self,
-        input_ids: tf.Tensor,
+        prompt: str,
         max_length: int | None = None,
         do_sample: bool = True,
         temperature: float | None = None,
@@ -89,7 +87,8 @@ class TFFlanT5(
         repetition_penalty: float | None = None,
         **kwargs: t.Any,
     ) -> tf.Tensor:
-        return self.model.generate(
+        input_ids: tf.Tensor = self.tokenizer(prompt, return_tensors="tf").input_ids
+        outputs = self.model.generate(
             input_ids,
             max_length=max_length if max_length is not None else self._llm_config.max_length,
             do_sample=do_sample,
@@ -101,14 +100,4 @@ class TFFlanT5(
             else self._llm_config.repetition_penalty,
             **kwargs,
         )
-
-
-class TFFlanT5WithTokenizer(TFFlanT5, start_model_name="flan-t5"):
-    default_model: str = "google/flan-t5-large"
-
-    ATTACH_TOKENIZER = True
-
-    def _generate(self, prompt: str, **kwargs: t.Any) -> list[str]:
-        input_ids: tf.Tensor = self.tokenizer(prompt, return_tensors="tf").input_ids
-        outputs = super()._generate(input_ids, **kwargs)
-        return self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
+        return self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
@@ -31,9 +31,8 @@ else:
     raise ValueError(f"Invalid framework {framework}")
 
 model_runner = klass.create_runner("flan-t5")
-tokenizer_runner = openllm.AutoTokenizer.create_runner("flan-t5")
 
-svc = bentoml.Service(name=openllm.utils.generate_service_name(model_runner), runners=[model_runner, tokenizer_runner])
+svc = bentoml.Service(name=openllm.utils.generate_service_name(model_runner), runners=[model_runner])
 
 
 @svc.api(
@@ -43,17 +42,7 @@ svc = bentoml.Service(name=openllm.utils.generate_service_name(model_runner), ru
 async def generate(qa: openllm.schema.GenerateInput) -> openllm.schema.GenerateOutput:
     """Returns the generated text from given prompts."""
     llm_config = model_runner.llm_config.with_options(**qa.llm_config).dict()
-
-    return_tensors = "np" if framework == "flax" else framework
-    input_tensor = await tokenizer_runner.async_run(qa.prompt, return_tensors=return_tensors)
-    if framework == "flax":
-        outputs = await model_runner.generate.async_run(input_tensor["input_ids"], **llm_config)
-        responses = await tokenizer_runner.batch_decode.async_run(
-            outputs.sequences, skip_special_tokens=True, clean_up_tokenization_spaces=True
-        )
-    else:
-        outputs = await model_runner.generate.async_run(input_tensor.input_ids, **llm_config)
-        responses = await tokenizer_runner.batch_decode.async_run(outputs, skip_special_tokens=True)
+    responses = await model_runner.generate.async_run(qa.prompt, **llm_config)
     return openllm.schema.GenerateOutput(responses=responses, configuration=llm_config)
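The generate endpoint therefore reduces to a single runner call. A hypothetical request against the simplified service (port and endpoint name are assumptions based on BentoML defaults and the API name above):

    # curl -X POST http://localhost:3000/generate \
    #      -H 'Content-Type: application/json' \
    #      -d '{"prompt": "Translate to German: hello", "llm_config": {}}'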
src/openllm/models/gpt_neox/__init__.py (new file, 1 line)
@@ -0,0 +1 @@
+raise NotImplementedError("This module is not implemented yet.")
src/openllm/models/gptj/__init__.py (new file, 1 line)
@@ -0,0 +1 @@
+raise NotImplementedError("This module is not implemented yet.")
src/openllm/models/llama/__init__.py (new file, 1 line)
@@ -0,0 +1 @@
+raise NotImplementedError("This module is not implemented yet.")
src/openllm/models/roberta/__init__.py (new file, 1 line)
@@ -0,0 +1 @@
+raise NotImplementedError("This module is not implemented yet.")
src/openllm/models/stablelm/__init__.py (new file, 1 line)
@@ -0,0 +1 @@
+raise NotImplementedError("This module is not implemented yet.")
@@ -44,9 +44,6 @@ else:
 
 logger = logging.getLogger(__name__)
 
-M = t.TypeVar("M")
-T = t.TypeVar("T")
-
 
 def assign_start_model_name(start_model_name: str) -> t.Callable[..., t.Any]:
     def wrapper(fn: t.Callable[..., t.Any]) -> t.Callable[..., t.Any]:
@@ -156,7 +153,7 @@ class BaseLLMRunnable(bentoml.Runnable, ABC):
 
 
 # TODO: Add support for model validation
-class LLMRunnable(BaseLLMRunnable, t.Generic[M, T]):
+class LLMRunnable(BaseLLMRunnable):
     # The section below defines a loose contract with langchain's LLM interface.
     @property
     def _llm_type(self) -> str:
@@ -173,8 +170,8 @@ class LLMRunnable(BaseLLMRunnable, t.Generic[M, T]):
 
     # XXX: INTERNAL
     _module: LLMModuleType
-    _model: M | None = None
-    _tokenizer: T | None = None
+    _model: t.Any | None = None
+    _tokenizer: t.Any | None = None
 
     def __setattr__(self, attr_name: str, value: t.Any) -> None:
         if attr_name in ("ATTACH_TOKENIZER",):
@@ -249,18 +246,18 @@ class LLMRunnable(BaseLLMRunnable, t.Generic[M, T]):
         return super().__getattribute__(item)
 
     @classmethod
-    def dummy_object(cls) -> LLMRunnable[M, T]:
+    def dummy_object(cls) -> LLMRunnable:
         return cls(_dummy=True, _internal=True)
 
     @property
-    def model(self) -> M:
+    def model(self) -> t.Any:
         # NOTE: should we have support for nested runner here?
         if self._model is None:
            self._model = self._bentomodel.load_model()
        return self._model
 
     @property
-    def tokenizer(self) -> T:
+    def tokenizer(self) -> t.Any:
         # This is the runner generated from the bento model. This can
         # then be used for implementation of _generate.
         if self._tokenizer is None:
@@ -368,7 +365,7 @@ class LLMRunner(bentoml.Runner):
 
     def __init__(
         self,
-        runnable_class: type[LLMRunnable[t.Any, t.Any]],
+        runnable_class: type[LLMRunnable],
         llm_config: LLMConfig,
         **kwargs: t.Any,
     ):
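For context, the generic parameters removed here let subclasses pin concrete model and tokenizer types, e.g. LLMRunnable[transformers.T5ForConditionalGeneration, transformers.T5TokenizerFast] in the flan_t5 hunks; after this commit those annotations fall back to t.Any. A minimal sketch of the pattern being dropped (not OpenLLM's exact code):

    import typing as t

    M = t.TypeVar("M")  # framework-specific model type
    T = t.TypeVar("T")  # matching tokenizer type

    class TypedRunnable(t.Generic[M, T]):
        _model: M | None = None

        @property
        def model(self) -> M:
            assert self._model is not None  # populated lazily elsewhere
            return self._model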
@@ -48,14 +48,10 @@ class LLMModuleType(LazyLoader):
     ) -> bentoml.Model:
         ...
 
-class LLMConfigImpl(LLMConfig):
+class LLMConfigImpl(LLMConfig, model_name="dummy"):
     ...
 
-class LLMRunnableImpl(LLMRunnable[t.Any, t.Any], start_model_name="dummy"):
-    ...
-
-@staticmethod
-def RunnableNameTokenizer(model_name: str | None = None, embedded: bool = True, **kwargs: t.Any) -> TokenizerRunner:
+class LLMRunnableImpl(LLMRunnable, start_model_name="dummy"):
     ...