infra: add structure and clean up tokenizer separation

Since tokenizers are relatively lightweight, all default LLMs will bundle the
tokenizer with themselves.

We may move the tokenizer into its own runner in the future.
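
The service thus goes from two runners down to one: previously it carried a
separate tokenizer runner and shuttled tensors between the two. A rough sketch
of the new call path (AutoLLM.create_runner and the single-runner service come
from this diff; the prompt string and the init_local() call, a BentoML helper
for running a runner in-process, are illustrative only):

import openllm

# One runner now owns both the model and its tokenizer: the runnable sets
# ATTACH_TOKENIZER = True and _generate() takes the raw prompt string.
model_runner = openllm.AutoLLM.create_runner("flan-t5")
model_runner.init_local()  # in-process init for demo only; a bentoml.Service handles this

# Tokenization and decoding now happen inside the runnable, so callers get
# decoded strings back from a single runner call:
responses = model_runner.generate.run("Translate to German: Hello, world!")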

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
Aaron
2023-05-05 11:57:39 -07:00
parent 426a61713f
commit 2a53faee9c
21 changed files with 57 additions and 312 deletions

View File

@@ -34,7 +34,7 @@ _import_structure = {
"cli": [],
"configuration_utils": ["LLMConfig"],
"exceptions": [],
"runner_utils": ["LLMRunner", "LLMRunnable", "generate_tokenizer_runner"],
"runner_utils": ["LLMRunner", "LLMRunnable"],
"schema": ["PromptTemplate"],
"server_utils": ["start", "start_grpc"],
"types": [],
@@ -42,13 +42,7 @@ _import_structure = {
"models": [],
"client": [],
# NOTE: models
"models.auto": [
"AutoConfig",
"CONFIG_MAPPING",
"AutoTokenizer",
"TOKENIZER_MAPPING",
"TOKENIZER_MAPPING_NAMES",
],
"models.auto": ["AutoConfig", "CONFIG_MAPPING"],
"models.flan_t5": ["FlanT5Config", "START_FLAN_T5_COMMAND_DOCSTRING"],
}
@@ -58,17 +52,8 @@ try:
except MissingDependencyError:
pass
else:
_import_structure["models.flan_t5"].extend(["FlanT5", "FlanT5WithTokenizer", "FlanT5Tokenizer"])
_import_structure["models.auto"].extend(
[
"AutoLLM",
"AutoLLMWithTokenizer",
"MODEL_MAPPING_NAMES",
"MODEL_WITH_TOKENIZER_MAPPING_NAMES",
"MODEL_MAPPING",
"MODEL_WITH_TOKENIZER_MAPPING",
]
)
_import_structure["models.flan_t5"].extend(["FlanT5"])
_import_structure["models.auto"].extend(["AutoLLM", "MODEL_MAPPING_NAMES", "MODEL_MAPPING"])
try:
if not imports.is_flax_available():
@@ -76,17 +61,8 @@ try:
except MissingDependencyError:
pass
else:
_import_structure["models.flan_t5"].extend(["FlaxFlanT5", "FlaxFlanT5WithTokenizer"])
_import_structure["models.auto"].extend(
[
"AutoFlaxLLM",
"AutoFlaxLLMWithTokenizer",
"MODEL_FLAX_MAPPING_NAMES",
"MODEL_FLAX_WITH_TOKENIZER_MAPPING_NAMES",
"MODEL_FLAX_MAPPING",
"MODEL_FLAX_WITH_TOKENIZER_MAPPING",
]
)
_import_structure["models.flan_t5"].extend(["FlaxFlanT5"])
_import_structure["models.auto"].extend(["AutoFlaxLLM", "MODEL_FLAX_MAPPING_NAMES", "MODEL_FLAX_MAPPING"])
try:
if not imports.is_tf_available():
@@ -94,17 +70,8 @@ try:
except MissingDependencyError:
pass
else:
_import_structure["models.flan_t5"].extend(["TFFlanT5", "TFFlanT5WithTokenizer"])
_import_structure["models.auto"].extend(
[
"AutoTFLLM",
"AutoTFLLMWithTokenizer",
"MODEL_TF_MAPPING_NAMES",
"MODEL_TF_WITH_TOKENIZER_MAPPING_NAMES",
"MODEL_TF_MAPPING",
"MODEL_TF_WITH_TOKENIZER_MAPPING",
]
)
_import_structure["models.flan_t5"].extend(["TFFlanT5"])
_import_structure["models.auto"].extend(["AutoTFLLM", "MODEL_TF_MAPPING_NAMES", "MODEL_TF_MAPPING"])
# declaration for OpenLLM-related modules
@@ -123,17 +90,12 @@ if t.TYPE_CHECKING:
# Specific types import
from .configuration_utils import LLMConfig as LLMConfig
from .models.auto import CONFIG_MAPPING as CONFIG_MAPPING
from .models.auto import TOKENIZER_MAPPING as TOKENIZER_MAPPING
from .models.auto import TOKENIZER_MAPPING_NAMES as TOKENIZER_MAPPING_NAMES
from .models.auto import AutoConfig as AutoConfig
from .models.auto import AutoTokenizer as AutoTokenizer
from .models.flan_t5 import \
START_FLAN_T5_COMMAND_DOCSTRING as START_FLAN_T5_COMMAND_DOCSTRING
from .models.flan_t5 import FlanT5Config as FlanT5Config
from .runner_utils import LLMRunnable as LLMRunnable
from .runner_utils import LLMRunner as LLMRunner
from .runner_utils import \
generate_tokenizer_runner as generate_tokenizer_runner
from .schema import PromptTemplate as PromptTemplate
from .server_utils import start as start
from .server_utils import start_grpc as start_grpc
@@ -146,16 +108,8 @@ if t.TYPE_CHECKING:
else:
from .models.auto import MODEL_MAPPING as MODEL_MAPPING
from .models.auto import MODEL_MAPPING_NAMES as MODEL_MAPPING_NAMES
from .models.auto import \
MODEL_WITH_TOKENIZER_MAPPING as MODEL_WITH_TOKENIZER_MAPPING
from .models.auto import \
MODEL_WITH_TOKENIZER_MAPPING_NAMES as \
MODEL_WITH_TOKENIZER_MAPPING_NAMES
from .models.auto import AutoLLM as AutoLLM
from .models.auto import AutoLLMWithTokenizer as AutoLLMWithTokenizer
from .models.flan_t5 import FlanT5 as FlanT5
from .models.flan_t5 import FlanT5Tokenizer as FlanT5Tokenizer
from .models.flan_t5 import FlanT5WithTokenizer as FlanT5WithTokenizer
try:
if not imports.is_flax_available():
@@ -166,18 +120,8 @@ if t.TYPE_CHECKING:
from .models.auto import MODEL_FLAX_MAPPING as MODEL_FLAX_MAPPING
from .models.auto import \
MODEL_FLAX_MAPPING_NAMES as MODEL_FLAX_MAPPING_NAMES
from .models.auto import \
MODEL_FLAX_WITH_TOKENIZER_MAPPING as \
MODEL_FLAX_WITH_TOKENIZER_MAPPING
from .models.auto import \
MODEL_FLAX_WITH_TOKENIZER_MAPPING_NAMES as \
MODEL_FLAX_WITH_TOKENIZER_MAPPING_NAMES
from .models.auto import AutoFlaxLLM as AutoFlaxLLM
from .models.auto import \
AutoFlaxLLMWithTokenizer as AutoFlaxLLMWithTokenizer
from .models.flan_t5 import FlaxFlanT5 as FlaxFlanT5
from .models.flan_t5 import \
FlaxFlanT5WithTokenizer as FlaxFlanT5WithTokenizer
try:
if not imports.is_tf_available():
@@ -188,17 +132,8 @@ if t.TYPE_CHECKING:
from .models.auto import MODEL_TF_MAPPING as MODEL_TF_MAPPING
from .models.auto import \
MODEL_TF_MAPPING_NAMES as MODEL_TF_MAPPING_NAMES
from .models.auto import \
MODEL_TF_WITH_TOKENIZER_MAPPING as MODEL_TF_WITH_TOKENIZER_MAPPING
from .models.auto import \
MODEL_TF_WITH_TOKENIZER_MAPPING_NAMES as \
MODEL_TF_WITH_TOKENIZER_MAPPING_NAMES
from .models.auto import AutoTFLLM as AutoTFLLM
from .models.auto import \
AutoTFLLMWithTokenizer as AutoTFLLMWithTokenizer
from .models.flan_t5 import TFFlanT5 as TFFlanT5
from .models.flan_t5 import \
TFFlanT5WithTokenizer as TFFlanT5WithTokenizer
else:
import sys

View File

@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""This module is derived from HuggingFace's AutoConfig, Tokenizer, AutoModel, etc."""
"""This module is derived from HuggingFace's AutoConfig, AutoModel, etc."""
from __future__ import annotations
@@ -25,7 +25,6 @@ from ...utils import import_utils_shim as imports
_import_structure = {
"configuration_auto": ["AutoConfig", "CONFIG_MAPPING", "CONFIG_MAPPING_NAMES"],
"tokenization_auto": ["AutoTokenizer", "TOKENIZER_MAPPING", "TOKENIZER_MAPPING_NAMES"],
}
try:
@@ -34,14 +33,7 @@ try:
except openllm.exceptions.MissingDependencyError:
pass
else:
_import_structure["modeling_auto"] = [
"AutoLLM",
"AutoLLMWithTokenizer",
"MODEL_MAPPING_NAMES",
"MODEL_WITH_TOKENIZER_MAPPING_NAMES",
"MODEL_MAPPING",
"MODEL_WITH_TOKENIZER_MAPPING",
]
_import_structure["modeling_auto"] = ["AutoLLM", "MODEL_MAPPING_NAMES", "MODEL_MAPPING"]
try:
if not imports.is_flax_available():
@@ -49,14 +41,7 @@ try:
except openllm.exceptions.MissingDependencyError:
pass
else:
_import_structure["modeling_flax_auto"] = [
"AutoFlaxLLM",
"AutoFlaxLLMWithTokenizer",
"MODEL_FLAX_MAPPING_NAMES",
"MODEL_FLAX_WITH_TOKENIZER_MAPPING_NAMES",
"MODEL_FLAX_MAPPING",
"MODEL_FLAX_WITH_TOKENIZER_MAPPING",
]
_import_structure["modeling_flax_auto"] = ["AutoFlaxLLM", "MODEL_FLAX_MAPPING_NAMES", "MODEL_FLAX_MAPPING"]
try:
if not imports.is_tf_available():
@@ -64,24 +49,13 @@ try:
except openllm.exceptions.MissingDependencyError:
pass
else:
_import_structure["modeling_tf_auto"] = [
"AutoTFLLM",
"AutoTFLLMWithTokenizer",
"MODEL_TF_MAPPING_NAMES",
"MODEL_TF_WITH_TOKENIZER_MAPPING_NAMES",
"MODEL_TF_MAPPING",
"MODEL_TF_WITH_TOKENIZER_MAPPING",
]
_import_structure["modeling_tf_auto"] = ["AutoTFLLM", "MODEL_TF_MAPPING_NAMES", "MODEL_TF_MAPPING"]
if t.TYPE_CHECKING:
from .configuration_auto import CONFIG_MAPPING as CONFIG_MAPPING
from .configuration_auto import \
CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES
from .configuration_auto import AutoConfig as AutoConfig
from .tokenization_auto import TOKENIZER_MAPPING as TOKENIZER_MAPPING
from .tokenization_auto import \
TOKENIZER_MAPPING_NAMES as TOKENIZER_MAPPING_NAMES
from .tokenization_auto import AutoTokenizer as AutoTokenizer
try:
if not imports.is_torch_available():
@@ -91,13 +65,7 @@ if t.TYPE_CHECKING:
else:
from .modeling_auto import MODEL_MAPPING as MODEL_MAPPING
from .modeling_auto import MODEL_MAPPING_NAMES as MODEL_MAPPING_NAMES
from .modeling_auto import \
MODEL_WITH_TOKENIZER_MAPPING as MODEL_WITH_TOKENIZER_MAPPING
from .modeling_auto import \
MODEL_WITH_TOKENIZER_MAPPING_NAMES as \
MODEL_WITH_TOKENIZER_MAPPING_NAMES
from .modeling_auto import AutoLLM as AutoLLM
from .modeling_auto import AutoLLMWithTokenizer as AutoLLMWithTokenizer
try:
if not imports.is_flax_available():
@@ -109,15 +77,7 @@ if t.TYPE_CHECKING:
MODEL_FLAX_MAPPING as MODEL_FLAX_MAPPING
from .modeling_flax_auto import \
MODEL_FLAX_MAPPING_NAMES as MODEL_FLAX_MAPPING_NAMES
from .modeling_flax_auto import \
MODEL_FLAX_WITH_TOKENIZER_MAPPING as \
MODEL_FLAX_WITH_TOKENIZER_MAPPING
from .modeling_flax_auto import \
MODEL_FLAX_WITH_TOKENIZER_MAPPING_NAMES as \
MODEL_FLAX_WITH_TOKENIZER_MAPPING_NAMES
from .modeling_flax_auto import AutoFlaxLLM as AutoFlaxLLM
from .modeling_flax_auto import \
AutoFlaxLLMWithTokenizer as AutoFlaxLLMWithTokenizer
try:
if not imports.is_tf_available():
@@ -128,14 +88,7 @@ if t.TYPE_CHECKING:
from .modeling_tf_auto import MODEL_TF_MAPPING as MODEL_TF_MAPPING
from .modeling_tf_auto import \
MODEL_TF_MAPPING_NAMES as MODEL_TF_MAPPING_NAMES
from .modeling_tf_auto import \
MODEL_TF_WITH_TOKENIZER_MAPPING as MODEL_TF_WITH_TOKENIZER_MAPPING
from .modeling_tf_auto import \
MODEL_TF_WITH_TOKENIZER_MAPPING_NAMES as \
MODEL_TF_WITH_TOKENIZER_MAPPING_NAMES
from .modeling_tf_auto import AutoTFLLM as AutoTFLLM
from .modeling_tf_auto import \
AutoTFLLMWithTokenizer as AutoTFLLMWithTokenizer
else:
import sys

View File

@@ -23,7 +23,8 @@ import openllm
from .configuration_auto import AutoConfig
def _get_runnable_class(config: openllm.LLMConfig, runnable_mapping: _LazyAutoMapping) -> type[openllm.LLMRunnable[t.Any, t.Any]]:
def _get_runnable_class(config: openllm.LLMConfig, runnable_mapping: _LazyAutoMapping) -> type[openllm.LLMRunnable]:
supported_runnables = runnable_mapping[type(config)]
if not isinstance(supported_runnables, (list, tuple)):
return supported_runnables
@@ -72,7 +73,7 @@ class _BaseAutoRunnerFactory:
)
@classmethod
def register(cls, config_class: type[openllm.LLMConfig], runnable_class: type[openllm.LLMRunnable[t.Any, t.Any]]):
def register(cls, config_class: type[openllm.LLMConfig], runnable_class: type[openllm.LLMRunnable]):
"""
Register a new model for this class.

View File

@@ -21,16 +21,8 @@ from .factory import _BaseAutoRunnerFactory, _LazyAutoMapping
MODEL_MAPPING_NAMES = OrderedDict([("flan_t5", "FlanT5")])
MODEL_WITH_TOKENIZER_MAPPING_NAMES = OrderedDict([("flan_t5", "FlanT5WithTokenizer")])
MODEL_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_MAPPING_NAMES)
MODEL_WITH_TOKENIZER_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_WITH_TOKENIZER_MAPPING_NAMES)
class AutoLLM(_BaseAutoRunnerFactory):
_model_mapping = MODEL_MAPPING
class AutoLLMWithTokenizer(_BaseAutoRunnerFactory):
_model_mapping = MODEL_WITH_TOKENIZER_MAPPING

View File

@@ -21,16 +21,8 @@ from .factory import _BaseAutoRunnerFactory, _LazyAutoMapping
MODEL_FLAX_MAPPING_NAMES = OrderedDict([("flan_t5", "FlaxFlanT5")])
MODEL_FLAX_WITH_TOKENIZER_MAPPING_NAMES = OrderedDict([("flan_t5", "FlaxFlanT5WithTokenizer")])
MODEL_FLAX_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FLAX_MAPPING_NAMES)
MODEL_FLAX_WITH_TOKENIZER_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FLAX_WITH_TOKENIZER_MAPPING_NAMES)
class AutoFlaxLLM(_BaseAutoRunnerFactory):
_model_mapping = MODEL_FLAX_MAPPING
class AutoFlaxLLMWithTokenizer(_BaseAutoRunnerFactory):
_model_mapping = MODEL_FLAX_WITH_TOKENIZER_MAPPING

View File

@@ -21,16 +21,8 @@ from .factory import _BaseAutoRunnerFactory, _LazyAutoMapping
MODEL_TF_MAPPING_NAMES = OrderedDict([("flan_t5", "TFFlanT5")])
MODEL_TF_WITH_TOKENIZER_MAPPING_NAMES = OrderedDict([("flan_t5", "FlaxFlanT5WithTokenizer")])
MODEL_TF_MAPPING = _LazyAutoMapping( CONFIG_MAPPING_NAMES, MODEL_TF_MAPPING_NAMES)
MODEL_TF_WITH_TOKENIZER_MAPPING = _LazyAutoMapping( CONFIG_MAPPING_NAMES, MODEL_TF_WITH_TOKENIZER_MAPPING_NAMES)
MODEL_TF_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_TF_MAPPING_NAMES)
class AutoTFLLM(_BaseAutoRunnerFactory):
_model_mapping = MODEL_TF_MAPPING
class AutoTFLLMWithTokenizer(_BaseAutoRunnerFactory):
_model_mapping = MODEL_TF_WITH_TOKENIZER_MAPPING

View File

@@ -1,46 +0,0 @@
# Copyright 2023 BentoML Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
import typing as t
from collections import OrderedDict
import openllm
from .configuration_auto import _LazyConfigMapping
TOKENIZER_MAPPING_NAMES = OrderedDict([("flan_t5", "FlanT5Tokenizer")])
TOKENIZER_MAPPING = _LazyConfigMapping(TOKENIZER_MAPPING_NAMES)
class AutoTokenizer:
def __init__(self):
raise EnvironmentError(
"This class should not be initialized directly. Instead use 'Tokenizer.create_runner' instead"
)
@classmethod
def create_runner(cls, model_name: str, pretrained_or_path: str | None = None, **kwargs: t.Any):
model_name = openllm.utils.kebab_to_snake_case(model_name)
if model_name in TOKENIZER_MAPPING:
tokenizer_class = TOKENIZER_MAPPING[model_name]
if pretrained_or_path is None:
pretrained_or_path = openllm.utils.get_pretrained_env(tokenizer_class.start_model_name)
return tokenizer_class(pretrained_or_path, **kwargs)
raise ValueError(
f"Unrecognized model {model_name} to build an Tokenizer.\n"
f"Model type should be one of {', '.join(TOKENIZER_MAPPING.keys())}."
)

View File

@@ -0,0 +1 @@
raise NotImplementedError("This module is not implemented yet.")

View File

@@ -0,0 +1 @@
raise NotImplementedError("This module is not implemented yet.")

View File

@@ -21,7 +21,7 @@ from openllm.utils import import_utils_shim as imports
_import_structure = {
"configuration_flan_t5": ["FlanT5Config", "START_FLAN_T5_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"],
"service_flan_t5": ["svc", "model_runner", "tokenizer_runner", "generate"],
"service_flan_t5": ["svc", "model_runner", "generate"],
}
try:
@@ -30,7 +30,7 @@ try:
except openllm.exceptions.MissingDependencyError:
pass
else:
_import_structure["modeling_flan_t5"] = ["FlanT5", "FlanT5WithTokenizer", "FlanT5Tokenizer"]
_import_structure["modeling_flan_t5"] = ["FlanT5"]
try:
if not imports.is_flax_available():
@@ -38,7 +38,7 @@ try:
except openllm.exceptions.MissingDependencyError:
pass
else:
_import_structure["modeling_flax_flan_t5"] = ["FlaxFlanT5", "FlaxFlanT5WithTokenizer"]
_import_structure["modeling_flax_flan_t5"] = ["FlaxFlanT5"]
try:
if not imports.is_tf_available():
@@ -46,7 +46,7 @@ try:
except openllm.exceptions.MissingDependencyError:
pass
else:
_import_structure["modeling_flax_flan_t5"] = ["TFFlanT5", "TFFlanT5WithTokenizer"]
_import_structure["modeling_flax_flan_t5"] = ["TFFlanT5"]
if t.TYPE_CHECKING:
@@ -55,6 +55,7 @@ if t.TYPE_CHECKING:
from .configuration_flan_t5 import \
START_FLAN_T5_COMMAND_DOCSTRING as START_FLAN_T5_COMMAND_DOCSTRING
from .configuration_flan_t5 import FlanT5Config as FlanT5Config
from .service_flan_t5 import svc as svc
try:
if not imports.is_torch_available():
@@ -63,9 +64,6 @@ if t.TYPE_CHECKING:
pass
else:
from .modeling_flan_t5 import FlanT5 as FlanT5
from .modeling_flan_t5 import FlanT5Tokenizer as FlanT5Tokenizer
from .modeling_flan_t5 import \
FlanT5WithTokenizer as FlanT5WithTokenizer
try:
if not imports.is_flax_available():
@@ -74,8 +72,6 @@ if t.TYPE_CHECKING:
pass
else:
from .modeling_flax_flan_t5 import FlaxFlanT5 as FlaxFlanT5
from .modeling_flax_flan_t5 import \
FlaxFlanT5WithTokenizer as FlaxFlanT5WithTokenizer
try:
if not imports.is_tf_available():
@@ -84,8 +80,6 @@ if t.TYPE_CHECKING:
pass
else:
from .modeling_tf_flan_t5 import TFFlanT5 as TFFlanT5
from .modeling_tf_flan_t5 import \
TFFlanT5WithTokenizer as TFFlanT5WithTokenizer
else:
import sys

View File

@@ -17,8 +17,7 @@ import typing as t
import openllm
from ...runner_utils import (LLMRunnable, assign_start_model_name,
generate_tokenizer_runner)
from ...runner_utils import LLMRunnable
from .configuration_flan_t5 import FlanT5Config
if t.TYPE_CHECKING:
@@ -62,37 +61,11 @@ def import_model(
return bentoml.transformers.save_model(str(tag), model, custom_objects={"tokenizer": tokenizer})
def _FlanT5Tokenizer(
pretrained_or_path: str | None = None, embedded: bool = True, **kwargs: t.Any
) -> openllm.types.TokenizerRunner:
"""Get the runner for the tokenizer.
Args:
model_name: The name of the FLAN-T5 model to import.
embedded: Whether to use the embedded runner or not.
**kwargs: Additional kwargs to pass to the ``transformers.AutoTokenizer`` constructors.
Returns:
The runner for the tokenizer.
"""
if pretrained_or_path is None:
pretrained_or_path = FlanT5.default_model
return generate_tokenizer_runner(
import_model(pretrained_or_path, **kwargs).custom_objects["tokenizer"], embedded=embedded
)
FlanT5Tokenizer = assign_start_model_name("flan-t5")(_FlanT5Tokenizer)
class FlanT5(
LLMRunnable[transformers.T5ForConditionalGeneration, transformers.T5TokenizerFast], start_model_name="flan-t5"
):
class FlanT5(LLMRunnable, start_model_name="flan-t5"):
default_model: str = "google/flan-t5-large"
config_class = FlanT5Config
ATTACH_TOKENIZER = False
ATTACH_TOKENIZER = True
_llm_config: FlanT5Config
@@ -106,9 +79,10 @@ class FlanT5(
"google/flan-t5-xxl",
]
@torch.inference_mode()
def _generate(
self,
input_ids: torch.Tensor,
prompt: str,
max_length: int | None = None,
do_sample: bool = True,
temperature: float | None = None,
@@ -117,7 +91,9 @@ class FlanT5(
repetition_penalty: float | None = None,
**kwargs: t.Any,
) -> torch.Tensor:
return self.model.generate(
input_ids: torch.Tensor = self.tokenizer(prompt, return_tensors="pt").input_ids
input_ids = input_ids.to(self.device)
outputs = self.model.generate(
input_ids,
max_length=max_length if max_length is not None else self._llm_config.max_length,
do_sample=do_sample,
@@ -129,15 +105,4 @@ class FlanT5(
else self._llm_config.repetition_penalty,
**kwargs,
)
class FlanT5WithTokenizer(FlanT5, start_model_name="flan-t5"):
default_model: str = "google/flan-t5-large"
ATTACH_TOKENIZER = True
def _generate(self, prompt: str, **kwargs: t.Any) -> list[str]:
input_ids: torch.Tensor = self.tokenizer(prompt, return_tensors="pt").input_ids
input_ids = input_ids.to(self.device)
outputs = super()._generate(input_ids, **kwargs)
return self.tokenizer.batch_decode(outputs, skip_special_tokens=True)

View File

@@ -61,13 +61,11 @@ def import_model(
return bentoml.transformers.save_model(str(tag), model, custom_objects={"tokenizer": tokenizer})
class FlaxFlanT5(
LLMRunnable[transformers.FlaxT5ForConditionalGeneration, transformers.T5TokenizerFast], start_model_name="flan-t5"
):
class FlaxFlanT5(LLMRunnable, start_model_name="flan-t5"):
default_model: str = "google/flan-t5-large"
config_class = FlanT5Config
ATTACH_TOKENIZER = False
ATTACH_TOKENIZER = True
_llm_config: FlanT5Config
@@ -81,7 +79,7 @@ class FlaxFlanT5(
def _generate(
self,
input_ids: jnp.ndarray,
prompt: str,
max_length: int | None = None,
do_sample: bool = True,
temperature: float | None = None,
@@ -90,7 +88,8 @@ class FlaxFlanT5(
repetition_penalty: float | None = None,
**kwargs: t.Any,
) -> jnp.ndarray:
return self.model.generate(
input_ids = self.tokenizer(prompt, return_tensors="np")["input_ids"]
outputs = self.model.generate(
input_ids,
max_length=max_length if max_length is not None else self._llm_config.max_length,
do_sample=do_sample,
@@ -102,16 +101,6 @@ class FlaxFlanT5(
else self._llm_config.repetition_penalty,
**kwargs,
)
class FlaxFlanT5WithTokenizer(FlaxFlanT5, start_model_name="flan-t5"):
default_model: str = "google/flan-t5-large"
ATTACH_TOKENIZER = True
def _generate(self, prompt: str, **kwargs: t.Any) -> list[str]:
input_ids = self.tokenizer(prompt, return_tensors="np")["input_ids"]
outputs = super()._generate(input_ids, **kwargs)
return self.tokenizer.batch_decode(
outputs.sequences, skip_special_tokens=True, clean_up_tokenization_spaces=True
)

View File

@@ -60,13 +60,11 @@ def import_model(
return bentoml.transformers.save_model(str(tag), model, custom_objects={"tokenizer": tokenizer})
class TFFlanT5(
LLMRunnable[transformers.TFT5ForConditionalGeneration, transformers.T5TokenizerFast], start_model_name="flan-t5"
):
class TFFlanT5(LLMRunnable, start_model_name="flan-t5"):
default_model: str = "google/flan-t5-large"
config_class = FlanT5Config
ATTACH_TOKENIZER = False
ATTACH_TOKENIZER = True
_llm_config: FlanT5Config
@@ -80,7 +78,7 @@ class TFFlanT5(
def _generate(
self,
input_ids: tf.Tensor,
prompt: str,
max_length: int | None = None,
do_sample: bool = True,
temperature: float | None = None,
@@ -89,7 +87,8 @@ class TFFlanT5(
repetition_penalty: float | None = None,
**kwargs: t.Any,
) -> tf.Tensor:
return self.model.generate(
input_ids: tf.Tensor = self.tokenizer(prompt, return_tensors="tf").input_ids
outputs = self.model.generate(
input_ids,
max_length=max_length if max_length is not None else self._llm_config.max_length,
do_sample=do_sample,
@@ -101,14 +100,4 @@ class TFFlanT5(
else self._llm_config.repetition_penalty,
**kwargs,
)
class TFFlanT5WithTokenizer(TFFlanT5, start_model_name="flan-t5"):
default_model: str = "google/flan-t5-large"
ATTACH_TOKENIZER = True
def _generate(self, prompt: str, **kwargs: t.Any) -> list[str]:
input_ids: tf.Tensor = self.tokenizer(prompt, return_tensors="tf").input_ids
outputs = super()._generate(input_ids, **kwargs)
return self.tokenizer.batch_decode(outputs, skip_special_tokens=True)

View File

@@ -31,9 +31,8 @@ else:
raise ValueError(f"Invalid framework {framework}")
model_runner = klass.create_runner("flan-t5")
tokenizer_runner = openllm.AutoTokenizer.create_runner("flan-t5")
svc = bentoml.Service(name=openllm.utils.generate_service_name(model_runner), runners=[model_runner, tokenizer_runner])
svc = bentoml.Service(name=openllm.utils.generate_service_name(model_runner), runners=[model_runner])
@svc.api(
@@ -43,17 +42,7 @@ svc = bentoml.Service(name=openllm.utils.generate_service_name(model_runner), ru
async def generate(qa: openllm.schema.GenerateInput) -> openllm.schema.GenerateOutput:
"""Returns the generated text from given prompts."""
llm_config = model_runner.llm_config.with_options(**qa.llm_config).dict()
return_tensors = "np" if framework == "flax" else framework
input_tensor = await tokenizer_runner.async_run(qa.prompt, return_tensors=return_tensors)
if framework == "flax":
outputs = await model_runner.generate.async_run(input_tensor["input_ids"], **llm_config)
responses = await tokenizer_runner.batch_decode.async_run(
outputs.sequences, skip_special_tokens=True, clean_up_tokenization_spaces=True
)
else:
outputs = await model_runner.generate.async_run(input_tensor.input_ids, **llm_config)
responses = await tokenizer_runner.batch_decode.async_run(outputs, skip_special_tokens=True)
responses = await model_runner.generate.async_run(qa.prompt, **llm_config)
return openllm.schema.GenerateOutput(responses=responses, configuration=llm_config)

View File

@@ -0,0 +1 @@
raise NotImplementedError("This module is not implemented yet.")

View File

@@ -0,0 +1 @@
raise NotImplementedError("This module is not implemented yet.")

View File

@@ -0,0 +1 @@
raise NotImplementedError("This module is not implemented yet.")

View File

@@ -0,0 +1 @@
raise NotImplementedError("This module is not implemented yet.")

View File

@@ -0,0 +1 @@
raise NotImplementedError("This module is not implemented yet.")

View File

@@ -44,9 +44,6 @@ else:
logger = logging.getLogger(__name__)
M = t.TypeVar("M")
T = t.TypeVar("T")
def assign_start_model_name(start_model_name: str) -> t.Callable[..., t.Any]:
def wrapper(fn: t.Callable[..., t.Any]) -> t.Callable[..., t.Any]:
@@ -156,7 +153,7 @@ class BaseLLMRunnable(bentoml.Runnable, ABC):
# TODO: Add support for model validation
class LLMRunnable(BaseLLMRunnable, t.Generic[M, T]):
class LLMRunnable(BaseLLMRunnable):
# The section below defines a loose contract with langchain's LLM interface.
@property
def _llm_type(self) -> str:
@@ -173,8 +170,8 @@ class LLMRunnable(BaseLLMRunnable, t.Generic[M, T]):
# XXX: INTERNAL
_module: LLMModuleType
_model: M | None = None
_tokenizer: T | None = None
_model: t.Any | None = None
_tokenizer: t.Any | None = None
def __setattr__(self, attr_name: str, value: t.Any) -> None:
if attr_name in ("ATTACH_TOKENIZER",):
@@ -249,18 +246,18 @@ class LLMRunnable(BaseLLMRunnable, t.Generic[M, T]):
return super().__getattribute__(item)
@classmethod
def dummy_object(cls) -> LLMRunnable[M, T]:
def dummy_object(cls) -> LLMRunnable:
return cls(_dummy=True, _internal=True)
@property
def model(self) -> M:
def model(self) -> t.Any:
# NOTE: should we have support for nested runner here?
if self._model is None:
self._model = self._bentomodel.load_model()
return self._model
@property
def tokenizer(self) -> T:
def tokenizer(self) -> t.Any:
# This is the runner generated from the bento model. This can
# then be used for implementation of _generate.
if self._tokenizer is None:
@@ -368,7 +365,7 @@ class LLMRunner(bentoml.Runner):
def __init__(
self,
runnable_class: type[LLMRunnable[t.Any, t.Any]],
runnable_class: type[LLMRunnable],
llm_config: LLMConfig,
**kwargs: t.Any,
):

View File

@@ -48,14 +48,10 @@ class LLMModuleType(LazyLoader):
) -> bentoml.Model:
...
class LLMConfigImpl(LLMConfig):
class LLMConfigImpl(LLMConfig, model_name="dummy"):
...
class LLMRunnableImpl(LLMRunnable[t.Any, t.Any], start_model_name="dummy"):
...
@staticmethod
def RunnableNameTokenizer(model_name: str | None = None, embedded: bool = True, **kwargs: t.Any) -> TokenizerRunner:
class LLMRunnableImpl(LLMRunnable, start_model_name="dummy"):
...