feat: chatglm and configuration naming type

By default, the configuration naming type is "dasherize", but for cases like ChatGLM it can be "lowercase" as well.

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
This commit is contained in:
Aaron
2023-05-24 04:19:49 -07:00
parent 11c7783a0e
commit 2676085b59
11 changed files with 294 additions and 49 deletions

View File

@@ -42,6 +42,7 @@ _import_structure = {
"models.auto": ["AutoConfig", "CONFIG_MAPPING"],
"models.flan_t5": ["FlanT5Config"],
"models.dolly_v2": ["DollyV2Config"],
"models.chatglm": ["ChatGLMConfig"],
}
try:
@@ -54,6 +55,7 @@ except MissingDependencyError:
else:
_import_structure["models.flan_t5"].extend(["FlanT5"])
_import_structure["models.dolly_v2"].extend(["DollyV2"])
_import_structure["models.chatglm"].extend(["ChatGLM"])
_import_structure["models.auto"].extend(["AutoLLM", "MODEL_MAPPING_NAMES", "MODEL_MAPPING"])
try:
@@ -99,6 +101,7 @@ if t.TYPE_CHECKING:
from .cli import start_grpc as start_grpc
from .models.auto import CONFIG_MAPPING as CONFIG_MAPPING
from .models.auto import AutoConfig as AutoConfig
from .models.chatglm import ChatGLMConfig as ChatGLMConfig
from .models.dolly_v2 import DollyV2Config as DollyV2Config
from .models.flan_t5 import FlanT5Config as FlanT5Config
@@ -111,6 +114,7 @@ if t.TYPE_CHECKING:
from .models.auto import MODEL_MAPPING as MODEL_MAPPING
from .models.auto import MODEL_MAPPING_NAMES as MODEL_MAPPING_NAMES
from .models.auto import AutoLLM as AutoLLM
from .models.chatglm import ChatGLM as ChatGLM
from .models.dolly_v2 import DollyV2 as DollyV2
from .models.flan_t5 import FlanT5 as FlanT5

View File

@@ -430,29 +430,43 @@ class LLMConfig(pydantic.BaseModel, ABC):
__openllm_model_name__: str = ""
__openllm_start_name__: str = ""
__openllm_timeout__: int = 0
__openllm_name_type__: t.Literal["dasherize", "lowercase"] = "dasherize"
GenerationConfig: type[t.Any] = GenerationConfig
def __init_subclass__(cls, *, default_timeout: int | None = None, **kwargs: t.Any):
def __init_subclass__(
cls,
*,
default_timeout: int | None = None,
name_type: t.Literal["dasherize", "lowercase"] = "dasherize",
**kwargs: t.Any,
):
if default_timeout is None:
default_timeout = 3600
cls.__openllm_timeout__ = default_timeout
if name_type not in ("dasherize", "lowercase"):
raise RuntimeError(f"Unknown name_type {name_type}. Only allowed are 'dasherize' and 'lowercase'.")
cls.__openllm_name_type__ = name_type
super(LLMConfig, cls).__init_subclass__(**kwargs)
@classmethod
def __pydantic_init_subclass__(cls, **kwargs: t.Any):
cls.__openllm_model_name__ = inflection.underscore(cls.__name__.replace("Config", ""))
cls.__openllm_start_name__ = inflection.dasherize(cls.__openllm_model_name__)
def __pydantic_init_subclass__(cls, **_: t.Any):
if cls.__openllm_name_type__ == "dasherize":
cls.__openllm_model_name__ = inflection.underscore(cls.__name__.replace("Config", ""))
cls.__openllm_start_name__ = inflection.dasherize(cls.__openllm_model_name__)
else:
cls.__openllm_model_name__ = cls.__name__.replace("Config", "").lower()
cls.__openllm_start_name__ = cls.__openllm_model_name__
if hasattr(cls, "GenerationConfig"):
generation_class = t.cast(
cls.generation_config = t.cast(
"type[GenerationConfig]",
types.new_class(
cls.__name__.replace("Config", "") + "GenerationConfig",
(GenerationConfig,),
{"model_name": cls.__openllm_model_name__, "_internal": True},
),
)
cls.generation_config = generation_class.construct_from_llm_config(cls)
).construct_from_llm_config(cls)
delattr(cls, "GenerationConfig")
for key, field in cls.model_fields.items():

View File

@@ -70,7 +70,14 @@ class TaskType(enum.Enum, metaclass=TypeMeta):
TEXT2TEXT_GENERATION = enum.auto()
def import_model(model_name: str, tag: bentoml.Tag, __openllm_framework__: str, *model_args: t.Any, **kwds: t.Any):
def import_model(
model_name: str,
tag: bentoml.Tag,
__openllm_framework__: str,
*model_args: t.Any,
tokenizer_kwds: dict[str, t.Any],
**kwds: t.Any,
):
"""Auto detect model type from given model_name and import it to bentoml's model store.
For all kwargs, it will be parsed into `transformers.AutoConfig.from_pretrained` first, returning all of the unused kwargs.
@@ -91,10 +98,6 @@ def import_model(model_name: str, tag: bentoml.Tag, __openllm_framework__: str,
config: transformers.PretrainedConfig = kwds.pop("config", None)
trust_remote_code = kwds.pop("trust_remote_code", False)
tokenizer_kwds = {k[len("_tokenizer_") :]: v for k, v in kwds.items() if k.startswith("_tokenizer_")}
kwds = {k: v for k, v in kwds.items() if not k.startswith("_tokenizer_")}
# this logic below is synonymous to handling `from_pretrained` kwds.
hub_kwds_names = [
"cache_dir",
@@ -117,6 +120,7 @@ def import_model(model_name: str, tag: bentoml.Tag, __openllm_framework__: str,
model_name, return_unused_kwargs=True, trust_remote_code=trust_remote_code, **hub_kwds, **copied_kwds
),
)
if type(config) in transformers.MODEL_FOR_CAUSAL_LM_MAPPING:
task_type = "text-generation"
elif type(config) in transformers.MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING:
@@ -186,14 +190,6 @@ class LLMInterface(ABC):
)
if t.TYPE_CHECKING:
class LLMRunnable(bentoml.Runnable):
@abstractmethod
def generate(self, prompt: str, **kwargs: t.Any) -> t.Any:
...
class LLM(LLMInterface):
_implementation: t.Literal["pt", "tf", "flax"]
@@ -204,7 +200,14 @@ class LLM(LLMInterface):
if t.TYPE_CHECKING:
def import_model(self, pretrained: str, tag: bentoml.Tag, *model_args: t.Any, **kwds: t.Any) -> bentoml.Model:
def import_model(
self,
pretrained: str,
tag: bentoml.Tag,
*args: t.Any,
tokenizer_kwds: dict[str, t.Any],
**kwds: t.Any,
) -> bentoml.Model:
...
def __init_subclass__(cls, *, implementation: t.Literal["pt", "tf", "flax"] = "pt", _internal: bool = False):
@@ -216,7 +219,7 @@ class LLM(LLMInterface):
if implementation == "tf":
cls.config_class = getattr(openllm, f"{cls.__name__[2:]}Config")
elif implementation == "flax":
cls.config_class = getattr(openllm, f"{cls.__name__[len('flax'):]}Config")
cls.config_class = getattr(openllm, f"{cls.__name__[4:]}Config")
else:
cls.config_class = getattr(openllm, f"{cls.__name__}Config")
else:
@@ -245,7 +248,11 @@ class LLM(LLMInterface):
return {"configuration": self.config.model_dump(), "variants": self.variants}
def __init__(
self, pretrained: str | None = None, llm_config: openllm.LLMConfig | None = None, *args: t.Any, **kwargs: t.Any
self,
pretrained: str | None = None,
llm_config: openllm.LLMConfig | None = None,
*args: t.Any,
**kwargs: t.Any,
):
"""Initialize the LLM with given pretrained model.
@@ -261,16 +268,24 @@ class LLM(LLMInterface):
If you need to overwrite the default ``import_model``, implement the following in your subclass:
```python
def import_model(self, pretrained: str, tag: bentoml.Tag, *args: t.Any, **kwargs: t.Any):
tokenizer_kwargs = {k[len('_tokenizer_'):]: v for k, v in kwargs.items() if k.startswith('_tokenizer_')]}
kwargs = {k: v for k, v in kwargs.items() if not k.startswith('_tokenizer_')}
def import_model(
self,
pretrained: str,
tag: bentoml.Tag,
*args: t.Any,
tokenizer_kwds: dict[str, t.Any],
**kwargs: t.Any,
):
return bentoml.transformers.save_model(
str(tag),
transformers.AutoModelForCausalLM.from_pretrained(
pretrained, device_map="auto", torch_dtype=torch.bfloat16, **kwargs
),
custom_objects={"tokenizer": transformers.AutoTokenizer.from_pretrained(pretrained, padding_size="left",
**tokenizer_kwargs)},
custom_objects={
"tokenizer": transformers.AutoTokenizer.from_pretrained(
pretrained, padding_size="left", **tokenizer_kwds
)
},
)
```
@@ -295,7 +310,7 @@ class LLM(LLMInterface):
"""
if llm_config is not None:
logger.debug("Using given 'llm_config=%s' to initialize LLM", llm_config)
logger.debug("Using given 'llm_config=(%s)' to initialize LLM", llm_config)
self.config = llm_config
else:
self.config = self.config_class(**kwargs)
@@ -319,7 +334,11 @@ class LLM(LLMInterface):
@property
def _bentomodel(self) -> bentoml.Model:
if self.__bentomodel__ is None:
tag, kwargs = openllm.utils.generate_tags(self._pretrained, prefix=self._implementation, **self._kwargs)
tag, kwds = openllm.utils.generate_tags(self._pretrained, prefix=self._implementation, **self._kwargs)
tokenizer_kwds = {k[len("_tokenizer_") :]: v for k, v in kwds.items() if k.startswith("_tokenizer_")}
kwds = {k: v for k, v in kwds.items() if not k.startswith("_tokenizer_")}
try:
self.__bentomodel__ = bentoml.transformers.get(tag)
except bentoml.exceptions.BentoMLException:
@@ -328,13 +347,20 @@ class LLM(LLMInterface):
)
if hasattr(self, "import_model"):
logger.debug("Using custom 'import_model' defined in subclass.")
self.__bentomodel__ = self.import_model(self._pretrained, tag, *self._args, **kwargs)
self.__bentomodel__ = self.import_model(
self._pretrained, tag, *self._args, tokenizer_kwds=tokenizer_kwds, **kwds
)
else:
if self.import_kwargs:
kwargs = {**self.import_kwargs, **kwargs}
kwds = {**self.import_kwargs, **kwds}
# NOTE: In this branch, we just use the default implementation.
self.__bentomodel__ = import_model(
self._pretrained, tag, __openllm_framework__=self._implementation, *self._args, **kwargs
self._pretrained,
tag,
*self._args,
tokenizer_kwds=tokenizer_kwds,
__openllm_framework__=self._implementation,
**kwds,
)
return self.__bentomodel__
@@ -360,7 +386,8 @@ class LLM(LLMInterface):
# This could happen if users implement their own import_model
raise openllm.exceptions.OpenLLMException(
"Model does not have tokenizer. Make sure to save \
the tokenizer within the model via 'custom_objects'."
the tokenizer within the model via 'custom_objects'.\
For example: bentoml.transformers.save_model(..., custom_objects={'tokenizer': tokenizer}))"
)
return self.__llm_tokenizer__
@@ -406,7 +433,9 @@ class LLM(LLMInterface):
method_configs = {"generate": generate_sig, "generate_iterator": generate_iterator_sig}
else:
generate_sig = ModelSignature.convert_signatures_dict(method_configs).get("generate", generate_sig)
ModelSignature.convert_signatures_dict(method_configs).get("generate_iterator", generate_iterator_sig)
generate_iterator_sig = ModelSignature.convert_signatures_dict(method_configs).get(
"generate_iterator", generate_iterator_sig
)
class _Runnable(bentoml.Runnable):
SUPPORTED_RESOURCES = ("nvidia.com/gpu", "cpu")
@@ -418,7 +447,7 @@ class LLM(LLMInterface):
input_spec=generate_sig.input_spec,
output_spec=generate_sig.output_spec,
)
def generate(__self, prompt: str, **kwds: t.Any) -> list[str]:
def generate(__self, prompt: str, **kwds: t.Any) -> list[t.Any]:
return self.generate(prompt, **kwds)
@bentoml.Runnable.method(
@@ -427,14 +456,11 @@ class LLM(LLMInterface):
input_spec=generate_iterator_sig.input_spec,
output_spec=generate_iterator_sig.output_spec,
)
def generate_iterator(__self, prompt: str, **kwds: t.Any) -> t.Iterator[str]:
return self.generate_iterator(prompt, **kwds)
def generate_iterator(__self, prompt: str, **kwds: t.Any) -> t.Iterator[t.Any]:
yield self.generate_iterator(prompt, **kwds)
return bentoml.Runner(
t.cast(
"type[LLMRunnable]",
types.new_class(inflection.camelize(self.config.__openllm_model_name__) + "Runnable", (_Runnable,)),
),
types.new_class(inflection.camelize(self.config.__openllm_model_name__) + "Runnable", (_Runnable,)),
runnable_init_params=kwargs,
name=name,
models=models,

View File

@@ -13,4 +13,6 @@
# limitations under the License.
from . import auto as auto
from . import chatglm as chatglm
from . import dolly_v2 as dolly_v2
from . import flan_t5 as flan_t5

View File

@@ -28,7 +28,13 @@ else:
ConfigOrderedDict = OrderedDict
# NOTE: This is the entrypoint when adding new model config
CONFIG_MAPPING_NAMES = OrderedDict([("flan_t5", "FlanT5Config"), ("dolly_v2", "DollyV2Config")])
CONFIG_MAPPING_NAMES = OrderedDict(
[
("flan_t5", "FlanT5Config"),
("dolly_v2", "DollyV2Config"),
("chatglm", "ChatGLMConfig"),
]
)
class _LazyConfigMapping(ConfigOrderedDict):

View File

@@ -19,7 +19,7 @@ from collections import OrderedDict
from .configuration_auto import CONFIG_MAPPING_NAMES
from .factory import _BaseAutoLLMClass, _LazyAutoMapping
MODEL_MAPPING_NAMES = OrderedDict([("flan_t5", "FlanT5"), ("dolly_v2", "DollyV2")])
MODEL_MAPPING_NAMES = OrderedDict([("flan_t5", "FlanT5"), ("dolly_v2", "DollyV2"), ("chatglm", "ChatGLM")])
MODEL_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_MAPPING_NAMES)

View File

@@ -14,4 +14,39 @@
from __future__ import annotations
raise NotImplementedError("This module is not implemented yet.")
import typing as t
import openllm
_import_structure = {
"configuration_chatglm": ["ChatGLMConfig", "START_CHATGLM_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"],
}
try:
if not openllm.utils.is_torch_available():
raise openllm.exceptions.MissingDependencyError
except openllm.exceptions.MissingDependencyError:
pass
else:
_import_structure["modeling_chatglm"] = ["ChatGLM"]
if t.TYPE_CHECKING:
from .configuration_chatglm import \
DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE
from .configuration_chatglm import \
START_CHATGLM_COMMAND_DOCSTRING as START_CHATGLM_COMMAND_DOCSTRING
from .configuration_chatglm import ChatGLMConfig as ChatGLMConfig
try:
if not openllm.utils.is_torch_available():
raise openllm.exceptions.MissingDependencyError
except openllm.exceptions.MissingDependencyError:
pass
else:
from .modeling_chatglm import ChatGLM as ChatGLM
else:
import sys
sys.modules[__name__] = openllm.utils.LazyModule(
__name__, globals()["__file__"], _import_structure, module_spec=__spec__
)

View File

@@ -0,0 +1,49 @@
# Copyright 2023 BentoML Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
import openllm
class ChatGLMConfig(openllm.LLMConfig, name_type="lowercase"):
    """Configuration for the ChatGLM model.

    ``name_type="lowercase"`` makes the derived model/start names plain
    lowercase ("chatglm") instead of the default dasherized form.
    """

    # Whether to keep (query, response) pairs between generate() calls.
    retain_history: bool = True
    """Whether to retain history given to the model. If set to True, then the model will retain given history."""

    # Whether to run the model in fp16.
    use_half_precision: bool = True
    """Whether to use half precision for model."""

    class GenerationConfig:
        # Default generation parameters; this inner class is consumed by
        # LLMConfig and turned into the model's generation config.
        max_length: int = 2048
        num_beams: int = 1
        top_p: float = 0.7
        temperature: float = 0.95
# NOTE(review): presumably rendered as the `openllm start chatglm` CLI help
# text — confirm against the cli module before relying on formatting.
START_CHATGLM_COMMAND_DOCSTRING = """\
Run a LLMServer for ChatGLM model and variants.
\b
> See more information about ChatGLM at [THUDM/ChatGLM-6b](https://huggingface.co/thudm/chatglm-6b)
\b
## Usage
Currently, ChatGLM only supports PyTorch. Make sure ``torch`` is available in your system.
\b
ChatGLM Runner will use THUDM/ChatGLM-6b as the default model. To change any to any other ChatGLM
saved pretrained, or a fine-tune ChatGLM, provide ``OPENLLM_CHATGLM_PRETRAINED='thudm/chatglm-6b-int8'``
"""

View File

@@ -0,0 +1,104 @@
# Copyright 2023 BentoML Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
import typing as t
import bentoml
import transformers
from transformers.generation.logits_process import LogitsProcessor
from transformers.generation.utils import LogitsProcessorList
import openllm
if t.TYPE_CHECKING:
import torch
else:
torch = openllm.utils.LazyLoader("torch", globals(), "torch")
class InvalidScoreLogitsProcessor(LogitsProcessor):
    """Ported from modeling_chatglm.py.

    Guards against degenerate logits: if any score is NaN or infinite, the
    whole distribution is reset and a single large logit is placed on
    token id 5 so sampling stays well-defined.
    """

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        contains_invalid = torch.isnan(scores).any() or torch.isinf(scores).any()
        if contains_invalid:
            # Mutates `scores` in place, mirroring upstream behavior.
            scores.zero_()
            scores[..., 5] = 5e4
        return scores
class ChatGLM(openllm.LLM, _internal=True):
    """ChatGLM implementation on top of ``openllm.LLM`` (PyTorch only)."""

    default_model = "THUDM/chatglm-6b"

    variants = ["THUDM/chatglm-6b", "THUDM/chatglm-6b-int8", "THUDM/chatglm-6b-int4"]

    def model_post_init(self, _: t.Any):
        # Conversation history as (query, response) pairs, oldest first.
        self.history: list[tuple[str, str]] = []

    def import_model(
        self, pretrained: str, tag: bentoml.Tag, *model_args: t.Any, tokenizer_kwds: dict[str, t.Any], **kwds: t.Any
    ) -> bentoml.Model:
        """Save the ChatGLM model and its tokenizer into the BentoML model store.

        ``trust_remote_code=True`` is required because ChatGLM ships its
        modeling code on the Hugging Face Hub rather than inside transformers.
        """
        return bentoml.transformers.save_model(
            str(tag),
            transformers.AutoModel.from_pretrained(pretrained, trust_remote_code=True, **kwds),
            custom_objects={
                "tokenizer": transformers.AutoTokenizer.from_pretrained(
                    pretrained, trust_remote_code=True, **tokenizer_kwds
                )
            },
        )

    @torch.inference_mode()
    def generate(
        self,
        prompt: str,
        max_length: int | None = None,
        num_beams: int | None = None,
        top_p: float | None = None,
        temperature: float | None = None,
        **kwargs: t.Any,
    ) -> t.Any:
        """Generate a chat response for ``prompt``.

        Returns the conversation history including the new (prompt, response)
        turn. When ``config.retain_history`` is True the turn is also stored
        on ``self.history`` for subsequent calls.
        """
        if torch.cuda.is_available():
            self.model = self.model.cuda()
        if self.config.use_half_precision:
            self.model = self.model.half()
        self.model.eval()
        logit_processor = LogitsProcessorList()
        logit_processor.append(InvalidScoreLogitsProcessor())
        # Build the ChatGLM chat prompt from the retained history.
        prompt_text = ""
        for i, (old_query, response) in enumerate(self.history):
            prompt_text += f"[Round {i}]\n问:{old_query}\n答:{response}\n"
        prompt_text += f"[Round {len(self.history)}]\n问:{prompt}\n答:"
        inputs = self.tokenizer([prompt_text], return_tensors="pt").to(self.model.device)
        outputs = self.model.generate(
            **inputs,
            generation_config=self.config.with_options(
                max_length=max_length,
                num_beams=num_beams,
                top_p=top_p,
                temperature=temperature,
                do_sample=True,
                **kwargs,
            ).to_generation_config(),
            logits_processor=logit_processor,
        )
        # Strip the prompt tokens and decode only the newly generated ones.
        outputs = outputs.tolist()[0][len(inputs["input_ids"][0]) :]
        response = self.tokenizer.decode(outputs)
        response = self.model.process_response(response)
        if self.config.retain_history:
            self.history.append((prompt, response))
            return self.history
        # BUGFIX: previously this returned the stale self.history when
        # retain_history=False, silently dropping the new response.
        return [(prompt, response)]

View File

@@ -11,14 +11,12 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The following includes OpenLLM configuration and excerpt from
[instruct_pipeline.py](https://huggingface.co/databricks/dolly-v2-3b/blob/main/instruct_pipeline.py)"""
from __future__ import annotations
import openllm
# NOTE: The following includes OpenLLM configuration and excerpt from [instruct_pipeline.py](https://huggingface.co/databricks/dolly-v2-3b/blob/main/instruct_pipeline.py)
class DollyV2Config(openllm.LLMConfig, default_timeout=3600000):
"""Configuration for the dolly-v2 model."""
@@ -55,7 +53,7 @@ INTRO_BLURB = (
"Below is an instruction that describes a task. Write a response that appropriately completes the request."
)
# This is the prompt that is used for generating responses using an already trained model. It ends with the response
# NOTE: This is the prompt that is used for generating responses using an already trained model. It ends with the response
# key, where the job of the model is to provide the completion that follows it (i.e. the response itself).
DEFAULT_PROMPT_TEMPLATE = """{intro}
{instruction_key}

View File

@@ -19,6 +19,13 @@ class DollyV2(metaclass=DummyMetaclass):
require_backends(self, ["torch"])
class ChatGLM(metaclass=DummyMetaclass):
    # Placeholder exposed when torch is unavailable; instantiation delegates
    # to require_backends with the missing backend list.
    # NOTE(review): the actual error behavior lives in require_backends —
    # presumably it raises a "torch is required" error; confirm there.
    _backends = ["torch"]

    def __init__(self, *args: t.Any, **kwargs: t.Any):
        require_backends(self, ["torch"])
class AutoLLM(metaclass=DummyMetaclass):
_backends = ["torch"]