diff --git a/src/openllm/__init__.py b/src/openllm/__init__.py index 78241eba..b029a9e3 100644 --- a/src/openllm/__init__.py +++ b/src/openllm/__init__.py @@ -42,6 +42,7 @@ _import_structure = { "models.auto": ["AutoConfig", "CONFIG_MAPPING"], "models.flan_t5": ["FlanT5Config"], "models.dolly_v2": ["DollyV2Config"], + "models.chatglm": ["ChatGLMConfig"], } try: @@ -54,6 +55,7 @@ except MissingDependencyError: else: _import_structure["models.flan_t5"].extend(["FlanT5"]) _import_structure["models.dolly_v2"].extend(["DollyV2"]) + _import_structure["models.chatglm"].extend(["ChatGLM"]) _import_structure["models.auto"].extend(["AutoLLM", "MODEL_MAPPING_NAMES", "MODEL_MAPPING"]) try: @@ -99,6 +101,7 @@ if t.TYPE_CHECKING: from .cli import start_grpc as start_grpc from .models.auto import CONFIG_MAPPING as CONFIG_MAPPING from .models.auto import AutoConfig as AutoConfig + from .models.chatglm import ChatGLMConfig as ChatGLMConfig from .models.dolly_v2 import DollyV2Config as DollyV2Config from .models.flan_t5 import FlanT5Config as FlanT5Config @@ -111,6 +114,7 @@ if t.TYPE_CHECKING: from .models.auto import MODEL_MAPPING as MODEL_MAPPING from .models.auto import MODEL_MAPPING_NAMES as MODEL_MAPPING_NAMES from .models.auto import AutoLLM as AutoLLM + from .models.chatglm import ChatGLM as ChatGLM from .models.dolly_v2 import DollyV2 as DollyV2 from .models.flan_t5 import FlanT5 as FlanT5 diff --git a/src/openllm/_configuration.py b/src/openllm/_configuration.py index 2bb9e60c..8c765ad0 100644 --- a/src/openllm/_configuration.py +++ b/src/openllm/_configuration.py @@ -430,29 +430,43 @@ class LLMConfig(pydantic.BaseModel, ABC): __openllm_model_name__: str = "" __openllm_start_name__: str = "" __openllm_timeout__: int = 0 + __openllm_name_type__: t.Literal["dasherize", "lowercase"] = "dasherize" GenerationConfig: type[t.Any] = GenerationConfig - def __init_subclass__(cls, *, default_timeout: int | None = None, **kwargs: t.Any): + def __init_subclass__( + cls, + *, + 
default_timeout: int | None = None, + name_type: t.Literal["dasherize", "lowercase"] = "dasherize", + **kwargs: t.Any, + ): if default_timeout is None: default_timeout = 3600 cls.__openllm_timeout__ = default_timeout + if name_type not in ("dasherize", "lowercase"): + raise RuntimeError(f"Unknown name_type {name_type}. Only allowed are 'dasherize' and 'lowercase'.") + cls.__openllm_name_type__ = name_type super(LLMConfig, cls).__init_subclass__(**kwargs) @classmethod - def __pydantic_init_subclass__(cls, **kwargs: t.Any): - cls.__openllm_model_name__ = inflection.underscore(cls.__name__.replace("Config", "")) - cls.__openllm_start_name__ = inflection.dasherize(cls.__openllm_model_name__) + def __pydantic_init_subclass__(cls, **_: t.Any): + if cls.__openllm_name_type__ == "dasherize": + cls.__openllm_model_name__ = inflection.underscore(cls.__name__.replace("Config", "")) + cls.__openllm_start_name__ = inflection.dasherize(cls.__openllm_model_name__) + else: + cls.__openllm_model_name__ = cls.__name__.replace("Config", "").lower() + cls.__openllm_start_name__ = cls.__openllm_model_name__ + if hasattr(cls, "GenerationConfig"): - generation_class = t.cast( + cls.generation_config = t.cast( "type[GenerationConfig]", types.new_class( cls.__name__.replace("Config", "") + "GenerationConfig", (GenerationConfig,), {"model_name": cls.__openllm_model_name__, "_internal": True}, ), - ) - cls.generation_config = generation_class.construct_from_llm_config(cls) + ).construct_from_llm_config(cls) delattr(cls, "GenerationConfig") for key, field in cls.model_fields.items(): diff --git a/src/openllm/_llm.py b/src/openllm/_llm.py index 0df5160b..5caae2ba 100644 --- a/src/openllm/_llm.py +++ b/src/openllm/_llm.py @@ -70,7 +70,14 @@ class TaskType(enum.Enum, metaclass=TypeMeta): TEXT2TEXT_GENERATION = enum.auto() -def import_model(model_name: str, tag: bentoml.Tag, __openllm_framework__: str, *model_args: t.Any, **kwds: t.Any): +def import_model( + model_name: str, + tag: bentoml.Tag, + 
__openllm_framework__: str, + *model_args: t.Any, + tokenizer_kwds: dict[str, t.Any], + **kwds: t.Any, +): """Auto detect model type from given model_name and import it to bentoml's model store. For all kwargs, it will be parsed into `transformers.AutoConfig.from_pretrained` first, returning all of the unused kwargs. @@ -91,10 +98,6 @@ def import_model(model_name: str, tag: bentoml.Tag, __openllm_framework__: str, config: transformers.PretrainedConfig = kwds.pop("config", None) trust_remote_code = kwds.pop("trust_remote_code", False) - tokenizer_kwds = {k[len("_tokenizer_") :]: v for k, v in kwds.items() if k.startswith("_tokenizer_")} - - kwds = {k: v for k, v in kwds.items() if not k.startswith("_tokenizer_")} - # this logic below is synonymous to handling `from_pretrained` kwds. hub_kwds_names = [ "cache_dir", @@ -117,6 +120,7 @@ def import_model(model_name: str, tag: bentoml.Tag, __openllm_framework__: str, model_name, return_unused_kwargs=True, trust_remote_code=trust_remote_code, **hub_kwds, **copied_kwds ), ) + if type(config) in transformers.MODEL_FOR_CAUSAL_LM_MAPPING: task_type = "text-generation" elif type(config) in transformers.MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING: @@ -186,14 +190,6 @@ class LLMInterface(ABC): ) -if t.TYPE_CHECKING: - - class LLMRunnable(bentoml.Runnable): - @abstractmethod - def generate(self, prompt: str, **kwargs: t.Any) -> t.Any: - ... - - class LLM(LLMInterface): _implementation: t.Literal["pt", "tf", "flax"] @@ -204,7 +200,14 @@ class LLM(LLMInterface): if t.TYPE_CHECKING: - def import_model(self, pretrained: str, tag: bentoml.Tag, *model_args: t.Any, **kwds: t.Any) -> bentoml.Model: + def import_model( + self, + pretrained: str, + tag: bentoml.Tag, + *args: t.Any, + tokenizer_kwds: dict[str, t.Any], + **kwds: t.Any, + ) -> bentoml.Model: ... 
def __init_subclass__(cls, *, implementation: t.Literal["pt", "tf", "flax"] = "pt", _internal: bool = False): @@ -216,7 +219,7 @@ class LLM(LLMInterface): if implementation == "tf": cls.config_class = getattr(openllm, f"{cls.__name__[2:]}Config") elif implementation == "flax": - cls.config_class = getattr(openllm, f"{cls.__name__[len('flax'):]}Config") + cls.config_class = getattr(openllm, f"{cls.__name__[4:]}Config") else: cls.config_class = getattr(openllm, f"{cls.__name__}Config") else: @@ -245,7 +248,11 @@ class LLM(LLMInterface): return {"configuration": self.config.model_dump(), "variants": self.variants} def __init__( - self, pretrained: str | None = None, llm_config: openllm.LLMConfig | None = None, *args: t.Any, **kwargs: t.Any + self, + pretrained: str | None = None, + llm_config: openllm.LLMConfig | None = None, + *args: t.Any, + **kwargs: t.Any, ): """Initialize the LLM with given pretrained model. @@ -261,16 +268,24 @@ class LLM(LLMInterface): If you need to overwrite the default ``import_model``, implement the following in your subclass: ```python - def import_model(self, pretrained: str, tag: bentoml.Tag, *args: t.Any, **kwargs: t.Any): - tokenizer_kwargs = {k[len('_tokenizer_'):]: v for k, v in kwargs.items() if k.startswith('_tokenizer_')]} - kwargs = {k: v for k, v in kwargs.items() if not k.startswith('_tokenizer_')} + def import_model( + self, + pretrained: str, + tag: bentoml.Tag, + *args: t.Any, + tokenizer_kwds: dict[str, t.Any], + **kwargs: t.Any, + ): return bentoml.transformers.save_model( str(tag), transformers.AutoModelForCausalLM.from_pretrained( pretrained, device_map="auto", torch_dtype=torch.bfloat16, **kwargs ), - custom_objects={"tokenizer": transformers.AutoTokenizer.from_pretrained(pretrained, padding_size="left", - **tokenizer_kwargs)}, + custom_objects={ + "tokenizer": transformers.AutoTokenizer.from_pretrained( + pretrained, padding_size="left", **tokenizer_kwds + ) + }, ) ``` @@ -295,7 +310,7 @@ class LLM(LLMInterface): """ 
if llm_config is not None: - logger.debug("Using given 'llm_config=%s' to initialize LLM", llm_config) + logger.debug("Using given 'llm_config=(%s)' to initialize LLM", llm_config) self.config = llm_config else: self.config = self.config_class(**kwargs) @@ -319,7 +334,11 @@ class LLM(LLMInterface): @property def _bentomodel(self) -> bentoml.Model: if self.__bentomodel__ is None: - tag, kwargs = openllm.utils.generate_tags(self._pretrained, prefix=self._implementation, **self._kwargs) + tag, kwds = openllm.utils.generate_tags(self._pretrained, prefix=self._implementation, **self._kwargs) + + tokenizer_kwds = {k[len("_tokenizer_") :]: v for k, v in kwds.items() if k.startswith("_tokenizer_")} + kwds = {k: v for k, v in kwds.items() if not k.startswith("_tokenizer_")} + try: self.__bentomodel__ = bentoml.transformers.get(tag) except bentoml.exceptions.BentoMLException: @@ -328,13 +347,20 @@ class LLM(LLMInterface): ) if hasattr(self, "import_model"): logger.debug("Using custom 'import_model' defined in subclass.") - self.__bentomodel__ = self.import_model(self._pretrained, tag, *self._args, **kwargs) + self.__bentomodel__ = self.import_model( + self._pretrained, tag, *self._args, tokenizer_kwds=tokenizer_kwds, **kwds + ) else: if self.import_kwargs: - kwargs = {**self.import_kwargs, **kwargs} + kwds = {**self.import_kwargs, **kwds} # NOTE: In this branch, we just use the default implementation. self.__bentomodel__ = import_model( - self._pretrained, tag, __openllm_framework__=self._implementation, *self._args, **kwargs + self._pretrained, + tag, + *self._args, + tokenizer_kwds=tokenizer_kwds, + __openllm_framework__=self._implementation, + **kwds, ) return self.__bentomodel__ @@ -360,7 +386,8 @@ class LLM(LLMInterface): # This could happen if users implement their own import_model raise openllm.exceptions.OpenLLMException( "Model does not have tokenizer. Make sure to save \ - the tokenizer within the model via 'custom_objects'." 
+ the tokenizer within the model via 'custom_objects'.\ + For example: bentoml.transformers.save_model(..., custom_objects={'tokenizer': tokenizer})" ) return self.__llm_tokenizer__ @@ -406,7 +433,9 @@ class LLM(LLMInterface): method_configs = {"generate": generate_sig, "generate_iterator": generate_iterator_sig} else: generate_sig = ModelSignature.convert_signatures_dict(method_configs).get("generate", generate_sig) - ModelSignature.convert_signatures_dict(method_configs).get("generate_iterator", generate_iterator_sig) + generate_iterator_sig = ModelSignature.convert_signatures_dict(method_configs).get( + "generate_iterator", generate_iterator_sig + ) class _Runnable(bentoml.Runnable): SUPPORTED_RESOURCES = ("nvidia.com/gpu", "cpu") @@ -418,7 +447,7 @@ class LLM(LLMInterface): input_spec=generate_sig.input_spec, output_spec=generate_sig.output_spec, ) - def generate(__self, prompt: str, **kwds: t.Any) -> list[str]: + def generate(__self, prompt: str, **kwds: t.Any) -> list[t.Any]: return self.generate(prompt, **kwds) @bentoml.Runnable.method( @@ -427,14 +456,11 @@ class LLM(LLMInterface): input_spec=generate_iterator_sig.input_spec, output_spec=generate_iterator_sig.output_spec, ) - def generate_iterator(__self, prompt: str, **kwds: t.Any) -> t.Iterator[str]: - return self.generate_iterator(prompt, **kwds) + def generate_iterator(__self, prompt: str, **kwds: t.Any) -> t.Iterator[t.Any]: + yield from self.generate_iterator(prompt, **kwds) return bentoml.Runner( - t.cast( - "type[LLMRunnable]", - types.new_class(inflection.camelize(self.config.__openllm_model_name__) + "Runnable", (_Runnable,)), - ), + types.new_class(inflection.camelize(self.config.__openllm_model_name__) + "Runnable", (_Runnable,)), runnable_init_params=kwargs, name=name, models=models, diff --git a/src/openllm/models/__init__.py b/src/openllm/models/__init__.py index b32b98e0..92f09876 100644 --- a/src/openllm/models/__init__.py +++ b/src/openllm/models/__init__.py @@ -13,4 +13,6 @@ # limitations 
under the License. from . import auto as auto +from . import chatglm as chatglm +from . import dolly_v2 as dolly_v2 from . import flan_t5 as flan_t5 diff --git a/src/openllm/models/auto/configuration_auto.py b/src/openllm/models/auto/configuration_auto.py index 5c6b109c..0f4556fb 100644 --- a/src/openllm/models/auto/configuration_auto.py +++ b/src/openllm/models/auto/configuration_auto.py @@ -28,7 +28,13 @@ else: ConfigOrderedDict = OrderedDict # NOTE: This is the entrypoint when adding new model config -CONFIG_MAPPING_NAMES = OrderedDict([("flan_t5", "FlanT5Config"), ("dolly_v2", "DollyV2Config")]) +CONFIG_MAPPING_NAMES = OrderedDict( + [ + ("flan_t5", "FlanT5Config"), + ("dolly_v2", "DollyV2Config"), + ("chatglm", "ChatGLMConfig"), + ] +) class _LazyConfigMapping(ConfigOrderedDict): diff --git a/src/openllm/models/auto/modeling_auto.py b/src/openllm/models/auto/modeling_auto.py index 82af2b80..44e8131b 100644 --- a/src/openllm/models/auto/modeling_auto.py +++ b/src/openllm/models/auto/modeling_auto.py @@ -19,7 +19,7 @@ from collections import OrderedDict from .configuration_auto import CONFIG_MAPPING_NAMES from .factory import _BaseAutoLLMClass, _LazyAutoMapping -MODEL_MAPPING_NAMES = OrderedDict([("flan_t5", "FlanT5"), ("dolly_v2", "DollyV2")]) +MODEL_MAPPING_NAMES = OrderedDict([("flan_t5", "FlanT5"), ("dolly_v2", "DollyV2"), ("chatglm", "ChatGLM")]) MODEL_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_MAPPING_NAMES) diff --git a/src/openllm/models/chatglm/__init__.py b/src/openllm/models/chatglm/__init__.py index 5e1ed3d7..deeba09b 100644 --- a/src/openllm/models/chatglm/__init__.py +++ b/src/openllm/models/chatglm/__init__.py @@ -14,4 +14,39 @@ from __future__ import annotations -raise NotImplementedError("This module is not implemented yet.") +import typing as t + +import openllm + +_import_structure = { + "configuration_chatglm": ["ChatGLMConfig", "START_CHATGLM_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"], +} + +try: + if not 
openllm.utils.is_torch_available(): + raise openllm.exceptions.MissingDependencyError +except openllm.exceptions.MissingDependencyError: + pass +else: + _import_structure["modeling_chatglm"] = ["ChatGLM"] + +if t.TYPE_CHECKING: + from .configuration_chatglm import \ + DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE + from .configuration_chatglm import \ + START_CHATGLM_COMMAND_DOCSTRING as START_CHATGLM_COMMAND_DOCSTRING + from .configuration_chatglm import ChatGLMConfig as ChatGLMConfig + + try: + if not openllm.utils.is_torch_available(): + raise openllm.exceptions.MissingDependencyError + except openllm.exceptions.MissingDependencyError: + pass + else: + from .modeling_chatglm import ChatGLM as ChatGLM +else: + import sys + + sys.modules[__name__] = openllm.utils.LazyModule( + __name__, globals()["__file__"], _import_structure, module_spec=__spec__ + ) diff --git a/src/openllm/models/chatglm/configuration_chatglm.py b/src/openllm/models/chatglm/configuration_chatglm.py index e69de29b..f3711533 100644 --- a/src/openllm/models/chatglm/configuration_chatglm.py +++ b/src/openllm/models/chatglm/configuration_chatglm.py @@ -0,0 +1,49 @@ +# Copyright 2023 BentoML Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from __future__ import annotations + +import openllm + + +class ChatGLMConfig(openllm.LLMConfig, name_type="lowercase"): + """Configuration for the ChatGLM model.""" + + retain_history: bool = True + """Whether to retain history given to the model. If set to True, then the model will retain given history.""" + + use_half_precision: bool = True + """Whether to use half precision for model.""" + + class GenerationConfig: + max_length: int = 2048 + num_beams: int = 1 + top_p: float = 0.7 + temperature: float = 0.95 + + +START_CHATGLM_COMMAND_DOCSTRING = """\ +Run a LLMServer for ChatGLM model and variants. + +\b +> See more information about ChatGLM at [THUDM/ChatGLM-6b](https://huggingface.co/thudm/chatglm-6b) + +\b +## Usage + +Currently, ChatGLM only supports PyTorch. Make sure ``torch`` is available in your system. + +\b +ChatGLM Runner will use THUDM/ChatGLM-6b as the default model. To change any to any other ChatGLM +saved pretrained, or a fine-tune ChatGLM, provide ``OPENLLM_CHATGLM_PRETRAINED='thudm/chatglm-6b-int8'`` +""" diff --git a/src/openllm/models/chatglm/modeling_chatglm.py b/src/openllm/models/chatglm/modeling_chatglm.py new file mode 100644 index 00000000..db58a551 --- /dev/null +++ b/src/openllm/models/chatglm/modeling_chatglm.py @@ -0,0 +1,104 @@ +# Copyright 2023 BentoML Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from __future__ import annotations + +import typing as t + +import bentoml +import transformers +from transformers.generation.logits_process import LogitsProcessor +from transformers.generation.utils import LogitsProcessorList + +import openllm + +if t.TYPE_CHECKING: + import torch +else: + torch = openllm.utils.LazyLoader("torch", globals(), "torch") + + +class InvalidScoreLogitsProcessor(LogitsProcessor): + """Ported from modeling_chatglm.py""" + + def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: + if torch.isnan(scores).any() or torch.isinf(scores).any(): + scores.zero_() + scores[..., 5] = 5e4 + return scores + + +class ChatGLM(openllm.LLM, _internal=True): + default_model = "THUDM/chatglm-6b" + + variants = ["THUDM/chatglm-6b", "THUDM/chatglm-6b-int8", "THUDM/chatglm-6b-int4"] + + def model_post_init(self, _: t.Any): + self.history: list[tuple[str, str]] = [] + + def import_model( + self, pretrained: str, tag: bentoml.Tag, *model_args: t.Any, tokenizer_kwds: dict[str, t.Any], **kwds: t.Any + ) -> bentoml.Model: + return bentoml.transformers.save_model( + str(tag), + transformers.AutoModel.from_pretrained(pretrained, trust_remote_code=True, **kwds), + custom_objects={ + "tokenizer": transformers.AutoTokenizer.from_pretrained( + pretrained, trust_remote_code=True, **tokenizer_kwds + ) + }, + ) + + @torch.inference_mode() + def generate( + self, + prompt: str, + max_length: int | None = None, + num_beams: int | None = None, + top_p: float | None = None, + temperature: float | None = None, + **kwargs: t.Any, + ) -> t.Any: + if torch.cuda.is_available(): + self.model = self.model.cuda() + if self.config.use_half_precision: + self.model = self.model.half() + self.model.eval() + + logit_processor = LogitsProcessorList() + logit_processor.append(InvalidScoreLogitsProcessor()) + + prompt_text = "" + for i, (old_query, response) in enumerate(self.history): + prompt_text += f"[Round {i}]\n问:{old_query}\n答:{response}\n" + 
prompt_text += f"[Round {len(self.history)}]\n问:{prompt}\n答:" + inputs = self.tokenizer([prompt_text], return_tensors="pt").to(self.model.device) + outputs = self.model.generate( + **inputs, + generation_config=self.config.with_options( + max_length=max_length, + num_beams=num_beams, + top_p=top_p, + temperature=temperature, + do_sample=True, + **kwargs, + ).to_generation_config(), + logits_processor=logit_processor, + ) + outputs = outputs.tolist()[0][len(inputs["input_ids"][0]) :] + response = self.tokenizer.decode(outputs) + response = self.model.process_response(response) + if self.config.retain_history: + self.history.append((prompt, response)) + return self.history if self.config.retain_history else [(prompt, response)] diff --git a/src/openllm/models/dolly_v2/configuration_dolly_v2.py b/src/openllm/models/dolly_v2/configuration_dolly_v2.py index 0c041181..1cf14021 100644 --- a/src/openllm/models/dolly_v2/configuration_dolly_v2.py +++ b/src/openllm/models/dolly_v2/configuration_dolly_v2.py @@ -11,14 +11,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""The following includes OpenLLM configuration and excerpt from -[instruct_pipeline.py](https://huggingface.co/databricks/dolly-v2-3b/blob/main/instruct_pipeline.py)""" - from __future__ import annotations import openllm +# NOTE: The following includes OpenLLM configuration and excerpt from [instruct_pipeline.py](https://huggingface.co/databricks/dolly-v2-3b/blob/main/instruct_pipeline.py) class DollyV2Config(openllm.LLMConfig, default_timeout=3600000): """Configuration for the dolly-v2 model.""" @@ -55,7 +53,7 @@ INTRO_BLURB = ( "Below is an instruction that describes a task. Write a response that appropriately completes the request." ) -# NOTE: This is the prompt that is used for generating responses using an already trained model. 
It ends with the response +# NOTE: This is the prompt that is used for generating responses using an already trained model. It ends with the response # key, where the job of the model is to provide the completion that follows it (i.e. the response itself). DEFAULT_PROMPT_TEMPLATE = """{intro} {instruction_key} diff --git a/src/openllm/utils/dummy_pt_objects.py b/src/openllm/utils/dummy_pt_objects.py index 5822287d..2949e5fb 100644 --- a/src/openllm/utils/dummy_pt_objects.py +++ b/src/openllm/utils/dummy_pt_objects.py @@ -19,6 +19,13 @@ class DollyV2(metaclass=DummyMetaclass): require_backends(self, ["torch"]) +class ChatGLM(metaclass=DummyMetaclass): + _backends = ["torch"] + + def __init__(self, *args: t.Any, **kwargs: t.Any): + require_backends(self, ["torch"]) + + class AutoLLM(metaclass=DummyMetaclass): _backends = ["torch"]