mirror of
https://github.com/bentoml/OpenLLM.git
synced 2026-03-14 13:06:09 -04:00
feat: chatglm and configuration naming type
By default the configuration naming type is `dasherize`, but for cases like chatglm it can be `lowercase` as well. Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
This commit is contained in:
@@ -42,6 +42,7 @@ _import_structure = {
|
||||
"models.auto": ["AutoConfig", "CONFIG_MAPPING"],
|
||||
"models.flan_t5": ["FlanT5Config"],
|
||||
"models.dolly_v2": ["DollyV2Config"],
|
||||
"models.chatglm": ["ChatGLMConfig"],
|
||||
}
|
||||
|
||||
try:
|
||||
@@ -54,6 +55,7 @@ except MissingDependencyError:
|
||||
else:
|
||||
_import_structure["models.flan_t5"].extend(["FlanT5"])
|
||||
_import_structure["models.dolly_v2"].extend(["DollyV2"])
|
||||
_import_structure["models.chatglm"].extend(["ChatGLM"])
|
||||
_import_structure["models.auto"].extend(["AutoLLM", "MODEL_MAPPING_NAMES", "MODEL_MAPPING"])
|
||||
|
||||
try:
|
||||
@@ -99,6 +101,7 @@ if t.TYPE_CHECKING:
|
||||
from .cli import start_grpc as start_grpc
|
||||
from .models.auto import CONFIG_MAPPING as CONFIG_MAPPING
|
||||
from .models.auto import AutoConfig as AutoConfig
|
||||
from .models.chatglm import ChatGLMConfig as ChatGLMConfig
|
||||
from .models.dolly_v2 import DollyV2Config as DollyV2Config
|
||||
from .models.flan_t5 import FlanT5Config as FlanT5Config
|
||||
|
||||
@@ -111,6 +114,7 @@ if t.TYPE_CHECKING:
|
||||
from .models.auto import MODEL_MAPPING as MODEL_MAPPING
|
||||
from .models.auto import MODEL_MAPPING_NAMES as MODEL_MAPPING_NAMES
|
||||
from .models.auto import AutoLLM as AutoLLM
|
||||
from .models.chatglm import ChatGLM as ChatGLM
|
||||
from .models.dolly_v2 import DollyV2 as DollyV2
|
||||
from .models.flan_t5 import FlanT5 as FlanT5
|
||||
|
||||
|
||||
@@ -430,29 +430,43 @@ class LLMConfig(pydantic.BaseModel, ABC):
|
||||
__openllm_model_name__: str = ""
|
||||
__openllm_start_name__: str = ""
|
||||
__openllm_timeout__: int = 0
|
||||
__openllm_name_type__: t.Literal["dasherize", "lowercase"] = "dasherize"
|
||||
GenerationConfig: type[t.Any] = GenerationConfig
|
||||
|
||||
def __init_subclass__(cls, *, default_timeout: int | None = None, **kwargs: t.Any):
|
||||
def __init_subclass__(
|
||||
cls,
|
||||
*,
|
||||
default_timeout: int | None = None,
|
||||
name_type: t.Literal["dasherize", "lowercase"] = "dasherize",
|
||||
**kwargs: t.Any,
|
||||
):
|
||||
if default_timeout is None:
|
||||
default_timeout = 3600
|
||||
cls.__openllm_timeout__ = default_timeout
|
||||
if name_type not in ("dasherize", "lowercase"):
|
||||
raise RuntimeError(f"Unknown name_type {name_type}. Only allowed are 'dasherize' and 'lowercase'.")
|
||||
cls.__openllm_name_type__ = name_type
|
||||
|
||||
super(LLMConfig, cls).__init_subclass__(**kwargs)
|
||||
|
||||
@classmethod
|
||||
def __pydantic_init_subclass__(cls, **kwargs: t.Any):
|
||||
cls.__openllm_model_name__ = inflection.underscore(cls.__name__.replace("Config", ""))
|
||||
cls.__openllm_start_name__ = inflection.dasherize(cls.__openllm_model_name__)
|
||||
def __pydantic_init_subclass__(cls, **_: t.Any):
|
||||
if cls.__openllm_name_type__ == "dasherize":
|
||||
cls.__openllm_model_name__ = inflection.underscore(cls.__name__.replace("Config", ""))
|
||||
cls.__openllm_start_name__ = inflection.dasherize(cls.__openllm_model_name__)
|
||||
else:
|
||||
cls.__openllm_model_name__ = cls.__name__.replace("Config", "").lower()
|
||||
cls.__openllm_start_name__ = cls.__openllm_model_name__
|
||||
|
||||
if hasattr(cls, "GenerationConfig"):
|
||||
generation_class = t.cast(
|
||||
cls.generation_config = t.cast(
|
||||
"type[GenerationConfig]",
|
||||
types.new_class(
|
||||
cls.__name__.replace("Config", "") + "GenerationConfig",
|
||||
(GenerationConfig,),
|
||||
{"model_name": cls.__openllm_model_name__, "_internal": True},
|
||||
),
|
||||
)
|
||||
cls.generation_config = generation_class.construct_from_llm_config(cls)
|
||||
).construct_from_llm_config(cls)
|
||||
delattr(cls, "GenerationConfig")
|
||||
|
||||
for key, field in cls.model_fields.items():
|
||||
|
||||
@@ -70,7 +70,14 @@ class TaskType(enum.Enum, metaclass=TypeMeta):
|
||||
TEXT2TEXT_GENERATION = enum.auto()
|
||||
|
||||
|
||||
def import_model(model_name: str, tag: bentoml.Tag, __openllm_framework__: str, *model_args: t.Any, **kwds: t.Any):
|
||||
def import_model(
|
||||
model_name: str,
|
||||
tag: bentoml.Tag,
|
||||
__openllm_framework__: str,
|
||||
*model_args: t.Any,
|
||||
tokenizer_kwds: dict[str, t.Any],
|
||||
**kwds: t.Any,
|
||||
):
|
||||
"""Auto detect model type from given model_name and import it to bentoml's model store.
|
||||
|
||||
All kwargs are first passed to `transformers.AutoConfig.from_pretrained`, which returns all of the unused kwargs.
|
||||
@@ -91,10 +98,6 @@ def import_model(model_name: str, tag: bentoml.Tag, __openllm_framework__: str,
|
||||
config: transformers.PretrainedConfig = kwds.pop("config", None)
|
||||
trust_remote_code = kwds.pop("trust_remote_code", False)
|
||||
|
||||
tokenizer_kwds = {k[len("_tokenizer_") :]: v for k, v in kwds.items() if k.startswith("_tokenizer_")}
|
||||
|
||||
kwds = {k: v for k, v in kwds.items() if not k.startswith("_tokenizer_")}
|
||||
|
||||
# this logic below is synonymous to handling `from_pretrained` kwds.
|
||||
hub_kwds_names = [
|
||||
"cache_dir",
|
||||
@@ -117,6 +120,7 @@ def import_model(model_name: str, tag: bentoml.Tag, __openllm_framework__: str,
|
||||
model_name, return_unused_kwargs=True, trust_remote_code=trust_remote_code, **hub_kwds, **copied_kwds
|
||||
),
|
||||
)
|
||||
|
||||
if type(config) in transformers.MODEL_FOR_CAUSAL_LM_MAPPING:
|
||||
task_type = "text-generation"
|
||||
elif type(config) in transformers.MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING:
|
||||
@@ -186,14 +190,6 @@ class LLMInterface(ABC):
|
||||
)
|
||||
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
|
||||
class LLMRunnable(bentoml.Runnable):
|
||||
@abstractmethod
|
||||
def generate(self, prompt: str, **kwargs: t.Any) -> t.Any:
|
||||
...
|
||||
|
||||
|
||||
class LLM(LLMInterface):
|
||||
_implementation: t.Literal["pt", "tf", "flax"]
|
||||
|
||||
@@ -204,7 +200,14 @@ class LLM(LLMInterface):
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
|
||||
def import_model(self, pretrained: str, tag: bentoml.Tag, *model_args: t.Any, **kwds: t.Any) -> bentoml.Model:
|
||||
def import_model(
|
||||
self,
|
||||
pretrained: str,
|
||||
tag: bentoml.Tag,
|
||||
*args: t.Any,
|
||||
tokenizer_kwds: dict[str, t.Any],
|
||||
**kwds: t.Any,
|
||||
) -> bentoml.Model:
|
||||
...
|
||||
|
||||
def __init_subclass__(cls, *, implementation: t.Literal["pt", "tf", "flax"] = "pt", _internal: bool = False):
|
||||
@@ -216,7 +219,7 @@ class LLM(LLMInterface):
|
||||
if implementation == "tf":
|
||||
cls.config_class = getattr(openllm, f"{cls.__name__[2:]}Config")
|
||||
elif implementation == "flax":
|
||||
cls.config_class = getattr(openllm, f"{cls.__name__[len('flax'):]}Config")
|
||||
cls.config_class = getattr(openllm, f"{cls.__name__[4:]}Config")
|
||||
else:
|
||||
cls.config_class = getattr(openllm, f"{cls.__name__}Config")
|
||||
else:
|
||||
@@ -245,7 +248,11 @@ class LLM(LLMInterface):
|
||||
return {"configuration": self.config.model_dump(), "variants": self.variants}
|
||||
|
||||
def __init__(
|
||||
self, pretrained: str | None = None, llm_config: openllm.LLMConfig | None = None, *args: t.Any, **kwargs: t.Any
|
||||
self,
|
||||
pretrained: str | None = None,
|
||||
llm_config: openllm.LLMConfig | None = None,
|
||||
*args: t.Any,
|
||||
**kwargs: t.Any,
|
||||
):
|
||||
"""Initialize the LLM with given pretrained model.
|
||||
|
||||
@@ -261,16 +268,24 @@ class LLM(LLMInterface):
|
||||
If you need to overwrite the default ``import_model``, implement the following in your subclass:
|
||||
|
||||
```python
|
||||
def import_model(self, pretrained: str, tag: bentoml.Tag, *args: t.Any, **kwargs: t.Any):
|
||||
tokenizer_kwargs = {k[len('_tokenizer_'):]: v for k, v in kwargs.items() if k.startswith('_tokenizer_')]}
|
||||
kwargs = {k: v for k, v in kwargs.items() if not k.startswith('_tokenizer_')}
|
||||
def import_model(
|
||||
self,
|
||||
pretrained: str,
|
||||
tag: bentoml.Tag,
|
||||
*args: t.Any,
|
||||
tokenizer_kwds: dict[str, t.Any],
|
||||
**kwargs: t.Any,
|
||||
):
|
||||
return bentoml.transformers.save_model(
|
||||
str(tag),
|
||||
transformers.AutoModelForCausalLM.from_pretrained(
|
||||
pretrained, device_map="auto", torch_dtype=torch.bfloat16, **kwargs
|
||||
),
|
||||
custom_objects={"tokenizer": transformers.AutoTokenizer.from_pretrained(pretrained, padding_size="left",
|
||||
**tokenizer_kwargs)},
|
||||
custom_objects={
|
||||
"tokenizer": transformers.AutoTokenizer.from_pretrained(
|
||||
pretrained, padding_side="left", **tokenizer_kwds
|
||||
)
|
||||
},
|
||||
)
|
||||
```
|
||||
|
||||
@@ -295,7 +310,7 @@ class LLM(LLMInterface):
|
||||
"""
|
||||
|
||||
if llm_config is not None:
|
||||
logger.debug("Using given 'llm_config=%s' to initialize LLM", llm_config)
|
||||
logger.debug("Using given 'llm_config=(%s)' to initialize LLM", llm_config)
|
||||
self.config = llm_config
|
||||
else:
|
||||
self.config = self.config_class(**kwargs)
|
||||
@@ -319,7 +334,11 @@ class LLM(LLMInterface):
|
||||
@property
|
||||
def _bentomodel(self) -> bentoml.Model:
|
||||
if self.__bentomodel__ is None:
|
||||
tag, kwargs = openllm.utils.generate_tags(self._pretrained, prefix=self._implementation, **self._kwargs)
|
||||
tag, kwds = openllm.utils.generate_tags(self._pretrained, prefix=self._implementation, **self._kwargs)
|
||||
|
||||
tokenizer_kwds = {k[len("_tokenizer_") :]: v for k, v in kwds.items() if k.startswith("_tokenizer_")}
|
||||
kwds = {k: v for k, v in kwds.items() if not k.startswith("_tokenizer_")}
|
||||
|
||||
try:
|
||||
self.__bentomodel__ = bentoml.transformers.get(tag)
|
||||
except bentoml.exceptions.BentoMLException:
|
||||
@@ -328,13 +347,20 @@ class LLM(LLMInterface):
|
||||
)
|
||||
if hasattr(self, "import_model"):
|
||||
logger.debug("Using custom 'import_model' defined in subclass.")
|
||||
self.__bentomodel__ = self.import_model(self._pretrained, tag, *self._args, **kwargs)
|
||||
self.__bentomodel__ = self.import_model(
|
||||
self._pretrained, tag, *self._args, tokenizer_kwds=tokenizer_kwds, **kwds
|
||||
)
|
||||
else:
|
||||
if self.import_kwargs:
|
||||
kwargs = {**self.import_kwargs, **kwargs}
|
||||
kwds = {**self.import_kwargs, **kwds}
|
||||
# NOTE: In this branch, we just use the default implementation.
|
||||
self.__bentomodel__ = import_model(
|
||||
self._pretrained, tag, __openllm_framework__=self._implementation, *self._args, **kwargs
|
||||
self._pretrained,
|
||||
tag,
|
||||
*self._args,
|
||||
tokenizer_kwds=tokenizer_kwds,
|
||||
__openllm_framework__=self._implementation,
|
||||
**kwds,
|
||||
)
|
||||
return self.__bentomodel__
|
||||
|
||||
@@ -360,7 +386,8 @@ class LLM(LLMInterface):
|
||||
# This could happen if users implement their own import_model
|
||||
raise openllm.exceptions.OpenLLMException(
|
||||
"Model does not have tokenizer. Make sure to save \
|
||||
the tokenizer within the model via 'custom_objects'."
|
||||
the tokenizer within the model via 'custom_objects'.\
|
||||
For example: bentoml.transformers.save_model(..., custom_objects={'tokenizer': tokenizer}))"
|
||||
)
|
||||
return self.__llm_tokenizer__
|
||||
|
||||
@@ -406,7 +433,9 @@ class LLM(LLMInterface):
|
||||
method_configs = {"generate": generate_sig, "generate_iterator": generate_iterator_sig}
|
||||
else:
|
||||
generate_sig = ModelSignature.convert_signatures_dict(method_configs).get("generate", generate_sig)
|
||||
ModelSignature.convert_signatures_dict(method_configs).get("generate_iterator", generate_iterator_sig)
|
||||
generate_iterator_sig = ModelSignature.convert_signatures_dict(method_configs).get(
|
||||
"generate_iterator", generate_iterator_sig
|
||||
)
|
||||
|
||||
class _Runnable(bentoml.Runnable):
|
||||
SUPPORTED_RESOURCES = ("nvidia.com/gpu", "cpu")
|
||||
@@ -418,7 +447,7 @@ class LLM(LLMInterface):
|
||||
input_spec=generate_sig.input_spec,
|
||||
output_spec=generate_sig.output_spec,
|
||||
)
|
||||
def generate(__self, prompt: str, **kwds: t.Any) -> list[str]:
|
||||
def generate(__self, prompt: str, **kwds: t.Any) -> list[t.Any]:
|
||||
return self.generate(prompt, **kwds)
|
||||
|
||||
@bentoml.Runnable.method(
|
||||
@@ -427,14 +456,11 @@ class LLM(LLMInterface):
|
||||
input_spec=generate_iterator_sig.input_spec,
|
||||
output_spec=generate_iterator_sig.output_spec,
|
||||
)
|
||||
def generate_iterator(__self, prompt: str, **kwds: t.Any) -> t.Iterator[str]:
|
||||
return self.generate_iterator(prompt, **kwds)
|
||||
def generate_iterator(__self, prompt: str, **kwds: t.Any) -> t.Iterator[t.Any]:
    """Stream the items produced by the wrapped LLM's ``generate_iterator``.

    Args:
        prompt: The prompt forwarded unchanged to the wrapped LLM.
        **kwds: Extra keyword arguments forwarded unchanged.

    Yields:
        Each item produced by ``self.generate_iterator(prompt, **kwds)``, in order.
    """
    # ``self`` is the enclosing LLM instance captured by closure; ``__self`` is the Runnable.
    # ``yield from`` re-yields every item. A bare ``yield`` would hand the consumer a single
    # element that is the inner iterator object itself rather than its contents.
    yield from self.generate_iterator(prompt, **kwds)
|
||||
|
||||
return bentoml.Runner(
|
||||
t.cast(
|
||||
"type[LLMRunnable]",
|
||||
types.new_class(inflection.camelize(self.config.__openllm_model_name__) + "Runnable", (_Runnable,)),
|
||||
),
|
||||
types.new_class(inflection.camelize(self.config.__openllm_model_name__) + "Runnable", (_Runnable,)),
|
||||
runnable_init_params=kwargs,
|
||||
name=name,
|
||||
models=models,
|
||||
|
||||
@@ -13,4 +13,6 @@
|
||||
# limitations under the License.
|
||||
|
||||
from . import auto as auto
|
||||
from . import chatglm as chatglm
|
||||
from . import dolly_v2 as dolly_v2
|
||||
from . import flan_t5 as flan_t5
|
||||
|
||||
@@ -28,7 +28,13 @@ else:
|
||||
ConfigOrderedDict = OrderedDict
|
||||
|
||||
# NOTE: This is the entrypoint when adding new model config
|
||||
CONFIG_MAPPING_NAMES = OrderedDict([("flan_t5", "FlanT5Config"), ("dolly_v2", "DollyV2Config")])
|
||||
CONFIG_MAPPING_NAMES = OrderedDict(
|
||||
[
|
||||
("flan_t5", "FlanT5Config"),
|
||||
("dolly_v2", "DollyV2Config"),
|
||||
("chatglm", "ChatGLMConfig"),
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
class _LazyConfigMapping(ConfigOrderedDict):
|
||||
|
||||
@@ -19,7 +19,7 @@ from collections import OrderedDict
|
||||
from .configuration_auto import CONFIG_MAPPING_NAMES
|
||||
from .factory import _BaseAutoLLMClass, _LazyAutoMapping
|
||||
|
||||
MODEL_MAPPING_NAMES = OrderedDict([("flan_t5", "FlanT5"), ("dolly_v2", "DollyV2")])
|
||||
MODEL_MAPPING_NAMES = OrderedDict([("flan_t5", "FlanT5"), ("dolly_v2", "DollyV2"), ("chatglm", "ChatGLM")])
|
||||
|
||||
MODEL_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_MAPPING_NAMES)
|
||||
|
||||
|
||||
@@ -14,4 +14,39 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
raise NotImplementedError("This module is not implemented yet.")
|
||||
import typing as t
|
||||
|
||||
import openllm
|
||||
|
||||
_import_structure = {
|
||||
"configuration_chatglm": ["ChatGLMConfig", "START_CHATGLM_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"],
|
||||
}
|
||||
|
||||
try:
|
||||
if not openllm.utils.is_torch_available():
|
||||
raise openllm.exceptions.MissingDependencyError
|
||||
except openllm.exceptions.MissingDependencyError:
|
||||
pass
|
||||
else:
|
||||
_import_structure["modeling_chatglm"] = ["ChatGLM"]
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
from .configuration_chatglm import \
|
||||
DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE
|
||||
from .configuration_chatglm import \
|
||||
START_CHATGLM_COMMAND_DOCSTRING as START_CHATGLM_COMMAND_DOCSTRING
|
||||
from .configuration_chatglm import ChatGLMConfig as ChatGLMConfig
|
||||
|
||||
try:
|
||||
if not openllm.utils.is_torch_available():
|
||||
raise openllm.exceptions.MissingDependencyError
|
||||
except openllm.exceptions.MissingDependencyError:
|
||||
pass
|
||||
else:
|
||||
from .modeling_chatglm import ChatGLM as ChatGLM
|
||||
else:
|
||||
import sys
|
||||
|
||||
sys.modules[__name__] = openllm.utils.LazyModule(
|
||||
__name__, globals()["__file__"], _import_structure, module_spec=__spec__
|
||||
)
|
||||
|
||||
@@ -0,0 +1,49 @@
|
||||
# Copyright 2023 BentoML Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from __future__ import annotations
|
||||
|
||||
import openllm
|
||||
|
||||
|
||||
class ChatGLMConfig(openllm.LLMConfig, name_type="lowercase"):
    """Configuration for the ChatGLM model.

    ``name_type="lowercase"`` is forwarded to the base class so that the derived
    model name and start name are the lowercased class name without the
    ``Config`` suffix, instead of the default dasherized form.
    """

    retain_history: bool = True
    """Whether to retain history given to the model. If set to True, then the model will retain given history."""

    use_half_precision: bool = True
    """Whether to use half precision for model."""

    class GenerationConfig:
        # Default generation options declared for ChatGLM.
        max_length: int = 2048
        num_beams: int = 1
        top_p: float = 0.7
        temperature: float = 0.95
|
||||
|
||||
|
||||
# NOTE: The package's ``_import_structure`` exports ``DEFAULT_PROMPT_TEMPLATE`` from this module,
# so it must be defined here or the lazy import fails. The template passes the instruction
# through unchanged. NOTE(review): chosen because ``ChatGLM.generate`` builds the round-formatted
# prompt itself -- confirm no caller expects additional placeholders.
DEFAULT_PROMPT_TEMPLATE = """{instruction}"""

# Help text for the ChatGLM start command. ``\b`` on its own line is kept verbatim from the
# original text (presumably a marker telling the CLI help formatter not to rewrap the
# following paragraph -- confirm against the CLI library in use).
START_CHATGLM_COMMAND_DOCSTRING = """\
Run a LLMServer for ChatGLM model and variants.

\b
> See more information about ChatGLM at [THUDM/ChatGLM-6b](https://huggingface.co/thudm/chatglm-6b)

\b
## Usage

Currently, ChatGLM only supports PyTorch. Make sure ``torch`` is available in your system.

\b
ChatGLM Runner will use THUDM/ChatGLM-6b as the default model. To change to any other ChatGLM
saved pretrained, or a fine-tuned ChatGLM, provide ``OPENLLM_CHATGLM_PRETRAINED='thudm/chatglm-6b-int8'``
"""
|
||||
|
||||
104
src/openllm/models/chatglm/modeling_chatglm.py
Normal file
104
src/openllm/models/chatglm/modeling_chatglm.py
Normal file
@@ -0,0 +1,104 @@
|
||||
# Copyright 2023 BentoML Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from __future__ import annotations
|
||||
|
||||
import typing as t
|
||||
|
||||
import bentoml
|
||||
import transformers
|
||||
from transformers.generation.logits_process import LogitsProcessor
|
||||
from transformers.generation.utils import LogitsProcessorList
|
||||
|
||||
import openllm
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
import torch
|
||||
else:
|
||||
torch = openllm.utils.LazyLoader("torch", globals(), "torch")
|
||||
|
||||
|
||||
class InvalidScoreLogitsProcessor(LogitsProcessor):
    """Logits processor that overwrites a score tensor containing NaN or Inf.

    Ported from modeling_chatglm.py
    """

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        """Return ``scores``, reset in place if any entry is NaN or infinite.

        Args:
            input_ids: Unused here; accepted to satisfy the processor call signature.
            scores: Score tensor to validate. It is modified in place when invalid.

        Returns:
            The same ``scores`` tensor object that was passed in.
        """
        corrupted = torch.isnan(scores).any() or torch.isinf(scores).any()
        if not corrupted:
            return scores

        # Wipe every score, then put all the weight on index 5 of the last axis.
        # NOTE(review): index 5 is presumably a safe fallback token id in the ChatGLM
        # vocabulary -- confirm against the upstream modeling_chatglm.py.
        scores.zero_()
        scores[..., 5] = 5e4
        return scores
|
||||
|
||||
|
||||
class ChatGLM(openllm.LLM, _internal=True):
    """OpenLLM integration for the THUDM ChatGLM checkpoints.

    The model and tokenizer are loaded with ``trust_remote_code=True`` and the
    conversation is kept on the instance as a list of ``(query, response)`` pairs.
    """

    default_model = "THUDM/chatglm-6b"

    variants = ["THUDM/chatglm-6b", "THUDM/chatglm-6b-int8", "THUDM/chatglm-6b-int4"]

    def model_post_init(self, _: t.Any):
        """Initialise the per-instance conversation history as an empty list."""
        # (query, response) pairs, oldest first.
        # NOTE(review): the list lives on the instance, so every caller of this
        # instance shares one conversation -- confirm that is intended.
        self.history: list[tuple[str, str]] = []

    def import_model(
        self, pretrained: str, tag: bentoml.Tag, *model_args: t.Any, tokenizer_kwds: dict[str, t.Any], **kwds: t.Any
    ) -> bentoml.Model:
        """Load the ChatGLM model and tokenizer and save them under ``tag``.

        Args:
            pretrained: Model identifier or path given to ``from_pretrained``.
            tag: Tag to save the model under; converted with ``str(tag)``.
            *model_args: Accepted for signature compatibility; not used here.
            tokenizer_kwds: Keyword arguments forwarded to the tokenizer's ``from_pretrained``.
            **kwds: Keyword arguments forwarded to the model's ``from_pretrained``.

        Returns:
            The model returned by ``bentoml.transformers.save_model``, with the
            tokenizer stored in ``custom_objects`` under the ``"tokenizer"`` key.
        """
        model = transformers.AutoModel.from_pretrained(pretrained, trust_remote_code=True, **kwds)
        tokenizer = transformers.AutoTokenizer.from_pretrained(pretrained, trust_remote_code=True, **tokenizer_kwds)
        return bentoml.transformers.save_model(str(tag), model, custom_objects={"tokenizer": tokenizer})

    @torch.inference_mode()
    def generate(
        self,
        prompt: str,
        max_length: int | None = None,
        num_beams: int | None = None,
        top_p: float | None = None,
        temperature: float | None = None,
        **kwargs: t.Any,
    ) -> t.Any:
        """Generate a reply to ``prompt`` and return the conversation.

        Args:
            prompt: The new user query.
            max_length: Forwarded to ``self.config.with_options``.
            num_beams: Forwarded to ``self.config.with_options``.
            top_p: Forwarded to ``self.config.with_options``.
            temperature: Forwarded to ``self.config.with_options``.
            **kwargs: Additional options forwarded to ``self.config.with_options``.

        Returns:
            A list of ``(query, response)`` pairs whose last element is the exchange
            just generated. When ``config.retain_history`` is true this is
            ``self.history`` itself, after appending. Otherwise it is a new list and
            ``self.history`` is left untouched.
        """
        if torch.cuda.is_available():
            self.model = self.model.cuda()
        if self.config.use_half_precision:
            self.model = self.model.half()
        self.model.eval()

        logit_processor = LogitsProcessorList()
        logit_processor.append(InvalidScoreLogitsProcessor())

        # One "[Round i]" section per past exchange, then an open-ended section
        # for the new query that the model is expected to complete.
        rounds = [
            f"[Round {i}]\n问:{old_query}\n答:{old_response}\n"
            for i, (old_query, old_response) in enumerate(self.history)
        ]
        rounds.append(f"[Round {len(self.history)}]\n问:{prompt}\n答:")
        prompt_text = "".join(rounds)

        inputs = self.tokenizer([prompt_text], return_tensors="pt").to(self.model.device)
        # NOTE(review): options left as None are forwarded as-is; presumably
        # ``with_options`` treats None as "keep the configured value" -- confirm.
        outputs = self.model.generate(
            **inputs,
            generation_config=self.config.with_options(
                max_length=max_length,
                num_beams=num_beams,
                top_p=top_p,
                temperature=temperature,
                do_sample=True,
                **kwargs,
            ).to_generation_config(),
            logits_processor=logit_processor,
        )
        # Drop the prompt tokens so only the newly generated tokens are decoded.
        outputs = outputs.tolist()[0][len(inputs["input_ids"][0]) :]
        response = self.tokenizer.decode(outputs)
        response = self.model.process_response(response)

        exchange = (prompt, response)
        if self.config.retain_history:
            self.history.append(exchange)
            return self.history
        # Without retention the history is never updated, so returning it alone
        # would drop the response that was just generated.
        return [*self.history, exchange]
|
||||
@@ -11,14 +11,12 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""The following includes OpenLLM configuration and excerpt from
|
||||
[instruct_pipeline.py](https://huggingface.co/databricks/dolly-v2-3b/blob/main/instruct_pipeline.py)"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import openllm
|
||||
|
||||
|
||||
# NOTE: The following includes OpenLLM configuration and excerpt from [instruct_pipeline.py](https://huggingface.co/databricks/dolly-v2-3b/blob/main/instruct_pipeline.py)
|
||||
class DollyV2Config(openllm.LLMConfig, default_timeout=3600000):
|
||||
"""Configuration for the dolly-v2 model."""
|
||||
|
||||
@@ -55,7 +53,7 @@ INTRO_BLURB = (
|
||||
"Below is an instruction that describes a task. Write a response that appropriately completes the request."
|
||||
)
|
||||
|
||||
# This is the prompt that is used for generating responses using an already trained model. It ends with the response
|
||||
# NOTE: This is the prompt that is used for generating responses using an already trained model. It ends with the response
|
||||
# key, where the job of the model is to provide the completion that follows it (i.e. the response itself).
|
||||
DEFAULT_PROMPT_TEMPLATE = """{intro}
|
||||
{instruction_key}
|
||||
|
||||
@@ -19,6 +19,13 @@ class DollyV2(metaclass=DummyMetaclass):
|
||||
require_backends(self, ["torch"])
|
||||
|
||||
|
||||
class ChatGLM(metaclass=DummyMetaclass):
    """Stand-in for ``ChatGLM`` that declares ``torch`` as its required backend."""

    _backends = ["torch"]

    def __init__(self, *args: t.Any, **kwargs: t.Any):
        # All arguments are ignored; construction only runs the backend requirement check.
        require_backends(self, ["torch"])
|
||||
|
||||
|
||||
class AutoLLM(metaclass=DummyMetaclass):
|
||||
_backends = ["torch"]
|
||||
|
||||
|
||||
Reference in New Issue
Block a user