mirror of
https://github.com/bentoml/OpenLLM.git
synced 2026-06-12 02:20:32 -04:00
style: google
Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
This commit is contained in:
@@ -43,7 +43,8 @@ except openllm.exceptions.MissingDependencyError:
|
||||
pass
|
||||
else:
|
||||
_import_structure['modeling_flax_auto'].extend(['AutoFlaxLLM', 'MODEL_FLAX_MAPPING'])
|
||||
if t.TYPE_CHECKING: from .modeling_flax_auto import MODEL_FLAX_MAPPING as MODEL_FLAX_MAPPING, AutoFlaxLLM as AutoFlaxLLM
|
||||
if t.TYPE_CHECKING:
|
||||
from .modeling_flax_auto import MODEL_FLAX_MAPPING as MODEL_FLAX_MAPPING, AutoFlaxLLM as AutoFlaxLLM
|
||||
try:
|
||||
if not is_tf_available(): raise openllm.exceptions.MissingDependencyError
|
||||
except openllm.exceptions.MissingDependencyError:
|
||||
|
||||
@@ -30,10 +30,18 @@ class BaseAutoLLMClass:
|
||||
_model_mapping: t.ClassVar[_LazyAutoMapping]
|
||||
|
||||
def __init__(self, *args: t.Any, **attrs: t.Any):
|
||||
raise EnvironmentError(f"Cannot instantiate {self.__class__.__name__} directly. Please use '{self.__class__.__name__}.Runner(model_name)' instead.")
|
||||
raise EnvironmentError(
|
||||
f"Cannot instantiate {self.__class__.__name__} directly. Please use '{self.__class__.__name__}.Runner(model_name)' instead."
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def for_model(cls, model: str, /, model_id: str | None = None, model_version: str | None = None, llm_config: openllm.LLMConfig | None = None, ensure_available: bool = False,
|
||||
def for_model(cls,
|
||||
model: str,
|
||||
/,
|
||||
model_id: str | None = None,
|
||||
model_version: str | None = None,
|
||||
llm_config: openllm.LLMConfig | None = None,
|
||||
ensure_available: bool = False,
|
||||
**attrs: t.Any) -> openllm.LLM[t.Any, t.Any]:
|
||||
'''The lower level API for creating a LLM instance.
|
||||
|
||||
@@ -42,7 +50,10 @@ class BaseAutoLLMClass:
|
||||
>>> llm = openllm.AutoLLM.for_model("flan-t5")
|
||||
```
|
||||
'''
|
||||
llm = cls.infer_class_from_name(model).from_pretrained(model_id=model_id, model_version=model_version, llm_config=llm_config, **attrs)
|
||||
llm = cls.infer_class_from_name(model).from_pretrained(model_id=model_id,
|
||||
model_version=model_version,
|
||||
llm_config=llm_config,
|
||||
**attrs)
|
||||
if ensure_available: llm.ensure_model_id_exists()
|
||||
return llm
|
||||
|
||||
@@ -105,7 +116,9 @@ class _LazyAutoMapping(OrderedDict, ReprMixin):
|
||||
This OrderedDict values() and keys() returns the list instead, so you don't
|
||||
have to do list(mapping.values()) to get the list of values.
|
||||
"""
|
||||
def __init__(self, config_mapping: OrderedDict[LiteralString, LiteralString], model_mapping: OrderedDict[LiteralString, LiteralString]):
|
||||
|
||||
def __init__(self, config_mapping: OrderedDict[LiteralString, LiteralString],
|
||||
model_mapping: OrderedDict[LiteralString, LiteralString]):
|
||||
self._config_mapping = config_mapping
|
||||
self._reverse_config_mapping = {v: k for k, v in config_mapping.items()}
|
||||
self._model_mapping = model_mapping
|
||||
@@ -115,7 +128,8 @@ class _LazyAutoMapping(OrderedDict, ReprMixin):
|
||||
def __getitem__(self, key: type[openllm.LLMConfig]) -> type[openllm.LLM[t.Any, t.Any]]:
|
||||
if key in self._extra_content: return self._extra_content[key]
|
||||
model_type = self._reverse_config_mapping[key.__name__]
|
||||
if model_type in self._model_mapping: return self._load_attr_from_module(model_type, self._model_mapping[model_type])
|
||||
if model_type in self._model_mapping:
|
||||
return self._load_attr_from_module(model_type, self._model_mapping[model_type])
|
||||
# Maybe there was several model types associated with this config.
|
||||
model_types = [k for k, v in self._config_mapping.items() if v == key.__name__]
|
||||
for mtype in model_types:
|
||||
@@ -124,7 +138,8 @@ class _LazyAutoMapping(OrderedDict, ReprMixin):
|
||||
|
||||
def _load_attr_from_module(self, model_type: str, attr: str) -> t.Any:
|
||||
module_name = inflection.underscore(model_type)
|
||||
if module_name not in self._modules: self._modules[module_name] = importlib.import_module(f'.{module_name}', 'openllm.models')
|
||||
if module_name not in self._modules:
|
||||
self._modules[module_name] = importlib.import_module(f'.{module_name}', 'openllm.models')
|
||||
return getattribute_from_module(self._modules[module_name], attr)
|
||||
|
||||
def __len__(self) -> int:
|
||||
@@ -138,29 +153,32 @@ class _LazyAutoMapping(OrderedDict, ReprMixin):
|
||||
return ReprMixin.__repr__(self)
|
||||
|
||||
def __repr_args__(self) -> t.Generator[tuple[str, tuple[str, str]], t.Any, t.Any]:
|
||||
yield from ((key, (value, self._model_mapping[key])) for key, value in self._config_mapping.items() if key in self._model_mapping)
|
||||
yield from ((key, (value, self._model_mapping[key]))
|
||||
for key, value in self._config_mapping.items()
|
||||
if key in self._model_mapping)
|
||||
|
||||
def __bool__(self) -> bool:
|
||||
return bool(self.keys())
|
||||
|
||||
def keys(self) -> ConfigModelKeysView:
|
||||
return t.cast(
|
||||
'ConfigModelKeysView', [self._load_attr_from_module(key, name) for key, name in self._config_mapping.items() if key in self._model_mapping.keys()] + list(self._extra_content.keys())
|
||||
)
|
||||
return t.cast('ConfigModelKeysView', [
|
||||
self._load_attr_from_module(key, name)
|
||||
for key, name in self._config_mapping.items()
|
||||
if key in self._model_mapping.keys()
|
||||
] + list(self._extra_content.keys()))
|
||||
|
||||
def values(self) -> ConfigModelValuesView:
|
||||
return t.cast(
|
||||
'ConfigModelValuesView', [self._load_attr_from_module(key, name) for key, name in self._model_mapping.items() if key in self._config_mapping.keys()] + list(
|
||||
self._extra_content.values()
|
||||
)
|
||||
)
|
||||
return t.cast('ConfigModelValuesView', [
|
||||
self._load_attr_from_module(key, name)
|
||||
for key, name in self._model_mapping.items()
|
||||
if key in self._config_mapping.keys()
|
||||
] + list(self._extra_content.values()))
|
||||
|
||||
def items(self) -> ConfigModelItemsView:
|
||||
return t.cast(
|
||||
'ConfigModelItemsView',
|
||||
[(self._load_attr_from_module(key, self._config_mapping[key]),
|
||||
self._load_attr_from_module(key, self._model_mapping[key])) for key in self._model_mapping.keys() if key in self._config_mapping.keys()] + list(self._extra_content.items())
|
||||
)
|
||||
return t.cast('ConfigModelItemsView', [(self._load_attr_from_module(
|
||||
key, self._config_mapping[key]), self._load_attr_from_module(key, self._model_mapping[key]))
|
||||
for key in self._model_mapping.keys()
|
||||
if key in self._config_mapping.keys()] + list(self._extra_content.items()))
|
||||
|
||||
def __iter__(self) -> t.Iterator[type[openllm.LLMConfig]]:
|
||||
return iter(t.cast('SupportsIter[t.Iterator[type[openllm.LLMConfig]]]', self.keys()))
|
||||
@@ -172,7 +190,8 @@ class _LazyAutoMapping(OrderedDict, ReprMixin):
|
||||
|
||||
def register(self, key: t.Any, value: t.Any) -> None:
|
||||
if hasattr(key, '__name__') and key.__name__ in self._reverse_config_mapping:
|
||||
if self._reverse_config_mapping[key.__name__] in self._model_mapping.keys(): raise ValueError(f"'{key}' is already used by a OpenLLM model.")
|
||||
if self._reverse_config_mapping[key.__name__] in self._model_mapping.keys():
|
||||
raise ValueError(f"'{key}' is already used by a OpenLLM model.")
|
||||
self._extra_content[key] = value
|
||||
|
||||
__all__ = ['BaseAutoLLMClass', '_LazyAutoMapping']
|
||||
|
||||
@@ -7,9 +7,10 @@ from openllm_core.config import CONFIG_MAPPING_NAMES
|
||||
from .factory import BaseAutoLLMClass
|
||||
from .factory import _LazyAutoMapping
|
||||
|
||||
MODEL_MAPPING_NAMES = OrderedDict([('chatglm', 'ChatGLM'), ('dolly_v2', 'DollyV2'), ('falcon', 'Falcon'), ('flan_t5', 'FlanT5'), ('gpt_neox', 'GPTNeoX'), ('llama', 'Llama'), ('mpt', 'MPT'), (
|
||||
'opt', 'OPT'
|
||||
), ('stablelm', 'StableLM'), ('starcoder', 'StarCoder'), ('baichuan', 'Baichuan')])
|
||||
MODEL_MAPPING_NAMES = OrderedDict([('chatglm', 'ChatGLM'), ('dolly_v2', 'DollyV2'), ('falcon', 'Falcon'),
|
||||
('flan_t5', 'FlanT5'), ('gpt_neox', 'GPTNeoX'), ('llama', 'Llama'), ('mpt', 'MPT'),
|
||||
('opt', 'OPT'), ('stablelm', 'StableLM'), ('starcoder', 'StarCoder'),
|
||||
('baichuan', 'Baichuan')])
|
||||
MODEL_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_MAPPING_NAMES)
|
||||
|
||||
class AutoLLM(BaseAutoLLMClass):
|
||||
|
||||
@@ -7,9 +7,10 @@ from openllm_core.config import CONFIG_MAPPING_NAMES
|
||||
from .factory import BaseAutoLLMClass
|
||||
from .factory import _LazyAutoMapping
|
||||
|
||||
MODEL_VLLM_MAPPING_NAMES = OrderedDict([('baichuan', 'VLLMBaichuan'), ('dolly_v2', 'VLLMDollyV2'), ('falcon', 'VLLMFalcon'), ('gpt_neox', 'VLLMGPTNeoX'), ('mpt', 'VLLMMPT'), (
|
||||
'opt', 'VLLMOPT'
|
||||
), ('stablelm', 'VLLMStableLM'), ('starcoder', 'VLLMStarCoder'), ('llama', 'VLLMLlama')])
|
||||
MODEL_VLLM_MAPPING_NAMES = OrderedDict([('baichuan', 'VLLMBaichuan'), ('dolly_v2', 'VLLMDollyV2'),
|
||||
('falcon', 'VLLMFalcon'), ('gpt_neox', 'VLLMGPTNeoX'), ('mpt', 'VLLMMPT'),
|
||||
('opt', 'VLLMOPT'), ('stablelm', 'VLLMStableLM'),
|
||||
('starcoder', 'VLLMStarCoder'), ('llama', 'VLLMLlama')])
|
||||
MODEL_VLLM_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_VLLM_MAPPING_NAMES)
|
||||
|
||||
class AutoVLLM(BaseAutoLLMClass):
|
||||
|
||||
@@ -11,5 +11,6 @@ class Baichuan(openllm.LLM['transformers.PreTrainedModel', 'transformers.PreTrai
|
||||
import torch
|
||||
inputs = self.tokenizer(prompt, return_tensors='pt').to(self.device)
|
||||
with torch.inference_mode(), torch.autocast('cuda', dtype=torch.float16): # type: ignore[attr-defined]
|
||||
outputs = self.model.generate(**inputs, generation_config=self.config.model_construct_env(**attrs).to_generation_config())
|
||||
outputs = self.model.generate(**inputs,
|
||||
generation_config=self.config.model_construct_env(**attrs).to_generation_config())
|
||||
return self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
|
||||
|
||||
@@ -14,7 +14,9 @@ class ChatGLM(openllm.LLM['transformers.PreTrainedModel', 'transformers.PreTrain
|
||||
self.model.eval()
|
||||
# Only use half precision if the model is not yet quantized
|
||||
if self.config.use_half_precision: self.model.half()
|
||||
return self.model.chat(self.tokenizer, prompt, generation_config=self.config.model_construct_env(**attrs).to_generation_config())
|
||||
return self.model.chat(self.tokenizer,
|
||||
prompt,
|
||||
generation_config=self.config.model_construct_env(**attrs).to_generation_config())
|
||||
|
||||
def embeddings(self, prompts: list[str]) -> openllm.LLMEmbeddings:
|
||||
import torch
|
||||
|
||||
@@ -10,29 +10,57 @@ from openllm_core.config.configuration_dolly_v2 import END_KEY
|
||||
from openllm_core.config.configuration_dolly_v2 import RESPONSE_KEY
|
||||
from openllm_core.config.configuration_dolly_v2 import get_special_token_id
|
||||
if t.TYPE_CHECKING: import torch, transformers, tensorflow as tf
|
||||
else: torch, transformers, tf = openllm.utils.LazyLoader('torch', globals(), 'torch'), openllm.utils.LazyLoader('transformers', globals(), 'transformers'), openllm.utils.LazyLoader('tf', globals(), 'tensorflow')
|
||||
else:
|
||||
torch, transformers, tf = openllm.utils.LazyLoader('torch', globals(), 'torch'), openllm.utils.LazyLoader(
|
||||
'transformers', globals(), 'transformers'), openllm.utils.LazyLoader('tf', globals(), 'tensorflow')
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@overload
|
||||
def get_pipeline(model: transformers.PreTrainedModel, tokenizer: transformers.PreTrainedTokenizer, _init: t.Literal[True] = True, **attrs: t.Any) -> transformers.Pipeline:
|
||||
def get_pipeline(model: transformers.PreTrainedModel,
|
||||
tokenizer: transformers.PreTrainedTokenizer,
|
||||
_init: t.Literal[True] = True,
|
||||
**attrs: t.Any) -> transformers.Pipeline:
|
||||
...
|
||||
|
||||
@overload
|
||||
def get_pipeline(model: transformers.PreTrainedModel, tokenizer: transformers.PreTrainedTokenizer, _init: t.Literal[False] = ..., **attrs: t.Any) -> type[transformers.Pipeline]:
|
||||
def get_pipeline(model: transformers.PreTrainedModel,
|
||||
tokenizer: transformers.PreTrainedTokenizer,
|
||||
_init: t.Literal[False] = ...,
|
||||
**attrs: t.Any) -> type[transformers.Pipeline]:
|
||||
...
|
||||
|
||||
def get_pipeline(model: transformers.PreTrainedModel, tokenizer: transformers.PreTrainedTokenizer, _init: bool = False, **attrs: t.Any) -> type[transformers.Pipeline] | transformers.Pipeline:
|
||||
def get_pipeline(model: transformers.PreTrainedModel,
|
||||
tokenizer: transformers.PreTrainedTokenizer,
|
||||
_init: bool = False,
|
||||
**attrs: t.Any) -> type[transformers.Pipeline] | transformers.Pipeline:
|
||||
# Lazy loading the pipeline. See databricks' implementation on HuggingFace for more information.
|
||||
class InstructionTextGenerationPipeline(transformers.Pipeline):
|
||||
def __init__(self, *args: t.Any, do_sample: bool = True, max_new_tokens: int = 256, top_p: float = 0.92, top_k: int = 0, **kwargs: t.Any):
|
||||
super().__init__(*args, model=model, tokenizer=tokenizer, do_sample=do_sample, max_new_tokens=max_new_tokens, top_p=top_p, top_k=top_k, **kwargs)
|
||||
|
||||
def _sanitize_parameters(self, return_full_text: bool | None = None, **generate_kwargs: t.Any) -> tuple[dict[str, t.Any], dict[str, t.Any], dict[str, t.Any]]:
|
||||
def __init__(self,
|
||||
*args: t.Any,
|
||||
do_sample: bool = True,
|
||||
max_new_tokens: int = 256,
|
||||
top_p: float = 0.92,
|
||||
top_k: int = 0,
|
||||
**kwargs: t.Any):
|
||||
super().__init__(*args,
|
||||
model=model,
|
||||
tokenizer=tokenizer,
|
||||
do_sample=do_sample,
|
||||
max_new_tokens=max_new_tokens,
|
||||
top_p=top_p,
|
||||
top_k=top_k,
|
||||
**kwargs)
|
||||
|
||||
def _sanitize_parameters(self,
|
||||
return_full_text: bool | None = None,
|
||||
**generate_kwargs: t.Any) -> tuple[dict[str, t.Any], dict[str, t.Any], dict[str, t.Any]]:
|
||||
if t.TYPE_CHECKING: assert self.tokenizer is not None
|
||||
preprocess_params: dict[str, t.Any] = {}
|
||||
# newer versions of the tokenizer configure the response key as a special token. newer versions still may
|
||||
# append a newline to yield a single token. find whatever token is configured for the response key.
|
||||
tokenizer_response_key = next((token for token in self.tokenizer.additional_special_tokens if token.startswith(RESPONSE_KEY)), None)
|
||||
tokenizer_response_key = next(
|
||||
(token for token in self.tokenizer.additional_special_tokens if token.startswith(RESPONSE_KEY)), None)
|
||||
response_key_token_id = None
|
||||
end_key_token_id = None
|
||||
if tokenizer_response_key:
|
||||
@@ -56,7 +84,8 @@ def get_pipeline(model: transformers.PreTrainedModel, tokenizer: transformers.Pr
|
||||
inputs['instruction_text'] = input_
|
||||
return t.cast(t.Dict[str, t.Any], inputs)
|
||||
|
||||
def _forward(self, input_tensors: dict[str, t.Any], **generate_kwargs: t.Any) -> transformers.utils.generic.ModelOutput:
|
||||
def _forward(self, input_tensors: dict[str, t.Any],
|
||||
**generate_kwargs: t.Any) -> transformers.utils.generic.ModelOutput:
|
||||
if t.TYPE_CHECKING: assert self.tokenizer is not None
|
||||
input_ids, attention_mask = input_tensors['input_ids'], input_tensors.get('attention_mask', None)
|
||||
if input_ids.shape[1] == 0: input_ids, attention_mask, in_b = None, None, 1
|
||||
@@ -65,15 +94,20 @@ def get_pipeline(model: transformers.PreTrainedModel, tokenizer: transformers.Pr
|
||||
input_ids=input_ids.to(self.model.device) if input_ids is not None else None,
|
||||
attention_mask=attention_mask.to(self.model.device) if attention_mask is not None else None,
|
||||
pad_token_id=self.tokenizer.pad_token_id,
|
||||
**generate_kwargs
|
||||
)
|
||||
**generate_kwargs)
|
||||
out_b = generated_sequence.shape[0]
|
||||
if self.framework == 'pt': generated_sequence = generated_sequence.reshape(in_b, out_b // in_b, *generated_sequence.shape[1:])
|
||||
elif self.framework == 'tf': generated_sequence = tf.reshape(generated_sequence, (in_b, out_b // in_b, *generated_sequence.shape[1:]))
|
||||
if self.framework == 'pt':
|
||||
generated_sequence = generated_sequence.reshape(in_b, out_b // in_b, *generated_sequence.shape[1:])
|
||||
elif self.framework == 'tf':
|
||||
generated_sequence = tf.reshape(generated_sequence, (in_b, out_b // in_b, *generated_sequence.shape[1:]))
|
||||
instruction_text = input_tensors.pop('instruction_text')
|
||||
return {'generated_sequence': generated_sequence, 'input_ids': input_ids, 'instruction_text': instruction_text}
|
||||
|
||||
def postprocess(self, model_outputs: dict[str, t.Any], response_key_token_id: int, end_key_token_id: int, return_full_text: bool = False) -> list[dict[t.Literal['generated_text'], str]]:
|
||||
def postprocess(self,
|
||||
model_outputs: dict[str, t.Any],
|
||||
response_key_token_id: int,
|
||||
end_key_token_id: int,
|
||||
return_full_text: bool = False) -> list[dict[t.Literal['generated_text'], str]]:
|
||||
if t.TYPE_CHECKING: assert self.tokenizer is not None
|
||||
_generated_sequence, instruction_text = model_outputs['generated_sequence'][0], model_outputs['instruction_text']
|
||||
generated_sequence: list[list[int]] = _generated_sequence.numpy().tolist()
|
||||
@@ -89,7 +123,8 @@ def get_pipeline(model: transformers.PreTrainedModel, tokenizer: transformers.Pr
|
||||
response_pos = sequence.index(response_key_token_id)
|
||||
except ValueError:
|
||||
response_pos = None
|
||||
if response_pos is None: logger.warning('Could not find response key %s in: %s', response_key_token_id, sequence)
|
||||
if response_pos is None:
|
||||
logger.warning('Could not find response key %s in: %s', response_key_token_id, sequence)
|
||||
if response_pos:
|
||||
# Next find where "### End" is located. The model has been trained to end its responses with this
|
||||
# sequence (or actually, the token ID it maps to, since it is a special token). We may not find
|
||||
@@ -127,12 +162,20 @@ class DollyV2(openllm.LLM['transformers.Pipeline', 'transformers.PreTrainedToken
|
||||
|
||||
@property
|
||||
def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]:
|
||||
return {'device_map': 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None, 'torch_dtype': torch.bfloat16}, {}
|
||||
return {
|
||||
'device_map': 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None,
|
||||
'torch_dtype': torch.bfloat16
|
||||
}, {}
|
||||
|
||||
def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.Pipeline:
|
||||
return get_pipeline(transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, *args, **attrs), self.tokenizer, _init=True, return_full_text=self.config.return_full_text)
|
||||
return get_pipeline(transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, *args, **attrs),
|
||||
self.tokenizer,
|
||||
_init=True,
|
||||
return_full_text=self.config.return_full_text)
|
||||
|
||||
def generate(self, prompt: str, **attrs: t.Any) -> list[dict[t.Literal['generated_text'], str]]:
|
||||
llm_config = self.config.model_construct_env(**attrs)
|
||||
with torch.inference_mode():
|
||||
return self.model(prompt, return_full_text=llm_config.return_full_text, generation_config=llm_config.to_generation_config())
|
||||
return self.model(prompt,
|
||||
return_full_text=llm_config.return_full_text,
|
||||
generation_config=llm_config.to_generation_config())
|
||||
|
||||
@@ -3,32 +3,43 @@ import typing as t
|
||||
|
||||
import openllm
|
||||
if t.TYPE_CHECKING: import torch, transformers
|
||||
else: torch, transformers = openllm.utils.LazyLoader('torch', globals(), 'torch'), openllm.utils.LazyLoader('transformers', globals(), 'transformers')
|
||||
else:
|
||||
torch, transformers = openllm.utils.LazyLoader('torch', globals(),
|
||||
'torch'), openllm.utils.LazyLoader('transformers', globals(),
|
||||
'transformers')
|
||||
|
||||
class Falcon(openllm.LLM['transformers.PreTrainedModel', 'transformers.PreTrainedTokenizerBase']):
|
||||
__openllm_internal__ = True
|
||||
|
||||
@property
|
||||
def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]:
|
||||
return {'torch_dtype': torch.bfloat16, 'device_map': 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None}, {}
|
||||
return {
|
||||
'torch_dtype': torch.bfloat16,
|
||||
'device_map': 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None
|
||||
}, {}
|
||||
|
||||
def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
|
||||
eos_token_id, inputs = attrs.pop('eos_token_id', self.tokenizer.eos_token_id), self.tokenizer(prompt, return_tensors='pt').to(self.device)
|
||||
eos_token_id, inputs = attrs.pop('eos_token_id',
|
||||
self.tokenizer.eos_token_id), self.tokenizer(prompt,
|
||||
return_tensors='pt').to(self.device)
|
||||
with torch.inference_mode(), torch.autocast('cuda', dtype=torch.float16): # type: ignore[attr-defined]
|
||||
return self.tokenizer.batch_decode(
|
||||
self.model.generate(
|
||||
input_ids=inputs['input_ids'],
|
||||
attention_mask=inputs['attention_mask'],
|
||||
generation_config=self.config.model_construct_env(eos_token_id=eos_token_id, **attrs).to_generation_config()
|
||||
),
|
||||
skip_special_tokens=True
|
||||
)
|
||||
return self.tokenizer.batch_decode(self.model.generate(
|
||||
input_ids=inputs['input_ids'],
|
||||
attention_mask=inputs['attention_mask'],
|
||||
generation_config=self.config.model_construct_env(eos_token_id=eos_token_id, **attrs).to_generation_config()),
|
||||
skip_special_tokens=True)
|
||||
|
||||
def generate_one(self, prompt: str, stop: list[str], **preprocess_generate_kwds: t.Any) -> list[dict[t.Literal['generated_text'], str]]:
|
||||
max_new_tokens, encoded_inputs = preprocess_generate_kwds.pop('max_new_tokens', 200), self.tokenizer(prompt, return_tensors='pt').to(self.device)
|
||||
src_len, stopping_criteria = encoded_inputs['input_ids'].shape[1], preprocess_generate_kwds.pop('stopping_criteria', openllm.StoppingCriteriaList([]))
|
||||
def generate_one(self, prompt: str, stop: list[str],
|
||||
**preprocess_generate_kwds: t.Any) -> list[dict[t.Literal['generated_text'], str]]:
|
||||
max_new_tokens, encoded_inputs = preprocess_generate_kwds.pop('max_new_tokens', 200), self.tokenizer(
|
||||
prompt, return_tensors='pt').to(self.device)
|
||||
src_len, stopping_criteria = encoded_inputs['input_ids'].shape[1], preprocess_generate_kwds.pop(
|
||||
'stopping_criteria', openllm.StoppingCriteriaList([]))
|
||||
stopping_criteria.append(openllm.StopSequenceCriteria(stop, self.tokenizer))
|
||||
result = self.tokenizer.decode(self.model.generate(encoded_inputs['input_ids'], max_new_tokens=max_new_tokens, stopping_criteria=stopping_criteria)[0].tolist()[src_len:])
|
||||
result = self.tokenizer.decode(
|
||||
self.model.generate(encoded_inputs['input_ids'],
|
||||
max_new_tokens=max_new_tokens,
|
||||
stopping_criteria=stopping_criteria)[0].tolist()[src_len:])
|
||||
# Inference API returns the stop sequence
|
||||
for stop_seq in stop:
|
||||
if result.endswith(stop_seq): result = result[:-len(stop_seq)]
|
||||
|
||||
@@ -12,9 +12,10 @@ class FlanT5(openllm.LLM['transformers.T5ForConditionalGeneration', 'transformer
|
||||
import torch
|
||||
with torch.inference_mode():
|
||||
return self.tokenizer.batch_decode(
|
||||
self.model.generate(**self.tokenizer(prompt, return_tensors='pt').to(self.device), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()),
|
||||
skip_special_tokens=True
|
||||
)
|
||||
self.model.generate(**self.tokenizer(prompt, return_tensors='pt').to(self.device),
|
||||
do_sample=True,
|
||||
generation_config=self.config.model_construct_env(**attrs).to_generation_config()),
|
||||
skip_special_tokens=True)
|
||||
|
||||
def embeddings(self, prompts: list[str]) -> openllm.LLMEmbeddings:
|
||||
import torch
|
||||
|
||||
@@ -9,18 +9,16 @@ if t.TYPE_CHECKING: import transformers
|
||||
class FlaxFlanT5(openllm.LLM['transformers.FlaxT5ForConditionalGeneration', 'transformers.T5TokenizerFast']):
|
||||
__openllm_internal__ = True
|
||||
|
||||
def sanitize_parameters(
|
||||
self,
|
||||
prompt: str,
|
||||
max_new_tokens: int | None = None,
|
||||
temperature: float | None = None,
|
||||
top_k: int | None = None,
|
||||
top_p: float | None = None,
|
||||
repetition_penalty: float | None = None,
|
||||
decoder_start_token_id: int | None = None,
|
||||
use_default_prompt_template: bool = True,
|
||||
**attrs: t.Any
|
||||
) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
|
||||
def sanitize_parameters(self,
|
||||
prompt: str,
|
||||
max_new_tokens: int | None = None,
|
||||
temperature: float | None = None,
|
||||
top_k: int | None = None,
|
||||
top_p: float | None = None,
|
||||
repetition_penalty: float | None = None,
|
||||
decoder_start_token_id: int | None = None,
|
||||
use_default_prompt_template: bool = True,
|
||||
**attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
|
||||
if decoder_start_token_id is None: decoder_start_token_id = 0
|
||||
return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {
|
||||
'max_new_tokens': max_new_tokens,
|
||||
@@ -34,13 +32,10 @@ class FlaxFlanT5(openllm.LLM['transformers.FlaxT5ForConditionalGeneration', 'tra
|
||||
def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
|
||||
# NOTE: decoder_start_token_id is extracted from https://huggingface.co/google/flan-t5-small/tree/main as it is required for encoder-decoder generation.
|
||||
decoder_start_token_id = attrs.pop('decoder_start_token_id', 0)
|
||||
return self.tokenizer.batch_decode(
|
||||
self.model.generate(
|
||||
self.tokenizer(prompt, return_tensors='np')['input_ids'],
|
||||
do_sample=True,
|
||||
generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
|
||||
decoder_start_token_id=decoder_start_token_id
|
||||
).sequences,
|
||||
skip_special_tokens=True,
|
||||
clean_up_tokenization_spaces=True
|
||||
)
|
||||
return self.tokenizer.batch_decode(self.model.generate(
|
||||
self.tokenizer(prompt, return_tensors='np')['input_ids'],
|
||||
do_sample=True,
|
||||
generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
|
||||
decoder_start_token_id=decoder_start_token_id).sequences,
|
||||
skip_special_tokens=True,
|
||||
clean_up_tokenization_spaces=True)
|
||||
|
||||
@@ -8,7 +8,8 @@ class TFFlanT5(openllm.LLM['transformers.TFT5ForConditionalGeneration', 'transfo
|
||||
__openllm_internal__ = True
|
||||
|
||||
def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
|
||||
return self.tokenizer.batch_decode(
|
||||
self.model.generate(self.tokenizer(prompt, return_tensors='tf').input_ids, do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()),
|
||||
skip_special_tokens=True
|
||||
)
|
||||
return self.tokenizer.batch_decode(self.model.generate(
|
||||
self.tokenizer(prompt, return_tensors='tf').input_ids,
|
||||
do_sample=True,
|
||||
generation_config=self.config.model_construct_env(**attrs).to_generation_config()),
|
||||
skip_special_tokens=True)
|
||||
|
||||
@@ -25,11 +25,8 @@ class GPTNeoX(openllm.LLM['transformers.GPTNeoXForCausalLM', 'transformers.GPTNe
|
||||
import torch
|
||||
with torch.inference_mode():
|
||||
return self.tokenizer.batch_decode(
|
||||
self.model.generate(
|
||||
self.tokenizer(prompt, return_tensors='pt').to(self.device).input_ids,
|
||||
do_sample=True,
|
||||
generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
|
||||
pad_token_id=self.tokenizer.eos_token_id,
|
||||
stopping_criteria=openllm.StoppingCriteriaList([openllm.StopOnTokens()])
|
||||
)
|
||||
)
|
||||
self.model.generate(self.tokenizer(prompt, return_tensors='pt').to(self.device).input_ids,
|
||||
do_sample=True,
|
||||
generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
|
||||
pad_token_id=self.tokenizer.eos_token_id,
|
||||
stopping_criteria=openllm.StoppingCriteriaList([openllm.StopOnTokens()])))
|
||||
|
||||
@@ -23,13 +23,20 @@ class Llama(openllm.LLM['transformers.LlamaForCausalLM', 'transformers.LlamaToke
|
||||
mask = attention_mask.unsqueeze(-1).expand(data.size()).float()
|
||||
masked_embeddings = data * mask
|
||||
sum_embeddings, seq_length = torch.sum(masked_embeddings, dim=1), torch.sum(mask, dim=1)
|
||||
return openllm.LLMEmbeddings(embeddings=F.normalize(sum_embeddings / seq_length, p=2, dim=1).tolist(), num_tokens=int(torch.sum(attention_mask).item()))
|
||||
return openllm.LLMEmbeddings(embeddings=F.normalize(sum_embeddings / seq_length, p=2, dim=1).tolist(),
|
||||
num_tokens=int(torch.sum(attention_mask).item()))
|
||||
|
||||
def generate_one(self, prompt: str, stop: list[str], **preprocess_generate_kwds: t.Any) -> list[dict[t.Literal['generated_text'], str]]:
|
||||
max_new_tokens, encoded_inputs = preprocess_generate_kwds.pop('max_new_tokens', 200), self.tokenizer(prompt, return_tensors='pt').to(self.device)
|
||||
src_len, stopping_criteria = encoded_inputs['input_ids'].shape[1], preprocess_generate_kwds.pop('stopping_criteria', openllm.StoppingCriteriaList([]))
|
||||
def generate_one(self, prompt: str, stop: list[str],
|
||||
**preprocess_generate_kwds: t.Any) -> list[dict[t.Literal['generated_text'], str]]:
|
||||
max_new_tokens, encoded_inputs = preprocess_generate_kwds.pop('max_new_tokens', 200), self.tokenizer(
|
||||
prompt, return_tensors='pt').to(self.device)
|
||||
src_len, stopping_criteria = encoded_inputs['input_ids'].shape[1], preprocess_generate_kwds.pop(
|
||||
'stopping_criteria', openllm.StoppingCriteriaList([]))
|
||||
stopping_criteria.append(openllm.StopSequenceCriteria(stop, self.tokenizer))
|
||||
result = self.tokenizer.decode(self.model.generate(encoded_inputs['input_ids'], max_new_tokens=max_new_tokens, stopping_criteria=stopping_criteria)[0].tolist()[src_len:])
|
||||
result = self.tokenizer.decode(
|
||||
self.model.generate(encoded_inputs['input_ids'],
|
||||
max_new_tokens=max_new_tokens,
|
||||
stopping_criteria=stopping_criteria)[0].tolist()[src_len:])
|
||||
# Inference API returns the stop sequence
|
||||
for stop_seq in stop:
|
||||
if result.endswith(stop_seq): result = result[:-len(stop_seq)]
|
||||
|
||||
@@ -12,12 +12,15 @@ if t.TYPE_CHECKING:
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def get_mpt_config(
|
||||
model_id_or_path: str, max_sequence_length: int, device: torch.device | str | int | None, device_map: str | None = None, trust_remote_code: bool = True
|
||||
) -> transformers.PretrainedConfig:
|
||||
def get_mpt_config(model_id_or_path: str,
|
||||
max_sequence_length: int,
|
||||
device: torch.device | str | int | None,
|
||||
device_map: str | None = None,
|
||||
trust_remote_code: bool = True) -> transformers.PretrainedConfig:
|
||||
import torch
|
||||
config = transformers.AutoConfig.from_pretrained(model_id_or_path, trust_remote_code=trust_remote_code)
|
||||
if hasattr(config, 'init_device') and device_map is None and isinstance(device, (str, torch.device)): config.init_device = str(device)
|
||||
if hasattr(config, 'init_device') and device_map is None and isinstance(device, (str, torch.device)):
|
||||
config.init_device = str(device)
|
||||
if hasattr(config, 'attn_config') and is_triton_available(): config.attn_config['attn_impl'] = 'triton'
|
||||
else:
|
||||
logger.debug(
|
||||
@@ -37,7 +40,10 @@ class MPT(openllm.LLM['transformers.PreTrainedModel', 'transformers.GPTNeoXToken
|
||||
@property
|
||||
def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]:
|
||||
import torch
|
||||
return {'device_map': 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None, 'torch_dtype': torch.bfloat16 if torch.cuda.is_available() else torch.float32}, {}
|
||||
return {
|
||||
'device_map': 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None,
|
||||
'torch_dtype': torch.bfloat16 if torch.cuda.is_available() else torch.float32
|
||||
}, {}
|
||||
|
||||
def import_model(self, *args: t.Any, trust_remote_code: bool = True, **attrs: t.Any) -> bentoml.Model:
|
||||
import torch
|
||||
@@ -46,12 +52,24 @@ class MPT(openllm.LLM['transformers.PreTrainedModel', 'transformers.GPTNeoXToken
|
||||
torch_dtype = attrs.pop('torch_dtype', self.dtype)
|
||||
device_map = attrs.pop('device_map', None)
|
||||
attrs.pop('low_cpu_mem_usage', None)
|
||||
config = get_mpt_config(self.model_id, self.config.max_sequence_length, self.device, device_map=device_map, trust_remote_code=trust_remote_code)
|
||||
config = get_mpt_config(self.model_id,
|
||||
self.config.max_sequence_length,
|
||||
self.device,
|
||||
device_map=device_map,
|
||||
trust_remote_code=trust_remote_code)
|
||||
tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_id, **tokenizer_attrs)
|
||||
if tokenizer.pad_token_id is None: tokenizer.pad_token = tokenizer.eos_token
|
||||
model = transformers.AutoModelForCausalLM.from_pretrained(self.model_id, config=config, torch_dtype=torch_dtype, trust_remote_code=trust_remote_code, device_map=device_map, **attrs)
|
||||
model = transformers.AutoModelForCausalLM.from_pretrained(self.model_id,
|
||||
config=config,
|
||||
torch_dtype=torch_dtype,
|
||||
trust_remote_code=trust_remote_code,
|
||||
device_map=device_map,
|
||||
**attrs)
|
||||
try:
|
||||
return bentoml.transformers.save_model(self.tag, model, custom_objects={'tokenizer': tokenizer}, labels=generate_labels(self))
|
||||
return bentoml.transformers.save_model(self.tag,
|
||||
model,
|
||||
custom_objects={'tokenizer': tokenizer},
|
||||
labels=generate_labels(self))
|
||||
finally:
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
@@ -60,10 +78,18 @@ class MPT(openllm.LLM['transformers.PreTrainedModel', 'transformers.GPTNeoXToken
|
||||
torch_dtype = attrs.pop('torch_dtype', self.dtype)
|
||||
device_map = attrs.pop('device_map', None)
|
||||
trust_remote_code = attrs.pop('trust_remote_code', True)
|
||||
config = get_mpt_config(self._bentomodel.path, self.config.max_sequence_length, self.device, device_map=device_map, trust_remote_code=trust_remote_code,)
|
||||
model = transformers.AutoModelForCausalLM.from_pretrained(
|
||||
self._bentomodel.path, config=config, trust_remote_code=trust_remote_code, torch_dtype=torch_dtype, device_map=device_map, **attrs
|
||||
)
|
||||
config = get_mpt_config(self._bentomodel.path,
|
||||
self.config.max_sequence_length,
|
||||
self.device,
|
||||
device_map=device_map,
|
||||
trust_remote_code=trust_remote_code,
|
||||
)
|
||||
model = transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path,
|
||||
config=config,
|
||||
trust_remote_code=trust_remote_code,
|
||||
torch_dtype=torch_dtype,
|
||||
device_map=device_map,
|
||||
**attrs)
|
||||
model.tie_weights()
|
||||
return model
|
||||
|
||||
|
||||
@@ -16,29 +16,35 @@ class FlaxOPT(openllm.LLM['transformers.TFOPTForCausalLM', 'transformers.GPT2Tok
|
||||
__openllm_internal__ = True
|
||||
|
||||
def import_model(self, *args: t.Any, trust_remote_code: bool = False, **attrs: t.Any) -> bentoml.Model:
|
||||
config, tokenizer = transformers.AutoConfig.from_pretrained(self.model_id), transformers.AutoTokenizer.from_pretrained(self.model_id, **self.llm_parameters[-1])
|
||||
config, tokenizer = transformers.AutoConfig.from_pretrained(
|
||||
self.model_id), transformers.AutoTokenizer.from_pretrained(self.model_id, **self.llm_parameters[-1])
|
||||
tokenizer.pad_token_id = config.pad_token_id
|
||||
return bentoml.transformers.save_model(
|
||||
self.tag, transformers.FlaxAutoModelForCausalLM.from_pretrained(self.model_id, **attrs), custom_objects={'tokenizer': tokenizer}, labels=generate_labels(self)
|
||||
)
|
||||
return bentoml.transformers.save_model(self.tag,
|
||||
transformers.FlaxAutoModelForCausalLM.from_pretrained(
|
||||
self.model_id, **attrs),
|
||||
custom_objects={'tokenizer': tokenizer},
|
||||
labels=generate_labels(self))
|
||||
|
||||
def sanitize_parameters(
|
||||
self,
|
||||
prompt: str,
|
||||
max_new_tokens: int | None = None,
|
||||
temperature: float | None = None,
|
||||
top_k: int | None = None,
|
||||
num_return_sequences: int | None = None,
|
||||
repetition_penalty: float | None = None,
|
||||
use_default_prompt_template: bool = False,
|
||||
**attrs: t.Any
|
||||
) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
|
||||
def sanitize_parameters(self,
|
||||
prompt: str,
|
||||
max_new_tokens: int | None = None,
|
||||
temperature: float | None = None,
|
||||
top_k: int | None = None,
|
||||
num_return_sequences: int | None = None,
|
||||
repetition_penalty: float | None = None,
|
||||
use_default_prompt_template: bool = False,
|
||||
**attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
|
||||
return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {
|
||||
'max_new_tokens': max_new_tokens, 'temperature': temperature, 'top_k': top_k, 'num_return_sequences': num_return_sequences, 'repetition_penalty': repetition_penalty
|
||||
'max_new_tokens': max_new_tokens,
|
||||
'temperature': temperature,
|
||||
'top_k': top_k,
|
||||
'num_return_sequences': num_return_sequences,
|
||||
'repetition_penalty': repetition_penalty
|
||||
}, {}
|
||||
|
||||
def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
|
||||
return self.tokenizer.batch_decode(
|
||||
self.model.generate(**self.tokenizer(prompt, return_tensors='np'), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()).sequences,
|
||||
skip_special_tokens=True
|
||||
)
|
||||
return self.tokenizer.batch_decode(self.model.generate(**self.tokenizer(prompt, return_tensors='np'),
|
||||
do_sample=True,
|
||||
generation_config=self.config.model_construct_env(
|
||||
**attrs).to_generation_config()).sequences,
|
||||
skip_special_tokens=True)
|
||||
|
||||
@@ -19,6 +19,7 @@ class OPT(openllm.LLM['transformers.OPTForCausalLM', 'transformers.GPT2Tokenizer
|
||||
import torch
|
||||
with torch.inference_mode():
|
||||
return self.tokenizer.batch_decode(
|
||||
self.model.generate(**self.tokenizer(prompt, return_tensors='pt').to(self.device), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()),
|
||||
skip_special_tokens=True
|
||||
)
|
||||
self.model.generate(**self.tokenizer(prompt, return_tensors='pt').to(self.device),
|
||||
do_sample=True,
|
||||
generation_config=self.config.model_construct_env(**attrs).to_generation_config()),
|
||||
skip_special_tokens=True)
|
||||
|
||||
@@ -11,17 +11,18 @@ class TFOPT(openllm.LLM['transformers.TFOPTForCausalLM', 'transformers.GPT2Token
|
||||
|
||||
def import_model(self, *args: t.Any, trust_remote_code: bool = False, **attrs: t.Any) -> bentoml.Model:
|
||||
import transformers
|
||||
config, tokenizer = transformers.AutoConfig.from_pretrained(self.model_id), transformers.AutoTokenizer.from_pretrained(self.model_id, **self.llm_parameters[-1])
|
||||
config, tokenizer = transformers.AutoConfig.from_pretrained(
|
||||
self.model_id), transformers.AutoTokenizer.from_pretrained(self.model_id, **self.llm_parameters[-1])
|
||||
tokenizer.pad_token_id = config.pad_token_id
|
||||
return bentoml.transformers.save_model(
|
||||
self.tag,
|
||||
transformers.TFOPTForCausalLM.from_pretrained(self.model_id, trust_remote_code=trust_remote_code, **attrs),
|
||||
custom_objects={'tokenizer': tokenizer},
|
||||
labels=generate_labels(self)
|
||||
)
|
||||
return bentoml.transformers.save_model(self.tag,
|
||||
transformers.TFOPTForCausalLM.from_pretrained(
|
||||
self.model_id, trust_remote_code=trust_remote_code, **attrs),
|
||||
custom_objects={'tokenizer': tokenizer},
|
||||
labels=generate_labels(self))
|
||||
|
||||
def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
|
||||
return self.tokenizer.batch_decode(
|
||||
self.model.generate(**self.tokenizer(prompt, return_tensors='tf'), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()),
|
||||
skip_special_tokens=True
|
||||
)
|
||||
self.model.generate(**self.tokenizer(prompt, return_tensors='tf'),
|
||||
do_sample=True,
|
||||
generation_config=self.config.model_construct_env(**attrs).to_generation_config()),
|
||||
skip_special_tokens=True)
|
||||
|
||||
@@ -10,16 +10,17 @@ class VLLMOPT(openllm.LLM['vllm.LLMEngine', 'transformers.GPT2Tokenizer']):
|
||||
__openllm_internal__ = True
|
||||
tokenizer_id = 'local'
|
||||
|
||||
def sanitize_parameters(
|
||||
self,
|
||||
prompt: str,
|
||||
max_new_tokens: int | None = None,
|
||||
temperature: float | None = None,
|
||||
top_k: int | None = None,
|
||||
num_return_sequences: int | None = None,
|
||||
use_default_prompt_template: bool = True,
|
||||
**attrs: t.Any
|
||||
) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
|
||||
def sanitize_parameters(self,
|
||||
prompt: str,
|
||||
max_new_tokens: int | None = None,
|
||||
temperature: float | None = None,
|
||||
top_k: int | None = None,
|
||||
num_return_sequences: int | None = None,
|
||||
use_default_prompt_template: bool = True,
|
||||
**attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
|
||||
return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {
|
||||
'max_new_tokens': max_new_tokens, 'temperature': temperature, 'top_k': top_k, 'num_return_sequences': num_return_sequences
|
||||
'max_new_tokens': max_new_tokens,
|
||||
'temperature': temperature,
|
||||
'top_k': top_k,
|
||||
'num_return_sequences': num_return_sequences
|
||||
}, {}
|
||||
|
||||
@@ -22,13 +22,10 @@ class StableLM(openllm.LLM['transformers.GPTNeoXForCausalLM', 'transformers.GPTN
|
||||
with torch.inference_mode():
|
||||
return [
|
||||
self.tokenizer.decode(
|
||||
self.model.generate(
|
||||
**self.tokenizer(prompt, return_tensors='pt').to(self.device),
|
||||
do_sample=True,
|
||||
generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
|
||||
pad_token_id=self.tokenizer.eos_token_id,
|
||||
stopping_criteria=openllm.StoppingCriteriaList([openllm.StopOnTokens()])
|
||||
)[0],
|
||||
skip_special_tokens=True
|
||||
)
|
||||
self.model.generate(**self.tokenizer(prompt, return_tensors='pt').to(self.device),
|
||||
do_sample=True,
|
||||
generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
|
||||
pad_token_id=self.tokenizer.eos_token_id,
|
||||
stopping_criteria=openllm.StoppingCriteriaList([openllm.StopOnTokens()]))[0],
|
||||
skip_special_tokens=True)
|
||||
]
|
||||
|
||||
@@ -18,17 +18,29 @@ class StarCoder(openllm.LLM['transformers.GPTBigCodeForCausalLM', 'transformers.
|
||||
@property
|
||||
def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]:
|
||||
import torch
|
||||
return {'device_map': 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None, 'torch_dtype': torch.float16 if torch.cuda.is_available() else torch.float32}, {}
|
||||
return {
|
||||
'device_map': 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None,
|
||||
'torch_dtype': torch.float16 if torch.cuda.is_available() else torch.float32
|
||||
}, {}
|
||||
|
||||
def import_model(self, *args: t.Any, trust_remote_code: bool = False, **attrs: t.Any) -> bentoml.Model:
|
||||
import torch
|
||||
import transformers
|
||||
torch_dtype, device_map = attrs.pop('torch_dtype', torch.float16), attrs.pop('device_map', 'auto')
|
||||
tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_id, **self.llm_parameters[-1])
|
||||
tokenizer.add_special_tokens({'additional_special_tokens': [EOD, FIM_PREFIX, FIM_MIDDLE, FIM_SUFFIX, FIM_PAD], 'pad_token': EOD})
|
||||
model = transformers.AutoModelForCausalLM.from_pretrained(self.model_id, torch_dtype=torch_dtype, device_map=device_map, **attrs)
|
||||
tokenizer.add_special_tokens({
|
||||
'additional_special_tokens': [EOD, FIM_PREFIX, FIM_MIDDLE, FIM_SUFFIX, FIM_PAD],
|
||||
'pad_token': EOD
|
||||
})
|
||||
model = transformers.AutoModelForCausalLM.from_pretrained(self.model_id,
|
||||
torch_dtype=torch_dtype,
|
||||
device_map=device_map,
|
||||
**attrs)
|
||||
try:
|
||||
return bentoml.transformers.save_model(self.tag, model, custom_objects={'tokenizer': tokenizer}, labels=generate_labels(self))
|
||||
return bentoml.transformers.save_model(self.tag,
|
||||
model,
|
||||
custom_objects={'tokenizer': tokenizer},
|
||||
labels=generate_labels(self))
|
||||
finally:
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
@@ -41,17 +53,22 @@ class StarCoder(openllm.LLM['transformers.GPTBigCodeForCausalLM', 'transformers.
|
||||
self.tokenizer.encode(prompt, return_tensors='pt').to(self.device),
|
||||
do_sample=True,
|
||||
pad_token_id=self.tokenizer.eos_token_id,
|
||||
generation_config=self.config.model_construct_env(**attrs).to_generation_config()
|
||||
)
|
||||
generation_config=self.config.model_construct_env(**attrs).to_generation_config())
|
||||
# TODO: We will probably want to return the tokenizer here so that we can manually process this
|
||||
# return (skip_special_tokens=False, clean_up_tokenization_spaces=False))
|
||||
return self.tokenizer.batch_decode(result_tensor[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
|
||||
|
||||
def generate_one(self, prompt: str, stop: list[str], **preprocess_generate_kwds: t.Any) -> list[dict[t.Literal['generated_text'], str]]:
|
||||
max_new_tokens, encoded_inputs = preprocess_generate_kwds.pop('max_new_tokens', 200), self.tokenizer(prompt, return_tensors='pt').to(self.device)
|
||||
src_len, stopping_criteria = encoded_inputs['input_ids'].shape[1], preprocess_generate_kwds.pop('stopping_criteria', openllm.StoppingCriteriaList([]))
|
||||
def generate_one(self, prompt: str, stop: list[str],
|
||||
**preprocess_generate_kwds: t.Any) -> list[dict[t.Literal['generated_text'], str]]:
|
||||
max_new_tokens, encoded_inputs = preprocess_generate_kwds.pop('max_new_tokens', 200), self.tokenizer(
|
||||
prompt, return_tensors='pt').to(self.device)
|
||||
src_len, stopping_criteria = encoded_inputs['input_ids'].shape[1], preprocess_generate_kwds.pop(
|
||||
'stopping_criteria', openllm.StoppingCriteriaList([]))
|
||||
stopping_criteria.append(openllm.StopSequenceCriteria(stop, self.tokenizer))
|
||||
result = self.tokenizer.decode(self.model.generate(encoded_inputs['input_ids'], max_new_tokens=max_new_tokens, stopping_criteria=stopping_criteria)[0].tolist()[src_len:])
|
||||
result = self.tokenizer.decode(
|
||||
self.model.generate(encoded_inputs['input_ids'],
|
||||
max_new_tokens=max_new_tokens,
|
||||
stopping_criteria=stopping_criteria)[0].tolist()[src_len:])
|
||||
# Inference API returns the stop sequence
|
||||
for stop_seq in stop:
|
||||
if result.endswith(stop_seq): result = result[:-len(stop_seq)]
|
||||
|
||||
Reference in New Issue
Block a user