chore: ignore new lines split [skip ci]

Signed-off-by: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com>
Author: aarnphm-ec2-dev
Date: 2023-09-01 17:00:49 +00:00
parent 608de0b667
commit 7d893e6cd2
70 changed files with 575 additions and 950 deletions

View File

@@ -95,8 +95,7 @@ class _ClientAttr:
if not self.supports_hf_agent:
raise RuntimeError(f'{self.model_name} ({self.backend}) does not support running HF agent.')
if not is_transformers_supports_agent():
raise RuntimeError(
"Current 'transformers' does not support Agent. Make sure to upgrade to at least 4.29: 'pip install -U \"transformers>=4.29\"'")
raise RuntimeError("Current 'transformers' does not support Agent. Make sure to upgrade to at least 4.29: 'pip install -U \"transformers>=4.29\"'")
import transformers
return transformers.HfAgent(urljoin(self._address, '/hf/agent'))
@@ -230,15 +229,7 @@ class _AsyncClient(_ClientAttr):
stop = ['Task:']
prompt = t.cast(str, self._hf_agent.format_prompt(task))
async with httpx.AsyncClient(timeout=httpx.Timeout(self.timeout)) as client:
response = await client.post(self._hf_agent.url_endpoint,
json={
'inputs': prompt,
'parameters': {
'max_new_tokens': 200,
'return_full_text': False,
'stop': stop
}
})
response = await client.post(self._hf_agent.url_endpoint, json={'inputs': prompt, 'parameters': {'max_new_tokens': 200, 'return_full_text': False, 'stop': stop}})
if response.status_code != HTTPStatus.OK: raise ValueError(f'Error {response.status_code}: {response.json()}')
result = response.json()[0]['generated_text']
@@ -279,12 +270,8 @@ class BaseClient(_Client):
logger.warning("'return_attrs' is now deprecated. Please use 'return_response=\"attrs\"' instead.")
if return_attrs is True: return_response = 'attrs'
use_default_prompt_template = attrs.pop('use_default_prompt_template', False)
prompt, generate_kwargs, postprocess_kwargs = self.config.sanitize_parameters(prompt,
use_default_prompt_template=use_default_prompt_template,
**attrs)
r = openllm_core.GenerationOutput(
**self.call('generate',
openllm_core.GenerationInput(prompt=prompt, llm_config=self.config.model_construct_env(**generate_kwargs)).model_dump()))
prompt, generate_kwargs, postprocess_kwargs = self.config.sanitize_parameters(prompt, use_default_prompt_template=use_default_prompt_template, **attrs)
r = openllm_core.GenerationOutput(**self.call('generate', openllm_core.GenerationInput(prompt=prompt, llm_config=self.config.model_construct_env(**generate_kwargs)).model_dump()))
if return_response == 'attrs': return r
elif return_response == 'raw': return bentoml_cattr.unstructure(r)
else: return self.config.postprocess_generate(prompt, r.responses, **postprocess_kwargs)
@@ -309,12 +296,8 @@ class BaseAsyncClient(_AsyncClient):
logger.warning("'return_attrs' is now deprecated. Please use 'return_response=\"attrs\"' instead.")
if return_attrs is True: return_response = 'attrs'
use_default_prompt_template = attrs.pop('use_default_prompt_template', False)
prompt, generate_kwargs, postprocess_kwargs = self.config.sanitize_parameters(prompt,
use_default_prompt_template=use_default_prompt_template,
**attrs)
r = openllm_core.GenerationOutput(
**(await self.call('generate',
openllm_core.GenerationInput(prompt=prompt, llm_config=self.config.model_construct_env(**generate_kwargs)).model_dump())))
prompt, generate_kwargs, postprocess_kwargs = self.config.sanitize_parameters(prompt, use_default_prompt_template=use_default_prompt_template, **attrs)
r = openllm_core.GenerationOutput(**(await self.call('generate', openllm_core.GenerationInput(prompt=prompt, llm_config=self.config.model_construct_env(**generate_kwargs)).model_dump())))
if return_response == 'attrs': return r
elif return_response == 'raw': return bentoml_cattr.unstructure(r)
else: return self.config.postprocess_generate(prompt, r.responses, **postprocess_kwargs)

View File

@@ -154,12 +154,10 @@ class GrpcClient(Client):
try:
reflection.apis[api.name] = InferenceAPI[t.Any](None,
bentoml.io.from_spec({
'id': api.input.descriptor_id,
'args': json_format.MessageToDict(api.input.attributes).get('args', None)
'id': api.input.descriptor_id, 'args': json_format.MessageToDict(api.input.attributes).get('args', None)
}),
bentoml.io.from_spec({
'id': api.output.descriptor_id,
'args': json_format.MessageToDict(api.output.attributes).get('args', None)
'id': api.output.descriptor_id, 'args': json_format.MessageToDict(api.output.attributes).get('args', None)
}),
name=api.name,
doc=api.docs)
@@ -207,11 +205,7 @@ class AsyncGrpcClient(AsyncClient):
if self.ssl:
if self.ssl_client_credentials is None: raise RuntimeError("'ssl=True' requires 'ssl_client_credentials'")
credentials = grpc.ssl_channel_credentials(**{k: load_from_file(v) if isinstance(v, str) else v for k, v in self.ssl_client_credentials.items()})
return aio.secure_channel(self.server_url,
credentials=credentials,
options=self.options,
compression=self.compression,
interceptors=self.interceptors)
return aio.secure_channel(self.server_url, credentials=credentials, options=self.options, compression=self.compression, interceptors=self.interceptors)
return aio.insecure_channel(self.server_url, options=self.options, compression=self.compression, interceptors=self.interceptors)
@staticmethod
@@ -262,12 +256,10 @@ class AsyncGrpcClient(AsyncClient):
try:
reflection.apis[api.name] = InferenceAPI[t.Any](None,
bentoml.io.from_spec({
'id': api.input.descriptor_id,
'args': json_format.MessageToDict(api.input.attributes).get('args', None)
'id': api.input.descriptor_id, 'args': json_format.MessageToDict(api.input.attributes).get('args', None)
}),
bentoml.io.from_spec({
'id': api.output.descriptor_id,
'args': json_format.MessageToDict(api.output.attributes).get('args', None)
'id': api.output.descriptor_id, 'args': json_format.MessageToDict(api.output.attributes).get('args', None)
}),
name=api.name,
doc=api.docs)

View File

@@ -73,13 +73,12 @@ class HttpClient(Client):
if 'x-bentoml-name' not in meth_spec:
raise ValueError(f'Malformed BentoML spec received from BentoML server {url}')
try:
reflection.apis[meth_spec['x-bentoml-name']] = InferenceAPI[t.Any](
None,
bentoml.io.from_spec(meth_spec['requestBody']['x-bentoml-io-descriptor']),
bentoml.io.from_spec(meth_spec['responses']['200']['x-bentoml-io-descriptor']),
name=meth_spec['x-bentoml-name'],
doc=meth_spec['description'],
route=route.lstrip('/'))
reflection.apis[meth_spec['x-bentoml-name']] = InferenceAPI[t.Any](None,
bentoml.io.from_spec(meth_spec['requestBody']['x-bentoml-io-descriptor']),
bentoml.io.from_spec(meth_spec['responses']['200']['x-bentoml-io-descriptor']),
name=meth_spec['x-bentoml-name'],
doc=meth_spec['description'],
route=route.lstrip('/'))
except Exception as e:
logger.error('Failed to instantiate client for API %s: ', meth_spec['x-bentoml-name'], e)
return cls(url, reflection)
@@ -160,13 +159,12 @@ class AsyncHttpClient(AsyncClient):
if 'x-bentoml-name' not in meth_spec:
raise ValueError(f'Malformed BentoML spec received from BentoML server {url}')
try:
reflection.apis[meth_spec['x-bentoml-name']] = InferenceAPI[t.Any](
None,
bentoml.io.from_spec(meth_spec['requestBody']['x-bentoml-io-descriptor']),
bentoml.io.from_spec(meth_spec['responses']['200']['x-bentoml-io-descriptor']),
name=meth_spec['x-bentoml-name'],
doc=meth_spec['description'],
route=route.lstrip('/'))
reflection.apis[meth_spec['x-bentoml-name']] = InferenceAPI[t.Any](None,
bentoml.io.from_spec(meth_spec['requestBody']['x-bentoml-io-descriptor']),
bentoml.io.from_spec(meth_spec['responses']['200']['x-bentoml-io-descriptor']),
name=meth_spec['x-bentoml-name'],
doc=meth_spec['description'],
route=route.lstrip('/'))
except ValueError as e:
logger.error('Failed to instantiate client for API %s: ', meth_spec['x-bentoml-name'], e)
return cls(url, reflection)

View File

@@ -176,36 +176,26 @@ class FineTuneConfig:
if t.TYPE_CHECKING and not MYPY:
# The following type stubs makes __init__ aware of attrs internal type converter.
@overload
def __init__(self,
adapter_type: AdapterType = ...,
adapter_config: dict[str, t.Any] = ...,
inference_mode: bool = ...,
llm_config_class: type[LLMConfig] = ...) -> None:
def __init__(self, adapter_type: AdapterType = ..., adapter_config: dict[str, t.Any] = ..., inference_mode: bool = ..., llm_config_class: type[LLMConfig] = ...) -> None:
...
@overload
def __init__(self,
adapter_type: PeftType = ...,
adapter_config: dict[str, t.Any] = ...,
inference_mode: bool = ...,
llm_config_class: type[LLMConfig] = ...) -> None:
def __init__(self, adapter_type: PeftType = ..., adapter_config: dict[str, t.Any] = ..., inference_mode: bool = ..., llm_config_class: type[LLMConfig] = ...) -> None:
...
# The below should be generated via attrs. Only here to conform with pyright strict checking.
def __init__(self, *args: t.Any, **kwargs: t.Any) -> None:
...
adapter_type: PeftType = dantic.Field(
'lora',
description=f"The type of adapter to use for fine-tuning. Available supported methods: {PeftType.supported()}, default to 'lora'",
use_default_converter=False,
converter=_adapter_converter)
adapter_config: t.Dict[str,
t.Any] = dantic.Field(None,
description='The configuration for the adapter. The content of the dict depends on the adapter type.',
validator=attr.validators.optional(attr.validators.instance_of(dict)),
converter=attr.converters.default_if_none(factory=dict),
use_default_converter=False)
adapter_type: PeftType = dantic.Field('lora',
description=f"The type of adapter to use for fine-tuning. Available supported methods: {PeftType.supported()}, default to 'lora'",
use_default_converter=False,
converter=_adapter_converter)
adapter_config: t.Dict[str, t.Any] = dantic.Field(None,
description='The configuration for the adapter. The content of the dict depends on the adapter type.',
validator=attr.validators.optional(attr.validators.instance_of(dict)),
converter=attr.converters.default_if_none(factory=dict),
use_default_converter=False)
inference_mode: bool = dantic.Field(False, description='Whether to use this Adapter for inference', use_default_converter=False)
llm_config_class: type[LLMConfig] = dantic.Field(None, description='The reference class to openllm.LLMConfig', use_default_converter=False)
@@ -214,8 +204,7 @@ class FineTuneConfig:
# no need for peft_type since it is internally managed by OpenLLM and PEFT
if 'peft_type' in adapter_config: adapter_config.pop('peft_type')
# respect user set task_type if it is passed, otherwise use one managed by OpenLLM
task_type, inference_mode = adapter_config.pop('task_type', peft.TaskType[self.llm_config_class.peft_task_type()]), adapter_config.pop(
'inference_mode', self.inference_mode)
task_type, inference_mode = adapter_config.pop('task_type', peft.TaskType[self.llm_config_class.peft_task_type()]), adapter_config.pop('inference_mode', self.inference_mode)
return peft.PEFT_TYPE_TO_CONFIG_MAPPING[self.adapter_type.to_str()](task_type=task_type, inference_mode=inference_mode, **adapter_config)
def train(self) -> FineTuneConfig:
@@ -245,8 +234,7 @@ class GenerationConfig(ReprMixin):
0,
ge=0,
description=
'The minimum length of the sequence to be generated. Corresponds to the length of the input prompt + `min_new_tokens`. Its effect is overridden by `min_new_tokens`, if also set.'
)
'The minimum length of the sequence to be generated. Corresponds to the length of the input prompt + `min_new_tokens`. Its effect is overridden by `min_new_tokens`, if also set.')
min_new_tokens: int = dantic.Field(description='The minimum numbers of tokens to generate, ignoring the number of tokens in the prompt.')
early_stopping: bool = dantic.Field(
False,
@@ -254,25 +242,18 @@ class GenerationConfig(ReprMixin):
'''Controls the stopping condition for beam-based methods, like beam-search. It accepts the following values: `True`, where the generation stops as soon as there are `num_beams` complete candidates; `False`, where an heuristic is applied and the generation stops when is it very unlikely to find better candidates; `"never"`, where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm) '''
)
max_time: float = dantic.Field(
description=
'The maximum amount of time you allow the computation to run for in seconds. generation will still finish the current pass after allocated time has been passed.'
)
description='The maximum amount of time you allow the computation to run for in seconds. generation will still finish the current pass after allocated time has been passed.')
num_beams: int = dantic.Field(1, description='Number of beams for beam search. 1 means no beam search.')
num_beam_groups: int = dantic.Field(
1,
description=
'Number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams. [this paper](https://arxiv.org/pdf/1610.02424.pdf) for more details.'
)
penalty_alpha: float = dantic.Field(
description='The values balance the model confidence and the degeneration penalty in contrastive search decoding.')
use_cache: bool = dantic.Field(
True, description='Whether or not the model should use the past last key/values attentions (if applicable to the model) to speed up decoding.')
'Number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams. [this paper](https://arxiv.org/pdf/1610.02424.pdf) for more details.')
penalty_alpha: float = dantic.Field(description='The values balance the model confidence and the degeneration penalty in contrastive search decoding.')
use_cache: bool = dantic.Field(True, description='Whether or not the model should use the past last key/values attentions (if applicable to the model) to speed up decoding.')
temperature: float = dantic.Field(1.0, ge=0.0, le=1.0, description='The value used to modulate the next token probabilities.')
top_k: int = dantic.Field(50, description='The number of highest probability vocabulary tokens to keep for top-k-filtering.')
top_p: float = dantic.Field(
1.0,
description=
'If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or higher are kept for generation.')
top_p: float = dantic.Field(1.0,
description='If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or higher are kept for generation.')
typical_p: float = dantic.Field(
1.0,
description=
@@ -293,14 +274,10 @@ class GenerationConfig(ReprMixin):
description=
"This value is subtracted from a beam's score if it generates a token same as any beam from other group at a particular time. Note that `diversity_penalty` is only effective if `group beam search` is enabled. "
)
repetition_penalty: float = dantic.Field(
1.0,
description='The parameter for repetition penalty. 1.0 means no penalty. See [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.'
)
repetition_penalty: float = dantic.Field(1.0,
description='The parameter for repetition penalty. 1.0 means no penalty. See [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.')
encoder_repetition_penalty: float = dantic.Field(
1.0,
description=
'The paramater for encoder_repetition_penalty. An exponential penalty on sequences that are not in the original input. 1.0 means no penalty.')
1.0, description='The paramater for encoder_repetition_penalty. An exponential penalty on sequences that are not in the original input. 1.0 means no penalty.')
length_penalty: float = dantic.Field(
1.0,
description=
@@ -329,46 +306,34 @@ class GenerationConfig(ReprMixin):
'The id of the token to force as the first generated token after the ``decoder_start_token_id``. Useful for multilingual models like [mBART](https://huggingface.co/docs/transformers/model_doc/mbart) where the first generated token needs to be the target language token. '
)
forced_eos_token_id: t.Union[int, t.List[int]] = dantic.Field(
description=
'The id of the token to force as the last generated token when `max_length` is reached. Optionally, use a list to set multiple *end-of-sequence* tokens.'
)
description='The id of the token to force as the last generated token when `max_length` is reached. Optionally, use a list to set multiple *end-of-sequence* tokens.')
remove_invalid_values: bool = dantic.Field(
False,
description=
'Whether to remove possible *nan* and *inf* outputs of the model to prevent the generation method to crash. Note that using `remove_invalid_values` can slow down generation.'
description='Whether to remove possible *nan* and *inf* outputs of the model to prevent the generation method to crash. Note that using `remove_invalid_values` can slow down generation.'
)
exponential_decay_length_penalty: t.Tuple[int, float] = dantic.Field(
description=
'This tuple adds an exponentially increasing length penalty, after a certain amount of tokens have been generated. The tuple shall consist of: `(start_index, decay_factor)` where `start_index` indicates where penalty starts and `decay_factor` represents the factor of exponential decay'
)
suppress_tokens: t.List[int] = dantic.Field(
description=
'A list of tokens that will be suppressed at generation. The `SupressTokens` logit processor will set their log probs to `-inf` so that they are not sampled.'
)
description='A list of tokens that will be suppressed at generation. The `SupressTokens` logit processor will set their log probs to `-inf` so that they are not sampled.')
begin_suppress_tokens: t.List[int] = dantic.Field(
description=
'A list of tokens that will be suppressed at the beginning of the generation. The `SupressBeginTokens` logit processor will set their log probs to `-inf` so that they are not sampled. '
)
'A list of tokens that will be suppressed at the beginning of the generation. The `SupressBeginTokens` logit processor will set their log probs to `-inf` so that they are not sampled. ')
forced_decoder_ids: t.List[t.List[int]] = dantic.Field(
description=
'A list of pairs of integers which indicates a mapping from generation indices to token indices that will be forced before sampling. For example, `[[1, 123]]` means the second generated token will always be a token of index 123.'
)
num_return_sequences: int = dantic.Field(1, description='The number of independently computed returned sequences for each element in the batch.')
output_attentions: bool = dantic.Field(
False,
description='Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more details.')
output_hidden_states: bool = dantic.Field(
False, description='Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more details.')
output_scores: bool = dantic.Field(
False, description='Whether or not to return the prediction scores. See `scores` under returned tensors for more details.')
output_attentions: bool = dantic.Field(False,
description='Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more details.')
output_hidden_states: bool = dantic.Field(False, description='Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more details.')
output_scores: bool = dantic.Field(False, description='Whether or not to return the prediction scores. See `scores` under returned tensors for more details.')
pad_token_id: int = dantic.Field(description='The id of the *padding* token.')
bos_token_id: int = dantic.Field(description='The id of the *beginning-of-sequence* token.')
eos_token_id: t.Union[int, t.List[int]] = dantic.Field(
description='The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.')
encoder_no_repeat_ngram_size: int = dantic.Field(
0, description='If set to int > 0, all ngrams of that size that occur in the `encoder_input_ids` cannot occur in the `decoder_input_ids`.')
decoder_start_token_id: int = dantic.Field(
description='If an encoder-decoder model starts decoding with a different token than *bos*, the id of that token.')
eos_token_id: t.Union[int, t.List[int]] = dantic.Field(description='The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.')
encoder_no_repeat_ngram_size: int = dantic.Field(0, description='If set to int > 0, all ngrams of that size that occur in the `encoder_input_ids` cannot occur in the `decoder_input_ids`.')
decoder_start_token_id: int = dantic.Field(description='If an encoder-decoder model starts decoding with a different token than *bos*, the id of that token.')
if t.TYPE_CHECKING and not MYPY:
# stubs this for pyright as mypy already has a attr plugin builtin
@@ -390,13 +355,10 @@ class GenerationConfig(ReprMixin):
bentoml_cattr.register_unstructure_hook_factory(
lambda cls: attr.has(cls) and lenient_issubclass(cls, GenerationConfig),
lambda cls: make_dict_unstructure_fn(cls,
bentoml_cattr,
_cattrs_omit_if_default=False,
_cattrs_use_linecache=True,
**{
k: override(omit=True) for k, v in attr.fields_dict(cls).items() if v.default in (None, attr.NOTHING)
}))
lambda cls: make_dict_unstructure_fn(
cls, bentoml_cattr, _cattrs_omit_if_default=False, _cattrs_use_linecache=True, **{
k: override(omit=True) for k, v in attr.fields_dict(cls).items() if v.default in (None, attr.NOTHING)
}))
@attr.frozen(slots=True, repr=False, init=False)
class SamplingParams(ReprMixin):
@@ -425,8 +387,7 @@ class SamplingParams(ReprMixin):
'Float that penalizes new tokens based on their frequency in the generated text so far. Values > 0 encourage the model to use new tokens, while values < 0 encourage the model to repeat tokens.'
)
use_beam_search: bool = dantic.Field(False, description='Whether to use beam search instead of sampling.')
stop: t.List[str] = dantic.Field(
None, description='List of strings that stop the generation when they are generated. The returned output will not contain the stop strings.')
stop: t.List[str] = dantic.Field(None, description='List of strings that stop the generation when they are generated. The returned output will not contain the stop strings.')
ignore_eos: bool = dantic.Field(False, description='Whether to ignore the EOS token and continue generating tokens after the EOS token is generated.')
logprobs: int = dantic.Field(None, description='Number of log probabilities to return per output token.')
@@ -441,9 +402,7 @@ class SamplingParams(ReprMixin):
def __init__(self, *, _internal: bool = False, **attrs: t.Any):
if not _internal:
raise RuntimeError(
"SamplingParams is not meant to be used directly, but you can access this via a LLMConfig.sampling_config or create one with 'SamplingParams.from_generation_config'"
)
raise RuntimeError("SamplingParams is not meant to be used directly, but you can access this via a LLMConfig.sampling_config or create one with 'SamplingParams.from_generation_config'")
_object_setattr(self, 'max_tokens', attrs.pop('max_tokens', 16))
_object_setattr(self, 'temperature', attrs.pop('temperature', 1.0))
_object_setattr(self, 'top_k', attrs.pop('top_k', -1))
@@ -459,11 +418,7 @@ class SamplingParams(ReprMixin):
return {i.name for i in attr.fields(self.__class__)}
def to_vllm(self) -> vllm.SamplingParams:
return vllm.SamplingParams(max_tokens=self.max_tokens,
temperature=self.temperature,
top_k=self.top_k,
top_p=self.top_p,
**bentoml_cattr.unstructure(self))
return vllm.SamplingParams(max_tokens=self.max_tokens, temperature=self.temperature, top_k=self.top_k, top_p=self.top_p, **bentoml_cattr.unstructure(self))
@classmethod
def from_generation_config(cls, generation_config: GenerationConfig, **attrs: t.Any) -> Self:
@@ -481,16 +436,12 @@ class SamplingParams(ReprMixin):
bentoml_cattr.register_unstructure_hook_factory(
lambda cls: attr.has(cls) and lenient_issubclass(cls, SamplingParams),
lambda cls: make_dict_unstructure_fn(cls,
bentoml_cattr,
_cattrs_omit_if_default=False,
_cattrs_use_linecache=True,
**{
k: override(omit=True) for k, v in attr.fields_dict(cls).items() if v.default in (None, attr.NOTHING)
}))
bentoml_cattr.register_structure_hook_factory(
lambda cls: attr.has(cls) and lenient_issubclass(cls, SamplingParams),
lambda cls: make_dict_structure_fn(cls, bentoml_cattr, _cattrs_forbid_extra_keys=True, max_new_tokens=override(rename='max_tokens')))
lambda cls: make_dict_unstructure_fn(
cls, bentoml_cattr, _cattrs_omit_if_default=False, _cattrs_use_linecache=True, **{
k: override(omit=True) for k, v in attr.fields_dict(cls).items() if v.default in (None, attr.NOTHING)
}))
bentoml_cattr.register_structure_hook_factory(lambda cls: attr.has(cls) and lenient_issubclass(cls, SamplingParams),
lambda cls: make_dict_structure_fn(cls, bentoml_cattr, _cattrs_forbid_extra_keys=True, max_new_tokens=override(rename='max_tokens')))
# cached it here to save one lookup per assignment
_object_getattribute = object.__getattribute__
@@ -538,14 +489,12 @@ class ModelSettings(t.TypedDict, total=False):
# tokenizer_class is the custom tokenizer class for this given LLM
tokenizer_class: t.Optional[str]
_transformed_type: DictStrAny = {
'fine_tune_strategies': t.Dict[AdapterType, FineTuneConfig],
'default_backend': t.Dict[LiteralResourceSpec, LiteralBackend]
}
_transformed_type: DictStrAny = {'fine_tune_strategies': t.Dict[AdapterType, FineTuneConfig], 'default_backend': t.Dict[LiteralResourceSpec, LiteralBackend]}
@attr.define(frozen=False,
slots=True,
field_transformer=lambda _, __: [
field_transformer=lambda _,
__: [
attr.Attribute.from_counting_attr(
k,
dantic.Field(kw_only=False if t.get_origin(ann) is not Required else True,
@@ -553,7 +502,8 @@ _transformed_type: DictStrAny = {
use_default_converter=False,
type=_transformed_type.get(k, ann),
metadata={'target': f'__openllm_{k}__'},
description=f'ModelSettings field for {k}.')) for k, ann in t.get_type_hints(ModelSettings).items()
description=f'ModelSettings field for {k}.')) for k,
ann in t.get_type_hints(ModelSettings).items()
])
class _ModelSettingsAttr:
'''Internal attrs representation of ModelSettings.'''
@@ -570,8 +520,7 @@ class _ModelSettingsAttr:
model_ids=['__default__'],
architecture='PreTrainedModel',
default_backend={
'cpu': 'pt',
'nvidia.com/gpu': 'pt'
'cpu': 'pt', 'nvidia.com/gpu': 'pt'
},
name_type='dasherize',
requires_gpu=False,
@@ -619,8 +568,7 @@ def get_default_backend(backend_mapping: dict[LiteralResourceSpec, LiteralBacken
def structure_settings(cl_: type[LLMConfig], cls: type[_ModelSettingsAttr]) -> _ModelSettingsAttr:
if 'generation_class' in cl_.__config__:
raise ValueError(
f"'generation_class' shouldn't be defined in '__config__', rather defining all required attributes under '{cl_}.GenerationConfig' instead.")
raise ValueError(f"'generation_class' shouldn't be defined in '__config__', rather defining all required attributes under '{cl_}.GenerationConfig' instead.")
required_fields = {k for k, ann in t.get_type_hints(ModelSettings).items() if t.get_origin(ann) is Required}
if any(i not in cl_.__config__ for i in required_fields):
@@ -633,8 +581,7 @@ def structure_settings(cl_: type[LLMConfig], cls: type[_ModelSettingsAttr]) -> _
if not has_custom_name:
_final_value_dct['model_name'] = inflection.underscore(_cl_name) if _settings_attr['name_type'] == 'dasherize' else _cl_name.lower()
_final_value_dct['start_name'] = inflection.dasherize(
_final_value_dct['model_name']) if _settings_attr['name_type'] == 'dasherize' else _final_value_dct['model_name']
_final_value_dct['start_name'] = inflection.dasherize(_final_value_dct['model_name']) if _settings_attr['name_type'] == 'dasherize' else _final_value_dct['model_name']
model_name = _final_value_dct['model_name'] if 'model_name' in _final_value_dct else _settings_attr.model_name
# if the default implementation dependencies doesn't exist, then always fallback to 'pt'
@@ -845,20 +792,10 @@ class _ConfigBuilder:
__slots__ = ('_cls', '_cls_dict', '_attr_names', '_attrs', '_model_name', '_base_attr_map', '_base_names', '_has_pre_init', '_has_post_init')
def __init__(self,
cls: type[LLMConfig],
these: dict[str, _CountingAttr],
auto_attribs: bool = False,
kw_only: bool = False,
collect_by_mro: bool = True):
attrs, base_attrs, base_attr_map = _transform_attrs(cls,
these,
auto_attribs,
kw_only,
collect_by_mro,
field_transformer=codegen.make_env_transformer(cls, cls.__openllm_model_name__))
self._cls, self._model_name, self._cls_dict, self._attrs, self._base_names, self._base_attr_map = cls, cls.__openllm_model_name__, dict(
cls.__dict__), attrs, {a.name for a in base_attrs}, base_attr_map
def __init__(self, cls: type[LLMConfig], these: dict[str, _CountingAttr], auto_attribs: bool = False, kw_only: bool = False, collect_by_mro: bool = True):
attrs, base_attrs, base_attr_map = _transform_attrs(cls, these, auto_attribs, kw_only, collect_by_mro, field_transformer=codegen.make_env_transformer(cls, cls.__openllm_model_name__))
self._cls, self._model_name, self._cls_dict, self._attrs, self._base_names, self._base_attr_map = cls, cls.__openllm_model_name__, dict(cls.__dict__), attrs, {a.name for a in base_attrs
}, base_attr_map
self._attr_names = tuple(a.name for a in attrs)
self._has_pre_init = bool(getattr(cls, '__attrs_pre_init__', False))
self._has_post_init = bool(getattr(cls, '__attrs_post_init__', False))
@@ -943,8 +880,7 @@ class _ConfigBuilder:
def add_attrs_init(self) -> Self:
self._cls_dict['__attrs_init__'] = codegen.add_method_dunders(
self._cls,
_make_init(self._cls, self._attrs, self._has_pre_init, self._has_post_init, False, True, False, self._base_attr_map, False, None, True))
self._cls, _make_init(self._cls, self._attrs, self._has_pre_init, self._has_post_init, False, True, False, self._base_attr_map, False, None, True))
return self
def add_repr(self) -> Self:
@@ -1073,13 +1009,13 @@ class LLMConfig(_ConfigAttr):
repr=False,
init=False,
collect_by_mro=True,
field_transformer=codegen.make_env_transformer(
cls,
cls.__openllm_model_name__,
suffix=suffix_env,
globs=globs,
default_callback=lambda field_name, field_default: getattr(getattr(cls, class_attr), field_name, field_default)
if codegen.has_own_attribute(cls, class_attr) else field_default))
field_transformer=codegen.make_env_transformer(cls,
cls.__openllm_model_name__,
suffix=suffix_env,
globs=globs,
default_callback=lambda field_name,
field_default: getattr(getattr(cls, class_attr), field_name, field_default)
if codegen.has_own_attribute(cls, class_attr) else field_default))
# For pickling to work, the __module__ variable needs to be set to the
# frame where the class is created. This respect the module that is created from cls
try:
@@ -1113,8 +1049,7 @@ class LLMConfig(_ConfigAttr):
raise openllm_core.exceptions.MissingAnnotationAttributeError(f"The following field doesn't have a type annotation: {missing_annotated}")
# We need to set the accepted key before generation_config
# as generation_config is a special field that users shouldn't pass.
cls.__openllm_accepted_keys__ = set(these.keys()) | {a.name for a in attr.fields(cls.__openllm_generation_class__)
} | {a.name for a in attr.fields(cls.__openllm_sampling_class__)}
cls.__openllm_accepted_keys__ = set(these.keys()) | {a.name for a in attr.fields(cls.__openllm_generation_class__)} | {a.name for a in attr.fields(cls.__openllm_sampling_class__)}
cls = _ConfigBuilder(cls, these).add_attrs_init().add_repr().build_class()
# Finally, resolve the types
@@ -1126,11 +1061,7 @@ class LLMConfig(_ConfigAttr):
attr.resolve_types(cls.__openllm_sampling_class__, globalns=globs)
cls = attr.resolve_types(cls, globalns=globs)
# the hint cache for easier access
cls.__openllm_hints__ = {
f.name: f.type
for ite in [attr.fields(cls), attr.fields(cls.__openllm_generation_class__),
attr.fields(cls.__openllm_sampling_class__)] for f in ite
}
cls.__openllm_hints__ = {f.name: f.type for ite in [attr.fields(cls), attr.fields(cls.__openllm_generation_class__), attr.fields(cls.__openllm_sampling_class__)] for f in ite}
# for pickling to work, need to set the module to the correct outer frame
try:
@@ -1141,8 +1072,7 @@ class LLMConfig(_ConfigAttr):
def __setattr__(self, attr: str, value: t.Any) -> None:
if attr in _reserved_namespace:
raise ForbiddenAttributeError(
f'{attr} should not be set during runtime as these value will be reflected during runtime. Instead, you can create a custom LLM subclass {self.__class__.__name__}.'
)
f'{attr} should not be set during runtime as these value will be reflected during runtime. Instead, you can create a custom LLM subclass {self.__class__.__name__}.')
super().__setattr__(attr, value)
def __init__(self, *, generation_config: DictStrAny | None = None, __openllm_extras__: DictStrAny | None = None, **attrs: t.Any):
@@ -1157,9 +1087,7 @@ class LLMConfig(_ConfigAttr):
for k in _cached_keys:
if k in generation_config or k in sampling_config or attrs[k] is None: del attrs[k]
self.__openllm_extras__ = config_merger.merge(first_not_none(__openllm_extras__, default={}), {
k: v for k, v in attrs.items() if k not in self.__openllm_accepted_keys__
})
self.__openllm_extras__ = config_merger.merge(first_not_none(__openllm_extras__, default={}), {k: v for k, v in attrs.items() if k not in self.__openllm_accepted_keys__})
self.generation_config = self['generation_class'](_internal=True, **generation_config)
self.sampling_config = self['sampling_class'].from_generation_config(self.generation_config, **sampling_config)
@@ -1363,15 +1291,13 @@ class LLMConfig(_ConfigAttr):
return list(self.__openllm_accepted_keys__) + list(self.__openllm_extras__)
def values(self) -> list[t.Any]:
return ([getattr(self, k.name) for k in attr.fields(self.__class__)] +
[getattr(self.generation_config, k.name) for k in attr.fields(self.__openllm_generation_class__)] +
return ([getattr(self, k.name) for k in attr.fields(self.__class__)] + [getattr(self.generation_config, k.name) for k in attr.fields(self.__openllm_generation_class__)] +
[getattr(self.sampling_config, k.name) for k in attr.fields(self.__openllm_sampling_class__)] + list(self.__openllm_extras__.values()))
def items(self) -> list[tuple[str, t.Any]]:
return ([(k.name, getattr(self, k.name)) for k in attr.fields(self.__class__)] +
[(k.name, getattr(self.generation_config, k.name)) for k in attr.fields(self.__openllm_generation_class__)] +
[(k.name, getattr(self.sampling_config, k.name)) for k in attr.fields(self.__openllm_sampling_class__)] +
list(self.__openllm_extras__.items()))
[(k.name, getattr(self.sampling_config, k.name)) for k in attr.fields(self.__openllm_sampling_class__)] + list(self.__openllm_extras__.items()))
def __iter__(self) -> t.Iterator[str]:
return iter(self.keys())
@@ -1403,9 +1329,9 @@ class LLMConfig(_ConfigAttr):
_new_cfg = {k: v for k, v in attrs.items() if k in attr.fields_dict(_ModelSettingsAttr)}
attrs = {k: v for k, v in attrs.items() if k not in _new_cfg}
new_cls = types.new_class(
name or f"{cls.__name__.replace('Config', '')}DerivateConfig", (cls,), {}, lambda ns: ns.update({
'__config__': config_merger.merge(copy.deepcopy(cls.__dict__['__config__']), _new_cfg),
'__base_config__': cls, # keep a reference for easy access
name or f"{cls.__name__.replace('Config', '')}DerivateConfig", (cls,), {},
lambda ns: ns.update({
'__config__': config_merger.merge(copy.deepcopy(cls.__dict__['__config__']), _new_cfg), '__base_config__': cls, # keep a reference for easy access
}))
# For pickling to work, the __module__ variable needs to be set to the
@@ -1566,9 +1492,8 @@ class LLMConfig(_ConfigAttr):
'''
return generation_result
bentoml_cattr.register_unstructure_hook_factory(
lambda cls: lenient_issubclass(cls, LLMConfig),
lambda cls: make_dict_unstructure_fn(cls, bentoml_cattr, _cattrs_omit_if_default=False, _cattrs_use_linecache=True))
bentoml_cattr.register_unstructure_hook_factory(lambda cls: lenient_issubclass(cls, LLMConfig),
lambda cls: make_dict_unstructure_fn(cls, bentoml_cattr, _cattrs_omit_if_default=False, _cattrs_use_linecache=True))
def structure_llm_config(data: t.Any, cls: type[LLMConfig]) -> LLMConfig:
"""Structure a dictionary to a LLMConfig object.
@@ -1594,5 +1519,4 @@ def structure_llm_config(data: t.Any, cls: type[LLMConfig]) -> LLMConfig:
return cls(generation_config=generation_config, __openllm_extras__=data, **cls_attrs)
bentoml_cattr.register_structure_hook_func(lambda cls: lenient_issubclass(cls, LLMConfig), structure_llm_config)
openllm_home = os.path.expanduser(
os.environ.get('OPENLLM_HOME', os.path.join(os.environ.get('XDG_CACHE_HOME', os.path.join(os.path.expanduser('~'), '.cache')), 'openllm')))
openllm_home = os.path.expanduser(os.environ.get('OPENLLM_HOME', os.path.join(os.environ.get('XDG_CACHE_HOME', os.path.join(os.path.expanduser('~'), '.cache')), 'openllm')))

View File

@@ -29,5 +29,4 @@ def process_prompt(prompt: str, template: str | None = None, use_prompt_template
return template.format(instruction=prompt, **prompt_variables)
except KeyError as e:
raise RuntimeError(
f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. Use 'use_prompt_template=False' to disable the default prompt template."
) from None
f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. Use 'use_prompt_template=False' to disable the default prompt template.") from None

View File

@@ -40,9 +40,7 @@ class GenerationInput:
return attr.make_class(inflection.camelize(llm_config['model_name']) + 'GenerationInput',
attrs={
'prompt': attr.field(type=str),
'llm_config': attr.field(type=llm_config.__class__,
default=llm_config,
converter=functools.partial(cls.convert_llm_config, cls=llm_config.__class__)),
'llm_config': attr.field(type=llm_config.__class__, default=llm_config, converter=functools.partial(cls.convert_llm_config, cls=llm_config.__class__)),
'adapter_name': attr.field(default=None, type=str)
})
@@ -85,12 +83,8 @@ def unmarshal_vllm_outputs(request_output: vllm.RequestOutput) -> dict[str, t.An
finished=request_output.finished,
prompt_token_ids=request_output.prompt_token_ids,
outputs=[
dict(index=it.index,
text=it.text,
token_ids=it.token_ids,
cumulative_logprob=it.cumulative_logprob,
logprobs=it.logprobs,
finish_reason=it.finish_reason) for it in request_output.outputs
dict(index=it.index, text=it.text, token_ids=it.token_ids, cumulative_logprob=it.cumulative_logprob, logprobs=it.logprobs, finish_reason=it.finish_reason)
for it in request_output.outputs
])
@attr.define

View File

@@ -217,7 +217,8 @@ def _validate(cls: type[DynResource], val: list[t.Any]) -> None:
def _make_resource_class(name: str, resource_kind: str, docstring: str) -> type[DynResource]:
return types.new_class(
name, (bentoml.Resource[t.List[str]], ReprMixin), {'resource_id': resource_kind}, lambda ns: ns.update({
name, (bentoml.Resource[t.List[str]], ReprMixin), {'resource_id': resource_kind},
lambda ns: ns.update({
'resource_id': resource_kind,
'from_spec': classmethod(_from_spec),
'from_system': classmethod(_from_system),
@@ -234,12 +235,16 @@ _NVIDIA_GPU_RESOURCE: t.Literal['nvidia.com/gpu'] = 'nvidia.com/gpu'
_CPU_RESOURCE: t.Literal['cpu'] = 'cpu'
NvidiaGpuResource = _make_resource_class(
'NvidiaGpuResource', _NVIDIA_GPU_RESOURCE, '''NVIDIA GPU resource.
'NvidiaGpuResource',
_NVIDIA_GPU_RESOURCE,
'''NVIDIA GPU resource.
This is a modified version of internal's BentoML's NvidiaGpuResource
where it respects and parse CUDA_VISIBLE_DEVICES correctly.''')
AmdGpuResource = _make_resource_class(
'AmdGpuResource', _AMD_GPU_RESOURCE, '''AMD GPU resource.
'AmdGpuResource',
_AMD_GPU_RESOURCE,
'''AMD GPU resource.
Since ROCm will respect CUDA_VISIBLE_DEVICES, the behaviour of from_spec, from_system are similar to
``NvidiaGpuResource``. Currently ``validate`` is not yet supported.''')
@@ -305,13 +310,10 @@ class CascadingResourceStrategy(bentoml.Strategy, ReprMixin):
return math.ceil(cpus) * workers_per_resource
# this should not be reached by user since we always read system resource as default
raise ValueError(
f'No known supported resource available for {runnable_class}. Please check your resource request. Leaving it blank will allow BentoML to use system resources.'
)
raise ValueError(f'No known supported resource available for {runnable_class}. Please check your resource request. Leaving it blank will allow BentoML to use system resources.')
@classmethod
def get_worker_env(cls, runnable_class: type[bentoml.Runnable], resource_request: dict[str, t.Any] | None, workers_per_resource: int | float,
worker_index: int) -> dict[str, t.Any]:
def get_worker_env(cls, runnable_class: type[bentoml.Runnable], resource_request: dict[str, t.Any] | None, workers_per_resource: int | float, worker_index: int) -> dict[str, t.Any]:
'''Get worker env for this given worker_index.
Args:
@@ -369,15 +371,12 @@ class CascadingResourceStrategy(bentoml.Strategy, ReprMixin):
# NOTE: We hit this branch when workers_per_resource is set to
# float, for example 0.5 or 0.25
if workers_per_resource > 1:
raise ValueError(
"Currently, the default strategy doesn't support workers_per_resource > 1. It is recommended that one should implement a custom strategy in this case."
)
raise ValueError("Currently, the default strategy doesn't support workers_per_resource > 1. It is recommended that one should implement a custom strategy in this case.")
# We are round the assigned resource here. This means if workers_per_resource=.4
# then it will round down to 2. If workers_per_source=0.6, then it will also round up to 2.
assigned_resource_per_worker = round(1 / workers_per_resource)
if len(gpus) < assigned_resource_per_worker:
logger.warning('Failed to allocate %s GPUs for %s (number of available GPUs < assigned workers per resource [%s])', gpus, worker_index,
assigned_resource_per_worker)
logger.warning('Failed to allocate %s GPUs for %s (number of available GPUs < assigned workers per resource [%s])', gpus, worker_index, assigned_resource_per_worker)
raise IndexError(f"There aren't enough assigned GPU(s) for given worker id '{worker_index}' [required: {assigned_resource_per_worker}].")
assigned_gpu = gpus[assigned_resource_per_worker * worker_index:assigned_resource_per_worker * (worker_index + 1)]
dev = ','.join(assigned_gpu)

View File

@@ -24,9 +24,8 @@ if t.TYPE_CHECKING:
ConfigItemsView = _odict_items[str, type[openllm_core.LLMConfig]]
# NOTE: This is the entrypoint when adding new model config
CONFIG_MAPPING_NAMES = OrderedDict([('chatglm', 'ChatGLMConfig'), ('dolly_v2', 'DollyV2Config'), ('falcon', 'FalconConfig'),
('flan_t5', 'FlanT5Config'), ('gpt_neox', 'GPTNeoXConfig'), ('llama', 'LlamaConfig'), ('mpt', 'MPTConfig'),
('opt', 'OPTConfig'), ('stablelm', 'StableLMConfig'), ('starcoder', 'StarCoderConfig'),
CONFIG_MAPPING_NAMES = OrderedDict([('chatglm', 'ChatGLMConfig'), ('dolly_v2', 'DollyV2Config'), ('falcon', 'FalconConfig'), ('flan_t5', 'FlanT5Config'), ('gpt_neox', 'GPTNeoXConfig'),
('llama', 'LlamaConfig'), ('mpt', 'MPTConfig'), ('opt', 'OPTConfig'), ('stablelm', 'StableLMConfig'), ('starcoder', 'StarCoderConfig'),
('baichuan', 'BaichuanConfig')])
class _LazyConfigMapping(OrderedDict, ReprMixin):

View File

@@ -46,8 +46,12 @@ class BaichuanConfig(openllm_core.LLMConfig):
'architecture': 'BaiChuanForCausalLM',
'default_id': 'baichuan-inc/baichuan-7b',
'model_ids': [
'baichuan-inc/baichuan-7b', 'baichuan-inc/baichuan-13b-base', 'baichuan-inc/baichuan-13b-chat', 'fireballoon/baichuan-vicuna-chinese-7b',
'fireballoon/baichuan-vicuna-7b', 'hiyouga/baichuan-7b-sft'
'baichuan-inc/baichuan-7b',
'baichuan-inc/baichuan-13b-base',
'baichuan-inc/baichuan-13b-chat',
'fireballoon/baichuan-vicuna-chinese-7b',
'fireballoon/baichuan-vicuna-7b',
'hiyouga/baichuan-7b-sft'
]
}
@@ -63,12 +67,7 @@ class BaichuanConfig(openllm_core.LLMConfig):
temperature: float | None = None,
use_default_prompt_template: bool = False,
**attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {
'max_new_tokens': max_new_tokens,
'top_p': top_p,
'temperature': temperature,
**attrs
}, {}
return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {'max_new_tokens': max_new_tokens, 'top_p': top_p, 'temperature': temperature, **attrs}, {}
def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str:
return generation_result[0]

View File

@@ -51,8 +51,7 @@ class ChatGLMConfig(openllm_core.LLMConfig):
'default_id': 'thudm/chatglm-6b',
'model_ids': ['thudm/chatglm-6b', 'thudm/chatglm-6b-int8', 'thudm/chatglm-6b-int4', 'thudm/chatglm2-6b', 'thudm/chatglm2-6b-int4']
}
retain_history: bool = dantic.Field(
False, description='Whether to retain history given to the model. If set to True, then the model will retain given history.')
retain_history: bool = dantic.Field(False, description='Whether to retain history given to the model. If set to True, then the model will retain given history.')
use_half_precision: bool = dantic.Field(True, description='Whether to use half precision for model.')
class GenerationConfig:
@@ -78,20 +77,9 @@ class ChatGLMConfig(openllm_core.LLMConfig):
else:
prompt_text = prompt
postprocess_generate_kwargs = {'chat_history': chat_history if chat_history is not None else None}
return prompt_text, {
'max_new_tokens': max_new_tokens,
'num_beams': num_beams,
'top_p': top_p,
'temperature': temperature,
**attrs
}, postprocess_generate_kwargs
return prompt_text, {'max_new_tokens': max_new_tokens, 'num_beams': num_beams, 'top_p': top_p, 'temperature': temperature, **attrs}, postprocess_generate_kwargs
def postprocess_generate(self,
prompt: str,
generation_result: tuple[str, list[tuple[str, str]]],
*,
chat_history: list[tuple[str, str]] | None = None,
**attrs: t.Any) -> str:
def postprocess_generate(self, prompt: str, generation_result: tuple[str, list[tuple[str, str]]], *, chat_history: list[tuple[str, str]] | None = None, **attrs: t.Any) -> str:
generated, history = generation_result
if self.config.retain_history:
if chat_history is None: raise ValueError("'retain_history' is True while there is no history provided.")

View File

@@ -98,11 +98,7 @@ class DollyV2Config(openllm_core.LLMConfig):
use_default_prompt_template: bool = True,
**attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {
'max_new_tokens': max_new_tokens,
'top_k': top_k,
'top_p': top_p,
'temperature': temperature,
**attrs
'max_new_tokens': max_new_tokens, 'top_k': top_k, 'top_p': top_p, 'temperature': temperature, **attrs
}, {}
def postprocess_generate(self, prompt: str, generation_result: list[dict[t.Literal['generated_text'], str]], **_: t.Any) -> str:

View File

@@ -49,12 +49,7 @@ class FalconConfig(openllm_core.LLMConfig):
'default_id': 'tiiuae/falcon-7b',
'model_ids': ['tiiuae/falcon-7b', 'tiiuae/falcon-40b', 'tiiuae/falcon-7b-instruct', 'tiiuae/falcon-40b-instruct'],
'fine_tune_strategies': ({
'adapter_type': 'lora',
'r': 64,
'lora_alpha': 16,
'lora_dropout': 0.1,
'bias': 'none',
'target_modules': ['query_key_value', 'dense', 'dense_h_to_4h', 'dense_4h_to_h']
'adapter_type': 'lora', 'r': 64, 'lora_alpha': 16, 'lora_dropout': 0.1, 'bias': 'none', 'target_modules': ['query_key_value', 'dense', 'dense_h_to_4h', 'dense_4h_to_h']
},)
}
@@ -74,11 +69,7 @@ class FalconConfig(openllm_core.LLMConfig):
use_default_prompt_template: bool = False,
**attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {
'max_new_tokens': max_new_tokens,
'top_k': top_k,
'num_return_sequences': num_return_sequences,
'eos_token_id': eos_token_id,
**attrs
'max_new_tokens': max_new_tokens, 'top_k': top_k, 'num_return_sequences': num_return_sequences, 'eos_token_id': eos_token_id, **attrs
}, {}
def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str:

View File

@@ -64,11 +64,7 @@ class FlanT5Config(openllm_core.LLMConfig):
use_default_prompt_template: bool = True,
**attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {
'max_new_tokens': max_new_tokens,
'temperature': temperature,
'top_k': top_k,
'top_p': top_p,
'repetition_penalty': repetition_penalty
'max_new_tokens': max_new_tokens, 'temperature': temperature, 'top_k': top_k, 'top_p': top_p, 'repetition_penalty': repetition_penalty
}, {}
def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str:

View File

@@ -63,10 +63,7 @@ class GPTNeoXConfig(openllm_core.LLMConfig):
max_new_tokens: int | None = None,
use_default_prompt_template: bool = True,
**attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {
'max_new_tokens': max_new_tokens,
'temperature': temperature
}, {}
return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {'max_new_tokens': max_new_tokens, 'temperature': temperature}, {}
def postprocess_generate(self, prompt: str, generation_result: list[str], **_: t.Any) -> str:
return generation_result[0]

View File

@@ -45,8 +45,11 @@ If a question does not make any sense, or is not factually coherent, explain why
'''
SINST_KEY, EINST_KEY, SYS_KEY, EOS_TOKEN, BOS_TOKEN = '[INST]', '[/INST]', '<<SYS>>', '</s>', '<s>'
# TODO: support history and v1 prompt implementation
_v1_prompt, _v2_prompt = '''{instruction}''', '''{start_key} {sys_key}\n{system_message}\n{sys_key}\n\n{instruction}\n{end_key} '''.format(
start_key=SINST_KEY, sys_key=SYS_KEY, system_message=SYSTEM_MESSAGE, instruction='{instruction}', end_key=EINST_KEY)
_v1_prompt, _v2_prompt = '''{instruction}''', '''{start_key} {sys_key}\n{system_message}\n{sys_key}\n\n{instruction}\n{end_key} '''.format(start_key=SINST_KEY,
sys_key=SYS_KEY,
system_message=SYSTEM_MESSAGE,
instruction='{instruction}',
end_key=EINST_KEY)
PROMPT_MAPPING = {'v1': _v1_prompt, 'v2': _v2_prompt}
def _get_prompt(model_type: t.Literal['v1', 'v2']) -> str:
@@ -71,26 +74,35 @@ class LlamaConfig(openllm_core.LLMConfig):
'name_type': 'lowercase',
'url': 'https://github.com/facebookresearch/llama',
'default_backend': {
'cpu': 'pt',
'nvidia.com/gpu': 'pt'
'cpu': 'pt', 'nvidia.com/gpu': 'pt'
},
'architecture': 'LlamaForCausalLM',
'requirements': ['fairscale', 'sentencepiece'],
'tokenizer_class': 'LlamaTokenizerFast',
'default_id': 'NousResearch/llama-2-7b-hf',
'model_ids': [
'meta-llama/Llama-2-70b-chat-hf', 'meta-llama/Llama-2-13b-chat-hf', 'meta-llama/Llama-2-7b-chat-hf', 'meta-llama/Llama-2-70b-hf',
'meta-llama/Llama-2-13b-hf', 'meta-llama/Llama-2-7b-hf', 'NousResearch/llama-2-70b-chat-hf', 'NousResearch/llama-2-13b-chat-hf',
'NousResearch/llama-2-7b-chat-hf', 'NousResearch/llama-2-70b-hf', 'NousResearch/llama-2-13b-hf', 'NousResearch/llama-2-7b-hf',
'openlm-research/open_llama_7b_v2', 'openlm-research/open_llama_3b_v2', 'openlm-research/open_llama_13b', 'huggyllama/llama-65b',
'huggyllama/llama-30b', 'huggyllama/llama-13b', 'huggyllama/llama-7b'
'meta-llama/Llama-2-70b-chat-hf',
'meta-llama/Llama-2-13b-chat-hf',
'meta-llama/Llama-2-7b-chat-hf',
'meta-llama/Llama-2-70b-hf',
'meta-llama/Llama-2-13b-hf',
'meta-llama/Llama-2-7b-hf',
'NousResearch/llama-2-70b-chat-hf',
'NousResearch/llama-2-13b-chat-hf',
'NousResearch/llama-2-7b-chat-hf',
'NousResearch/llama-2-70b-hf',
'NousResearch/llama-2-13b-hf',
'NousResearch/llama-2-7b-hf',
'openlm-research/open_llama_7b_v2',
'openlm-research/open_llama_3b_v2',
'openlm-research/open_llama_13b',
'huggyllama/llama-65b',
'huggyllama/llama-30b',
'huggyllama/llama-13b',
'huggyllama/llama-7b'
],
'fine_tune_strategies': ({
'adapter_type': 'lora',
'r': 64,
'lora_alpha': 16,
'lora_dropout': 0.1,
'bias': 'none'
'adapter_type': 'lora', 'r': 64, 'lora_alpha': 16, 'lora_dropout': 0.1, 'bias': 'none'
},)
}
@@ -113,14 +125,9 @@ class LlamaConfig(openllm_core.LLMConfig):
use_default_prompt_template: bool = False,
use_llama2_prompt: bool = True,
**attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
return process_prompt(prompt,
DEFAULT_PROMPT_TEMPLATE('v2' if use_llama2_prompt else 'v1') if use_default_prompt_template else None,
use_default_prompt_template, **attrs), {
'max_new_tokens': max_new_tokens,
'temperature': temperature,
'top_p': top_p,
'top_k': top_k
}, {}
return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE('v2' if use_llama2_prompt else 'v1') if use_default_prompt_template else None, use_default_prompt_template, **attrs), {
'max_new_tokens': max_new_tokens, 'temperature': temperature, 'top_p': top_p, 'top_k': top_k
}, {}
def postprocess_generate(self, prompt: str, generation_result: list[str], **_: t.Any) -> str:
return generation_result[0]

View File

@@ -69,12 +69,10 @@ class MPTConfig(openllm_core.LLMConfig):
'architecture': 'MPTForCausalLM',
'default_id': 'mosaicml/mpt-7b-instruct',
'model_ids': [
'mosaicml/mpt-7b', 'mosaicml/mpt-7b-instruct', 'mosaicml/mpt-7b-chat', 'mosaicml/mpt-7b-storywriter', 'mosaicml/mpt-30b',
'mosaicml/mpt-30b-instruct', 'mosaicml/mpt-30b-chat'
'mosaicml/mpt-7b', 'mosaicml/mpt-7b-instruct', 'mosaicml/mpt-7b-chat', 'mosaicml/mpt-7b-storywriter', 'mosaicml/mpt-30b', 'mosaicml/mpt-30b-instruct', 'mosaicml/mpt-30b-chat'
]
}
prompt_type: MPTPromptType = dantic.Field('"default"',
description='Given prompt type for running MPT. Default will be inferred from model name if pretrained.')
prompt_type: MPTPromptType = dantic.Field('"default"', description='Given prompt type for running MPT. Default will be inferred from model name if pretrained.')
max_sequence_length: int = dantic.Field(
2048,
description=
@@ -103,11 +101,7 @@ class MPTConfig(openllm_core.LLMConfig):
elif 'chat' in self.model_id: prompt_type = 'chat'
else: prompt_type = 'default'
_template = DEFAULT_PROMPT_TEMPLATE(prompt_type)
return process_prompt(prompt, _template, use_default_prompt_template), {
'max_new_tokens': max_new_tokens,
'temperature': temperature,
'top_p': top_p
}, {}
return process_prompt(prompt, _template, use_default_prompt_template), {'max_new_tokens': max_new_tokens, 'temperature': temperature, 'top_p': top_p}, {}
def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **attrs: t.Any) -> str:
return generation_result[0]

View File

@@ -51,12 +51,7 @@ class OPTConfig(openllm_core.LLMConfig):
'architecture': 'OPTForCausalLM',
'model_ids': ['facebook/opt-125m', 'facebook/opt-350m', 'facebook/opt-1.3b', 'facebook/opt-2.7b', 'facebook/opt-6.7b', 'facebook/opt-66b'],
'fine_tune_strategies': ({
'adapter_type': 'lora',
'r': 16,
'lora_alpha': 32,
'target_modules': ['q_proj', 'v_proj'],
'lora_dropout': 0.05,
'bias': 'none'
'adapter_type': 'lora', 'r': 16, 'lora_alpha': 32, 'target_modules': ['q_proj', 'v_proj'], 'lora_dropout': 0.05, 'bias': 'none'
},)
}
format_outputs: bool = dantic.Field(False, description='''Whether to format the outputs. This can be used when num_return_sequences > 1.''')
@@ -76,10 +71,7 @@ class OPTConfig(openllm_core.LLMConfig):
use_default_prompt_template: bool = False,
**attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {
'max_new_tokens': max_new_tokens,
'temperature': temperature,
'top_k': top_k,
'num_return_sequences': num_return_sequences
'max_new_tokens': max_new_tokens, 'temperature': temperature, 'top_k': top_k, 'num_return_sequences': num_return_sequences
}, {}
def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **attrs: t.Any) -> str:

View File

@@ -51,10 +51,7 @@ class StableLMConfig(openllm_core.LLMConfig):
'url': 'https://github.com/Stability-AI/StableLM',
'architecture': 'GPTNeoXForCausalLM',
'default_id': 'stabilityai/stablelm-tuned-alpha-3b',
'model_ids': [
'stabilityai/stablelm-tuned-alpha-3b', 'stabilityai/stablelm-tuned-alpha-7b', 'stabilityai/stablelm-base-alpha-3b',
'stabilityai/stablelm-base-alpha-7b'
]
'model_ids': ['stabilityai/stablelm-tuned-alpha-3b', 'stabilityai/stablelm-tuned-alpha-7b', 'stabilityai/stablelm-base-alpha-3b', 'stabilityai/stablelm-base-alpha-7b']
}
class GenerationConfig:

View File

@@ -71,14 +71,7 @@ class StarCoderConfig(openllm_core.LLMConfig):
else:
prompt_text = prompt
# XXX: This value for pad_token_id is currently a hack, need more investigate why the default starcoder doesn't include the same value as santacoder EOD
return prompt_text, {
'temperature': temperature,
'top_p': top_p,
'max_new_tokens': max_new_tokens,
'repetition_penalty': repetition_penalty,
'pad_token_id': 49152,
**attrs
}, {}
return prompt_text, {'temperature': temperature, 'top_p': top_p, 'max_new_tokens': max_new_tokens, 'repetition_penalty': repetition_penalty, 'pad_token_id': 49152, **attrs}, {}
def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str:
return generation_result[0]

View File

@@ -152,32 +152,24 @@ _LOGGING_CONFIG: dict[str, t.Any] = {
'filters': {
'excfilter': {
'()': 'openllm_core.utils.ExceptionFilter'
},
'infofilter': {
}, 'infofilter': {
'()': 'openllm_core.utils.InfoFilter'
}
},
'handlers': {
'bentomlhandler': {
'class': 'logging.StreamHandler',
'filters': ['excfilter', 'infofilter'],
'stream': 'ext://sys.stdout'
'class': 'logging.StreamHandler', 'filters': ['excfilter', 'infofilter'], 'stream': 'ext://sys.stdout'
},
'defaulthandler': {
'class': 'logging.StreamHandler',
'level': logging.WARNING
'class': 'logging.StreamHandler', 'level': logging.WARNING
}
},
'loggers': {
'bentoml': {
'handlers': ['bentomlhandler', 'defaulthandler'],
'level': logging.INFO,
'propagate': False
'handlers': ['bentomlhandler', 'defaulthandler'], 'level': logging.INFO, 'propagate': False
},
'openllm': {
'handlers': ['bentomlhandler', 'defaulthandler'],
'level': logging.INFO,
'propagate': False
'handlers': ['bentomlhandler', 'defaulthandler'], 'level': logging.INFO, 'propagate': False
}
},
'root': {
@@ -318,9 +310,7 @@ _whitelist_modules = {'pkg'}
# XXX: define all classes and functions to be imported above this line,
# since _extras will be built from the locals() of this file.
_extras: dict[str, t.Any] = {
k: v for k, v in locals().items() if k in _whitelist_modules or (not isinstance(v, types.ModuleType) and not k.startswith('_'))
}
_extras: dict[str, t.Any] = {k: v for k, v in locals().items() if k in _whitelist_modules or (not isinstance(v, types.ModuleType) and not k.startswith('_'))}
_extras['__openllm_migration__'] = {'ModelEnv': 'EnvVarMixin'}
_import_structure: dict[str, list[str]] = {
'analytics': [],
@@ -329,11 +319,32 @@ _import_structure: dict[str, list[str]] = {
'lazy': [],
'representation': ['ReprMixin'],
'import_utils': [
'OPTIONAL_DEPENDENCIES', 'DummyMetaclass', 'EnvVarMixin', 'require_backends', 'is_cpm_kernels_available', 'is_einops_available',
'is_flax_available', 'is_tf_available', 'is_vllm_available', 'is_torch_available', 'is_bitsandbytes_available', 'is_peft_available',
'is_datasets_available', 'is_transformers_supports_kbit', 'is_transformers_supports_agent', 'is_jupyter_available', 'is_jupytext_available',
'is_notebook_available', 'is_triton_available', 'is_autogptq_available', 'is_sentencepiece_available', 'is_xformers_available',
'is_fairscale_available', 'is_grpc_available', 'is_grpc_health_available', 'is_transformers_available'
'OPTIONAL_DEPENDENCIES',
'DummyMetaclass',
'EnvVarMixin',
'require_backends',
'is_cpm_kernels_available',
'is_einops_available',
'is_flax_available',
'is_tf_available',
'is_vllm_available',
'is_torch_available',
'is_bitsandbytes_available',
'is_peft_available',
'is_datasets_available',
'is_transformers_supports_kbit',
'is_transformers_supports_agent',
'is_jupyter_available',
'is_jupytext_available',
'is_notebook_available',
'is_triton_available',
'is_autogptq_available',
'is_sentencepiece_available',
'is_xformers_available',
'is_fairscale_available',
'is_grpc_available',
'is_grpc_health_available',
'is_transformers_available'
]
}
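
The `_LOGGING_CONFIG` dict earlier in this file is a standard `logging.config` dictionary: the `'()'` keys instantiate the filter classes, the bentoml handler streams to stdout with both filters attached, and the default handler falls back to stderr at WARNING level. A minimal sketch of how such a config is activated, assuming the private `_LOGGING_CONFIG` name is importable from this module:

import logging
import logging.config

from openllm_core.utils import _LOGGING_CONFIG  # assumed import path, for illustration only

logging.config.dictConfig(_LOGGING_CONFIG)
logging.getLogger('openllm').info('logging configured')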

View File

@@ -130,28 +130,13 @@ def make_env_transformer(cls: type[openllm_core.LLMConfig],
default_callback = identity if default_callback is None else default_callback
globs = {} if globs is None else globs
globs.update({
'__populate_env': dantic.env_converter,
'__default_callback': default_callback,
'__field_env': field_env_key,
'__suffix': suffix or '',
'__model_name': model_name,
})
globs.update({'__populate_env': dantic.env_converter, '__default_callback': default_callback, '__field_env': field_env_key, '__suffix': suffix or '', '__model_name': model_name,})
lines: ListStr = [
'__env=lambda field_name:__field_env(field_name,__suffix)',
"return [f.evolve(default=__populate_env(__default_callback(f.name,f.default),__env(f.name)),metadata={'env':f.metadata.get('env',__env(f.name)),'description':f.metadata.get('description', '(not provided)')}) for f in fields]"
]
fields_ann = 'list[attr.Attribute[t.Any]]'
return generate_function(cls,
'__auto_env',
lines,
args=('_', 'fields'),
globs=globs,
annotations={
'_': 'type[LLMConfig]',
'fields': fields_ann,
'return': fields_ann
})
return generate_function(cls, '__auto_env', lines, args=('_', 'fields'), globs=globs, annotations={'_': 'type[LLMConfig]', 'fields': fields_ann, 'return': fields_ann})
def gen_sdk(func: _T, name: str | None = None, **attrs: t.Any) -> _T:
'''Enhance sdk with nice repr that plays well with your brain.'''
@@ -178,7 +163,8 @@ def gen_sdk(func: _T, name: str | None = None, **attrs: t.Any) -> _T:
'__doc__': inspect.cleandoc(doc),
'__module__': 'openllm'
}),
)(func, **attrs), func,
)(func, **attrs),
func,
))
__all__ = ['gen_sdk', 'make_attr_tuple_class', 'make_env_transformer', 'generate_unique_filename', 'generate_function']
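
`make_env_transformer` generates an attrs `field_transformer` hook, i.e. a callable receiving `(cls, fields)` that rewrites each field via `f.evolve(...)`. A simplified hand-written analogue, leaving out the env-var lookup, shows where the generated `__auto_env` plugs in:

import attr

def auto_env(_cls: type, fields: list) -> list:
  # Mirrors the generated body: evolve each attrs field so its metadata carries an env key and a description
  return [f.evolve(metadata={'env': f.metadata.get('env', f.name.upper()), 'description': f.metadata.get('description', '(not provided)')}) for f in fields]

@attr.define(field_transformer=auto_env)
class Example:
  temperature: float = 0.9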

View File

@@ -25,19 +25,29 @@ AnyCallable = t.Callable[..., t.Any]
FC = t.TypeVar('FC', bound=t.Union[AnyCallable, click.Command])
__all__ = [
'FC', 'attrs_to_options', 'Field', 'parse_type', 'is_typing', 'is_literal', 'ModuleType', 'EnumChoice', 'LiteralChoice', 'allows_multiple',
'is_mapping', 'is_container', 'parse_container_args', 'parse_single_arg', 'CUDA', 'JsonType', 'BytesType'
'FC',
'attrs_to_options',
'Field',
'parse_type',
'is_typing',
'is_literal',
'ModuleType',
'EnumChoice',
'LiteralChoice',
'allows_multiple',
'is_mapping',
'is_container',
'parse_container_args',
'parse_single_arg',
'CUDA',
'JsonType',
'BytesType'
]
def __dir__() -> list[str]:
return sorted(__all__)
def attrs_to_options(name: str,
field: attr.Attribute[t.Any],
model_name: str,
typ: t.Any = None,
suffix_generation: bool = False,
suffix_sampling: bool = False) -> t.Callable[[FC], FC]:
def attrs_to_options(name: str, field: attr.Attribute[t.Any], model_name: str, typ: t.Any = None, suffix_generation: bool = False, suffix_sampling: bool = False) -> t.Callable[[FC], FC]:
# TODO: support parsing nested attrs class and Union
envvar = field.metadata['env']
dasherized = inflection.dasherize(name)

View File

@@ -142,8 +142,17 @@ def is_tf_available() -> bool:
_tf_version = None
if USE_TF in ENV_VARS_TRUE_AND_AUTO_VALUES and USE_TORCH not in ENV_VARS_TRUE_VALUES:
if _tf_available:
candidates = ('tensorflow', 'tensorflow-cpu', 'tensorflow-gpu', 'tf-nightly', 'tf-nightly-cpu', 'tf-nightly-gpu', 'intel-tensorflow',
'intel-tensorflow-avx512', 'tensorflow-rocm', 'tensorflow-macos', 'tensorflow-aarch64',
candidates = ('tensorflow',
'tensorflow-cpu',
'tensorflow-gpu',
'tf-nightly',
'tf-nightly-cpu',
'tf-nightly-gpu',
'intel-tensorflow',
'intel-tensorflow-avx512',
'tensorflow-rocm',
'tensorflow-macos',
'tensorflow-aarch64',
)
_tf_version = None
# For the metadata, we have to look for both tensorflow and tensorflow-cpu
@@ -282,20 +291,13 @@ You can install it with pip: `pip install fairscale`. Please note that you may n
your runtime after installation.
'''
BACKENDS_MAPPING: BackendOrderedDict = OrderedDict([('flax', (is_flax_available, FLAX_IMPORT_ERROR)),
('tf', (is_tf_available, TENSORFLOW_IMPORT_ERROR)),
('torch', (is_torch_available, PYTORCH_IMPORT_ERROR)),
('vllm', (is_vllm_available, VLLM_IMPORT_ERROR)),
('cpm_kernels', (is_cpm_kernels_available, CPM_KERNELS_IMPORT_ERROR)),
('einops', (is_einops_available, EINOPS_IMPORT_ERROR)),
('triton', (is_triton_available, TRITON_IMPORT_ERROR)),
('datasets', (is_datasets_available, DATASETS_IMPORT_ERROR)),
('peft', (is_peft_available, PEFT_IMPORT_ERROR)),
('bitsandbytes', (is_bitsandbytes_available, BITSANDBYTES_IMPORT_ERROR)),
('auto-gptq', (is_autogptq_available, AUTOGPTQ_IMPORT_ERROR)),
('sentencepiece', (is_sentencepiece_available, SENTENCEPIECE_IMPORT_ERROR)),
('xformers', (is_xformers_available, XFORMERS_IMPORT_ERROR)),
('fairscale', (is_fairscale_available, FAIRSCALE_IMPORT_ERROR))])
BACKENDS_MAPPING: BackendOrderedDict = OrderedDict([('flax', (is_flax_available, FLAX_IMPORT_ERROR)), ('tf', (is_tf_available, TENSORFLOW_IMPORT_ERROR)),
('torch', (is_torch_available, PYTORCH_IMPORT_ERROR)), ('vllm', (is_vllm_available, VLLM_IMPORT_ERROR)),
('cpm_kernels', (is_cpm_kernels_available, CPM_KERNELS_IMPORT_ERROR)), ('einops', (is_einops_available, EINOPS_IMPORT_ERROR)),
('triton', (is_triton_available, TRITON_IMPORT_ERROR)), ('datasets', (is_datasets_available, DATASETS_IMPORT_ERROR)),
('peft', (is_peft_available, PEFT_IMPORT_ERROR)), ('bitsandbytes', (is_bitsandbytes_available, BITSANDBYTES_IMPORT_ERROR)),
('auto-gptq', (is_autogptq_available, AUTOGPTQ_IMPORT_ERROR)), ('sentencepiece', (is_sentencepiece_available, SENTENCEPIECE_IMPORT_ERROR)),
('xformers', (is_xformers_available, XFORMERS_IMPORT_ERROR)), ('fairscale', (is_fairscale_available, FAIRSCALE_IMPORT_ERROR))])
class DummyMetaclass(abc.ABCMeta):
'''Metaclass for dummy object.
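
`BACKENDS_MAPPING` pairs each optional backend with an availability checker and an import-error message. How `require_backends` consumes it is not shown in this hunk, but a small, hypothetical helper built on the same structure would look like this (the function name and return shape are illustrative):

def report_available(mapping: 'OrderedDict[str, tuple[t.Callable[[], bool], str]]') -> dict[str, bool]:
  # Evaluate every availability checker once; the error message part of each pair is ignored here
  return {name: checker() for name, (checker, _msg) in mapping.items()}

For example, `report_available(BACKENDS_MAPPING)` would return a name-to-bool map of which backends can be imported.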

View File

@@ -126,10 +126,9 @@ class LazyModule(types.ModuleType):
}
if name in dunder_to_metadata:
if name not in {'__version_info__', '__copyright__', '__version__'}:
warnings.warn(
f"Accessing '{self._name}.{name}' is deprecated. Please consider using 'importlib.metadata' directly to query for openllm packaging metadata.",
DeprecationWarning,
stacklevel=2)
warnings.warn(f"Accessing '{self._name}.{name}' is deprecated. Please consider using 'importlib.metadata' directly to query for openllm packaging metadata.",
DeprecationWarning,
stacklevel=2)
meta = importlib.metadata.metadata('openllm')
project_url = dict(url.split(', ') for url in t.cast(t.List[str], meta.get_all('Project-URL')))
if name == '__license__': return 'Apache-2.0'
@@ -146,9 +145,7 @@ class LazyModule(types.ModuleType):
if '__openllm_migration__' in self._objects:
cur_value = self._objects['__openllm_migration__'].get(name, _sentinel)
if cur_value is not _sentinel:
warnings.warn(f"'{name}' is deprecated and will be removed in future version. Make sure to use '{cur_value}' instead",
DeprecationWarning,
stacklevel=3)
warnings.warn(f"'{name}' is deprecated and will be removed in future version. Make sure to use '{cur_value}' instead", DeprecationWarning, stacklevel=3)
return getattr(self, cur_value)
if name in self._objects: return self._objects.__getitem__(name)
if name in self._modules: value = self._get_module(name)

View File

@@ -129,9 +129,7 @@ else:
try:
if not openllm_core.utils.is_vllm_available(): raise exceptions.MissingDependencyError
except exceptions.MissingDependencyError:
_import_structure["utils.dummy_vllm_objects"] = [
name for name in dir(utils.dummy_vllm_objects) if not name.startswith("_") and name not in ("annotations",)
]
_import_structure["utils.dummy_vllm_objects"] = [name for name in dir(utils.dummy_vllm_objects) if not name.startswith("_") and name not in ("annotations",)]
else:
_import_structure["models.baichuan"].extend(["VLLMBaichuan"])
_import_structure["models.llama"].extend(["VLLMLlama"])
@@ -157,9 +155,7 @@ else:
try:
if not openllm_core.utils.is_flax_available(): raise exceptions.MissingDependencyError
except exceptions.MissingDependencyError:
_import_structure["utils.dummy_flax_objects"] = [
name for name in dir(utils.dummy_flax_objects) if not name.startswith("_") and name not in ("annotations",)
]
_import_structure["utils.dummy_flax_objects"] = [name for name in dir(utils.dummy_flax_objects) if not name.startswith("_") and name not in ("annotations",)]
else:
_import_structure["models.flan_t5"].extend(["FlaxFlanT5"])
_import_structure["models.opt"].extend(["FlaxOPT"])
@@ -171,9 +167,7 @@ else:
try:
if not openllm_core.utils.is_tf_available(): raise exceptions.MissingDependencyError
except exceptions.MissingDependencyError:
_import_structure["utils.dummy_tf_objects"] = [
name for name in dir(utils.dummy_tf_objects) if not name.startswith("_") and name not in ("annotations",)
]
_import_structure["utils.dummy_tf_objects"] = [name for name in dir(utils.dummy_tf_objects) if not name.startswith("_") and name not in ("annotations",)]
else:
_import_structure["models.flan_t5"].extend(["TFFlanT5"])
_import_structure["models.opt"].extend(["TFOPT"])
@@ -184,15 +178,7 @@ else:
from .models.opt import TFOPT as TFOPT
# NOTE: update this to sys.modules[__name__] once mypy_extensions can recognize __spec__
__lazy = openllm_core.utils.LazyModule(__name__,
globals()["__file__"],
_import_structure,
extra_objects={
"COMPILED": COMPILED,
"__openllm_migration__": {
"LLMEmbeddings": "EmbeddingsOutput"
}
})
__lazy = openllm_core.utils.LazyModule(__name__, globals()["__file__"], _import_structure, extra_objects={"COMPILED": COMPILED, "__openllm_migration__": {"LLMEmbeddings": "EmbeddingsOutput"}})
__all__ = __lazy.__all__
__dir__ = __lazy.__dir__
__getattr__ = __lazy.__getattr__

View File

@@ -99,10 +99,7 @@ def make_llm_attributes(cls: type[LLM[M, T]]) -> t.Callable[[type[LLM[M, T]]], N
# _cached_LLMFunction_get and _cached_LLMSerialisation_get
globs.update({f'_cached_{cl_.__name__}_get': _object_getattribute.__get__(cl_) for cl_ in {LLMSerialisation, LLMFunction}})
# llm_post_init implementation
lines: ListStr = [
f'_impl_{cls.__name__}_func=cls.llm_post_init',
_setattr_class('llm_post_init', f'__wrapped_llm_post_init(_impl_{cls.__name__}_func)')
]
lines: ListStr = [f'_impl_{cls.__name__}_func=cls.llm_post_init', _setattr_class('llm_post_init', f'__wrapped_llm_post_init(_impl_{cls.__name__}_func)')]
serialisation_attr = {'import_model': import_model, 'load_model': load_model, 'load_tokenizer': load_tokenizer,}
for func, impl in serialisation_attr.items():
@@ -114,10 +111,7 @@ def make_llm_attributes(cls: type[LLM[M, T]]) -> t.Callable[[type[LLM[M, T]]], N
# assign vLLM implementation
if cls.__llm_backend__ == 'vllm':
vllm_func = {
f'_vllm_{it}': fn
for it, fn in zip(('generate', 'generate_iterator', 'postprocess_generate'), (vllm_generate, vllm_generate_iterator, vllm_postprocess_generate))
}
vllm_func = {f'_vllm_{it}': fn for it, fn in zip(('generate', 'generate_iterator', 'postprocess_generate'), (vllm_generate, vllm_generate_iterator, vllm_postprocess_generate))}
globs.update(vllm_func)
lines.extend([_setattr_class(it[6:], it) for it in vllm_func])
@@ -137,15 +131,7 @@ def make_llm_attributes(cls: type[LLM[M, T]]) -> t.Callable[[type[LLM[M, T]]], N
bool_attr = {it[15:-2] for it in interface_anns if it.startswith('__llm_supports_')}
lines.extend([_setattr_class(dunder_support(fn), f"cls.{fn} is not _cached_LLMFunction_get('{fn}')") for fn in bool_attr])
return codegen.generate_function(cls,
'__assign_llm_attr',
lines,
args=('cls', *args),
globs=globs,
annotations={
'cls': 't.Type[LLM]',
'return': None
})
return codegen.generate_function(cls, '__assign_llm_attr', lines, args=('cls', *args), globs=globs, annotations={'cls': 't.Type[LLM]', 'return': None})
def vllm_postprocess_generate(self: LLM['vllm.LLMEngine', T], prompt: str, generation_result: list[dict[str, t.Any]], **_: t.Any) -> str:
return generation_result[0]['outputs'][0]['text']

View File

@@ -25,8 +25,8 @@ def get_or_download(ids: str = _BENTOMODEL_ID) -> bentoml.Model:
return bentoml.transformers.get(ids)
except bentoml.exceptions.NotFound:
model_signatures = {
k: ModelSignature(batchable=False) for k in ('forward', 'generate', 'contrastive_search', 'greedy_search', 'sample', 'beam_search',
'beam_sample', 'group_beam_search', 'constrained_beam_search', '__call__')
k: ModelSignature(batchable=False)
for k in ('forward', 'generate', 'contrastive_search', 'greedy_search', 'sample', 'beam_search', 'beam_sample', 'group_beam_search', 'constrained_beam_search', '__call__')
}
with bentoml.models.create(ids,
module=MODULE_NAME,
@@ -34,8 +34,7 @@ def get_or_download(ids: str = _BENTOMODEL_ID) -> bentoml.Model:
options=ModelOptions(),
context=openllm.utils.generate_context(framework_name='transformers'),
labels={
'runtime': 'pt',
'framework': 'openllm'
'runtime': 'pt', 'framework': 'openllm'
},
signatures=model_signatures) as bentomodel:
snapshot_download(_GENERIC_EMBEDDING_ID,

View File

@@ -14,8 +14,7 @@ LogitsProcessorList = transformers.LogitsProcessorList
StoppingCriteriaList = transformers.StoppingCriteriaList
class StopSequenceCriteria(transformers.StoppingCriteria):
def __init__(self, stop_sequences: str | list[str],
tokenizer: transformers.PreTrainedTokenizer | transformers.PreTrainedTokenizerBase | transformers.PreTrainedTokenizerFast):
def __init__(self, stop_sequences: str | list[str], tokenizer: transformers.PreTrainedTokenizer | transformers.PreTrainedTokenizerBase | transformers.PreTrainedTokenizerFast):
if isinstance(stop_sequences, str): stop_sequences = [stop_sequences]
self.stop_sequences, self.tokenizer = stop_sequences, tokenizer
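
`StopSequenceCriteria` is a regular `transformers.StoppingCriteria` subclass; its `__call__` (not shown in this hunk) is what checks the generated text against the stop strings. A hedged usage sketch with the stock `generate` API, assuming the class is importable from this module and using an illustrative checkpoint:

import transformers

tokenizer = transformers.AutoTokenizer.from_pretrained('gpt2')
model = transformers.AutoModelForCausalLM.from_pretrained('gpt2')
stopping = transformers.StoppingCriteriaList([StopSequenceCriteria('\nUser:', tokenizer)])
inputs = tokenizer('Assistant:', return_tensors='pt')
out = model.generate(**inputs, max_new_tokens=32, stopping_criteria=stopping)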

View File

@@ -278,12 +278,20 @@ class LLM(LLMInterface[M, T], ReprMixin):
if t.TYPE_CHECKING: __name__: str
if t.TYPE_CHECKING and not MYPY:
def __attrs_init__(self, config: LLMConfig, quantization_config: t.Optional[t.Union[transformers.BitsAndBytesConfig, autogptq.BaseQuantizeConfig]],
model_id: str, model_decls: TupleAny, model_attrs: DictStrAny, tokenizer_attrs: DictStrAny, tag: bentoml.Tag,
adapters_mapping: t.Optional[AdaptersMapping], model_version: t.Optional[str],
quantize_method: t.Optional[t.Literal['int8', 'int4',
'gptq']], serialisation_format: t.Literal['safetensors',
'legacy'], _local: bool, **attrs: t.Any) -> None:
def __attrs_init__(self,
config: LLMConfig,
quantization_config: t.Optional[t.Union[transformers.BitsAndBytesConfig, autogptq.BaseQuantizeConfig]],
model_id: str,
model_decls: TupleAny,
model_attrs: DictStrAny,
tokenizer_attrs: DictStrAny,
tag: bentoml.Tag,
adapters_mapping: t.Optional[AdaptersMapping],
model_version: t.Optional[str],
quantize_method: t.Optional[t.Literal['int8', 'int4', 'gptq']],
serialisation_format: t.Literal['safetensors', 'legacy'],
_local: bool,
**attrs: t.Any) -> None:
'''Generated __attrs_init__ for openllm.LLM.'''
config: LLMConfig
@@ -432,14 +440,11 @@ class LLM(LLMInterface[M, T], ReprMixin):
_local = False
_model_id: str = first_not_none(model_id, os.environ.get(cfg_cls.__openllm_env__['model_id']), default=cfg_cls.__openllm_default_id__)
if validate_is_path(_model_id): _model_id, _local = resolve_filepath(_model_id), True
quantize = first_not_none(quantize,
t.cast(t.Optional[t.Literal['int8', 'int4', 'gptq']], os.environ.get(cfg_cls.__openllm_env__['quantize'])),
default=None)
quantize = first_not_none(quantize, t.cast(t.Optional[t.Literal['int8', 'int4', 'gptq']], os.environ.get(cfg_cls.__openllm_env__['quantize'])), default=None)
# quantization setup
if quantization_config and quantize:
raise ValueError(
"'quantization_config' and 'quantize' are mutually exclusive. Either customise your quantization_config or use the 'quantize' argument.")
raise ValueError("'quantization_config' and 'quantize' are mutually exclusive. Either customise your quantization_config or use the 'quantize' argument.")
if quantization_config is None and quantize is not None:
quantization_config, attrs = infer_quantisation_config(cls, quantize, **attrs)
if quantize == 'gptq': serialisation = 'safetensors'
@@ -465,9 +470,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
if _tag.version is None:
raise ValueError(f'Failed to resolve the correct model version for {cfg_cls.__openllm_start_name__}')
except Exception as err:
raise OpenLLMException(
f"Failed to generate a valid tag for {cfg_cls.__openllm_start_name__} with 'model_id={_model_id}' (lookup to see its traceback):\n{err}"
) from err
raise OpenLLMException(f"Failed to generate a valid tag for {cfg_cls.__openllm_start_name__} with 'model_id={_model_id}' (lookup to see its traceback):\n{err}") from err
return cls(*args,
model_id=_model_id,
@@ -518,9 +521,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
else:
from .serialisation.transformers._helpers import process_config
model_version = getattr(
process_config(model_id,
trust_remote_code=cls.config_class.__openllm_trust_remote_code__,
revision=first_not_none(model_version, default='main'))[0], '_commit_hash', None)
process_config(model_id, trust_remote_code=cls.config_class.__openllm_trust_remote_code__, revision=first_not_none(model_version, default='main'))[0], '_commit_hash', None)
if model_version is None:
raise ValueError(f"Internal errors when parsing config for pretrained '{model_id}' ('commit_hash' not found)")
return f'{tag_name}:{model_version}'
@@ -529,10 +530,18 @@ class LLM(LLMInterface[M, T], ReprMixin):
def generate_tag(cls, *param_decls: t.Any, **attrs: t.Any) -> bentoml.Tag:
return bentoml.Tag.from_taglike(cls._generate_tag_str(*param_decls, **attrs))
def __init__(self, *args: t.Any, model_id: str, llm_config: LLMConfig,
quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None, _adapters_mapping: AdaptersMapping | None,
_tag: bentoml.Tag, _quantize_method: t.Literal['int8', 'int4', 'gptq'] | None, _model_version: str,
_serialisation_format: t.Literal['safetensors', 'legacy'], _local: bool, **attrs: t.Any,
def __init__(self,
*args: t.Any,
model_id: str,
llm_config: LLMConfig,
quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None,
_adapters_mapping: AdaptersMapping | None,
_tag: bentoml.Tag,
_quantize_method: t.Literal['int8', 'int4', 'gptq'] | None,
_model_version: str,
_serialisation_format: t.Literal['safetensors', 'legacy'],
_local: bool,
**attrs: t.Any,
):
'''Initialize the LLM with given pretrained model.
@@ -630,21 +639,27 @@ class LLM(LLMInterface[M, T], ReprMixin):
# parsing tokenizer and model kwargs, as the hierarchy is param pass > default
normalized_model_kwds, normalized_tokenizer_kwds = normalize_attrs_to_model_tokenizer_pair(**attrs)
# NOTE: Save the args and kwargs for later load
self.__attrs_init__(llm_config, quantization_config, model_id, args, {
**model_kwds,
**normalized_model_kwds
}, {
**tokenizer_kwds,
**normalized_tokenizer_kwds
}, _tag, _adapters_mapping, _model_version, _quantize_method, _serialisation_format, _local)
self.__attrs_init__(llm_config,
quantization_config,
model_id,
args, {
**model_kwds, **normalized_model_kwds
}, {
**tokenizer_kwds, **normalized_tokenizer_kwds
},
_tag,
_adapters_mapping,
_model_version,
_quantize_method,
_serialisation_format,
_local)
self.llm_post_init()
def __setattr__(self, attr: str, value: t.Any) -> None:
if attr in _reserved_namespace:
raise ForbiddenAttributeError(
f'{attr} should not be set during runtime as these value will be reflected during runtime. Instead, you can create a custom LLM subclass {self.__class__.__name__}.'
)
f'{attr} should not be set during runtime as these value will be reflected during runtime. Instead, you can create a custom LLM subclass {self.__class__.__name__}.')
super().__setattr__(attr, value)
@property
@@ -738,8 +753,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
model = self.load_model(*self._model_decls, **self._model_attrs)
# If OOM, it probably means you don't have enough VRAM to run this model.
if self.__llm_backend__ == 'pt' and is_torch_available():
loaded_in_kbit = getattr(model, 'is_loaded_in_8bit', False) or getattr(model, 'is_loaded_in_4bit', False) or getattr(
model, 'is_quantized', False)
loaded_in_kbit = getattr(model, 'is_loaded_in_8bit', False) or getattr(model, 'is_loaded_in_4bit', False) or getattr(model, 'is_quantized', False)
if torch.cuda.is_available() and torch.cuda.device_count() == 1 and not loaded_in_kbit:
try:
model = model.to('cuda')
@@ -777,34 +791,22 @@ class LLM(LLMInterface[M, T], ReprMixin):
if name is None:
_converted_first_none = True
name = 'default'
peft_config = default_config.with_config(
**adapter.config).to_peft_config() if name == 'default' else FineTuneConfig(adapter_type=t.cast('PeftType', _adapter_type),
adapter_config=adapter.config,
inference_mode=inference_mode,
llm_config_class=self.config_class).to_peft_config()
peft_config = default_config.with_config(**adapter.config).to_peft_config() if name == 'default' else FineTuneConfig(
adapter_type=t.cast('PeftType', _adapter_type), adapter_config=adapter.config, inference_mode=inference_mode, llm_config_class=self.config_class).to_peft_config()
adapter_map[_adapter_type][name] = (peft_config, adapter.adapter_id)
if self.__llm_adapter_map__ is None and use_cache: self.__llm_adapter_map__ = adapter_map
return adapter_map
def prepare_for_training(self,
adapter_type: AdapterType = 'lora',
use_gradient_checkpointing: bool = True,
**attrs: t.Any) -> tuple[peft.PeftModel, T]:
def prepare_for_training(self, adapter_type: AdapterType = 'lora', use_gradient_checkpointing: bool = True, **attrs: t.Any) -> tuple[peft.PeftModel, T]:
from peft import prepare_model_for_kbit_training
peft_config = self.config['fine_tune_strategies'].get(
adapter_type, FineTuneConfig(adapter_type=t.cast('PeftType', adapter_type),
llm_config_class=self.config_class)).train().with_config(**attrs).to_peft_config()
wrapped_peft = peft.get_peft_model(
prepare_model_for_kbit_training( # type: ignore[no-untyped-call]
self.model, use_gradient_checkpointing=use_gradient_checkpointing), peft_config)
peft_config = self.config['fine_tune_strategies'].get(adapter_type, FineTuneConfig(adapter_type=t.cast('PeftType', adapter_type),
llm_config_class=self.config_class)).train().with_config(**attrs).to_peft_config()
wrapped_peft = peft.get_peft_model(prepare_model_for_kbit_training( # type: ignore[no-untyped-call]
self.model, use_gradient_checkpointing=use_gradient_checkpointing), peft_config)
if DEBUG: wrapped_peft.print_trainable_parameters()
return wrapped_peft, self.tokenizer
def apply_adapter(self,
inference_mode: bool = True,
adapter_type: AdapterType = 'lora',
load_adapters: t.Literal['all'] | list[str] | None = None,
use_cache: bool = True) -> M:
def apply_adapter(self, inference_mode: bool = True, adapter_type: AdapterType = 'lora', load_adapters: t.Literal['all'] | list[str] | None = None, use_cache: bool = True) -> M:
'''Apply given LoRA mapping to the model. Note that the base model can still be accessed via self.model.get_base_model().'''
if self.__llm_model__ is None: raise ValueError('Error: Model is not loaded correctly')
# early out if _adapters_mapping is empty or it is already wrapped with peft.
@@ -828,10 +830,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
adapters_to_load = adapter_mapping.keys() if load_adapters == 'all' else load_adapters
for adapter_name in adapters_to_load:
_peft_config, _peft_model_id = adapter_mapping[adapter_name]
t.cast(peft.PeftModel, self.__llm_model__).load_adapter(_peft_model_id,
adapter_name=adapter_name,
is_trainable=not inference_mode,
**dict(_peft_config.to_dict()))
t.cast(peft.PeftModel, self.__llm_model__).load_adapter(_peft_model_id, adapter_name=adapter_name, is_trainable=not inference_mode, **dict(_peft_config.to_dict()))
return self.__llm_model__
@@ -848,8 +847,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
# the below shares similar logic with `get_peft_model`
# TODO: Support PromptLearningConfig
if default_config.task_type not in peft.MODEL_TYPE_TO_PEFT_MODEL_MAPPING.keys() and not isinstance(default_config, peft.PromptLearningConfig):
logger.debug("Given task type '%s' is not supported by peft. Make sure the adapter is loaded manually before running inference.",
default_config.task_type)
logger.debug("Given task type '%s' is not supported by peft. Make sure the adapter is loaded manually before running inference.", default_config.task_type)
model = peft.PeftModel(self.__llm_model__, default_config)
else:
# XXX: this is not ideal to serialize like this, maybe for fine-tune we will only support 0.4.0
@@ -1041,42 +1039,21 @@ class LLM(LLMInterface[M, T], ReprMixin):
# Prevent yielding partial stop sequence
if not partially_stopped:
yield {
'text': output,
'usage': {
'prompt_tokens': input_echo_len,
'completion_tokens': i,
'total_tokens': input_echo_len + i
},
'finish_reason': None
}
yield {'text': output, 'usage': {'prompt_tokens': input_echo_len, 'completion_tokens': i, 'total_tokens': input_echo_len + i}, 'finish_reason': None}
if stopped: break
# Finish stream event, which contains finish reason
if i == self.config['max_new_tokens'] - 1: finish_reason = 'length'
elif stopped: finish_reason = 'stop'
else: finish_reason = None
yield {
'text': output,
'usage': {
'prompt_tokens': input_echo_len,
'completion_tokens': i,
'total_tokens': input_echo_len + i
},
'finish_reason': finish_reason
}
yield {'text': output, 'usage': {'prompt_tokens': input_echo_len, 'completion_tokens': i, 'total_tokens': input_echo_len + i}, 'finish_reason': finish_reason}
# Clean
del past_key_values, out
gc.collect()
torch.cuda.empty_cache()
@overload
def Runner(model_name: str,
*,
model_id: str | None = None,
model_version: str | None = ...,
init_local: t.Literal[False, True] = ...,
**attrs: t.Any) -> LLMRunner[t.Any, t.Any]:
def Runner(model_name: str, *, model_id: str | None = None, model_version: str | None = ..., init_local: t.Literal[False, True] = ..., **attrs: t.Any) -> LLMRunner[t.Any, t.Any]:
...
@overload
@@ -1158,10 +1135,7 @@ def Runner(model_name: str,
'serialisation': first_not_none(os.environ.get('OPENLLM_SERIALIZATION'), attrs.get('serialisation'), default='safetensors')
})
backend = t.cast(
LiteralBackend,
first_not_none(backend,
default=EnvVarMixin(model_name, backend=llm_config.default_backend() if llm_config is not None else 'pt')['backend_value']))
backend = t.cast(LiteralBackend, first_not_none(backend, default=EnvVarMixin(model_name, backend=llm_config.default_backend() if llm_config is not None else 'pt')['backend_value']))
if init_local: ensure_available = True
runner = infer_auto_class(backend).create_runner(model_name, llm_config=llm_config, ensure_available=ensure_available, **attrs)
if init_local: runner.init_local(quiet=True)
@@ -1174,8 +1148,7 @@ class SetAdapterOutput(t.TypedDict):
success: bool
message: str
def llm_runnable_class(self: LLM[M, T], embeddings_sig: ModelSignature, generate_sig: ModelSignature,
generate_iterator_sig: ModelSignature) -> type[LLMRunnable[M, T]]:
def llm_runnable_class(self: LLM[M, T], embeddings_sig: ModelSignature, generate_sig: ModelSignature, generate_iterator_sig: ModelSignature) -> type[LLMRunnable[M, T]]:
class _Runnable(bentoml.Runnable):
SUPPORTED_RESOURCES = ('nvidia.com/gpu', 'amd.com/gpu', 'cpu')
SUPPORTS_CPU_MULTI_THREADING = True
@@ -1234,7 +1207,8 @@ def llm_runnable_class(self: LLM[M, T], embeddings_sig: ModelSignature, generate
return ' '.join(output_text) + ' '
return types.new_class(
self.__class__.__name__ + 'Runnable', (_Runnable,), {}, lambda ns: ns.update({
self.__class__.__name__ + 'Runnable', (_Runnable,), {},
lambda ns: ns.update({
'SUPPORTED_RESOURCES': ('nvidia.com/gpu', 'amd.com/gpu') if self.config['requires_gpu'] else ('nvidia.com/gpu', 'amd.com/gpu', 'cpu'),
'__module__': self.__module__,
'__doc__': self.config['env'].start_docstring
@@ -1281,12 +1255,7 @@ def llm_runner_class(self: LLM[M, T]) -> type[LLMRunner[M, T]]:
return {'config', 'llm_type', 'runner_methods', 'backend', 'llm_tag'}
def _wrapped_repr_args(__self: LLMRunner[M, T]) -> ReprArgs:
yield 'runner_methods', {
method.name: {
'batchable': method.config.batchable,
'batch_dim': method.config.batch_dim if method.config.batchable else None
} for method in __self.runner_methods
}
yield 'runner_methods', {method.name: {'batchable': method.config.batchable, 'batch_dim': method.config.batch_dim if method.config.batchable else None} for method in __self.runner_methods}
yield 'config', self.config.model_dump(flatten=True)
yield 'llm_type', __self.llm_type
yield 'backend', self.__llm_backend__
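
Tying the pieces above together, `Runner` is the public entry point whose overload appears earlier in this hunk; a hedged example, with an illustrative model id and `init_local=True` to load the weights in-process:

import openllm

runner = openllm.Runner('opt', model_id='facebook/opt-125m', init_local=True)
print(runner.llm_type)  # one of the repr keys exposed by llm_runner_class above (assumed here to be a plain attribute)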

View File

@@ -15,25 +15,21 @@ if t.TYPE_CHECKING:
from ._llm import LLM
autogptq, torch, transformers = LazyLoader('autogptq', globals(),
'auto_gptq'), LazyLoader('torch', globals(), 'torch'), LazyLoader('transformers', globals(), 'transformers')
autogptq, torch, transformers = LazyLoader('autogptq', globals(), 'auto_gptq'), LazyLoader('torch', globals(), 'torch'), LazyLoader('transformers', globals(), 'transformers')
logger = logging.getLogger(__name__)
QuantiseMode = t.Literal['int8', 'int4', 'gptq']
@overload
def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: t.Literal['int8', 'int4'],
**attrs: t.Any) -> tuple[transformers.BitsAndBytesConfig, DictStrAny]:
def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: t.Literal['int8', 'int4'], **attrs: t.Any) -> tuple[transformers.BitsAndBytesConfig, DictStrAny]:
...
@overload
def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: t.Literal['gptq'],
**attrs: t.Any) -> tuple[autogptq.BaseQuantizeConfig, DictStrAny]:
def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: t.Literal['gptq'], **attrs: t.Any) -> tuple[autogptq.BaseQuantizeConfig, DictStrAny]:
...
def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: QuantiseMode,
**attrs: t.Any) -> tuple[transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig, DictStrAny]:
def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: QuantiseMode, **attrs: t.Any) -> tuple[transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig, DictStrAny]:
# 8 bit configuration
int8_threshold = attrs.pop('llm_int8_threshhold', 6.0)
int8_enable_fp32_cpu_offload = attrs.pop('llm_int8_enable_fp32_cpu_offload', False)
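
Per the overloads above, 'int8'/'int4' produce a `transformers.BitsAndBytesConfig` while 'gptq' produces an auto-gptq `BaseQuantizeConfig`, alongside any attrs that were not consumed. Roughly what the 'int8' branch is expected to build from the defaults popped here, shown as a plain construction rather than the exact internal call:

import transformers

quant_config = transformers.BitsAndBytesConfig(load_in_8bit=True, llm_int8_threshold=6.0, llm_int8_enable_fp32_cpu_offload=False)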

View File

@@ -21,8 +21,7 @@ if t.TYPE_CHECKING:
from bentoml._internal.runner.runner import AbstractRunner
from bentoml._internal.runner.runner import RunnerMethod
from openllm_core._typing_compat import TypeAlias
_EmbeddingMethod: TypeAlias = RunnerMethod[t.Union[bentoml.Runnable, openllm.LLMRunnable[t.Any, t.Any]], [t.List[str]],
t.Sequence[openllm.EmbeddingsOutput]]
_EmbeddingMethod: TypeAlias = RunnerMethod[t.Union[bentoml.Runnable, openllm.LLMRunnable[t.Any, t.Any]], [t.List[str]], t.Sequence[openllm.EmbeddingsOutput]]
# The following warnings are from bitsandbytes, and are probably not that important for users to see
warnings.filterwarnings('ignore', message='MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization')
@@ -44,12 +43,7 @@ svc = bentoml.Service(name=f"llm-{llm_config['start_name']}-service", runners=ru
_JsonInput = bentoml.io.JSON.from_sample({'prompt': '', 'llm_config': llm_config.model_dump(flatten=True), 'adapter_name': None})
@svc.api(route='/v1/generate',
input=_JsonInput,
output=bentoml.io.JSON.from_sample({
'responses': [],
'configuration': llm_config.model_dump(flatten=True)
}))
@svc.api(route='/v1/generate', input=_JsonInput, output=bentoml.io.JSON.from_sample({'responses': [], 'configuration': llm_config.model_dump(flatten=True)}))
async def generate_v1(input_dict: dict[str, t.Any]) -> openllm.GenerationOutput:
qa_inputs = openllm.GenerationInput.from_llm_config(llm_config)(**input_dict)
config = qa_inputs.llm_config.model_dump()
@@ -86,11 +80,32 @@ def metadata_v1(_: str) -> openllm.MetadataOutput:
input=bentoml.io.JSON.from_sample(['Hey Jude, welcome to the jungle!', 'What is the meaning of life?']),
output=bentoml.io.JSON.from_sample({
'embeddings': [
0.007917795330286026, -0.014421648345887661, 0.00481307040899992, 0.007331526838243008, -0.0066398633643984795, 0.00945580005645752,
0.0087016262114048, -0.010709521360695362, 0.012635177001357079, 0.010541186667978764, -0.00730888033285737, -0.001783102168701589,
0.02339819073677063, -0.010825827717781067, -0.015888236463069916, 0.01876218430697918, 0.0076906150206923485, 0.0009032754460349679,
-0.010024012066423893, 0.01090280432254076, -0.008668390102684498, 0.02070549875497818, 0.0014594447566196322, -0.018775740638375282,
-0.014814382418990135, 0.01796768605709076
0.007917795330286026,
-0.014421648345887661,
0.00481307040899992,
0.007331526838243008,
-0.0066398633643984795,
0.00945580005645752,
0.0087016262114048,
-0.010709521360695362,
0.012635177001357079,
0.010541186667978764,
-0.00730888033285737,
-0.001783102168701589,
0.02339819073677063,
-0.010825827717781067,
-0.015888236463069916,
0.01876218430697918,
0.0076906150206923485,
0.0009032754460349679,
-0.010024012066423893,
0.01090280432254076,
-0.008668390102684498,
0.02070549875497818,
0.0014594447566196322,
-0.018775740638375282,
-0.014814382418990135,
0.01796768605709076
],
'num_tokens': 20
}))
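
The /v1/generate endpoint declared above takes the sampled JSON shape (`prompt`, `llm_config`, `adapter_name`) and returns `responses` plus the configuration. A hedged client-side call, assuming the server runs on BentoML's default HTTP port 3000:

import httpx

payload = {'prompt': 'What is the meaning of life?', 'llm_config': {}, 'adapter_name': None}
resp = httpx.post('http://localhost:3000/v1/generate', json=payload, timeout=60)
print(resp.json()['responses'])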

View File

@@ -63,11 +63,7 @@ def build_editable(path: str, package: t.Literal['openllm', 'openllm_core', 'ope
return builder.build('wheel', path, config_settings={'--global-option': '--quiet'})
raise RuntimeError('Custom OpenLLM build is currently not supported. Please install OpenLLM from PyPI or built it from Git source.')
def construct_python_options(llm: openllm.LLM[t.Any, t.Any],
llm_fs: FS,
extra_dependencies: tuple[str, ...] | None = None,
adapter_map: dict[str, str | None] | None = None,
) -> PythonOptions:
def construct_python_options(llm: openllm.LLM[t.Any, t.Any], llm_fs: FS, extra_dependencies: tuple[str, ...] | None = None, adapter_map: dict[str, str | None] | None = None,) -> PythonOptions:
packages = ['openllm', 'scipy'] # apparently bnb misses this one
if adapter_map is not None: packages += ['openllm[fine-tune]']
# NOTE: add openllm to the default dependencies
@@ -90,8 +86,16 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any],
elif backend_envvar == 'tf':
if not openllm_core.utils.is_tf_available():
raise ValueError(f"TensorFlow is not available, while {env.backend} is set to 'tf'")
candidates = ('tensorflow', 'tensorflow-cpu', 'tensorflow-gpu', 'tf-nightly', 'tf-nightly-cpu', 'tf-nightly-gpu', 'intel-tensorflow',
'intel-tensorflow-avx512', 'tensorflow-rocm', 'tensorflow-macos',
candidates = ('tensorflow',
'tensorflow-cpu',
'tensorflow-gpu',
'tf-nightly',
'tf-nightly-cpu',
'tf-nightly-gpu',
'intel-tensorflow',
'intel-tensorflow-avx512',
'tensorflow-rocm',
'tensorflow-macos',
)
# For the metadata, we have to look for both tensorflow and tensorflow-cpu
for candidate in candidates:
@@ -109,10 +113,8 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any],
raise ValueError('PyTorch is not available. Make sure to have it locally installed.')
packages.extend([f'torch>={importlib.metadata.version("torch")}'])
wheels: list[str] = []
built_wheels: list[str | None] = [
build_editable(llm_fs.getsyspath('/'), t.cast(t.Literal['openllm', 'openllm_core', 'openllm_client'], p))
for p in ('openllm_core', 'openllm_client', 'openllm')
]
built_wheels: list[str |
None] = [build_editable(llm_fs.getsyspath('/'), t.cast(t.Literal['openllm', 'openllm_core', 'openllm_client'], p)) for p in ('openllm_core', 'openllm_client', 'openllm')]
if all(i for i in built_wheels):
wheels.extend([llm_fs.getsyspath(f"/{i.split('/')[-1]}") for i in t.cast(t.List[str], built_wheels)])
return PythonOptions(packages=packages,
@@ -120,9 +122,14 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any],
lock_packages=False,
extra_index_url=['https://download.pytorch.org/whl/cu118', 'https://huggingface.github.io/autogptq-index/whl/cu118/'])
def construct_docker_options(llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_resource: float, quantize: LiteralString | None,
adapter_map: dict[str, str | None] | None, dockerfile_template: str | None,
serialisation_format: t.Literal['safetensors', 'legacy'], container_registry: LiteralContainerRegistry,
def construct_docker_options(llm: openllm.LLM[t.Any, t.Any],
_: FS,
workers_per_resource: float,
quantize: LiteralString | None,
adapter_map: dict[str, str | None] | None,
dockerfile_template: str | None,
serialisation_format: t.Literal['safetensors', 'legacy'],
container_registry: LiteralContainerRegistry,
container_version_strategy: LiteralContainerVersionStrategy) -> DockerOptions:
from openllm.cli._factory import parse_config_options
environ = parse_config_options(llm.config, llm.config['timeout'], workers_per_resource, None, True, os.environ.copy())
@@ -145,9 +152,7 @@ def construct_docker_options(llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_
_env = openllm_core.utils.EnvVarMixin(llm.config['model_name'], quantize=quantize)
if _env['quantize_value'] is not None: env_dict[_env.quantize] = t.cast(str, _env['quantize_value'])
return DockerOptions(base_image=f'{oci.CONTAINER_NAMES[container_registry]}:{oci.get_base_container_tag(container_version_strategy)}',
env=env_dict,
dockerfile_template=dockerfile_template)
return DockerOptions(base_image=f'{oci.CONTAINER_NAMES[container_registry]}:{oci.get_base_container_tag(container_version_strategy)}', env=env_dict, dockerfile_template=dockerfile_template)
OPENLLM_MODEL_NAME = '# openllm: model name'
OPENLLM_MODEL_ADAPTER_MAP = '# openllm: model adapter map'
@@ -188,8 +193,7 @@ def write_service(llm: openllm.LLM[t.Any, t.Any], adapter_map: dict[str, str | N
if OPENLLM_MODEL_NAME in it:
src_contents[src_contents.index(it)] = (ModelNameFormatter(model_name).vformat(it)[:-(len(OPENLLM_MODEL_NAME) + 3)] + '\n')
elif OPENLLM_MODEL_ADAPTER_MAP in it:
src_contents[src_contents.index(it)] = (
ModelAdapterMapFormatter(orjson.dumps(adapter_map).decode()).vformat(it)[:-(len(OPENLLM_MODEL_ADAPTER_MAP) + 3)] + '\n')
src_contents[src_contents.index(it)] = (ModelAdapterMapFormatter(orjson.dumps(adapter_map).decode()).vformat(it)[:-(len(OPENLLM_MODEL_ADAPTER_MAP) + 3)] + '\n')
script = f"# GENERATED BY 'openllm build {model_name}'. DO NOT EDIT\n\n" + ''.join(src_contents)
if DEBUG: logger.info('Generated script:\n%s', script)
llm_fs.writetext(llm.config['service_name'], script)
@@ -210,13 +214,7 @@ def create_bento(bento_tag: bentoml.Tag,
_model_store: ModelStore = Provide[BentoMLContainer.model_store]) -> bentoml.Bento:
backend_envvar = llm.config['env']['backend_value']
labels = dict(llm.identifying_params)
labels.update({
'_type': llm.llm_type,
'_framework': backend_envvar,
'start_name': llm.config['start_name'],
'base_name_or_path': llm.model_id,
'bundler': 'openllm.bundle'
})
labels.update({'_type': llm.llm_type, '_framework': backend_envvar, 'start_name': llm.config['start_name'], 'base_name_or_path': llm.model_id, 'bundler': 'openllm.bundle'})
if adapter_map: labels.update(adapter_map)
if isinstance(workers_per_resource, str):
if workers_per_resource == 'round_robin': workers_per_resource = 1.0
@@ -242,8 +240,15 @@ def create_bento(bento_tag: bentoml.Tag,
exclude=['/venv', '/.venv', '__pycache__/', '*.py[cod]', '*$py.class'],
python=construct_python_options(llm, llm_fs, extra_dependencies, adapter_map),
models=[llm_spec],
docker=construct_docker_options(llm, llm_fs, workers_per_resource, quantize, adapter_map, dockerfile_template,
serialisation_format, container_registry, container_version_strategy))
docker=construct_docker_options(llm,
llm_fs,
workers_per_resource,
quantize,
adapter_map,
dockerfile_template,
serialisation_format,
container_registry,
container_version_strategy))
bento = bentoml.Bento.create(build_config=build_config, version=bento_tag.version, build_ctx=llm_fs.getsyspath('/'))
# NOTE: the model_id_path here is only used for setting this environment variable within the container built for BentoLLM.

View File

@@ -42,11 +42,7 @@ ROOT_DIR = pathlib.Path(os.path.abspath('__file__')).parent.parent.parent
# but in the future, we can infer based on the git repo and related metadata to give users more options
# for building the base image. For now, all of the base images will be <registry>/bentoml/openllm:...
# NOTE: The ECR registry is the public one, and currently only the @bentoml team has access to push to it.
_CONTAINER_REGISTRY: dict[LiteralContainerRegistry, str] = {
'docker': 'docker.io/bentoml/openllm',
'gh': 'ghcr.io/bentoml/openllm',
'ecr': 'public.ecr.aws/y5w8i4y6/bentoml/openllm'
}
_CONTAINER_REGISTRY: dict[LiteralContainerRegistry, str] = {'docker': 'docker.io/bentoml/openllm', 'gh': 'ghcr.io/bentoml/openllm', 'ecr': 'public.ecr.aws/y5w8i4y6/bentoml/openllm'}
# TODO: support custom forks. Currently it only supports openllm main.
_OWNER = 'bentoml'
@@ -82,9 +78,7 @@ def nightly_resolver(cls: type[RefResolver]) -> str:
commits = t.cast('list[dict[str, t.Any]]', cls._ghapi.repos.list_commits(since=_commit_time_range()))
return next(f'sha-{it["sha"][:7]}' for it in commits if '[skip ci]' not in it['commit']['message'])
# now is the correct behaviour
return orjson.loads(
subprocess.check_output([docker_bin, 'run', '--rm', '-it', 'quay.io/skopeo/stable:latest', 'list-tags',
'docker://ghcr.io/bentoml/openllm']).decode().strip())['Tags'][-2]
return orjson.loads(subprocess.check_output([docker_bin, 'run', '--rm', '-it', 'quay.io/skopeo/stable:latest', 'list-tags', 'docker://ghcr.io/bentoml/openllm']).decode().strip())['Tags'][-2]
@attr.attrs(eq=False, order=False, slots=True, frozen=True)
class RefResolver:
@@ -142,9 +136,7 @@ def build_container(registries: LiteralContainerRegistry | t.Sequence[LiteralCon
try:
if not _BUILDER.health(): raise openllm.exceptions.Error
except (openllm.exceptions.Error, subprocess.CalledProcessError):
raise RuntimeError(
'Building base container requires BuildKit (via Buildx) to be installed. See https://docs.docker.com/build/buildx/install/ for installation instructions.'
) from None
raise RuntimeError('Building base container requires BuildKit (via Buildx) to be installed. See https://docs.docker.com/build/buildx/install/ for installation instructions.') from None
if openllm_core.utils.device_count() == 0:
raise RuntimeError('Building base container requires GPUs (None available)')
if not shutil.which('nvidia-container-runtime'):
@@ -153,8 +145,7 @@ def build_container(registries: LiteralContainerRegistry | t.Sequence[LiteralCon
raise RuntimeError("Failed to determine source location of 'openllm'. (Possible broken installation)")
pyproject_path = pathlib.Path(_module_location).parent.parent / 'pyproject.toml'
if not pyproject_path.exists():
raise ValueError(
"This utility can only be run within OpenLLM git repository. Clone it first with 'git clone https://github.com/bentoml/OpenLLM.git'")
raise ValueError("This utility can only be run within OpenLLM git repository. Clone it first with 'git clone https://github.com/bentoml/OpenLLM.git'")
if not registries:
tags: dict[str | LiteralContainerRegistry, str] = {
alias: f'{value}:{get_base_container_tag(version_strategy)}' for alias, value in _CONTAINER_REGISTRY.items()
@@ -171,8 +162,7 @@ def build_container(registries: LiteralContainerRegistry | t.Sequence[LiteralCon
quiet=machine)
if machine and outputs is not None: tags['image_sha'] = outputs.decode('utf-8').strip()
except Exception as err:
raise openllm.exceptions.OpenLLMException(
f'Failed to containerize base container images (Scroll up to see error above, or set OPENLLMDEVDEBUG=True for more traceback):\n{err}') from err
raise openllm.exceptions.OpenLLMException(f'Failed to containerize base container images (Scroll up to see error above, or set OPENLLMDEVDEBUG=True for more traceback):\n{err}') from err
return tags
if t.TYPE_CHECKING:

View File

@@ -43,35 +43,29 @@ _AnyCallable = t.Callable[..., t.Any]
FC = t.TypeVar('FC', bound=t.Union[_AnyCallable, click.Command])
def bento_complete_envvar(ctx: click.Context, param: click.Parameter, incomplete: str) -> list[sc.CompletionItem]:
return [
sc.CompletionItem(str(it.tag), help='Bento')
for it in bentoml.list()
if str(it.tag).startswith(incomplete) and all(k in it.info.labels for k in {'start_name', 'bundler'})
]
return [sc.CompletionItem(str(it.tag), help='Bento') for it in bentoml.list() if str(it.tag).startswith(incomplete) and all(k in it.info.labels for k in {'start_name', 'bundler'})]
def model_complete_envvar(ctx: click.Context, param: click.Parameter, incomplete: str) -> list[sc.CompletionItem]:
return [sc.CompletionItem(inflection.dasherize(it), help='Model') for it in openllm.CONFIG_MAPPING if it.startswith(incomplete)]
def parse_config_options(config: LLMConfig, server_timeout: int, workers_per_resource: float, device: t.Tuple[str, ...] | None, cors: bool,
environ: DictStrAny) -> DictStrAny:
def parse_config_options(config: LLMConfig, server_timeout: int, workers_per_resource: float, device: t.Tuple[str, ...] | None, cors: bool, environ: DictStrAny) -> DictStrAny:
# TODO: Support amd.com/gpu on k8s
_bentoml_config_options_env = environ.pop('BENTOML_CONFIG_OPTIONS', '')
_bentoml_config_options_opts = [
'tracing.sample_rate=1.0', f'api_server.traffic.timeout={server_timeout}',
'tracing.sample_rate=1.0',
f'api_server.traffic.timeout={server_timeout}',
f'runners."llm-{config["start_name"]}-runner".traffic.timeout={config["timeout"]}',
f'runners."llm-{config["start_name"]}-runner".workers_per_resource={workers_per_resource}'
]
if device:
if len(device) > 1:
_bentoml_config_options_opts.extend(
[f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"[{idx}]={dev}' for idx, dev in enumerate(device)])
_bentoml_config_options_opts.extend([f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"[{idx}]={dev}' for idx, dev in enumerate(device)])
else:
_bentoml_config_options_opts.append(f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"=[{device[0]}]')
_bentoml_config_options_opts.append(f'runners."llm-generic-embedding".resources.cpu={openllm.get_resource({"cpu":"system"},"cpu")}')
if cors:
_bentoml_config_options_opts.extend(['api_server.http.cors.enabled=true', 'api_server.http.cors.access_control_allow_origins="*"'])
_bentoml_config_options_opts.extend(
[f'api_server.http.cors.access_control_allow_methods[{idx}]="{it}"' for idx, it in enumerate(['GET', 'OPTIONS', 'POST', 'HEAD', 'PUT'])])
_bentoml_config_options_opts.extend([f'api_server.http.cors.access_control_allow_methods[{idx}]="{it}"' for idx, it in enumerate(['GET', 'OPTIONS', 'POST', 'HEAD', 'PUT'])])
_bentoml_config_options_env += ' ' if _bentoml_config_options_env else '' + ' '.join(_bentoml_config_options_opts)
environ['BENTOML_CONFIG_OPTIONS'] = _bentoml_config_options_env
if DEBUG: logger.debug('Setting BENTOML_CONFIG_OPTIONS=%s', _bentoml_config_options_env)
@@ -123,18 +117,27 @@ Available official model_id(s): [default: {llm_config['default_id']}]
if llm_config['requires_gpu'] and openllm.utils.device_count() < 1:
# NOTE: The model requires GPU, therefore we will return a dummy command
command_attrs.update({
'short_help': '(Disabled because there is no GPU available)',
'help': f'{model} is currently not available to run on your local machine because it requires GPU for inference.'
'short_help': '(Disabled because there is no GPU available)', 'help': f'{model} is currently not available to run on your local machine because it requires GPU for inference.'
})
return noop_command(group, llm_config, _serve_grpc, **command_attrs)
@group.command(**command_attrs)
@start_decorator(llm_config, serve_grpc=_serve_grpc)
@click.pass_context
def start_cmd(ctx: click.Context, /, server_timeout: int, model_id: str | None, model_version: str | None,
workers_per_resource: t.Literal['conserved', 'round_robin'] | LiteralString, device: t.Tuple[str, ...],
quantize: t.Literal['int8', 'int4', 'gptq'] | None, backend: LiteralBackend, serialisation_format: t.Literal['safetensors', 'legacy'],
cors: bool, adapter_id: str | None, return_process: bool, **attrs: t.Any,
def start_cmd(ctx: click.Context,
/,
server_timeout: int,
model_id: str | None,
model_version: str | None,
workers_per_resource: t.Literal['conserved', 'round_robin'] | LiteralString,
device: t.Tuple[str, ...],
quantize: t.Literal['int8', 'int4', 'gptq'] | None,
backend: LiteralBackend,
serialisation_format: t.Literal['safetensors', 'legacy'],
cors: bool,
adapter_id: str | None,
return_process: bool,
**attrs: t.Any,
) -> LLMConfig | subprocess.Popen[bytes]:
if serialisation_format == 'safetensors' and quantize is not None and openllm_core.utils.check_bool_env('OPENLLM_SERIALIZATION_WARNING'):
termui.echo(
@@ -202,8 +205,7 @@ Available official model_id(s): [default: {llm_config['default_id']}]
def next_step(model_name: str, adapter_map: DictStrAny | None) -> None:
cmd_name = f'openllm build {model_name}'
if adapter_map is not None:
cmd_name += ' ' + ' '.join(
[f'--adapter-id {s}' for s in [f'{p}:{name}' if name not in (None, 'default') else p for p, name in adapter_map.items()]])
cmd_name += ' ' + ' '.join([f'--adapter-id {s}' for s in [f'{p}:{name}' if name not in (None, 'default') else p for p, name in adapter_map.items()]])
if not openllm.utils.get_quiet_mode():
termui.echo(f"\n🚀 Next step: run '{cmd_name}' to create a Bento for {model_name}", fg='blue')
@@ -242,11 +244,15 @@ def noop_command(group: click.Group, llm_config: LLMConfig, _serve_grpc: bool, *
def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callable[[FC], t.Callable[[FC], FC]]:
def wrapper(fn: FC) -> t.Callable[[FC], FC]:
composed = openllm.utils.compose(
llm_config.to_click_options, _http_server_args if not serve_grpc else _grpc_server_args,
llm_config.to_click_options,
_http_server_args if not serve_grpc else _grpc_server_args,
cog.optgroup.group('General LLM Options', help=f"The following options are related to running '{llm_config['start_name']}' LLM Server."),
model_id_option(factory=cog.optgroup), model_version_option(factory=cog.optgroup),
model_id_option(factory=cog.optgroup),
model_version_option(factory=cog.optgroup),
cog.optgroup.option('--server-timeout', type=int, default=None, help='Server timeout in seconds'),
workers_per_resource_option(factory=cog.optgroup), cors_option(factory=cog.optgroup), backend_option(factory=cog.optgroup),
workers_per_resource_option(factory=cog.optgroup),
cors_option(factory=cog.optgroup),
backend_option(factory=cog.optgroup),
cog.optgroup.group('LLM Optimization Options',
help='''Optimization related options.
@@ -257,7 +263,9 @@ def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callab
- DeepSpeed Inference: [link](https://www.deepspeed.ai/inference/)
- GGML: Fast inference on [bare metal](https://github.com/ggerganov/ggml)
''',
), quantize_option(factory=cog.optgroup), serialisation_option(factory=cog.optgroup),
),
quantize_option(factory=cog.optgroup),
serialisation_option(factory=cog.optgroup),
cog.optgroup.option('--device',
type=openllm.utils.dantic.CUDA,
multiple=True,
@@ -375,32 +383,16 @@ def output_option(f: _AnyCallable | None = None, *, default_value: LiteralOutput
**attrs)(f)
def cors_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
return cli_option('--cors/--no-cors',
show_default=True,
default=False,
envvar='OPENLLM_CORS',
show_envvar=True,
help='Enable CORS for the server.',
**attrs)(f)
return cli_option('--cors/--no-cors', show_default=True, default=False, envvar='OPENLLM_CORS', show_envvar=True, help='Enable CORS for the server.', **attrs)(f)
def machine_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
return cli_option('--machine', is_flag=True, default=False, hidden=True, **attrs)(f)
def model_id_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
return cli_option('--model-id',
type=click.STRING,
default=None,
envvar='OPENLLM_MODEL_ID',
show_envvar=True,
help='Optional model_id name or path for (fine-tune) weight.',
**attrs)(f)
return cli_option('--model-id', type=click.STRING, default=None, envvar='OPENLLM_MODEL_ID', show_envvar=True, help='Optional model_id name or path for (fine-tune) weight.', **attrs)(f)
def model_version_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
return cli_option('--model-version',
type=click.STRING,
default=None,
help='Optional model version to save for this model. It will be inferred automatically from model-id.',
**attrs)(f)
return cli_option('--model-version', type=click.STRING, default=None, help='Optional model version to save for this model. It will be inferred automatically from model-id.', **attrs)(f)
def backend_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
# NOTE: LiteralBackend needs to exclude its last two items, as ggml and mlc are WIP
@@ -512,8 +504,7 @@ def workers_per_resource_callback(ctx: click.Context, param: click.Parameter, va
try:
float(value) # type: ignore[arg-type]
except ValueError:
raise click.BadParameter(f"'workers_per_resource' only accept '{_wpr_strategies}' as possible strategies, otherwise pass in float.", ctx,
param) from None
raise click.BadParameter(f"'workers_per_resource' only accept '{_wpr_strategies}' as possible strategies, otherwise pass in float.", ctx, param) from None
else:
return value
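
The option strings assembled in `parse_config_options` are space-joined into `BENTOML_CONFIG_OPTIONS`. A hedged sketch of the resulting environment, reusing the import already shown in `_package.py` above, with an illustrative timeout and worker count:

import os
import openllm
from openllm.cli._factory import parse_config_options

llm_config = openllm.AutoConfig.for_model('opt')
environ = parse_config_options(llm_config, 3600, 1.0, None, False, os.environ.copy())
# environ['BENTOML_CONFIG_OPTIONS'] now contains, roughly:
#   tracing.sample_rate=1.0 api_server.traffic.timeout=3600
#   runners."llm-opt-runner".traffic.timeout=<config timeout> runners."llm-opt-runner".workers_per_resource=1.0 ...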

View File

@@ -83,10 +83,7 @@ def _start(model_name: str,
from .entrypoint import start_command
from .entrypoint import start_grpc_command
llm_config = openllm.AutoConfig.for_model(model_name)
_ModelEnv = openllm_core.utils.EnvVarMixin(model_name,
backend=openllm_core.utils.first_not_none(backend, default=llm_config.default_backend()),
model_id=model_id,
quantize=quantize)
_ModelEnv = openllm_core.utils.EnvVarMixin(model_name, backend=openllm_core.utils.first_not_none(backend, default=llm_config.default_backend()), model_id=model_id, quantize=quantize)
os.environ[_ModelEnv.backend] = _ModelEnv['backend_value']
args: list[str] = []
@@ -102,9 +99,7 @@ def _start(model_name: str,
if additional_args: args.extend(additional_args)
if __test__: args.append('--return-process')
return start_command_factory(start_command if not _serve_grpc else start_grpc_command,
model_name,
_context_settings=termui.CONTEXT_SETTINGS,
return start_command_factory(start_command if not _serve_grpc else start_grpc_command, model_name, _context_settings=termui.CONTEXT_SETTINGS,
_serve_grpc=_serve_grpc).main(args=args if len(args) > 0 else None, standalone_mode=False)
@inject
@@ -199,9 +194,7 @@ def _build(model_name: str,
raise OpenLLMException(str(e)) from None
matched = re.match(r'__tag__:([^:\n]+:[^:\n]+)$', output.decode('utf-8').strip())
if matched is None:
raise ValueError(
f"Failed to find tag from output: {output.decode('utf-8').strip()}\nNote: Output from 'openllm build' might not be correct. Please open an issue on GitHub."
)
raise ValueError(f"Failed to find tag from output: {output.decode('utf-8').strip()}\nNote: Output from 'openllm build' might not be correct. Please open an issue on GitHub.")
return bentoml.get(matched.group(1), _bento_store=bento_store)
def _import_model(model_name: str,
@@ -256,6 +249,5 @@ def _list_models() -> dict[str, t.Any]:
return models_command.main(args=['-o', 'json', '--show-available', '--machine'], standalone_mode=False)
start, start_grpc, build, import_model, list_models = openllm_core.utils.codegen.gen_sdk(_start, _serve_grpc=False), openllm_core.utils.codegen.gen_sdk(
_start, _serve_grpc=True), openllm_core.utils.codegen.gen_sdk(_build), openllm_core.utils.codegen.gen_sdk(
_import_model), openllm_core.utils.codegen.gen_sdk(_list_models)
_start, _serve_grpc=True), openllm_core.utils.codegen.gen_sdk(_build), openllm_core.utils.codegen.gen_sdk(_import_model), openllm_core.utils.codegen.gen_sdk(_list_models)
__all__ = ['start', 'start_grpc', 'build', 'import_model', 'list_models']
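Example (not from this diff): the `gen_sdk` line above turns the private `_start`/`_build`/`_import_model`/`_list_models` helpers into the public SDK surface. A minimal sketch of how that surface is typically used, assuming the wrappers are re-exported from the package root and keep the helpers' signatures; the model name and id are placeholders.

import openllm

# `build` mirrors `_build` above and hands back the bentoml.Bento resolved via bentoml.get(tag)
bento = openllm.build('opt', model_id='facebook/opt-125m')
print(bento.tag)
# `list_models` mirrors `_list_models` and returns the JSON-able mapping of locally stored models
print(openllm.list_models())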

View File

@@ -28,14 +28,10 @@ if t.TYPE_CHECKING:
Note that we already release images on our CI to ECR and GHCR, so you don't need to build it yourself.
''')
@container_registry_option
@click.option('--version-strategy',
type=click.Choice(['release', 'latest', 'nightly']),
default='nightly',
help='Version strategy to use for tagging the image.')
@click.option('--version-strategy', type=click.Choice(['release', 'latest', 'nightly']), default='nightly', help='Version strategy to use for tagging the image.')
@click.option('--push/--no-push', help='Whether to push to remote repository', is_flag=True, default=False)
@machine_option
def cli(container_registry: tuple[LiteralContainerRegistry, ...] | None, version_strategy: LiteralContainerVersionStrategy, push: bool,
machine: bool) -> dict[str, str]:
def cli(container_registry: tuple[LiteralContainerRegistry, ...] | None, version_strategy: LiteralContainerVersionStrategy, push: bool, machine: bool) -> dict[str, str]:
mapping = openllm.bundle.build_container(container_registry, version_strategy, push, machine)
if machine: termui.echo(orjson.dumps(mapping, option=orjson.OPT_INDENT_2).decode(), fg='white')
return mapping
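Example (not from this diff): the command body above is a thin wrapper over `openllm.bundle.build_container`, called positionally with the registry tuple, version strategy, push flag, and machine flag. A sketch of invoking it directly under the same calling convention; the registry name and strategy values are assumptions.

import openllm

# build the base image(s) locally without pushing; returns a {registry: image_tag} mapping
mapping = openllm.bundle.build_container(('ghcr',), 'nightly', False, True)
print(mapping)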

View File

@@ -31,9 +31,7 @@ def cli(ctx: click.Context, bento: str, machine: bool, _bento_store: BentoStore
except bentoml.exceptions.NotFound:
ctx.fail(f'Bento {bento} not found. Make sure to call `openllm build` first.')
if 'bundler' not in bentomodel.info.labels or bentomodel.info.labels['bundler'] != 'openllm.bundle':
ctx.fail(
f"Bento is either too old or not built with OpenLLM. Make sure to use ``openllm build {bentomodel.info.labels['start_name']}`` for correctness."
)
ctx.fail(f"Bento is either too old or not built with OpenLLM. Make sure to use ``openllm build {bentomodel.info.labels['start_name']}`` for correctness.")
if machine: return bentomodel.path
# copy and paste this into a new shell
if psutil.WINDOWS: subprocess.check_call([shutil.which('dir') or 'dir'], cwd=bentomodel.path)

View File

@@ -41,11 +41,6 @@ def cli(ctx: click.Context, bento: str, _bento_store: BentoStore = Provide[Bento
# for the reconstruction of the Dockerfile.
if 'dockerfile_template' in docker_attrs and docker_attrs['dockerfile_template'] is not None:
docker_attrs['dockerfile_template'] = 'env/docker/Dockerfile.template'
doc = generate_containerfile(docker=DockerOptions(**docker_attrs),
build_ctx=bentomodel.path,
conda=options.conda,
bento_fs=bentomodel._fs,
enable_buildkit=True,
add_header=True)
doc = generate_containerfile(docker=DockerOptions(**docker_attrs), build_ctx=bentomodel.path, conda=options.conda, bento_fs=bentomodel._fs, enable_buildkit=True, add_header=True)
termui.echo(doc, fg='white')
return bentomodel.path

View File

@@ -18,9 +18,7 @@ from openllm_core._prompt import process_prompt
LiteralOutput = t.Literal['json', 'pretty', 'porcelain']
@click.command('get_prompt', context_settings=termui.CONTEXT_SETTINGS)
@click.argument('model_name',
type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING.keys()]),
shell_complete=model_complete_envvar)
@click.argument('model_name', type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING.keys()]), shell_complete=model_complete_envvar)
@click.argument('prompt', type=click.STRING)
@output_option
@click.option('--format', type=click.STRING, default=None)
@@ -32,8 +30,7 @@ LiteralOutput = t.Literal['json', 'pretty', 'porcelain']
callback=opt_callback,
metavar='ARG=VALUE[,ARG=VALUE]')
@click.pass_context
def cli(ctx: click.Context, /, model_name: str, prompt: str, format: str | None, output: LiteralOutput, machine: bool, _memoized: dict[str, t.Any],
**_: t.Any) -> str | None:
def cli(ctx: click.Context, /, model_name: str, prompt: str, format: str | None, output: LiteralOutput, machine: bool, _memoized: dict[str, t.Any], **_: t.Any) -> str | None:
'''Get the default prompt used by OpenLLM.'''
module = openllm.utils.EnvVarMixin(model_name).module
_memoized = {k: v[0] for k, v in _memoized.items() if v}

View File

@@ -22,17 +22,10 @@ def cli(ctx: click.Context, output: LiteralOutput) -> None:
'tag': str(b.tag),
'size': human_readable_size(openllm.utils.calc_dir_size(b.path)),
'models': [{
'tag': str(m.tag),
'size': human_readable_size(openllm.utils.calc_dir_size(m.path))
}
for m in (bentoml.models.get(_.tag)
for _ in b.info.models)]
}
for b in tuple(i
for i in bentoml.list()
if all(k in i.info.labels
for k in {'start_name', 'bundler'}))
if b.info.labels['start_name'] == k] for k in tuple(inflection.dasherize(key) for key in openllm.CONFIG_MAPPING.keys())
'tag': str(m.tag), 'size': human_readable_size(openllm.utils.calc_dir_size(m.path))
} for m in (bentoml.models.get(_.tag) for _ in b.info.models)]
} for b in tuple(i for i in bentoml.list() if all(
k in i.info.labels for k in {'start_name', 'bundler'})) if b.info.labels['start_name'] == k] for k in tuple(inflection.dasherize(key) for key in openllm.CONFIG_MAPPING.keys())
}
mapping = {k: v for k, v in mapping.items() if v}
if output == 'pretty':

View File

@@ -25,30 +25,17 @@ def cli(model_name: str | None, output: LiteralOutput) -> DictStrAny:
'''This is equivalent to openllm models --show-available less the nice table.'''
models = tuple(inflection.dasherize(key) for key in openllm.CONFIG_MAPPING.keys())
ids_in_local_store = {
k: [
i for i in bentoml.models.list() if 'framework' in i.info.labels and i.info.labels['framework'] == 'openllm' and
'model_name' in i.info.labels and i.info.labels['model_name'] == k
] for k in models
k: [i for i in bentoml.models.list() if 'framework' in i.info.labels and i.info.labels['framework'] == 'openllm' and 'model_name' in i.info.labels and i.info.labels['model_name'] == k]
for k in models
}
if model_name is not None:
ids_in_local_store = {
k: [i for i in v if 'model_name' in i.info.labels and i.info.labels['model_name'] == inflection.dasherize(model_name)]
for k, v in ids_in_local_store.items()
}
ids_in_local_store = {k: [i for i in v if 'model_name' in i.info.labels and i.info.labels['model_name'] == inflection.dasherize(model_name)] for k, v in ids_in_local_store.items()}
ids_in_local_store = {k: v for k, v in ids_in_local_store.items() if v}
local_models = {
k: [{
'tag': str(i.tag),
'size': human_readable_size(openllm.utils.calc_dir_size(i.path))
} for i in val] for k, val in ids_in_local_store.items()
}
local_models = {k: [{'tag': str(i.tag), 'size': human_readable_size(openllm.utils.calc_dir_size(i.path))} for i in val] for k, val in ids_in_local_store.items()}
if output == 'pretty':
import tabulate
tabulate.PRESERVE_WHITESPACE = True
termui.echo(tabulate.tabulate([(k, i['tag'], i['size']) for k, v in local_models.items() for i in v],
tablefmt='fancy_grid',
headers=['LLM', 'Tag', 'Size']),
fg='white')
termui.echo(tabulate.tabulate([(k, i['tag'], i['size']) for k, v in local_models.items() for i in v], tablefmt='fancy_grid', headers=['LLM', 'Tag', 'Size']), fg='white')
else:
termui.echo(orjson.dumps(local_models, option=orjson.OPT_INDENT_2).decode(), fg='white')
return local_models

View File

@@ -153,13 +153,11 @@ class _LazyAutoMapping(OrderedDict, ReprMixin):
def keys(self) -> ConfigModelKeysView:
return t.cast('ConfigModelKeysView',
[self._load_attr_from_module(key, name) for key, name in self._config_mapping.items() if key in self._model_mapping.keys()] +
list(self._extra_content.keys()))
[self._load_attr_from_module(key, name) for key, name in self._config_mapping.items() if key in self._model_mapping.keys()] + list(self._extra_content.keys()))
def values(self) -> ConfigModelValuesView:
return t.cast('ConfigModelValuesView',
[self._load_attr_from_module(key, name) for key, name in self._model_mapping.items() if key in self._config_mapping.keys()] +
list(self._extra_content.values()))
[self._load_attr_from_module(key, name) for key, name in self._model_mapping.items() if key in self._config_mapping.keys()] + list(self._extra_content.values()))
def items(self) -> ConfigModelItemsView:
return t.cast('ConfigModelItemsView',

View File

@@ -7,9 +7,8 @@ from openllm_core.config import CONFIG_MAPPING_NAMES
from .factory import BaseAutoLLMClass
from .factory import _LazyAutoMapping
MODEL_MAPPING_NAMES = OrderedDict([('chatglm', 'ChatGLM'), ('dolly_v2', 'DollyV2'), ('falcon', 'Falcon'), ('flan_t5', 'FlanT5'),
('gpt_neox', 'GPTNeoX'), ('llama', 'Llama'), ('mpt', 'MPT'), ('opt', 'OPT'), ('stablelm', 'StableLM'),
('starcoder', 'StarCoder'), ('baichuan', 'Baichuan')])
MODEL_MAPPING_NAMES = OrderedDict([('chatglm', 'ChatGLM'), ('dolly_v2', 'DollyV2'), ('falcon', 'Falcon'), ('flan_t5', 'FlanT5'), ('gpt_neox', 'GPTNeoX'), ('llama', 'Llama'), ('mpt', 'MPT'),
('opt', 'OPT'), ('stablelm', 'StableLM'), ('starcoder', 'StarCoder'), ('baichuan', 'Baichuan')])
MODEL_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_MAPPING_NAMES)
class AutoLLM(BaseAutoLLMClass):

View File

@@ -7,9 +7,8 @@ from openllm_core.config import CONFIG_MAPPING_NAMES
from .factory import BaseAutoLLMClass
from .factory import _LazyAutoMapping
MODEL_VLLM_MAPPING_NAMES = OrderedDict([('baichuan', 'VLLMBaichuan'), ('dolly_v2', 'VLLMDollyV2'), ('falcon', 'VLLMFalcon'),
('gpt_neox', 'VLLMGPTNeoX'), ('mpt', 'VLLMMPT'), ('opt', 'VLLMOPT'), ('stablelm', 'VLLMStableLM'),
('starcoder', 'VLLMStarCoder'), ('llama', 'VLLMLlama')])
MODEL_VLLM_MAPPING_NAMES = OrderedDict([('baichuan', 'VLLMBaichuan'), ('dolly_v2', 'VLLMDollyV2'), ('falcon', 'VLLMFalcon'), ('gpt_neox', 'VLLMGPTNeoX'), ('mpt', 'VLLMMPT'),
('opt', 'VLLMOPT'), ('stablelm', 'VLLMStableLM'), ('starcoder', 'VLLMStarCoder'), ('llama', 'VLLMLlama')])
MODEL_VLLM_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_VLLM_MAPPING_NAMES)
class AutoVLLM(BaseAutoLLMClass):

View File

@@ -12,36 +12,24 @@ from openllm_core.config.configuration_dolly_v2 import get_special_token_id
if t.TYPE_CHECKING: import torch, transformers, tensorflow as tf
else:
torch, transformers, tf = openllm.utils.LazyLoader('torch', globals(), 'torch'), openllm.utils.LazyLoader('transformers', globals(),
'transformers'), openllm.utils.LazyLoader(
'tf', globals(), 'tensorflow')
'transformers'), openllm.utils.LazyLoader('tf', globals(), 'tensorflow')
logger = logging.getLogger(__name__)
@overload
def get_pipeline(model: transformers.PreTrainedModel,
tokenizer: transformers.PreTrainedTokenizer,
_init: t.Literal[True] = True,
**attrs: t.Any) -> transformers.Pipeline:
def get_pipeline(model: transformers.PreTrainedModel, tokenizer: transformers.PreTrainedTokenizer, _init: t.Literal[True] = True, **attrs: t.Any) -> transformers.Pipeline:
...
@overload
def get_pipeline(model: transformers.PreTrainedModel,
tokenizer: transformers.PreTrainedTokenizer,
_init: t.Literal[False] = ...,
**attrs: t.Any) -> type[transformers.Pipeline]:
def get_pipeline(model: transformers.PreTrainedModel, tokenizer: transformers.PreTrainedTokenizer, _init: t.Literal[False] = ..., **attrs: t.Any) -> type[transformers.Pipeline]:
...
def get_pipeline(model: transformers.PreTrainedModel,
tokenizer: transformers.PreTrainedTokenizer,
_init: bool = False,
**attrs: t.Any) -> type[transformers.Pipeline] | transformers.Pipeline:
def get_pipeline(model: transformers.PreTrainedModel, tokenizer: transformers.PreTrainedTokenizer, _init: bool = False, **attrs: t.Any) -> type[transformers.Pipeline] | transformers.Pipeline:
# Lazy loading the pipeline. See databricks' implementation on HuggingFace for more information.
class InstructionTextGenerationPipeline(transformers.Pipeline):
def __init__(self, *args: t.Any, do_sample: bool = True, max_new_tokens: int = 256, top_p: float = 0.92, top_k: int = 0, **kwargs: t.Any):
super().__init__(*args, model=model, tokenizer=tokenizer, do_sample=do_sample, max_new_tokens=max_new_tokens, top_p=top_p, top_k=top_k, **kwargs)
def _sanitize_parameters(self,
return_full_text: bool | None = None,
**generate_kwargs: t.Any) -> tuple[dict[str, t.Any], dict[str, t.Any], dict[str, t.Any]]:
def _sanitize_parameters(self, return_full_text: bool | None = None, **generate_kwargs: t.Any) -> tuple[dict[str, t.Any], dict[str, t.Any], dict[str, t.Any]]:
if t.TYPE_CHECKING: assert self.tokenizer is not None
preprocess_params: dict[str, t.Any] = {}
# newer versions of the tokenizer configure the response key as a special token. newer versions still may
@@ -87,11 +75,7 @@ def get_pipeline(model: transformers.PreTrainedModel,
instruction_text = input_tensors.pop('instruction_text')
return {'generated_sequence': generated_sequence, 'input_ids': input_ids, 'instruction_text': instruction_text}
def postprocess(self,
model_outputs: dict[str, t.Any],
response_key_token_id: int,
end_key_token_id: int,
return_full_text: bool = False) -> list[dict[t.Literal['generated_text'], str]]:
def postprocess(self, model_outputs: dict[str, t.Any], response_key_token_id: int, end_key_token_id: int, return_full_text: bool = False) -> list[dict[t.Literal['generated_text'], str]]:
if t.TYPE_CHECKING: assert self.tokenizer is not None
_generated_sequence, instruction_text = model_outputs['generated_sequence'][0], model_outputs['instruction_text']
generated_sequence: list[list[int]] = _generated_sequence.numpy().tolist()
@@ -149,10 +133,7 @@ class DollyV2(openllm.LLM['transformers.Pipeline', 'transformers.PreTrainedToken
return {'device_map': 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None, 'torch_dtype': torch.bfloat16}, {}
def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.Pipeline:
return get_pipeline(transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, *args, **attrs),
self.tokenizer,
_init=True,
return_full_text=self.config.return_full_text)
return get_pipeline(transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, *args, **attrs), self.tokenizer, _init=True, return_full_text=self.config.return_full_text)
def generate(self, prompt: str, **attrs: t.Any) -> list[dict[t.Literal['generated_text'], str]]:
llm_config = self.config.model_construct_env(**attrs)

View File

@@ -18,17 +18,14 @@ class Falcon(openllm.LLM['transformers.PreTrainedModel', 'transformers.PreTraine
with torch.inference_mode(), torch.autocast('cuda', dtype=torch.float16): # type: ignore[attr-defined]
return self.tokenizer.batch_decode(self.model.generate(input_ids=inputs['input_ids'],
attention_mask=inputs['attention_mask'],
generation_config=self.config.model_construct_env(eos_token_id=eos_token_id,
**attrs).to_generation_config()),
generation_config=self.config.model_construct_env(eos_token_id=eos_token_id, **attrs).to_generation_config()),
skip_special_tokens=True)
def generate_one(self, prompt: str, stop: list[str], **preprocess_generate_kwds: t.Any) -> list[dict[t.Literal['generated_text'], str]]:
max_new_tokens, encoded_inputs = preprocess_generate_kwds.pop('max_new_tokens', 200), self.tokenizer(prompt, return_tensors='pt').to(self.device)
src_len, stopping_criteria = encoded_inputs['input_ids'].shape[1], preprocess_generate_kwds.pop('stopping_criteria',
openllm.StoppingCriteriaList([]))
src_len, stopping_criteria = encoded_inputs['input_ids'].shape[1], preprocess_generate_kwds.pop('stopping_criteria', openllm.StoppingCriteriaList([]))
stopping_criteria.append(openllm.StopSequenceCriteria(stop, self.tokenizer))
result = self.tokenizer.decode(
self.model.generate(encoded_inputs['input_ids'], max_new_tokens=max_new_tokens, stopping_criteria=stopping_criteria)[0].tolist()[src_len:])
result = self.tokenizer.decode(self.model.generate(encoded_inputs['input_ids'], max_new_tokens=max_new_tokens, stopping_criteria=stopping_criteria)[0].tolist()[src_len:])
# Inference API returns the stop sequence
for stop_seq in stop:
if result.endswith(stop_seq): result = result[:-len(stop_seq)]

View File

@@ -23,16 +23,13 @@ class Llama(openllm.LLM['transformers.LlamaForCausalLM', 'transformers.LlamaToke
mask = attention_mask.unsqueeze(-1).expand(data.size()).float()
masked_embeddings = data * mask
sum_embeddings, seq_length = torch.sum(masked_embeddings, dim=1), torch.sum(mask, dim=1)
return openllm.EmbeddingsOutput(embeddings=F.normalize(sum_embeddings / seq_length, p=2, dim=1).tolist(),
num_tokens=int(torch.sum(attention_mask).item()))
return openllm.EmbeddingsOutput(embeddings=F.normalize(sum_embeddings / seq_length, p=2, dim=1).tolist(), num_tokens=int(torch.sum(attention_mask).item()))
def generate_one(self, prompt: str, stop: list[str], **preprocess_generate_kwds: t.Any) -> list[dict[t.Literal['generated_text'], str]]:
max_new_tokens, encoded_inputs = preprocess_generate_kwds.pop('max_new_tokens', 200), self.tokenizer(prompt, return_tensors='pt').to(self.device)
src_len, stopping_criteria = encoded_inputs['input_ids'].shape[1], preprocess_generate_kwds.pop('stopping_criteria',
openllm.StoppingCriteriaList([]))
src_len, stopping_criteria = encoded_inputs['input_ids'].shape[1], preprocess_generate_kwds.pop('stopping_criteria', openllm.StoppingCriteriaList([]))
stopping_criteria.append(openllm.StopSequenceCriteria(stop, self.tokenizer))
result = self.tokenizer.decode(
self.model.generate(encoded_inputs['input_ids'], max_new_tokens=max_new_tokens, stopping_criteria=stopping_criteria)[0].tolist()[src_len:])
result = self.tokenizer.decode(self.model.generate(encoded_inputs['input_ids'], max_new_tokens=max_new_tokens, stopping_criteria=stopping_criteria)[0].tolist()[src_len:])
# Inference API returns the stop sequence
for stop_seq in stop:
if result.endswith(stop_seq): result = result[:-len(stop_seq)]

View File

@@ -36,10 +36,7 @@ class MPT(openllm.LLM['transformers.PreTrainedModel', 'transformers.GPTNeoXToken
@property
def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]:
import torch
return {
'device_map': 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None,
'torch_dtype': torch.bfloat16 if torch.cuda.is_available() else torch.float32
}, {}
return {'device_map': 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None, 'torch_dtype': torch.bfloat16 if torch.cuda.is_available() else torch.float32}, {}
def import_model(self, *args: t.Any, trust_remote_code: bool = True, **attrs: t.Any) -> bentoml.Model:
import torch
@@ -51,12 +48,7 @@ class MPT(openllm.LLM['transformers.PreTrainedModel', 'transformers.GPTNeoXToken
config = get_mpt_config(self.model_id, self.config.max_sequence_length, self.device, device_map=device_map, trust_remote_code=trust_remote_code)
tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_id, **tokenizer_attrs)
if tokenizer.pad_token_id is None: tokenizer.pad_token = tokenizer.eos_token
model = transformers.AutoModelForCausalLM.from_pretrained(self.model_id,
config=config,
torch_dtype=torch_dtype,
trust_remote_code=trust_remote_code,
device_map=device_map,
**attrs)
model = transformers.AutoModelForCausalLM.from_pretrained(self.model_id, config=config, torch_dtype=torch_dtype, trust_remote_code=trust_remote_code, device_map=device_map, **attrs)
try:
return bentoml.transformers.save_model(self.tag, model, custom_objects={'tokenizer': tokenizer}, labels=generate_labels(self))
finally:
@@ -67,12 +59,7 @@ class MPT(openllm.LLM['transformers.PreTrainedModel', 'transformers.GPTNeoXToken
torch_dtype = attrs.pop('torch_dtype', torch.bfloat16 if torch.cuda.is_available() else torch.float32)
device_map = attrs.pop('device_map', None)
trust_remote_code = attrs.pop('trust_remote_code', True)
config = get_mpt_config(self._bentomodel.path,
self.config.max_sequence_length,
self.device,
device_map=device_map,
trust_remote_code=trust_remote_code,
)
config = get_mpt_config(self._bentomodel.path, self.config.max_sequence_length, self.device, device_map=device_map, trust_remote_code=trust_remote_code,)
model = transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path,
config=config,
trust_remote_code=trust_remote_code,

View File

@@ -16,8 +16,7 @@ class FlaxOPT(openllm.LLM['transformers.TFOPTForCausalLM', 'transformers.GPT2Tok
__openllm_internal__ = True
def import_model(self, *args: t.Any, trust_remote_code: bool = False, **attrs: t.Any) -> bentoml.Model:
config, tokenizer = transformers.AutoConfig.from_pretrained(self.model_id), transformers.AutoTokenizer.from_pretrained(
self.model_id, **self.llm_parameters[-1])
config, tokenizer = transformers.AutoConfig.from_pretrained(self.model_id), transformers.AutoTokenizer.from_pretrained(self.model_id, **self.llm_parameters[-1])
tokenizer.pad_token_id = config.pad_token_id
return bentoml.transformers.save_model(self.tag,
transformers.FlaxAutoModelForCausalLM.from_pretrained(self.model_id, **attrs),
@@ -34,11 +33,7 @@ class FlaxOPT(openllm.LLM['transformers.TFOPTForCausalLM', 'transformers.GPT2Tok
use_default_prompt_template: bool = False,
**attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {
'max_new_tokens': max_new_tokens,
'temperature': temperature,
'top_k': top_k,
'num_return_sequences': num_return_sequences,
'repetition_penalty': repetition_penalty
'max_new_tokens': max_new_tokens, 'temperature': temperature, 'top_k': top_k, 'num_return_sequences': num_return_sequences, 'repetition_penalty': repetition_penalty
}, {}
def generate(self, prompt: str, **attrs: t.Any) -> list[str]:

View File

@@ -11,8 +11,7 @@ class TFOPT(openllm.LLM['transformers.TFOPTForCausalLM', 'transformers.GPT2Token
def import_model(self, *args: t.Any, trust_remote_code: bool = False, **attrs: t.Any) -> bentoml.Model:
import transformers
config, tokenizer = transformers.AutoConfig.from_pretrained(self.model_id), transformers.AutoTokenizer.from_pretrained(
self.model_id, **self.llm_parameters[-1])
config, tokenizer = transformers.AutoConfig.from_pretrained(self.model_id), transformers.AutoTokenizer.from_pretrained(self.model_id, **self.llm_parameters[-1])
tokenizer.pad_token_id = config.pad_token_id
return bentoml.transformers.save_model(self.tag,
transformers.TFOPTForCausalLM.from_pretrained(self.model_id, trust_remote_code=trust_remote_code, **attrs),

View File

@@ -19,8 +19,5 @@ class VLLMOPT(openllm.LLM['vllm.LLMEngine', 'transformers.GPT2Tokenizer']):
use_default_prompt_template: bool = True,
**attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {
'max_new_tokens': max_new_tokens,
'temperature': temperature,
'top_k': top_k,
'num_return_sequences': num_return_sequences
'max_new_tokens': max_new_tokens, 'temperature': temperature, 'top_k': top_k, 'num_return_sequences': num_return_sequences
}, {}

View File

@@ -18,10 +18,7 @@ class StarCoder(openllm.LLM['transformers.GPTBigCodeForCausalLM', 'transformers.
@property
def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]:
import torch
return {
'device_map': 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None,
'torch_dtype': torch.float16 if torch.cuda.is_available() else torch.float32
}, {}
return {'device_map': 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None, 'torch_dtype': torch.float16 if torch.cuda.is_available() else torch.float32}, {}
def import_model(self, *args: t.Any, trust_remote_code: bool = False, **attrs: t.Any) -> bentoml.Model:
import torch
@@ -50,11 +47,9 @@ class StarCoder(openllm.LLM['transformers.GPTBigCodeForCausalLM', 'transformers.
def generate_one(self, prompt: str, stop: list[str], **preprocess_generate_kwds: t.Any) -> list[dict[t.Literal['generated_text'], str]]:
max_new_tokens, encoded_inputs = preprocess_generate_kwds.pop('max_new_tokens', 200), self.tokenizer(prompt, return_tensors='pt').to(self.device)
src_len, stopping_criteria = encoded_inputs['input_ids'].shape[1], preprocess_generate_kwds.pop('stopping_criteria',
openllm.StoppingCriteriaList([]))
src_len, stopping_criteria = encoded_inputs['input_ids'].shape[1], preprocess_generate_kwds.pop('stopping_criteria', openllm.StoppingCriteriaList([]))
stopping_criteria.append(openllm.StopSequenceCriteria(stop, self.tokenizer))
result = self.tokenizer.decode(
self.model.generate(encoded_inputs['input_ids'], max_new_tokens=max_new_tokens, stopping_criteria=stopping_criteria)[0].tolist()[src_len:])
result = self.tokenizer.decode(self.model.generate(encoded_inputs['input_ids'], max_new_tokens=max_new_tokens, stopping_criteria=stopping_criteria)[0].tolist()[src_len:])
# Inference API returns the stop sequence
for stop_seq in stop:
if result.endswith(stop_seq): result = result[:-len(stop_seq)]

View File

@@ -56,18 +56,13 @@ if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
else:
model_args, training_args = t.cast(t.Tuple[ModelArguments, TrainingArguments], parser.parse_args_into_dataclasses())
model, tokenizer = openllm.AutoLLM.for_model("falcon",
model_id=model_args.model_id,
quantize="int4",
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.float16,
ensure_available=True).prepare_for_training(
adapter_type="lora",
lora_alpha=16,
lora_dropout=0.1,
r=16,
bias="none",
target_modules=["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"])
model, tokenizer = openllm.AutoLLM.for_model("falcon", model_id=model_args.model_id, quantize="int4", bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.float16,
ensure_available=True).prepare_for_training(adapter_type="lora",
lora_alpha=16,
lora_dropout=0.1,
r=16,
bias="none",
target_modules=["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"])
model.config.use_cache = False
tokenizer.pad_token = tokenizer.eos_token

View File

@@ -98,8 +98,7 @@ def prepare_datasets(tokenizer, dataset_name=DATASET_NAME):
print("Sample from dolly-v2 ds:", dataset[randint(0, len(dataset))]["text"])
# tokenize and chunk dataset
lm_dataset = dataset.map(lambda sample: tokenizer(sample["text"]), batched=True,
remove_columns=list(dataset.features)).map(partial(chunk, chunk_length=2048), batched=True)
lm_dataset = dataset.map(lambda sample: tokenizer(sample["text"]), batched=True, remove_columns=list(dataset.features)).map(partial(chunk, chunk_length=2048), batched=True)
# Print total number of samples
print(f"Total number of samples: {len(lm_dataset)}")
@@ -180,15 +179,11 @@ def train_loop(model_args: ModelArguments, training_args: TrainingArguments):
transformers.set_seed(model_args.seed)
model, tokenizer = prepare_for_int4_training(model_args.model_id,
gradient_checkpointing=training_args.gradient_checkpointing,
bf16=training_args.bf16,
)
model, tokenizer = prepare_for_int4_training(model_args.model_id, gradient_checkpointing=training_args.gradient_checkpointing, bf16=training_args.bf16,)
datasets = prepare_datasets(tokenizer)
trainer = transformers.Trainer(model=model,
args=dataclasses.replace(transformers.TrainingArguments(training_args.output_dir),
**dataclasses.asdict(training_args)),
args=dataclasses.replace(transformers.TrainingArguments(training_args.output_dir), **dataclasses.asdict(training_args)),
train_dataset=datasets,
data_collator=transformers.default_data_collator,
)

View File

@@ -56,13 +56,12 @@ if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
else:
model_args, training_args = t.cast(t.Tuple[ModelArguments, TrainingArguments], parser.parse_args_into_dataclasses())
model, tokenizer = openllm.AutoLLM.for_model("opt", model_id=model_args.model_id, quantize="int8",
ensure_available=True).prepare_for_training(adapter_type="lora",
r=16,
lora_alpha=32,
target_modules=["q_proj", "v_proj"],
lora_dropout=0.05,
bias="none")
model, tokenizer = openllm.AutoLLM.for_model("opt", model_id=model_args.model_id, quantize="int8", ensure_available=True).prepare_for_training(adapter_type="lora",
r=16,
lora_alpha=32,
target_modules=["q_proj", "v_proj"],
lora_dropout=0.05,
bias="none")
# ft on english_quotes
data = load_dataset("Abirate/english_quotes")

View File

@@ -43,13 +43,10 @@ def load_tokenizer(llm: openllm.LLM[t.Any, T], **tokenizer_attrs: t.Any) -> T:
try:
tokenizer = cloudpickle.load(t.cast('t.IO[bytes]', cofile))['tokenizer']
except KeyError:
raise openllm.exceptions.OpenLLMException(
"Bento model does not have tokenizer. Make sure to save the tokenizer within the model via 'custom_objects'. "
"For example: \"bentoml.transformers.save_model(..., custom_objects={'tokenizer': tokenizer})\"") from None
raise openllm.exceptions.OpenLLMException("Bento model does not have tokenizer. Make sure to save the tokenizer within the model via 'custom_objects'. "
"For example: \"bentoml.transformers.save_model(..., custom_objects={'tokenizer': tokenizer})\"") from None
else:
tokenizer = infer_tokenizers_from_llm(llm).from_pretrained(bentomodel_fs.getsyspath('/'),
trust_remote_code=llm.trust_remote_code,
**tokenizer_attrs)
tokenizer = infer_tokenizers_from_llm(llm).from_pretrained(bentomodel_fs.getsyspath('/'), trust_remote_code=llm.trust_remote_code, **tokenizer_attrs)
if tokenizer.pad_token_id is None:
if config.pad_token_id is not None: tokenizer.pad_token_id = config.pad_token_id
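Example (not from this diff): the exception text above already names the fix; this is a minimal sketch of saving a tokenizer next to its model through `custom_objects` so `load_tokenizer` can recover it later. The model id is a placeholder.

import bentoml
import transformers

model = transformers.AutoModelForCausalLM.from_pretrained('facebook/opt-125m')
tokenizer = transformers.AutoTokenizer.from_pretrained('facebook/opt-125m')
# keep the tokenizer inside the saved entry so it travels with the model in the store
bentoml.transformers.save_model('opt-125m', model, custom_objects={'tokenizer': tokenizer})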

View File

@@ -6,6 +6,4 @@ FRAMEWORK_TO_AUTOCLASS_MAPPING = {
'flax': ('FlaxAutoModelForCausalLM', 'FlaxAutoModelForSeq2SeqLM'),
'vllm': ('AutoModelForCausalLM', 'AutoModelForSeq2SeqLM')
}
HUB_ATTRS = [
'cache_dir', 'code_revision', 'force_download', 'local_files_only', 'proxies', 'resume_download', 'revision', 'subfolder', 'use_auth_token'
]
HUB_ATTRS = ['cache_dir', 'code_revision', 'force_download', 'local_files_only', 'proxies', 'resume_download', 'revision', 'subfolder', 'use_auth_token']

View File

@@ -43,11 +43,7 @@ logger = logging.getLogger(__name__)
__all__ = ['import_model', 'get', 'load_model']
@inject
def import_model(llm: openllm.LLM[M, T],
*decls: t.Any,
trust_remote_code: bool,
_model_store: ModelStore = Provide[BentoMLContainer.model_store],
**attrs: t.Any) -> bentoml.Model:
def import_model(llm: openllm.LLM[M, T], *decls: t.Any, trust_remote_code: bool, _model_store: ModelStore = Provide[BentoMLContainer.model_store], **attrs: t.Any) -> bentoml.Model:
"""Auto detect model type from given model_id and import it to bentoml's model store.
For all kwargs, it will be parsed into `transformers.AutoConfig.from_pretrained` first,
@@ -76,8 +72,7 @@ def import_model(llm: openllm.LLM[M, T],
if quantize_method == 'gptq':
if not openllm.utils.is_autogptq_available():
raise openllm.exceptions.OpenLLMException(
"GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'")
raise openllm.exceptions.OpenLLMException("GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'")
if llm.config['model_type'] != 'causal_lm':
raise openllm.exceptions.OpenLLMException(f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})")
signatures['generate'] = {'batchable': False}
@@ -107,8 +102,7 @@ def import_model(llm: openllm.LLM[M, T],
tokenizer.save_pretrained(bentomodel.path)
if quantize_method == 'gptq':
if not openllm.utils.is_autogptq_available():
raise openllm.exceptions.OpenLLMException(
"GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'")
raise openllm.exceptions.OpenLLMException("GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'")
if llm.config['model_type'] != 'causal_lm':
raise openllm.exceptions.OpenLLMException(f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})")
logger.debug('Saving model with GPTQ quantisation will require loading model into memory.')
@@ -124,20 +118,13 @@ def import_model(llm: openllm.LLM[M, T],
else:
architectures = getattr(config, 'architectures', [])
if not architectures:
raise RuntimeError(
'Failed to determine the architecture for this model. Make sure the `config.json` is valid and can be loaded with `transformers.AutoConfig`'
)
raise RuntimeError('Failed to determine the architecture for this model. Make sure the `config.json` is valid and can be loaded with `transformers.AutoConfig`')
architecture = architectures[0]
update_model(bentomodel, metadata={'_pretrained_class': architecture})
if llm._local:
# possible local path
logger.debug('Model will be loaded into memory to save to target store as it is from local path.')
model = infer_autoclass_from_llm(llm, config).from_pretrained(llm.model_id,
*decls,
config=config,
trust_remote_code=trust_remote_code,
**hub_attrs,
**attrs)
model = infer_autoclass_from_llm(llm, config).from_pretrained(llm.model_id, *decls, config=config, trust_remote_code=trust_remote_code, **hub_attrs, **attrs)
# for trust_remote_code to work
bentomodel.enter_cloudpickle_context([importlib.import_module(model.__module__)], imported_modules)
model.save_pretrained(bentomodel.path, max_shard_size='5GB', safe_serialization=safe_serialisation)
@@ -149,8 +136,7 @@ def import_model(llm: openllm.LLM[M, T],
else:
bentomodel.flush() # type: ignore[no-untyped-call]
bentomodel.save(_model_store)
openllm.utils.analytics.track(
openllm.utils.analytics.ModelSaveEvent(module=bentomodel.info.module, model_size_in_kb=openllm.utils.calc_dir_size(bentomodel.path) / 1024))
openllm.utils.analytics.track(openllm.utils.analytics.ModelSaveEvent(module=bentomodel.info.module, model_size_in_kb=openllm.utils.calc_dir_size(bentomodel.path) / 1024))
finally:
bentomodel.exit_cloudpickle_context(imported_modules)
# NOTE: We need to free up the cache after importing the model
@@ -171,8 +157,7 @@ def get(llm: openllm.LLM[M, T], auto_import: bool = False) -> bentoml.Model:
if Version(model.info.api_version) < Version('v2'):
raise openllm.exceptions.OpenLLMException('Please run "openllm prune -y --include-bentos" and upgrade all saved model to latest release.')
if model.info.labels['backend'] != llm.__llm_backend__:
raise openllm.exceptions.OpenLLMException(
f"Model {model.tag} was saved with backend {model.info.labels['backend']}, while loading with {llm.__llm_backend__}.")
raise openllm.exceptions.OpenLLMException(f"Model {model.tag} was saved with backend {model.info.labels['backend']}, while loading with {llm.__llm_backend__}.")
return model
except Exception as err:
if auto_import: return import_model(llm, trust_remote_code=llm.trust_remote_code)
@@ -185,8 +170,7 @@ def load_model(llm: openllm.LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M:
default=llm._serialisation_format == 'safetensors')
if '_quantize' in llm._bentomodel.info.metadata and llm._bentomodel.info.metadata['_quantize'] == 'gptq':
if not openllm.utils.is_autogptq_available():
raise openllm.exceptions.OpenLLMException(
"GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'")
raise openllm.exceptions.OpenLLMException("GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'")
if llm.config['model_type'] != 'causal_lm':
raise openllm.exceptions.OpenLLMException(f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})")
return autogptq.AutoGPTQForCausalLM.from_quantized(llm._bentomodel.path,

View File

@@ -23,8 +23,7 @@ if t.TYPE_CHECKING:
from openllm_core._typing_compat import M
from openllm_core._typing_compat import T
else:
transformers, torch = openllm_core.utils.LazyLoader('transformers', globals(),
'transformers'), openllm_core.utils.LazyLoader('torch', globals(), 'torch')
transformers, torch = openllm_core.utils.LazyLoader('transformers', globals(), 'transformers'), openllm_core.utils.LazyLoader('torch', globals(), 'torch')
_object_setattr = object.__setattr__
@@ -45,11 +44,7 @@ def process_config(model_id: str, trust_remote_code: bool, **attrs: t.Any) -> tu
if not isinstance(config, transformers.PretrainedConfig):
copied_attrs = copy.deepcopy(attrs)
if copied_attrs.get('torch_dtype', None) == 'auto': copied_attrs.pop('torch_dtype')
config, attrs = transformers.AutoConfig.from_pretrained(model_id,
return_unused_kwargs=True,
trust_remote_code=trust_remote_code,
**hub_attrs,
**copied_attrs)
config, attrs = transformers.AutoConfig.from_pretrained(model_id, return_unused_kwargs=True, trust_remote_code=trust_remote_code, **hub_attrs, **copied_attrs)
return config, hub_attrs, attrs
def infer_tokenizers_from_llm(__llm: openllm.LLM[t.Any, T], /) -> T:
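Example (not from this diff): `process_config` above relies on `return_unused_kwargs=True` so that kwargs the config does not recognise can be forwarded to the model loader. A rough illustration of that transformers behaviour; the model id and the extra kwarg are placeholders.

import transformers

config, unused = transformers.AutoConfig.from_pretrained('facebook/opt-125m', return_unused_kwargs=True, low_cpu_mem_usage=True)
# `config` is the resolved config object; anything the config did not consume
# (here `low_cpu_mem_usage`) comes back in `unused` for a later from_pretrained call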
@@ -62,9 +57,7 @@ def infer_autoclass_from_llm(llm: openllm.LLM[M, T], config: transformers.Pretra
if llm.config['trust_remote_code']:
autoclass = 'AutoModelForSeq2SeqLM' if llm.config['model_type'] == 'seq2seq_lm' else 'AutoModelForCausalLM'
if not hasattr(config, 'auto_map'):
raise ValueError(
          f'Invalid configuration for {llm.model_id}. ``trust_remote_code=True`` requires `transformers.PretrainedConfig` to contain a `auto_map` mapping'
)
      raise ValueError(f'Invalid configuration for {llm.model_id}. ``trust_remote_code=True`` requires `transformers.PretrainedConfig` to contain a `auto_map` mapping')
# in case this model doesn't use the correct auto class for model type, for example like chatglm
# where it uses AutoModel instead of AutoModelForCausalLM. Then we fallback to AutoModel
if autoclass not in config.auto_map: autoclass = 'AutoModel'
@@ -84,7 +77,8 @@ def update_model(bentomodel: bentoml.Model, metadata: DictStrAny) -> bentoml.Mod
based: DictStrAny = copy.deepcopy(bentomodel.info.metadata)
based.update(metadata)
_object_setattr(
bentomodel, '_info',
bentomodel,
'_info',
ModelInfo( # type: ignore[call-arg] # XXX: remove me once upstream is merged
tag=bentomodel.info.tag,
module=bentomodel.info.module,
@@ -102,9 +96,7 @@ def make_model_signatures(llm: openllm.LLM[M, T]) -> ModelSignaturesType:
infer_fn: tuple[str, ...] = ('__call__',)
default_config = ModelSignature(batchable=False)
if llm.__llm_backend__ in {'pt', 'vllm'}:
infer_fn += ('forward', 'generate', 'contrastive_search', 'greedy_search', 'sample', 'beam_search', 'beam_sample', 'group_beam_search',
'constrained_beam_search',
)
infer_fn += ('forward', 'generate', 'contrastive_search', 'greedy_search', 'sample', 'beam_search', 'beam_sample', 'group_beam_search', 'constrained_beam_search',)
elif llm.__llm_backend__ == 'tf':
infer_fn += ('predict', 'call', 'generate', 'compute_transition_scores', 'greedy_search', 'sample', 'beam_search', 'contrastive_search',)
else:

View File

@@ -15,10 +15,7 @@ if t.TYPE_CHECKING:
logger = logging.getLogger(__name__)
@contextlib.contextmanager
def build_bento(model: str,
model_id: str | None = None,
quantize: t.Literal['int4', 'int8', 'gptq'] | None = None,
cleanup: bool = False) -> t.Iterator[bentoml.Bento]:
def build_bento(model: str, model_id: str | None = None, quantize: t.Literal['int4', 'int8', 'gptq'] | None = None, cleanup: bool = False) -> t.Iterator[bentoml.Bento]:
logger.info('Building BentoML for %s', model)
bento = openllm.build(model, model_id=model_id, quantize=quantize)
yield bento

View File

@@ -31,20 +31,12 @@ def test_missing_default():
with pytest.raises(ValueError, match='Missing required fields *'):
make_llm_config('MissingModelId', {'default_id': 'huggingface/t5-tiny-testing', 'requirements': ['bentoml']})
with pytest.raises(ValueError, match='Missing required fields *'):
make_llm_config('MissingArchitecture', {
'default_id': 'huggingface/t5-tiny-testing',
'model_ids': ['huggingface/t5-tiny-testing'],
'requirements': ['bentoml'],
},
)
make_llm_config('MissingArchitecture', {'default_id': 'huggingface/t5-tiny-testing', 'model_ids': ['huggingface/t5-tiny-testing'], 'requirements': ['bentoml'],},)
def test_forbidden_access():
cl_ = make_llm_config(
'ForbiddenAccess', {
'default_id': 'huggingface/t5-tiny-testing',
'model_ids': ['huggingface/t5-tiny-testing', 'bentoml/t5-tiny-testing'],
'architecture': 'PreTrainedModel',
'requirements': ['bentoml'],
'default_id': 'huggingface/t5-tiny-testing', 'model_ids': ['huggingface/t5-tiny-testing', 'bentoml/t5-tiny-testing'], 'architecture': 'PreTrainedModel', 'requirements': ['bentoml'],
},
)
@@ -77,9 +69,7 @@ def test_config_derived_follow_attrs_protocol(gen_settings: ModelSettings):
cl_ = make_llm_config('AttrsProtocolLLM', gen_settings)
assert attr.has(cl_)
@given(model_settings(), st.integers(max_value=283473), st.floats(min_value=0.0, max_value=1.0), st.integers(max_value=283473),
st.floats(min_value=0.0, max_value=1.0),
)
@given(model_settings(), st.integers(max_value=283473), st.floats(min_value=0.0, max_value=1.0), st.integers(max_value=283473), st.floats(min_value=0.0, max_value=1.0),)
def test_complex_struct_dump(gen_settings: ModelSettings, field1: int, temperature: float, input_field1: int, input_temperature: float):
cl_ = make_llm_config('ComplexLLM', gen_settings, fields=(('field1', 'float', field1),), generation_fields=(('temperature', temperature),),)
sent = cl_()
@@ -138,9 +128,7 @@ def test_struct_envvar_with_overwrite_provided_env(monkeypatch: pytest.MonkeyPat
mk.setenv(field_env_key('field1'), str(4.0))
mk.setenv(field_env_key('temperature', suffix='generation'), str(0.2))
sent = make_llm_config('OverwriteWithEnvAvailable', {
'default_id': 'asdfasdf',
'model_ids': ['asdf', 'asdfasdfads'],
'architecture': 'PreTrainedModel'
'default_id': 'asdfasdf', 'model_ids': ['asdf', 'asdfasdfads'], 'architecture': 'PreTrainedModel'
},
fields=(('field1', 'float', 3.0),),
).model_construct_env(field1=20.0, temperature=0.4)

View File

@@ -73,8 +73,7 @@ class ResponseComparator(JSONSnapshotExtension):
return s == t
def eq_output(s: openllm.GenerationOutput, t: openllm.GenerationOutput) -> bool:
return (len(s.responses) == len(t.responses) and all([_s == _t for _s, _t in zip(s.responses, t.responses)]) and
eq_config(s.marshaled_config, t.marshaled_config))
return (len(s.responses) == len(t.responses) and all([_s == _t for _s, _t in zip(s.responses, t.responses)]) and eq_config(s.marshaled_config, t.marshaled_config))
return len(serialized_data) == len(snapshot_data) and all([eq_output(s, t) for s, t in zip(serialized_data, snapshot_data)])
@@ -210,8 +209,7 @@ def _container_handle(model: str,
detach=True,
device_requests=devs,
ports={
'3000/tcp': port,
'3001/tcp': prom_port
'3000/tcp': port, '3001/tcp': prom_port
},
)

View File

@@ -49,8 +49,7 @@ def test_general_build_from_local(tmp_path_factory: pytest.TempPathFactory):
@pytest.fixture()
def dockerfile_template(tmp_path_factory: pytest.TempPathFactory):
file = tmp_path_factory.mktemp('dockerfiles') / 'Dockerfile.template'
file.write_text(
"{% extends bento_base_template %}\n{% block SETUP_BENTO_ENTRYPOINT %}\n{{ super() }}\nRUN echo 'sanity from custom dockerfile'\n{% endblock %}")
file.write_text("{% extends bento_base_template %}\n{% block SETUP_BENTO_ENTRYPOINT %}\n{{ super() }}\nRUN echo 'sanity from custom dockerfile'\n{% endblock %}")
return file
@pytest.mark.usefixtures('dockerfile_template')

View File

@@ -261,7 +261,7 @@ ignore_patterns = [
based_on_style = "google"
INDENT_WIDTH = 2
JOIN_MULTIPLE_LINES = true
COLUMN_LIMIT = 152
COLUMN_LIMIT = 192
USE_TABS = false
BLANK_LINES_AROUND_TOP_LEVEL_DEFINITION = 1
BLANK_LINES_BETWEEN_TOP_LEVEL_IMPORTS_AND_VARIABLES = 1
@@ -282,6 +282,7 @@ SPACES_AROUND_TUPLE_DELIMITERS = false
SPACE_BETWEEN_ENDING_COMMA_AND_CLOSING_BRACKET = false
SPACE_INSIDE_BRACKETS = false
SPLIT_ALL_COMMA_SEPARATED_VALUES = false
SPLIT_ALL_TOP_LEVEL_COMMA_SEPARATED_VALUES = true
SPLIT_BEFORE_DOT = true
[tool.pytest.ini_options]
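Example (not from this diff): the pyproject change above raises yapf's COLUMN_LIMIT from 152 to 192 and only splits top-level comma-separated values, which is what collapses the multi-line calls throughout this commit into single lines. A sketch of applying the same style programmatically, assuming yapf's `FormatCode` helper and an inline style string that mirrors the settings above; the file path is a placeholder.

from yapf.yapflib.yapf_api import FormatCode

source = open('openllm/cli/entrypoint.py').read()  # hypothetical target file
formatted, changed = FormatCode(source, style_config='{based_on_style: google, column_limit: 192, split_all_top_level_comma_separated_values: true}')
if changed:
  open('openllm/cli/entrypoint.py', 'w').write(formatted)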

View File

@@ -28,10 +28,7 @@ class Classifier:
@staticmethod
def status() -> dict[int, str]:
return {
v: status for v, status in zip(range(
1, 8), ['1 - Planning', '2 - Pre-Alpha', '3 - Alpha', '4 - Beta', '5 - Production/Stable', '6 - Mature', '7 - Inactive'])
}
return {v: status for v, status in zip(range(1, 8), ['1 - Planning', '2 - Pre-Alpha', '3 - Alpha', '4 - Beta', '5 - Production/Stable', '6 - Mature', '7 - Inactive'])}
@staticmethod
def apache() -> str:
@@ -137,9 +134,7 @@ GPTQ_DEPS = ['auto-gptq[triton]']
VLLM_DEPS = ['vllm>=0.1.4', 'ray']
_base_requirements: dict[str, t.Any] = {
inflection.dasherize(name): config_cls.__openllm_requirements__
for name, config_cls in openllm.CONFIG_MAPPING.items()
if config_cls.__openllm_requirements__
inflection.dasherize(name): config_cls.__openllm_requirements__ for name, config_cls in openllm.CONFIG_MAPPING.items() if config_cls.__openllm_requirements__
}
# shallow copy from locals()
@@ -171,7 +166,8 @@ def create_classifiers() -> Array:
Classifier.create_classifier('audience', 'Developers'),
Classifier.create_classifier('audience', 'Science/Research'),
Classifier.create_classifier('audience', 'System Administrators'),
Classifier.create_classifier('typing', 'Typed'), *Classifier.create_python_classifier(),
Classifier.create_classifier('typing', 'Typed'),
*Classifier.create_python_classifier(),
])
return arr.multiline(True)
@@ -218,8 +214,23 @@ def authors() -> Array:
def keywords() -> Array:
arr = correct_style(tomlkit.array())
arr.extend([
'MLOps', 'AI', 'BentoML', 'Model Serving', 'Model Deployment', 'LLMOps', 'Falcon', 'Vicuna', 'Llama 2', 'Fine tuning', 'Serverless',
'Large Language Model', 'Generative AI', 'StableLM', 'Alpaca', 'PyTorch', 'Transformers'
'MLOps',
'AI',
'BentoML',
'Model Serving',
'Model Deployment',
'LLMOps',
'Falcon',
'Vicuna',
'Llama 2',
'Fine tuning',
'Serverless',
'Large Language Model',
'Generative AI',
'StableLM',
'Alpaca',
'PyTorch',
'Transformers'
])
return arr.multiline(True)

View File

@@ -16,9 +16,7 @@ _OWNER = 'bentoml'
_REPO = 'openllm'
_gz_strategies: dict[t.Literal['macos_arm', 'macos_intel', 'linux_intel'], str] = {
'macos_arm': 'aarch64-apple-darwin',
'macos_intel': 'x86_64-apple-darwin',
'linux_intel': 'x86_64-unknown-linux-musl'
'macos_arm': 'aarch64-apple-darwin', 'macos_intel': 'x86_64-apple-darwin', 'linux_intel': 'x86_64-unknown-linux-musl'
}
def determine_release_url(svn_url: str, tag: str, target: t.Literal['macos_arm', 'macos_intel', 'linux_intel', 'archive']) -> str:
@@ -34,9 +32,7 @@ def main() -> int:
_info = api.repos.get()
release_tag = api.repos.get_latest_release().name
shadict: dict[str, t.Any] = {
k: get_release_hash_command(determine_release_url(_info.svn_url, release_tag, k), release_tag)().strip() for k in _gz_strategies
}
shadict: dict[str, t.Any] = {k: get_release_hash_command(determine_release_url(_info.svn_url, release_tag, k), release_tag)().strip() for k in _gz_strategies}
shadict['archive'] = get_release_hash_command(determine_release_url(_info.svn_url, release_tag, 'archive'), release_tag)().strip()
ENVIRONMENT = Environment(extensions=['jinja2.ext.do', 'jinja2.ext.loopcontrols', 'jinja2.ext.debug'],

View File

@@ -91,28 +91,23 @@ def main() -> int:
# NOTE: inline stubs for _ConfigAttr type stubs
config_attr_lines: list[str] = []
for keys, ForwardRef in codegen.get_annotations(ModelSettings).items():
config_attr_lines.extend([
' ' * 4 + line for line in [
f'__openllm_{keys}__: {_transformed.get(keys, process_annotations(ForwardRef.__forward_arg__))} = Field(None)\n',
f"'''{_value_docstring[keys]}'''\n",
]
])
config_attr_lines.extend(
[' ' * 4 + line for line in [f'__openllm_{keys}__: {_transformed.get(keys, process_annotations(ForwardRef.__forward_arg__))} = Field(None)\n', f"'''{_value_docstring[keys]}'''\n",]])
# NOTE: inline runtime __getitem__ overload process
lines: list[str] = []
lines.append(' ' * 2 + '# NOTE: ModelSettings arguments\n')
for keys, ForwardRef in codegen.get_annotations(ModelSettings).items():
lines.extend([
' ' * 2 + line for line in [
'@overload\n',
f"def __getitem__(self, item: t.Literal['{keys}']) -> {_transformed.get(keys, process_annotations(ForwardRef.__forward_arg__))}: ...\n",
]
])
lines.extend(
[' ' * 2 + line for line in ['@overload\n', f"def __getitem__(self, item: t.Literal['{keys}']) -> {_transformed.get(keys, process_annotations(ForwardRef.__forward_arg__))}: ...\n",]])
# special case variables: generation_class, extras, sampling_class
lines.append(' ' * 2 + '# NOTE: generation_class, sampling_class and extras arguments\n')
lines.extend([
' ' * 2 + line for line in [
'@overload\n', "def __getitem__(self, item: t.Literal['generation_class']) -> t.Type[openllm_core.GenerationConfig]: ...\n", '@overload\n',
"def __getitem__(self, item: t.Literal['sampling_class']) -> t.Type[openllm_core.SamplingParams]: ...\n", '@overload\n',
'@overload\n',
"def __getitem__(self, item: t.Literal['generation_class']) -> t.Type[openllm_core.GenerationConfig]: ...\n",
'@overload\n',
"def __getitem__(self, item: t.Literal['sampling_class']) -> t.Type[openllm_core.SamplingParams]: ...\n",
'@overload\n',
"def __getitem__(self, item: t.Literal['extras']) -> t.Dict[str, t.Any]: ...\n",
]
])
@@ -128,11 +123,9 @@ def main() -> int:
for keys in PeftType._member_names_:
lines.extend([' ' * 2 + line for line in ['@overload\n', f"def __getitem__(self, item: t.Literal['{keys.lower()}']) -> dict[str, t.Any]: ...\n",]])
processed = processed[:start_attrs_idx] + [' ' * 4 + START_ATTRS_COMMENT, *special_attrs_lines, ' ' * 4 + END_ATTRS_COMMENT
] + processed[end_attrs_idx + 1:start_stub_idx] + [
' ' * 4 + START_SPECIAL_COMMENT, *config_attr_lines, ' ' * 4 + END_SPECIAL_COMMENT
] + processed[end_stub_idx + 1:start_idx] + [' ' * 2 + START_COMMENT, *lines, ' ' * 2 + END_COMMENT
] + processed[end_idx + 1:]
processed = processed[:start_attrs_idx] + [' ' * 4 + START_ATTRS_COMMENT, *special_attrs_lines, ' ' * 4 + END_ATTRS_COMMENT] + processed[end_attrs_idx + 1:start_stub_idx] + [
' ' * 4 + START_SPECIAL_COMMENT, *config_attr_lines, ' ' * 4 + END_SPECIAL_COMMENT
] + processed[end_stub_idx + 1:start_idx] + [' ' * 2 + START_COMMENT, *lines, ' ' * 2 + END_COMMENT] + processed[end_idx + 1:]
with _TARGET_FILE.open('w') as f:
f.writelines(processed)
return 0

View File

@@ -13,9 +13,7 @@ from openllm import CONFIG_MAPPING
if t.TYPE_CHECKING: from collections import OrderedDict
config_requirements = {
k: [_.replace('-', '_') for _ in v.__openllm_requirements__] if v.__openllm_requirements__ else None for k, v in CONFIG_MAPPING.items()
}
config_requirements = {k: [_.replace('-', '_') for _ in v.__openllm_requirements__] if v.__openllm_requirements__ else None for k, v in CONFIG_MAPPING.items()}
_dependencies: dict[LiteralBackend, str] = {k: v for k, v in zip(LiteralBackend.__args__[:-2], ('torch', 'tensorflow', 'flax', 'vllm'))}
_auto: dict[str, str] = {k: v for k, v in zip(LiteralBackend.__args__[:-2], ('AutoLLM', 'AutoTFLLM', 'AutoFlaxLLM', 'AutoVLLM'))}
@@ -30,23 +28,24 @@ def get_mapping(backend: LiteralBackend) -> OrderedDict[t.Any, t.Any]:
def make_class_stub(model_name: str, backend: LiteralBackend, indentation: int = 2, auto: bool = False) -> list[str]:
_dep_list: list[str] = [
f'"{v}"' for v in [
_dependencies[backend], *(
t.cast(t.List[str], config_requirements[model_name]) if model_name != '__default__' and config_requirements[model_name] else [])
]
f'"{v}"' for v in [_dependencies[backend], *(t.cast(t.List[str], config_requirements[model_name]) if model_name != '__default__' and config_requirements[model_name] else [])]
]
if auto: cl_ = _auto[backend]
else: cl_ = get_mapping(backend)[model_name]
lines = [
f'class {cl_}(metaclass=_DummyMetaclass):', ' ' * indentation + f"_backends=[{','.join(_dep_list)}]",
f'class {cl_}(metaclass=_DummyMetaclass):',
' ' * indentation + f"_backends=[{','.join(_dep_list)}]",
' ' * indentation + f"def __init__(self,*param_decls:_t.Any,**attrs: _t.Any):_require_backends(self,[{','.join(_dep_list)}])"
]
return lines
def write_stub(backend: LiteralBackend, _path: str) -> list[str]:
base = [
f'# This file is generated by {_path}. DO NOT EDIT MANUALLY!', f'# To update this, run ./{_path}', 'from __future__ import annotations',
'import typing as _t', 'from openllm_core.utils import DummyMetaclass as _DummyMetaclass, require_backends as _require_backends',
f'# This file is generated by {_path}. DO NOT EDIT MANUALLY!',
f'# To update this, run ./{_path}',
'from __future__ import annotations',
'import typing as _t',
'from openllm_core.utils import DummyMetaclass as _DummyMetaclass, require_backends as _require_backends',
]
base.extend([v for it in [make_class_stub(k, backend) for k in get_mapping(backend)] for v in it])
# autoclass

View File

@@ -11,9 +11,10 @@ def create_module_import() -> str:
def create_stubs_import() -> list[str]:
return [
'if t.TYPE_CHECKING:from . import ' +
','.join([f'{p.name} as {p.name}' for p in sorted(_TARGET_FILE.parent.glob('*/')) if p.name not in {'__pycache__', '__init__.py', '.DS_Store'}]),
'__lazy=LazyModule(__name__, globals()["__file__"], {k: [] for k in _MODELS})', '__all__=__lazy.__all__', '__dir__=__lazy.__dir__',
'if t.TYPE_CHECKING:from . import ' + ','.join([f'{p.name} as {p.name}' for p in sorted(_TARGET_FILE.parent.glob('*/')) if p.name not in {'__pycache__', '__init__.py', '.DS_Store'}]),
'__lazy=LazyModule(__name__, globals()["__file__"], {k: [] for k in _MODELS})',
'__all__=__lazy.__all__',
'__dir__=__lazy.__dir__',
'__getattr__=__lazy.__getattr__\n'
]
@@ -21,9 +22,13 @@ def main() -> int:
_path = os.path.join(os.path.basename(os.path.dirname(__file__)), os.path.basename(__file__))
with _TARGET_FILE.open('w') as f:
f.writelines('\n'.join([
f'# This file is generated by {_path}. DO NOT EDIT MANUALLY!', f'# To update this, run ./{_path}', 'from __future__ import annotations',
'import typing as t', 'from openllm_core.utils import LazyModule',
create_module_import(), *create_stubs_import(),
f'# This file is generated by {_path}. DO NOT EDIT MANUALLY!',
f'# To update this, run ./{_path}',
'from __future__ import annotations',
'import typing as t',
'from openllm_core.utils import LazyModule',
create_module_import(),
*create_stubs_import(),
]))
return 0

View File

@@ -18,11 +18,7 @@ def main() -> int:
start_index, stop_index = readme.index(START_COMMENT), readme.index(END_COMMENT)
formatted: dict[t.Literal['Model', 'Architecture', 'URL', 'Installation', 'Model Ids'], list[str | list[str]]] = {
'Model': [],
'Architecture': [],
'URL': [],
'Model Ids': [],
'Installation': [],
'Model': [], 'Architecture': [], 'URL': [], 'Model Ids': [], 'Installation': [],
}
max_install_len_div = 0
for name, config_cls in openllm.CONFIG_MAPPING.items():