diff --git a/openllm-client/src/openllm_client/_base.py b/openllm-client/src/openllm_client/_base.py index 0f19974a..ae453ec2 100644 --- a/openllm-client/src/openllm_client/_base.py +++ b/openllm-client/src/openllm_client/_base.py @@ -95,8 +95,7 @@ class _ClientAttr: if not self.supports_hf_agent: raise RuntimeError(f'{self.model_name} ({self.backend}) does not support running HF agent.') if not is_transformers_supports_agent(): - raise RuntimeError( - "Current 'transformers' does not support Agent. Make sure to upgrade to at least 4.29: 'pip install -U \"transformers>=4.29\"'") + raise RuntimeError("Current 'transformers' does not support Agent. Make sure to upgrade to at least 4.29: 'pip install -U \"transformers>=4.29\"'") import transformers return transformers.HfAgent(urljoin(self._address, '/hf/agent')) @@ -230,15 +229,7 @@ class _AsyncClient(_ClientAttr): stop = ['Task:'] prompt = t.cast(str, self._hf_agent.format_prompt(task)) async with httpx.AsyncClient(timeout=httpx.Timeout(self.timeout)) as client: - response = await client.post(self._hf_agent.url_endpoint, - json={ - 'inputs': prompt, - 'parameters': { - 'max_new_tokens': 200, - 'return_full_text': False, - 'stop': stop - } - }) + response = await client.post(self._hf_agent.url_endpoint, json={'inputs': prompt, 'parameters': {'max_new_tokens': 200, 'return_full_text': False, 'stop': stop}}) if response.status_code != HTTPStatus.OK: raise ValueError(f'Error {response.status_code}: {response.json()}') result = response.json()[0]['generated_text'] @@ -279,12 +270,8 @@ class BaseClient(_Client): logger.warning("'return_attrs' is now deprecated. Please use 'return_response=\"attrs\"' instead.") if return_attrs is True: return_response = 'attrs' use_default_prompt_template = attrs.pop('use_default_prompt_template', False) - prompt, generate_kwargs, postprocess_kwargs = self.config.sanitize_parameters(prompt, - use_default_prompt_template=use_default_prompt_template, - **attrs) - r = openllm_core.GenerationOutput( - **self.call('generate', - openllm_core.GenerationInput(prompt=prompt, llm_config=self.config.model_construct_env(**generate_kwargs)).model_dump())) + prompt, generate_kwargs, postprocess_kwargs = self.config.sanitize_parameters(prompt, use_default_prompt_template=use_default_prompt_template, **attrs) + r = openllm_core.GenerationOutput(**self.call('generate', openllm_core.GenerationInput(prompt=prompt, llm_config=self.config.model_construct_env(**generate_kwargs)).model_dump())) if return_response == 'attrs': return r elif return_response == 'raw': return bentoml_cattr.unstructure(r) else: return self.config.postprocess_generate(prompt, r.responses, **postprocess_kwargs) @@ -309,12 +296,8 @@ class BaseAsyncClient(_AsyncClient): logger.warning("'return_attrs' is now deprecated. 
Please use 'return_response=\"attrs\"' instead.") if return_attrs is True: return_response = 'attrs' use_default_prompt_template = attrs.pop('use_default_prompt_template', False) - prompt, generate_kwargs, postprocess_kwargs = self.config.sanitize_parameters(prompt, - use_default_prompt_template=use_default_prompt_template, - **attrs) - r = openllm_core.GenerationOutput( - **(await self.call('generate', - openllm_core.GenerationInput(prompt=prompt, llm_config=self.config.model_construct_env(**generate_kwargs)).model_dump()))) + prompt, generate_kwargs, postprocess_kwargs = self.config.sanitize_parameters(prompt, use_default_prompt_template=use_default_prompt_template, **attrs) + r = openllm_core.GenerationOutput(**(await self.call('generate', openllm_core.GenerationInput(prompt=prompt, llm_config=self.config.model_construct_env(**generate_kwargs)).model_dump()))) if return_response == 'attrs': return r elif return_response == 'raw': return bentoml_cattr.unstructure(r) else: return self.config.postprocess_generate(prompt, r.responses, **postprocess_kwargs) diff --git a/openllm-client/src/openllm_client/benmin/_grpc.py b/openllm-client/src/openllm_client/benmin/_grpc.py index 4525143b..dc62d154 100644 --- a/openllm-client/src/openllm_client/benmin/_grpc.py +++ b/openllm-client/src/openllm_client/benmin/_grpc.py @@ -154,12 +154,10 @@ class GrpcClient(Client): try: reflection.apis[api.name] = InferenceAPI[t.Any](None, bentoml.io.from_spec({ - 'id': api.input.descriptor_id, - 'args': json_format.MessageToDict(api.input.attributes).get('args', None) + 'id': api.input.descriptor_id, 'args': json_format.MessageToDict(api.input.attributes).get('args', None) }), bentoml.io.from_spec({ - 'id': api.output.descriptor_id, - 'args': json_format.MessageToDict(api.output.attributes).get('args', None) + 'id': api.output.descriptor_id, 'args': json_format.MessageToDict(api.output.attributes).get('args', None) }), name=api.name, doc=api.docs) @@ -207,11 +205,7 @@ class AsyncGrpcClient(AsyncClient): if self.ssl: if self.ssl_client_credentials is None: raise RuntimeError("'ssl=True' requires 'ssl_client_credentials'") credentials = grpc.ssl_channel_credentials(**{k: load_from_file(v) if isinstance(v, str) else v for k, v in self.ssl_client_credentials.items()}) - return aio.secure_channel(self.server_url, - credentials=credentials, - options=self.options, - compression=self.compression, - interceptors=self.interceptors) + return aio.secure_channel(self.server_url, credentials=credentials, options=self.options, compression=self.compression, interceptors=self.interceptors) return aio.insecure_channel(self.server_url, options=self.options, compression=self.compression, interceptors=self.interceptors) @staticmethod @@ -262,12 +256,10 @@ class AsyncGrpcClient(AsyncClient): try: reflection.apis[api.name] = InferenceAPI[t.Any](None, bentoml.io.from_spec({ - 'id': api.input.descriptor_id, - 'args': json_format.MessageToDict(api.input.attributes).get('args', None) + 'id': api.input.descriptor_id, 'args': json_format.MessageToDict(api.input.attributes).get('args', None) }), bentoml.io.from_spec({ - 'id': api.output.descriptor_id, - 'args': json_format.MessageToDict(api.output.attributes).get('args', None) + 'id': api.output.descriptor_id, 'args': json_format.MessageToDict(api.output.attributes).get('args', None) }), name=api.name, doc=api.docs) diff --git a/openllm-client/src/openllm_client/benmin/_http.py b/openllm-client/src/openllm_client/benmin/_http.py index af8d6cb1..356b0a95 100644 --- 
a/openllm-client/src/openllm_client/benmin/_http.py +++ b/openllm-client/src/openllm_client/benmin/_http.py @@ -73,13 +73,12 @@ class HttpClient(Client): if 'x-bentoml-name' not in meth_spec: raise ValueError(f'Malformed BentoML spec received from BentoML server {url}') try: - reflection.apis[meth_spec['x-bentoml-name']] = InferenceAPI[t.Any]( - None, - bentoml.io.from_spec(meth_spec['requestBody']['x-bentoml-io-descriptor']), - bentoml.io.from_spec(meth_spec['responses']['200']['x-bentoml-io-descriptor']), - name=meth_spec['x-bentoml-name'], - doc=meth_spec['description'], - route=route.lstrip('/')) + reflection.apis[meth_spec['x-bentoml-name']] = InferenceAPI[t.Any](None, + bentoml.io.from_spec(meth_spec['requestBody']['x-bentoml-io-descriptor']), + bentoml.io.from_spec(meth_spec['responses']['200']['x-bentoml-io-descriptor']), + name=meth_spec['x-bentoml-name'], + doc=meth_spec['description'], + route=route.lstrip('/')) except Exception as e: logger.error('Failed to instantiate client for API %s: ', meth_spec['x-bentoml-name'], e) return cls(url, reflection) @@ -160,13 +159,12 @@ class AsyncHttpClient(AsyncClient): if 'x-bentoml-name' not in meth_spec: raise ValueError(f'Malformed BentoML spec received from BentoML server {url}') try: - reflection.apis[meth_spec['x-bentoml-name']] = InferenceAPI[t.Any]( - None, - bentoml.io.from_spec(meth_spec['requestBody']['x-bentoml-io-descriptor']), - bentoml.io.from_spec(meth_spec['responses']['200']['x-bentoml-io-descriptor']), - name=meth_spec['x-bentoml-name'], - doc=meth_spec['description'], - route=route.lstrip('/')) + reflection.apis[meth_spec['x-bentoml-name']] = InferenceAPI[t.Any](None, + bentoml.io.from_spec(meth_spec['requestBody']['x-bentoml-io-descriptor']), + bentoml.io.from_spec(meth_spec['responses']['200']['x-bentoml-io-descriptor']), + name=meth_spec['x-bentoml-name'], + doc=meth_spec['description'], + route=route.lstrip('/')) except ValueError as e: logger.error('Failed to instantiate client for API %s: ', meth_spec['x-bentoml-name'], e) return cls(url, reflection) diff --git a/openllm-core/src/openllm_core/_configuration.py b/openllm-core/src/openllm_core/_configuration.py index 62a558da..ea2b59b3 100644 --- a/openllm-core/src/openllm_core/_configuration.py +++ b/openllm-core/src/openllm_core/_configuration.py @@ -176,36 +176,26 @@ class FineTuneConfig: if t.TYPE_CHECKING and not MYPY: # The following type stubs makes __init__ aware of attrs internal type converter. @overload - def __init__(self, - adapter_type: AdapterType = ..., - adapter_config: dict[str, t.Any] = ..., - inference_mode: bool = ..., - llm_config_class: type[LLMConfig] = ...) -> None: + def __init__(self, adapter_type: AdapterType = ..., adapter_config: dict[str, t.Any] = ..., inference_mode: bool = ..., llm_config_class: type[LLMConfig] = ...) -> None: ... @overload - def __init__(self, - adapter_type: PeftType = ..., - adapter_config: dict[str, t.Any] = ..., - inference_mode: bool = ..., - llm_config_class: type[LLMConfig] = ...) -> None: + def __init__(self, adapter_type: PeftType = ..., adapter_config: dict[str, t.Any] = ..., inference_mode: bool = ..., llm_config_class: type[LLMConfig] = ...) -> None: ... # The below should be generated via attrs. Only here to conform with pyright strict checking. def __init__(self, *args: t.Any, **kwargs: t.Any) -> None: ... - adapter_type: PeftType = dantic.Field( - 'lora', - description=f"The type of adapter to use for fine-tuning. 
Available supported methods: {PeftType.supported()}, default to 'lora'", - use_default_converter=False, - converter=_adapter_converter) - adapter_config: t.Dict[str, - t.Any] = dantic.Field(None, - description='The configuration for the adapter. The content of the dict depends on the adapter type.', - validator=attr.validators.optional(attr.validators.instance_of(dict)), - converter=attr.converters.default_if_none(factory=dict), - use_default_converter=False) + adapter_type: PeftType = dantic.Field('lora', + description=f"The type of adapter to use for fine-tuning. Available supported methods: {PeftType.supported()}, default to 'lora'", + use_default_converter=False, + converter=_adapter_converter) + adapter_config: t.Dict[str, t.Any] = dantic.Field(None, + description='The configuration for the adapter. The content of the dict depends on the adapter type.', + validator=attr.validators.optional(attr.validators.instance_of(dict)), + converter=attr.converters.default_if_none(factory=dict), + use_default_converter=False) inference_mode: bool = dantic.Field(False, description='Whether to use this Adapter for inference', use_default_converter=False) llm_config_class: type[LLMConfig] = dantic.Field(None, description='The reference class to openllm.LLMConfig', use_default_converter=False) @@ -214,8 +204,7 @@ class FineTuneConfig: # no need for peft_type since it is internally managed by OpenLLM and PEFT if 'peft_type' in adapter_config: adapter_config.pop('peft_type') # respect user set task_type if it is passed, otherwise use one managed by OpenLLM - task_type, inference_mode = adapter_config.pop('task_type', peft.TaskType[self.llm_config_class.peft_task_type()]), adapter_config.pop( - 'inference_mode', self.inference_mode) + task_type, inference_mode = adapter_config.pop('task_type', peft.TaskType[self.llm_config_class.peft_task_type()]), adapter_config.pop('inference_mode', self.inference_mode) return peft.PEFT_TYPE_TO_CONFIG_MAPPING[self.adapter_type.to_str()](task_type=task_type, inference_mode=inference_mode, **adapter_config) def train(self) -> FineTuneConfig: @@ -245,8 +234,7 @@ class GenerationConfig(ReprMixin): 0, ge=0, description= - 'The minimum length of the sequence to be generated. Corresponds to the length of the input prompt + `min_new_tokens`. Its effect is overridden by `min_new_tokens`, if also set.' - ) + 'The minimum length of the sequence to be generated. Corresponds to the length of the input prompt + `min_new_tokens`. Its effect is overridden by `min_new_tokens`, if also set.') min_new_tokens: int = dantic.Field(description='The minimum numbers of tokens to generate, ignoring the number of tokens in the prompt.') early_stopping: bool = dantic.Field( False, @@ -254,25 +242,18 @@ class GenerationConfig(ReprMixin): '''Controls the stopping condition for beam-based methods, like beam-search. It accepts the following values: `True`, where the generation stops as soon as there are `num_beams` complete candidates; `False`, where an heuristic is applied and the generation stops when is it very unlikely to find better candidates; `"never"`, where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm) ''' ) max_time: float = dantic.Field( - description= - 'The maximum amount of time you allow the computation to run for in seconds. generation will still finish the current pass after allocated time has been passed.' - ) + description='The maximum amount of time you allow the computation to run for in seconds. 
generation will still finish the current pass after allocated time has been passed.') num_beams: int = dantic.Field(1, description='Number of beams for beam search. 1 means no beam search.') num_beam_groups: int = dantic.Field( 1, description= - 'Number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams. [this paper](https://arxiv.org/pdf/1610.02424.pdf) for more details.' - ) - penalty_alpha: float = dantic.Field( - description='The values balance the model confidence and the degeneration penalty in contrastive search decoding.') - use_cache: bool = dantic.Field( - True, description='Whether or not the model should use the past last key/values attentions (if applicable to the model) to speed up decoding.') + 'Number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams. [this paper](https://arxiv.org/pdf/1610.02424.pdf) for more details.') + penalty_alpha: float = dantic.Field(description='The values balance the model confidence and the degeneration penalty in contrastive search decoding.') + use_cache: bool = dantic.Field(True, description='Whether or not the model should use the past last key/values attentions (if applicable to the model) to speed up decoding.') temperature: float = dantic.Field(1.0, ge=0.0, le=1.0, description='The value used to modulate the next token probabilities.') top_k: int = dantic.Field(50, description='The number of highest probability vocabulary tokens to keep for top-k-filtering.') - top_p: float = dantic.Field( - 1.0, - description= - 'If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or higher are kept for generation.') + top_p: float = dantic.Field(1.0, + description='If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or higher are kept for generation.') typical_p: float = dantic.Field( 1.0, description= @@ -293,14 +274,10 @@ class GenerationConfig(ReprMixin): description= "This value is subtracted from a beam's score if it generates a token same as any beam from other group at a particular time. Note that `diversity_penalty` is only effective if `group beam search` is enabled. " ) - repetition_penalty: float = dantic.Field( - 1.0, - description='The parameter for repetition penalty. 1.0 means no penalty. See [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.' - ) + repetition_penalty: float = dantic.Field(1.0, + description='The parameter for repetition penalty. 1.0 means no penalty. See [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.') encoder_repetition_penalty: float = dantic.Field( - 1.0, - description= - 'The paramater for encoder_repetition_penalty. An exponential penalty on sequences that are not in the original input. 1.0 means no penalty.') + 1.0, description='The paramater for encoder_repetition_penalty. An exponential penalty on sequences that are not in the original input. 1.0 means no penalty.') length_penalty: float = dantic.Field( 1.0, description= @@ -329,46 +306,34 @@ class GenerationConfig(ReprMixin): 'The id of the token to force as the first generated token after the ``decoder_start_token_id``. Useful for multilingual models like [mBART](https://huggingface.co/docs/transformers/model_doc/mbart) where the first generated token needs to be the target language token. 
' ) forced_eos_token_id: t.Union[int, t.List[int]] = dantic.Field( - description= - 'The id of the token to force as the last generated token when `max_length` is reached. Optionally, use a list to set multiple *end-of-sequence* tokens.' - ) + description='The id of the token to force as the last generated token when `max_length` is reached. Optionally, use a list to set multiple *end-of-sequence* tokens.') remove_invalid_values: bool = dantic.Field( False, - description= - 'Whether to remove possible *nan* and *inf* outputs of the model to prevent the generation method to crash. Note that using `remove_invalid_values` can slow down generation.' + description='Whether to remove possible *nan* and *inf* outputs of the model to prevent the generation method to crash. Note that using `remove_invalid_values` can slow down generation.' ) exponential_decay_length_penalty: t.Tuple[int, float] = dantic.Field( description= 'This tuple adds an exponentially increasing length penalty, after a certain amount of tokens have been generated. The tuple shall consist of: `(start_index, decay_factor)` where `start_index` indicates where penalty starts and `decay_factor` represents the factor of exponential decay' ) suppress_tokens: t.List[int] = dantic.Field( - description= - 'A list of tokens that will be suppressed at generation. The `SupressTokens` logit processor will set their log probs to `-inf` so that they are not sampled.' - ) + description='A list of tokens that will be suppressed at generation. The `SupressTokens` logit processor will set their log probs to `-inf` so that they are not sampled.') begin_suppress_tokens: t.List[int] = dantic.Field( description= - 'A list of tokens that will be suppressed at the beginning of the generation. The `SupressBeginTokens` logit processor will set their log probs to `-inf` so that they are not sampled. ' - ) + 'A list of tokens that will be suppressed at the beginning of the generation. The `SupressBeginTokens` logit processor will set their log probs to `-inf` so that they are not sampled. ') forced_decoder_ids: t.List[t.List[int]] = dantic.Field( description= 'A list of pairs of integers which indicates a mapping from generation indices to token indices that will be forced before sampling. For example, `[[1, 123]]` means the second generated token will always be a token of index 123.' ) num_return_sequences: int = dantic.Field(1, description='The number of independently computed returned sequences for each element in the batch.') - output_attentions: bool = dantic.Field( - False, - description='Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more details.') - output_hidden_states: bool = dantic.Field( - False, description='Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more details.') - output_scores: bool = dantic.Field( - False, description='Whether or not to return the prediction scores. See `scores` under returned tensors for more details.') + output_attentions: bool = dantic.Field(False, + description='Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more details.') + output_hidden_states: bool = dantic.Field(False, description='Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more details.') + output_scores: bool = dantic.Field(False, description='Whether or not to return the prediction scores. 
See `scores` under returned tensors for more details.') pad_token_id: int = dantic.Field(description='The id of the *padding* token.') bos_token_id: int = dantic.Field(description='The id of the *beginning-of-sequence* token.') - eos_token_id: t.Union[int, t.List[int]] = dantic.Field( - description='The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.') - encoder_no_repeat_ngram_size: int = dantic.Field( - 0, description='If set to int > 0, all ngrams of that size that occur in the `encoder_input_ids` cannot occur in the `decoder_input_ids`.') - decoder_start_token_id: int = dantic.Field( - description='If an encoder-decoder model starts decoding with a different token than *bos*, the id of that token.') + eos_token_id: t.Union[int, t.List[int]] = dantic.Field(description='The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.') + encoder_no_repeat_ngram_size: int = dantic.Field(0, description='If set to int > 0, all ngrams of that size that occur in the `encoder_input_ids` cannot occur in the `decoder_input_ids`.') + decoder_start_token_id: int = dantic.Field(description='If an encoder-decoder model starts decoding with a different token than *bos*, the id of that token.') if t.TYPE_CHECKING and not MYPY: # stubs this for pyright as mypy already has a attr plugin builtin @@ -390,13 +355,10 @@ class GenerationConfig(ReprMixin): bentoml_cattr.register_unstructure_hook_factory( lambda cls: attr.has(cls) and lenient_issubclass(cls, GenerationConfig), - lambda cls: make_dict_unstructure_fn(cls, - bentoml_cattr, - _cattrs_omit_if_default=False, - _cattrs_use_linecache=True, - **{ - k: override(omit=True) for k, v in attr.fields_dict(cls).items() if v.default in (None, attr.NOTHING) - })) + lambda cls: make_dict_unstructure_fn( + cls, bentoml_cattr, _cattrs_omit_if_default=False, _cattrs_use_linecache=True, **{ + k: override(omit=True) for k, v in attr.fields_dict(cls).items() if v.default in (None, attr.NOTHING) + })) @attr.frozen(slots=True, repr=False, init=False) class SamplingParams(ReprMixin): @@ -425,8 +387,7 @@ class SamplingParams(ReprMixin): 'Float that penalizes new tokens based on their frequency in the generated text so far. Values > 0 encourage the model to use new tokens, while values < 0 encourage the model to repeat tokens.' ) use_beam_search: bool = dantic.Field(False, description='Whether to use beam search instead of sampling.') - stop: t.List[str] = dantic.Field( - None, description='List of strings that stop the generation when they are generated. The returned output will not contain the stop strings.') + stop: t.List[str] = dantic.Field(None, description='List of strings that stop the generation when they are generated. 
The returned output will not contain the stop strings.') ignore_eos: bool = dantic.Field(False, description='Whether to ignore the EOS token and continue generating tokens after the EOS token is generated.') logprobs: int = dantic.Field(None, description='Number of log probabilities to return per output token.') @@ -441,9 +402,7 @@ class SamplingParams(ReprMixin): def __init__(self, *, _internal: bool = False, **attrs: t.Any): if not _internal: - raise RuntimeError( - "SamplingParams is not meant to be used directly, but you can access this via a LLMConfig.sampling_config or create one with 'SamplingParams.from_generation_config'" - ) + raise RuntimeError("SamplingParams is not meant to be used directly, but you can access this via a LLMConfig.sampling_config or create one with 'SamplingParams.from_generation_config'") _object_setattr(self, 'max_tokens', attrs.pop('max_tokens', 16)) _object_setattr(self, 'temperature', attrs.pop('temperature', 1.0)) _object_setattr(self, 'top_k', attrs.pop('top_k', -1)) @@ -459,11 +418,7 @@ class SamplingParams(ReprMixin): return {i.name for i in attr.fields(self.__class__)} def to_vllm(self) -> vllm.SamplingParams: - return vllm.SamplingParams(max_tokens=self.max_tokens, - temperature=self.temperature, - top_k=self.top_k, - top_p=self.top_p, - **bentoml_cattr.unstructure(self)) + return vllm.SamplingParams(max_tokens=self.max_tokens, temperature=self.temperature, top_k=self.top_k, top_p=self.top_p, **bentoml_cattr.unstructure(self)) @classmethod def from_generation_config(cls, generation_config: GenerationConfig, **attrs: t.Any) -> Self: @@ -481,16 +436,12 @@ class SamplingParams(ReprMixin): bentoml_cattr.register_unstructure_hook_factory( lambda cls: attr.has(cls) and lenient_issubclass(cls, SamplingParams), - lambda cls: make_dict_unstructure_fn(cls, - bentoml_cattr, - _cattrs_omit_if_default=False, - _cattrs_use_linecache=True, - **{ - k: override(omit=True) for k, v in attr.fields_dict(cls).items() if v.default in (None, attr.NOTHING) - })) -bentoml_cattr.register_structure_hook_factory( - lambda cls: attr.has(cls) and lenient_issubclass(cls, SamplingParams), - lambda cls: make_dict_structure_fn(cls, bentoml_cattr, _cattrs_forbid_extra_keys=True, max_new_tokens=override(rename='max_tokens'))) + lambda cls: make_dict_unstructure_fn( + cls, bentoml_cattr, _cattrs_omit_if_default=False, _cattrs_use_linecache=True, **{ + k: override(omit=True) for k, v in attr.fields_dict(cls).items() if v.default in (None, attr.NOTHING) + })) +bentoml_cattr.register_structure_hook_factory(lambda cls: attr.has(cls) and lenient_issubclass(cls, SamplingParams), + lambda cls: make_dict_structure_fn(cls, bentoml_cattr, _cattrs_forbid_extra_keys=True, max_new_tokens=override(rename='max_tokens'))) # cached it here to save one lookup per assignment _object_getattribute = object.__getattribute__ @@ -538,14 +489,12 @@ class ModelSettings(t.TypedDict, total=False): # tokenizer_class is the custom tokenizer class for this given LLM tokenizer_class: t.Optional[str] -_transformed_type: DictStrAny = { - 'fine_tune_strategies': t.Dict[AdapterType, FineTuneConfig], - 'default_backend': t.Dict[LiteralResourceSpec, LiteralBackend] -} +_transformed_type: DictStrAny = {'fine_tune_strategies': t.Dict[AdapterType, FineTuneConfig], 'default_backend': t.Dict[LiteralResourceSpec, LiteralBackend]} @attr.define(frozen=False, slots=True, - field_transformer=lambda _, __: [ + field_transformer=lambda _, + __: [ attr.Attribute.from_counting_attr( k, dantic.Field(kw_only=False if t.get_origin(ann) is 
not Required else True, @@ -553,7 +502,8 @@ _transformed_type: DictStrAny = { use_default_converter=False, type=_transformed_type.get(k, ann), metadata={'target': f'__openllm_{k}__'}, - description=f'ModelSettings field for {k}.')) for k, ann in t.get_type_hints(ModelSettings).items() + description=f'ModelSettings field for {k}.')) for k, + ann in t.get_type_hints(ModelSettings).items() ]) class _ModelSettingsAttr: '''Internal attrs representation of ModelSettings.''' @@ -570,8 +520,7 @@ class _ModelSettingsAttr: model_ids=['__default__'], architecture='PreTrainedModel', default_backend={ - 'cpu': 'pt', - 'nvidia.com/gpu': 'pt' + 'cpu': 'pt', 'nvidia.com/gpu': 'pt' }, name_type='dasherize', requires_gpu=False, @@ -619,8 +568,7 @@ def get_default_backend(backend_mapping: dict[LiteralResourceSpec, LiteralBacken def structure_settings(cl_: type[LLMConfig], cls: type[_ModelSettingsAttr]) -> _ModelSettingsAttr: if 'generation_class' in cl_.__config__: - raise ValueError( - f"'generation_class' shouldn't be defined in '__config__', rather defining all required attributes under '{cl_}.GenerationConfig' instead.") + raise ValueError(f"'generation_class' shouldn't be defined in '__config__', rather defining all required attributes under '{cl_}.GenerationConfig' instead.") required_fields = {k for k, ann in t.get_type_hints(ModelSettings).items() if t.get_origin(ann) is Required} if any(i not in cl_.__config__ for i in required_fields): @@ -633,8 +581,7 @@ def structure_settings(cl_: type[LLMConfig], cls: type[_ModelSettingsAttr]) -> _ if not has_custom_name: _final_value_dct['model_name'] = inflection.underscore(_cl_name) if _settings_attr['name_type'] == 'dasherize' else _cl_name.lower() - _final_value_dct['start_name'] = inflection.dasherize( - _final_value_dct['model_name']) if _settings_attr['name_type'] == 'dasherize' else _final_value_dct['model_name'] + _final_value_dct['start_name'] = inflection.dasherize(_final_value_dct['model_name']) if _settings_attr['name_type'] == 'dasherize' else _final_value_dct['model_name'] model_name = _final_value_dct['model_name'] if 'model_name' in _final_value_dct else _settings_attr.model_name # if the default implementation dependencies doesn't exist, then always fallback to 'pt' @@ -845,20 +792,10 @@ class _ConfigBuilder: __slots__ = ('_cls', '_cls_dict', '_attr_names', '_attrs', '_model_name', '_base_attr_map', '_base_names', '_has_pre_init', '_has_post_init') - def __init__(self, - cls: type[LLMConfig], - these: dict[str, _CountingAttr], - auto_attribs: bool = False, - kw_only: bool = False, - collect_by_mro: bool = True): - attrs, base_attrs, base_attr_map = _transform_attrs(cls, - these, - auto_attribs, - kw_only, - collect_by_mro, - field_transformer=codegen.make_env_transformer(cls, cls.__openllm_model_name__)) - self._cls, self._model_name, self._cls_dict, self._attrs, self._base_names, self._base_attr_map = cls, cls.__openllm_model_name__, dict( - cls.__dict__), attrs, {a.name for a in base_attrs}, base_attr_map + def __init__(self, cls: type[LLMConfig], these: dict[str, _CountingAttr], auto_attribs: bool = False, kw_only: bool = False, collect_by_mro: bool = True): + attrs, base_attrs, base_attr_map = _transform_attrs(cls, these, auto_attribs, kw_only, collect_by_mro, field_transformer=codegen.make_env_transformer(cls, cls.__openllm_model_name__)) + self._cls, self._model_name, self._cls_dict, self._attrs, self._base_names, self._base_attr_map = cls, cls.__openllm_model_name__, dict(cls.__dict__), attrs, {a.name for a in base_attrs + }, 
base_attr_map self._attr_names = tuple(a.name for a in attrs) self._has_pre_init = bool(getattr(cls, '__attrs_pre_init__', False)) self._has_post_init = bool(getattr(cls, '__attrs_post_init__', False)) @@ -943,8 +880,7 @@ class _ConfigBuilder: def add_attrs_init(self) -> Self: self._cls_dict['__attrs_init__'] = codegen.add_method_dunders( - self._cls, - _make_init(self._cls, self._attrs, self._has_pre_init, self._has_post_init, False, True, False, self._base_attr_map, False, None, True)) + self._cls, _make_init(self._cls, self._attrs, self._has_pre_init, self._has_post_init, False, True, False, self._base_attr_map, False, None, True)) return self def add_repr(self) -> Self: @@ -1073,13 +1009,13 @@ class LLMConfig(_ConfigAttr): repr=False, init=False, collect_by_mro=True, - field_transformer=codegen.make_env_transformer( - cls, - cls.__openllm_model_name__, - suffix=suffix_env, - globs=globs, - default_callback=lambda field_name, field_default: getattr(getattr(cls, class_attr), field_name, field_default) - if codegen.has_own_attribute(cls, class_attr) else field_default)) + field_transformer=codegen.make_env_transformer(cls, + cls.__openllm_model_name__, + suffix=suffix_env, + globs=globs, + default_callback=lambda field_name, + field_default: getattr(getattr(cls, class_attr), field_name, field_default) + if codegen.has_own_attribute(cls, class_attr) else field_default)) # For pickling to work, the __module__ variable needs to be set to the # frame where the class is created. This respect the module that is created from cls try: @@ -1113,8 +1049,7 @@ class LLMConfig(_ConfigAttr): raise openllm_core.exceptions.MissingAnnotationAttributeError(f"The following field doesn't have a type annotation: {missing_annotated}") # We need to set the accepted key before generation_config # as generation_config is a special field that users shouldn't pass. - cls.__openllm_accepted_keys__ = set(these.keys()) | {a.name for a in attr.fields(cls.__openllm_generation_class__) - } | {a.name for a in attr.fields(cls.__openllm_sampling_class__)} + cls.__openllm_accepted_keys__ = set(these.keys()) | {a.name for a in attr.fields(cls.__openllm_generation_class__)} | {a.name for a in attr.fields(cls.__openllm_sampling_class__)} cls = _ConfigBuilder(cls, these).add_attrs_init().add_repr().build_class() # Finally, resolve the types @@ -1126,11 +1061,7 @@ class LLMConfig(_ConfigAttr): attr.resolve_types(cls.__openllm_sampling_class__, globalns=globs) cls = attr.resolve_types(cls, globalns=globs) # the hint cache for easier access - cls.__openllm_hints__ = { - f.name: f.type - for ite in [attr.fields(cls), attr.fields(cls.__openllm_generation_class__), - attr.fields(cls.__openllm_sampling_class__)] for f in ite - } + cls.__openllm_hints__ = {f.name: f.type for ite in [attr.fields(cls), attr.fields(cls.__openllm_generation_class__), attr.fields(cls.__openllm_sampling_class__)] for f in ite} # for pickling to work, need to set the module to the correct outer frame try: @@ -1141,8 +1072,7 @@ class LLMConfig(_ConfigAttr): def __setattr__(self, attr: str, value: t.Any) -> None: if attr in _reserved_namespace: raise ForbiddenAttributeError( - f'{attr} should not be set during runtime as these value will be reflected during runtime. Instead, you can create a custom LLM subclass {self.__class__.__name__}.' - ) + f'{attr} should not be set during runtime as these value will be reflected during runtime. 
Instead, you can create a custom LLM subclass {self.__class__.__name__}.') super().__setattr__(attr, value) def __init__(self, *, generation_config: DictStrAny | None = None, __openllm_extras__: DictStrAny | None = None, **attrs: t.Any): @@ -1157,9 +1087,7 @@ class LLMConfig(_ConfigAttr): for k in _cached_keys: if k in generation_config or k in sampling_config or attrs[k] is None: del attrs[k] - self.__openllm_extras__ = config_merger.merge(first_not_none(__openllm_extras__, default={}), { - k: v for k, v in attrs.items() if k not in self.__openllm_accepted_keys__ - }) + self.__openllm_extras__ = config_merger.merge(first_not_none(__openllm_extras__, default={}), {k: v for k, v in attrs.items() if k not in self.__openllm_accepted_keys__}) self.generation_config = self['generation_class'](_internal=True, **generation_config) self.sampling_config = self['sampling_class'].from_generation_config(self.generation_config, **sampling_config) @@ -1363,15 +1291,13 @@ class LLMConfig(_ConfigAttr): return list(self.__openllm_accepted_keys__) + list(self.__openllm_extras__) def values(self) -> list[t.Any]: - return ([getattr(self, k.name) for k in attr.fields(self.__class__)] + - [getattr(self.generation_config, k.name) for k in attr.fields(self.__openllm_generation_class__)] + + return ([getattr(self, k.name) for k in attr.fields(self.__class__)] + [getattr(self.generation_config, k.name) for k in attr.fields(self.__openllm_generation_class__)] + [getattr(self.sampling_config, k.name) for k in attr.fields(self.__openllm_sampling_class__)] + list(self.__openllm_extras__.values())) def items(self) -> list[tuple[str, t.Any]]: return ([(k.name, getattr(self, k.name)) for k in attr.fields(self.__class__)] + [(k.name, getattr(self.generation_config, k.name)) for k in attr.fields(self.__openllm_generation_class__)] + - [(k.name, getattr(self.sampling_config, k.name)) for k in attr.fields(self.__openllm_sampling_class__)] + - list(self.__openllm_extras__.items())) + [(k.name, getattr(self.sampling_config, k.name)) for k in attr.fields(self.__openllm_sampling_class__)] + list(self.__openllm_extras__.items())) def __iter__(self) -> t.Iterator[str]: return iter(self.keys()) @@ -1403,9 +1329,9 @@ class LLMConfig(_ConfigAttr): _new_cfg = {k: v for k, v in attrs.items() if k in attr.fields_dict(_ModelSettingsAttr)} attrs = {k: v for k, v in attrs.items() if k not in _new_cfg} new_cls = types.new_class( - name or f"{cls.__name__.replace('Config', '')}DerivateConfig", (cls,), {}, lambda ns: ns.update({ - '__config__': config_merger.merge(copy.deepcopy(cls.__dict__['__config__']), _new_cfg), - '__base_config__': cls, # keep a reference for easy access + name or f"{cls.__name__.replace('Config', '')}DerivateConfig", (cls,), {}, + lambda ns: ns.update({ + '__config__': config_merger.merge(copy.deepcopy(cls.__dict__['__config__']), _new_cfg), '__base_config__': cls, # keep a reference for easy access })) # For pickling to work, the __module__ variable needs to be set to the @@ -1566,9 +1492,8 @@ class LLMConfig(_ConfigAttr): ''' return generation_result -bentoml_cattr.register_unstructure_hook_factory( - lambda cls: lenient_issubclass(cls, LLMConfig), - lambda cls: make_dict_unstructure_fn(cls, bentoml_cattr, _cattrs_omit_if_default=False, _cattrs_use_linecache=True)) +bentoml_cattr.register_unstructure_hook_factory(lambda cls: lenient_issubclass(cls, LLMConfig), + lambda cls: make_dict_unstructure_fn(cls, bentoml_cattr, _cattrs_omit_if_default=False, _cattrs_use_linecache=True)) def structure_llm_config(data: t.Any, 
cls: type[LLMConfig]) -> LLMConfig: """Structure a dictionary to a LLMConfig object. @@ -1594,5 +1519,4 @@ def structure_llm_config(data: t.Any, cls: type[LLMConfig]) -> LLMConfig: return cls(generation_config=generation_config, __openllm_extras__=data, **cls_attrs) bentoml_cattr.register_structure_hook_func(lambda cls: lenient_issubclass(cls, LLMConfig), structure_llm_config) -openllm_home = os.path.expanduser( - os.environ.get('OPENLLM_HOME', os.path.join(os.environ.get('XDG_CACHE_HOME', os.path.join(os.path.expanduser('~'), '.cache')), 'openllm'))) +openllm_home = os.path.expanduser(os.environ.get('OPENLLM_HOME', os.path.join(os.environ.get('XDG_CACHE_HOME', os.path.join(os.path.expanduser('~'), '.cache')), 'openllm'))) diff --git a/openllm-core/src/openllm_core/_prompt.py b/openllm-core/src/openllm_core/_prompt.py index d0e64caf..3fe60d89 100644 --- a/openllm-core/src/openllm_core/_prompt.py +++ b/openllm-core/src/openllm_core/_prompt.py @@ -29,5 +29,4 @@ def process_prompt(prompt: str, template: str | None = None, use_prompt_template return template.format(instruction=prompt, **prompt_variables) except KeyError as e: raise RuntimeError( - f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. Use 'use_prompt_template=False' to disable the default prompt template." - ) from None + f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. Use 'use_prompt_template=False' to disable the default prompt template.") from None diff --git a/openllm-core/src/openllm_core/_schema.py b/openllm-core/src/openllm_core/_schema.py index b7396507..424c30ea 100644 --- a/openllm-core/src/openllm_core/_schema.py +++ b/openllm-core/src/openllm_core/_schema.py @@ -40,9 +40,7 @@ class GenerationInput: return attr.make_class(inflection.camelize(llm_config['model_name']) + 'GenerationInput', attrs={ 'prompt': attr.field(type=str), - 'llm_config': attr.field(type=llm_config.__class__, - default=llm_config, - converter=functools.partial(cls.convert_llm_config, cls=llm_config.__class__)), + 'llm_config': attr.field(type=llm_config.__class__, default=llm_config, converter=functools.partial(cls.convert_llm_config, cls=llm_config.__class__)), 'adapter_name': attr.field(default=None, type=str) }) @@ -85,12 +83,8 @@ def unmarshal_vllm_outputs(request_output: vllm.RequestOutput) -> dict[str, t.An finished=request_output.finished, prompt_token_ids=request_output.prompt_token_ids, outputs=[ - dict(index=it.index, - text=it.text, - token_ids=it.token_ids, - cumulative_logprob=it.cumulative_logprob, - logprobs=it.logprobs, - finish_reason=it.finish_reason) for it in request_output.outputs + dict(index=it.index, text=it.text, token_ids=it.token_ids, cumulative_logprob=it.cumulative_logprob, logprobs=it.logprobs, finish_reason=it.finish_reason) + for it in request_output.outputs ]) @attr.define diff --git a/openllm-core/src/openllm_core/_strategies.py b/openllm-core/src/openllm_core/_strategies.py index f6cc5d19..078a93d1 100644 --- a/openllm-core/src/openllm_core/_strategies.py +++ b/openllm-core/src/openllm_core/_strategies.py @@ -217,7 +217,8 @@ def _validate(cls: type[DynResource], val: list[t.Any]) -> None: def _make_resource_class(name: str, resource_kind: str, docstring: str) -> type[DynResource]: return types.new_class( - name, (bentoml.Resource[t.List[str]], ReprMixin), {'resource_id': resource_kind}, lambda ns: ns.update({ + name, (bentoml.Resource[t.List[str]], ReprMixin), {'resource_id': resource_kind}, + lambda ns: ns.update({ 'resource_id': 
resource_kind, 'from_spec': classmethod(_from_spec), 'from_system': classmethod(_from_system), @@ -234,12 +235,16 @@ _NVIDIA_GPU_RESOURCE: t.Literal['nvidia.com/gpu'] = 'nvidia.com/gpu' _CPU_RESOURCE: t.Literal['cpu'] = 'cpu' NvidiaGpuResource = _make_resource_class( - 'NvidiaGpuResource', _NVIDIA_GPU_RESOURCE, '''NVIDIA GPU resource. + 'NvidiaGpuResource', + _NVIDIA_GPU_RESOURCE, + '''NVIDIA GPU resource. This is a modified version of internal's BentoML's NvidiaGpuResource where it respects and parse CUDA_VISIBLE_DEVICES correctly.''') AmdGpuResource = _make_resource_class( - 'AmdGpuResource', _AMD_GPU_RESOURCE, '''AMD GPU resource. + 'AmdGpuResource', + _AMD_GPU_RESOURCE, + '''AMD GPU resource. Since ROCm will respect CUDA_VISIBLE_DEVICES, the behaviour of from_spec, from_system are similar to ``NvidiaGpuResource``. Currently ``validate`` is not yet supported.''') @@ -305,13 +310,10 @@ class CascadingResourceStrategy(bentoml.Strategy, ReprMixin): return math.ceil(cpus) * workers_per_resource # this should not be reached by user since we always read system resource as default - raise ValueError( - f'No known supported resource available for {runnable_class}. Please check your resource request. Leaving it blank will allow BentoML to use system resources.' - ) + raise ValueError(f'No known supported resource available for {runnable_class}. Please check your resource request. Leaving it blank will allow BentoML to use system resources.') @classmethod - def get_worker_env(cls, runnable_class: type[bentoml.Runnable], resource_request: dict[str, t.Any] | None, workers_per_resource: int | float, - worker_index: int) -> dict[str, t.Any]: + def get_worker_env(cls, runnable_class: type[bentoml.Runnable], resource_request: dict[str, t.Any] | None, workers_per_resource: int | float, worker_index: int) -> dict[str, t.Any]: '''Get worker env for this given worker_index. Args: @@ -369,15 +371,12 @@ class CascadingResourceStrategy(bentoml.Strategy, ReprMixin): # NOTE: We hit this branch when workers_per_resource is set to # float, for example 0.5 or 0.25 if workers_per_resource > 1: - raise ValueError( - "Currently, the default strategy doesn't support workers_per_resource > 1. It is recommended that one should implement a custom strategy in this case." - ) + raise ValueError("Currently, the default strategy doesn't support workers_per_resource > 1. It is recommended that one should implement a custom strategy in this case.") # We are round the assigned resource here. This means if workers_per_resource=.4 # then it will round down to 2. If workers_per_source=0.6, then it will also round up to 2. 
assigned_resource_per_worker = round(1 / workers_per_resource) if len(gpus) < assigned_resource_per_worker: - logger.warning('Failed to allocate %s GPUs for %s (number of available GPUs < assigned workers per resource [%s])', gpus, worker_index, - assigned_resource_per_worker) + logger.warning('Failed to allocate %s GPUs for %s (number of available GPUs < assigned workers per resource [%s])', gpus, worker_index, assigned_resource_per_worker) raise IndexError(f"There aren't enough assigned GPU(s) for given worker id '{worker_index}' [required: {assigned_resource_per_worker}].") assigned_gpu = gpus[assigned_resource_per_worker * worker_index:assigned_resource_per_worker * (worker_index + 1)] dev = ','.join(assigned_gpu) diff --git a/openllm-core/src/openllm_core/config/configuration_auto.py b/openllm-core/src/openllm_core/config/configuration_auto.py index 85535498..a13d9d12 100644 --- a/openllm-core/src/openllm_core/config/configuration_auto.py +++ b/openllm-core/src/openllm_core/config/configuration_auto.py @@ -24,9 +24,8 @@ if t.TYPE_CHECKING: ConfigItemsView = _odict_items[str, type[openllm_core.LLMConfig]] # NOTE: This is the entrypoint when adding new model config -CONFIG_MAPPING_NAMES = OrderedDict([('chatglm', 'ChatGLMConfig'), ('dolly_v2', 'DollyV2Config'), ('falcon', 'FalconConfig'), - ('flan_t5', 'FlanT5Config'), ('gpt_neox', 'GPTNeoXConfig'), ('llama', 'LlamaConfig'), ('mpt', 'MPTConfig'), - ('opt', 'OPTConfig'), ('stablelm', 'StableLMConfig'), ('starcoder', 'StarCoderConfig'), +CONFIG_MAPPING_NAMES = OrderedDict([('chatglm', 'ChatGLMConfig'), ('dolly_v2', 'DollyV2Config'), ('falcon', 'FalconConfig'), ('flan_t5', 'FlanT5Config'), ('gpt_neox', 'GPTNeoXConfig'), + ('llama', 'LlamaConfig'), ('mpt', 'MPTConfig'), ('opt', 'OPTConfig'), ('stablelm', 'StableLMConfig'), ('starcoder', 'StarCoderConfig'), ('baichuan', 'BaichuanConfig')]) class _LazyConfigMapping(OrderedDict, ReprMixin): diff --git a/openllm-core/src/openllm_core/config/configuration_baichuan.py b/openllm-core/src/openllm_core/config/configuration_baichuan.py index dd70745e..e5cfdfd5 100644 --- a/openllm-core/src/openllm_core/config/configuration_baichuan.py +++ b/openllm-core/src/openllm_core/config/configuration_baichuan.py @@ -46,8 +46,12 @@ class BaichuanConfig(openllm_core.LLMConfig): 'architecture': 'BaiChuanForCausalLM', 'default_id': 'baichuan-inc/baichuan-7b', 'model_ids': [ - 'baichuan-inc/baichuan-7b', 'baichuan-inc/baichuan-13b-base', 'baichuan-inc/baichuan-13b-chat', 'fireballoon/baichuan-vicuna-chinese-7b', - 'fireballoon/baichuan-vicuna-7b', 'hiyouga/baichuan-7b-sft' + 'baichuan-inc/baichuan-7b', + 'baichuan-inc/baichuan-13b-base', + 'baichuan-inc/baichuan-13b-chat', + 'fireballoon/baichuan-vicuna-chinese-7b', + 'fireballoon/baichuan-vicuna-7b', + 'hiyouga/baichuan-7b-sft' ] } @@ -63,12 +67,7 @@ class BaichuanConfig(openllm_core.LLMConfig): temperature: float | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: - return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), { - 'max_new_tokens': max_new_tokens, - 'top_p': top_p, - 'temperature': temperature, - **attrs - }, {} + return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {'max_new_tokens': max_new_tokens, 'top_p': top_p, 'temperature': temperature, **attrs}, {} def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str: return generation_result[0] diff --git 
a/openllm-core/src/openllm_core/config/configuration_chatglm.py b/openllm-core/src/openllm_core/config/configuration_chatglm.py index 2cf948b2..2e9f84d4 100644 --- a/openllm-core/src/openllm_core/config/configuration_chatglm.py +++ b/openllm-core/src/openllm_core/config/configuration_chatglm.py @@ -51,8 +51,7 @@ class ChatGLMConfig(openllm_core.LLMConfig): 'default_id': 'thudm/chatglm-6b', 'model_ids': ['thudm/chatglm-6b', 'thudm/chatglm-6b-int8', 'thudm/chatglm-6b-int4', 'thudm/chatglm2-6b', 'thudm/chatglm2-6b-int4'] } - retain_history: bool = dantic.Field( - False, description='Whether to retain history given to the model. If set to True, then the model will retain given history.') + retain_history: bool = dantic.Field(False, description='Whether to retain history given to the model. If set to True, then the model will retain given history.') use_half_precision: bool = dantic.Field(True, description='Whether to use half precision for model.') class GenerationConfig: @@ -78,20 +77,9 @@ class ChatGLMConfig(openllm_core.LLMConfig): else: prompt_text = prompt postprocess_generate_kwargs = {'chat_history': chat_history if chat_history is not None else None} - return prompt_text, { - 'max_new_tokens': max_new_tokens, - 'num_beams': num_beams, - 'top_p': top_p, - 'temperature': temperature, - **attrs - }, postprocess_generate_kwargs + return prompt_text, {'max_new_tokens': max_new_tokens, 'num_beams': num_beams, 'top_p': top_p, 'temperature': temperature, **attrs}, postprocess_generate_kwargs - def postprocess_generate(self, - prompt: str, - generation_result: tuple[str, list[tuple[str, str]]], - *, - chat_history: list[tuple[str, str]] | None = None, - **attrs: t.Any) -> str: + def postprocess_generate(self, prompt: str, generation_result: tuple[str, list[tuple[str, str]]], *, chat_history: list[tuple[str, str]] | None = None, **attrs: t.Any) -> str: generated, history = generation_result if self.config.retain_history: if chat_history is None: raise ValueError("'retain_history' is True while there is no history provided.") diff --git a/openllm-core/src/openllm_core/config/configuration_dolly_v2.py b/openllm-core/src/openllm_core/config/configuration_dolly_v2.py index 42078ad9..836b5f50 100644 --- a/openllm-core/src/openllm_core/config/configuration_dolly_v2.py +++ b/openllm-core/src/openllm_core/config/configuration_dolly_v2.py @@ -98,11 +98,7 @@ class DollyV2Config(openllm_core.LLMConfig): use_default_prompt_template: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), { - 'max_new_tokens': max_new_tokens, - 'top_k': top_k, - 'top_p': top_p, - 'temperature': temperature, - **attrs + 'max_new_tokens': max_new_tokens, 'top_k': top_k, 'top_p': top_p, 'temperature': temperature, **attrs }, {} def postprocess_generate(self, prompt: str, generation_result: list[dict[t.Literal['generated_text'], str]], **_: t.Any) -> str: diff --git a/openllm-core/src/openllm_core/config/configuration_falcon.py b/openllm-core/src/openllm_core/config/configuration_falcon.py index d18df361..4e87538d 100644 --- a/openllm-core/src/openllm_core/config/configuration_falcon.py +++ b/openllm-core/src/openllm_core/config/configuration_falcon.py @@ -49,12 +49,7 @@ class FalconConfig(openllm_core.LLMConfig): 'default_id': 'tiiuae/falcon-7b', 'model_ids': ['tiiuae/falcon-7b', 'tiiuae/falcon-40b', 'tiiuae/falcon-7b-instruct', 'tiiuae/falcon-40b-instruct'], 'fine_tune_strategies': ({ - 'adapter_type': 'lora', 
- 'r': 64, - 'lora_alpha': 16, - 'lora_dropout': 0.1, - 'bias': 'none', - 'target_modules': ['query_key_value', 'dense', 'dense_h_to_4h', 'dense_4h_to_h'] + 'adapter_type': 'lora', 'r': 64, 'lora_alpha': 16, 'lora_dropout': 0.1, 'bias': 'none', 'target_modules': ['query_key_value', 'dense', 'dense_h_to_4h', 'dense_4h_to_h'] },) } @@ -74,11 +69,7 @@ class FalconConfig(openllm_core.LLMConfig): use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), { - 'max_new_tokens': max_new_tokens, - 'top_k': top_k, - 'num_return_sequences': num_return_sequences, - 'eos_token_id': eos_token_id, - **attrs + 'max_new_tokens': max_new_tokens, 'top_k': top_k, 'num_return_sequences': num_return_sequences, 'eos_token_id': eos_token_id, **attrs }, {} def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str: diff --git a/openllm-core/src/openllm_core/config/configuration_flan_t5.py b/openllm-core/src/openllm_core/config/configuration_flan_t5.py index aa027ac4..8d5f2fa0 100644 --- a/openllm-core/src/openllm_core/config/configuration_flan_t5.py +++ b/openllm-core/src/openllm_core/config/configuration_flan_t5.py @@ -64,11 +64,7 @@ class FlanT5Config(openllm_core.LLMConfig): use_default_prompt_template: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), { - 'max_new_tokens': max_new_tokens, - 'temperature': temperature, - 'top_k': top_k, - 'top_p': top_p, - 'repetition_penalty': repetition_penalty + 'max_new_tokens': max_new_tokens, 'temperature': temperature, 'top_k': top_k, 'top_p': top_p, 'repetition_penalty': repetition_penalty }, {} def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str: diff --git a/openllm-core/src/openllm_core/config/configuration_gpt_neox.py b/openllm-core/src/openllm_core/config/configuration_gpt_neox.py index cc03e19f..585cddd4 100644 --- a/openllm-core/src/openllm_core/config/configuration_gpt_neox.py +++ b/openllm-core/src/openllm_core/config/configuration_gpt_neox.py @@ -63,10 +63,7 @@ class GPTNeoXConfig(openllm_core.LLMConfig): max_new_tokens: int | None = None, use_default_prompt_template: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: - return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), { - 'max_new_tokens': max_new_tokens, - 'temperature': temperature - }, {} + return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {'max_new_tokens': max_new_tokens, 'temperature': temperature}, {} def postprocess_generate(self, prompt: str, generation_result: list[str], **_: t.Any) -> str: return generation_result[0] diff --git a/openllm-core/src/openllm_core/config/configuration_llama.py b/openllm-core/src/openllm_core/config/configuration_llama.py index ef01c1cb..221355d0 100644 --- a/openllm-core/src/openllm_core/config/configuration_llama.py +++ b/openllm-core/src/openllm_core/config/configuration_llama.py @@ -45,8 +45,11 @@ If a question does not make any sense, or is not factually coherent, explain why ''' SINST_KEY, EINST_KEY, SYS_KEY, EOS_TOKEN, BOS_TOKEN = '[INST]', '[/INST]', '<>', '', '' # TODO: support history and v1 prompt implementation -_v1_prompt, _v2_prompt = '''{instruction}''', '''{start_key} 
{sys_key}\n{system_message}\n{sys_key}\n\n{instruction}\n{end_key} '''.format( - start_key=SINST_KEY, sys_key=SYS_KEY, system_message=SYSTEM_MESSAGE, instruction='{instruction}', end_key=EINST_KEY) +_v1_prompt, _v2_prompt = '''{instruction}''', '''{start_key} {sys_key}\n{system_message}\n{sys_key}\n\n{instruction}\n{end_key} '''.format(start_key=SINST_KEY, + sys_key=SYS_KEY, + system_message=SYSTEM_MESSAGE, + instruction='{instruction}', + end_key=EINST_KEY) PROMPT_MAPPING = {'v1': _v1_prompt, 'v2': _v2_prompt} def _get_prompt(model_type: t.Literal['v1', 'v2']) -> str: @@ -71,26 +74,35 @@ class LlamaConfig(openllm_core.LLMConfig): 'name_type': 'lowercase', 'url': 'https://github.com/facebookresearch/llama', 'default_backend': { - 'cpu': 'pt', - 'nvidia.com/gpu': 'pt' + 'cpu': 'pt', 'nvidia.com/gpu': 'pt' }, 'architecture': 'LlamaForCausalLM', 'requirements': ['fairscale', 'sentencepiece'], 'tokenizer_class': 'LlamaTokenizerFast', 'default_id': 'NousResearch/llama-2-7b-hf', 'model_ids': [ - 'meta-llama/Llama-2-70b-chat-hf', 'meta-llama/Llama-2-13b-chat-hf', 'meta-llama/Llama-2-7b-chat-hf', 'meta-llama/Llama-2-70b-hf', - 'meta-llama/Llama-2-13b-hf', 'meta-llama/Llama-2-7b-hf', 'NousResearch/llama-2-70b-chat-hf', 'NousResearch/llama-2-13b-chat-hf', - 'NousResearch/llama-2-7b-chat-hf', 'NousResearch/llama-2-70b-hf', 'NousResearch/llama-2-13b-hf', 'NousResearch/llama-2-7b-hf', - 'openlm-research/open_llama_7b_v2', 'openlm-research/open_llama_3b_v2', 'openlm-research/open_llama_13b', 'huggyllama/llama-65b', - 'huggyllama/llama-30b', 'huggyllama/llama-13b', 'huggyllama/llama-7b' + 'meta-llama/Llama-2-70b-chat-hf', + 'meta-llama/Llama-2-13b-chat-hf', + 'meta-llama/Llama-2-7b-chat-hf', + 'meta-llama/Llama-2-70b-hf', + 'meta-llama/Llama-2-13b-hf', + 'meta-llama/Llama-2-7b-hf', + 'NousResearch/llama-2-70b-chat-hf', + 'NousResearch/llama-2-13b-chat-hf', + 'NousResearch/llama-2-7b-chat-hf', + 'NousResearch/llama-2-70b-hf', + 'NousResearch/llama-2-13b-hf', + 'NousResearch/llama-2-7b-hf', + 'openlm-research/open_llama_7b_v2', + 'openlm-research/open_llama_3b_v2', + 'openlm-research/open_llama_13b', + 'huggyllama/llama-65b', + 'huggyllama/llama-30b', + 'huggyllama/llama-13b', + 'huggyllama/llama-7b' ], 'fine_tune_strategies': ({ - 'adapter_type': 'lora', - 'r': 64, - 'lora_alpha': 16, - 'lora_dropout': 0.1, - 'bias': 'none' + 'adapter_type': 'lora', 'r': 64, 'lora_alpha': 16, 'lora_dropout': 0.1, 'bias': 'none' },) } @@ -113,14 +125,9 @@ class LlamaConfig(openllm_core.LLMConfig): use_default_prompt_template: bool = False, use_llama2_prompt: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: - return process_prompt(prompt, - DEFAULT_PROMPT_TEMPLATE('v2' if use_llama2_prompt else 'v1') if use_default_prompt_template else None, - use_default_prompt_template, **attrs), { - 'max_new_tokens': max_new_tokens, - 'temperature': temperature, - 'top_p': top_p, - 'top_k': top_k - }, {} + return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE('v2' if use_llama2_prompt else 'v1') if use_default_prompt_template else None, use_default_prompt_template, **attrs), { + 'max_new_tokens': max_new_tokens, 'temperature': temperature, 'top_p': top_p, 'top_k': top_k + }, {} def postprocess_generate(self, prompt: str, generation_result: list[str], **_: t.Any) -> str: return generation_result[0] diff --git a/openllm-core/src/openllm_core/config/configuration_mpt.py b/openllm-core/src/openllm_core/config/configuration_mpt.py index 1105a95a..26bb0add 100644 --- 
a/openllm-core/src/openllm_core/config/configuration_mpt.py +++ b/openllm-core/src/openllm_core/config/configuration_mpt.py @@ -69,12 +69,10 @@ class MPTConfig(openllm_core.LLMConfig): 'architecture': 'MPTForCausalLM', 'default_id': 'mosaicml/mpt-7b-instruct', 'model_ids': [ - 'mosaicml/mpt-7b', 'mosaicml/mpt-7b-instruct', 'mosaicml/mpt-7b-chat', 'mosaicml/mpt-7b-storywriter', 'mosaicml/mpt-30b', - 'mosaicml/mpt-30b-instruct', 'mosaicml/mpt-30b-chat' + 'mosaicml/mpt-7b', 'mosaicml/mpt-7b-instruct', 'mosaicml/mpt-7b-chat', 'mosaicml/mpt-7b-storywriter', 'mosaicml/mpt-30b', 'mosaicml/mpt-30b-instruct', 'mosaicml/mpt-30b-chat' ] } - prompt_type: MPTPromptType = dantic.Field('"default"', - description='Given prompt type for running MPT. Default will be inferred from model name if pretrained.') + prompt_type: MPTPromptType = dantic.Field('"default"', description='Given prompt type for running MPT. Default will be inferred from model name if pretrained.') max_sequence_length: int = dantic.Field( 2048, description= @@ -103,11 +101,7 @@ class MPTConfig(openllm_core.LLMConfig): elif 'chat' in self.model_id: prompt_type = 'chat' else: prompt_type = 'default' _template = DEFAULT_PROMPT_TEMPLATE(prompt_type) - return process_prompt(prompt, _template, use_default_prompt_template), { - 'max_new_tokens': max_new_tokens, - 'temperature': temperature, - 'top_p': top_p - }, {} + return process_prompt(prompt, _template, use_default_prompt_template), {'max_new_tokens': max_new_tokens, 'temperature': temperature, 'top_p': top_p}, {} def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **attrs: t.Any) -> str: return generation_result[0] diff --git a/openllm-core/src/openllm_core/config/configuration_opt.py b/openllm-core/src/openllm_core/config/configuration_opt.py index bf362654..f8b1afba 100644 --- a/openllm-core/src/openllm_core/config/configuration_opt.py +++ b/openllm-core/src/openllm_core/config/configuration_opt.py @@ -51,12 +51,7 @@ class OPTConfig(openllm_core.LLMConfig): 'architecture': 'OPTForCausalLM', 'model_ids': ['facebook/opt-125m', 'facebook/opt-350m', 'facebook/opt-1.3b', 'facebook/opt-2.7b', 'facebook/opt-6.7b', 'facebook/opt-66b'], 'fine_tune_strategies': ({ - 'adapter_type': 'lora', - 'r': 16, - 'lora_alpha': 32, - 'target_modules': ['q_proj', 'v_proj'], - 'lora_dropout': 0.05, - 'bias': 'none' + 'adapter_type': 'lora', 'r': 16, 'lora_alpha': 32, 'target_modules': ['q_proj', 'v_proj'], 'lora_dropout': 0.05, 'bias': 'none' },) } format_outputs: bool = dantic.Field(False, description='''Whether to format the outputs. 
This can be used when num_return_sequences > 1.''') @@ -76,10 +71,7 @@ class OPTConfig(openllm_core.LLMConfig): use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), { - 'max_new_tokens': max_new_tokens, - 'temperature': temperature, - 'top_k': top_k, - 'num_return_sequences': num_return_sequences + 'max_new_tokens': max_new_tokens, 'temperature': temperature, 'top_k': top_k, 'num_return_sequences': num_return_sequences }, {} def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **attrs: t.Any) -> str: diff --git a/openllm-core/src/openllm_core/config/configuration_stablelm.py b/openllm-core/src/openllm_core/config/configuration_stablelm.py index 6dcd50ed..bc0492e8 100644 --- a/openllm-core/src/openllm_core/config/configuration_stablelm.py +++ b/openllm-core/src/openllm_core/config/configuration_stablelm.py @@ -51,10 +51,7 @@ class StableLMConfig(openllm_core.LLMConfig): 'url': 'https://github.com/Stability-AI/StableLM', 'architecture': 'GPTNeoXForCausalLM', 'default_id': 'stabilityai/stablelm-tuned-alpha-3b', - 'model_ids': [ - 'stabilityai/stablelm-tuned-alpha-3b', 'stabilityai/stablelm-tuned-alpha-7b', 'stabilityai/stablelm-base-alpha-3b', - 'stabilityai/stablelm-base-alpha-7b' - ] + 'model_ids': ['stabilityai/stablelm-tuned-alpha-3b', 'stabilityai/stablelm-tuned-alpha-7b', 'stabilityai/stablelm-base-alpha-3b', 'stabilityai/stablelm-base-alpha-7b'] } class GenerationConfig: diff --git a/openllm-core/src/openllm_core/config/configuration_starcoder.py b/openllm-core/src/openllm_core/config/configuration_starcoder.py index cbf697d8..548d995a 100644 --- a/openllm-core/src/openllm_core/config/configuration_starcoder.py +++ b/openllm-core/src/openllm_core/config/configuration_starcoder.py @@ -71,14 +71,7 @@ class StarCoderConfig(openllm_core.LLMConfig): else: prompt_text = prompt # XXX: This value for pad_token_id is currently a hack, need more investigate why the default starcoder doesn't include the same value as santacoder EOD - return prompt_text, { - 'temperature': temperature, - 'top_p': top_p, - 'max_new_tokens': max_new_tokens, - 'repetition_penalty': repetition_penalty, - 'pad_token_id': 49152, - **attrs - }, {} + return prompt_text, {'temperature': temperature, 'top_p': top_p, 'max_new_tokens': max_new_tokens, 'repetition_penalty': repetition_penalty, 'pad_token_id': 49152, **attrs}, {} def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str: return generation_result[0] diff --git a/openllm-core/src/openllm_core/utils/__init__.py b/openllm-core/src/openllm_core/utils/__init__.py index efdfd08a..bdf4e20a 100644 --- a/openllm-core/src/openllm_core/utils/__init__.py +++ b/openllm-core/src/openllm_core/utils/__init__.py @@ -152,32 +152,24 @@ _LOGGING_CONFIG: dict[str, t.Any] = { 'filters': { 'excfilter': { '()': 'openllm_core.utils.ExceptionFilter' - }, - 'infofilter': { + }, 'infofilter': { '()': 'openllm_core.utils.InfoFilter' } }, 'handlers': { 'bentomlhandler': { - 'class': 'logging.StreamHandler', - 'filters': ['excfilter', 'infofilter'], - 'stream': 'ext://sys.stdout' + 'class': 'logging.StreamHandler', 'filters': ['excfilter', 'infofilter'], 'stream': 'ext://sys.stdout' }, 'defaulthandler': { - 'class': 'logging.StreamHandler', - 'level': logging.WARNING + 'class': 'logging.StreamHandler', 'level': logging.WARNING } }, 'loggers': { 'bentoml': { - 'handlers': 
['bentomlhandler', 'defaulthandler'], - 'level': logging.INFO, - 'propagate': False + 'handlers': ['bentomlhandler', 'defaulthandler'], 'level': logging.INFO, 'propagate': False }, 'openllm': { - 'handlers': ['bentomlhandler', 'defaulthandler'], - 'level': logging.INFO, - 'propagate': False + 'handlers': ['bentomlhandler', 'defaulthandler'], 'level': logging.INFO, 'propagate': False } }, 'root': { @@ -318,9 +310,7 @@ _whitelist_modules = {'pkg'} # XXX: define all classes, functions import above this line # since _extras will be the locals() import from this file. -_extras: dict[str, t.Any] = { - k: v for k, v in locals().items() if k in _whitelist_modules or (not isinstance(v, types.ModuleType) and not k.startswith('_')) -} +_extras: dict[str, t.Any] = {k: v for k, v in locals().items() if k in _whitelist_modules or (not isinstance(v, types.ModuleType) and not k.startswith('_'))} _extras['__openllm_migration__'] = {'ModelEnv': 'EnvVarMixin'} _import_structure: dict[str, list[str]] = { 'analytics': [], @@ -329,11 +319,32 @@ _import_structure: dict[str, list[str]] = { 'lazy': [], 'representation': ['ReprMixin'], 'import_utils': [ - 'OPTIONAL_DEPENDENCIES', 'DummyMetaclass', 'EnvVarMixin', 'require_backends', 'is_cpm_kernels_available', 'is_einops_available', - 'is_flax_available', 'is_tf_available', 'is_vllm_available', 'is_torch_available', 'is_bitsandbytes_available', 'is_peft_available', - 'is_datasets_available', 'is_transformers_supports_kbit', 'is_transformers_supports_agent', 'is_jupyter_available', 'is_jupytext_available', - 'is_notebook_available', 'is_triton_available', 'is_autogptq_available', 'is_sentencepiece_available', 'is_xformers_available', - 'is_fairscale_available', 'is_grpc_available', 'is_grpc_health_available', 'is_transformers_available' + 'OPTIONAL_DEPENDENCIES', + 'DummyMetaclass', + 'EnvVarMixin', + 'require_backends', + 'is_cpm_kernels_available', + 'is_einops_available', + 'is_flax_available', + 'is_tf_available', + 'is_vllm_available', + 'is_torch_available', + 'is_bitsandbytes_available', + 'is_peft_available', + 'is_datasets_available', + 'is_transformers_supports_kbit', + 'is_transformers_supports_agent', + 'is_jupyter_available', + 'is_jupytext_available', + 'is_notebook_available', + 'is_triton_available', + 'is_autogptq_available', + 'is_sentencepiece_available', + 'is_xformers_available', + 'is_fairscale_available', + 'is_grpc_available', + 'is_grpc_health_available', + 'is_transformers_available' ] } diff --git a/openllm-core/src/openllm_core/utils/codegen.py b/openllm-core/src/openllm_core/utils/codegen.py index b59ff697..83e7fb1d 100644 --- a/openllm-core/src/openllm_core/utils/codegen.py +++ b/openllm-core/src/openllm_core/utils/codegen.py @@ -130,28 +130,13 @@ def make_env_transformer(cls: type[openllm_core.LLMConfig], default_callback = identity if default_callback is None else default_callback globs = {} if globs is None else globs - globs.update({ - '__populate_env': dantic.env_converter, - '__default_callback': default_callback, - '__field_env': field_env_key, - '__suffix': suffix or '', - '__model_name': model_name, - }) + globs.update({'__populate_env': dantic.env_converter, '__default_callback': default_callback, '__field_env': field_env_key, '__suffix': suffix or '', '__model_name': model_name,}) lines: ListStr = [ '__env=lambda field_name:__field_env(field_name,__suffix)', "return 
[f.evolve(default=__populate_env(__default_callback(f.name,f.default),__env(f.name)),metadata={'env':f.metadata.get('env',__env(f.name)),'description':f.metadata.get('description', '(not provided)')}) for f in fields]" ] fields_ann = 'list[attr.Attribute[t.Any]]' - return generate_function(cls, - '__auto_env', - lines, - args=('_', 'fields'), - globs=globs, - annotations={ - '_': 'type[LLMConfig]', - 'fields': fields_ann, - 'return': fields_ann - }) + return generate_function(cls, '__auto_env', lines, args=('_', 'fields'), globs=globs, annotations={'_': 'type[LLMConfig]', 'fields': fields_ann, 'return': fields_ann}) def gen_sdk(func: _T, name: str | None = None, **attrs: t.Any) -> _T: '''Enhance sdk with nice repr that plays well with your brain.''' @@ -178,7 +163,8 @@ def gen_sdk(func: _T, name: str | None = None, **attrs: t.Any) -> _T: '__doc__': inspect.cleandoc(doc), '__module__': 'openllm' }), - )(func, **attrs), func, + )(func, **attrs), + func, )) __all__ = ['gen_sdk', 'make_attr_tuple_class', 'make_env_transformer', 'generate_unique_filename', 'generate_function'] diff --git a/openllm-core/src/openllm_core/utils/dantic.py b/openllm-core/src/openllm_core/utils/dantic.py index d77a2757..a6fae514 100644 --- a/openllm-core/src/openllm_core/utils/dantic.py +++ b/openllm-core/src/openllm_core/utils/dantic.py @@ -25,19 +25,29 @@ AnyCallable = t.Callable[..., t.Any] FC = t.TypeVar('FC', bound=t.Union[AnyCallable, click.Command]) __all__ = [ - 'FC', 'attrs_to_options', 'Field', 'parse_type', 'is_typing', 'is_literal', 'ModuleType', 'EnumChoice', 'LiteralChoice', 'allows_multiple', - 'is_mapping', 'is_container', 'parse_container_args', 'parse_single_arg', 'CUDA', 'JsonType', 'BytesType' + 'FC', + 'attrs_to_options', + 'Field', + 'parse_type', + 'is_typing', + 'is_literal', + 'ModuleType', + 'EnumChoice', + 'LiteralChoice', + 'allows_multiple', + 'is_mapping', + 'is_container', + 'parse_container_args', + 'parse_single_arg', + 'CUDA', + 'JsonType', + 'BytesType' ] def __dir__() -> list[str]: return sorted(__all__) -def attrs_to_options(name: str, - field: attr.Attribute[t.Any], - model_name: str, - typ: t.Any = None, - suffix_generation: bool = False, - suffix_sampling: bool = False) -> t.Callable[[FC], FC]: +def attrs_to_options(name: str, field: attr.Attribute[t.Any], model_name: str, typ: t.Any = None, suffix_generation: bool = False, suffix_sampling: bool = False) -> t.Callable[[FC], FC]: # TODO: support parsing nested attrs class and Union envvar = field.metadata['env'] dasherized = inflection.dasherize(name) diff --git a/openllm-core/src/openllm_core/utils/import_utils.py b/openllm-core/src/openllm_core/utils/import_utils.py index 2f42ca8f..fe5d44c6 100644 --- a/openllm-core/src/openllm_core/utils/import_utils.py +++ b/openllm-core/src/openllm_core/utils/import_utils.py @@ -142,8 +142,17 @@ def is_tf_available() -> bool: _tf_version = None if USE_TF in ENV_VARS_TRUE_AND_AUTO_VALUES and USE_TORCH not in ENV_VARS_TRUE_VALUES: if _tf_available: - candidates = ('tensorflow', 'tensorflow-cpu', 'tensorflow-gpu', 'tf-nightly', 'tf-nightly-cpu', 'tf-nightly-gpu', 'intel-tensorflow', - 'intel-tensorflow-avx512', 'tensorflow-rocm', 'tensorflow-macos', 'tensorflow-aarch64', + candidates = ('tensorflow', + 'tensorflow-cpu', + 'tensorflow-gpu', + 'tf-nightly', + 'tf-nightly-cpu', + 'tf-nightly-gpu', + 'intel-tensorflow', + 'intel-tensorflow-avx512', + 'tensorflow-rocm', + 'tensorflow-macos', + 'tensorflow-aarch64', ) _tf_version = None # For the metadata, we have to look for both tensorflow and 
tensorflow-cpu @@ -282,20 +291,13 @@ You can install it with pip: `pip install fairscale`. Please note that you may n your runtime after installation. ''' -BACKENDS_MAPPING: BackendOrderedDict = OrderedDict([('flax', (is_flax_available, FLAX_IMPORT_ERROR)), - ('tf', (is_tf_available, TENSORFLOW_IMPORT_ERROR)), - ('torch', (is_torch_available, PYTORCH_IMPORT_ERROR)), - ('vllm', (is_vllm_available, VLLM_IMPORT_ERROR)), - ('cpm_kernels', (is_cpm_kernels_available, CPM_KERNELS_IMPORT_ERROR)), - ('einops', (is_einops_available, EINOPS_IMPORT_ERROR)), - ('triton', (is_triton_available, TRITON_IMPORT_ERROR)), - ('datasets', (is_datasets_available, DATASETS_IMPORT_ERROR)), - ('peft', (is_peft_available, PEFT_IMPORT_ERROR)), - ('bitsandbytes', (is_bitsandbytes_available, BITSANDBYTES_IMPORT_ERROR)), - ('auto-gptq', (is_autogptq_available, AUTOGPTQ_IMPORT_ERROR)), - ('sentencepiece', (is_sentencepiece_available, SENTENCEPIECE_IMPORT_ERROR)), - ('xformers', (is_xformers_available, XFORMERS_IMPORT_ERROR)), - ('fairscale', (is_fairscale_available, FAIRSCALE_IMPORT_ERROR))]) +BACKENDS_MAPPING: BackendOrderedDict = OrderedDict([('flax', (is_flax_available, FLAX_IMPORT_ERROR)), ('tf', (is_tf_available, TENSORFLOW_IMPORT_ERROR)), + ('torch', (is_torch_available, PYTORCH_IMPORT_ERROR)), ('vllm', (is_vllm_available, VLLM_IMPORT_ERROR)), + ('cpm_kernels', (is_cpm_kernels_available, CPM_KERNELS_IMPORT_ERROR)), ('einops', (is_einops_available, EINOPS_IMPORT_ERROR)), + ('triton', (is_triton_available, TRITON_IMPORT_ERROR)), ('datasets', (is_datasets_available, DATASETS_IMPORT_ERROR)), + ('peft', (is_peft_available, PEFT_IMPORT_ERROR)), ('bitsandbytes', (is_bitsandbytes_available, BITSANDBYTES_IMPORT_ERROR)), + ('auto-gptq', (is_autogptq_available, AUTOGPTQ_IMPORT_ERROR)), ('sentencepiece', (is_sentencepiece_available, SENTENCEPIECE_IMPORT_ERROR)), + ('xformers', (is_xformers_available, XFORMERS_IMPORT_ERROR)), ('fairscale', (is_fairscale_available, FAIRSCALE_IMPORT_ERROR))]) class DummyMetaclass(abc.ABCMeta): '''Metaclass for dummy object. diff --git a/openllm-core/src/openllm_core/utils/lazy.py b/openllm-core/src/openllm_core/utils/lazy.py index b4b7131f..0ee26e0c 100644 --- a/openllm-core/src/openllm_core/utils/lazy.py +++ b/openllm-core/src/openllm_core/utils/lazy.py @@ -126,10 +126,9 @@ class LazyModule(types.ModuleType): } if name in dunder_to_metadata: if name not in {'__version_info__', '__copyright__', '__version__'}: - warnings.warn( - f"Accessing '{self._name}.{name}' is deprecated. Please consider using 'importlib.metadata' directly to query for openllm packaging metadata.", - DeprecationWarning, - stacklevel=2) + warnings.warn(f"Accessing '{self._name}.{name}' is deprecated. Please consider using 'importlib.metadata' directly to query for openllm packaging metadata.", + DeprecationWarning, + stacklevel=2) meta = importlib.metadata.metadata('openllm') project_url = dict(url.split(', ') for url in t.cast(t.List[str], meta.get_all('Project-URL'))) if name == '__license__': return 'Apache-2.0' @@ -146,9 +145,7 @@ class LazyModule(types.ModuleType): if '__openllm_migration__' in self._objects: cur_value = self._objects['__openllm_migration__'].get(name, _sentinel) if cur_value is not _sentinel: - warnings.warn(f"'{name}' is deprecated and will be removed in future version. Make sure to use '{cur_value}' instead", - DeprecationWarning, - stacklevel=3) + warnings.warn(f"'{name}' is deprecated and will be removed in future version. 
Make sure to use '{cur_value}' instead", DeprecationWarning, stacklevel=3) return getattr(self, cur_value) if name in self._objects: return self._objects.__getitem__(name) if name in self._modules: value = self._get_module(name) diff --git a/openllm-python/src/openllm/__init__.py b/openllm-python/src/openllm/__init__.py index 695ae271..019427d3 100644 --- a/openllm-python/src/openllm/__init__.py +++ b/openllm-python/src/openllm/__init__.py @@ -129,9 +129,7 @@ else: try: if not openllm_core.utils.is_vllm_available(): raise exceptions.MissingDependencyError except exceptions.MissingDependencyError: - _import_structure["utils.dummy_vllm_objects"] = [ - name for name in dir(utils.dummy_vllm_objects) if not name.startswith("_") and name not in ("annotations",) - ] + _import_structure["utils.dummy_vllm_objects"] = [name for name in dir(utils.dummy_vllm_objects) if not name.startswith("_") and name not in ("annotations",)] else: _import_structure["models.baichuan"].extend(["VLLMBaichuan"]) _import_structure["models.llama"].extend(["VLLMLlama"]) @@ -157,9 +155,7 @@ else: try: if not openllm_core.utils.is_flax_available(): raise exceptions.MissingDependencyError except exceptions.MissingDependencyError: - _import_structure["utils.dummy_flax_objects"] = [ - name for name in dir(utils.dummy_flax_objects) if not name.startswith("_") and name not in ("annotations",) - ] + _import_structure["utils.dummy_flax_objects"] = [name for name in dir(utils.dummy_flax_objects) if not name.startswith("_") and name not in ("annotations",)] else: _import_structure["models.flan_t5"].extend(["FlaxFlanT5"]) _import_structure["models.opt"].extend(["FlaxOPT"]) @@ -171,9 +167,7 @@ else: try: if not openllm_core.utils.is_tf_available(): raise exceptions.MissingDependencyError except exceptions.MissingDependencyError: - _import_structure["utils.dummy_tf_objects"] = [ - name for name in dir(utils.dummy_tf_objects) if not name.startswith("_") and name not in ("annotations",) - ] + _import_structure["utils.dummy_tf_objects"] = [name for name in dir(utils.dummy_tf_objects) if not name.startswith("_") and name not in ("annotations",)] else: _import_structure["models.flan_t5"].extend(["TFFlanT5"]) _import_structure["models.opt"].extend(["TFOPT"]) @@ -184,15 +178,7 @@ else: from .models.opt import TFOPT as TFOPT # NOTE: update this to sys.modules[__name__] once mypy_extensions can recognize __spec__ -__lazy = openllm_core.utils.LazyModule(__name__, - globals()["__file__"], - _import_structure, - extra_objects={ - "COMPILED": COMPILED, - "__openllm_migration__": { - "LLMEmbeddings": "EmbeddingsOutput" - } - }) +__lazy = openllm_core.utils.LazyModule(__name__, globals()["__file__"], _import_structure, extra_objects={"COMPILED": COMPILED, "__openllm_migration__": {"LLMEmbeddings": "EmbeddingsOutput"}}) __all__ = __lazy.__all__ __dir__ = __lazy.__dir__ __getattr__ = __lazy.__getattr__ diff --git a/openllm-python/src/openllm/_assign.py b/openllm-python/src/openllm/_assign.py index c10b5c73..195d2bb2 100644 --- a/openllm-python/src/openllm/_assign.py +++ b/openllm-python/src/openllm/_assign.py @@ -99,10 +99,7 @@ def make_llm_attributes(cls: type[LLM[M, T]]) -> t.Callable[[type[LLM[M, T]]], N # _cached_LLMFunction_get and _ccached_LLMSerialisation_get globs.update({f'_cached_{cl_.__name__}_get': _object_getattribute.__get__(cl_) for cl_ in {LLMSerialisation, LLMFunction}}) # llm_post_init implementation - lines: ListStr = [ - f'_impl_{cls.__name__}_func=cls.llm_post_init', - _setattr_class('llm_post_init', 
f'__wrapped_llm_post_init(_impl_{cls.__name__}_func)') - ] + lines: ListStr = [f'_impl_{cls.__name__}_func=cls.llm_post_init', _setattr_class('llm_post_init', f'__wrapped_llm_post_init(_impl_{cls.__name__}_func)')] serialisation_attr = {'import_model': import_model, 'load_model': load_model, 'load_tokenizer': load_tokenizer,} for func, impl in serialisation_attr.items(): @@ -114,10 +111,7 @@ def make_llm_attributes(cls: type[LLM[M, T]]) -> t.Callable[[type[LLM[M, T]]], N # assign vLLM implementation if cls.__llm_backend__ == 'vllm': - vllm_func = { - f'_vllm_{it}': fn - for it, fn in zip(('generate', 'generate_iterator', 'postprocess_generate'), (vllm_generate, vllm_generate_iterator, vllm_postprocess_generate)) - } + vllm_func = {f'_vllm_{it}': fn for it, fn in zip(('generate', 'generate_iterator', 'postprocess_generate'), (vllm_generate, vllm_generate_iterator, vllm_postprocess_generate))} globs.update(vllm_func) lines.extend([_setattr_class(it[6:], it) for it in vllm_func]) @@ -137,15 +131,7 @@ def make_llm_attributes(cls: type[LLM[M, T]]) -> t.Callable[[type[LLM[M, T]]], N bool_attr = {it[15:-2] for it in interface_anns if it.startswith('__llm_supports_')} lines.extend([_setattr_class(dunder_support(fn), f"cls.{fn} is not _cached_LLMFunction_get('{fn}')") for fn in bool_attr]) - return codegen.generate_function(cls, - '__assign_llm_attr', - lines, - args=('cls', *args), - globs=globs, - annotations={ - 'cls': 't.Type[LLM]', - 'return': None - }) + return codegen.generate_function(cls, '__assign_llm_attr', lines, args=('cls', *args), globs=globs, annotations={'cls': 't.Type[LLM]', 'return': None}) def vllm_postprocess_generate(self: LLM['vllm.LLMEngine', T], prompt: str, generation_result: list[dict[str, t.Any]], **_: t.Any) -> str: return generation_result[0]['outputs'][0]['text'] diff --git a/openllm-python/src/openllm/_embeddings.py b/openllm-python/src/openllm/_embeddings.py index 18392443..84be1890 100644 --- a/openllm-python/src/openllm/_embeddings.py +++ b/openllm-python/src/openllm/_embeddings.py @@ -25,8 +25,8 @@ def get_or_download(ids: str = _BENTOMODEL_ID) -> bentoml.Model: return bentoml.transformers.get(ids) except bentoml.exceptions.NotFound: model_signatures = { - k: ModelSignature(batchable=False) for k in ('forward', 'generate', 'contrastive_search', 'greedy_search', 'sample', 'beam_search', - 'beam_sample', 'group_beam_search', 'constrained_beam_search', '__call__') + k: ModelSignature(batchable=False) + for k in ('forward', 'generate', 'contrastive_search', 'greedy_search', 'sample', 'beam_search', 'beam_sample', 'group_beam_search', 'constrained_beam_search', '__call__') } with bentoml.models.create(ids, module=MODULE_NAME, @@ -34,8 +34,7 @@ def get_or_download(ids: str = _BENTOMODEL_ID) -> bentoml.Model: options=ModelOptions(), context=openllm.utils.generate_context(framework_name='transformers'), labels={ - 'runtime': 'pt', - 'framework': 'openllm' + 'runtime': 'pt', 'framework': 'openllm' }, signatures=model_signatures) as bentomodel: snapshot_download(_GENERIC_EMBEDDING_ID, diff --git a/openllm-python/src/openllm/_generation.py b/openllm-python/src/openllm/_generation.py index ee192bf7..8a78cab8 100644 --- a/openllm-python/src/openllm/_generation.py +++ b/openllm-python/src/openllm/_generation.py @@ -14,8 +14,7 @@ LogitsProcessorList = transformers.LogitsProcessorList StoppingCriteriaList = transformers.StoppingCriteriaList class StopSequenceCriteria(transformers.StoppingCriteria): - def __init__(self, stop_sequences: str | list[str], - tokenizer: 
transformers.PreTrainedTokenizer | transformers.PreTrainedTokenizerBase | transformers.PreTrainedTokenizerFast): + def __init__(self, stop_sequences: str | list[str], tokenizer: transformers.PreTrainedTokenizer | transformers.PreTrainedTokenizerBase | transformers.PreTrainedTokenizerFast): if isinstance(stop_sequences, str): stop_sequences = [stop_sequences] self.stop_sequences, self.tokenizer = stop_sequences, tokenizer diff --git a/openllm-python/src/openllm/_llm.py b/openllm-python/src/openllm/_llm.py index acb7cc9b..89160e3c 100644 --- a/openllm-python/src/openllm/_llm.py +++ b/openllm-python/src/openllm/_llm.py @@ -278,12 +278,20 @@ class LLM(LLMInterface[M, T], ReprMixin): if t.TYPE_CHECKING: __name__: str if t.TYPE_CHECKING and not MYPY: - def __attrs_init__(self, config: LLMConfig, quantization_config: t.Optional[t.Union[transformers.BitsAndBytesConfig, autogptq.BaseQuantizeConfig]], - model_id: str, model_decls: TupleAny, model_attrs: DictStrAny, tokenizer_attrs: DictStrAny, tag: bentoml.Tag, - adapters_mapping: t.Optional[AdaptersMapping], model_version: t.Optional[str], - quantize_method: t.Optional[t.Literal['int8', 'int4', - 'gptq']], serialisation_format: t.Literal['safetensors', - 'legacy'], _local: bool, **attrs: t.Any) -> None: + def __attrs_init__(self, + config: LLMConfig, + quantization_config: t.Optional[t.Union[transformers.BitsAndBytesConfig, autogptq.BaseQuantizeConfig]], + model_id: str, + model_decls: TupleAny, + model_attrs: DictStrAny, + tokenizer_attrs: DictStrAny, + tag: bentoml.Tag, + adapters_mapping: t.Optional[AdaptersMapping], + model_version: t.Optional[str], + quantize_method: t.Optional[t.Literal['int8', 'int4', 'gptq']], + serialisation_format: t.Literal['safetensors', 'legacy'], + _local: bool, + **attrs: t.Any) -> None: '''Generated __attrs_init__ for openllm.LLM.''' config: LLMConfig @@ -432,14 +440,11 @@ class LLM(LLMInterface[M, T], ReprMixin): _local = False _model_id: str = first_not_none(model_id, os.environ.get(cfg_cls.__openllm_env__['model_id']), default=cfg_cls.__openllm_default_id__) if validate_is_path(_model_id): _model_id, _local = resolve_filepath(_model_id), True - quantize = first_not_none(quantize, - t.cast(t.Optional[t.Literal['int8', 'int4', 'gptq']], os.environ.get(cfg_cls.__openllm_env__['quantize'])), - default=None) + quantize = first_not_none(quantize, t.cast(t.Optional[t.Literal['int8', 'int4', 'gptq']], os.environ.get(cfg_cls.__openllm_env__['quantize'])), default=None) # quantization setup if quantization_config and quantize: - raise ValueError( - "'quantization_config' and 'quantize' are mutually exclusive. Either customise your quantization_config or use the 'quantize' argument.") + raise ValueError("'quantization_config' and 'quantize' are mutually exclusive. 
Either customise your quantization_config or use the 'quantize' argument.") if quantization_config is None and quantize is not None: quantization_config, attrs = infer_quantisation_config(cls, quantize, **attrs) if quantize == 'gptq': serialisation = 'safetensors' @@ -465,9 +470,7 @@ class LLM(LLMInterface[M, T], ReprMixin): if _tag.version is None: raise ValueError(f'Failed to resolve the correct model version for {cfg_cls.__openllm_start_name__}') except Exception as err: - raise OpenLLMException( - f"Failed to generate a valid tag for {cfg_cls.__openllm_start_name__} with 'model_id={_model_id}' (lookup to see its traceback):\n{err}" - ) from err + raise OpenLLMException(f"Failed to generate a valid tag for {cfg_cls.__openllm_start_name__} with 'model_id={_model_id}' (lookup to see its traceback):\n{err}") from err return cls(*args, model_id=_model_id, @@ -518,9 +521,7 @@ class LLM(LLMInterface[M, T], ReprMixin): else: from .serialisation.transformers._helpers import process_config model_version = getattr( - process_config(model_id, - trust_remote_code=cls.config_class.__openllm_trust_remote_code__, - revision=first_not_none(model_version, default='main'))[0], '_commit_hash', None) + process_config(model_id, trust_remote_code=cls.config_class.__openllm_trust_remote_code__, revision=first_not_none(model_version, default='main'))[0], '_commit_hash', None) if model_version is None: raise ValueError(f"Internal errors when parsing config for pretrained '{model_id}' ('commit_hash' not found)") return f'{tag_name}:{model_version}' @@ -529,10 +530,18 @@ class LLM(LLMInterface[M, T], ReprMixin): def generate_tag(cls, *param_decls: t.Any, **attrs: t.Any) -> bentoml.Tag: return bentoml.Tag.from_taglike(cls._generate_tag_str(*param_decls, **attrs)) - def __init__(self, *args: t.Any, model_id: str, llm_config: LLMConfig, - quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None, _adapters_mapping: AdaptersMapping | None, - _tag: bentoml.Tag, _quantize_method: t.Literal['int8', 'int4', 'gptq'] | None, _model_version: str, - _serialisation_format: t.Literal['safetensors', 'legacy'], _local: bool, **attrs: t.Any, + def __init__(self, + *args: t.Any, + model_id: str, + llm_config: LLMConfig, + quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None, + _adapters_mapping: AdaptersMapping | None, + _tag: bentoml.Tag, + _quantize_method: t.Literal['int8', 'int4', 'gptq'] | None, + _model_version: str, + _serialisation_format: t.Literal['safetensors', 'legacy'], + _local: bool, + **attrs: t.Any, ): '''Initialize the LLM with given pretrained model. 
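The generate_tag classmethod touched earlier in this file's diff is only a thin wrapper: _generate_tag_str resolves a '{tag_name}:{model_version}' string (falling back to the pretrained config's '_commit_hash' when no explicit version is given), and bentoml.Tag.from_taglike turns it into a tag object. A minimal sketch of that last step, using an assumed tag string rather than a real model lookup:

    import bentoml

    # 'pt-llama:abc1234' is a made-up example value; in the code above the string
    # comes from cls._generate_tag_str(*param_decls, **attrs).
    tag = bentoml.Tag.from_taglike('pt-llama:abc1234')
    assert (tag.name, tag.version) == ('pt-llama', 'abc1234')
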
@@ -630,21 +639,27 @@ class LLM(LLMInterface[M, T], ReprMixin): # parsing tokenizer and model kwargs, as the hierachy is param pass > default normalized_model_kwds, normalized_tokenizer_kwds = normalize_attrs_to_model_tokenizer_pair(**attrs) # NOTE: Save the args and kwargs for latter load - self.__attrs_init__(llm_config, quantization_config, model_id, args, { - **model_kwds, - **normalized_model_kwds - }, { - **tokenizer_kwds, - **normalized_tokenizer_kwds - }, _tag, _adapters_mapping, _model_version, _quantize_method, _serialisation_format, _local) + self.__attrs_init__(llm_config, + quantization_config, + model_id, + args, { + **model_kwds, **normalized_model_kwds + }, { + **tokenizer_kwds, **normalized_tokenizer_kwds + }, + _tag, + _adapters_mapping, + _model_version, + _quantize_method, + _serialisation_format, + _local) self.llm_post_init() def __setattr__(self, attr: str, value: t.Any) -> None: if attr in _reserved_namespace: raise ForbiddenAttributeError( - f'{attr} should not be set during runtime as these value will be reflected during runtime. Instead, you can create a custom LLM subclass {self.__class__.__name__}.' - ) + f'{attr} should not be set during runtime as these value will be reflected during runtime. Instead, you can create a custom LLM subclass {self.__class__.__name__}.') super().__setattr__(attr, value) @property @@ -738,8 +753,7 @@ class LLM(LLMInterface[M, T], ReprMixin): model = self.load_model(*self._model_decls, **self._model_attrs) # If OOM, then it is probably you don't have enough VRAM to run this model. if self.__llm_backend__ == 'pt' and is_torch_available(): - loaded_in_kbit = getattr(model, 'is_loaded_in_8bit', False) or getattr(model, 'is_loaded_in_4bit', False) or getattr( - model, 'is_quantized', False) + loaded_in_kbit = getattr(model, 'is_loaded_in_8bit', False) or getattr(model, 'is_loaded_in_4bit', False) or getattr(model, 'is_quantized', False) if torch.cuda.is_available() and torch.cuda.device_count() == 1 and not loaded_in_kbit: try: model = model.to('cuda') @@ -777,34 +791,22 @@ class LLM(LLMInterface[M, T], ReprMixin): if name is None: _converted_first_none = True name = 'default' - peft_config = default_config.with_config( - **adapter.config).to_peft_config() if name == 'default' else FineTuneConfig(adapter_type=t.cast('PeftType', _adapter_type), - adapter_config=adapter.config, - inference_mode=inference_mode, - llm_config_class=self.config_class).to_peft_config() + peft_config = default_config.with_config(**adapter.config).to_peft_config() if name == 'default' else FineTuneConfig( + adapter_type=t.cast('PeftType', _adapter_type), adapter_config=adapter.config, inference_mode=inference_mode, llm_config_class=self.config_class).to_peft_config() adapter_map[_adapter_type][name] = (peft_config, adapter.adapter_id) if self.__llm_adapter_map__ is None and use_cache: self.__llm_adapter_map__ = adapter_map return adapter_map - def prepare_for_training(self, - adapter_type: AdapterType = 'lora', - use_gradient_checkpointing: bool = True, - **attrs: t.Any) -> tuple[peft.PeftModel, T]: + def prepare_for_training(self, adapter_type: AdapterType = 'lora', use_gradient_checkpointing: bool = True, **attrs: t.Any) -> tuple[peft.PeftModel, T]: from peft import prepare_model_for_kbit_training - peft_config = self.config['fine_tune_strategies'].get( - adapter_type, FineTuneConfig(adapter_type=t.cast('PeftType', adapter_type), - llm_config_class=self.config_class)).train().with_config(**attrs).to_peft_config() - wrapped_peft = peft.get_peft_model( - 
prepare_model_for_kbit_training( # type: ignore[no-untyped-call] - self.model, use_gradient_checkpointing=use_gradient_checkpointing), peft_config) + peft_config = self.config['fine_tune_strategies'].get(adapter_type, FineTuneConfig(adapter_type=t.cast('PeftType', adapter_type), + llm_config_class=self.config_class)).train().with_config(**attrs).to_peft_config() + wrapped_peft = peft.get_peft_model(prepare_model_for_kbit_training( # type: ignore[no-untyped-call] + self.model, use_gradient_checkpointing=use_gradient_checkpointing), peft_config) if DEBUG: wrapped_peft.print_trainable_parameters() return wrapped_peft, self.tokenizer - def apply_adapter(self, - inference_mode: bool = True, - adapter_type: AdapterType = 'lora', - load_adapters: t.Literal['all'] | list[str] | None = None, - use_cache: bool = True) -> M: + def apply_adapter(self, inference_mode: bool = True, adapter_type: AdapterType = 'lora', load_adapters: t.Literal['all'] | list[str] | None = None, use_cache: bool = True) -> M: '''Apply given LoRA mapping to the model. Note that the base model can still be accessed via self.model.get_base_model().''' if self.__llm_model__ is None: raise ValueError('Error: Model is not loaded correctly') # early out if _adapters_mapping is empty or it is already wrapped with peft. @@ -828,10 +830,7 @@ class LLM(LLMInterface[M, T], ReprMixin): adapters_to_load = adapter_mapping.keys() if load_adapters == 'all' else load_adapters for adapter_name in adapters_to_load: _peft_config, _peft_model_id = adapter_mapping[adapter_name] - t.cast(peft.PeftModel, self.__llm_model__).load_adapter(_peft_model_id, - adapter_name=adapter_name, - is_trainable=not inference_mode, - **dict(_peft_config.to_dict())) + t.cast(peft.PeftModel, self.__llm_model__).load_adapter(_peft_model_id, adapter_name=adapter_name, is_trainable=not inference_mode, **dict(_peft_config.to_dict())) return self.__llm_model__ @@ -848,8 +847,7 @@ class LLM(LLMInterface[M, T], ReprMixin): # the below shared similar logics with `get_peft_model` # TODO: Support PromptLearningConfig if default_config.task_type not in peft.MODEL_TYPE_TO_PEFT_MODEL_MAPPING.keys() and not isinstance(default_config, peft.PromptLearningConfig): - logger.debug("Given task type '%s' is not supported by peft. Make sure the adapter is loaded manually before running inference.", - default_config.task_type) + logger.debug("Given task type '%s' is not supported by peft. 
Make sure the adapter is loaded manually before running inference.", default_config.task_type) model = peft.PeftModel(self.__llm_model__, default_config) else: # XXX: this is not ideal to serialize like this, maybe for fine-tune we will only support 0.4.0 @@ -1041,42 +1039,21 @@ class LLM(LLMInterface[M, T], ReprMixin): # Prevent yielding partial stop sequence if not partially_stopped: - yield { - 'text': output, - 'usage': { - 'prompt_tokens': input_echo_len, - 'completion_tokens': i, - 'total_tokens': input_echo_len + i - }, - 'finish_reason': None - } + yield {'text': output, 'usage': {'prompt_tokens': input_echo_len, 'completion_tokens': i, 'total_tokens': input_echo_len + i}, 'finish_reason': None} if stopped: break # Finish stream event, which contains finish reason if i == self.config['max_new_tokens'] - 1: finish_reason = 'length' elif stopped: finish_reason = 'stop' else: finish_reason = None - yield { - 'text': output, - 'usage': { - 'prompt_tokens': input_echo_len, - 'completion_tokens': i, - 'total_tokens': input_echo_len + i - }, - 'finish_reason': finish_reason - } + yield {'text': output, 'usage': {'prompt_tokens': input_echo_len, 'completion_tokens': i, 'total_tokens': input_echo_len + i}, 'finish_reason': finish_reason} # Clean del past_key_values, out gc.collect() torch.cuda.empty_cache() @overload -def Runner(model_name: str, - *, - model_id: str | None = None, - model_version: str | None = ..., - init_local: t.Literal[False, True] = ..., - **attrs: t.Any) -> LLMRunner[t.Any, t.Any]: +def Runner(model_name: str, *, model_id: str | None = None, model_version: str | None = ..., init_local: t.Literal[False, True] = ..., **attrs: t.Any) -> LLMRunner[t.Any, t.Any]: ... @overload @@ -1158,10 +1135,7 @@ def Runner(model_name: str, 'serialisation': first_not_none(os.environ.get('OPENLLM_SERIALIZATION'), attrs.get('serialisation'), default='safetensors') }) - backend = t.cast( - LiteralBackend, - first_not_none(backend, - default=EnvVarMixin(model_name, backend=llm_config.default_backend() if llm_config is not None else 'pt')['backend_value'])) + backend = t.cast(LiteralBackend, first_not_none(backend, default=EnvVarMixin(model_name, backend=llm_config.default_backend() if llm_config is not None else 'pt')['backend_value'])) if init_local: ensure_available = True runner = infer_auto_class(backend).create_runner(model_name, llm_config=llm_config, ensure_available=ensure_available, **attrs) if init_local: runner.init_local(quiet=True) @@ -1174,8 +1148,7 @@ class SetAdapterOutput(t.TypedDict): success: bool message: str -def llm_runnable_class(self: LLM[M, T], embeddings_sig: ModelSignature, generate_sig: ModelSignature, - generate_iterator_sig: ModelSignature) -> type[LLMRunnable[M, T]]: +def llm_runnable_class(self: LLM[M, T], embeddings_sig: ModelSignature, generate_sig: ModelSignature, generate_iterator_sig: ModelSignature) -> type[LLMRunnable[M, T]]: class _Runnable(bentoml.Runnable): SUPPORTED_RESOURCES = ('nvidia.com/gpu', 'amd.com/gpu', 'cpu') SUPPORTS_CPU_MULTI_THREADING = True @@ -1234,7 +1207,8 @@ def llm_runnable_class(self: LLM[M, T], embeddings_sig: ModelSignature, generate return ' '.join(output_text) + ' ' return types.new_class( - self.__class__.__name__ + 'Runnable', (_Runnable,), {}, lambda ns: ns.update({ + self.__class__.__name__ + 'Runnable', (_Runnable,), {}, + lambda ns: ns.update({ 'SUPPORTED_RESOURCES': ('nvidia.com/gpu', 'amd.com/gpu') if self.config['requires_gpu'] else ('nvidia.com/gpu', 'amd.com/gpu', 'cpu'), '__module__': self.__module__, '__doc__': 
self.config['env'].start_docstring @@ -1281,12 +1255,7 @@ def llm_runner_class(self: LLM[M, T]) -> type[LLMRunner[M, T]]: return {'config', 'llm_type', 'runner_methods', 'backend', 'llm_tag'} def _wrapped_repr_args(__self: LLMRunner[M, T]) -> ReprArgs: - yield 'runner_methods', { - method.name: { - 'batchable': method.config.batchable, - 'batch_dim': method.config.batch_dim if method.config.batchable else None - } for method in __self.runner_methods - } + yield 'runner_methods', {method.name: {'batchable': method.config.batchable, 'batch_dim': method.config.batch_dim if method.config.batchable else None} for method in __self.runner_methods} yield 'config', self.config.model_dump(flatten=True) yield 'llm_type', __self.llm_type yield 'backend', self.__llm_backend__ diff --git a/openllm-python/src/openllm/_quantisation.py b/openllm-python/src/openllm/_quantisation.py index 2eade60f..83258aac 100644 --- a/openllm-python/src/openllm/_quantisation.py +++ b/openllm-python/src/openllm/_quantisation.py @@ -15,25 +15,21 @@ if t.TYPE_CHECKING: from ._llm import LLM -autogptq, torch, transformers = LazyLoader('autogptq', globals(), - 'auto_gptq'), LazyLoader('torch', globals(), 'torch'), LazyLoader('transformers', globals(), 'transformers') +autogptq, torch, transformers = LazyLoader('autogptq', globals(), 'auto_gptq'), LazyLoader('torch', globals(), 'torch'), LazyLoader('transformers', globals(), 'transformers') logger = logging.getLogger(__name__) QuantiseMode = t.Literal['int8', 'int4', 'gptq'] @overload -def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: t.Literal['int8', 'int4'], - **attrs: t.Any) -> tuple[transformers.BitsAndBytesConfig, DictStrAny]: +def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: t.Literal['int8', 'int4'], **attrs: t.Any) -> tuple[transformers.BitsAndBytesConfig, DictStrAny]: ... @overload -def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: t.Literal['gptq'], - **attrs: t.Any) -> tuple[autogptq.BaseQuantizeConfig, DictStrAny]: +def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: t.Literal['gptq'], **attrs: t.Any) -> tuple[autogptq.BaseQuantizeConfig, DictStrAny]: ... 
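The two overloads above spell out the contract of infer_quantisation_config: 'int8' and 'int4' resolve to a transformers.BitsAndBytesConfig, while 'gptq' resolves to an autogptq.BaseQuantizeConfig. A rough sketch of what the int8 path is expected to assemble (the branch itself is not shown in this hunk, so the values below are assumptions mirroring the llm_int8_* options popped from **attrs in the implementation that follows):

    from transformers import BitsAndBytesConfig

    # Assumed illustration of the 'int8' case only; defaults are taken from the
    # attrs.pop(...) fallbacks visible in the surrounding code.
    int8_config = BitsAndBytesConfig(load_in_8bit=True, llm_int8_threshold=6.0, llm_int8_enable_fp32_cpu_offload=False)
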
-def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: QuantiseMode, - **attrs: t.Any) -> tuple[transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig, DictStrAny]: +def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: QuantiseMode, **attrs: t.Any) -> tuple[transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig, DictStrAny]: # 8 bit configuration int8_threshold = attrs.pop('llm_int8_threshhold', 6.0) int8_enable_fp32_cpu_offload = attrs.pop('llm_int8_enable_fp32_cpu_offload', False) diff --git a/openllm-python/src/openllm/_service.py b/openllm-python/src/openllm/_service.py index 69c5bcfe..70fd1608 100644 --- a/openllm-python/src/openllm/_service.py +++ b/openllm-python/src/openllm/_service.py @@ -21,8 +21,7 @@ if t.TYPE_CHECKING: from bentoml._internal.runner.runner import AbstractRunner from bentoml._internal.runner.runner import RunnerMethod from openllm_core._typing_compat import TypeAlias - _EmbeddingMethod: TypeAlias = RunnerMethod[t.Union[bentoml.Runnable, openllm.LLMRunnable[t.Any, t.Any]], [t.List[str]], - t.Sequence[openllm.EmbeddingsOutput]] + _EmbeddingMethod: TypeAlias = RunnerMethod[t.Union[bentoml.Runnable, openllm.LLMRunnable[t.Any, t.Any]], [t.List[str]], t.Sequence[openllm.EmbeddingsOutput]] # The following warnings from bitsandbytes, and probably not that important for users to see warnings.filterwarnings('ignore', message='MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization') @@ -44,12 +43,7 @@ svc = bentoml.Service(name=f"llm-{llm_config['start_name']}-service", runners=ru _JsonInput = bentoml.io.JSON.from_sample({'prompt': '', 'llm_config': llm_config.model_dump(flatten=True), 'adapter_name': None}) -@svc.api(route='/v1/generate', - input=_JsonInput, - output=bentoml.io.JSON.from_sample({ - 'responses': [], - 'configuration': llm_config.model_dump(flatten=True) - })) +@svc.api(route='/v1/generate', input=_JsonInput, output=bentoml.io.JSON.from_sample({'responses': [], 'configuration': llm_config.model_dump(flatten=True)})) async def generate_v1(input_dict: dict[str, t.Any]) -> openllm.GenerationOutput: qa_inputs = openllm.GenerationInput.from_llm_config(llm_config)(**input_dict) config = qa_inputs.llm_config.model_dump() @@ -86,11 +80,32 @@ def metadata_v1(_: str) -> openllm.MetadataOutput: input=bentoml.io.JSON.from_sample(['Hey Jude, welcome to the jungle!', 'What is the meaning of life?']), output=bentoml.io.JSON.from_sample({ 'embeddings': [ - 0.007917795330286026, -0.014421648345887661, 0.00481307040899992, 0.007331526838243008, -0.0066398633643984795, 0.00945580005645752, - 0.0087016262114048, -0.010709521360695362, 0.012635177001357079, 0.010541186667978764, -0.00730888033285737, -0.001783102168701589, - 0.02339819073677063, -0.010825827717781067, -0.015888236463069916, 0.01876218430697918, 0.0076906150206923485, 0.0009032754460349679, - -0.010024012066423893, 0.01090280432254076, -0.008668390102684498, 0.02070549875497818, 0.0014594447566196322, -0.018775740638375282, - -0.014814382418990135, 0.01796768605709076 + 0.007917795330286026, + -0.014421648345887661, + 0.00481307040899992, + 0.007331526838243008, + -0.0066398633643984795, + 0.00945580005645752, + 0.0087016262114048, + -0.010709521360695362, + 0.012635177001357079, + 0.010541186667978764, + -0.00730888033285737, + -0.001783102168701589, + 0.02339819073677063, + -0.010825827717781067, + -0.015888236463069916, + 0.01876218430697918, + 0.0076906150206923485, + 0.0009032754460349679, + -0.010024012066423893, + 
0.01090280432254076, + -0.008668390102684498, + 0.02070549875497818, + 0.0014594447566196322, + -0.018775740638375282, + -0.014814382418990135, + 0.01796768605709076 ], 'num_tokens': 20 })) diff --git a/openllm-python/src/openllm/bundle/_package.py b/openllm-python/src/openllm/bundle/_package.py index 2e2b763b..5cb23f20 100644 --- a/openllm-python/src/openllm/bundle/_package.py +++ b/openllm-python/src/openllm/bundle/_package.py @@ -63,11 +63,7 @@ def build_editable(path: str, package: t.Literal['openllm', 'openllm_core', 'ope return builder.build('wheel', path, config_settings={'--global-option': '--quiet'}) raise RuntimeError('Custom OpenLLM build is currently not supported. Please install OpenLLM from PyPI or built it from Git source.') -def construct_python_options(llm: openllm.LLM[t.Any, t.Any], - llm_fs: FS, - extra_dependencies: tuple[str, ...] | None = None, - adapter_map: dict[str, str | None] | None = None, - ) -> PythonOptions: +def construct_python_options(llm: openllm.LLM[t.Any, t.Any], llm_fs: FS, extra_dependencies: tuple[str, ...] | None = None, adapter_map: dict[str, str | None] | None = None,) -> PythonOptions: packages = ['openllm', 'scipy'] # apparently bnb misses this one if adapter_map is not None: packages += ['openllm[fine-tune]'] # NOTE: add openllm to the default dependencies @@ -90,8 +86,16 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any], elif backend_envvar == 'tf': if not openllm_core.utils.is_tf_available(): raise ValueError(f"TensorFlow is not available, while {env.backend} is set to 'tf'") - candidates = ('tensorflow', 'tensorflow-cpu', 'tensorflow-gpu', 'tf-nightly', 'tf-nightly-cpu', 'tf-nightly-gpu', 'intel-tensorflow', - 'intel-tensorflow-avx512', 'tensorflow-rocm', 'tensorflow-macos', + candidates = ('tensorflow', + 'tensorflow-cpu', + 'tensorflow-gpu', + 'tf-nightly', + 'tf-nightly-cpu', + 'tf-nightly-gpu', + 'intel-tensorflow', + 'intel-tensorflow-avx512', + 'tensorflow-rocm', + 'tensorflow-macos', ) # For the metadata, we have to look for both tensorflow and tensorflow-cpu for candidate in candidates: @@ -109,10 +113,8 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any], raise ValueError('PyTorch is not available. 
Make sure to have it locally installed.') packages.extend([f'torch>={importlib.metadata.version("torch")}']) wheels: list[str] = [] - built_wheels: list[str | None] = [ - build_editable(llm_fs.getsyspath('/'), t.cast(t.Literal['openllm', 'openllm_core', 'openllm_client'], p)) - for p in ('openllm_core', 'openllm_client', 'openllm') - ] + built_wheels: list[str | + None] = [build_editable(llm_fs.getsyspath('/'), t.cast(t.Literal['openllm', 'openllm_core', 'openllm_client'], p)) for p in ('openllm_core', 'openllm_client', 'openllm')] if all(i for i in built_wheels): wheels.extend([llm_fs.getsyspath(f"/{i.split('/')[-1]}") for i in t.cast(t.List[str], built_wheels)]) return PythonOptions(packages=packages, @@ -120,9 +122,14 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any], lock_packages=False, extra_index_url=['https://download.pytorch.org/whl/cu118', 'https://huggingface.github.io/autogptq-index/whl/cu118/']) -def construct_docker_options(llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_resource: float, quantize: LiteralString | None, - adapter_map: dict[str, str | None] | None, dockerfile_template: str | None, - serialisation_format: t.Literal['safetensors', 'legacy'], container_registry: LiteralContainerRegistry, +def construct_docker_options(llm: openllm.LLM[t.Any, t.Any], + _: FS, + workers_per_resource: float, + quantize: LiteralString | None, + adapter_map: dict[str, str | None] | None, + dockerfile_template: str | None, + serialisation_format: t.Literal['safetensors', 'legacy'], + container_registry: LiteralContainerRegistry, container_version_strategy: LiteralContainerVersionStrategy) -> DockerOptions: from openllm.cli._factory import parse_config_options environ = parse_config_options(llm.config, llm.config['timeout'], workers_per_resource, None, True, os.environ.copy()) @@ -145,9 +152,7 @@ def construct_docker_options(llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_ _env = openllm_core.utils.EnvVarMixin(llm.config['model_name'], quantize=quantize) if _env['quantize_value'] is not None: env_dict[_env.quantize] = t.cast(str, _env['quantize_value']) - return DockerOptions(base_image=f'{oci.CONTAINER_NAMES[container_registry]}:{oci.get_base_container_tag(container_version_strategy)}', - env=env_dict, - dockerfile_template=dockerfile_template) + return DockerOptions(base_image=f'{oci.CONTAINER_NAMES[container_registry]}:{oci.get_base_container_tag(container_version_strategy)}', env=env_dict, dockerfile_template=dockerfile_template) OPENLLM_MODEL_NAME = '# openllm: model name' OPENLLM_MODEL_ADAPTER_MAP = '# openllm: model adapter map' @@ -188,8 +193,7 @@ def write_service(llm: openllm.LLM[t.Any, t.Any], adapter_map: dict[str, str | N if OPENLLM_MODEL_NAME in it: src_contents[src_contents.index(it)] = (ModelNameFormatter(model_name).vformat(it)[:-(len(OPENLLM_MODEL_NAME) + 3)] + '\n') elif OPENLLM_MODEL_ADAPTER_MAP in it: - src_contents[src_contents.index(it)] = ( - ModelAdapterMapFormatter(orjson.dumps(adapter_map).decode()).vformat(it)[:-(len(OPENLLM_MODEL_ADAPTER_MAP) + 3)] + '\n') + src_contents[src_contents.index(it)] = (ModelAdapterMapFormatter(orjson.dumps(adapter_map).decode()).vformat(it)[:-(len(OPENLLM_MODEL_ADAPTER_MAP) + 3)] + '\n') script = f"# GENERATED BY 'openllm build {model_name}'. 
DO NOT EDIT\n\n" + ''.join(src_contents) if DEBUG: logger.info('Generated script:\n%s', script) llm_fs.writetext(llm.config['service_name'], script) @@ -210,13 +214,7 @@ def create_bento(bento_tag: bentoml.Tag, _model_store: ModelStore = Provide[BentoMLContainer.model_store]) -> bentoml.Bento: backend_envvar = llm.config['env']['backend_value'] labels = dict(llm.identifying_params) - labels.update({ - '_type': llm.llm_type, - '_framework': backend_envvar, - 'start_name': llm.config['start_name'], - 'base_name_or_path': llm.model_id, - 'bundler': 'openllm.bundle' - }) + labels.update({'_type': llm.llm_type, '_framework': backend_envvar, 'start_name': llm.config['start_name'], 'base_name_or_path': llm.model_id, 'bundler': 'openllm.bundle'}) if adapter_map: labels.update(adapter_map) if isinstance(workers_per_resource, str): if workers_per_resource == 'round_robin': workers_per_resource = 1.0 @@ -242,8 +240,15 @@ def create_bento(bento_tag: bentoml.Tag, exclude=['/venv', '/.venv', '__pycache__/', '*.py[cod]', '*$py.class'], python=construct_python_options(llm, llm_fs, extra_dependencies, adapter_map), models=[llm_spec], - docker=construct_docker_options(llm, llm_fs, workers_per_resource, quantize, adapter_map, dockerfile_template, - serialisation_format, container_registry, container_version_strategy)) + docker=construct_docker_options(llm, + llm_fs, + workers_per_resource, + quantize, + adapter_map, + dockerfile_template, + serialisation_format, + container_registry, + container_version_strategy)) bento = bentoml.Bento.create(build_config=build_config, version=bento_tag.version, build_ctx=llm_fs.getsyspath('/')) # NOTE: the model_id_path here are only used for setting this environment variable within the container built with for BentoLLM. diff --git a/openllm-python/src/openllm/bundle/oci/__init__.py b/openllm-python/src/openllm/bundle/oci/__init__.py index 33d94740..28c2b490 100644 --- a/openllm-python/src/openllm/bundle/oci/__init__.py +++ b/openllm-python/src/openllm/bundle/oci/__init__.py @@ -42,11 +42,7 @@ ROOT_DIR = pathlib.Path(os.path.abspath('__file__')).parent.parent.parent # but in the future, we can infer based on git repo and everything to make it more options for users # to build the base image. For now, all of the base image will be /bentoml/openllm:... # NOTE: The ECR registry is the public one and currently only @bentoml team has access to push it. -_CONTAINER_REGISTRY: dict[LiteralContainerRegistry, str] = { - 'docker': 'docker.io/bentoml/openllm', - 'gh': 'ghcr.io/bentoml/openllm', - 'ecr': 'public.ecr.aws/y5w8i4y6/bentoml/openllm' -} +_CONTAINER_REGISTRY: dict[LiteralContainerRegistry, str] = {'docker': 'docker.io/bentoml/openllm', 'gh': 'ghcr.io/bentoml/openllm', 'ecr': 'public.ecr.aws/y5w8i4y6/bentoml/openllm'} # TODO: support custom fork. Currently it only support openllm main. 
_OWNER = 'bentoml' @@ -82,9 +78,7 @@ def nightly_resolver(cls: type[RefResolver]) -> str: commits = t.cast('list[dict[str, t.Any]]', cls._ghapi.repos.list_commits(since=_commit_time_range())) return next(f'sha-{it["sha"][:7]}' for it in commits if '[skip ci]' not in it['commit']['message']) # now is the correct behaviour - return orjson.loads( - subprocess.check_output([docker_bin, 'run', '--rm', '-it', 'quay.io/skopeo/stable:latest', 'list-tags', - 'docker://ghcr.io/bentoml/openllm']).decode().strip())['Tags'][-2] + return orjson.loads(subprocess.check_output([docker_bin, 'run', '--rm', '-it', 'quay.io/skopeo/stable:latest', 'list-tags', 'docker://ghcr.io/bentoml/openllm']).decode().strip())['Tags'][-2] @attr.attrs(eq=False, order=False, slots=True, frozen=True) class RefResolver: @@ -142,9 +136,7 @@ def build_container(registries: LiteralContainerRegistry | t.Sequence[LiteralCon try: if not _BUILDER.health(): raise openllm.exceptions.Error except (openllm.exceptions.Error, subprocess.CalledProcessError): - raise RuntimeError( - 'Building base container requires BuildKit (via Buildx) to be installed. See https://docs.docker.com/build/buildx/install/ for instalation instruction.' - ) from None + raise RuntimeError('Building base container requires BuildKit (via Buildx) to be installed. See https://docs.docker.com/build/buildx/install/ for instalation instruction.') from None if openllm_core.utils.device_count() == 0: raise RuntimeError('Building base container requires GPUs (None available)') if not shutil.which('nvidia-container-runtime'): @@ -153,8 +145,7 @@ def build_container(registries: LiteralContainerRegistry | t.Sequence[LiteralCon raise RuntimeError("Failed to determine source location of 'openllm'. (Possible broken installation)") pyproject_path = pathlib.Path(_module_location).parent.parent / 'pyproject.toml' if not pyproject_path.exists(): - raise ValueError( - "This utility can only be run within OpenLLM git repository. Clone it first with 'git clone https://github.com/bentoml/OpenLLM.git'") + raise ValueError("This utility can only be run within OpenLLM git repository. 
Clone it first with 'git clone https://github.com/bentoml/OpenLLM.git'") if not registries: tags: dict[str | LiteralContainerRegistry, str] = { alias: f'{value}:{get_base_container_tag(version_strategy)}' for alias, value in _CONTAINER_REGISTRY.items() @@ -171,8 +162,7 @@ def build_container(registries: LiteralContainerRegistry | t.Sequence[LiteralCon quiet=machine) if machine and outputs is not None: tags['image_sha'] = outputs.decode('utf-8').strip() except Exception as err: - raise openllm.exceptions.OpenLLMException( - f'Failed to containerize base container images (Scroll up to see error above, or set OPENLLMDEVDEBUG=True for more traceback):\n{err}') from err + raise openllm.exceptions.OpenLLMException(f'Failed to containerize base container images (Scroll up to see error above, or set OPENLLMDEVDEBUG=True for more traceback):\n{err}') from err return tags if t.TYPE_CHECKING: diff --git a/openllm-python/src/openllm/cli/_factory.py b/openllm-python/src/openllm/cli/_factory.py index 5f67aa3b..b1d1b5af 100644 --- a/openllm-python/src/openllm/cli/_factory.py +++ b/openllm-python/src/openllm/cli/_factory.py @@ -43,35 +43,29 @@ _AnyCallable = t.Callable[..., t.Any] FC = t.TypeVar('FC', bound=t.Union[_AnyCallable, click.Command]) def bento_complete_envvar(ctx: click.Context, param: click.Parameter, incomplete: str) -> list[sc.CompletionItem]: - return [ - sc.CompletionItem(str(it.tag), help='Bento') - for it in bentoml.list() - if str(it.tag).startswith(incomplete) and all(k in it.info.labels for k in {'start_name', 'bundler'}) - ] + return [sc.CompletionItem(str(it.tag), help='Bento') for it in bentoml.list() if str(it.tag).startswith(incomplete) and all(k in it.info.labels for k in {'start_name', 'bundler'})] def model_complete_envvar(ctx: click.Context, param: click.Parameter, incomplete: str) -> list[sc.CompletionItem]: return [sc.CompletionItem(inflection.dasherize(it), help='Model') for it in openllm.CONFIG_MAPPING if it.startswith(incomplete)] -def parse_config_options(config: LLMConfig, server_timeout: int, workers_per_resource: float, device: t.Tuple[str, ...] | None, cors: bool, - environ: DictStrAny) -> DictStrAny: +def parse_config_options(config: LLMConfig, server_timeout: int, workers_per_resource: float, device: t.Tuple[str, ...] 
| None, cors: bool, environ: DictStrAny) -> DictStrAny: # TODO: Support amd.com/gpu on k8s _bentoml_config_options_env = environ.pop('BENTOML_CONFIG_OPTIONS', '') _bentoml_config_options_opts = [ - 'tracing.sample_rate=1.0', f'api_server.traffic.timeout={server_timeout}', + 'tracing.sample_rate=1.0', + f'api_server.traffic.timeout={server_timeout}', f'runners."llm-{config["start_name"]}-runner".traffic.timeout={config["timeout"]}', f'runners."llm-{config["start_name"]}-runner".workers_per_resource={workers_per_resource}' ] if device: if len(device) > 1: - _bentoml_config_options_opts.extend( - [f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"[{idx}]={dev}' for idx, dev in enumerate(device)]) + _bentoml_config_options_opts.extend([f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"[{idx}]={dev}' for idx, dev in enumerate(device)]) else: _bentoml_config_options_opts.append(f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"=[{device[0]}]') _bentoml_config_options_opts.append(f'runners."llm-generic-embedding".resources.cpu={openllm.get_resource({"cpu":"system"},"cpu")}') if cors: _bentoml_config_options_opts.extend(['api_server.http.cors.enabled=true', 'api_server.http.cors.access_control_allow_origins="*"']) - _bentoml_config_options_opts.extend( - [f'api_server.http.cors.access_control_allow_methods[{idx}]="{it}"' for idx, it in enumerate(['GET', 'OPTIONS', 'POST', 'HEAD', 'PUT'])]) + _bentoml_config_options_opts.extend([f'api_server.http.cors.access_control_allow_methods[{idx}]="{it}"' for idx, it in enumerate(['GET', 'OPTIONS', 'POST', 'HEAD', 'PUT'])]) _bentoml_config_options_env += ' ' if _bentoml_config_options_env else '' + ' '.join(_bentoml_config_options_opts) environ['BENTOML_CONFIG_OPTIONS'] = _bentoml_config_options_env if DEBUG: logger.debug('Setting BENTOML_CONFIG_OPTIONS=%s', _bentoml_config_options_env) @@ -123,18 +117,27 @@ Available official model_id(s): [default: {llm_config['default_id']}] if llm_config['requires_gpu'] and openllm.utils.device_count() < 1: # NOTE: The model requires GPU, therefore we will return a dummy command command_attrs.update({ - 'short_help': '(Disabled because there is no GPU available)', - 'help': f'{model} is currently not available to run on your local machine because it requires GPU for inference.' + 'short_help': '(Disabled because there is no GPU available)', 'help': f'{model} is currently not available to run on your local machine because it requires GPU for inference.' 
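The reflowed parse_config_options above is dense, so here is a rough, dependency-free sketch of what it assembles. Names such as render_config_options are illustrative, the LLMConfig-driven runner timeout is omitted, and only the GPU and CORS branches shown in the hunk are reproduced.

import typing as t

def render_config_options(start_name: str, server_timeout: int, workers_per_resource: float, device: t.Tuple[str, ...], cors: bool, environ: t.Dict[str, str]) -> t.Dict[str, str]:
  # Collect dotted BentoML config overrides, then join them into one space-separated env var.
  opts = [
    'tracing.sample_rate=1.0',
    f'api_server.traffic.timeout={server_timeout}',
    f'runners."llm-{start_name}-runner".workers_per_resource={workers_per_resource}',
  ]
  if device:
    if len(device) > 1:
      opts.extend(f'runners."llm-{start_name}-runner".resources."nvidia.com/gpu"[{idx}]={dev}' for idx, dev in enumerate(device))
    else:
      opts.append(f'runners."llm-{start_name}-runner".resources."nvidia.com/gpu"=[{device[0]}]')
  if cors:
    opts.extend(['api_server.http.cors.enabled=true', 'api_server.http.cors.access_control_allow_origins="*"'])
    opts.extend(f'api_server.http.cors.access_control_allow_methods[{idx}]="{it}"' for idx, it in enumerate(['GET', 'OPTIONS', 'POST', 'HEAD', 'PUT']))
  existing = environ.pop('BENTOML_CONFIG_OPTIONS', '')
  environ['BENTOML_CONFIG_OPTIONS'] = (existing + ' ' if existing else '') + ' '.join(opts)
  return environ

print(render_config_options('opt', 3600, 0.5, ('0', '1'), True, {})['BENTOML_CONFIG_OPTIONS'])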
}) return noop_command(group, llm_config, _serve_grpc, **command_attrs) @group.command(**command_attrs) @start_decorator(llm_config, serve_grpc=_serve_grpc) @click.pass_context - def start_cmd(ctx: click.Context, /, server_timeout: int, model_id: str | None, model_version: str | None, - workers_per_resource: t.Literal['conserved', 'round_robin'] | LiteralString, device: t.Tuple[str, ...], - quantize: t.Literal['int8', 'int4', 'gptq'] | None, backend: LiteralBackend, serialisation_format: t.Literal['safetensors', 'legacy'], - cors: bool, adapter_id: str | None, return_process: bool, **attrs: t.Any, + def start_cmd(ctx: click.Context, + /, + server_timeout: int, + model_id: str | None, + model_version: str | None, + workers_per_resource: t.Literal['conserved', 'round_robin'] | LiteralString, + device: t.Tuple[str, ...], + quantize: t.Literal['int8', 'int4', 'gptq'] | None, + backend: LiteralBackend, + serialisation_format: t.Literal['safetensors', 'legacy'], + cors: bool, + adapter_id: str | None, + return_process: bool, + **attrs: t.Any, ) -> LLMConfig | subprocess.Popen[bytes]: if serialisation_format == 'safetensors' and quantize is not None and openllm_core.utils.check_bool_env('OPENLLM_SERIALIZATION_WARNING'): termui.echo( @@ -202,8 +205,7 @@ Available official model_id(s): [default: {llm_config['default_id']}] def next_step(model_name: str, adapter_map: DictStrAny | None) -> None: cmd_name = f'openllm build {model_name}' if adapter_map is not None: - cmd_name += ' ' + ' '.join( - [f'--adapter-id {s}' for s in [f'{p}:{name}' if name not in (None, 'default') else p for p, name in adapter_map.items()]]) + cmd_name += ' ' + ' '.join([f'--adapter-id {s}' for s in [f'{p}:{name}' if name not in (None, 'default') else p for p, name in adapter_map.items()]]) if not openllm.utils.get_quiet_mode(): termui.echo(f"\n🚀 Next step: run '{cmd_name}' to create a Bento for {model_name}", fg='blue') @@ -242,11 +244,15 @@ def noop_command(group: click.Group, llm_config: LLMConfig, _serve_grpc: bool, * def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callable[[FC], t.Callable[[FC], FC]]: def wrapper(fn: FC) -> t.Callable[[FC], FC]: composed = openllm.utils.compose( - llm_config.to_click_options, _http_server_args if not serve_grpc else _grpc_server_args, + llm_config.to_click_options, + _http_server_args if not serve_grpc else _grpc_server_args, cog.optgroup.group('General LLM Options', help=f"The following options are related to running '{llm_config['start_name']}' LLM Server."), - model_id_option(factory=cog.optgroup), model_version_option(factory=cog.optgroup), + model_id_option(factory=cog.optgroup), + model_version_option(factory=cog.optgroup), cog.optgroup.option('--server-timeout', type=int, default=None, help='Server timeout in seconds'), - workers_per_resource_option(factory=cog.optgroup), cors_option(factory=cog.optgroup), backend_option(factory=cog.optgroup), + workers_per_resource_option(factory=cog.optgroup), + cors_option(factory=cog.optgroup), + backend_option(factory=cog.optgroup), cog.optgroup.group('LLM Optimization Options', help='''Optimization related options. 
@@ -257,7 +263,9 @@ def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callab - DeepSpeed Inference: [link](https://www.deepspeed.ai/inference/) - GGML: Fast inference on [bare metal](https://github.com/ggerganov/ggml) ''', - ), quantize_option(factory=cog.optgroup), serialisation_option(factory=cog.optgroup), + ), + quantize_option(factory=cog.optgroup), + serialisation_option(factory=cog.optgroup), cog.optgroup.option('--device', type=openllm.utils.dantic.CUDA, multiple=True, @@ -375,32 +383,16 @@ def output_option(f: _AnyCallable | None = None, *, default_value: LiteralOutput **attrs)(f) def cors_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]: - return cli_option('--cors/--no-cors', - show_default=True, - default=False, - envvar='OPENLLM_CORS', - show_envvar=True, - help='Enable CORS for the server.', - **attrs)(f) + return cli_option('--cors/--no-cors', show_default=True, default=False, envvar='OPENLLM_CORS', show_envvar=True, help='Enable CORS for the server.', **attrs)(f) def machine_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]: return cli_option('--machine', is_flag=True, default=False, hidden=True, **attrs)(f) def model_id_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]: - return cli_option('--model-id', - type=click.STRING, - default=None, - envvar='OPENLLM_MODEL_ID', - show_envvar=True, - help='Optional model_id name or path for (fine-tune) weight.', - **attrs)(f) + return cli_option('--model-id', type=click.STRING, default=None, envvar='OPENLLM_MODEL_ID', show_envvar=True, help='Optional model_id name or path for (fine-tune) weight.', **attrs)(f) def model_version_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]: - return cli_option('--model-version', - type=click.STRING, - default=None, - help='Optional model version to save for this model. It will be inferred automatically from model-id.', - **attrs)(f) + return cli_option('--model-version', type=click.STRING, default=None, help='Optional model version to save for this model. 
It will be inferred automatically from model-id.', **attrs)(f) def backend_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]: # NOTE: LiteralBackend needs to remove the last two item as ggml and mlc is wip @@ -512,8 +504,7 @@ def workers_per_resource_callback(ctx: click.Context, param: click.Parameter, va try: float(value) # type: ignore[arg-type] except ValueError: - raise click.BadParameter(f"'workers_per_resource' only accept '{_wpr_strategies}' as possible strategies, otherwise pass in float.", ctx, - param) from None + raise click.BadParameter(f"'workers_per_resource' only accept '{_wpr_strategies}' as possible strategies, otherwise pass in float.", ctx, param) from None else: return value diff --git a/openllm-python/src/openllm/cli/_sdk.py b/openllm-python/src/openllm/cli/_sdk.py index 98f1c7eb..c981f126 100644 --- a/openllm-python/src/openllm/cli/_sdk.py +++ b/openllm-python/src/openllm/cli/_sdk.py @@ -83,10 +83,7 @@ def _start(model_name: str, from .entrypoint import start_command from .entrypoint import start_grpc_command llm_config = openllm.AutoConfig.for_model(model_name) - _ModelEnv = openllm_core.utils.EnvVarMixin(model_name, - backend=openllm_core.utils.first_not_none(backend, default=llm_config.default_backend()), - model_id=model_id, - quantize=quantize) + _ModelEnv = openllm_core.utils.EnvVarMixin(model_name, backend=openllm_core.utils.first_not_none(backend, default=llm_config.default_backend()), model_id=model_id, quantize=quantize) os.environ[_ModelEnv.backend] = _ModelEnv['backend_value'] args: list[str] = [] @@ -102,9 +99,7 @@ def _start(model_name: str, if additional_args: args.extend(additional_args) if __test__: args.append('--return-process') - return start_command_factory(start_command if not _serve_grpc else start_grpc_command, - model_name, - _context_settings=termui.CONTEXT_SETTINGS, + return start_command_factory(start_command if not _serve_grpc else start_grpc_command, model_name, _context_settings=termui.CONTEXT_SETTINGS, _serve_grpc=_serve_grpc).main(args=args if len(args) > 0 else None, standalone_mode=False) @inject @@ -199,9 +194,7 @@ def _build(model_name: str, raise OpenLLMException(str(e)) from None matched = re.match(r'__tag__:([^:\n]+:[^:\n]+)$', output.decode('utf-8').strip()) if matched is None: - raise ValueError( - f"Failed to find tag from output: {output.decode('utf-8').strip()}\nNote: Output from 'openllm build' might not be correct. Please open an issue on GitHub." - ) + raise ValueError(f"Failed to find tag from output: {output.decode('utf-8').strip()}\nNote: Output from 'openllm build' might not be correct. 
Please open an issue on GitHub.") return bentoml.get(matched.group(1), _bento_store=bento_store) def _import_model(model_name: str, @@ -256,6 +249,5 @@ def _list_models() -> dict[str, t.Any]: return models_command.main(args=['-o', 'json', '--show-available', '--machine'], standalone_mode=False) start, start_grpc, build, import_model, list_models = openllm_core.utils.codegen.gen_sdk(_start, _serve_grpc=False), openllm_core.utils.codegen.gen_sdk( - _start, _serve_grpc=True), openllm_core.utils.codegen.gen_sdk(_build), openllm_core.utils.codegen.gen_sdk( - _import_model), openllm_core.utils.codegen.gen_sdk(_list_models) + _start, _serve_grpc=True), openllm_core.utils.codegen.gen_sdk(_build), openllm_core.utils.codegen.gen_sdk(_import_model), openllm_core.utils.codegen.gen_sdk(_list_models) __all__ = ['start', 'start_grpc', 'build', 'import_model', 'list_models'] diff --git a/openllm-python/src/openllm/cli/extension/build_base_container.py b/openllm-python/src/openllm/cli/extension/build_base_container.py index 16d3c50f..9590ed1d 100644 --- a/openllm-python/src/openllm/cli/extension/build_base_container.py +++ b/openllm-python/src/openllm/cli/extension/build_base_container.py @@ -28,14 +28,10 @@ if t.TYPE_CHECKING: Note that we already release images on our CI to ECR and GHCR, so you don't need to build it yourself. ''') @container_registry_option -@click.option('--version-strategy', - type=click.Choice(['release', 'latest', 'nightly']), - default='nightly', - help='Version strategy to use for tagging the image.') +@click.option('--version-strategy', type=click.Choice(['release', 'latest', 'nightly']), default='nightly', help='Version strategy to use for tagging the image.') @click.option('--push/--no-push', help='Whether to push to remote repository', is_flag=True, default=False) @machine_option -def cli(container_registry: tuple[LiteralContainerRegistry, ...] | None, version_strategy: LiteralContainerVersionStrategy, push: bool, - machine: bool) -> dict[str, str]: +def cli(container_registry: tuple[LiteralContainerRegistry, ...] | None, version_strategy: LiteralContainerVersionStrategy, push: bool, machine: bool) -> dict[str, str]: mapping = openllm.bundle.build_container(container_registry, version_strategy, push, machine) if machine: termui.echo(orjson.dumps(mapping, option=orjson.OPT_INDENT_2).decode(), fg='white') return mapping diff --git a/openllm-python/src/openllm/cli/extension/dive_bentos.py b/openllm-python/src/openllm/cli/extension/dive_bentos.py index 4126000b..8ddf43ac 100644 --- a/openllm-python/src/openllm/cli/extension/dive_bentos.py +++ b/openllm-python/src/openllm/cli/extension/dive_bentos.py @@ -31,9 +31,7 @@ def cli(ctx: click.Context, bento: str, machine: bool, _bento_store: BentoStore except bentoml.exceptions.NotFound: ctx.fail(f'Bento {bento} not found. Make sure to call `openllm build` first.') if 'bundler' not in bentomodel.info.labels or bentomodel.info.labels['bundler'] != 'openllm.bundle': - ctx.fail( - f"Bento is either too old or not built with OpenLLM. Make sure to use ``openllm build {bentomodel.info.labels['start_name']}`` for correctness." - ) + ctx.fail(f"Bento is either too old or not built with OpenLLM. 
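The __tag__ parsing in the _build hunk above is a plain regular-expression match; isolated, it behaves like the sketch below. The sample tag value is invented purely for illustration.

import re

_TAG_PATTERN = re.compile(r'__tag__:([^:\n]+:[^:\n]+)$')

def parse_tag_marker(output: str) -> str:
  # The stripped subprocess output is expected to be of the form '__tag__:<name>:<version>'.
  matched = _TAG_PATTERN.match(output.strip())
  if matched is None:
    raise ValueError(f'Failed to find tag from output: {output.strip()}')
  return matched.group(1)

assert parse_tag_marker('__tag__:opt-service:abc123\n') == 'opt-service:abc123'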
Make sure to use ``openllm build {bentomodel.info.labels['start_name']}`` for correctness.") if machine: return bentomodel.path # copy and paste this into a new shell if psutil.WINDOWS: subprocess.check_call([shutil.which('dir') or 'dir'], cwd=bentomodel.path) diff --git a/openllm-python/src/openllm/cli/extension/get_containerfile.py b/openllm-python/src/openllm/cli/extension/get_containerfile.py index df3c8f0b..151e508e 100644 --- a/openllm-python/src/openllm/cli/extension/get_containerfile.py +++ b/openllm-python/src/openllm/cli/extension/get_containerfile.py @@ -41,11 +41,6 @@ def cli(ctx: click.Context, bento: str, _bento_store: BentoStore = Provide[Bento # for the reconstruction of the Dockerfile. if 'dockerfile_template' in docker_attrs and docker_attrs['dockerfile_template'] is not None: docker_attrs['dockerfile_template'] = 'env/docker/Dockerfile.template' - doc = generate_containerfile(docker=DockerOptions(**docker_attrs), - build_ctx=bentomodel.path, - conda=options.conda, - bento_fs=bentomodel._fs, - enable_buildkit=True, - add_header=True) + doc = generate_containerfile(docker=DockerOptions(**docker_attrs), build_ctx=bentomodel.path, conda=options.conda, bento_fs=bentomodel._fs, enable_buildkit=True, add_header=True) termui.echo(doc, fg='white') return bentomodel.path diff --git a/openllm-python/src/openllm/cli/extension/get_prompt.py b/openllm-python/src/openllm/cli/extension/get_prompt.py index a3b66bc6..f83d7798 100644 --- a/openllm-python/src/openllm/cli/extension/get_prompt.py +++ b/openllm-python/src/openllm/cli/extension/get_prompt.py @@ -18,9 +18,7 @@ from openllm_core._prompt import process_prompt LiteralOutput = t.Literal['json', 'pretty', 'porcelain'] @click.command('get_prompt', context_settings=termui.CONTEXT_SETTINGS) -@click.argument('model_name', - type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING.keys()]), - shell_complete=model_complete_envvar) +@click.argument('model_name', type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING.keys()]), shell_complete=model_complete_envvar) @click.argument('prompt', type=click.STRING) @output_option @click.option('--format', type=click.STRING, default=None) @@ -32,8 +30,7 @@ LiteralOutput = t.Literal['json', 'pretty', 'porcelain'] callback=opt_callback, metavar='ARG=VALUE[,ARG=VALUE]') @click.pass_context -def cli(ctx: click.Context, /, model_name: str, prompt: str, format: str | None, output: LiteralOutput, machine: bool, _memoized: dict[str, t.Any], - **_: t.Any) -> str | None: +def cli(ctx: click.Context, /, model_name: str, prompt: str, format: str | None, output: LiteralOutput, machine: bool, _memoized: dict[str, t.Any], **_: t.Any) -> str | None: '''Get the default prompt used by OpenLLM.''' module = openllm.utils.EnvVarMixin(model_name).module _memoized = {k: v[0] for k, v in _memoized.items() if v} diff --git a/openllm-python/src/openllm/cli/extension/list_bentos.py b/openllm-python/src/openllm/cli/extension/list_bentos.py index 3169c878..3b13f38b 100644 --- a/openllm-python/src/openllm/cli/extension/list_bentos.py +++ b/openllm-python/src/openllm/cli/extension/list_bentos.py @@ -22,17 +22,10 @@ def cli(ctx: click.Context, output: LiteralOutput) -> None: 'tag': str(b.tag), 'size': human_readable_size(openllm.utils.calc_dir_size(b.path)), 'models': [{ - 'tag': str(m.tag), - 'size': human_readable_size(openllm.utils.calc_dir_size(m.path)) - } - for m in (bentoml.models.get(_.tag) - for _ in b.info.models)] - } - for b in tuple(i - for i in bentoml.list() - if 
all(k in i.info.labels - for k in {'start_name', 'bundler'})) - if b.info.labels['start_name'] == k] for k in tuple(inflection.dasherize(key) for key in openllm.CONFIG_MAPPING.keys()) + 'tag': str(m.tag), 'size': human_readable_size(openllm.utils.calc_dir_size(m.path)) + } for m in (bentoml.models.get(_.tag) for _ in b.info.models)] + } for b in tuple(i for i in bentoml.list() if all( + k in i.info.labels for k in {'start_name', 'bundler'})) if b.info.labels['start_name'] == k] for k in tuple(inflection.dasherize(key) for key in openllm.CONFIG_MAPPING.keys()) } mapping = {k: v for k, v in mapping.items() if v} if output == 'pretty': diff --git a/openllm-python/src/openllm/cli/extension/list_models.py b/openllm-python/src/openllm/cli/extension/list_models.py index 2d87560e..fdcab1cc 100644 --- a/openllm-python/src/openllm/cli/extension/list_models.py +++ b/openllm-python/src/openllm/cli/extension/list_models.py @@ -25,30 +25,17 @@ def cli(model_name: str | None, output: LiteralOutput) -> DictStrAny: '''This is equivalent to openllm models --show-available less the nice table.''' models = tuple(inflection.dasherize(key) for key in openllm.CONFIG_MAPPING.keys()) ids_in_local_store = { - k: [ - i for i in bentoml.models.list() if 'framework' in i.info.labels and i.info.labels['framework'] == 'openllm' and - 'model_name' in i.info.labels and i.info.labels['model_name'] == k - ] for k in models + k: [i for i in bentoml.models.list() if 'framework' in i.info.labels and i.info.labels['framework'] == 'openllm' and 'model_name' in i.info.labels and i.info.labels['model_name'] == k] + for k in models } if model_name is not None: - ids_in_local_store = { - k: [i for i in v if 'model_name' in i.info.labels and i.info.labels['model_name'] == inflection.dasherize(model_name)] - for k, v in ids_in_local_store.items() - } + ids_in_local_store = {k: [i for i in v if 'model_name' in i.info.labels and i.info.labels['model_name'] == inflection.dasherize(model_name)] for k, v in ids_in_local_store.items()} ids_in_local_store = {k: v for k, v in ids_in_local_store.items() if v} - local_models = { - k: [{ - 'tag': str(i.tag), - 'size': human_readable_size(openllm.utils.calc_dir_size(i.path)) - } for i in val] for k, val in ids_in_local_store.items() - } + local_models = {k: [{'tag': str(i.tag), 'size': human_readable_size(openllm.utils.calc_dir_size(i.path))} for i in val] for k, val in ids_in_local_store.items()} if output == 'pretty': import tabulate tabulate.PRESERVE_WHITESPACE = True - termui.echo(tabulate.tabulate([(k, i['tag'], i['size']) for k, v in local_models.items() for i in v], - tablefmt='fancy_grid', - headers=['LLM', 'Tag', 'Size']), - fg='white') + termui.echo(tabulate.tabulate([(k, i['tag'], i['size']) for k, v in local_models.items() for i in v], tablefmt='fancy_grid', headers=['LLM', 'Tag', 'Size']), fg='white') else: termui.echo(orjson.dumps(local_models, option=orjson.OPT_INDENT_2).decode(), fg='white') return local_models diff --git a/openllm-python/src/openllm/models/auto/factory.py b/openllm-python/src/openllm/models/auto/factory.py index fb73cf56..d04d7bef 100644 --- a/openllm-python/src/openllm/models/auto/factory.py +++ b/openllm-python/src/openllm/models/auto/factory.py @@ -153,13 +153,11 @@ class _LazyAutoMapping(OrderedDict, ReprMixin): def keys(self) -> ConfigModelKeysView: return t.cast('ConfigModelKeysView', - [self._load_attr_from_module(key, name) for key, name in self._config_mapping.items() if key in self._model_mapping.keys()] + - list(self._extra_content.keys())) + 
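Conceptually, the list_bentos and list_models comprehensions above just bucket store entries by model name using their labels. A simplified, dependency-free sketch of that grouping follows; the Item dataclass and sample entries are illustrative stand-ins, not the BentoML model objects.

import typing as t
from dataclasses import dataclass, field

@dataclass
class Item:
  tag: str
  labels: t.Dict[str, str] = field(default_factory=dict)

def group_by_model(store: t.Sequence[Item], models: t.Iterable[str]) -> t.Dict[str, t.List[Item]]:
  # Keep only OpenLLM-labelled entries, bucketed per model name; empty buckets are dropped.
  grouped = {m: [i for i in store if i.labels.get('framework') == 'openllm' and i.labels.get('model_name') == m] for m in models}
  return {k: v for k, v in grouped.items() if v}

entries = [Item('pt-opt:1', {'framework': 'openllm', 'model_name': 'opt'}), Item('other:1')]
print(group_by_model(entries, ['opt', 'llama']))  # only the 'opt' bucket survives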
[self._load_attr_from_module(key, name) for key, name in self._config_mapping.items() if key in self._model_mapping.keys()] + list(self._extra_content.keys())) def values(self) -> ConfigModelValuesView: return t.cast('ConfigModelValuesView', - [self._load_attr_from_module(key, name) for key, name in self._model_mapping.items() if key in self._config_mapping.keys()] + - list(self._extra_content.values())) + [self._load_attr_from_module(key, name) for key, name in self._model_mapping.items() if key in self._config_mapping.keys()] + list(self._extra_content.values())) def items(self) -> ConfigModelItemsView: return t.cast('ConfigModelItemsView', diff --git a/openllm-python/src/openllm/models/auto/modeling_auto.py b/openllm-python/src/openllm/models/auto/modeling_auto.py index dfcd3b55..ff9aad7d 100644 --- a/openllm-python/src/openllm/models/auto/modeling_auto.py +++ b/openllm-python/src/openllm/models/auto/modeling_auto.py @@ -7,9 +7,8 @@ from openllm_core.config import CONFIG_MAPPING_NAMES from .factory import BaseAutoLLMClass from .factory import _LazyAutoMapping -MODEL_MAPPING_NAMES = OrderedDict([('chatglm', 'ChatGLM'), ('dolly_v2', 'DollyV2'), ('falcon', 'Falcon'), ('flan_t5', 'FlanT5'), - ('gpt_neox', 'GPTNeoX'), ('llama', 'Llama'), ('mpt', 'MPT'), ('opt', 'OPT'), ('stablelm', 'StableLM'), - ('starcoder', 'StarCoder'), ('baichuan', 'Baichuan')]) +MODEL_MAPPING_NAMES = OrderedDict([('chatglm', 'ChatGLM'), ('dolly_v2', 'DollyV2'), ('falcon', 'Falcon'), ('flan_t5', 'FlanT5'), ('gpt_neox', 'GPTNeoX'), ('llama', 'Llama'), ('mpt', 'MPT'), + ('opt', 'OPT'), ('stablelm', 'StableLM'), ('starcoder', 'StarCoder'), ('baichuan', 'Baichuan')]) MODEL_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_MAPPING_NAMES) class AutoLLM(BaseAutoLLMClass): diff --git a/openllm-python/src/openllm/models/auto/modeling_vllm_auto.py b/openllm-python/src/openllm/models/auto/modeling_vllm_auto.py index e7538975..08e6379e 100644 --- a/openllm-python/src/openllm/models/auto/modeling_vllm_auto.py +++ b/openllm-python/src/openllm/models/auto/modeling_vllm_auto.py @@ -7,9 +7,8 @@ from openllm_core.config import CONFIG_MAPPING_NAMES from .factory import BaseAutoLLMClass from .factory import _LazyAutoMapping -MODEL_VLLM_MAPPING_NAMES = OrderedDict([('baichuan', 'VLLMBaichuan'), ('dolly_v2', 'VLLMDollyV2'), ('falcon', 'VLLMFalcon'), - ('gpt_neox', 'VLLMGPTNeoX'), ('mpt', 'VLLMMPT'), ('opt', 'VLLMOPT'), ('stablelm', 'VLLMStableLM'), - ('starcoder', 'VLLMStarCoder'), ('llama', 'VLLMLlama')]) +MODEL_VLLM_MAPPING_NAMES = OrderedDict([('baichuan', 'VLLMBaichuan'), ('dolly_v2', 'VLLMDollyV2'), ('falcon', 'VLLMFalcon'), ('gpt_neox', 'VLLMGPTNeoX'), ('mpt', 'VLLMMPT'), + ('opt', 'VLLMOPT'), ('stablelm', 'VLLMStableLM'), ('starcoder', 'VLLMStarCoder'), ('llama', 'VLLMLlama')]) MODEL_VLLM_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_VLLM_MAPPING_NAMES) class AutoVLLM(BaseAutoLLMClass): diff --git a/openllm-python/src/openllm/models/dolly_v2/modeling_dolly_v2.py b/openllm-python/src/openllm/models/dolly_v2/modeling_dolly_v2.py index 9ad1be40..52f38bde 100644 --- a/openllm-python/src/openllm/models/dolly_v2/modeling_dolly_v2.py +++ b/openllm-python/src/openllm/models/dolly_v2/modeling_dolly_v2.py @@ -12,36 +12,24 @@ from openllm_core.config.configuration_dolly_v2 import get_special_token_id if t.TYPE_CHECKING: import torch, transformers, tensorflow as tf else: torch, transformers, tf = openllm.utils.LazyLoader('torch', globals(), 'torch'), openllm.utils.LazyLoader('transformers', globals(), - 'transformers'), 
openllm.utils.LazyLoader( - 'tf', globals(), 'tensorflow') + 'transformers'), openllm.utils.LazyLoader('tf', globals(), 'tensorflow') logger = logging.getLogger(__name__) @overload -def get_pipeline(model: transformers.PreTrainedModel, - tokenizer: transformers.PreTrainedTokenizer, - _init: t.Literal[True] = True, - **attrs: t.Any) -> transformers.Pipeline: +def get_pipeline(model: transformers.PreTrainedModel, tokenizer: transformers.PreTrainedTokenizer, _init: t.Literal[True] = True, **attrs: t.Any) -> transformers.Pipeline: ... @overload -def get_pipeline(model: transformers.PreTrainedModel, - tokenizer: transformers.PreTrainedTokenizer, - _init: t.Literal[False] = ..., - **attrs: t.Any) -> type[transformers.Pipeline]: +def get_pipeline(model: transformers.PreTrainedModel, tokenizer: transformers.PreTrainedTokenizer, _init: t.Literal[False] = ..., **attrs: t.Any) -> type[transformers.Pipeline]: ... -def get_pipeline(model: transformers.PreTrainedModel, - tokenizer: transformers.PreTrainedTokenizer, - _init: bool = False, - **attrs: t.Any) -> type[transformers.Pipeline] | transformers.Pipeline: +def get_pipeline(model: transformers.PreTrainedModel, tokenizer: transformers.PreTrainedTokenizer, _init: bool = False, **attrs: t.Any) -> type[transformers.Pipeline] | transformers.Pipeline: # Lazy loading the pipeline. See databricks' implementation on HuggingFace for more information. class InstructionTextGenerationPipeline(transformers.Pipeline): def __init__(self, *args: t.Any, do_sample: bool = True, max_new_tokens: int = 256, top_p: float = 0.92, top_k: int = 0, **kwargs: t.Any): super().__init__(*args, model=model, tokenizer=tokenizer, do_sample=do_sample, max_new_tokens=max_new_tokens, top_p=top_p, top_k=top_k, **kwargs) - def _sanitize_parameters(self, - return_full_text: bool | None = None, - **generate_kwargs: t.Any) -> tuple[dict[str, t.Any], dict[str, t.Any], dict[str, t.Any]]: + def _sanitize_parameters(self, return_full_text: bool | None = None, **generate_kwargs: t.Any) -> tuple[dict[str, t.Any], dict[str, t.Any], dict[str, t.Any]]: if t.TYPE_CHECKING: assert self.tokenizer is not None preprocess_params: dict[str, t.Any] = {} # newer versions of the tokenizer configure the response key as a special token. 
newer versions still may @@ -87,11 +75,7 @@ def get_pipeline(model: transformers.PreTrainedModel, instruction_text = input_tensors.pop('instruction_text') return {'generated_sequence': generated_sequence, 'input_ids': input_ids, 'instruction_text': instruction_text} - def postprocess(self, - model_outputs: dict[str, t.Any], - response_key_token_id: int, - end_key_token_id: int, - return_full_text: bool = False) -> list[dict[t.Literal['generated_text'], str]]: + def postprocess(self, model_outputs: dict[str, t.Any], response_key_token_id: int, end_key_token_id: int, return_full_text: bool = False) -> list[dict[t.Literal['generated_text'], str]]: if t.TYPE_CHECKING: assert self.tokenizer is not None _generated_sequence, instruction_text = model_outputs['generated_sequence'][0], model_outputs['instruction_text'] generated_sequence: list[list[int]] = _generated_sequence.numpy().tolist() @@ -149,10 +133,7 @@ class DollyV2(openllm.LLM['transformers.Pipeline', 'transformers.PreTrainedToken return {'device_map': 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None, 'torch_dtype': torch.bfloat16}, {} def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.Pipeline: - return get_pipeline(transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, *args, **attrs), - self.tokenizer, - _init=True, - return_full_text=self.config.return_full_text) + return get_pipeline(transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, *args, **attrs), self.tokenizer, _init=True, return_full_text=self.config.return_full_text) def generate(self, prompt: str, **attrs: t.Any) -> list[dict[t.Literal['generated_text'], str]]: llm_config = self.config.model_construct_env(**attrs) diff --git a/openllm-python/src/openllm/models/falcon/modeling_falcon.py b/openllm-python/src/openllm/models/falcon/modeling_falcon.py index d0cf4837..91ffaafb 100644 --- a/openllm-python/src/openllm/models/falcon/modeling_falcon.py +++ b/openllm-python/src/openllm/models/falcon/modeling_falcon.py @@ -18,17 +18,14 @@ class Falcon(openllm.LLM['transformers.PreTrainedModel', 'transformers.PreTraine with torch.inference_mode(), torch.autocast('cuda', dtype=torch.float16): # type: ignore[attr-defined] return self.tokenizer.batch_decode(self.model.generate(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'], - generation_config=self.config.model_construct_env(eos_token_id=eos_token_id, - **attrs).to_generation_config()), + generation_config=self.config.model_construct_env(eos_token_id=eos_token_id, **attrs).to_generation_config()), skip_special_tokens=True) def generate_one(self, prompt: str, stop: list[str], **preprocess_generate_kwds: t.Any) -> list[dict[t.Literal['generated_text'], str]]: max_new_tokens, encoded_inputs = preprocess_generate_kwds.pop('max_new_tokens', 200), self.tokenizer(prompt, return_tensors='pt').to(self.device) - src_len, stopping_criteria = encoded_inputs['input_ids'].shape[1], preprocess_generate_kwds.pop('stopping_criteria', - openllm.StoppingCriteriaList([])) + src_len, stopping_criteria = encoded_inputs['input_ids'].shape[1], preprocess_generate_kwds.pop('stopping_criteria', openllm.StoppingCriteriaList([])) stopping_criteria.append(openllm.StopSequenceCriteria(stop, self.tokenizer)) - result = self.tokenizer.decode( - self.model.generate(encoded_inputs['input_ids'], max_new_tokens=max_new_tokens, stopping_criteria=stopping_criteria)[0].tolist()[src_len:]) + result = self.tokenizer.decode(self.model.generate(encoded_inputs['input_ids'], 
max_new_tokens=max_new_tokens, stopping_criteria=stopping_criteria)[0].tolist()[src_len:]) # Inference API returns the stop sequence for stop_seq in stop: if result.endswith(stop_seq): result = result[:-len(stop_seq)] diff --git a/openllm-python/src/openllm/models/llama/modeling_llama.py b/openllm-python/src/openllm/models/llama/modeling_llama.py index 54cf394e..48dbb434 100644 --- a/openllm-python/src/openllm/models/llama/modeling_llama.py +++ b/openllm-python/src/openllm/models/llama/modeling_llama.py @@ -23,16 +23,13 @@ class Llama(openllm.LLM['transformers.LlamaForCausalLM', 'transformers.LlamaToke mask = attention_mask.unsqueeze(-1).expand(data.size()).float() masked_embeddings = data * mask sum_embeddings, seq_length = torch.sum(masked_embeddings, dim=1), torch.sum(mask, dim=1) - return openllm.EmbeddingsOutput(embeddings=F.normalize(sum_embeddings / seq_length, p=2, dim=1).tolist(), - num_tokens=int(torch.sum(attention_mask).item())) + return openllm.EmbeddingsOutput(embeddings=F.normalize(sum_embeddings / seq_length, p=2, dim=1).tolist(), num_tokens=int(torch.sum(attention_mask).item())) def generate_one(self, prompt: str, stop: list[str], **preprocess_generate_kwds: t.Any) -> list[dict[t.Literal['generated_text'], str]]: max_new_tokens, encoded_inputs = preprocess_generate_kwds.pop('max_new_tokens', 200), self.tokenizer(prompt, return_tensors='pt').to(self.device) - src_len, stopping_criteria = encoded_inputs['input_ids'].shape[1], preprocess_generate_kwds.pop('stopping_criteria', - openllm.StoppingCriteriaList([])) + src_len, stopping_criteria = encoded_inputs['input_ids'].shape[1], preprocess_generate_kwds.pop('stopping_criteria', openllm.StoppingCriteriaList([])) stopping_criteria.append(openllm.StopSequenceCriteria(stop, self.tokenizer)) - result = self.tokenizer.decode( - self.model.generate(encoded_inputs['input_ids'], max_new_tokens=max_new_tokens, stopping_criteria=stopping_criteria)[0].tolist()[src_len:]) + result = self.tokenizer.decode(self.model.generate(encoded_inputs['input_ids'], max_new_tokens=max_new_tokens, stopping_criteria=stopping_criteria)[0].tolist()[src_len:]) # Inference API returns the stop sequence for stop_seq in stop: if result.endswith(stop_seq): result = result[:-len(stop_seq)] diff --git a/openllm-python/src/openllm/models/mpt/modeling_mpt.py b/openllm-python/src/openllm/models/mpt/modeling_mpt.py index d79532f8..2a5be097 100644 --- a/openllm-python/src/openllm/models/mpt/modeling_mpt.py +++ b/openllm-python/src/openllm/models/mpt/modeling_mpt.py @@ -36,10 +36,7 @@ class MPT(openllm.LLM['transformers.PreTrainedModel', 'transformers.GPTNeoXToken @property def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: import torch - return { - 'device_map': 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None, - 'torch_dtype': torch.bfloat16 if torch.cuda.is_available() else torch.float32 - }, {} + return {'device_map': 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None, 'torch_dtype': torch.bfloat16 if torch.cuda.is_available() else torch.float32}, {} def import_model(self, *args: t.Any, trust_remote_code: bool = True, **attrs: t.Any) -> bentoml.Model: import torch @@ -51,12 +48,7 @@ class MPT(openllm.LLM['transformers.PreTrainedModel', 'transformers.GPTNeoXToken config = get_mpt_config(self.model_id, self.config.max_sequence_length, self.device, device_map=device_map, trust_remote_code=trust_remote_code) tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_id, 
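The Falcon, Llama and StarCoder generate_one hunks all end with the same stop-sequence trimming loop. Pulled out on its own it is simply:

import typing as t

def trim_stop_sequences(result: str, stop: t.List[str]) -> str:
  # The decoded output may end with one of the stop strings; strip it off when it does.
  for stop_seq in stop:
    if result.endswith(stop_seq):
      result = result[:-len(stop_seq)]
  return result

assert trim_stop_sequences('a short answer\nUser:', ['User:', 'Assistant:']) == 'a short answer\n'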
**tokenizer_attrs) if tokenizer.pad_token_id is None: tokenizer.pad_token = tokenizer.eos_token - model = transformers.AutoModelForCausalLM.from_pretrained(self.model_id, - config=config, - torch_dtype=torch_dtype, - trust_remote_code=trust_remote_code, - device_map=device_map, - **attrs) + model = transformers.AutoModelForCausalLM.from_pretrained(self.model_id, config=config, torch_dtype=torch_dtype, trust_remote_code=trust_remote_code, device_map=device_map, **attrs) try: return bentoml.transformers.save_model(self.tag, model, custom_objects={'tokenizer': tokenizer}, labels=generate_labels(self)) finally: @@ -67,12 +59,7 @@ class MPT(openllm.LLM['transformers.PreTrainedModel', 'transformers.GPTNeoXToken torch_dtype = attrs.pop('torch_dtype', torch.bfloat16 if torch.cuda.is_available() else torch.float32) device_map = attrs.pop('device_map', None) trust_remote_code = attrs.pop('trust_remote_code', True) - config = get_mpt_config(self._bentomodel.path, - self.config.max_sequence_length, - self.device, - device_map=device_map, - trust_remote_code=trust_remote_code, - ) + config = get_mpt_config(self._bentomodel.path, self.config.max_sequence_length, self.device, device_map=device_map, trust_remote_code=trust_remote_code,) model = transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, config=config, trust_remote_code=trust_remote_code, diff --git a/openllm-python/src/openllm/models/opt/modeling_flax_opt.py b/openllm-python/src/openllm/models/opt/modeling_flax_opt.py index 19239321..f8757121 100644 --- a/openllm-python/src/openllm/models/opt/modeling_flax_opt.py +++ b/openllm-python/src/openllm/models/opt/modeling_flax_opt.py @@ -16,8 +16,7 @@ class FlaxOPT(openllm.LLM['transformers.TFOPTForCausalLM', 'transformers.GPT2Tok __openllm_internal__ = True def import_model(self, *args: t.Any, trust_remote_code: bool = False, **attrs: t.Any) -> bentoml.Model: - config, tokenizer = transformers.AutoConfig.from_pretrained(self.model_id), transformers.AutoTokenizer.from_pretrained( - self.model_id, **self.llm_parameters[-1]) + config, tokenizer = transformers.AutoConfig.from_pretrained(self.model_id), transformers.AutoTokenizer.from_pretrained(self.model_id, **self.llm_parameters[-1]) tokenizer.pad_token_id = config.pad_token_id return bentoml.transformers.save_model(self.tag, transformers.FlaxAutoModelForCausalLM.from_pretrained(self.model_id, **attrs), @@ -34,11 +33,7 @@ class FlaxOPT(openllm.LLM['transformers.TFOPTForCausalLM', 'transformers.GPT2Tok use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), { - 'max_new_tokens': max_new_tokens, - 'temperature': temperature, - 'top_k': top_k, - 'num_return_sequences': num_return_sequences, - 'repetition_penalty': repetition_penalty + 'max_new_tokens': max_new_tokens, 'temperature': temperature, 'top_k': top_k, 'num_return_sequences': num_return_sequences, 'repetition_penalty': repetition_penalty }, {} def generate(self, prompt: str, **attrs: t.Any) -> list[str]: diff --git a/openllm-python/src/openllm/models/opt/modeling_tf_opt.py b/openllm-python/src/openllm/models/opt/modeling_tf_opt.py index d00fa7ea..824dddbc 100644 --- a/openllm-python/src/openllm/models/opt/modeling_tf_opt.py +++ b/openllm-python/src/openllm/models/opt/modeling_tf_opt.py @@ -11,8 +11,7 @@ class TFOPT(openllm.LLM['transformers.TFOPTForCausalLM', 'transformers.GPT2Token def import_model(self, *args: t.Any, 
trust_remote_code: bool = False, **attrs: t.Any) -> bentoml.Model: import transformers - config, tokenizer = transformers.AutoConfig.from_pretrained(self.model_id), transformers.AutoTokenizer.from_pretrained( - self.model_id, **self.llm_parameters[-1]) + config, tokenizer = transformers.AutoConfig.from_pretrained(self.model_id), transformers.AutoTokenizer.from_pretrained(self.model_id, **self.llm_parameters[-1]) tokenizer.pad_token_id = config.pad_token_id return bentoml.transformers.save_model(self.tag, transformers.TFOPTForCausalLM.from_pretrained(self.model_id, trust_remote_code=trust_remote_code, **attrs), diff --git a/openllm-python/src/openllm/models/opt/modeling_vllm_opt.py b/openllm-python/src/openllm/models/opt/modeling_vllm_opt.py index a7e5a97a..fec0a85c 100644 --- a/openllm-python/src/openllm/models/opt/modeling_vllm_opt.py +++ b/openllm-python/src/openllm/models/opt/modeling_vllm_opt.py @@ -19,8 +19,5 @@ class VLLMOPT(openllm.LLM['vllm.LLMEngine', 'transformers.GPT2Tokenizer']): use_default_prompt_template: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), { - 'max_new_tokens': max_new_tokens, - 'temperature': temperature, - 'top_k': top_k, - 'num_return_sequences': num_return_sequences + 'max_new_tokens': max_new_tokens, 'temperature': temperature, 'top_k': top_k, 'num_return_sequences': num_return_sequences }, {} diff --git a/openllm-python/src/openllm/models/starcoder/modeling_starcoder.py b/openllm-python/src/openllm/models/starcoder/modeling_starcoder.py index 89de3061..fcf396cc 100644 --- a/openllm-python/src/openllm/models/starcoder/modeling_starcoder.py +++ b/openllm-python/src/openllm/models/starcoder/modeling_starcoder.py @@ -18,10 +18,7 @@ class StarCoder(openllm.LLM['transformers.GPTBigCodeForCausalLM', 'transformers. @property def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: import torch - return { - 'device_map': 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None, - 'torch_dtype': torch.float16 if torch.cuda.is_available() else torch.float32 - }, {} + return {'device_map': 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None, 'torch_dtype': torch.float16 if torch.cuda.is_available() else torch.float32}, {} def import_model(self, *args: t.Any, trust_remote_code: bool = False, **attrs: t.Any) -> bentoml.Model: import torch @@ -50,11 +47,9 @@ class StarCoder(openllm.LLM['transformers.GPTBigCodeForCausalLM', 'transformers. 
def generate_one(self, prompt: str, stop: list[str], **preprocess_generate_kwds: t.Any) -> list[dict[t.Literal['generated_text'], str]]: max_new_tokens, encoded_inputs = preprocess_generate_kwds.pop('max_new_tokens', 200), self.tokenizer(prompt, return_tensors='pt').to(self.device) - src_len, stopping_criteria = encoded_inputs['input_ids'].shape[1], preprocess_generate_kwds.pop('stopping_criteria', - openllm.StoppingCriteriaList([])) + src_len, stopping_criteria = encoded_inputs['input_ids'].shape[1], preprocess_generate_kwds.pop('stopping_criteria', openllm.StoppingCriteriaList([])) stopping_criteria.append(openllm.StopSequenceCriteria(stop, self.tokenizer)) - result = self.tokenizer.decode( - self.model.generate(encoded_inputs['input_ids'], max_new_tokens=max_new_tokens, stopping_criteria=stopping_criteria)[0].tolist()[src_len:]) + result = self.tokenizer.decode(self.model.generate(encoded_inputs['input_ids'], max_new_tokens=max_new_tokens, stopping_criteria=stopping_criteria)[0].tolist()[src_len:]) # Inference API returns the stop sequence for stop_seq in stop: if result.endswith(stop_seq): result = result[:-len(stop_seq)] diff --git a/openllm-python/src/openllm/playground/falcon_tuned.py b/openllm-python/src/openllm/playground/falcon_tuned.py index 3776c3c1..17229a7c 100644 --- a/openllm-python/src/openllm/playground/falcon_tuned.py +++ b/openllm-python/src/openllm/playground/falcon_tuned.py @@ -56,18 +56,13 @@ if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): else: model_args, training_args = t.cast(t.Tuple[ModelArguments, TrainingArguments], parser.parse_args_into_dataclasses()) -model, tokenizer = openllm.AutoLLM.for_model("falcon", - model_id=model_args.model_id, - quantize="int4", - bnb_4bit_quant_type="nf4", - bnb_4bit_compute_dtype=torch.float16, - ensure_available=True).prepare_for_training( - adapter_type="lora", - lora_alpha=16, - lora_dropout=0.1, - r=16, - bias="none", - target_modules=["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"]) +model, tokenizer = openllm.AutoLLM.for_model("falcon", model_id=model_args.model_id, quantize="int4", bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.float16, + ensure_available=True).prepare_for_training(adapter_type="lora", + lora_alpha=16, + lora_dropout=0.1, + r=16, + bias="none", + target_modules=["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"]) model.config.use_cache = False tokenizer.pad_token = tokenizer.eos_token diff --git a/openllm-python/src/openllm/playground/llama2_qlora.py b/openllm-python/src/openllm/playground/llama2_qlora.py index c9bb9630..ea28d078 100644 --- a/openllm-python/src/openllm/playground/llama2_qlora.py +++ b/openllm-python/src/openllm/playground/llama2_qlora.py @@ -98,8 +98,7 @@ def prepare_datasets(tokenizer, dataset_name=DATASET_NAME): print("Sample from dolly-v2 ds:", dataset[randint(0, len(dataset))]["text"]) # tokenize and chunk dataset - lm_dataset = dataset.map(lambda sample: tokenizer(sample["text"]), batched=True, - remove_columns=list(dataset.features)).map(partial(chunk, chunk_length=2048), batched=True) + lm_dataset = dataset.map(lambda sample: tokenizer(sample["text"]), batched=True, remove_columns=list(dataset.features)).map(partial(chunk, chunk_length=2048), batched=True) # Print total number of samples print(f"Total number of samples: {len(lm_dataset)}") @@ -180,15 +179,11 @@ def train_loop(model_args: ModelArguments, training_args: TrainingArguments): transformers.set_seed(model_args.seed) - model, tokenizer = 
prepare_for_int4_training(model_args.model_id, - gradient_checkpointing=training_args.gradient_checkpointing, - bf16=training_args.bf16, - ) + model, tokenizer = prepare_for_int4_training(model_args.model_id, gradient_checkpointing=training_args.gradient_checkpointing, bf16=training_args.bf16,) datasets = prepare_datasets(tokenizer) trainer = transformers.Trainer(model=model, - args=dataclasses.replace(transformers.TrainingArguments(training_args.output_dir), - **dataclasses.asdict(training_args)), + args=dataclasses.replace(transformers.TrainingArguments(training_args.output_dir), **dataclasses.asdict(training_args)), train_dataset=datasets, data_collator=transformers.default_data_collator, ) diff --git a/openllm-python/src/openllm/playground/opt_tuned.py b/openllm-python/src/openllm/playground/opt_tuned.py index 5b21e600..7c2386ae 100644 --- a/openllm-python/src/openllm/playground/opt_tuned.py +++ b/openllm-python/src/openllm/playground/opt_tuned.py @@ -56,13 +56,12 @@ if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): else: model_args, training_args = t.cast(t.Tuple[ModelArguments, TrainingArguments], parser.parse_args_into_dataclasses()) -model, tokenizer = openllm.AutoLLM.for_model("opt", model_id=model_args.model_id, quantize="int8", - ensure_available=True).prepare_for_training(adapter_type="lora", - r=16, - lora_alpha=32, - target_modules=["q_proj", "v_proj"], - lora_dropout=0.05, - bias="none") +model, tokenizer = openllm.AutoLLM.for_model("opt", model_id=model_args.model_id, quantize="int8", ensure_available=True).prepare_for_training(adapter_type="lora", + r=16, + lora_alpha=32, + target_modules=["q_proj", "v_proj"], + lora_dropout=0.05, + bias="none") # ft on english_quotes data = load_dataset("Abirate/english_quotes") diff --git a/openllm-python/src/openllm/serialisation/__init__.py b/openllm-python/src/openllm/serialisation/__init__.py index e336bca8..caabd0d6 100644 --- a/openllm-python/src/openllm/serialisation/__init__.py +++ b/openllm-python/src/openllm/serialisation/__init__.py @@ -43,13 +43,10 @@ def load_tokenizer(llm: openllm.LLM[t.Any, T], **tokenizer_attrs: t.Any) -> T: try: tokenizer = cloudpickle.load(t.cast('t.IO[bytes]', cofile))['tokenizer'] except KeyError: - raise openllm.exceptions.OpenLLMException( - "Bento model does not have tokenizer. Make sure to save the tokenizer within the model via 'custom_objects'. " - "For example: \"bentoml.transformers.save_model(..., custom_objects={'tokenizer': tokenizer})\"") from None + raise openllm.exceptions.OpenLLMException("Bento model does not have tokenizer. Make sure to save the tokenizer within the model via 'custom_objects'. 
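The playground hunks above feed LoRA hyperparameters directly into prepare_for_training. Assuming those keyword arguments are forwarded to the peft library (that forwarding is not shown in this diff), a peft-level equivalent of the opt_tuned.py call would look roughly like the following; task_type is an added assumption.

from peft import LoraConfig  # assumes the 'peft' package is installed

# Values mirror the opt_tuned.py hunk; task_type is not in the hunk and is assumed here.
opt_lora = LoraConfig(r=16, lora_alpha=32, target_modules=['q_proj', 'v_proj'], lora_dropout=0.05, bias='none', task_type='CAUSAL_LM')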
" + "For example: \"bentoml.transformers.save_model(..., custom_objects={'tokenizer': tokenizer})\"") from None else: - tokenizer = infer_tokenizers_from_llm(llm).from_pretrained(bentomodel_fs.getsyspath('/'), - trust_remote_code=llm.trust_remote_code, - **tokenizer_attrs) + tokenizer = infer_tokenizers_from_llm(llm).from_pretrained(bentomodel_fs.getsyspath('/'), trust_remote_code=llm.trust_remote_code, **tokenizer_attrs) if tokenizer.pad_token_id is None: if config.pad_token_id is not None: tokenizer.pad_token_id = config.pad_token_id diff --git a/openllm-python/src/openllm/serialisation/constants.py b/openllm-python/src/openllm/serialisation/constants.py index f90116ec..95d447ef 100644 --- a/openllm-python/src/openllm/serialisation/constants.py +++ b/openllm-python/src/openllm/serialisation/constants.py @@ -6,6 +6,4 @@ FRAMEWORK_TO_AUTOCLASS_MAPPING = { 'flax': ('FlaxAutoModelForCausalLM', 'FlaxAutoModelForSeq2SeqLM'), 'vllm': ('AutoModelForCausalLM', 'AutoModelForSeq2SeqLM') } -HUB_ATTRS = [ - 'cache_dir', 'code_revision', 'force_download', 'local_files_only', 'proxies', 'resume_download', 'revision', 'subfolder', 'use_auth_token' -] +HUB_ATTRS = ['cache_dir', 'code_revision', 'force_download', 'local_files_only', 'proxies', 'resume_download', 'revision', 'subfolder', 'use_auth_token'] diff --git a/openllm-python/src/openllm/serialisation/transformers/__init__.py b/openllm-python/src/openllm/serialisation/transformers/__init__.py index 40062be0..44f36576 100644 --- a/openllm-python/src/openllm/serialisation/transformers/__init__.py +++ b/openllm-python/src/openllm/serialisation/transformers/__init__.py @@ -43,11 +43,7 @@ logger = logging.getLogger(__name__) __all__ = ['import_model', 'get', 'load_model'] @inject -def import_model(llm: openllm.LLM[M, T], - *decls: t.Any, - trust_remote_code: bool, - _model_store: ModelStore = Provide[BentoMLContainer.model_store], - **attrs: t.Any) -> bentoml.Model: +def import_model(llm: openllm.LLM[M, T], *decls: t.Any, trust_remote_code: bool, _model_store: ModelStore = Provide[BentoMLContainer.model_store], **attrs: t.Any) -> bentoml.Model: """Auto detect model type from given model_id and import it to bentoml's model store. For all kwargs, it will be parsed into `transformers.AutoConfig.from_pretrained` first, @@ -76,8 +72,7 @@ def import_model(llm: openllm.LLM[M, T], if quantize_method == 'gptq': if not openllm.utils.is_autogptq_available(): - raise openllm.exceptions.OpenLLMException( - "GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'") + raise openllm.exceptions.OpenLLMException("GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'") if llm.config['model_type'] != 'causal_lm': raise openllm.exceptions.OpenLLMException(f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})") signatures['generate'] = {'batchable': False} @@ -107,8 +102,7 @@ def import_model(llm: openllm.LLM[M, T], tokenizer.save_pretrained(bentomodel.path) if quantize_method == 'gptq': if not openllm.utils.is_autogptq_available(): - raise openllm.exceptions.OpenLLMException( - "GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'") + raise openllm.exceptions.OpenLLMException("GPTQ quantisation requires 'auto-gptq' (Not found in local environment). 
Install it with 'pip install \"openllm[gptq]\"'") if llm.config['model_type'] != 'causal_lm': raise openllm.exceptions.OpenLLMException(f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})") logger.debug('Saving model with GPTQ quantisation will require loading model into memory.') @@ -124,20 +118,13 @@ def import_model(llm: openllm.LLM[M, T], else: architectures = getattr(config, 'architectures', []) if not architectures: - raise RuntimeError( - 'Failed to determine the architecture for this model. Make sure the `config.json` is valid and can be loaded with `transformers.AutoConfig`' - ) + raise RuntimeError('Failed to determine the architecture for this model. Make sure the `config.json` is valid and can be loaded with `transformers.AutoConfig`') architecture = architectures[0] update_model(bentomodel, metadata={'_pretrained_class': architecture}) if llm._local: # possible local path logger.debug('Model will be loaded into memory to save to target store as it is from local path.') - model = infer_autoclass_from_llm(llm, config).from_pretrained(llm.model_id, - *decls, - config=config, - trust_remote_code=trust_remote_code, - **hub_attrs, - **attrs) + model = infer_autoclass_from_llm(llm, config).from_pretrained(llm.model_id, *decls, config=config, trust_remote_code=trust_remote_code, **hub_attrs, **attrs) # for trust_remote_code to work bentomodel.enter_cloudpickle_context([importlib.import_module(model.__module__)], imported_modules) model.save_pretrained(bentomodel.path, max_shard_size='5GB', safe_serialization=safe_serialisation) @@ -149,8 +136,7 @@ def import_model(llm: openllm.LLM[M, T], else: bentomodel.flush() # type: ignore[no-untyped-call] bentomodel.save(_model_store) - openllm.utils.analytics.track( - openllm.utils.analytics.ModelSaveEvent(module=bentomodel.info.module, model_size_in_kb=openllm.utils.calc_dir_size(bentomodel.path) / 1024)) + openllm.utils.analytics.track(openllm.utils.analytics.ModelSaveEvent(module=bentomodel.info.module, model_size_in_kb=openllm.utils.calc_dir_size(bentomodel.path) / 1024)) finally: bentomodel.exit_cloudpickle_context(imported_modules) # NOTE: We need to free up the cache after importing the model @@ -171,8 +157,7 @@ def get(llm: openllm.LLM[M, T], auto_import: bool = False) -> bentoml.Model: if Version(model.info.api_version) < Version('v2'): raise openllm.exceptions.OpenLLMException('Please run "openllm prune -y --include-bentos" and upgrade all saved model to latest release.') if model.info.labels['backend'] != llm.__llm_backend__: - raise openllm.exceptions.OpenLLMException( - f"Model {model.tag} was saved with backend {model.info.labels['backend']}, while loading with {llm.__llm_backend__}.") + raise openllm.exceptions.OpenLLMException(f"Model {model.tag} was saved with backend {model.info.labels['backend']}, while loading with {llm.__llm_backend__}.") return model except Exception as err: if auto_import: return import_model(llm, trust_remote_code=llm.trust_remote_code) @@ -185,8 +170,7 @@ def load_model(llm: openllm.LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M: default=llm._serialisation_format == 'safetensors') if '_quantize' in llm._bentomodel.info.metadata and llm._bentomodel.info.metadata['_quantize'] == 'gptq': if not openllm.utils.is_autogptq_available(): - raise openllm.exceptions.OpenLLMException( - "GPTQ quantisation requires 'auto-gptq' (Not found in local environment). 
Install it with 'pip install \"openllm[gptq]\"'") + raise openllm.exceptions.OpenLLMException("GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'") if llm.config['model_type'] != 'causal_lm': raise openllm.exceptions.OpenLLMException(f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})") return autogptq.AutoGPTQForCausalLM.from_quantized(llm._bentomodel.path, diff --git a/openllm-python/src/openllm/serialisation/transformers/_helpers.py b/openllm-python/src/openllm/serialisation/transformers/_helpers.py index 469b9bb2..29fcf8af 100644 --- a/openllm-python/src/openllm/serialisation/transformers/_helpers.py +++ b/openllm-python/src/openllm/serialisation/transformers/_helpers.py @@ -23,8 +23,7 @@ if t.TYPE_CHECKING: from openllm_core._typing_compat import M from openllm_core._typing_compat import T else: - transformers, torch = openllm_core.utils.LazyLoader('transformers', globals(), - 'transformers'), openllm_core.utils.LazyLoader('torch', globals(), 'torch') + transformers, torch = openllm_core.utils.LazyLoader('transformers', globals(), 'transformers'), openllm_core.utils.LazyLoader('torch', globals(), 'torch') _object_setattr = object.__setattr__ @@ -45,11 +44,7 @@ def process_config(model_id: str, trust_remote_code: bool, **attrs: t.Any) -> tu if not isinstance(config, transformers.PretrainedConfig): copied_attrs = copy.deepcopy(attrs) if copied_attrs.get('torch_dtype', None) == 'auto': copied_attrs.pop('torch_dtype') - config, attrs = transformers.AutoConfig.from_pretrained(model_id, - return_unused_kwargs=True, - trust_remote_code=trust_remote_code, - **hub_attrs, - **copied_attrs) + config, attrs = transformers.AutoConfig.from_pretrained(model_id, return_unused_kwargs=True, trust_remote_code=trust_remote_code, **hub_attrs, **copied_attrs) return config, hub_attrs, attrs def infer_tokenizers_from_llm(__llm: openllm.LLM[t.Any, T], /) -> T: @@ -62,9 +57,7 @@ def infer_autoclass_from_llm(llm: openllm.LLM[M, T], config: transformers.Pretra if llm.config['trust_remote_code']: autoclass = 'AutoModelForSeq2SeqLM' if llm.config['model_type'] == 'seq2seq_lm' else 'AutoModelForCausalLM' if not hasattr(config, 'auto_map'): - raise ValueError( - f'Invalid configuraiton for {llm.model_id}. ``trust_remote_code=True`` requires `transformers.PretrainedConfig` to contain a `auto_map` mapping' - ) + raise ValueError(f'Invalid configuraiton for {llm.model_id}. ``trust_remote_code=True`` requires `transformers.PretrainedConfig` to contain a `auto_map` mapping') # in case this model doesn't use the correct auto class for model type, for example like chatglm # where it uses AutoModel instead of AutoModelForCausalLM. Then we fallback to AutoModel if autoclass not in config.auto_map: autoclass = 'AutoModel' @@ -84,7 +77,8 @@ def update_model(bentomodel: bentoml.Model, metadata: DictStrAny) -> bentoml.Mod based: DictStrAny = copy.deepcopy(bentomodel.info.metadata) based.update(metadata) _object_setattr( - bentomodel, '_info', + bentomodel, + '_info', ModelInfo( # type: ignore[call-arg] # XXX: remove me once upstream is merged tag=bentomodel.info.tag, module=bentomodel.info.module, @@ -102,9 +96,7 @@ def make_model_signatures(llm: openllm.LLM[M, T]) -> ModelSignaturesType: infer_fn: tuple[str, ...] 
= ('__call__',) default_config = ModelSignature(batchable=False) if llm.__llm_backend__ in {'pt', 'vllm'}: - infer_fn += ('forward', 'generate', 'contrastive_search', 'greedy_search', 'sample', 'beam_search', 'beam_sample', 'group_beam_search', - 'constrained_beam_search', - ) + infer_fn += ('forward', 'generate', 'contrastive_search', 'greedy_search', 'sample', 'beam_search', 'beam_sample', 'group_beam_search', 'constrained_beam_search',) elif llm.__llm_backend__ == 'tf': infer_fn += ('predict', 'call', 'generate', 'compute_transition_scores', 'greedy_search', 'sample', 'beam_search', 'contrastive_search',) else: diff --git a/openllm-python/src/openllm/testing.py b/openllm-python/src/openllm/testing.py index 36f55972..90043c9b 100644 --- a/openllm-python/src/openllm/testing.py +++ b/openllm-python/src/openllm/testing.py @@ -15,10 +15,7 @@ if t.TYPE_CHECKING: logger = logging.getLogger(__name__) @contextlib.contextmanager -def build_bento(model: str, - model_id: str | None = None, - quantize: t.Literal['int4', 'int8', 'gptq'] | None = None, - cleanup: bool = False) -> t.Iterator[bentoml.Bento]: +def build_bento(model: str, model_id: str | None = None, quantize: t.Literal['int4', 'int8', 'gptq'] | None = None, cleanup: bool = False) -> t.Iterator[bentoml.Bento]: logger.info('Building BentoML for %s', model) bento = openllm.build(model, model_id=model_id, quantize=quantize) yield bento diff --git a/openllm-python/tests/configuration_test.py b/openllm-python/tests/configuration_test.py index 43dda04a..a0262a6e 100644 --- a/openllm-python/tests/configuration_test.py +++ b/openllm-python/tests/configuration_test.py @@ -31,20 +31,12 @@ def test_missing_default(): with pytest.raises(ValueError, match='Missing required fields *'): make_llm_config('MissingModelId', {'default_id': 'huggingface/t5-tiny-testing', 'requirements': ['bentoml']}) with pytest.raises(ValueError, match='Missing required fields *'): - make_llm_config('MissingArchitecture', { - 'default_id': 'huggingface/t5-tiny-testing', - 'model_ids': ['huggingface/t5-tiny-testing'], - 'requirements': ['bentoml'], - }, - ) + make_llm_config('MissingArchitecture', {'default_id': 'huggingface/t5-tiny-testing', 'model_ids': ['huggingface/t5-tiny-testing'], 'requirements': ['bentoml'],},) def test_forbidden_access(): cl_ = make_llm_config( 'ForbiddenAccess', { - 'default_id': 'huggingface/t5-tiny-testing', - 'model_ids': ['huggingface/t5-tiny-testing', 'bentoml/t5-tiny-testing'], - 'architecture': 'PreTrainedModel', - 'requirements': ['bentoml'], + 'default_id': 'huggingface/t5-tiny-testing', 'model_ids': ['huggingface/t5-tiny-testing', 'bentoml/t5-tiny-testing'], 'architecture': 'PreTrainedModel', 'requirements': ['bentoml'], }, ) @@ -77,9 +69,7 @@ def test_config_derived_follow_attrs_protocol(gen_settings: ModelSettings): cl_ = make_llm_config('AttrsProtocolLLM', gen_settings) assert attr.has(cl_) -@given(model_settings(), st.integers(max_value=283473), st.floats(min_value=0.0, max_value=1.0), st.integers(max_value=283473), - st.floats(min_value=0.0, max_value=1.0), - ) +@given(model_settings(), st.integers(max_value=283473), st.floats(min_value=0.0, max_value=1.0), st.integers(max_value=283473), st.floats(min_value=0.0, max_value=1.0),) def test_complex_struct_dump(gen_settings: ModelSettings, field1: int, temperature: float, input_field1: int, input_temperature: float): cl_ = make_llm_config('ComplexLLM', gen_settings, fields=(('field1', 'float', field1),), generation_fields=(('temperature', temperature),),) sent = cl_() @@ -138,9 +128,7 @@ 
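The make_model_signatures hunk above boils down to a per-backend lookup of inference method names, each registered as non-batchable. A reduced sketch, with the method tuples copied from the hunk and the ModelSignature type collapsed to a plain dict:

import typing as t

_PT_FNS = ('forward', 'generate', 'contrastive_search', 'greedy_search', 'sample', 'beam_search', 'beam_sample', 'group_beam_search', 'constrained_beam_search')
_EXTRA_INFER_FNS: t.Dict[str, t.Tuple[str, ...]] = {
  'pt': _PT_FNS,
  'vllm': _PT_FNS,
  'tf': ('predict', 'call', 'generate', 'compute_transition_scores', 'greedy_search', 'sample', 'beam_search', 'contrastive_search'),
}

def make_signatures(backend: str) -> t.Dict[str, t.Dict[str, bool]]:
  # '__call__' is always registered; backend-specific methods are appended, all marked non-batchable.
  infer_fn = ('__call__',) + _EXTRA_INFER_FNS.get(backend, ())
  return {name: {'batchable': False} for name in infer_fn}

print(sorted(make_signatures('tf')))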
def test_struct_envvar_with_overwrite_provided_env(monkeypatch: pytest.MonkeyPat mk.setenv(field_env_key('field1'), str(4.0)) mk.setenv(field_env_key('temperature', suffix='generation'), str(0.2)) sent = make_llm_config('OverwriteWithEnvAvailable', { - 'default_id': 'asdfasdf', - 'model_ids': ['asdf', 'asdfasdfads'], - 'architecture': 'PreTrainedModel' + 'default_id': 'asdfasdf', 'model_ids': ['asdf', 'asdfasdfads'], 'architecture': 'PreTrainedModel' }, fields=(('field1', 'float', 3.0),), ).model_construct_env(field1=20.0, temperature=0.4) diff --git a/openllm-python/tests/models/conftest.py b/openllm-python/tests/models/conftest.py index 9d53a778..3d722661 100644 --- a/openllm-python/tests/models/conftest.py +++ b/openllm-python/tests/models/conftest.py @@ -73,8 +73,7 @@ class ResponseComparator(JSONSnapshotExtension): return s == t def eq_output(s: openllm.GenerationOutput, t: openllm.GenerationOutput) -> bool: - return (len(s.responses) == len(t.responses) and all([_s == _t for _s, _t in zip(s.responses, t.responses)]) and - eq_config(s.marshaled_config, t.marshaled_config)) + return (len(s.responses) == len(t.responses) and all([_s == _t for _s, _t in zip(s.responses, t.responses)]) and eq_config(s.marshaled_config, t.marshaled_config)) return len(serialized_data) == len(snapshot_data) and all([eq_output(s, t) for s, t in zip(serialized_data, snapshot_data)]) @@ -210,8 +209,7 @@ def _container_handle(model: str, detach=True, device_requests=devs, ports={ - '3000/tcp': port, - '3001/tcp': prom_port + '3000/tcp': port, '3001/tcp': prom_port }, ) diff --git a/openllm-python/tests/package_test.py b/openllm-python/tests/package_test.py index 9fdf7eea..b1b950c8 100644 --- a/openllm-python/tests/package_test.py +++ b/openllm-python/tests/package_test.py @@ -49,8 +49,7 @@ def test_general_build_from_local(tmp_path_factory: pytest.TempPathFactory): @pytest.fixture() def dockerfile_template(tmp_path_factory: pytest.TempPathFactory): file = tmp_path_factory.mktemp('dockerfiles') / 'Dockerfile.template' - file.write_text( - "{% extends bento_base_template %}\n{% block SETUP_BENTO_ENTRYPOINT %}\n{{ super() }}\nRUN echo 'sanity from custom dockerfile'\n{% endblock %}") + file.write_text("{% extends bento_base_template %}\n{% block SETUP_BENTO_ENTRYPOINT %}\n{{ super() }}\nRUN echo 'sanity from custom dockerfile'\n{% endblock %}") return file @pytest.mark.usefixtures('dockerfile_template') diff --git a/pyproject.toml b/pyproject.toml index 9ee7e1a1..8a6b57f7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -261,7 +261,7 @@ ignore_patterns = [ based_on_style = "google" INDENT_WIDTH = 2 JOIN_MULTIPLE_LINES = true -COLUMN_LIMIT = 152 +COLUMN_LIMIT = 192 USE_TABS = false BLANK_LINES_AROUND_TOP_LEVEL_DEFINITION = 1 BLANK_LINES_BETWEEN_TOP_LEVEL_IMPORTS_AND_VARIABLES = 1 @@ -282,6 +282,7 @@ SPACES_AROUND_TUPLE_DELIMITERS = false SPACE_BETWEEN_ENDING_COMMA_AND_CLOSING_BRACKET = false SPACE_INSIDE_BRACKETS = false SPLIT_ALL_COMMA_SEPARATED_VALUES = false +SPLIT_ALL_TOP_LEVEL_COMMA_SEPARATED_VALUES = true SPLIT_BEFORE_DOT = true [tool.pytest.ini_options] diff --git a/tools/dependencies.py b/tools/dependencies.py index ed98489d..9c50a26a 100755 --- a/tools/dependencies.py +++ b/tools/dependencies.py @@ -28,10 +28,7 @@ class Classifier: @staticmethod def status() -> dict[int, str]: - return { - v: status for v, status in zip(range( - 1, 8), ['1 - Planning', '2 - Pre-Alpha', '3 - Alpha', '4 - Beta', '5 - Production/Stable', '6 - Mature', '7 - Inactive']) - } + return {v: status for v, status in zip(range(1, 
8), ['1 - Planning', '2 - Pre-Alpha', '3 - Alpha', '4 - Beta', '5 - Production/Stable', '6 - Mature', '7 - Inactive'])} @staticmethod def apache() -> str: @@ -137,9 +134,7 @@ GPTQ_DEPS = ['auto-gptq[triton]'] VLLM_DEPS = ['vllm>=0.1.4', 'ray'] _base_requirements: dict[str, t.Any] = { - inflection.dasherize(name): config_cls.__openllm_requirements__ - for name, config_cls in openllm.CONFIG_MAPPING.items() - if config_cls.__openllm_requirements__ + inflection.dasherize(name): config_cls.__openllm_requirements__ for name, config_cls in openllm.CONFIG_MAPPING.items() if config_cls.__openllm_requirements__ } # shallow copy from locals() @@ -171,7 +166,8 @@ def create_classifiers() -> Array: Classifier.create_classifier('audience', 'Developers'), Classifier.create_classifier('audience', 'Science/Research'), Classifier.create_classifier('audience', 'System Administrators'), - Classifier.create_classifier('typing', 'Typed'), *Classifier.create_python_classifier(), + Classifier.create_classifier('typing', 'Typed'), + *Classifier.create_python_classifier(), ]) return arr.multiline(True) @@ -218,8 +214,23 @@ def authors() -> Array: def keywords() -> Array: arr = correct_style(tomlkit.array()) arr.extend([ - 'MLOps', 'AI', 'BentoML', 'Model Serving', 'Model Deployment', 'LLMOps', 'Falcon', 'Vicuna', 'Llama 2', 'Fine tuning', 'Serverless', - 'Large Language Model', 'Generative AI', 'StableLM', 'Alpaca', 'PyTorch', 'Transformers' + 'MLOps', + 'AI', + 'BentoML', + 'Model Serving', + 'Model Deployment', + 'LLMOps', + 'Falcon', + 'Vicuna', + 'Llama 2', + 'Fine tuning', + 'Serverless', + 'Large Language Model', + 'Generative AI', + 'StableLM', + 'Alpaca', + 'PyTorch', + 'Transformers' ]) return arr.multiline(True) diff --git a/tools/update-brew-tap.py b/tools/update-brew-tap.py index 7a9b816e..de11e05b 100755 --- a/tools/update-brew-tap.py +++ b/tools/update-brew-tap.py @@ -16,9 +16,7 @@ _OWNER = 'bentoml' _REPO = 'openllm' _gz_strategies: dict[t.Literal['macos_arm', 'macos_intel', 'linux_intel'], str] = { - 'macos_arm': 'aarch64-apple-darwin', - 'macos_intel': 'x86_64-apple-darwin', - 'linux_intel': 'x86_64-unknown-linux-musl' + 'macos_arm': 'aarch64-apple-darwin', 'macos_intel': 'x86_64-apple-darwin', 'linux_intel': 'x86_64-unknown-linux-musl' } def determine_release_url(svn_url: str, tag: str, target: t.Literal['macos_arm', 'macos_intel', 'linux_intel', 'archive']) -> str: @@ -34,9 +32,7 @@ def main() -> int: _info = api.repos.get() release_tag = api.repos.get_latest_release().name - shadict: dict[str, t.Any] = { - k: get_release_hash_command(determine_release_url(_info.svn_url, release_tag, k), release_tag)().strip() for k in _gz_strategies - } + shadict: dict[str, t.Any] = {k: get_release_hash_command(determine_release_url(_info.svn_url, release_tag, k), release_tag)().strip() for k in _gz_strategies} shadict['archive'] = get_release_hash_command(determine_release_url(_info.svn_url, release_tag, 'archive'), release_tag)().strip() ENVIRONMENT = Environment(extensions=['jinja2.ext.do', 'jinja2.ext.loopcontrols', 'jinja2.ext.debug'], diff --git a/tools/update-config-stubs.py b/tools/update-config-stubs.py index 5df89839..b22d7735 100755 --- a/tools/update-config-stubs.py +++ b/tools/update-config-stubs.py @@ -91,28 +91,23 @@ def main() -> int: # NOTE: inline stubs for _ConfigAttr type stubs config_attr_lines: list[str] = [] for keys, ForwardRef in codegen.get_annotations(ModelSettings).items(): - config_attr_lines.extend([ - ' ' * 4 + line for line in [ - f'__openllm_{keys}__: {_transformed.get(keys, 
process_annotations(ForwardRef.__forward_arg__))} = Field(None)\n', - f"'''{_value_docstring[keys]}'''\n", - ] - ]) + config_attr_lines.extend( + [' ' * 4 + line for line in [f'__openllm_{keys}__: {_transformed.get(keys, process_annotations(ForwardRef.__forward_arg__))} = Field(None)\n', f"'''{_value_docstring[keys]}'''\n",]]) # NOTE: inline runtime __getitem__ overload process lines: list[str] = [] lines.append(' ' * 2 + '# NOTE: ModelSettings arguments\n') for keys, ForwardRef in codegen.get_annotations(ModelSettings).items(): - lines.extend([ - ' ' * 2 + line for line in [ - '@overload\n', - f"def __getitem__(self, item: t.Literal['{keys}']) -> {_transformed.get(keys, process_annotations(ForwardRef.__forward_arg__))}: ...\n", - ] - ]) + lines.extend( + [' ' * 2 + line for line in ['@overload\n', f"def __getitem__(self, item: t.Literal['{keys}']) -> {_transformed.get(keys, process_annotations(ForwardRef.__forward_arg__))}: ...\n",]]) # special case variables: generation_class, extras, sampling_class lines.append(' ' * 2 + '# NOTE: generation_class, sampling_class and extras arguments\n') lines.extend([ ' ' * 2 + line for line in [ - '@overload\n', "def __getitem__(self, item: t.Literal['generation_class']) -> t.Type[openllm_core.GenerationConfig]: ...\n", '@overload\n', - "def __getitem__(self, item: t.Literal['sampling_class']) -> t.Type[openllm_core.SamplingParams]: ...\n", '@overload\n', + '@overload\n', + "def __getitem__(self, item: t.Literal['generation_class']) -> t.Type[openllm_core.GenerationConfig]: ...\n", + '@overload\n', + "def __getitem__(self, item: t.Literal['sampling_class']) -> t.Type[openllm_core.SamplingParams]: ...\n", + '@overload\n', "def __getitem__(self, item: t.Literal['extras']) -> t.Dict[str, t.Any]: ...\n", ] ]) @@ -128,11 +123,9 @@ def main() -> int: for keys in PeftType._member_names_: lines.extend([' ' * 2 + line for line in ['@overload\n', f"def __getitem__(self, item: t.Literal['{keys.lower()}']) -> dict[str, t.Any]: ...\n",]]) - processed = processed[:start_attrs_idx] + [' ' * 4 + START_ATTRS_COMMENT, *special_attrs_lines, ' ' * 4 + END_ATTRS_COMMENT - ] + processed[end_attrs_idx + 1:start_stub_idx] + [ - ' ' * 4 + START_SPECIAL_COMMENT, *config_attr_lines, ' ' * 4 + END_SPECIAL_COMMENT - ] + processed[end_stub_idx + 1:start_idx] + [' ' * 2 + START_COMMENT, *lines, ' ' * 2 + END_COMMENT - ] + processed[end_idx + 1:] + processed = processed[:start_attrs_idx] + [' ' * 4 + START_ATTRS_COMMENT, *special_attrs_lines, ' ' * 4 + END_ATTRS_COMMENT] + processed[end_attrs_idx + 1:start_stub_idx] + [ + ' ' * 4 + START_SPECIAL_COMMENT, *config_attr_lines, ' ' * 4 + END_SPECIAL_COMMENT + ] + processed[end_stub_idx + 1:start_idx] + [' ' * 2 + START_COMMENT, *lines, ' ' * 2 + END_COMMENT] + processed[end_idx + 1:] with _TARGET_FILE.open('w') as f: f.writelines(processed) return 0 diff --git a/tools/update-dummy.py b/tools/update-dummy.py index f3aac8d8..28fbdb99 100755 --- a/tools/update-dummy.py +++ b/tools/update-dummy.py @@ -13,9 +13,7 @@ from openllm import CONFIG_MAPPING if t.TYPE_CHECKING: from collections import OrderedDict -config_requirements = { - k: [_.replace('-', '_') for _ in v.__openllm_requirements__] if v.__openllm_requirements__ else None for k, v in CONFIG_MAPPING.items() -} +config_requirements = {k: [_.replace('-', '_') for _ in v.__openllm_requirements__] if v.__openllm_requirements__ else None for k, v in CONFIG_MAPPING.items()} _dependencies: dict[LiteralBackend, str] = {k: v for k, v in zip(LiteralBackend.__args__[:-2], ('torch', 'tensorflow', 
'flax', 'vllm'))} _auto: dict[str, str] = {k: v for k, v in zip(LiteralBackend.__args__[:-2], ('AutoLLM', 'AutoTFLLM', 'AutoFlaxLLM', 'AutoVLLM'))} @@ -30,23 +28,24 @@ def get_mapping(backend: LiteralBackend) -> OrderedDict[t.Any, t.Any]: def make_class_stub(model_name: str, backend: LiteralBackend, indentation: int = 2, auto: bool = False) -> list[str]: _dep_list: list[str] = [ - f'"{v}"' for v in [ - _dependencies[backend], *( - t.cast(t.List[str], config_requirements[model_name]) if model_name != '__default__' and config_requirements[model_name] else []) - ] + f'"{v}"' for v in [_dependencies[backend], *(t.cast(t.List[str], config_requirements[model_name]) if model_name != '__default__' and config_requirements[model_name] else [])] ] if auto: cl_ = _auto[backend] else: cl_ = get_mapping(backend)[model_name] lines = [ - f'class {cl_}(metaclass=_DummyMetaclass):', ' ' * indentation + f"_backends=[{','.join(_dep_list)}]", + f'class {cl_}(metaclass=_DummyMetaclass):', + ' ' * indentation + f"_backends=[{','.join(_dep_list)}]", ' ' * indentation + f"def __init__(self,*param_decls:_t.Any,**attrs: _t.Any):_require_backends(self,[{','.join(_dep_list)}])" ] return lines def write_stub(backend: LiteralBackend, _path: str) -> list[str]: base = [ - f'# This file is generated by {_path}. DO NOT EDIT MANUALLY!', f'# To update this, run ./{_path}', 'from __future__ import annotations', - 'import typing as _t', 'from openllm_core.utils import DummyMetaclass as _DummyMetaclass, require_backends as _require_backends', + f'# This file is generated by {_path}. DO NOT EDIT MANUALLY!', + f'# To update this, run ./{_path}', + 'from __future__ import annotations', + 'import typing as _t', + 'from openllm_core.utils import DummyMetaclass as _DummyMetaclass, require_backends as _require_backends', ] base.extend([v for it in [make_class_stub(k, backend) for k in get_mapping(backend)] for v in it]) # autoclass diff --git a/tools/update-models-import.py b/tools/update-models-import.py index f401a6da..709b6f5c 100755 --- a/tools/update-models-import.py +++ b/tools/update-models-import.py @@ -11,9 +11,10 @@ def create_module_import() -> str: def create_stubs_import() -> list[str]: return [ - 'if t.TYPE_CHECKING:from . import ' + - ','.join([f'{p.name} as {p.name}' for p in sorted(_TARGET_FILE.parent.glob('*/')) if p.name not in {'__pycache__', '__init__.py', '.DS_Store'}]), - '__lazy=LazyModule(__name__, globals()["__file__"], {k: [] for k in _MODELS})', '__all__=__lazy.__all__', '__dir__=__lazy.__dir__', + 'if t.TYPE_CHECKING:from . import ' + ','.join([f'{p.name} as {p.name}' for p in sorted(_TARGET_FILE.parent.glob('*/')) if p.name not in {'__pycache__', '__init__.py', '.DS_Store'}]), + '__lazy=LazyModule(__name__, globals()["__file__"], {k: [] for k in _MODELS})', + '__all__=__lazy.__all__', + '__dir__=__lazy.__dir__', '__getattr__=__lazy.__getattr__\n' ] @@ -21,9 +22,13 @@ def main() -> int: _path = os.path.join(os.path.basename(os.path.dirname(__file__)), os.path.basename(__file__)) with _TARGET_FILE.open('w') as f: f.writelines('\n'.join([ - f'# This file is generated by {_path}. DO NOT EDIT MANUALLY!', f'# To update this, run ./{_path}', 'from __future__ import annotations', - 'import typing as t', 'from openllm_core.utils import LazyModule', - create_module_import(), *create_stubs_import(), + f'# This file is generated by {_path}. 
DO NOT EDIT MANUALLY!', + f'# To update this, run ./{_path}', + 'from __future__ import annotations', + 'import typing as t', + 'from openllm_core.utils import LazyModule', + create_module_import(), + *create_stubs_import(), ])) return 0 diff --git a/tools/update-readme.py b/tools/update-readme.py index 0bb2f9de..4e9101ea 100755 --- a/tools/update-readme.py +++ b/tools/update-readme.py @@ -18,11 +18,7 @@ def main() -> int: start_index, stop_index = readme.index(START_COMMENT), readme.index(END_COMMENT) formatted: dict[t.Literal['Model', 'Architecture', 'URL', 'Installation', 'Model Ids'], list[str | list[str]]] = { - 'Model': [], - 'Architecture': [], - 'URL': [], - 'Model Ids': [], - 'Installation': [], + 'Model': [], 'Architecture': [], 'URL': [], 'Model Ids': [], 'Installation': [], } max_install_len_div = 0 for name, config_cls in openllm.CONFIG_MAPPING.items():