From 72c6005d3bda44d8921e01fa59640749c38c6f2e Mon Sep 17 00:00:00 2001 From: Aaron Pham <29749331+aarnphm@users.noreply.github.com> Date: Sat, 4 Nov 2023 04:01:56 -0400 Subject: [PATCH] chore(inference): update vllm to 0.2.1.post1 and update config parsing (#554) chore(dependencies): update vllm to 0.2.1.post1 and update config parsing --- .../src/openllm_core/_configuration.py | 22 ++++++++++++++++++- openllm-python/pyproject.toml | 2 +- tools/dependencies.py | 2 +- 3 files changed, 23 insertions(+), 3 deletions(-) diff --git a/openllm-core/src/openllm_core/_configuration.py b/openllm-core/src/openllm_core/_configuration.py index 61c5d14a..d398809a 100644 --- a/openllm-core/src/openllm_core/_configuration.py +++ b/openllm-core/src/openllm_core/_configuration.py @@ -390,6 +390,8 @@ class SamplingParams(ReprMixin): stop: t.List[str] = dantic.Field(None, description='List of strings that stop the generation when they are generated. The returned output will not contain the stop strings.') ignore_eos: bool = dantic.Field(False, description='Whether to ignore the EOS token and continue generating tokens after the EOS token is generated.') logprobs: int = dantic.Field(None, description='Number of log probabilities to return per output token.') + prompt_logprobs: t.Optional[int] = dantic.Field(None, description='Number of log probabilities to return per input token.') + skip_special_tokens: bool = dantic.Field(True, description='Whether to skip special tokens in the generated output.') if t.TYPE_CHECKING: max_tokens: int @@ -407,6 +409,9 @@ class SamplingParams(ReprMixin): _object_setattr(self, 'temperature', attrs.pop('temperature', 1.0)) _object_setattr(self, 'top_k', attrs.pop('top_k', -1)) _object_setattr(self, 'top_p', attrs.pop('top_p', 1.0)) + _object_setattr(self, 'repetition_penalty', attrs.pop('repetition_penalty', 1.0)) + _object_setattr(self, 'length_penalty', attrs.pop('length_penalty', 1.0)) + _object_setattr(self, 'early_stopping', attrs.pop('early_stopping', False)) self.__attrs_init__(**attrs) def __getitem__(self, item: str) -> t.Any: @@ -432,7 +437,18 @@ class SamplingParams(ReprMixin): top_k = first_not_none(attrs.pop('top_k', None), default=generation_config['top_k']) top_p = first_not_none(attrs.pop('top_p', None), default=generation_config['top_p']) max_tokens = first_not_none(attrs.pop('max_tokens', None), attrs.pop('max_new_tokens', None), default=generation_config['max_new_tokens']) - return cls(_internal=True, temperature=temperature, top_k=top_k, top_p=top_p, max_tokens=max_tokens, **attrs) + repetition_penalty = first_not_none(attrs.pop('repetition_penalty', None), default=generation_config['repetition_penalty']) + length_penalty = first_not_none(attrs.pop('length_penalty', None), default=generation_config['length_penalty']) + early_stopping = first_not_none(attrs.pop('early_stopping', None), default=generation_config['early_stopping']) + return cls(_internal=True, + temperature=temperature, + top_k=top_k, + top_p=top_p, + max_tokens=max_tokens, + repetition_penalty=repetition_penalty, + length_penalty=length_penalty, + early_stopping=early_stopping, + **attrs) bentoml_cattr.register_unstructure_hook_factory( lambda cls: attr.has(cls) and lenient_issubclass(cls, SamplingParams), lambda cls: make_dict_unstructure_fn( @@ -1234,6 +1250,10 @@ class LLMConfig(_ConfigAttr): def __getitem__(self, item: t.Literal['ignore_eos']) -> bool: ... @overload def __getitem__(self, item: t.Literal['logprobs']) -> int: ... + @overload + def __getitem__(self, item: t.Literal['prompt_logprobs']) -> t.Optional[int]: ... + @overload + def __getitem__(self, item: t.Literal['skip_special_tokens']) -> bool: ... # NOTE: PeftType arguments @overload def __getitem__(self, item: t.Literal['prompt_tuning']) -> dict[str, t.Any]: ... diff --git a/openllm-python/pyproject.toml b/openllm-python/pyproject.toml index dacfb04d..851ebb45 100644 --- a/openllm-python/pyproject.toml +++ b/openllm-python/pyproject.toml @@ -115,7 +115,7 @@ openai = ["openai[embeddings]", "tiktoken"] opt = ["flax>=0.7", "jax", "jaxlib", "tensorflow", "keras"] playground = ["jupyter", "notebook", "ipython", "jupytext", "nbformat"] starcoder = ["bitsandbytes"] -vllm = ["vllm>=0.2.0", "ray"] +vllm = ["vllm>=0.2.1post1", "ray"] [tool.hatch.version] fallback-version = "0.0.0" diff --git a/tools/dependencies.py b/tools/dependencies.py index 84fed2f2..368a3215 100755 --- a/tools/dependencies.py +++ b/tools/dependencies.py @@ -133,7 +133,7 @@ AGENTS_DEPS = ['transformers[agents]>=4.30', 'diffusers', 'soundfile'] PLAYGROUND_DEPS = ['jupyter', 'notebook', 'ipython', 'jupytext', 'nbformat'] GGML_DEPS = ['ctransformers'] GPTQ_DEPS = ['auto-gptq[triton]>=0.4.2', 'optimum>=1.12.0'] -VLLM_DEPS = ['vllm>=0.2.0', 'ray'] +VLLM_DEPS = ['vllm>=0.2.1post1', 'ray'] _base_requirements: dict[str, t.Any] = { inflection.dasherize(name): config_cls.__openllm_requirements__ for name, config_cls in openllm.CONFIG_MAPPING.items() if config_cls.__openllm_requirements__