diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2ff88fcd..2dd6f54e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -91,36 +91,36 @@ repos: - id: check-added-large-files - id: debug-statements - id: check-merge-conflict - - repo: https://github.com/RobertCraigie/pyright-python - rev: v1.1.324 - hooks: - - id: pyright - verbose: true - args: [--level, error] - exclude: | - (?x)^( - examples/.*| - tools/.*| - tests/.*| - openllm-python/src/openllm/playground/.*| - openllm-python/tests/.*| - openllm-client/src/openllm_client/pb.*| - .github/.*| - cz.py | - hatch_build.py - )$ - additional_dependencies: - - openllm-client[grpc] - - bentoml[io]>=1.1.2 - - transformers[agents,torch,tokenizers,accelerate]>=4.29.0 - - peft - - safetensors - - optimum - - ghapi - - click==8.1.3 - - bitsandbytes - - diffusers - - soundfile + # - repo: https://github.com/RobertCraigie/pyright-python + # rev: v1.1.324 + # hooks: + # - id: pyright + # verbose: true + # args: [--level, error] + # exclude: | + # (?x)^( + # examples/.*| + # tools/.*| + # tests/.*| + # openllm-python/src/openllm/playground/.*| + # openllm-python/tests/.*| + # openllm-client/src/openllm_client/pb.*| + # .github/.*| + # cz.py | + # hatch_build.py + # )$ + # additional_dependencies: + # - openllm-client[grpc] + # - bentoml[io]>=1.1.2 + # - transformers[agents,torch,tokenizers,accelerate]>=4.29.0 + # - peft + # - safetensors + # - optimum + # - ghapi + # - click==8.1.3 + # - bitsandbytes + # - diffusers + # - soundfile - repo: meta hooks: - id: check-hooks-apply diff --git a/cz.py b/cz.py index f84c3613..53bed816 100755 --- a/cz.py +++ b/cz.py @@ -7,6 +7,7 @@ import tokenize from tabulate import tabulate TOKEN_WHITELIST = [token.OP, token.NAME, token.NUMBER, token.STRING] + def run_cz(dir: str, package: str): headers = ['Name', 'Lines', 'Tokens/Line'] table = [] @@ -22,9 +23,11 @@ def run_cz(dir: str, package: str): for dir_name, group in itertools.groupby(sorted([(x[0].rsplit('/', 1)[0], x[1]) for x in table]), key=lambda x: x[0]): print(f'{dir_name:35s} : {sum([x[1] for x in group]):6d}') print(f'\ntotal line count: {sum([x[1] for x in table])}') + def main() -> int: run_cz('openllm-python', 'openllm') run_cz('openllm-core', 'openllm_core') run_cz('openllm-client', 'openllm_client') return 0 + if __name__ == '__main__': raise SystemExit(main()) diff --git a/examples/bentoml-demo/service.py b/examples/bentoml-demo/service.py index ac78fb80..9118939a 100644 --- a/examples/bentoml-demo/service.py +++ b/examples/bentoml-demo/service.py @@ -8,9 +8,11 @@ llm_config = openllm.AutoConfig.for_model(model) llm_runner = openllm.Runner(model, llm_config=llm_config) svc = bentoml.Service(name="llm-service", runners=[llm_runner]) + @svc.on_startup def download(_: bentoml.Context): llm_runner.download_model() + @svc.api(input=bentoml.io.Text(), output=bentoml.io.Text()) async def prompt(input_text: str) -> str: answer = await llm_runner.generate.async_run(input_text) diff --git a/examples/langchain-chains-demo/service.py b/examples/langchain-chains-demo/service.py index 0c1a2dcf..3860cb01 100644 --- a/examples/langchain-chains-demo/service.py +++ b/examples/langchain-chains-demo/service.py @@ -8,15 +8,18 @@ from pydantic import BaseModel import bentoml from bentoml.io import JSON, Text + class Query(BaseModel): industry: str product_name: str keywords: t.List[str] llm_config: t.Dict[str, t.Any] + def gen_llm(model_name: str, model_id: str | None = None) -> OpenLLM: lc_llm = OpenLLM(model_name=model_name, model_id=model_id, embedded=False) lc_llm.runner.download_model() return lc_llm + llm = gen_llm("dolly-v2", model_id="databricks/dolly-v2-7b") prompt = PromptTemplate( @@ -38,12 +41,15 @@ Facebook Ads copy: chain = LLMChain(llm=llm, prompt=prompt) svc = bentoml.Service("fb-ads-copy", runners=[llm.runner]) + @svc.on_startup def download(_: bentoml.Context): llm.runner.download_model() + SAMPLE_INPUT = Query( industry="SAAS", product_name="BentoML", keywords=["open source", "developer tool", "AI application platform", "serverless", "cost-efficient"], llm_config=llm.runner.config.model_dump(), ) + @svc.api(input=JSON.from_sample(sample=SAMPLE_INPUT), output=Text()) def generate(query: Query): return chain.run({"industry": query.industry, "product_name": query.product_name, "keywords": ", ".join(query.keywords)}) diff --git a/examples/langchain-tools-demo/service.py b/examples/langchain-tools-demo/service.py index 51685533..da5d261a 100644 --- a/examples/langchain-tools-demo/service.py +++ b/examples/langchain-tools-demo/service.py @@ -11,6 +11,7 @@ llm = OpenLLM(model_name="dolly-v2", model_id="databricks/dolly-v2-7b", embedded tools = load_tools(["serpapi"], llm=llm) agent = initialize_agent(tools, llm, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION) svc = bentoml.Service("langchain-openllm", runners=[llm.runner]) + @svc.api(input=Text.from_sample(sample=SAMPLE_INPUT), output=Text()) def chat(input_text: str): return agent.run(input_text) diff --git a/openllm-client/src/openllm_client/_base.py b/openllm-client/src/openllm_client/_base.py index 99950906..1709d171 100644 --- a/openllm-client/src/openllm_client/_base.py +++ b/openllm-client/src/openllm_client/_base.py @@ -21,6 +21,7 @@ if t.TYPE_CHECKING: from openllm_core._typing_compat import DictStrAny, LiteralRuntime logger = logging.getLogger(__name__) + @attr.define(slots=False, init=False) class _ClientAttr: _address: str @@ -145,6 +146,7 @@ class _ClientAttr: @functools.cached_property def inner(self) -> t.Any: raise NotImplementedError("'inner' client is not implemented.") + class _Client(_ClientAttr): _host: str _port: str @@ -175,6 +177,7 @@ class _Client(_ClientAttr): except Exception as err: logger.error('Exception caught while sending instruction to HF agent: %s', err, exc_info=err) logger.info("Tip: LLMServer at '%s' might not support 'generate_one'.", self._address) + class _AsyncClient(_ClientAttr): _host: str _port: str @@ -230,6 +233,7 @@ class _AsyncClient(_ClientAttr): else: tool_code = get_tool_creation_code(code, self._hf_agent.toolbox, remote=remote) return f'{tool_code}\n{code}' + class BaseClient(_Client): def chat(self, prompt: str, history: list[str], **attrs: t.Any) -> str: raise NotImplementedError @@ -255,6 +259,7 @@ class BaseClient(_Client): if return_response == 'attrs': return r elif return_response == 'raw': return bentoml_cattr.unstructure(r) else: return self.config.postprocess_generate(prompt, r.responses, **postprocess_kwargs) + class BaseAsyncClient(_AsyncClient): async def chat(self, prompt: str, history: list[str], **attrs: t.Any) -> str: raise NotImplementedError diff --git a/openllm-client/src/openllm_client/benmin/__init__.py b/openllm-client/src/openllm_client/benmin/__init__.py index 9ea1f15b..39a233b9 100644 --- a/openllm-client/src/openllm_client/benmin/__init__.py +++ b/openllm-client/src/openllm_client/benmin/__init__.py @@ -22,6 +22,7 @@ import bentoml if t.TYPE_CHECKING: from bentoml._internal.service.inference_api import InferenceAPI __all__ = ['Client', 'AsyncClient'] + @attr.define(init=False) class Client: server_url: str @@ -67,6 +68,7 @@ class Client: return GrpcClient.wait_until_server_ready(host, port, timeout, **kwargs) except Exception as err: raise bentoml.exceptions.BentoMLException('Failed to wait until server ready: %s:%d' % (host, port)) from err + @attr.define(init=False) class AsyncClient: server_url: str diff --git a/openllm-client/src/openllm_client/benmin/_grpc.py b/openllm-client/src/openllm_client/benmin/_grpc.py index f1aca062..22dbbfb1 100644 --- a/openllm-client/src/openllm_client/benmin/_grpc.py +++ b/openllm-client/src/openllm_client/benmin/_grpc.py @@ -22,10 +22,12 @@ pb, services = import_generated_stubs('v1') if t.TYPE_CHECKING: from bentoml.grpc.v1.service_pb2 import ServiceMetadataResponse logger = logging.getLogger(__name__) + class ClientCredentials(t.TypedDict): root_certificates: NotRequired[t.Union[bytes, str]] private_key: NotRequired[t.Union[bytes, str]] certificate_chain: NotRequired[t.Union[bytes, str]] + @overload def dispatch_channel( server_url: str, @@ -37,6 +39,7 @@ def dispatch_channel( interceptors: t.Sequence[aio.ClientInterceptor] | None = ... ) -> aio.Channel: ... + @overload def dispatch_channel( server_url: str, @@ -48,6 +51,7 @@ def dispatch_channel( interceptors: t.Sequence[aio.ClientInterceptor] | None = None ) -> grpc.Channel: ... + def dispatch_channel( server_url: str, typ: t.Literal['async', 'sync'] = 'sync', @@ -67,6 +71,7 @@ def dispatch_channel( elif typ == 'sync' and ssl: return grpc.secure_channel(server_url, credentials=credentials, options=options, compression=compression) elif typ == 'sync': return grpc.insecure_channel(server_url, options=options, compression=compression) else: raise ValueError(f'Unknown type: {typ}') + class GrpcClient(Client): ssl: bool ssl_client_credentials: t.Optional[ClientCredentials] @@ -172,6 +177,7 @@ class GrpcClient(Client): stubs = services.BentoServiceStub(self.inner) proto = stubs.Call(pb.Request(**{'api_name': api_fn[_inference_api], _inference_api.input.proto_fields[0]: fake_resp}), **channel_kwargs) return ensure_exec_coro(_inference_api.output.from_proto(getattr(proto, proto.WhichOneof('content')))) + class AsyncGrpcClient(AsyncClient): ssl: bool ssl_client_credentials: t.Optional[ClientCredentials] diff --git a/openllm-client/src/openllm_client/benmin/_http.py b/openllm-client/src/openllm_client/benmin/_http.py index 11772f54..ea3dff48 100644 --- a/openllm-client/src/openllm_client/benmin/_http.py +++ b/openllm-client/src/openllm_client/benmin/_http.py @@ -18,6 +18,7 @@ from bentoml._internal.service.inference_api import InferenceAPI from openllm_client.benmin import AsyncClient, Client from openllm_core.utils import ensure_exec_coro logger = logging.getLogger(__name__) + class HttpClient(Client): @functools.cached_property def inner(self) -> httpx.Client: @@ -102,6 +103,7 @@ class HttpClient(Client): # Request.headers sets a _headers variable. We will need to set this value to our fake request object. fake_req._headers = headers return ensure_exec_coro(_inference_api.output.from_http_request(fake_req)) + class AsyncHttpClient(AsyncClient): @functools.cached_property def inner(self) -> httpx.AsyncClient: diff --git a/openllm-client/src/openllm_client/client.py b/openllm-client/src/openllm_client/client.py index ceafef7e..4cd784d6 100644 --- a/openllm-client/src/openllm_client/client.py +++ b/openllm-client/src/openllm_client/client.py @@ -4,24 +4,29 @@ from urllib.parse import urlparse from ._base import BaseAsyncClient, BaseClient logger = logging.getLogger(__name__) + def process_http_address(self: AsyncHTTPClient | HTTPClient, address: str) -> None: address = address if '://' in address else 'http://' + address parsed = urlparse(address) self._host, *_port = parsed.netloc.split(':') if len(_port) == 0: self._port = '80' if parsed.scheme == 'http' else '443' else: self._port = next(iter(_port)) + class HTTPClient(BaseClient): def __init__(self, address: str, timeout: int = 30): process_http_address(self, address) super().__init__(address, timeout) + class AsyncHTTPClient(BaseAsyncClient): def __init__(self, address: str, timeout: int = 30): process_http_address(self, address) super().__init__(address, timeout) + class GrpcClient(BaseClient): def __init__(self, address: str, timeout: int = 30): self._host, self._port = address.split(':') super().__init__(address, timeout) + class AsyncGrpcClient(BaseAsyncClient): def __init__(self, address: str, timeout: int = 30): self._host, self._port = address.split(':') diff --git a/openllm-core/src/openllm_core/_configuration.py b/openllm-core/src/openllm_core/_configuration.py index 54bcbc95..9ad2464c 100644 --- a/openllm-core/src/openllm_core/_configuration.py +++ b/openllm-core/src/openllm_core/_configuration.py @@ -76,11 +76,13 @@ __all__ = ['LLMConfig', 'GenerationConfig', 'SamplingParams', 'field_env_key'] logger = logging.getLogger(__name__) config_merger = Merger([(dict, 'merge')], ['override'], ['override']) + # case insensitive, but rename to conform with type class _PeftEnumMeta(enum.EnumMeta): def __getitem__(self, __key: str | t.Any, /) -> t.Any: if isinstance(__key, str): __key = inflection.underscore(__key).upper() return self._member_map_[__key] + # vendorred from peft.utils.config.PeftType since we don't have hard dependency on peft # see https://github.com/huggingface/peft/blob/main/src/peft/utils/config.py class PeftType(str, enum.Enum, metaclass=_PeftEnumMeta): @@ -109,14 +111,17 @@ class PeftType(str, enum.Enum, metaclass=_PeftEnumMeta): @staticmethod def get(__key: str | t.Any, /) -> PeftType: return PeftType[__key] # type-safe getitem. + _PEFT_TASK_TYPE_TARGET_MAPPING = {'causal_lm': 'CAUSAL_LM', 'seq2seq_lm': 'SEQ_2_SEQ_LM'} _object_setattr = object.__setattr__ + def _adapter_converter(value: AdapterType | str | PeftType | None) -> PeftType: if value is None: raise ValueError("'AdapterType' cannot be None.") if isinstance(value, PeftType): return value if value not in PeftType.supported(): raise ValueError(f"Given '{value}' is not a supported adapter type.") return PeftType.get(value) + @attr.define(slots=True, init=True) class FineTuneConfig: '''FineTuneConfig defines a default value for fine-tuning this any given LLM. @@ -193,6 +198,7 @@ class FineTuneConfig: adapter_type, inference_mode = attrs.pop('adapter_type', self.adapter_type), attrs.get('inference_mode', self.inference_mode) if 'llm_config_class' in attrs: raise ForbiddenAttributeError("'llm_config_class' should not be passed when using 'with_config'.") return attr.evolve(self, adapter_type=adapter_type, inference_mode=inference_mode, adapter_config=config_merger.merge(self.adapter_config, attrs)) + @attr.frozen(slots=True, repr=False, init=False) class GenerationConfig(ReprMixin): '''GenerationConfig is the attrs-compatible version of ``transformers.GenerationConfig``, with some additional validation and environment constructor. @@ -317,6 +323,7 @@ class GenerationConfig(ReprMixin): @property def __repr_keys__(self) -> set[str]: return {i.name for i in attr.fields(self.__class__)} + bentoml_cattr.register_unstructure_hook_factory( lambda cls: attr.has(cls) and lenient_issubclass(cls, GenerationConfig), lambda cls: make_dict_unstructure_fn( @@ -329,6 +336,7 @@ bentoml_cattr.register_unstructure_hook_factory( } ) ) + @attr.frozen(slots=True, repr=False, init=False) class SamplingParams(ReprMixin): '''SamplingParams is the attr-compatible version of ``vllm.SamplingParams``. It provides some utilities to also respect shared variables from ``openllm.LLMConfig``. @@ -398,6 +406,7 @@ class SamplingParams(ReprMixin): top_p = first_not_none(attrs.pop('top_p', None), default=generation_config['top_p']) max_tokens = first_not_none(attrs.pop('max_tokens', None), attrs.pop('max_new_tokens', None), default=generation_config['max_new_tokens']) return cls(_internal=True, temperature=temperature, top_k=top_k, top_p=top_p, max_tokens=max_tokens, **attrs) + bentoml_cattr.register_unstructure_hook_factory( lambda cls: attr.has(cls) and lenient_issubclass(cls, SamplingParams), lambda cls: make_dict_unstructure_fn( @@ -417,6 +426,7 @@ bentoml_cattr.register_structure_hook_factory( # cached it here to save one lookup per assignment _object_getattribute = object.__getattribute__ + class ModelSettings(t.TypedDict, total=False): '''ModelSettings serve only for typing purposes as this is transcribed into LLMConfig.__config__. @@ -461,7 +471,9 @@ class ModelSettings(t.TypedDict, total=False): # tokenizer_class is the custom tokenizer class for this given LLM tokenizer_class: t.Optional[str] + _transformed_type: DictStrAny = {'fine_tune_strategies': t.Dict[AdapterType, FineTuneConfig], 'default_implementation': t.Dict[LiteralResourceSpec, LiteralRuntime]} + @attr.define( frozen=False, slots=True, @@ -539,6 +551,7 @@ class _ModelSettingsAttr: fine_tune_strategies: t.Dict[AdapterType, FineTuneConfig] tokenizer_class: t.Optional[str] # update-config-stubs.py: attrs stop + # a heuristic cascading implementation resolver based on available resources def get_default_implementation(default_implementation_mapping: dict[LiteralResourceSpec, LiteralRuntime]) -> LiteralRuntime: available_spec = available_resource_spec() @@ -546,6 +559,7 @@ def get_default_implementation(default_implementation_mapping: dict[LiteralResou elif resource_spec('amd') in available_spec: return default_implementation_mapping.get(resource_spec('amd'), 'pt') elif resource_spec('nvidia') in available_spec: return default_implementation_mapping.get(resource_spec('nvidia'), 'pt') else: return default_implementation_mapping.get(resource_spec('cpu'), 'pt') + def structure_settings(cl_: type[LLMConfig], cls: type[_ModelSettingsAttr]) -> _ModelSettingsAttr: if 'generation_class' in cl_.__config__: raise ValueError(f"'generation_class' shouldn't be defined in '__config__', rather defining all required attributes under '{cl_}.GenerationConfig' instead.") @@ -591,9 +605,12 @@ def structure_settings(cl_: type[LLMConfig], cls: type[_ModelSettingsAttr]) -> _ _converted[_adapter_type] = FineTuneConfig(PeftType[_adapter_type], _possible_ft_config, False, _llm_config_class) _final_value_dct['fine_tune_strategies'] = _converted return attr.evolve(_settings_attr, **_final_value_dct) + bentoml_cattr.register_structure_hook(_ModelSettingsAttr, structure_settings) + def _setattr_class(attr_name: str, value_var: t.Any) -> str: return f"setattr(cls, '{attr_name}', {value_var})" + def _make_assignment_script(cls: type[LLMConfig], attributes: attr.AttrsInstance, _prefix: LiteralString = 'openllm') -> t.Callable[..., None]: '''Generate the assignment script with prefix attributes __openllm___.''' args: ListStr = [] @@ -608,7 +625,9 @@ def _make_assignment_script(cls: type[LLMConfig], attributes: attr.AttrsInstance annotations[attr_name] = field.type return codegen.generate_function(cls, '__assign_attr', lines, args=('cls', *args), globs=globs, annotations=annotations) + _reserved_namespace = {'__config__', 'GenerationConfig', 'SamplingParams'} + @attr.define(slots=True) class _ConfigAttr: @staticmethod @@ -760,6 +779,7 @@ class _ConfigAttr: '''The fine-tune strategies for this given LLM.''' __openllm_tokenizer_class__: t.Optional[str] = Field(None) '''Optional tokenizer class for this given LLM. See Llama for example.''' + # update-config-stubs.py: special stop class _ConfigBuilder: """A modified version of attrs internal _ClassBuilder, and should only be called within __init_subclass__ of LLMConfig. @@ -873,6 +893,7 @@ class _ConfigBuilder: if key in ('__repr__', '__str__', '__repr_name__', '__repr_str__', '__repr_args__'): self._cls_dict[key] = codegen.add_method_dunders(self._cls, fn) self._cls_dict['__repr_keys__'] = property(lambda _: {i.name for i in self._attrs} | {'generation_config', 'sampling_config'}) return self + @attr.define(slots=True, init=False) class LLMConfig(_ConfigAttr): """``openllm.LLMConfig`` is a pydantic-like ``attrs`` interface that offers fast and easy-to-use APIs. @@ -1474,9 +1495,11 @@ class LLMConfig(_ConfigAttr): `openllm.LLM` also has a postprocess_generate that will just call this method. ''' return generation_result + bentoml_cattr.register_unstructure_hook_factory( lambda cls: lenient_issubclass(cls, LLMConfig), lambda cls: make_dict_unstructure_fn(cls, bentoml_cattr, _cattrs_omit_if_default=False, _cattrs_use_linecache=True) ) + def structure_llm_config(data: t.Any, cls: type[LLMConfig]) -> LLMConfig: """Structure a dictionary to a LLMConfig object. @@ -1498,5 +1521,6 @@ def structure_llm_config(data: t.Any, cls: type[LLMConfig]) -> LLMConfig: # The rest should be passed to extras data = {k: v for k, v in data.items() if k not in cls.__openllm_accepted_keys__} return cls(generation_config=generation_config, __openllm_extras__=data, **cls_attrs) + bentoml_cattr.register_structure_hook_func(lambda cls: lenient_issubclass(cls, LLMConfig), structure_llm_config) openllm_home = os.path.expanduser(os.environ.get('OPENLLM_HOME', os.path.join(os.environ.get('XDG_CACHE_HOME', os.path.join(os.path.expanduser('~'), '.cache')), 'openllm'))) diff --git a/openllm-core/src/openllm_core/_prompt.py b/openllm-core/src/openllm_core/_prompt.py index 14a193d2..89fe5987 100644 --- a/openllm-core/src/openllm_core/_prompt.py +++ b/openllm-core/src/openllm_core/_prompt.py @@ -1,6 +1,7 @@ from __future__ import annotations import string import typing as t + class PromptFormatter(string.Formatter): """This PromptFormatter is largely based on langchain's implementation.""" def vformat(self, format_string: str, args: t.Sequence[t.Any], kwargs: t.Mapping[str, t.Any]) -> t.Any: @@ -13,7 +14,9 @@ class PromptFormatter(string.Formatter): def extract_template_variables(self, template: str) -> t.Sequence[str]: return [field[1] for field in self.parse(template) if field[1] is not None] + default_formatter = PromptFormatter() + def process_prompt(prompt: str, template: str | None = None, use_prompt_template: bool = True, **attrs: t.Any) -> str: # Currently, all default prompt will always have `instruction` key. if not use_prompt_template: return prompt diff --git a/openllm-core/src/openllm_core/_schema.py b/openllm-core/src/openllm_core/_schema.py index 2a2c82fc..7ce934a0 100644 --- a/openllm-core/src/openllm_core/_schema.py +++ b/openllm-core/src/openllm_core/_schema.py @@ -10,6 +10,7 @@ from openllm_core._configuration import GenerationConfig, LLMConfig from .utils import bentoml_cattr if t.TYPE_CHECKING: import vllm + @attr.frozen(slots=True) class GenerationInput: prompt: str @@ -41,6 +42,7 @@ class GenerationInput: 'adapter_name': attr.field(default=None, type=str) } ) + @attr.frozen(slots=True) class GenerationOutput: responses: t.List[t.Any] @@ -58,6 +60,7 @@ class GenerationOutput: if hasattr(self, key): return getattr(self, key) elif key in self.configuration: return self.configuration[key] else: raise KeyError(key) + @attr.frozen(slots=True) class MetadataOutput: model_id: str @@ -67,10 +70,12 @@ class MetadataOutput: configuration: str supports_embeddings: bool supports_hf_agent: bool + @attr.frozen(slots=True) class EmbeddingsOutput: embeddings: t.List[t.List[float]] num_tokens: int + def unmarshal_vllm_outputs(request_output: vllm.RequestOutput) -> dict[str, t.Any]: return dict( request_id=request_output.request_id, @@ -82,6 +87,7 @@ def unmarshal_vllm_outputs(request_output: vllm.RequestOutput) -> dict[str, t.An for it in request_output.outputs ] ) + @attr.define class HfAgentInput: inputs: str diff --git a/openllm-core/src/openllm_core/_strategies.py b/openllm-core/src/openllm_core/_strategies.py index 9f54d42a..461a6eb5 100644 --- a/openllm-core/src/openllm_core/_strategies.py +++ b/openllm-core/src/openllm_core/_strategies.py @@ -18,13 +18,16 @@ from bentoml._internal.runner.strategy import THREAD_ENVS from ._typing_compat import overload from .utils import DEBUG, ReprMixin + class DynResource(t.Protocol): resource_id: t.ClassVar[str] @classmethod def from_system(cls) -> t.Sequence[t.Any]: ... + logger = logging.getLogger(__name__) + def _strtoul(s: str) -> int: '''Return -1 or positive integer sequence string starts with,.''' if not s: return -1 @@ -34,6 +37,7 @@ def _strtoul(s: str) -> int: if idx + 1 == len(s): idx += 1 # noqa: PLW2901 # NOTE: idx will be set via enumerate return int(s[:idx]) if idx > 0 else -1 + def _parse_list_with_prefix(lst: str, prefix: str) -> list[str]: rcs: list[str] = [] for elem in lst.split(','): @@ -43,16 +47,21 @@ def _parse_list_with_prefix(lst: str, prefix: str) -> list[str]: if not elem.startswith(prefix): break rcs.append(elem) return rcs + _STACK_LEVEL = 3 + @overload # variant: default callback def _parse_visible_devices() -> list[str] | None: ... + @overload # variant: specify None, and respect_env def _parse_visible_devices(default_var: None, *, respect_env: t.Literal[True]) -> list[str] | None: ... + @overload # variant: default var is something other than None def _parse_visible_devices(default_var: str = ..., *, respect_env: t.Literal[False]) -> list[str]: ... + def _parse_visible_devices(default_var: str | None = None, respect_env: bool = True) -> list[str] | None: '''CUDA_VISIBLE_DEVICES aware with default var for parsing spec.''' if respect_env: @@ -76,6 +85,7 @@ def _parse_visible_devices(default_var: str | None = None, respect_env: bool = T if x < 0: break rc.append(x) return [str(i) for i in rc] + def _from_system(cls: type[DynResource]) -> list[str]: visible_devices = _parse_visible_devices() if visible_devices is None: @@ -111,15 +121,19 @@ def _from_system(cls: type[DynResource]) -> list[str]: except (ImportError, RuntimeError, AttributeError): return [] return visible_devices + @overload def _from_spec(cls: type[DynResource], spec: int) -> list[str]: ... + @overload def _from_spec(cls: type[DynResource], spec: list[int | str]) -> list[str]: ... + @overload def _from_spec(cls: type[DynResource], spec: str) -> list[str]: ... + def _from_spec(cls: type[DynResource], spec: t.Any) -> list[str]: if isinstance(spec, int): if spec in (-1, 0): return [] @@ -133,6 +147,7 @@ def _from_spec(cls: type[DynResource], spec: t.Any) -> list[str]: return [str(x) for x in spec] else: raise TypeError(f"'{cls.__name__}.from_spec' only supports parsing spec of type int, str, or list, got '{type(spec)}' instead.") + def _raw_device_uuid_nvml() -> list[str] | None: from ctypes import CDLL, byref, c_int, c_void_p, create_string_buffer @@ -167,6 +182,7 @@ def _raw_device_uuid_nvml() -> list[str] | None: uuids.append(buf.raw.decode('ascii').strip('\0')) del nvml_h return uuids + def _validate(cls: type[DynResource], val: list[t.Any]) -> None: if cls.resource_id == 'amd.com/gpu': raise RuntimeError("AMD GPU validation is not yet supported. Make sure to call 'get_resource(..., validate=False)'") @@ -189,6 +205,7 @@ def _validate(cls: type[DynResource], val: list[t.Any]) -> None: if err != cuda.CUresult.CUDA_SUCCESS: raise ValueError(f'Failed to get device {el}') except (ImportError, RuntimeError): pass + def _make_resource_class(name: str, resource_kind: str, docstring: str) -> type[DynResource]: return types.new_class( name, (bentoml.Resource[t.List[str]], ReprMixin), {'resource_id': resource_kind}, @@ -202,6 +219,7 @@ def _make_resource_class(name: str, resource_kind: str, docstring: str) -> type[ '__module__': 'openllm._strategies' }) ) + # NOTE: we need to hint these t.Literal since mypy is to dumb to infer this as literal :facepalm: _TPU_RESOURCE: t.Literal['cloud-tpus.google.com/v2'] = 'cloud-tpus.google.com/v2' _AMD_GPU_RESOURCE: t.Literal['amd.com/gpu'] = 'amd.com/gpu' @@ -226,6 +244,7 @@ AmdGpuResource = _make_resource_class( ) LiteralResourceSpec = t.Literal['cloud-tpus.google.com/v2', 'amd.com/gpu', 'nvidia.com/gpu', 'cpu'] + # convenient mapping def resource_spec(name: t.Literal['tpu', 'amd', 'nvidia', 'cpu']) -> LiteralResourceSpec: if name == 'tpu': return _TPU_RESOURCE @@ -233,6 +252,7 @@ def resource_spec(name: t.Literal['tpu', 'amd', 'nvidia', 'cpu']) -> LiteralReso elif name == 'nvidia': return _NVIDIA_GPU_RESOURCE elif name == 'cpu': return _CPU_RESOURCE else: raise ValueError("Unknown alias. Accepted: ['tpu', 'amd', 'nvidia', 'cpu']") + @functools.lru_cache def available_resource_spec() -> tuple[LiteralResourceSpec, ...]: '''This is a utility function helps to determine the available resources from given running system. @@ -246,6 +266,7 @@ def available_resource_spec() -> tuple[LiteralResourceSpec, ...]: if len(NvidiaGpuResource.from_system()) > 0: available.append(_NVIDIA_GPU_RESOURCE) available.append(_CPU_RESOURCE) return tuple(available) + class CascadingResourceStrategy(bentoml.Strategy, ReprMixin): """This is extends the default BentoML strategy where we check for NVIDIA GPU resource -> AMD GPU resource -> CPU resource. @@ -356,4 +377,5 @@ class CascadingResourceStrategy(bentoml.Strategy, ReprMixin): if idx >= len(gpus): raise ValueError(f'Number of available GPU ({gpus}) preceeds the given workers_per_resource {workers_per_resource}') dev = str(gpus[idx]) return dev + __all__ = ['CascadingResourceStrategy', 'get_resource'] diff --git a/openllm-core/src/openllm_core/_typing_compat.py b/openllm-core/src/openllm_core/_typing_compat.py index 0a7b1d38..8572547b 100644 --- a/openllm-core/src/openllm_core/_typing_compat.py +++ b/openllm-core/src/openllm_core/_typing_compat.py @@ -49,22 +49,28 @@ if sys.version_info[:2] >= (3, 10): from typing import Concatenate as Concatenate, ParamSpec as ParamSpec, TypeAlias as TypeAlias else: from typing_extensions import Concatenate as Concatenate, ParamSpec as ParamSpec, TypeAlias as TypeAlias + class PeftAdapterOutput(t.TypedDict): success: bool result: t.Dict[str, peft.PeftConfig] error_msg: str + class LLMEmbeddings(t.TypedDict): embeddings: t.List[t.List[float]] num_tokens: int + class AdaptersTuple(TupleAny): adapter_id: str name: t.Optional[str] config: DictStrAny + AdaptersMapping = t.Dict[AdapterType, t.Tuple[AdaptersTuple, ...]] + class RefTuple(TupleAny): git_hash: str version: VersionInfo strategy: LiteralContainerVersionStrategy + class LLMRunnable(bentoml.Runnable, t.Generic[M, T]): SUPPORTED_RESOURCES = ('amd.com/gpu', 'nvidia.com/gpu', 'cpu') SUPPORTS_CPU_MULTI_THREADING = True @@ -74,6 +80,7 @@ class LLMRunnable(bentoml.Runnable, t.Generic[M, T]): generate: RunnableMethod[LLMRunnable[M, T], [str], list[t.Any]] generate_one: RunnableMethod[LLMRunnable[M, T], [str, list[str]], t.Sequence[dict[t.Literal['generated_text'], str]]] generate_iterator: RunnableMethod[LLMRunnable[M, T], [str], t.Generator[str, None, str]] + class LLMRunner(bentoml.Runner, t.Generic[M, T]): __doc__: str __module__: str diff --git a/openllm-core/src/openllm_core/config/configuration_auto.py b/openllm-core/src/openllm_core/config/configuration_auto.py index b7e8a622..65839e26 100644 --- a/openllm-core/src/openllm_core/config/configuration_auto.py +++ b/openllm-core/src/openllm_core/config/configuration_auto.py @@ -21,6 +21,7 @@ if t.TYPE_CHECKING: CONFIG_MAPPING_NAMES = OrderedDict([('chatglm', 'ChatGLMConfig'), ('dolly_v2', 'DollyV2Config'), ('falcon', 'FalconConfig'), ('flan_t5', 'FlanT5Config'), ('gpt_neox', 'GPTNeoXConfig'), ( 'llama', 'LlamaConfig' ), ('mpt', 'MPTConfig'), ('opt', 'OPTConfig'), ('stablelm', 'StableLMConfig'), ('starcoder', 'StarCoderConfig'), ('baichuan', 'BaichuanConfig')]) + class _LazyConfigMapping(OrderedDict, ReprMixin): def __init__(self, mapping: OrderedDict[LiteralString, LiteralString]): self._mapping = mapping @@ -66,9 +67,11 @@ class _LazyConfigMapping(OrderedDict, ReprMixin): def register(self, key: str, value: t.Any) -> None: if key in self._mapping.keys(): raise ValueError(f"'{key}' is already used by a OpenLLM config, pick another name.") self._extra_content[key] = value + CONFIG_MAPPING: dict[str, type[openllm_core.LLMConfig]] = _LazyConfigMapping(CONFIG_MAPPING_NAMES) # The below handle special alias when we call underscore to the name directly without processing camelcase first. CONFIG_NAME_ALIASES: dict[str, str] = {'chat_glm': 'chatglm', 'stable_lm': 'stablelm', 'star_coder': 'starcoder', 'gpt_neo_x': 'gpt_neox',} + class AutoConfig: def __init__(self, *_: t.Any, **__: t.Any): raise EnvironmentError('Cannot instantiate AutoConfig directly. Please use `AutoConfig.for_model(model_name)` instead.') diff --git a/openllm-core/src/openllm_core/config/configuration_baichuan.py b/openllm-core/src/openllm_core/config/configuration_baichuan.py index 5556d237..9bf03f56 100644 --- a/openllm-core/src/openllm_core/config/configuration_baichuan.py +++ b/openllm-core/src/openllm_core/config/configuration_baichuan.py @@ -23,6 +23,7 @@ or provide `--model-id` flag when running ``openllm start baichuan``: $ openllm start baichuan --model-id='fireballoon/baichuan-vicuna-chinese-7b' ''' DEFAULT_PROMPT_TEMPLATE = '''{instruction}''' + class BaichuanConfig(openllm_core.LLMConfig): """Baichuan-7B is an open-source, large-scale pre-trained language model developed by Baichuan Intelligent Technology. diff --git a/openllm-core/src/openllm_core/config/configuration_chatglm.py b/openllm-core/src/openllm_core/config/configuration_chatglm.py index 8cd7cb05..777609ed 100644 --- a/openllm-core/src/openllm_core/config/configuration_chatglm.py +++ b/openllm-core/src/openllm_core/config/configuration_chatglm.py @@ -23,6 +23,7 @@ or provide `--model-id` flag when running ``openllm start chatglm``: $ openllm start chatglm --model-id='thudm/chatglm-6b-int8' ''' DEFAULT_PROMPT_TEMPLATE = '''{instruction}''' + class ChatGLMConfig(openllm_core.LLMConfig): """ChatGLM is an open bilingual language model based on [General Language Model (GLM)](https://github.com/THUDM/GLM) framework. diff --git a/openllm-core/src/openllm_core/config/configuration_dolly_v2.py b/openllm-core/src/openllm_core/config/configuration_dolly_v2.py index 568cac9e..f5adf687 100644 --- a/openllm-core/src/openllm_core/config/configuration_dolly_v2.py +++ b/openllm-core/src/openllm_core/config/configuration_dolly_v2.py @@ -37,6 +37,7 @@ DEFAULT_PROMPT_TEMPLATE = '''{intro} {instruction} {response_key} '''.format(intro=INTRO_BLURB, instruction_key=INSTRUCTION_KEY, instruction='{instruction}', response_key=RESPONSE_KEY) + def get_special_token_id(tokenizer: transformers.PreTrainedTokenizer, key: str) -> int: '''Gets the token ID for a given string that has been added to the tokenizer as a special token. @@ -56,6 +57,7 @@ def get_special_token_id(tokenizer: transformers.PreTrainedTokenizer, key: str) token_ids = tokenizer.encode(key) if len(token_ids) > 1: raise ValueError(f"Expected only a single token for '{key}' but found {token_ids}") return token_ids[0] + class DollyV2Config(openllm_core.LLMConfig): """Databricks` Dolly is an instruction-following large language model trained on the Databricks machine learning platform that is licensed for commercial use. diff --git a/openllm-core/src/openllm_core/config/configuration_falcon.py b/openllm-core/src/openllm_core/config/configuration_falcon.py index 95be5452..49b94ee7 100644 --- a/openllm-core/src/openllm_core/config/configuration_falcon.py +++ b/openllm-core/src/openllm_core/config/configuration_falcon.py @@ -28,6 +28,7 @@ DEFAULT_PROMPT_TEMPLATE = '''{context} {user_name}: {instruction} {agent}: ''' + class FalconConfig(openllm_core.LLMConfig): """Falcon-7B is a 7B parameters causal decoder-only model built by TII and trained on 1,500B tokens of [RefinedWeb](https://huggingface.co/datasets/tiiuae/falcon-refinedweb) enhanced with curated corpora. diff --git a/openllm-core/src/openllm_core/config/configuration_flan_t5.py b/openllm-core/src/openllm_core/config/configuration_flan_t5.py index 191dbdfd..5cda6659 100644 --- a/openllm-core/src/openllm_core/config/configuration_flan_t5.py +++ b/openllm-core/src/openllm_core/config/configuration_flan_t5.py @@ -29,6 +29,7 @@ or provide `--model-id` flag when running ``openllm start flan-t5``: $ openllm start flan-t5 --model-id google/flan-t5-xxl ''' DEFAULT_PROMPT_TEMPLATE = '''Answer the following question:\nQuestion: {instruction}\nAnswer:''' + class FlanT5Config(openllm_core.LLMConfig): """FLAN-T5 was released in the paper [Scaling Instruction-Finetuned Language Models](https://arxiv.org/pdf/2210.11416.pdf). diff --git a/openllm-core/src/openllm_core/config/configuration_gpt_neox.py b/openllm-core/src/openllm_core/config/configuration_gpt_neox.py index 9438c608..9960c309 100644 --- a/openllm-core/src/openllm_core/config/configuration_gpt_neox.py +++ b/openllm-core/src/openllm_core/config/configuration_gpt_neox.py @@ -24,6 +24,7 @@ or provide `--model-id` flag when running ``openllm start gpt-neox``: $ openllm start gpt-neox --model-id 'stabilityai/stablelm-tuned-alpha-3b' ''' DEFAULT_PROMPT_TEMPLATE = '''{instruction}''' + class GPTNeoXConfig(openllm_core.LLMConfig): """GPTNeoX is an autoregressive language model trained on the Pile, whose weights will be made freely and openly available to the public through a permissive license. diff --git a/openllm-core/src/openllm_core/config/configuration_llama.py b/openllm-core/src/openllm_core/config/configuration_llama.py index 20e0207d..d590df28 100644 --- a/openllm-core/src/openllm_core/config/configuration_llama.py +++ b/openllm-core/src/openllm_core/config/configuration_llama.py @@ -42,9 +42,12 @@ SINST_KEY, EINST_KEY, SYS_KEY, EOS_TOKEN, BOS_TOKEN = '[INST]', '[/INST]', '< str: return PROMPT_MAPPING[model_type] + DEFAULT_PROMPT_TEMPLATE = _get_prompt + class LlamaConfig(openllm_core.LLMConfig): """LLaMA model was proposed in [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) by Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample. diff --git a/openllm-core/src/openllm_core/config/configuration_mpt.py b/openllm-core/src/openllm_core/config/configuration_mpt.py index f62451e9..f12fbd67 100644 --- a/openllm-core/src/openllm_core/config/configuration_mpt.py +++ b/openllm-core/src/openllm_core/config/configuration_mpt.py @@ -43,9 +43,12 @@ _chat_prompt, _default_prompt, _instruct_prompt = '''{instruction}''', '''{instr {response_key} '''.format(intro=INTRO_BLURB, instruction_key=INSTRUCTION_KEY, instruction='{instruction}', response_key=RESPONSE_KEY) PROMPT_MAPPING = {'default': _default_prompt, 'instruct': _instruct_prompt, 'storywriter': _default_prompt, 'chat': _chat_prompt} + def _get_prompt(model_type: str) -> str: return PROMPT_MAPPING[model_type] + DEFAULT_PROMPT_TEMPLATE = _get_prompt + class MPTConfig(openllm_core.LLMConfig): """MPT is a decoder-style transformer pretrained from scratch on English text and code. diff --git a/openllm-core/src/openllm_core/config/configuration_opt.py b/openllm-core/src/openllm_core/config/configuration_opt.py index e3da9838..b0cfdd65 100644 --- a/openllm-core/src/openllm_core/config/configuration_opt.py +++ b/openllm-core/src/openllm_core/config/configuration_opt.py @@ -30,6 +30,7 @@ or provide `--model-id` flag when running ``openllm start opt``: $ openllm start opt --model-id facebook/opt-6.7b ''' DEFAULT_PROMPT_TEMPLATE = '''{instruction}''' + class OPTConfig(openllm_core.LLMConfig): """OPT was first introduced in [Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) and first released in [metaseq's repository](https://github.com/facebookresearch/metaseq) on May 3rd 2022 by Meta AI. diff --git a/openllm-core/src/openllm_core/config/configuration_stablelm.py b/openllm-core/src/openllm_core/config/configuration_stablelm.py index 001d99ab..76e456f6 100644 --- a/openllm-core/src/openllm_core/config/configuration_stablelm.py +++ b/openllm-core/src/openllm_core/config/configuration_stablelm.py @@ -29,6 +29,7 @@ SYSTEM_PROMPT = '''<|SYSTEM|># StableLM Tuned (Alpha version) - StableLM will refuse to participate in anything that could harm a human. ''' DEFAULT_PROMPT_TEMPLATE = '''{system_prompt}<|USER|>{instruction}<|ASSISTANT|>''' + class StableLMConfig(openllm_core.LLMConfig): """StableLM-Base-Alpha is a suite of 3B and 7B parameter decoder-only language models. diff --git a/openllm-core/src/openllm_core/config/configuration_starcoder.py b/openllm-core/src/openllm_core/config/configuration_starcoder.py index 50a60625..b345b084 100644 --- a/openllm-core/src/openllm_core/config/configuration_starcoder.py +++ b/openllm-core/src/openllm_core/config/configuration_starcoder.py @@ -23,6 +23,7 @@ $ openllm start starcoder --model-id 'bigcode/starcoder' ''' DEFAULT_PROMPT_TEMPLATE = '''{instruction}''' FIM_PREFIX, FIM_MIDDLE, FIM_SUFFIX, FIM_PAD, EOD, FIM_INDICATOR = '', '', '', '', '<|endoftext|>', '' + class StarCoderConfig(openllm_core.LLMConfig): """The StarCoder models are 15.5B parameter models trained on 80+ programming languages from [The Stack (v1.2)](https://huggingface.co/datasets/bigcode/the-stack), with opt-out requests excluded. diff --git a/openllm-core/src/openllm_core/exceptions.py b/openllm-core/src/openllm_core/exceptions.py index c9ff18c0..60adff51 100644 --- a/openllm-core/src/openllm_core/exceptions.py +++ b/openllm-core/src/openllm_core/exceptions.py @@ -2,19 +2,27 @@ from __future__ import annotations import bentoml + class OpenLLMException(bentoml.exceptions.BentoMLException): '''Base class for all OpenLLM exceptions. This extends BentoMLException.''' + class GpuNotAvailableError(OpenLLMException): '''Raised when there is no GPU available in given system.''' + class ValidationError(OpenLLMException): '''Raised when a validation fails.''' + class ForbiddenAttributeError(OpenLLMException): '''Raised when using an _internal field.''' + class MissingAnnotationAttributeError(OpenLLMException): '''Raised when a field under openllm.LLMConfig is missing annotations.''' + class MissingDependencyError(BaseException): '''Raised when a dependency is missing.''' + class Error(BaseException): '''To be used instead of naked raise.''' + class FineTuneStrategyNotSupportedError(OpenLLMException): '''Raised when a fine-tune strategy is not supported for given LLM.''' diff --git a/openllm-core/src/openllm_core/utils/__init__.py b/openllm-core/src/openllm_core/utils/__init__.py index 4d435425..decaed5a 100644 --- a/openllm-core/src/openllm_core/utils/__init__.py +++ b/openllm-core/src/openllm_core/utils/__init__.py @@ -20,28 +20,25 @@ from circus.exc import ConflictError import openllm_core from bentoml._internal.configuration import ( - DEBUG_ENV_VAR as DEBUG_ENV_VAR, - GRPC_DEBUG_ENV_VAR as _GRPC_DEBUG_ENV_VAR, - QUIET_ENV_VAR as QUIET_ENV_VAR, - get_debug_mode as _get_debug_mode, - get_quiet_mode as _get_quiet_mode, - set_quiet_mode as set_quiet_mode, + DEBUG_ENV_VAR as DEBUG_ENV_VAR, + GRPC_DEBUG_ENV_VAR as _GRPC_DEBUG_ENV_VAR, + QUIET_ENV_VAR as QUIET_ENV_VAR, + get_debug_mode as _get_debug_mode, + get_quiet_mode as _get_quiet_mode, + set_quiet_mode as set_quiet_mode, ) from bentoml._internal.models.model import ModelContext as _ModelContext from bentoml._internal.types import LazyType as LazyType from bentoml._internal.utils import ( - LazyLoader as LazyLoader, - bentoml_cattr as bentoml_cattr, - calc_dir_size as calc_dir_size, - first_not_none as first_not_none, - pkg as pkg, - reserve_free_port as reserve_free_port, - resolve_user_filepath as resolve_user_filepath, -) -from openllm_core.utils.lazy import ( - LazyModule as LazyModule, - VersionInfo as VersionInfo, + LazyLoader as LazyLoader, + bentoml_cattr as bentoml_cattr, + calc_dir_size as calc_dir_size, + first_not_none as first_not_none, + pkg as pkg, + reserve_free_port as reserve_free_port, + resolve_user_filepath as resolve_user_filepath, ) +from openllm_core.utils.lazy import (LazyModule as LazyModule, VersionInfo as VersionInfo,) if t.TYPE_CHECKING: from openllm_core._typing_compat import AnyCallable logger = logging.getLogger(__name__) @@ -53,25 +50,30 @@ if sys.version_info < (3, 10): _WithArgsTypes = (_TypingGenericAlias,) else: _WithArgsTypes: t.Any = (t._GenericAlias, types.GenericAlias, types.UnionType) # type: ignore # _GenericAlias is the actual GenericAlias implementation DEV_DEBUG_VAR = 'OPENLLMDEVDEBUG' + def set_debug_mode(enabled: bool, level: int = 1) -> None: # monkeypatch bentoml._internal.configuration.set_debug_mode to remove unused logs if enabled: os.environ[DEV_DEBUG_VAR] = str(level) os.environ[DEBUG_ENV_VAR] = str(enabled) os.environ[_GRPC_DEBUG_ENV_VAR] = 'DEBUG' if enabled else 'ERROR' + def lenient_issubclass(cls: t.Any, class_or_tuple: type[t.Any] | tuple[type[t.Any], ...] | None) -> bool: try: return isinstance(cls, type) and issubclass(cls, class_or_tuple) # type: ignore[arg-type] except TypeError: if isinstance(cls, _WithArgsTypes): return False raise + def ensure_exec_coro(coro: t.Coroutine[t.Any, t.Any, t.Any]) -> t.Any: loop = asyncio.get_event_loop() if loop.is_running(): return asyncio.run_coroutine_threadsafe(coro, loop).result() else: return loop.run_until_complete(coro) + def available_devices() -> tuple[str, ...]: '''Return available GPU under system. Currently only supports NVIDIA GPUs.''' from openllm_core._strategies import NvidiaGpuResource return tuple(NvidiaGpuResource.from_system()) + @functools.lru_cache(maxsize=128) def generate_hash_from_file(f: str, algorithm: t.Literal['md5', 'sha1'] = 'sha1') -> str: """Generate a hash from given file's modification time. @@ -84,26 +86,34 @@ def generate_hash_from_file(f: str, algorithm: t.Literal['md5', 'sha1'] = 'sha1' The generated hash. """ return getattr(hashlib, algorithm)(str(os.path.getmtime(resolve_filepath(f))).encode()).hexdigest() + @functools.lru_cache(maxsize=1) def device_count() -> int: return len(available_devices()) + # equivocal setattr to save one lookup per assignment _object_setattr = object.__setattr__ + def non_intrusive_setattr(obj: t.Any, name: str, value: t.Any) -> None: """This makes sure that we don't overwrite any existing attributes on the object.""" _setattr = functools.partial(setattr, obj) if isinstance(obj, type) else _object_setattr.__get__(obj) if not hasattr(obj, name): _setattr(name, value) + def field_env_key(model_name: str, key: str, suffix: str | None = None) -> str: return '_'.join(filter(None, map(str.upper, ['OPENLLM', model_name, suffix.strip('_') if suffix else '', key]))) + # Special debug flag controled via OPENLLMDEVDEBUG DEBUG: bool = sys.flags.dev_mode or (not sys.flags.ignore_environment and bool(os.environ.get(DEV_DEBUG_VAR))) # MYPY is like t.TYPE_CHECKING, but reserved for Mypy plugins MYPY = False SHOW_CODEGEN: bool = DEBUG and int(os.environ.get('OPENLLMDEVDEBUG', str(0))) > 3 + def get_debug_mode() -> bool: return DEBUG or _get_debug_mode() + def get_quiet_mode() -> bool: return not DEBUG and _get_quiet_mode() + class ExceptionFilter(logging.Filter): def __init__(self, exclude_exceptions: list[type[Exception]] | None = None, **kwargs: t.Any): '''A filter of all exception.''' @@ -119,11 +129,14 @@ class ExceptionFilter(logging.Filter): for exc in self.EXCLUDE_EXCEPTIONS: if issubclass(etype, exc): return False return True + class InfoFilter(logging.Filter): def filter(self, record: logging.LogRecord) -> bool: return logging.INFO <= record.levelno < logging.WARNING + def gen_random_uuid(prefix: str | None = None) -> str: return '-'.join([prefix or 'openllm', str(uuid.uuid4().hex)]) + _LOGGING_CONFIG: dict[str, t.Any] = { 'version': 1, 'disable_existing_loggers': True, @@ -154,6 +167,7 @@ _LOGGING_CONFIG: dict[str, t.Any] = { 'level': logging.WARNING }, } + def configure_logging() -> None: '''Configure logging for OpenLLM. @@ -173,6 +187,7 @@ def configure_logging() -> None: _LOGGING_CONFIG['root']['level'] = logging.INFO logging.config.dictConfig(_LOGGING_CONFIG) + @functools.lru_cache(maxsize=1) def in_notebook() -> bool: try: @@ -182,7 +197,9 @@ def in_notebook() -> bool: return 'IPKernelApp' in t.cast('dict[str, t.Any]', t.cast(t.Callable[[], 'InteractiveShell'], get_ipython)().config) except (ImportError, AttributeError): return False + _dockerenv, _cgroup = Path('/.dockerenv'), Path('/proc/self/cgroup') + class suppress(contextlib.suppress, contextlib.ContextDecorator): """A version of contextlib.suppress with decorator support. @@ -191,6 +208,7 @@ class suppress(contextlib.suppress, contextlib.ContextDecorator): ... {}[''] >>> key_error() """ + def compose(*funcs: AnyCallable) -> AnyCallable: '''Compose any number of unary functions into a single unary function. @@ -211,6 +229,7 @@ def compose(*funcs: AnyCallable) -> AnyCallable: return lambda *args, **kwargs: f1(f2(*args, **kwargs)) return functools.reduce(compose_two, funcs) + def apply(transform: AnyCallable) -> t.Callable[[AnyCallable], AnyCallable]: """Decorate a function with a transform function that is invoked on results returned from the decorated function. @@ -228,10 +247,12 @@ def apply(transform: AnyCallable) -> t.Callable[[AnyCallable], AnyCallable]: ``` """ return lambda func: functools.wraps(func)(compose(transform, func)) + @apply(bool) @suppress(FileNotFoundError) def _text_in_file(text: str, filename: Path) -> bool: return any(text in line for line in filename.open()) + def in_docker() -> bool: '''Is this current environment running in docker? @@ -240,15 +261,19 @@ def in_docker() -> bool: ``` ''' return _dockerenv.exists() or _text_in_file('docker', _cgroup) + T, K = t.TypeVar('T'), t.TypeVar('K') + def resolve_filepath(path: str, ctx: str | None = None) -> str: '''Resolve a file path to an absolute path, expand user and environment variables.''' try: return resolve_user_filepath(path, ctx) except FileNotFoundError: return path + def validate_is_path(maybe_path: str) -> bool: return os.path.exists(os.path.dirname(resolve_filepath(maybe_path))) + def generate_context(framework_name: str) -> _ModelContext: framework_versions = {'transformers': pkg.get_pkg_version('transformers')} if openllm_core.utils.is_torch_available(): framework_versions['torch'] = pkg.get_pkg_version('torch') @@ -257,13 +282,16 @@ def generate_context(framework_name: str) -> _ModelContext: framework_versions['tensorflow'] = get_tf_version() if openllm_core.utils.is_flax_available(): framework_versions.update({'flax': pkg.get_pkg_version('flax'), 'jax': pkg.get_pkg_version('jax'), 'jaxlib': pkg.get_pkg_version('jaxlib')}) return _ModelContext(framework_name=framework_name, framework_versions=framework_versions) + _TOKENIZER_PREFIX = '_tokenizer_' + def normalize_attrs_to_model_tokenizer_pair(**attrs: t.Any) -> tuple[dict[str, t.Any], dict[str, t.Any]]: '''Normalize the given attrs to a model and tokenizer kwargs accordingly.''' tokenizer_attrs = {k[len(_TOKENIZER_PREFIX):]: v for k, v in attrs.items() if k.startswith(_TOKENIZER_PREFIX)} for k in tuple(attrs.keys()): if k.startswith(_TOKENIZER_PREFIX): del attrs[k] return attrs, tokenizer_attrs + # NOTE: The set marks contains a set of modules name # that are available above and are whitelisted # to be included in the extra_objects map. @@ -312,39 +340,35 @@ _import_structure: dict[str, list[str]] = { if t.TYPE_CHECKING: # NOTE: The following exports useful utils from bentoml - from . import ( - analytics as analytics, - codegen as codegen, - dantic as dantic, - ) + from . import (analytics as analytics, codegen as codegen, dantic as dantic,) from .import_utils import ( - ENV_VARS_TRUE_VALUES as ENV_VARS_TRUE_VALUES, - OPTIONAL_DEPENDENCIES as OPTIONAL_DEPENDENCIES, - DummyMetaclass as DummyMetaclass, - EnvVarMixin as EnvVarMixin, - is_autogptq_available as is_autogptq_available, - is_bitsandbytes_available as is_bitsandbytes_available, - is_cpm_kernels_available as is_cpm_kernels_available, - is_datasets_available as is_datasets_available, - is_einops_available as is_einops_available, - is_fairscale_available as is_fairscale_available, - is_flax_available as is_flax_available, - is_grpc_available as is_grpc_available, - is_grpc_health_available as is_grpc_health_available, - is_jupyter_available as is_jupyter_available, - is_jupytext_available as is_jupytext_available, - is_notebook_available as is_notebook_available, - is_peft_available as is_peft_available, - is_sentencepiece_available as is_sentencepiece_available, - is_tf_available as is_tf_available, - is_torch_available as is_torch_available, - is_transformers_available as is_transformers_available, - is_transformers_supports_agent as is_transformers_supports_agent, - is_transformers_supports_kbit as is_transformers_supports_kbit, - is_triton_available as is_triton_available, - is_vllm_available as is_vllm_available, - is_xformers_available as is_xformers_available, - require_backends as require_backends, + ENV_VARS_TRUE_VALUES as ENV_VARS_TRUE_VALUES, + OPTIONAL_DEPENDENCIES as OPTIONAL_DEPENDENCIES, + DummyMetaclass as DummyMetaclass, + EnvVarMixin as EnvVarMixin, + is_autogptq_available as is_autogptq_available, + is_bitsandbytes_available as is_bitsandbytes_available, + is_cpm_kernels_available as is_cpm_kernels_available, + is_datasets_available as is_datasets_available, + is_einops_available as is_einops_available, + is_fairscale_available as is_fairscale_available, + is_flax_available as is_flax_available, + is_grpc_available as is_grpc_available, + is_grpc_health_available as is_grpc_health_available, + is_jupyter_available as is_jupyter_available, + is_jupytext_available as is_jupytext_available, + is_notebook_available as is_notebook_available, + is_peft_available as is_peft_available, + is_sentencepiece_available as is_sentencepiece_available, + is_tf_available as is_tf_available, + is_torch_available as is_torch_available, + is_transformers_available as is_transformers_available, + is_transformers_supports_agent as is_transformers_supports_agent, + is_transformers_supports_kbit as is_transformers_supports_kbit, + is_triton_available as is_triton_available, + is_vllm_available as is_vllm_available, + is_xformers_available as is_xformers_available, + require_backends as require_backends, ) from .representation import ReprMixin as ReprMixin __lazy = LazyModule(__name__, globals()['__file__'], _import_structure, extra_objects=_extras) diff --git a/openllm-core/src/openllm_core/utils/analytics.py b/openllm-core/src/openllm_core/utils/analytics.py index ca58b4ec..91b501d7 100644 --- a/openllm-core/src/openllm_core/utils/analytics.py +++ b/openllm-core/src/openllm_core/utils/analytics.py @@ -23,12 +23,15 @@ logger = logging.getLogger(__name__) # This variable is a proxy that will control BENTOML_DO_NOT_TRACK OPENLLM_DO_NOT_TRACK = 'OPENLLM_DO_NOT_TRACK' DO_NOT_TRACK = os.environ.get(OPENLLM_DO_NOT_TRACK, str(False)).upper() + @functools.lru_cache(maxsize=1) def do_not_track() -> bool: return DO_NOT_TRACK in openllm_core.utils.ENV_VARS_TRUE_VALUES + @functools.lru_cache(maxsize=1) def _usage_event_debugging() -> bool: return os.environ.get('__BENTOML_DEBUG_USAGE', str(False)).lower() == 'true' + def silent(func: t.Callable[P, T]) -> t.Callable[P, T]: @functools.wraps(func) def wrapper(*args: P.args, **kwargs: P.kwargs) -> t.Any: @@ -41,10 +44,12 @@ def silent(func: t.Callable[P, T]) -> t.Callable[P, T]: else: logger.debug('Tracking Error: %s', err) return wrapper + @silent def track(event_properties: attr.AttrsInstance) -> None: if do_not_track(): return _internal_analytics.track(t.cast('_internal_analytics.schemas.EventMeta', event_properties)) + @contextlib.contextmanager def set_bentoml_tracking() -> t.Generator[None, None, None]: original_value = os.environ.pop(_internal_analytics.BENTOML_DO_NOT_TRACK, str(False)) @@ -53,6 +58,7 @@ def set_bentoml_tracking() -> t.Generator[None, None, None]: yield finally: os.environ[_internal_analytics.BENTOML_DO_NOT_TRACK] = original_value + class EventMeta: @property def event_name(self) -> str: @@ -62,10 +68,12 @@ class EventMeta: suffix_to_remove = '_event' if event_name.endswith(suffix_to_remove): event_name = event_name[:-len(suffix_to_remove)] return event_name + @attr.define class ModelSaveEvent(EventMeta): module: str model_size_in_kb: float + @attr.define class OpenllmCliEvent(EventMeta): cmd_group: str @@ -75,6 +83,7 @@ class OpenllmCliEvent(EventMeta): duration_in_ms: t.Any = attr.field(default=None) error_type: str = attr.field(default=None) return_code: int = attr.field(default=None) + @attr.define class StartInitEvent(EventMeta): model_name: str @@ -83,6 +92,7 @@ class StartInitEvent(EventMeta): @staticmethod def handler(llm_config: openllm_core.LLMConfig) -> StartInitEvent: return StartInitEvent(model_name=llm_config['model_name'], llm_config=llm_config.model_dump()) + def track_start_init(llm_config: openllm_core.LLMConfig) -> None: if do_not_track(): return track(StartInitEvent.handler(llm_config)) diff --git a/openllm-core/src/openllm_core/utils/codegen.py b/openllm-core/src/openllm_core/utils/codegen.py index dabd3216..91209c5c 100644 --- a/openllm-core/src/openllm_core/utils/codegen.py +++ b/openllm-core/src/openllm_core/utils/codegen.py @@ -18,6 +18,7 @@ logger = logging.getLogger(__name__) # sentinel object for unequivocal object() getattr _sentinel = object() + def has_own_attribute(cls: type[t.Any], attrib_name: t.Any) -> bool: """Check whether *cls* defines *attrib_name* (and doesn't just inherit it).""" attr = getattr(cls, attrib_name, _sentinel) @@ -26,14 +27,17 @@ def has_own_attribute(cls: type[t.Any], attrib_name: t.Any) -> bool: a = getattr(base_cls, attrib_name, None) if attr is a: return False return True + def get_annotations(cls: type[t.Any]) -> DictStrAny: if has_own_attribute(cls, '__annotations__'): return cls.__annotations__ return t.cast('DictStrAny', {}) + def is_class_var(annot: str | t.Any) -> bool: annot = str(annot) # Annotation can be quoted. if annot.startswith(("'", '"')) and annot.endswith(("'", '"')): annot = annot[1:-1] return annot.startswith(('typing.ClassVar', 't.ClassVar', 'ClassVar', 'typing_extensions.ClassVar',)) + def add_method_dunders(cls: type[t.Any], method_or_cls: _T, _overwrite_doc: str | None = None) -> _T: try: method_or_cls.__module__ = cls.__module__ @@ -48,8 +52,10 @@ def add_method_dunders(cls: type[t.Any], method_or_cls: _T, _overwrite_doc: str except AttributeError: pass return method_or_cls + def _compile_and_eval(script: str, globs: DictStrAny, locs: t.Any = None, filename: str = '') -> None: eval(compile(script, filename, 'exec'), globs, locs) + def _make_method(name: str, script: str, filename: str, globs: DictStrAny) -> AnyCallable: locs: DictStrAny = {} # In order of debuggers like PDB being able to step through the code, we add a fake linecache entry. @@ -64,6 +70,7 @@ def _make_method(name: str, script: str, filename: str, globs: DictStrAny) -> An count += 1 _compile_and_eval(script, globs, locs, filename) return locs[name] + def make_attr_tuple_class(cls_name: str, attr_names: t.Sequence[str]) -> type[t.Any]: '''Create a tuple subclass to hold class attributes. @@ -86,8 +93,10 @@ def make_attr_tuple_class(cls_name: str, attr_names: t.Sequence[str]) -> type[t. if SHOW_CODEGEN: logger.info('Generated class for %s:\n\n%s', attr_class_name, '\n'.join(attr_class_template)) _compile_and_eval('\n'.join(attr_class_template), globs) return globs[attr_class_name] + def generate_unique_filename(cls: type[t.Any], func_name: str) -> str: return f"<{cls.__name__} generated {func_name} {cls.__module__}.{getattr(cls, '__qualname__', cls.__name__)}>" + def generate_function( typ: type[t.Any], func_name: str, lines: list[str] | None, args: tuple[str, ...] | None, globs: dict[str, t.Any], annotations: dict[str, t.Any] | None = None ) -> AnyCallable: @@ -97,6 +106,7 @@ def generate_function( if annotations: meth.__annotations__ = annotations if SHOW_CODEGEN: logger.info('Generated script for %s:\n\n%s', typ, script) return meth + def make_env_transformer( cls: type[openllm_core.LLMConfig], model_name: str, suffix: LiteralString | None = None, default_callback: t.Callable[[str, t.Any], t.Any] | None = None, globs: DictStrAny | None = None, ) -> AnyCallable: @@ -123,6 +133,7 @@ def make_env_transformer( ] fields_ann = 'list[attr.Attribute[t.Any]]' return generate_function(cls, '__auto_env', lines, args=('_', 'fields'), globs=globs, annotations={'_': 'type[LLMConfig]', 'fields': fields_ann, 'return': fields_ann}) + def gen_sdk(func: _T, name: str | None = None, **attrs: t.Any) -> _T: '''Enhance sdk with nice repr that plays well with your brain.''' from openllm_core.utils import ReprMixin @@ -153,4 +164,5 @@ def gen_sdk(func: _T, name: str | None = None, **attrs: t.Any) -> _T: func, ) ) + __all__ = ['gen_sdk', 'make_attr_tuple_class', 'make_env_transformer', 'generate_unique_filename', 'generate_function'] diff --git a/openllm-core/src/openllm_core/utils/dantic.py b/openllm-core/src/openllm_core/utils/dantic.py index d6f5cb69..b48e0aeb 100644 --- a/openllm-core/src/openllm_core/utils/dantic.py +++ b/openllm-core/src/openllm_core/utils/dantic.py @@ -36,8 +36,10 @@ __all__ = [ 'JsonType', 'BytesType' ] + def __dir__() -> list[str]: return sorted(__all__) + def attrs_to_options(name: str, field: attr.Attribute[t.Any], model_name: str, typ: t.Any = None, suffix_generation: bool = False, suffix_sampling: bool = False) -> t.Callable[[FC], FC]: # TODO: support parsing nested attrs class and Union envvar = field.metadata['env'] @@ -66,6 +68,7 @@ def attrs_to_options(name: str, field: attr.Attribute[t.Any], model_name: str, t show_envvar=True, envvar=envvar, ) + def env_converter(value: t.Any, env: str | None = None) -> t.Any: if env is not None: value = os.environ.get(env, value) @@ -75,6 +78,7 @@ def env_converter(value: t.Any, env: str | None = None) -> t.Any: except orjson.JSONDecodeError as err: raise RuntimeError(f"Failed to parse ({value!r}) from '{env}': {err}") from None return value + def Field( default: t.Any = None, *, @@ -137,6 +141,7 @@ def Field( attrs.pop('default') return attr.field(metadata=metadata, validator=_validator, converter=converter, **attrs) + def parse_type(field_type: t.Any) -> ParamType | tuple[ParamType, ...]: """Transforms the pydantic field's type into a click-compatible type. @@ -167,6 +172,7 @@ def parse_type(field_type: t.Any) -> ParamType | tuple[ParamType, ...]: if lenient_issubclass(field_type, bytes): return BytesType() # return the current type: it should be a primitive return field_type + def is_typing(field_type: type) -> bool: '''Checks whether the current type is a module-like type. @@ -180,6 +186,7 @@ def is_typing(field_type: type) -> bool: if raw is None: return False if raw is type or raw is t.Type: return True return False + def is_literal(field_type: type) -> bool: '''Checks whether the given field type is a Literal type or not. @@ -194,6 +201,7 @@ def is_literal(field_type: type) -> bool: ''' origin = t.get_origin(field_type) return origin is not None and origin is t.Literal + class ModuleType(ParamType): name = 'module' @@ -215,6 +223,7 @@ class ModuleType(ParamType): return value except Exception as exc: self.fail(f"'{value}' is not a valid object ({type(exc)}: {exc!s})", param, ctx) + class EnumChoice(click.Choice): name = 'enum' @@ -237,6 +246,7 @@ class EnumChoice(click.Choice): if isinstance(result, str): result = self.internal_type[result] return result + class LiteralChoice(EnumChoice): name = 'literal' @@ -249,6 +259,7 @@ class LiteralChoice(EnumChoice): _mapping = {str(v): v for v in values} super(EnumChoice, self).__init__(list(_mapping), case_sensitive) self.internal_type = item_type + def allows_multiple(field_type: type[t.Any]) -> bool: """Checks whether the current type allows for multiple arguments to be provided as input or not. @@ -273,6 +284,7 @@ def allows_multiple(field_type: type[t.Any]) -> bool: # For the moment, only non-composite types are allowed. return not isinstance(args, tuple) return False + def is_mapping(field_type: type) -> bool: '''Checks whether this field represents a dictionary or JSON object. @@ -289,6 +301,7 @@ def is_mapping(field_type: type) -> bool: origin = t.get_origin(field_type) if origin is None: return False return lenient_issubclass(origin, t.Mapping) + def is_container(field_type: type) -> bool: """Checks whether the current type is a container type ('contains' other types), like lists and tuples. @@ -307,6 +320,7 @@ def is_container(field_type: type) -> bool: # Early out for non-typing objects if origin is None: return False return lenient_issubclass(origin, t.Container) + def parse_container_args(field_type: type[t.Any]) -> ParamType | tuple[ParamType, ...]: '''Parses the arguments inside a container type (lists, tuples and so on). @@ -329,6 +343,7 @@ def parse_container_args(field_type: type[t.Any]) -> ParamType | tuple[ParamType return parse_single_arg(args[0]) # Then deal with fixed-length containers: Tuple[str, int, int] return tuple(parse_single_arg(arg) for arg in args) + def parse_single_arg(arg: type) -> ParamType: """Returns the click-compatible type for container origin types. @@ -349,6 +364,7 @@ def parse_single_arg(arg: type) -> ParamType: if is_container(arg): return JsonType() if lenient_issubclass(arg, bytes): return BytesType() return click_types.convert_type(arg) + class BytesType(ParamType): name = 'bytes' @@ -358,6 +374,7 @@ class BytesType(ParamType): return str.encode(value) except Exception as exc: self.fail(f"'{value}' is not a valid string ({exc!s})", param, ctx) + CYGWIN = sys.platform.startswith('cygwin') WIN = sys.platform.startswith('win') if sys.platform.startswith('win') and WIN: @@ -369,6 +386,7 @@ else: def _get_argv_encoding() -> str: return getattr(sys.stdin, 'encoding', None) or sys.getfilesystemencoding() + class CudaValueType(ParamType): name = 'cuda' envvar_list_splitter = ',' @@ -413,7 +431,9 @@ class CudaValueType(ParamType): def __repr__(self) -> str: return 'STRING' + CUDA = CudaValueType() + class JsonType(ParamType): name = 'json' diff --git a/openllm-core/src/openllm_core/utils/import_utils.py b/openllm-core/src/openllm_core/utils/import_utils.py index d0dbe21d..15fc9b52 100644 --- a/openllm-core/src/openllm_core/utils/import_utils.py +++ b/openllm-core/src/openllm_core/utils/import_utils.py @@ -28,6 +28,7 @@ USE_TF = os.environ.get('USE_TF', 'AUTO').upper() USE_TORCH = os.environ.get('USE_TORCH', 'AUTO').upper() USE_JAX = os.environ.get('USE_FLAX', 'AUTO').upper() FORCE_TF_AVAILABLE = os.environ.get('FORCE_TF_AVAILABLE', 'AUTO').upper() + def _is_package_available(package: str) -> bool: _package_available = importlib.util.find_spec(package) is not None if _package_available: @@ -36,6 +37,7 @@ def _is_package_available(package: str) -> bool: except importlib.metadata.PackageNotFoundError: _package_available = False return _package_available + _torch_available = importlib.util.find_spec('torch') is not None _tf_available = importlib.util.find_spec('tensorflow') is not None _flax_available = importlib.util.find_spec('jax') is not None and importlib.util.find_spec('flax') is not None @@ -56,44 +58,64 @@ _autogptq_available = _is_package_available('auto_gptq') _sentencepiece_available = _is_package_available('sentencepiece') _xformers_available = _is_package_available('xformers') _fairscale_available = _is_package_available('fairscale') + def is_transformers_available() -> bool: return _transformers_available + def is_grpc_available() -> bool: return _grpc_available + def is_grpc_health_available() -> bool: return _grpc_health_available + def is_transformers_supports_kbit() -> bool: return pkg.pkg_version_info('transformers')[:2] >= (4, 30) + def is_transformers_supports_agent() -> bool: return pkg.pkg_version_info('transformers')[:2] >= (4, 29) + def is_jupyter_available() -> bool: return _jupyter_available + def is_jupytext_available() -> bool: return _jupytext_available + def is_notebook_available() -> bool: return _notebook_available + def is_triton_available() -> bool: return _triton_available + def is_datasets_available() -> bool: return _datasets_available + def is_peft_available() -> bool: return _peft_available + def is_einops_available() -> bool: return _einops_available + def is_cpm_kernels_available() -> bool: return _cpm_kernel_available + def is_bitsandbytes_available() -> bool: return _bitsandbytes_available + def is_autogptq_available() -> bool: return _autogptq_available + def is_vllm_available() -> bool: return _vllm_available + def is_sentencepiece_available() -> bool: return _sentencepiece_available + def is_xformers_available() -> bool: return _xformers_available + def is_fairscale_available() -> bool: return _fairscale_available + def is_torch_available() -> bool: global _torch_available if USE_TORCH in ENV_VARS_TRUE_AND_AUTO_VALUES and USE_TF not in ENV_VARS_TRUE_VALUES: @@ -106,6 +128,7 @@ def is_torch_available() -> bool: logger.info('Disabling PyTorch because USE_TF is set') _torch_available = False return _torch_available + def is_tf_available() -> bool: global _tf_available if FORCE_TF_AVAILABLE in ENV_VARS_TRUE_VALUES: _tf_available = True @@ -143,6 +166,7 @@ def is_tf_available() -> bool: logger.info('Disabling Tensorflow because USE_TORCH is set') _tf_available = False return _tf_available + def is_flax_available() -> bool: global _flax_available if USE_JAX in ENV_VARS_TRUE_AND_AUTO_VALUES: @@ -155,6 +179,7 @@ def is_flax_available() -> bool: else: _flax_available = False return _flax_available + VLLM_IMPORT_ERROR_WITH_PYTORCH = '''\ {0} requires the vLLM library but it was not found in your environment. However, we were able to find a PyTorch installation. PyTorch classes do not begin @@ -270,6 +295,7 @@ BACKENDS_MAPPING: BackendOrderedDict = OrderedDict([('flax', (is_flax_available, ), ('sentencepiece', (is_sentencepiece_available, SENTENCEPIECE_IMPORT_ERROR)), ('xformers', (is_xformers_available, XFORMERS_IMPORT_ERROR)), ( 'fairscale', (is_fairscale_available, FAIRSCALE_IMPORT_ERROR) )]) + class DummyMetaclass(abc.ABCMeta): '''Metaclass for dummy object. @@ -280,6 +306,7 @@ class DummyMetaclass(abc.ABCMeta): def __getattribute__(cls, key: str) -> t.Any: if key.startswith('_'): return super().__getattribute__(key) require_backends(cls, cls._backends) + def require_backends(o: t.Any, backends: t.MutableSequence[str]) -> None: if not isinstance(backends, (list, tuple)): backends = list(backends) name = o.__name__ if hasattr(o, '__name__') else o.__class__.__name__ @@ -294,6 +321,7 @@ def require_backends(o: t.Any, backends: t.MutableSequence[str]) -> None: if 'flax' not in backends and is_flax_available() and not is_vllm_available(): raise ImportError(VLLM_IMPORT_ERROR_WITH_FLAX.format(name)) failed = [msg.format(name) for available, msg in (BACKENDS_MAPPING[backend] for backend in backends) if not available()] if failed: raise ImportError(''.join(failed)) + class EnvVarMixin(ReprMixin): model_name: str config: str diff --git a/openllm-core/src/openllm_core/utils/lazy.py b/openllm-core/src/openllm_core/utils/lazy.py index 92b0aebd..73b7d5ff 100644 --- a/openllm-core/src/openllm_core/utils/lazy.py +++ b/openllm-core/src/openllm_core/utils/lazy.py @@ -15,6 +15,7 @@ import attr import openllm_core __all__ = ['VersionInfo', 'LazyModule'] + # vendorred from attrs @functools.total_ordering @attr.attrs(eq=False, order=False, slots=True, frozen=True, repr=False) @@ -53,7 +54,9 @@ class VersionInfo: def __repr__(self) -> str: return '{0}.{1}.{2}'.format(*attr.astuple(self)[:3]) + _sentinel, _reserved_namespace = object(), {'__openllm_migration__'} + class LazyModule(types.ModuleType): # Very heavily inspired by optuna.integration._IntegrationModule: https://github.com/optuna/optuna/blob/master/optuna/integration/__init__.py def __init__( diff --git a/openllm-core/src/openllm_core/utils/representation.py b/openllm-core/src/openllm_core/utils/representation.py index 3b75b2d9..733562c3 100644 --- a/openllm-core/src/openllm_core/utils/representation.py +++ b/openllm-core/src/openllm_core/utils/representation.py @@ -9,6 +9,7 @@ from openllm_core import utils if t.TYPE_CHECKING: from openllm_core._typing_compat import TypeAlias ReprArgs: TypeAlias = t.Generator[t.Tuple[t.Optional[str], t.Any], None, None] + class ReprMixin: @property @abstractmethod diff --git a/openllm-python/src/openllm/_embeddings.py b/openllm-python/src/openllm/_embeddings.py index 720825f9..7ef98cf9 100644 --- a/openllm-python/src/openllm/_embeddings.py +++ b/openllm-python/src/openllm/_embeddings.py @@ -13,6 +13,7 @@ if t.TYPE_CHECKING: import torch _GENERIC_EMBEDDING_ID = 'sentence-transformers/all-MiniLM-L6-v2' _BENTOMODEL_ID = 'sentence-transformers--all-MiniLM-L6-v2' + def get_or_download(ids: str = _BENTOMODEL_ID) -> bentoml.Model: try: return bentoml.transformers.get(ids) @@ -36,6 +37,7 @@ def get_or_download(ids: str = _BENTOMODEL_ID) -> bentoml.Model: _GENERIC_EMBEDDING_ID, local_dir=bentomodel.path, local_dir_use_symlinks=False, ignore_patterns=['*.safetensors', '*.h5', '*.ot', '*.pdf', '*.md', '.gitattributes', 'LICENSE.txt'] ) return bentomodel + class GenericEmbeddingRunnable(bentoml.Runnable): SUPPORTED_RESOURCES = ('nvidia.com/gpu', 'cpu') SUPPORTS_CPU_MULTI_THREADING = True @@ -67,4 +69,5 @@ class GenericEmbeddingRunnable(bentoml.Runnable): token_embeddings = model_output[0] # First element of model_output contains all token embeddings input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9) + __all__ = ['GenericEmbeddingRunnable'] diff --git a/openllm-python/src/openllm/_generation.py b/openllm-python/src/openllm/_generation.py index 1c4e8fdc..52a3af15 100644 --- a/openllm-python/src/openllm/_generation.py +++ b/openllm-python/src/openllm/_generation.py @@ -8,6 +8,7 @@ if t.TYPE_CHECKING: import torch, openllm # reexport from transformers LogitsProcessorList = transformers.LogitsProcessorList StoppingCriteriaList = transformers.StoppingCriteriaList + class StopSequenceCriteria(transformers.StoppingCriteria): def __init__(self, stop_sequences: str | list[str], tokenizer: transformers.PreTrainedTokenizer | transformers.PreTrainedTokenizerBase | transformers.PreTrainedTokenizerFast): if isinstance(stop_sequences, str): stop_sequences = [stop_sequences] @@ -15,9 +16,11 @@ class StopSequenceCriteria(transformers.StoppingCriteria): def __call__(self, input_ids: torch.Tensor, scores: t.Any, **_: t.Any) -> bool: return any(self.tokenizer.decode(input_ids.tolist()[0]).endswith(stop_sequence) for stop_sequence in self.stop_sequences) + class StopOnTokens(transformers.StoppingCriteria): def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **_: t.Any) -> bool: return input_ids[0][-1] in {50278, 50279, 50277, 1, 0} + def prepare_logits_processor(config: openllm.LLMConfig) -> transformers.LogitsProcessorList: generation_config = config.generation_config logits_processor = transformers.LogitsProcessorList() @@ -26,16 +29,20 @@ def prepare_logits_processor(config: openllm.LLMConfig) -> transformers.LogitsPr if 1e-8 <= generation_config['top_p']: logits_processor.append(transformers.TopPLogitsWarper(generation_config['top_p'])) if generation_config['top_k'] > 0: logits_processor.append(transformers.TopKLogitsWarper(generation_config['top_k'])) return logits_processor + # NOTE: The ordering here is important. Some models have two of these and we have a preference for which value gets used. SEQLEN_KEYS = ['max_sequence_length', 'seq_length', 'max_position_embeddings', 'max_seq_len', 'model_max_length'] + def get_context_length(config: transformers.PretrainedConfig) -> int: rope_scaling = getattr(config, 'rope_scaling', None) rope_scaling_factor = config.rope_scaling['factor'] if rope_scaling else 1.0 for key in SEQLEN_KEYS: if getattr(config, key, None) is not None: return int(rope_scaling_factor * getattr(config, key)) return 2048 + def is_sentence_complete(output: str) -> bool: return output.endswith(('.', '?', '!', '...', '。', '?', '!', '…', '"', "'", '”')) + def is_partial_stop(output: str, stop_str: str) -> bool: '''Check whether the output contains a partial stop str.''' for i in range(0, min(len(output), len(stop_str))): diff --git a/openllm-python/src/openllm/_llm.py b/openllm-python/src/openllm/_llm.py index 4505f05b..954cf4d4 100644 --- a/openllm-python/src/openllm/_llm.py +++ b/openllm-python/src/openllm/_llm.py @@ -49,15 +49,19 @@ else: ResolvedAdaptersMapping = t.Dict[AdapterType, t.Dict[str, t.Tuple['peft.PeftConfig', str]]] logger = logging.getLogger(__name__) + class ModelSignatureDict(t.TypedDict, total=False): batchable: bool batch_dim: t.Union[t.Tuple[int, int], int] input_spec: NotRequired[t.Union[t.Any, t.Tuple[t.Any]]] output_spec: NotRequired[t.Any] + def normalise_model_name(name: str) -> str: return os.path.basename(resolve_filepath(name)) if validate_is_path(name) else re.sub('[^a-zA-Z0-9]+', '-', name) + # the below is similar to peft.utils.other.CONFIG_NAME PEFT_CONFIG_NAME = 'adapter_config.json' + def resolve_peft_config_type(adapter_map: dict[str, str | None]) -> AdaptersMapping: '''Resolve the type of the PeftConfig given the adapter_map. @@ -88,7 +92,9 @@ def resolve_peft_config_type(adapter_map: dict[str, str | None]) -> AdaptersMapp if _peft_type not in resolved: resolved[_peft_type] = () resolved[_peft_type] += (_AdaptersTuple((path_or_adapter_id, resolve_name, resolved_config)),) return resolved + _reserved_namespace = {'config_class', 'model', 'tokenizer', 'import_kwargs'} + class LLMInterface(abc.ABC, t.Generic[M, T]): '''This defines the loose contract for all openllm.LLM implementations.''' @property @@ -241,23 +247,31 @@ class LLMInterface(abc.ABC, t.Generic[M, T]): **attrs: t.Any ) -> None: '''Generated __attrs_init__ for openllm.LLM.''' + _R = t.TypeVar('_R', covariant=True) + class _import_model_wrapper(t.Generic[_R, M, T], t.Protocol): def __call__(self, llm: LLM[M, T], *decls: t.Any, trust_remote_code: bool, **attrs: t.Any) -> _R: ... + class _load_model_wrapper(t.Generic[M, T], t.Protocol): def __call__(self, llm: LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M: ... + class _load_tokenizer_wrapper(t.Generic[M, T], t.Protocol): def __call__(self, llm: LLM[M, T], **attrs: t.Any) -> T: ... + class _llm_post_init_wrapper(t.Generic[M, T], t.Protocol): def __call__(self, llm: LLM[M, T]) -> T: ... + class _save_pretrained_wrapper(t.Generic[M, T], t.Protocol): def __call__(self, llm: LLM[M, T], save_directory: str | pathlib.Path, **attrs: t.Any) -> None: ... + _object_setattr = object.__setattr__ + # NOTE: the following wrapper are a light meta ops for wrapping default params to internal methods implementation. def _wrapped_import_model(f: _import_model_wrapper[bentoml.Model, M, T]) -> t.Callable[[LLM[M, T]], bentoml.Model]: @functools.wraps(f) @@ -269,11 +283,14 @@ def _wrapped_import_model(f: _import_model_wrapper[bentoml.Model, M, T]) -> t.Ca return f(self, *decls, trust_remote_code=trust_remote_code, **attrs) return wrapper + _DEFAULT_TOKENIZER = 'hf-internal-testing/llama-tokenizer' + def get_engine_args(llm: LLM[M, T], tokenizer: str = _DEFAULT_TOKENIZER) -> vllm.EngineArgs: return vllm.EngineArgs( model=llm._bentomodel.path, tokenizer=tokenizer, tokenizer_mode='auto', tensor_parallel_size=1 if device_count() < 2 else device_count(), dtype='auto', worker_use_ray=False ) + def _wrapped_load_model(f: _load_model_wrapper[M, T]) -> t.Callable[[LLM[M, T]], M | vllm.LLMEngine]: @functools.wraps(f) def wrapper(self: LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M | vllm.LLMEngine: @@ -289,12 +306,14 @@ def _wrapped_load_model(f: _load_model_wrapper[M, T]) -> t.Callable[[LLM[M, T]], return f(self, *(*model_decls, *decls), **{**model_attrs, **attrs}) return wrapper + def _wrapped_load_tokenizer(f: _load_tokenizer_wrapper[M, T]) -> t.Callable[[LLM[M, T]], T]: @functools.wraps(f) def wrapper(self: LLM[M, T], **tokenizer_attrs: t.Any) -> T: return f(self, **{**self.llm_parameters[-1], **tokenizer_attrs}) return wrapper + def _wrapped_llm_post_init(f: _llm_post_init_wrapper[M, T]) -> t.Callable[[LLM[M, T]], None]: @functools.wraps(f) def wrapper(self: LLM[M, T]) -> None: @@ -302,6 +321,7 @@ def _wrapped_llm_post_init(f: _llm_post_init_wrapper[M, T]) -> t.Callable[[LLM[M f(self) return wrapper + def _wrapped_save_pretrained(f: _save_pretrained_wrapper[M, T]) -> t.Callable[[LLM[M, T], str | pathlib.Path], None]: @functools.wraps(f) def wrapper(self: LLM[M, T], save_directory: str | pathlib.Path, **attrs: t.Any) -> None: @@ -312,6 +332,7 @@ def _wrapped_save_pretrained(f: _save_pretrained_wrapper[M, T]) -> t.Callable[[L f(self, save_directory, **attrs) return wrapper + def _update_docstring(cls: LLM[M, T], fn: str) -> AnyCallable: # update docstring for given entrypoint original_fn = getattr(cls, fn, getattr(LLMInterface, fn)) @@ -323,6 +344,7 @@ def _update_docstring(cls: LLM[M, T], fn: str) -> AnyCallable: ''' setattr(cls, fn, original_fn) return original_fn + def _make_assignment_script(cls: type[LLM[M, T]]) -> t.Callable[[type[LLM[M, T]]], None]: attributes = { 'import_model': _wrapped_import_model, @@ -361,8 +383,10 @@ def _make_assignment_script(cls: type[LLM[M, T]]) -> t.Callable[[type[LLM[M, T]] lines.extend([_setattr_class(key, f"cls.{fn} is not _cached_LLMInterface_get('{fn}')"), f"__gen_docstring(cls, '{fn}')",]) anns[key] = interface_anns.get(key) return codegen.generate_function(cls, '__assign_llm_attr', lines, args=('cls', *args), globs=globs, annotations=anns) + def vllm_postprocess_generate(self: LLM['vllm.LLMEngine', T], prompt: str, generation_result: list[dict[str, t.Any]], **_: t.Any) -> str: return generation_result[0]['outputs'][0]['text'] + def vllm_generate_iterator( self: LLM['vllm.LLMEngine', T], prompt: str, /, *, echo: bool = False, stop: str | t.Iterable[str] | None = None, stop_token_ids: list[int] | None = None, **attrs: t.Any ) -> t.Iterator[dict[str, t.Any]]: @@ -387,6 +411,7 @@ def vllm_generate_iterator( else: text_outputs = [output.text for output in request_output.outputs] yield {'text': text_outputs, 'error_code': 0} if request_output.finished: break + def vllm_generate(self: LLM['vllm.LLMEngine', T], prompt: str, **attrs: t.Any) -> list[dict[str, t.Any]]: request_id: str = attrs.pop('request_id', None) if request_id is None: raise ValueError('request_id must not be None.') @@ -396,7 +421,9 @@ def vllm_generate(self: LLM['vllm.LLMEngine', T], prompt: str, **attrs: t.Any) - while self.model.has_unfinished_requests(): outputs.extend([r for r in self.model.step() if r.finished]) return [unmarshal_vllm_outputs(i) for i in outputs] + _AdaptersTuple: type[AdaptersTuple] = codegen.make_attr_tuple_class('AdaptersTuple', ['adapter_id', 'name', 'config']) + @attr.define(slots=True, repr=False, init=False) class LLM(LLMInterface[M, T], ReprMixin): if t.TYPE_CHECKING: __name__: str @@ -1140,6 +1167,7 @@ class LLM(LLMInterface[M, T], ReprMixin): del past_key_values, out gc.collect() torch.cuda.empty_cache() + # fmt: off @overload def Runner(model_name: str, *, model_id: str | None = None, model_version: str | None = ..., init_local: t.Literal[False, True] = ..., **attrs: t.Any) -> LLMRunner[t.Any, t.Any]: ... diff --git a/openllm-python/src/openllm/_quantisation.py b/openllm-python/src/openllm/_quantisation.py index da02f3ac..18c2952e 100644 --- a/openllm-python/src/openllm/_quantisation.py +++ b/openllm-python/src/openllm/_quantisation.py @@ -14,12 +14,15 @@ autogptq, torch, transformers = LazyLoader('autogptq', globals(), 'auto_gptq'), logger = logging.getLogger(__name__) QuantiseMode = t.Literal['int8', 'int4', 'gptq'] + @overload def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: t.Literal['int8', 'int4'], **attrs: t.Any) -> tuple[transformers.BitsAndBytesConfig, DictStrAny]: ... + @overload def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: t.Literal['gptq'], **attrs: t.Any) -> tuple[autogptq.BaseQuantizeConfig, DictStrAny]: ... + def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: QuantiseMode, **attrs: t.Any) -> tuple[transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig, DictStrAny]: # 8 bit configuration int8_threshold = attrs.pop('llm_int8_threshhold', 6.0) diff --git a/openllm-python/src/openllm/_service.py b/openllm-python/src/openllm/_service.py index e94605f5..650fdf28 100644 --- a/openllm-python/src/openllm/_service.py +++ b/openllm-python/src/openllm/_service.py @@ -38,17 +38,20 @@ runners: list[AbstractRunner] = [runner] if not runner.supports_embeddings: runners.append(generic_embedding_runner) svc = bentoml.Service(name=f"llm-{llm_config['start_name']}-service", runners=runners) _JsonInput = bentoml.io.JSON.from_sample({'prompt': '', 'llm_config': llm_config.model_dump(flatten=True), 'adapter_name': None}) + @svc.api(route='/v1/generate', input=_JsonInput, output=bentoml.io.JSON.from_sample({'responses': [], 'configuration': llm_config.model_dump(flatten=True)})) async def generate_v1(input_dict: dict[str, t.Any]) -> openllm.GenerationOutput: qa_inputs = openllm.GenerationInput.from_llm_config(llm_config)(**input_dict) config = qa_inputs.llm_config.model_dump() responses = await runner.generate.async_run(qa_inputs.prompt, **{'adapter_name': qa_inputs.adapter_name, **config}) return openllm.GenerationOutput(responses=responses, configuration=config) + @svc.api(route='/v1/generate_stream', input=_JsonInput, output=bentoml.io.Text(content_type='text/event-stream')) async def generate_stream_v1(input_dict: dict[str, t.Any]) -> t.AsyncGenerator[str, None]: echo = input_dict.pop('echo', False) qa_inputs = openllm.GenerationInput.from_llm_config(llm_config)(**input_dict) return runner.generate_iterator.async_stream(qa_inputs.prompt, adapter_name=qa_inputs.adapter_name, echo=echo, **qa_inputs.llm_config.model_dump()) + @svc.api( route='/v1/metadata', input=bentoml.io.Text(), @@ -72,6 +75,7 @@ def metadata_v1(_: str) -> openllm.MetadataOutput: supports_embeddings=runner.supports_embeddings, supports_hf_agent=runner.supports_hf_agent ) + @svc.api( route='/v1/embeddings', input=bentoml.io.JSON.from_sample(['Hey Jude, welcome to the jungle!', 'What is the meaning of life?']), @@ -111,6 +115,7 @@ async def embeddings_v1(phrases: list[str]) -> openllm.EmbeddingsOutput: embed_call: _EmbeddingMethod = runner.embeddings if runner.supports_embeddings else generic_embedding_runner.encode # type: ignore[type-arg,assignment,valid-type] responses = (await embed_call.async_run(phrases))[0] return openllm.EmbeddingsOutput(embeddings=responses['embeddings'], num_tokens=responses['num_tokens']) + if runner.supports_hf_agent and openllm.utils.is_transformers_supports_agent(): async def hf_agent(request: Request) -> Response: @@ -127,11 +132,13 @@ if runner.supports_hf_agent and openllm.utils.is_transformers_supports_agent(): hf_app = Starlette(debug=True, routes=[Route('/agent', hf_agent, methods=['POST'])]) svc.mount_asgi_app(hf_app, path='/hf') + # general metadata app async def list_adapter_v1(_: Request) -> Response: res: dict[str, t.Any] = {} if runner.peft_adapters['success'] is True: res['result'] = {k: v.to_dict() for k, v in runner.peft_adapters['result'].items()} res.update({'success': runner.peft_adapters['success'], 'error_msg': runner.peft_adapters['error_msg']}) return JSONResponse(res, status_code=200) + adapters_app_v1 = Starlette(debug=True, routes=[Route('/adapters', list_adapter_v1, methods=['GET'])]) svc.mount_asgi_app(adapters_app_v1, path='/v1') diff --git a/openllm-python/src/openllm/bundle/_package.py b/openllm-python/src/openllm/bundle/_package.py index fcfe317a..3e0fd119 100644 --- a/openllm-python/src/openllm/bundle/_package.py +++ b/openllm-python/src/openllm/bundle/_package.py @@ -30,6 +30,7 @@ if t.TYPE_CHECKING: logger = logging.getLogger(__name__) OPENLLM_DEV_BUILD = 'OPENLLM_DEV_BUILD' + def build_editable(path: str, package: t.Literal['openllm', 'openllm_core', 'openllm_client'] = 'openllm') -> str | None: '''Build OpenLLM if the OPENLLM_DEV_BUILD environment variable is set.''' if str(os.environ.get(OPENLLM_DEV_BUILD, False)).lower() != 'true': return None @@ -48,6 +49,7 @@ def build_editable(path: str, package: t.Literal['openllm', 'openllm_core', 'ope env.install(builder.build_system_requires) return builder.build('wheel', path, config_settings={'--global-option': '--quiet'}) raise RuntimeError('Custom OpenLLM build is currently not supported. Please install OpenLLM from PyPI or built it from Git source.') + def construct_python_options(llm: openllm.LLM[t.Any, t.Any], llm_fs: FS, extra_dependencies: tuple[str, ...] | None = None, adapter_map: dict[str, str | None] | None = None,) -> PythonOptions: packages = ['openllm', 'scipy'] # apparently bnb misses this one if adapter_map is not None: packages += ['openllm[fine-tune]'] @@ -100,6 +102,7 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any], llm_fs: FS, extra_d ] if all(i for i in built_wheels): wheels.extend([llm_fs.getsyspath(f"/{i.split('/')[-1]}") for i in t.cast(t.List[str], built_wheels)]) return PythonOptions(packages=packages, wheels=wheels, lock_packages=False, extra_index_url=['https://download.pytorch.org/whl/cu118']) + def construct_docker_options( llm: openllm.LLM[t.Any, t.Any], _: FS, @@ -137,8 +140,10 @@ def construct_docker_options( if _env['quantize_value'] is not None: env_dict[_env.quantize] = t.cast(str, _env['quantize_value']) env_dict[_env.runtime] = _env['runtime_value'] return DockerOptions(base_image=f'{oci.CONTAINER_NAMES[container_registry]}:{oci.get_base_container_tag(container_version_strategy)}', env=env_dict, dockerfile_template=dockerfile_template) + OPENLLM_MODEL_NAME = '# openllm: model name' OPENLLM_MODEL_ADAPTER_MAP = '# openllm: model adapter map' + class ModelNameFormatter(string.Formatter): model_keyword: LiteralString = '__model_name__' @@ -156,11 +161,15 @@ class ModelNameFormatter(string.Formatter): return True except ValueError: return False + class ModelIdFormatter(ModelNameFormatter): model_keyword: LiteralString = '__model_id__' + class ModelAdapterMapFormatter(ModelNameFormatter): model_keyword: LiteralString = '__model_adapter_map__' + _service_file = Path(os.path.abspath(__file__)).parent.parent / '_service.py' + def write_service(llm: openllm.LLM[t.Any, t.Any], adapter_map: dict[str, str | None] | None, llm_fs: FS) -> None: from openllm_core.utils import DEBUG model_name = llm.config['model_name'] @@ -174,6 +183,7 @@ def write_service(llm: openllm.LLM[t.Any, t.Any], adapter_map: dict[str, str | N script = f"# GENERATED BY 'openllm build {model_name}'. DO NOT EDIT\n\n" + ''.join(src_contents) if DEBUG: logger.info('Generated script:\n%s', script) llm_fs.writetext(llm.config['service_name'], script) + @inject def create_bento( bento_tag: bentoml.Tag, diff --git a/openllm-python/src/openllm/bundle/oci/__init__.py b/openllm-python/src/openllm/bundle/oci/__init__.py index bcf91a2d..328c1b48 100644 --- a/openllm-python/src/openllm/bundle/oci/__init__.py +++ b/openllm-python/src/openllm/bundle/oci/__init__.py @@ -40,17 +40,23 @@ _OWNER = 'bentoml' _REPO = 'openllm' _module_location = openllm_core.utils.pkg.source_locations('openllm') + @functools.lru_cache @openllm_core.utils.apply(str.lower) def get_base_container_name(reg: LiteralContainerRegistry) -> str: return _CONTAINER_REGISTRY[reg] + def _convert_version_from_string(s: str) -> VersionInfo: return VersionInfo.from_version_string(s) + def _commit_time_range(r: int = 5) -> str: return (datetime.now(timezone.utc) - timedelta(days=r)).strftime('%Y-%m-%dT%H:%M:%SZ') + class VersionNotSupported(openllm.exceptions.OpenLLMException): """Raised when the stable release is too low that it doesn't include OpenLLM base container.""" + _RefTuple: type[RefTuple] = openllm_core.utils.codegen.make_attr_tuple_class('_RefTuple', ['git_hash', 'version', 'strategy']) + def nightly_resolver(cls: type[RefResolver]) -> str: # NOTE: all openllm container will have sha- # This will use docker to run skopeo to determine the correct latest tag that is available @@ -64,6 +70,7 @@ def nightly_resolver(cls: type[RefResolver]) -> str: return next(f'sha-{it["sha"][:7]}' for it in commits if '[skip ci]' not in it['commit']['message']) # now is the correct behaviour return orjson.loads(subprocess.check_output([docker_bin, 'run', '--rm', '-it', 'quay.io/skopeo/stable:latest', 'list-tags', 'docker://ghcr.io/bentoml/openllm']).decode().strip())['Tags'][-2] + @attr.attrs(eq=False, order=False, slots=True, frozen=True) class RefResolver: git_hash: str = attr.field() @@ -108,9 +115,11 @@ class RefResolver: if self.strategy == 'latest': return 'latest' elif self.strategy == 'nightly': return self.git_hash else: return repr(self.version) + @functools.lru_cache(maxsize=256) def get_base_container_tag(strategy: LiteralContainerVersionStrategy | None = None) -> str: return RefResolver.from_strategy(strategy).tag + def build_container( registries: LiteralContainerRegistry | t.Sequence[LiteralContainerRegistry] | None = None, version_strategy: LiteralContainerVersionStrategy = 'release', @@ -146,13 +155,16 @@ def build_container( except Exception as err: raise openllm.exceptions.OpenLLMException(f'Failed to containerize base container images (Scroll up to see error above, or set OPENLLMDEVDEBUG=True for more traceback):\n{err}') from err return tags + if t.TYPE_CHECKING: CONTAINER_NAMES: dict[LiteralContainerRegistry, str] supported_registries: list[str] __all__ = ['CONTAINER_NAMES', 'get_base_container_tag', 'build_container', 'get_base_container_name', 'supported_registries', 'RefResolver'] + def __dir__() -> list[str]: return sorted(__all__) + def __getattr__(name: str) -> t.Any: if name == 'supported_registries': return functools.lru_cache(1)(lambda: list(_CONTAINER_REGISTRY))() elif name == 'CONTAINER_NAMES': return _CONTAINER_REGISTRY diff --git a/openllm-python/src/openllm/cli/_factory.py b/openllm-python/src/openllm/cli/_factory.py index 729e8992..f3fc3b07 100644 --- a/openllm-python/src/openllm/cli/_factory.py +++ b/openllm-python/src/openllm/cli/_factory.py @@ -31,10 +31,13 @@ LiteralOutput = t.Literal['json', 'pretty', 'porcelain'] _AnyCallable = t.Callable[..., t.Any] FC = t.TypeVar('FC', bound=t.Union[_AnyCallable, click.Command]) + def bento_complete_envvar(ctx: click.Context, param: click.Parameter, incomplete: str) -> list[sc.CompletionItem]: return [sc.CompletionItem(str(it.tag), help='Bento') for it in bentoml.list() if str(it.tag).startswith(incomplete) and all(k in it.info.labels for k in {'start_name', 'bundler'})] + def model_complete_envvar(ctx: click.Context, param: click.Parameter, incomplete: str) -> list[sc.CompletionItem]: return [sc.CompletionItem(inflection.dasherize(it), help='Model') for it in openllm.CONFIG_MAPPING if it.startswith(incomplete)] + def parse_config_options(config: LLMConfig, server_timeout: int, workers_per_resource: float, device: t.Tuple[str, ...] | None, cors: bool, environ: DictStrAny) -> DictStrAny: # TODO: Support amd.com/gpu on k8s _bentoml_config_options_env = environ.pop('BENTOML_CONFIG_OPTIONS', '') @@ -55,7 +58,9 @@ def parse_config_options(config: LLMConfig, server_timeout: int, workers_per_res environ['BENTOML_CONFIG_OPTIONS'] = _bentoml_config_options_env if DEBUG: logger.debug('Setting BENTOML_CONFIG_OPTIONS=%s', _bentoml_config_options_env) return environ + _adapter_mapping_key = 'adapter_map' + def _id_callback(ctx: click.Context, _: click.Parameter, value: t.Tuple[str, ...] | None) -> None: if not value: return None if _adapter_mapping_key not in ctx.params: ctx.params[_adapter_mapping_key] = {} @@ -69,6 +74,7 @@ def _id_callback(ctx: click.Context, _: click.Parameter, value: t.Tuple[str, ... pass ctx.params[_adapter_mapping_key][adapter_id] = adapter_name[0] if len(adapter_name) > 0 else None return None + def start_command_factory(group: click.Group, model: str, _context_settings: DictStrAny | None = None, _serve_grpc: bool = False) -> click.Command: llm_config = openllm.AutoConfig.for_model(model) command_attrs: DictStrAny = dict( @@ -212,6 +218,7 @@ Available official model_id(s): [default: {llm_config['default_id']}] return config return start_cmd + def noop_command(group: click.Group, llm_config: LLMConfig, _serve_grpc: bool, **command_attrs: t.Any) -> click.Command: context_settings = command_attrs.pop('context_settings', {}) context_settings.update({'ignore_unknown_options': True, 'allow_extra_args': True}) @@ -224,6 +231,7 @@ def noop_command(group: click.Group, llm_config: LLMConfig, _serve_grpc: bool, * return llm_config return noop + def prerequisite_check(ctx: click.Context, llm_config: LLMConfig, quantize: LiteralString | None, adapter_map: dict[str, str | None] | None, num_workers: int) -> None: if adapter_map and not openllm.utils.is_peft_available(): ctx.fail("Using adapter requires 'peft' to be available. Make sure to install with 'pip install \"openllm[fine-tune]\"'") if quantize and llm_config.default_implementation() == 'vllm': @@ -232,6 +240,7 @@ def prerequisite_check(ctx: click.Context, llm_config: LLMConfig, quantize: Lite if requirements is not None and len(requirements) > 0: missing_requirements = [i for i in requirements if importlib.util.find_spec(inflection.underscore(i)) is None] if len(missing_requirements) > 0: termui.echo(f'Make sure to have the following dependencies available: {missing_requirements}', fg='yellow') + def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callable[[FC], t.Callable[[FC], FC]]: def wrapper(fn: FC) -> t.Callable[[FC], FC]: composed = openllm.utils.compose( @@ -301,6 +310,7 @@ def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callab return composed(fn) return wrapper + def parse_device_callback(ctx: click.Context, param: click.Parameter, value: tuple[tuple[str], ...] | None) -> t.Tuple[str, ...] | None: if value is None: return value if not isinstance(value, tuple): ctx.fail(f'{param} only accept multiple values, not {type(value)} (value: {value})') @@ -308,10 +318,12 @@ def parse_device_callback(ctx: click.Context, param: click.Parameter, value: tup # NOTE: --device all is a special case if len(el) == 1 and el[0] == 'all': return tuple(map(str, openllm.utils.available_devices())) return el + # NOTE: A list of bentoml option that is not needed for parsing. # NOTE: User shouldn't set '--working-dir', as OpenLLM will setup this. # NOTE: production is also deprecated _IGNORED_OPTIONS = {'working_dir', 'production', 'protocol_version'} + def parse_serve_args(serve_grpc: bool) -> t.Callable[[t.Callable[..., LLMConfig]], t.Callable[[FC], FC]]: '''Parsing `bentoml serve|serve-grpc` click.Option to be parsed via `openllm start`.''' from bentoml_cli.cli import cli @@ -339,7 +351,9 @@ def parse_serve_args(serve_grpc: bool) -> t.Callable[[t.Callable[..., LLMConfig] return group(f) return decorator + _http_server_args, _grpc_server_args = parse_serve_args(False), parse_serve_args(True) + def _click_factory_type(*param_decls: t.Any, **attrs: t.Any) -> t.Callable[[FC | None], FC]: '''General ``@click`` decorator with some sauce. @@ -356,8 +370,10 @@ def _click_factory_type(*param_decls: t.Any, **attrs: t.Any) -> t.Callable[[FC | return t.cast(FC, callback(*param_decls, **attrs)(f) if f is not None else callback(*param_decls, **attrs)) return decorator + cli_option = functools.partial(_click_factory_type, attr='option') cli_argument = functools.partial(_click_factory_type, attr='argument') + def output_option(f: _AnyCallable | None = None, *, default_value: LiteralOutput = 'pretty', **attrs: t.Any) -> t.Callable[[FC], FC]: output = ['json', 'pretty', 'porcelain'] @@ -377,6 +393,7 @@ def output_option(f: _AnyCallable | None = None, *, default_value: LiteralOutput shell_complete=complete_output_var, **attrs )(f) + def fast_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]: return cli_option( '--fast/--no-fast', @@ -390,10 +407,13 @@ def fast_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC ''', **attrs )(f) + def cors_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]: return cli_option('--cors/--no-cors', show_default=True, default=False, envvar='OPENLLM_CORS', show_envvar=True, help='Enable CORS for the server.', **attrs)(f) + def machine_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]: return cli_option('--machine', is_flag=True, default=False, hidden=True, **attrs)(f) + def model_id_option(f: _AnyCallable | None = None, *, model_env: openllm.utils.EnvVarMixin | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]: return cli_option( '--model-id', @@ -404,10 +424,13 @@ def model_id_option(f: _AnyCallable | None = None, *, model_env: openllm.utils.E help='Optional model_id name or path for (fine-tune) weight.', **attrs )(f) + def model_version_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]: return cli_option('--model-version', type=click.STRING, default=None, help='Optional model version to save for this model. It will be inferred automatically from model-id.', **attrs)(f) + def model_name_argument(f: _AnyCallable | None = None, required: bool = True, **attrs: t.Any) -> t.Callable[[FC], FC]: return cli_argument('model_name', type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING]), required=required, **attrs)(f) + def quantize_option(f: _AnyCallable | None = None, *, build: bool = False, model_env: openllm.utils.EnvVarMixin | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]: return cli_option( '--quantise', @@ -433,6 +456,7 @@ def quantize_option(f: _AnyCallable | None = None, *, build: bool = False, model > [!NOTE] that quantization are currently only available in *PyTorch* models.''', **attrs )(f) + def workers_per_resource_option(f: _AnyCallable | None = None, *, build: bool = False, **attrs: t.Any) -> t.Callable[[FC], FC]: return cli_option( '--workers-per-resource', @@ -458,6 +482,7 @@ def workers_per_resource_option(f: _AnyCallable | None = None, *, build: bool = ), **attrs )(f) + def bettertransformer_option(f: _AnyCallable | None = None, *, build: bool = False, model_env: openllm.utils.EnvVarMixin | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]: return cli_option( '--bettertransformer', @@ -469,6 +494,7 @@ def bettertransformer_option(f: _AnyCallable | None = None, *, build: bool = Fal if not build else 'Set default environment variable whether to serve this model with FasterTransformer in build time.', **attrs )(f) + def serialisation_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]: return cli_option( '--serialisation', @@ -498,6 +524,7 @@ def serialisation_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Cal ''', **attrs )(f) + def container_registry_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]: return cli_option( '--container-registry', @@ -517,7 +544,9 @@ def container_registry_option(f: _AnyCallable | None = None, **attrs: t.Any) -> ''', **attrs )(f) + _wpr_strategies = {'round_robin', 'conserved'} + def workers_per_resource_callback(ctx: click.Context, param: click.Parameter, value: str | None) -> str | None: if value is None: return value value = inflection.underscore(value) @@ -529,6 +558,7 @@ def workers_per_resource_callback(ctx: click.Context, param: click.Parameter, va raise click.BadParameter(f"'workers_per_resource' only accept '{_wpr_strategies}' as possible strategies, otherwise pass in float.", ctx, param) from None else: return value + def container_registry_callback(ctx: click.Context, param: click.Parameter, value: str | None) -> str | None: if value is None: return value if value not in openllm.bundle.supported_registries: raise click.BadParameter(f'Value must be one of {openllm.bundle.supported_registries}', ctx, param) diff --git a/openllm-python/src/openllm/cli/_sdk.py b/openllm-python/src/openllm/cli/_sdk.py index d32cbc67..3dedf91f 100644 --- a/openllm-python/src/openllm/cli/_sdk.py +++ b/openllm-python/src/openllm/cli/_sdk.py @@ -22,6 +22,7 @@ if t.TYPE_CHECKING: from openllm_core._configuration import LLMConfig from openllm_core._typing_compat import LiteralContainerRegistry, LiteralContainerVersionStrategy, LiteralRuntime, LiteralString logger = logging.getLogger(__name__) + def _start( model_name: str, /, @@ -108,6 +109,7 @@ def _start( return start_command_factory(start_command if not _serve_grpc else start_grpc_command, model_name, _context_settings=termui.CONTEXT_SETTINGS, _serve_grpc=_serve_grpc).main( args=args if len(args) > 0 else None, standalone_mode=False ) + @inject def _build( model_name: str, @@ -213,6 +215,7 @@ def _build( if matched is None: raise ValueError(f"Failed to find tag from output: {output.decode('utf-8').strip()}\nNote: Output from 'openllm build' might not be correct. Please open an issue on GitHub.") return bentoml.get(matched.group(1), _bento_store=bento_store) + def _import_model( model_name: str, /, @@ -262,6 +265,7 @@ def _import_model( if additional_args is not None: args.extend(additional_args) if quantize is not None: args.extend(['--quantize', quantize]) return import_command.main(args=args, standalone_mode=False) + def _list_models() -> dict[str, t.Any]: '''List all available models within the local store.''' from .entrypoint import models_command diff --git a/openllm-python/src/openllm/cli/extension/build_base_container.py b/openllm-python/src/openllm/cli/extension/build_base_container.py index c2a9af60..5ced51fd 100644 --- a/openllm-python/src/openllm/cli/extension/build_base_container.py +++ b/openllm-python/src/openllm/cli/extension/build_base_container.py @@ -8,6 +8,7 @@ import openllm from openllm.cli import termui from openllm.cli._factory import container_registry_option, machine_option if t.TYPE_CHECKING: from openllm_core._typing_compat import LiteralContainerRegistry, LiteralContainerVersionStrategy + @click.command( 'build_base_container', context_settings=termui.CONTEXT_SETTINGS, diff --git a/openllm-python/src/openllm/cli/extension/dive_bentos.py b/openllm-python/src/openllm/cli/extension/dive_bentos.py index 3cf4ea31..8c59f667 100644 --- a/openllm-python/src/openllm/cli/extension/dive_bentos.py +++ b/openllm-python/src/openllm/cli/extension/dive_bentos.py @@ -12,6 +12,7 @@ from bentoml._internal.configuration.containers import BentoMLContainer from openllm.cli import termui from openllm.cli._factory import bento_complete_envvar, machine_option if t.TYPE_CHECKING: from bentoml._internal.bento import BentoStore + @click.command('dive_bentos', context_settings=termui.CONTEXT_SETTINGS) @click.argument('bento', type=str, shell_complete=bento_complete_envvar) @machine_option diff --git a/openllm-python/src/openllm/cli/extension/get_containerfile.py b/openllm-python/src/openllm/cli/extension/get_containerfile.py index d6683844..0e0fb5a8 100644 --- a/openllm-python/src/openllm/cli/extension/get_containerfile.py +++ b/openllm-python/src/openllm/cli/extension/get_containerfile.py @@ -13,6 +13,7 @@ from openllm.cli import termui from openllm.cli._factory import bento_complete_envvar from openllm_core.utils import bentoml_cattr if t.TYPE_CHECKING: from bentoml._internal.bento import BentoStore + @click.command('get_containerfile', context_settings=termui.CONTEXT_SETTINGS, help='Return Containerfile of any given Bento.') @click.argument('bento', type=str, shell_complete=bento_complete_envvar) @click.pass_context diff --git a/openllm-python/src/openllm/cli/extension/get_prompt.py b/openllm-python/src/openllm/cli/extension/get_prompt.py index 995d8ac1..ef8d944c 100644 --- a/openllm-python/src/openllm/cli/extension/get_prompt.py +++ b/openllm-python/src/openllm/cli/extension/get_prompt.py @@ -11,6 +11,7 @@ from openllm.cli import termui from openllm.cli._factory import machine_option, model_complete_envvar, output_option from openllm_core._prompt import process_prompt LiteralOutput = t.Literal['json', 'pretty', 'porcelain'] + @click.command('get_prompt', context_settings=termui.CONTEXT_SETTINGS) @click.argument('model_name', type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING.keys()]), shell_complete=model_complete_envvar) @click.argument('prompt', type=click.STRING) diff --git a/openllm-python/src/openllm/cli/extension/list_bentos.py b/openllm-python/src/openllm/cli/extension/list_bentos.py index fe8c832c..e5320451 100644 --- a/openllm-python/src/openllm/cli/extension/list_bentos.py +++ b/openllm-python/src/openllm/cli/extension/list_bentos.py @@ -9,6 +9,7 @@ import openllm from bentoml._internal.utils import human_readable_size from openllm.cli import termui from openllm.cli._factory import LiteralOutput, output_option + @click.command('list_bentos', context_settings=termui.CONTEXT_SETTINGS) @output_option(default_value='json') @click.pass_context diff --git a/openllm-python/src/openllm/cli/extension/list_models.py b/openllm-python/src/openllm/cli/extension/list_models.py index b5ab145a..5ec7814b 100644 --- a/openllm-python/src/openllm/cli/extension/list_models.py +++ b/openllm-python/src/openllm/cli/extension/list_models.py @@ -11,6 +11,7 @@ from bentoml._internal.utils import human_readable_size from openllm.cli import termui from openllm.cli._factory import LiteralOutput, model_complete_envvar, model_name_argument, output_option if t.TYPE_CHECKING: from openllm_core._typing_compat import DictStrAny + @click.command('list_models', context_settings=termui.CONTEXT_SETTINGS) @model_name_argument(required=False, shell_complete=model_complete_envvar) @output_option(default_value='json') diff --git a/openllm-python/src/openllm/cli/extension/playground.py b/openllm-python/src/openllm/cli/extension/playground.py index afccb5f0..80d74d33 100644 --- a/openllm-python/src/openllm/cli/extension/playground.py +++ b/openllm-python/src/openllm/cli/extension/playground.py @@ -20,11 +20,13 @@ if t.TYPE_CHECKING: from openllm_core._typing_compat import DictStrAny logger = logging.getLogger(__name__) + def load_notebook_metadata() -> DictStrAny: with open(os.path.join(os.path.dirname(playground.__file__), '_meta.yml'), 'r') as f: content = yaml.safe_load(f) if not all('description' in k for k in content.values()): raise ValueError("Invalid metadata file. All entries must have a 'description' key.") return content + @click.command('playground', context_settings=termui.CONTEXT_SETTINGS) @click.argument('output-dir', default=None, required=False) @click.option('--port', envvar='JUPYTER_PORT', show_envvar=True, show_default=True, default=8888, help='Default port for Jupyter server') diff --git a/openllm-python/src/openllm/cli/termui.py b/openllm-python/src/openllm/cli/termui.py index 0ef9891e..90c396b4 100644 --- a/openllm-python/src/openllm/cli/termui.py +++ b/openllm-python/src/openllm/cli/termui.py @@ -7,9 +7,11 @@ import inflection import openllm if t.TYPE_CHECKING: from openllm_core._typing_compat import DictStrAny + def echo(text: t.Any, fg: str = 'green', _with_style: bool = True, **attrs: t.Any) -> None: attrs['fg'] = fg if not openllm.utils.get_debug_mode() else None if not openllm.utils.get_quiet_mode(): t.cast(t.Callable[..., None], click.echo if not _with_style else click.secho)(text, **attrs) + COLUMNS: int = int(os.environ.get('COLUMNS', str(120))) CONTEXT_SETTINGS: DictStrAny = {'help_option_names': ['-h', '--help'], 'max_content_width': COLUMNS, 'token_normalize_func': inflection.underscore} __all__ = ['echo', 'COLUMNS', 'CONTEXT_SETTINGS'] diff --git a/openllm-python/src/openllm/client.py b/openllm-python/src/openllm/client.py index ee428f92..b3339ea0 100644 --- a/openllm-python/src/openllm/client.py +++ b/openllm-python/src/openllm/client.py @@ -15,7 +15,9 @@ import typing as t import openllm_client if t.TYPE_CHECKING: from openllm_client import AsyncHTTPClient as AsyncHTTPClient, BaseAsyncClient as BaseAsyncClient, BaseClient as BaseClient, HTTPClient as HTTPClient, GrpcClient as GrpcClient, AsyncGrpcClient as AsyncGrpcClient + def __dir__() -> t.Sequence[str]: return sorted(dir(openllm_client)) + def __getattr__(it: str) -> t.Any: return getattr(openllm_client, it) diff --git a/openllm-python/src/openllm/models/auto/factory.py b/openllm-python/src/openllm/models/auto/factory.py index e7a45f1e..36d9bd9b 100644 --- a/openllm-python/src/openllm/models/auto/factory.py +++ b/openllm-python/src/openllm/models/auto/factory.py @@ -22,6 +22,7 @@ if t.TYPE_CHECKING: ConfigModelItemsView = _odict_items[type[openllm.LLMConfig], type[openllm.LLM[t.Any, t.Any]]] logger = logging.getLogger(__name__) + class BaseAutoLLMClass: _model_mapping: t.ClassVar[_LazyAutoMapping] @@ -81,6 +82,7 @@ class BaseAutoLLMClass: raise ValueError( f"Unrecognized configuration class ({config_class}) for {name}. Model name should be one of {', '.join(openllm.CONFIG_MAPPING.keys())} (Registered configuration class: {', '.join([i.__name__ for i in cls._model_mapping.keys()])})." ) + def getattribute_from_module(module: types.ModuleType, attr: t.Any) -> t.Any: if attr is None: return if isinstance(attr, tuple): return tuple(getattribute_from_module(module, a) for a in attr) @@ -93,6 +95,7 @@ def getattribute_from_module(module: types.ModuleType, attr: t.Any) -> t.Any: except ValueError: raise ValueError(f'Could not find {attr} neither in {module} nor in {openllm_module}!') from None raise ValueError(f'Could not find {attr} in {openllm_module}!') + class _LazyAutoMapping(OrderedDict, ReprMixin): """Based on transformers.models.auto.configuration_auto._LazyAutoMapping. @@ -168,4 +171,5 @@ class _LazyAutoMapping(OrderedDict, ReprMixin): if hasattr(key, '__name__') and key.__name__ in self._reverse_config_mapping: if self._reverse_config_mapping[key.__name__] in self._model_mapping.keys(): raise ValueError(f"'{key}' is already used by a OpenLLM model.") self._extra_content[key] = value + __all__ = ['BaseAutoLLMClass', '_LazyAutoMapping'] diff --git a/openllm-python/src/openllm/models/auto/modeling_auto.py b/openllm-python/src/openllm/models/auto/modeling_auto.py index 7740aba2..b28564d2 100644 --- a/openllm-python/src/openllm/models/auto/modeling_auto.py +++ b/openllm-python/src/openllm/models/auto/modeling_auto.py @@ -9,5 +9,6 @@ MODEL_MAPPING_NAMES = OrderedDict([('chatglm', 'ChatGLM'), ('dolly_v2', 'DollyV2 'opt', 'OPT' ), ('stablelm', 'StableLM'), ('starcoder', 'StarCoder'), ('baichuan', 'Baichuan')]) MODEL_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_MAPPING_NAMES) + class AutoLLM(BaseAutoLLMClass): _model_mapping: t.ClassVar = MODEL_MAPPING diff --git a/openllm-python/src/openllm/models/auto/modeling_flax_auto.py b/openllm-python/src/openllm/models/auto/modeling_flax_auto.py index 0341aea3..afd236b2 100644 --- a/openllm-python/src/openllm/models/auto/modeling_flax_auto.py +++ b/openllm-python/src/openllm/models/auto/modeling_flax_auto.py @@ -7,5 +7,6 @@ from openllm_core.config import CONFIG_MAPPING_NAMES from .factory import BaseAutoLLMClass, _LazyAutoMapping MODEL_FLAX_MAPPING_NAMES = OrderedDict([('flan_t5', 'FlaxFlanT5'), ('opt', 'FlaxOPT')]) MODEL_FLAX_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FLAX_MAPPING_NAMES) + class AutoFlaxLLM(BaseAutoLLMClass): _model_mapping: t.ClassVar = MODEL_FLAX_MAPPING diff --git a/openllm-python/src/openllm/models/auto/modeling_tf_auto.py b/openllm-python/src/openllm/models/auto/modeling_tf_auto.py index c1b92529..b7cf02c0 100644 --- a/openllm-python/src/openllm/models/auto/modeling_tf_auto.py +++ b/openllm-python/src/openllm/models/auto/modeling_tf_auto.py @@ -7,5 +7,6 @@ from openllm_core.config import CONFIG_MAPPING_NAMES from .factory import BaseAutoLLMClass, _LazyAutoMapping MODEL_TF_MAPPING_NAMES = OrderedDict([('flan_t5', 'TFFlanT5'), ('opt', 'TFOPT')]) MODEL_TF_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_TF_MAPPING_NAMES) + class AutoTFLLM(BaseAutoLLMClass): _model_mapping: t.ClassVar = MODEL_TF_MAPPING diff --git a/openllm-python/src/openllm/models/auto/modeling_vllm_auto.py b/openllm-python/src/openllm/models/auto/modeling_vllm_auto.py index 37c7310a..2e387898 100644 --- a/openllm-python/src/openllm/models/auto/modeling_vllm_auto.py +++ b/openllm-python/src/openllm/models/auto/modeling_vllm_auto.py @@ -9,5 +9,6 @@ MODEL_VLLM_MAPPING_NAMES = OrderedDict([('baichuan', 'VLLMBaichuan'), ('dolly_v2 'opt', 'VLLMOPT' ), ('stablelm', 'VLLMStableLM'), ('starcoder', 'VLLMStarCoder'), ('llama', 'VLLMLlama')]) MODEL_VLLM_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_VLLM_MAPPING_NAMES) + class AutoVLLM(BaseAutoLLMClass): _model_mapping: t.ClassVar = MODEL_VLLM_MAPPING diff --git a/openllm-python/src/openllm/models/baichuan/modeling_baichuan.py b/openllm-python/src/openllm/models/baichuan/modeling_baichuan.py index d349a658..75a52794 100644 --- a/openllm-python/src/openllm/models/baichuan/modeling_baichuan.py +++ b/openllm-python/src/openllm/models/baichuan/modeling_baichuan.py @@ -3,6 +3,7 @@ import typing as t import openllm if t.TYPE_CHECKING: import transformers + class Baichuan(openllm.LLM['transformers.PreTrainedModel', 'transformers.PreTrainedTokenizerBase']): __openllm_internal__ = True diff --git a/openllm-python/src/openllm/models/baichuan/modeling_vllm_baichuan.py b/openllm-python/src/openllm/models/baichuan/modeling_vllm_baichuan.py index e9c5a134..44ea3d2e 100644 --- a/openllm-python/src/openllm/models/baichuan/modeling_vllm_baichuan.py +++ b/openllm-python/src/openllm/models/baichuan/modeling_vllm_baichuan.py @@ -3,6 +3,7 @@ import typing as t import openllm if t.TYPE_CHECKING: import vllm, transformers + class VLLMBaichuan(openllm.LLM['vllm.LLMEngine', 'transformers.PreTrainedTokenizerBase']): __openllm_internal__ = True tokenizer_id = 'local' diff --git a/openllm-python/src/openllm/models/chatglm/modeling_chatglm.py b/openllm-python/src/openllm/models/chatglm/modeling_chatglm.py index e76ef17f..52c05ccd 100644 --- a/openllm-python/src/openllm/models/chatglm/modeling_chatglm.py +++ b/openllm-python/src/openllm/models/chatglm/modeling_chatglm.py @@ -3,6 +3,7 @@ import typing as t import openllm if t.TYPE_CHECKING: import transformers + class ChatGLM(openllm.LLM['transformers.PreTrainedModel', 'transformers.PreTrainedTokenizerFast']): __openllm_internal__ = True diff --git a/openllm-python/src/openllm/models/dolly_v2/modeling_dolly_v2.py b/openllm-python/src/openllm/models/dolly_v2/modeling_dolly_v2.py index d930c85b..868e6722 100644 --- a/openllm-python/src/openllm/models/dolly_v2/modeling_dolly_v2.py +++ b/openllm-python/src/openllm/models/dolly_v2/modeling_dolly_v2.py @@ -9,12 +9,15 @@ from openllm_core.config.configuration_dolly_v2 import DEFAULT_PROMPT_TEMPLATE, if t.TYPE_CHECKING: import torch, transformers, tensorflow as tf else: torch, transformers, tf = openllm.utils.LazyLoader('torch', globals(), 'torch'), openllm.utils.LazyLoader('transformers', globals(), 'transformers'), openllm.utils.LazyLoader('tf', globals(), 'tensorflow') logger = logging.getLogger(__name__) + @overload def get_pipeline(model: transformers.PreTrainedModel, tokenizer: transformers.PreTrainedTokenizer, _init: t.Literal[True] = True, **attrs: t.Any) -> transformers.Pipeline: ... + @overload def get_pipeline(model: transformers.PreTrainedModel, tokenizer: transformers.PreTrainedTokenizer, _init: t.Literal[False] = ..., **attrs: t.Any) -> type[transformers.Pipeline]: ... + def get_pipeline(model: transformers.PreTrainedModel, tokenizer: transformers.PreTrainedTokenizer, _init: bool = False, **attrs: t.Any) -> type[transformers.Pipeline] | transformers.Pipeline: # Lazy loading the pipeline. See databricks' implementation on HuggingFace for more information. class InstructionTextGenerationPipeline(transformers.Pipeline): @@ -115,6 +118,7 @@ def get_pipeline(model: transformers.PreTrainedModel, tokenizer: transformers.Pr return records return InstructionTextGenerationPipeline() if _init else InstructionTextGenerationPipeline + class DollyV2(openllm.LLM['transformers.Pipeline', 'transformers.PreTrainedTokenizer']): __openllm_internal__ = True diff --git a/openllm-python/src/openllm/models/dolly_v2/modeling_vllm_dolly_v2.py b/openllm-python/src/openllm/models/dolly_v2/modeling_vllm_dolly_v2.py index f22f79d6..8d34a2d1 100644 --- a/openllm-python/src/openllm/models/dolly_v2/modeling_vllm_dolly_v2.py +++ b/openllm-python/src/openllm/models/dolly_v2/modeling_vllm_dolly_v2.py @@ -6,6 +6,7 @@ import openllm if t.TYPE_CHECKING: import vllm, transformers logger = logging.getLogger(__name__) + class VLLMDollyV2(openllm.LLM['vllm.LLMEngine', 'transformers.PreTrainedTokenizer']): __openllm_internal__ = True tokenizer_id = 'local' diff --git a/openllm-python/src/openllm/models/falcon/modeling_falcon.py b/openllm-python/src/openllm/models/falcon/modeling_falcon.py index b16cd7cf..d32151de 100644 --- a/openllm-python/src/openllm/models/falcon/modeling_falcon.py +++ b/openllm-python/src/openllm/models/falcon/modeling_falcon.py @@ -4,6 +4,7 @@ import typing as t import openllm if t.TYPE_CHECKING: import torch, transformers else: torch, transformers = openllm.utils.LazyLoader('torch', globals(), 'torch'), openllm.utils.LazyLoader('transformers', globals(), 'transformers') + class Falcon(openllm.LLM['transformers.PreTrainedModel', 'transformers.PreTrainedTokenizerBase']): __openllm_internal__ = True diff --git a/openllm-python/src/openllm/models/falcon/modeling_vllm_falcon.py b/openllm-python/src/openllm/models/falcon/modeling_vllm_falcon.py index 61c4aa1d..95c9ac22 100644 --- a/openllm-python/src/openllm/models/falcon/modeling_vllm_falcon.py +++ b/openllm-python/src/openllm/models/falcon/modeling_vllm_falcon.py @@ -6,6 +6,7 @@ import openllm if t.TYPE_CHECKING: import vllm, transformers logger = logging.getLogger(__name__) + class VLLMFalcon(openllm.LLM['vllm.LLMEngine', 'transformers.PreTrainedTokenizerBase']): __openllm_internal__ = True tokenizer_id = 'local' diff --git a/openllm-python/src/openllm/models/flan_t5/modeling_flan_t5.py b/openllm-python/src/openllm/models/flan_t5/modeling_flan_t5.py index fb421edf..bce584c4 100644 --- a/openllm-python/src/openllm/models/flan_t5/modeling_flan_t5.py +++ b/openllm-python/src/openllm/models/flan_t5/modeling_flan_t5.py @@ -3,6 +3,7 @@ import typing as t import openllm if t.TYPE_CHECKING: import transformers + class FlanT5(openllm.LLM['transformers.T5ForConditionalGeneration', 'transformers.T5TokenizerFast']): __openllm_internal__ = True diff --git a/openllm-python/src/openllm/models/flan_t5/modeling_flax_flan_t5.py b/openllm-python/src/openllm/models/flan_t5/modeling_flax_flan_t5.py index 7a13fd15..40354f0a 100644 --- a/openllm-python/src/openllm/models/flan_t5/modeling_flax_flan_t5.py +++ b/openllm-python/src/openllm/models/flan_t5/modeling_flax_flan_t5.py @@ -5,6 +5,7 @@ import openllm from openllm_core._prompt import process_prompt from openllm_core.config.configuration_flan_t5 import DEFAULT_PROMPT_TEMPLATE if t.TYPE_CHECKING: import transformers + class FlaxFlanT5(openllm.LLM['transformers.FlaxT5ForConditionalGeneration', 'transformers.T5TokenizerFast']): __openllm_internal__ = True diff --git a/openllm-python/src/openllm/models/flan_t5/modeling_tf_flan_t5.py b/openllm-python/src/openllm/models/flan_t5/modeling_tf_flan_t5.py index 6af703fe..eafb9946 100644 --- a/openllm-python/src/openllm/models/flan_t5/modeling_tf_flan_t5.py +++ b/openllm-python/src/openllm/models/flan_t5/modeling_tf_flan_t5.py @@ -3,6 +3,7 @@ import typing as t import openllm if t.TYPE_CHECKING: import transformers + class TFFlanT5(openllm.LLM['transformers.TFT5ForConditionalGeneration', 'transformers.T5TokenizerFast']): __openllm_internal__ = True diff --git a/openllm-python/src/openllm/models/gpt_neox/modeling_gpt_neox.py b/openllm-python/src/openllm/models/gpt_neox/modeling_gpt_neox.py index d2661f45..f35b54da 100644 --- a/openllm-python/src/openllm/models/gpt_neox/modeling_gpt_neox.py +++ b/openllm-python/src/openllm/models/gpt_neox/modeling_gpt_neox.py @@ -6,6 +6,7 @@ import openllm if t.TYPE_CHECKING: import transformers logger = logging.getLogger(__name__) + class GPTNeoX(openllm.LLM['transformers.GPTNeoXForCausalLM', 'transformers.GPTNeoXTokenizerFast']): __openllm_internal__ = True diff --git a/openllm-python/src/openllm/models/gpt_neox/modeling_vllm_gpt_neox.py b/openllm-python/src/openllm/models/gpt_neox/modeling_vllm_gpt_neox.py index 818871fe..f91b3e01 100644 --- a/openllm-python/src/openllm/models/gpt_neox/modeling_vllm_gpt_neox.py +++ b/openllm-python/src/openllm/models/gpt_neox/modeling_vllm_gpt_neox.py @@ -3,6 +3,7 @@ import typing as t import openllm if t.TYPE_CHECKING: import vllm, transformers + class VLLMGPTNeoX(openllm.LLM['vllm.LLMEngine', 'transformers.GPTNeoXTokenizerFast']): __openllm_internal__ = True tokenizer_id = 'local' diff --git a/openllm-python/src/openllm/models/llama/modeling_llama.py b/openllm-python/src/openllm/models/llama/modeling_llama.py index 148edf1f..c8eb6632 100644 --- a/openllm-python/src/openllm/models/llama/modeling_llama.py +++ b/openllm-python/src/openllm/models/llama/modeling_llama.py @@ -3,6 +3,7 @@ import typing as t import openllm if t.TYPE_CHECKING: import transformers + class Llama(openllm.LLM['transformers.LlamaForCausalLM', 'transformers.LlamaTokenizerFast']): __openllm_internal__ = True diff --git a/openllm-python/src/openllm/models/llama/modeling_vllm_llama.py b/openllm-python/src/openllm/models/llama/modeling_vllm_llama.py index 54c0a875..c02eb0b1 100644 --- a/openllm-python/src/openllm/models/llama/modeling_vllm_llama.py +++ b/openllm-python/src/openllm/models/llama/modeling_vllm_llama.py @@ -3,5 +3,6 @@ import typing as t import openllm if t.TYPE_CHECKING: import vllm, transformers + class VLLMLlama(openllm.LLM['vllm.LLMEngine', 'transformers.LlamaTokenizerFast']): __openllm_internal__ = True diff --git a/openllm-python/src/openllm/models/mpt/modeling_mpt.py b/openllm-python/src/openllm/models/mpt/modeling_mpt.py index 394e63ae..d6725ff1 100644 --- a/openllm-python/src/openllm/models/mpt/modeling_mpt.py +++ b/openllm-python/src/openllm/models/mpt/modeling_mpt.py @@ -8,6 +8,7 @@ from openllm.utils import generate_labels, is_triton_available if t.TYPE_CHECKING: import transformers, torch logger = logging.getLogger(__name__) + def get_mpt_config( model_id_or_path: str, max_sequence_length: int, device: torch.device | str | int | None, device_map: str | None = None, trust_remote_code: bool = True ) -> transformers.PretrainedConfig: @@ -22,6 +23,7 @@ def get_mpt_config( # setting max_seq_len config.max_seq_len = max_sequence_length return config + class MPT(openllm.LLM['transformers.PreTrainedModel', 'transformers.GPTNeoXTokenizerFast']): __openllm_internal__ = True diff --git a/openllm-python/src/openllm/models/mpt/modeling_vllm_mpt.py b/openllm-python/src/openllm/models/mpt/modeling_vllm_mpt.py index f816b343..19334d27 100644 --- a/openllm-python/src/openllm/models/mpt/modeling_vllm_mpt.py +++ b/openllm-python/src/openllm/models/mpt/modeling_vllm_mpt.py @@ -3,6 +3,7 @@ import typing as t import openllm if t.TYPE_CHECKING: import transformers, vllm + class VLLMMPT(openllm.LLM['vllm.LLMEngine', 'transformers.GPTNeoXTokenizerFast']): __openllm_internal__ = True tokenizer_id = 'local' diff --git a/openllm-python/src/openllm/models/opt/modeling_flax_opt.py b/openllm-python/src/openllm/models/opt/modeling_flax_opt.py index d48fe8cf..23c08479 100644 --- a/openllm-python/src/openllm/models/opt/modeling_flax_opt.py +++ b/openllm-python/src/openllm/models/opt/modeling_flax_opt.py @@ -11,6 +11,7 @@ if t.TYPE_CHECKING: import transformers else: transformers = openllm.utils.LazyLoader('transformers', globals(), 'transformers') logger = logging.getLogger(__name__) + class FlaxOPT(openllm.LLM['transformers.TFOPTForCausalLM', 'transformers.GPT2Tokenizer']): __openllm_internal__ = True diff --git a/openllm-python/src/openllm/models/opt/modeling_opt.py b/openllm-python/src/openllm/models/opt/modeling_opt.py index be954ba8..596bc49e 100644 --- a/openllm-python/src/openllm/models/opt/modeling_opt.py +++ b/openllm-python/src/openllm/models/opt/modeling_opt.py @@ -6,6 +6,7 @@ import openllm if t.TYPE_CHECKING: import transformers logger = logging.getLogger(__name__) + class OPT(openllm.LLM['transformers.OPTForCausalLM', 'transformers.GPT2Tokenizer']): __openllm_internal__ = True diff --git a/openllm-python/src/openllm/models/opt/modeling_tf_opt.py b/openllm-python/src/openllm/models/opt/modeling_tf_opt.py index 6c30f6a9..41700ac0 100644 --- a/openllm-python/src/openllm/models/opt/modeling_tf_opt.py +++ b/openllm-python/src/openllm/models/opt/modeling_tf_opt.py @@ -5,6 +5,7 @@ import bentoml import openllm from openllm_core.utils import generate_labels if t.TYPE_CHECKING: import transformers + class TFOPT(openllm.LLM['transformers.TFOPTForCausalLM', 'transformers.GPT2Tokenizer']): __openllm_internal__ = True diff --git a/openllm-python/src/openllm/models/opt/modeling_vllm_opt.py b/openllm-python/src/openllm/models/opt/modeling_vllm_opt.py index 9e87ad60..778ebb6f 100644 --- a/openllm-python/src/openllm/models/opt/modeling_vllm_opt.py +++ b/openllm-python/src/openllm/models/opt/modeling_vllm_opt.py @@ -5,6 +5,7 @@ import openllm from openllm_core._prompt import process_prompt from openllm_core.config.configuration_opt import DEFAULT_PROMPT_TEMPLATE if t.TYPE_CHECKING: import vllm, transformers + class VLLMOPT(openllm.LLM['vllm.LLMEngine', 'transformers.GPT2Tokenizer']): __openllm_internal__ = True tokenizer_id = 'local' diff --git a/openllm-python/src/openllm/models/stablelm/modeling_stablelm.py b/openllm-python/src/openllm/models/stablelm/modeling_stablelm.py index 01290c2e..0fd4bfe7 100644 --- a/openllm-python/src/openllm/models/stablelm/modeling_stablelm.py +++ b/openllm-python/src/openllm/models/stablelm/modeling_stablelm.py @@ -3,6 +3,7 @@ import typing as t import openllm if t.TYPE_CHECKING: import transformers + class StableLM(openllm.LLM['transformers.GPTNeoXForCausalLM', 'transformers.GPTNeoXTokenizerFast']): __openllm_internal__ = True diff --git a/openllm-python/src/openllm/models/stablelm/modeling_vllm_stablelm.py b/openllm-python/src/openllm/models/stablelm/modeling_vllm_stablelm.py index 1d02d02c..d9f1703e 100644 --- a/openllm-python/src/openllm/models/stablelm/modeling_vllm_stablelm.py +++ b/openllm-python/src/openllm/models/stablelm/modeling_vllm_stablelm.py @@ -4,6 +4,7 @@ import typing as t import openllm if t.TYPE_CHECKING: import vllm, transformers + class VLLMStableLM(openllm.LLM['vllm.LLMEngine', 'transformers.GPTNeoXTokenizerFast']): __openllm_internal__ = True tokenizer_id = 'local' diff --git a/openllm-python/src/openllm/models/starcoder/modeling_starcoder.py b/openllm-python/src/openllm/models/starcoder/modeling_starcoder.py index 5812ab96..de251e0d 100644 --- a/openllm-python/src/openllm/models/starcoder/modeling_starcoder.py +++ b/openllm-python/src/openllm/models/starcoder/modeling_starcoder.py @@ -7,6 +7,7 @@ import openllm from openllm.utils import generate_labels from openllm_core.config.configuration_starcoder import EOD, FIM_MIDDLE, FIM_PAD, FIM_PREFIX, FIM_SUFFIX if t.TYPE_CHECKING: import transformers + class StarCoder(openllm.LLM['transformers.GPTBigCodeForCausalLM', 'transformers.GPT2TokenizerFast']): __openllm_internal__ = True diff --git a/openllm-python/src/openllm/models/starcoder/modeling_vllm_starcoder.py b/openllm-python/src/openllm/models/starcoder/modeling_vllm_starcoder.py index 20a9e822..87278717 100644 --- a/openllm-python/src/openllm/models/starcoder/modeling_vllm_starcoder.py +++ b/openllm-python/src/openllm/models/starcoder/modeling_vllm_starcoder.py @@ -4,6 +4,7 @@ import typing as t import openllm if t.TYPE_CHECKING: import vllm, transformers + class VLLMStarCoder(openllm.LLM['vllm.LLMEngine', 'transformers.GPT2TokenizerFast']): __openllm_internal__ = True tokenizer_id = 'local' diff --git a/openllm-python/src/openllm/playground/falcon_tuned.py b/openllm-python/src/openllm/playground/falcon_tuned.py index c5dc9025..17ba7924 100644 --- a/openllm-python/src/openllm/playground/falcon_tuned.py +++ b/openllm-python/src/openllm/playground/falcon_tuned.py @@ -24,6 +24,7 @@ from datasets import load_dataset from trl import SFTTrainer DEFAULT_MODEL_ID = "ybelkada/falcon-7b-sharded-bf16" DATASET_NAME = "timdettmers/openassistant-guanaco" + @dataclasses.dataclass class TrainingArguments: per_device_train_batch_size: int = dataclasses.field(default=4) @@ -40,10 +41,12 @@ class TrainingArguments: group_by_length: bool = dataclasses.field(default=True) lr_scheduler_type: str = dataclasses.field(default="constant") output_dir: str = dataclasses.field(default=os.path.join(os.getcwd(), "outputs", "falcon")) + @dataclasses.dataclass class ModelArguments: model_id: str = dataclasses.field(default=DEFAULT_MODEL_ID) max_sequence_length: int = dataclasses.field(default=512) + parser = transformers.HfArgumentParser((ModelArguments, TrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, diff --git a/openllm-python/src/openllm/playground/features.py b/openllm-python/src/openllm/playground/features.py index 2d31b5a7..c6776759 100644 --- a/openllm-python/src/openllm/playground/features.py +++ b/openllm-python/src/openllm/playground/features.py @@ -12,6 +12,7 @@ MAX_NEW_TOKENS = 384 Q = "Answer the following question, step by step:\n{q}\nA:" question = "What is the meaning of life?" + def main() -> int: parser = argparse.ArgumentParser() parser.add_argument("question", default=question) @@ -42,9 +43,11 @@ def main() -> int: logger.info("=" * 10, "Response:", r.llm.postprocess_generate(prompt, res)) return 0 + def _mp_fn(index: t.Any): # noqa # type: ignore # For xla_spawn (TPUs) main() + if openllm.utils.in_notebook(): main() else: diff --git a/openllm-python/src/openllm/playground/llama2_qlora.py b/openllm-python/src/openllm/playground/llama2_qlora.py index b867c174..a1e72fd4 100644 --- a/openllm-python/src/openllm/playground/llama2_qlora.py +++ b/openllm-python/src/openllm/playground/llama2_qlora.py @@ -29,6 +29,7 @@ from random import randint, randrange import bitsandbytes as bnb from datasets import load_dataset + # COPIED FROM https://github.com/artidoro/qlora/blob/main/qlora.py def find_all_linear_names(model): lora_module_names = set() @@ -40,11 +41,13 @@ def find_all_linear_names(model): if "lm_head" in lora_module_names: # needed for 16-bit lora_module_names.remove("lm_head") return list(lora_module_names) + # Change this to the local converted path if you don't have access to the meta-llama model DEFAULT_MODEL_ID = "meta-llama/Llama-2-7b-hf" # change this to 'main' if you want to use the latest llama DEFAULT_MODEL_VERSION = "335a02887eb6684d487240bbc28b5699298c3135" DATASET_NAME = "databricks/databricks-dolly-15k" + def format_dolly(sample): instruction = f"### Instruction\n{sample['instruction']}" context = f"### Context\n{sample['context']}" if len(sample["context"]) > 0 else None @@ -52,12 +55,15 @@ def format_dolly(sample): # join all the parts together prompt = "\n\n".join([i for i in [instruction, context, response] if i is not None]) return prompt + # template dataset to add prompt to each sample def template_dataset(sample, tokenizer): sample["text"] = f"{format_dolly(sample)}{tokenizer.eos_token}" return sample + # empty list to save remainder from batches to use in next batch remainder = {"input_ids": [], "attention_mask": [], "token_type_ids": []} + def chunk(sample, chunk_length=2048): # define global remainder variable to save remainder from batches to use in next batch global remainder @@ -78,6 +84,7 @@ def chunk(sample, chunk_length=2048): # prepare labels result["labels"] = result["input_ids"].copy() return result + def prepare_datasets(tokenizer, dataset_name=DATASET_NAME): # Load dataset from the hub dataset = load_dataset(dataset_name, split="train") @@ -96,6 +103,7 @@ def prepare_datasets(tokenizer, dataset_name=DATASET_NAME): # Print total number of samples print(f"Total number of samples: {len(lm_dataset)}") return lm_dataset + def prepare_for_int4_training(model_id: str, model_version: str | None = None, gradient_checkpointing: bool = True, bf16: bool = True, ) -> tuple[peft.PeftModel, transformers.LlamaTokenizerFast]: from peft.tuners.lora import LoraLayer @@ -130,6 +138,7 @@ def prepare_for_int4_training(model_id: str, model_version: str | None = None, g if bf16 and module.weight.dtype == torch.float32: module = module.to(torch.bfloat16) return model, tokenizer + @dataclasses.dataclass class TrainingArguments: per_device_train_batch_size: int = dataclasses.field(default=1) @@ -141,12 +150,14 @@ class TrainingArguments: report_to: str = dataclasses.field(default="none") output_dir: str = dataclasses.field(default=os.path.join(os.getcwd(), "outputs", "llama")) save_strategy: str = dataclasses.field(default="no") + @dataclasses.dataclass class ModelArguments: model_id: str = dataclasses.field(default=DEFAULT_MODEL_ID) model_version: str = dataclasses.field(default=DEFAULT_MODEL_VERSION) seed: int = dataclasses.field(default=42) merge_weights: bool = dataclasses.field(default=False) + if openllm.utils.in_notebook(): model_args, training_rags = ModelArguments(), TrainingArguments() else: @@ -160,6 +171,7 @@ else: # import the model first hand openllm.import_model("llama", model_id=model_args.model_id, model_version=model_args.model_version) + def train_loop(model_args: ModelArguments, training_args: TrainingArguments): import peft @@ -194,4 +206,5 @@ def train_loop(model_args: ModelArguments, training_args: TrainingArguments): model.save_pretrained(os.path.join(os.getcwd(), "outputs", "merged_llama_lora"), safe_serialization=True, max_shard_size="2GB") else: trainer.model.save_pretrained(os.path.join(training_args.output_dir, "lora")) + train_loop(model_args, training_args) diff --git a/openllm-python/src/openllm/playground/opt_tuned.py b/openllm-python/src/openllm/playground/opt_tuned.py index 6f04fd05..2043b65e 100644 --- a/openllm-python/src/openllm/playground/opt_tuned.py +++ b/openllm-python/src/openllm/playground/opt_tuned.py @@ -24,6 +24,7 @@ from datasets import load_dataset if t.TYPE_CHECKING: from peft import PeftModel DEFAULT_MODEL_ID = "facebook/opt-6.7b" + def load_trainer(model: PeftModel, tokenizer: transformers.GPT2TokenizerFast, dataset_dict: t.Any, training_args: TrainingArguments): return transformers.Trainer( model=model, @@ -31,6 +32,7 @@ def load_trainer(model: PeftModel, tokenizer: transformers.GPT2TokenizerFast, da args=dataclasses.replace(transformers.TrainingArguments(training_args.output_dir), **dataclasses.asdict(training_args)), data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False), ) + @dataclasses.dataclass class TrainingArguments: per_device_train_batch_size: int = dataclasses.field(default=4) @@ -41,9 +43,11 @@ class TrainingArguments: fp16: bool = dataclasses.field(default=True) logging_steps: int = dataclasses.field(default=1) output_dir: str = dataclasses.field(default=os.path.join(os.getcwd(), "outputs", "opt")) + @dataclasses.dataclass class ModelArguments: model_id: str = dataclasses.field(default=DEFAULT_MODEL_ID) + parser = transformers.HfArgumentParser((ModelArguments, TrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, diff --git a/openllm-python/src/openllm/serialisation/__init__.py b/openllm-python/src/openllm/serialisation/__init__.py index 5c3c677b..e75c5e6b 100644 --- a/openllm-python/src/openllm/serialisation/__init__.py +++ b/openllm-python/src/openllm/serialisation/__init__.py @@ -37,6 +37,7 @@ if t.TYPE_CHECKING: from . import constants as constants, ggml as ggml, transformers as transformers P = ParamSpec('P') + def load_tokenizer(llm: openllm.LLM[t.Any, T], **tokenizer_attrs: t.Any) -> T: '''Load the tokenizer from BentoML store. @@ -66,10 +67,13 @@ def load_tokenizer(llm: openllm.LLM[t.Any, T], **tokenizer_attrs: t.Any) -> T: elif tokenizer.eos_token_id is not None: tokenizer.pad_token_id = tokenizer.eos_token_id else: tokenizer.add_special_tokens({'pad_token': '[PAD]'}) return tokenizer + class _Caller(t.Protocol[P]): def __call__(self, llm: openllm.LLM[M, T], *args: P.args, **kwargs: P.kwargs) -> t.Any: ... + _extras = ['get', 'import_model', 'save_pretrained', 'load_model'] + def _make_dispatch_function(fn: str) -> _Caller[P]: def caller(llm: openllm.LLM[M, T], *args: P.args, **kwargs: P.kwargs) -> t.Any: """Generic function dispatch to correct serialisation submodules based on LLM runtime. @@ -81,6 +85,7 @@ def _make_dispatch_function(fn: str) -> _Caller[P]: return getattr(importlib.import_module(f'.{llm.runtime}', __name__), fn)(llm, *args, **kwargs) return caller + if t.TYPE_CHECKING: def get(llm: openllm.LLM[M, T], *args: t.Any, **kwargs: t.Any) -> bentoml.Model: @@ -94,10 +99,13 @@ if t.TYPE_CHECKING: def load_model(llm: openllm.LLM[M, T], *args: t.Any, **kwargs: t.Any) -> M: ... + _import_structure: dict[str, list[str]] = {'ggml': [], 'transformers': [], 'constants': []} __all__ = ['ggml', 'transformers', 'constants', 'load_tokenizer', *_extras] + def __dir__() -> list[str]: return sorted(__all__) + def __getattr__(name: str) -> t.Any: if name == 'load_tokenizer': return load_tokenizer elif name in _import_structure: return importlib.import_module(f'.{name}', __name__) diff --git a/openllm-python/src/openllm/serialisation/ggml.py b/openllm-python/src/openllm/serialisation/ggml.py index 5f2244d1..24d961f0 100644 --- a/openllm-python/src/openllm/serialisation/ggml.py +++ b/openllm-python/src/openllm/serialisation/ggml.py @@ -10,8 +10,10 @@ import openllm if t.TYPE_CHECKING: from openllm_core._typing_compat import M _conversion_strategy = {'pt': 'ggml'} + def import_model(llm: openllm.LLM[t.Any, t.Any], *decls: t.Any, trust_remote_code: bool = True, **attrs: t.Any,) -> bentoml.Model: raise NotImplementedError('Currently work in progress.') + def get(llm: openllm.LLM[t.Any, t.Any], auto_import: bool = False) -> bentoml.Model: '''Return an instance of ``bentoml.Model`` from given LLM instance. @@ -31,7 +33,9 @@ def get(llm: openllm.LLM[t.Any, t.Any], auto_import: bool = False) -> bentoml.Mo if auto_import: return import_model(llm, trust_remote_code=llm.__llm_trust_remote_code__) raise + def load_model(llm: openllm.LLM[M, t.Any], *decls: t.Any, **attrs: t.Any) -> M: raise NotImplementedError('Currently work in progress.') + def save_pretrained(llm: openllm.LLM[t.Any, t.Any], save_directory: str, **attrs: t.Any) -> None: raise NotImplementedError('Currently work in progress.') diff --git a/openllm-python/src/openllm/serialisation/transformers/__init__.py b/openllm-python/src/openllm/serialisation/transformers/__init__.py index fc71c5fb..29acfe92 100644 --- a/openllm-python/src/openllm/serialisation/transformers/__init__.py +++ b/openllm-python/src/openllm/serialisation/transformers/__init__.py @@ -34,6 +34,7 @@ else: logger = logging.getLogger(__name__) __all__ = ['import_model', 'get', 'load_model', 'save_pretrained'] + @inject def import_model(llm: openllm.LLM[M, T], *decls: t.Any, trust_remote_code: bool, _model_store: ModelStore = Provide[BentoMLContainer.model_store], **attrs: t.Any) -> bentoml.Model: """Auto detect model type from given model_id and import it to bentoml's model store. @@ -136,6 +137,7 @@ def import_model(llm: openllm.LLM[M, T], *decls: t.Any, trust_remote_code: bool, # in the case where users first run openllm start without the model available locally. if openllm.utils.is_torch_available() and torch.cuda.is_available(): torch.cuda.empty_cache() return bentomodel + def get(llm: openllm.LLM[M, T], auto_import: bool = False) -> bentoml.Model: '''Return an instance of ``bentoml.Model`` from given LLM instance. @@ -157,6 +159,7 @@ def get(llm: openllm.LLM[M, T], auto_import: bool = False) -> bentoml.Model: except bentoml.exceptions.NotFound as err: if auto_import: return import_model(llm, trust_remote_code=llm.__llm_trust_remote_code__) raise err from None + def load_model(llm: openllm.LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M: '''Load the model from BentoML store. @@ -189,6 +192,7 @@ def load_model(llm: openllm.LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M: if llm.bettertransformer and isinstance(model, transformers.PreTrainedModel): model = model.to_bettertransformer() if llm.__llm_implementation__ in {'pt', 'vllm'}: check_unintialised_params(model) return t.cast('M', model) + def save_pretrained( llm: openllm.LLM[M, T], save_directory: str, diff --git a/openllm-python/src/openllm/serialisation/transformers/_helpers.py b/openllm-python/src/openllm/serialisation/transformers/_helpers.py index 0a8c3089..243d837f 100644 --- a/openllm-python/src/openllm/serialisation/transformers/_helpers.py +++ b/openllm-python/src/openllm/serialisation/transformers/_helpers.py @@ -18,6 +18,7 @@ else: transformers, torch = openllm_core.utils.LazyLoader('transformers', globals(), 'transformers'), openllm_core.utils.LazyLoader('torch', globals(), 'torch') _object_setattr = object.__setattr__ + def process_config(model_id: str, trust_remote_code: bool, **attrs: t.Any) -> tuple[transformers.PretrainedConfig, DictStrAny, DictStrAny]: '''A helper function that correctly parse config and attributes for transformers.PretrainedConfig. @@ -37,10 +38,12 @@ def process_config(model_id: str, trust_remote_code: bool, **attrs: t.Any) -> tu if copied_attrs.get('torch_dtype', None) == 'auto': copied_attrs.pop('torch_dtype') config, attrs = transformers.AutoConfig.from_pretrained(model_id, return_unused_kwargs=True, trust_remote_code=trust_remote_code, **hub_attrs, **copied_attrs) return config, hub_attrs, attrs + def infer_tokenizers_from_llm(__llm: openllm.LLM[t.Any, T], /) -> T: __cls = getattr(transformers, openllm_core.utils.first_not_none(__llm.config['tokenizer_class'], default='AutoTokenizer'), None) if __cls is None: raise ValueError(f'Cannot infer correct tokenizer class for {__llm}. Make sure to unset `tokenizer_class`') return __cls + def infer_autoclass_from_llm(llm: openllm.LLM[M, T], config: transformers.PretrainedConfig, /) -> _BaseAutoModelClass: if llm.config['trust_remote_code']: autoclass = 'AutoModelForSeq2SeqLM' if llm.config['model_type'] == 'seq2seq_lm' else 'AutoModelForCausalLM' @@ -55,9 +58,11 @@ def infer_autoclass_from_llm(llm: openllm.LLM[M, T], config: transformers.Pretra elif type(config) in transformers.MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING: idx = 1 else: raise openllm.exceptions.OpenLLMException(f'Model type {type(config)} is not supported yet.') return getattr(transformers, FRAMEWORK_TO_AUTOCLASS_MAPPING[llm.__llm_implementation__][idx]) + def check_unintialised_params(model: torch.nn.Module) -> None: unintialized = [n for n, param in model.named_parameters() if param.data.device == torch.device('meta')] if len(unintialized) > 0: raise RuntimeError(f'Found the following unintialized parameters in {model}: {unintialized}') + def update_model(bentomodel: bentoml.Model, metadata: DictStrAny) -> bentoml.Model: based: DictStrAny = copy.deepcopy(bentomodel.info.metadata) based.update(metadata) @@ -65,6 +70,7 @@ def update_model(bentomodel: bentoml.Model, metadata: DictStrAny) -> bentoml.Mod tag=bentomodel.info.tag, module=bentomodel.info.module, labels=bentomodel.info.labels, options=bentomodel.info.options.to_dict(), signatures=bentomodel.info.signatures, context=bentomodel.info.context, api_version=bentomodel.info.api_version, creation_time=bentomodel.info.creation_time, metadata=based )) return bentomodel + # NOTE: sync with bentoml/_internal/frameworks/transformers.py#make_default_signatures def make_model_signatures(llm: openllm.LLM[M, T]) -> ModelSignaturesType: infer_fn: tuple[str, ...] = ('__call__',) diff --git a/openllm-python/src/openllm/serialisation/transformers/weights.py b/openllm-python/src/openllm/serialisation/transformers/weights.py index 882de1c5..4743d8c0 100644 --- a/openllm-python/src/openllm/serialisation/transformers/weights.py +++ b/openllm-python/src/openllm/serialisation/transformers/weights.py @@ -6,8 +6,10 @@ from huggingface_hub import HfApi if t.TYPE_CHECKING: import openllm from openllm_core._typing_compat import M, T + def has_safetensors_weights(model_id: str, revision: str | None = None) -> bool: return any(s.rfilename.endswith('.safetensors') for s in HfApi().model_info(model_id, revision=revision).siblings) + @attr.define(slots=True) class HfIgnore: safetensors = '*.safetensors' diff --git a/openllm-python/src/openllm/testing.py b/openllm-python/src/openllm/testing.py index 1ff88a86..fc973e3c 100644 --- a/openllm-python/src/openllm/testing.py +++ b/openllm-python/src/openllm/testing.py @@ -11,6 +11,7 @@ import openllm if t.TYPE_CHECKING: from ._typing_compat import LiteralRuntime logger = logging.getLogger(__name__) + @contextlib.contextmanager def build_bento( model: str, model_id: str | None = None, quantize: t.Literal['int4', 'int8', 'gptq'] | None = None, runtime: t.Literal['ggml', 'transformers'] = 'transformers', cleanup: bool = False @@ -21,6 +22,7 @@ def build_bento( if cleanup: logger.info('Deleting %s', bento.tag) bentoml.bentos.delete(bento.tag) + @contextlib.contextmanager def build_container(bento: bentoml.Bento | str | bentoml.Tag, image_tag: str | None = None, cleanup: bool = False, **attrs: t.Any) -> t.Iterator[str]: if isinstance(bento, bentoml.Bento): bento_tag = bento.tag @@ -36,6 +38,7 @@ def build_container(bento: bentoml.Bento | str | bentoml.Tag, image_tag: str | N if cleanup: logger.info('Deleting container %s', image_tag) subprocess.check_output([executable, 'rmi', '-f', image_tag]) + @contextlib.contextmanager def prepare( model: str, diff --git a/openllm-python/src/openllm/utils/__init__.py b/openllm-python/src/openllm/utils/__init__.py index 75eccf9c..637f3e98 100644 --- a/openllm-python/src/openllm/utils/__init__.py +++ b/openllm-python/src/openllm/utils/__init__.py @@ -8,17 +8,14 @@ import typing as t import openllm_core -from . import ( - dummy_flax_objects as dummy_flax_objects, - dummy_pt_objects as dummy_pt_objects, - dummy_tf_objects as dummy_tf_objects, - dummy_vllm_objects as dummy_vllm_objects, -) +from . import dummy_flax_objects as dummy_flax_objects, dummy_pt_objects as dummy_pt_objects, dummy_tf_objects as dummy_tf_objects, dummy_vllm_objects as dummy_vllm_objects if t.TYPE_CHECKING: import openllm from openllm_core._typing_compat import LiteralRuntime + def generate_labels(llm: openllm.LLM[t.Any, t.Any]) -> dict[str, t.Any]: return {'runtime': llm.runtime, 'framework': 'openllm', 'model_name': llm.config['model_name'], 'architecture': llm.config['architecture'], 'serialisation_format': llm._serialisation_format} + def infer_auto_class(implementation: LiteralRuntime) -> type[openllm.AutoLLM | openllm.AutoTFLLM | openllm.AutoFlaxLLM | openllm.AutoVLLM]: import openllm if implementation == 'tf': return openllm.AutoTFLLM @@ -26,9 +23,12 @@ def infer_auto_class(implementation: LiteralRuntime) -> type[openllm.AutoLLM | o elif implementation == 'pt': return openllm.AutoLLM elif implementation == 'vllm': return openllm.AutoVLLM else: raise RuntimeError(f"Unknown implementation: {implementation} (supported: 'pt', 'flax', 'tf', 'vllm')") + __all__ = ['generate_labels', 'infer_auto_class', 'dummy_flax_objects', 'dummy_pt_objects', 'dummy_tf_objects', 'dummy_vllm_objects'] + def __dir__() -> t.Sequence[str]: return sorted(__all__) + def __getattr__(it: str) -> t.Any: if hasattr(openllm_core.utils, it): return getattr(openllm_core.utils, it) else: raise AttributeError(f'module {__name__} has no attribute {it}') diff --git a/openllm-python/tests/_strategies/_configuration.py b/openllm-python/tests/_strategies/_configuration.py index c2ea2e4d..28c27dac 100644 --- a/openllm-python/tests/_strategies/_configuration.py +++ b/openllm-python/tests/_strategies/_configuration.py @@ -9,6 +9,7 @@ from openllm_core._configuration import ModelSettings logger = logging.getLogger(__name__) env_strats = st.sampled_from([openllm.utils.EnvVarMixin(model_name) for model_name in openllm.CONFIG_MAPPING.keys()]) + @st.composite def model_settings(draw: st.DrawFn): '''Strategy for generating ModelSettings objects.''' @@ -28,6 +29,7 @@ def model_settings(draw: st.DrawFn): 'workers_per_resource': st.one_of(st.integers(min_value=1), st.floats(min_value=0.1, max_value=1.0)), } return draw(st.builds(ModelSettings, **kwargs)) + def make_llm_config( cls_name: str, dunder_config: dict[str, t.Any] | ModelSettings, diff --git a/openllm-python/tests/configuration_test.py b/openllm-python/tests/configuration_test.py index a0f78fdf..c6080adb 100644 --- a/openllm-python/tests/configuration_test.py +++ b/openllm-python/tests/configuration_test.py @@ -14,6 +14,7 @@ import openllm from openllm_core._configuration import GenerationConfig, ModelSettings, field_env_key from ._strategies._configuration import make_llm_config, model_settings + # XXX: @aarnphm fixes TypedDict behaviour in 3.11 @pytest.mark.skipif(sys.version_info[:2] == (3, 11), reason='TypedDict in 3.11 behaves differently, so we need to fix this') def test_missing_default(): @@ -23,6 +24,7 @@ def test_missing_default(): make_llm_config('MissingModelId', {'default_id': 'huggingface/t5-tiny-testing', 'requirements': ['bentoml']}) with pytest.raises(ValueError, match='Missing required fields *'): make_llm_config('MissingArchitecture', {'default_id': 'huggingface/t5-tiny-testing', 'model_ids': ['huggingface/t5-tiny-testing'], 'requirements': ['bentoml'],},) + def test_forbidden_access(): cl_ = make_llm_config( 'ForbiddenAccess', { @@ -34,6 +36,7 @@ def test_forbidden_access(): assert pytest.raises(openllm.exceptions.ForbiddenAttributeError, cl_.__getattribute__, cl_(), 'GenerationConfig',) assert pytest.raises(openllm.exceptions.ForbiddenAttributeError, cl_.__getattribute__, cl_(), 'SamplingParams',) assert openllm.utils.lenient_issubclass(cl_.__openllm_generation_class__, GenerationConfig) + @given(model_settings()) def test_class_normal_gen(gen_settings: ModelSettings): assume(gen_settings['default_id'] and all(i for i in gen_settings['model_ids'])) @@ -41,19 +44,23 @@ def test_class_normal_gen(gen_settings: ModelSettings): assert issubclass(cl_, openllm.LLMConfig) for key in gen_settings: assert object.__getattribute__(cl_, f'__openllm_{key}__') == gen_settings.__getitem__(key) + @given(model_settings(), st.integers()) def test_simple_struct_dump(gen_settings: ModelSettings, field1: int): cl_ = make_llm_config('IdempotentLLM', gen_settings, fields=(('field1', 'float', field1),)) assert cl_().model_dump()['field1'] == field1 + @given(model_settings(), st.integers()) def test_config_derivation(gen_settings: ModelSettings, field1: int): cl_ = make_llm_config('IdempotentLLM', gen_settings, fields=(('field1', 'float', field1),)) new_cls = cl_.model_derivate('DerivedLLM', default_id='asdfasdf') assert new_cls.__openllm_default_id__ == 'asdfasdf' + @given(model_settings()) def test_config_derived_follow_attrs_protocol(gen_settings: ModelSettings): cl_ = make_llm_config('AttrsProtocolLLM', gen_settings) assert attr.has(cl_) + @given(model_settings(), st.integers(max_value=283473), st.floats(min_value=0.0, max_value=1.0), st.integers(max_value=283473), st.floats(min_value=0.0, max_value=1.0),) def test_complex_struct_dump(gen_settings: ModelSettings, field1: int, temperature: float, input_field1: int, input_temperature: float): cl_ = make_llm_config('ComplexLLM', gen_settings, fields=(('field1', 'float', field1),), generation_fields=(('temperature', temperature),),) @@ -72,10 +79,12 @@ def test_complex_struct_dump(gen_settings: ModelSettings, field1: int, temperatu pas_nested = cl_(generation_config={'temperature': input_temperature}, field1=input_field1) assert pas_nested.model_dump()['field1'] == input_field1 assert pas_nested.model_dump()['generation_config']['temperature'] == input_temperature + @contextlib.contextmanager def patch_env(**attrs: t.Any): with mock.patch.dict(os.environ, attrs, clear=True): yield + def test_struct_envvar(): with patch_env(**{field_env_key('env_llm', 'field1'): '4', field_env_key('env_llm', 'temperature', suffix='generation'): '0.2',}): @@ -93,6 +102,7 @@ def test_struct_envvar(): overwrite_default = EnvLLM() assert overwrite_default.field1 == 4 assert overwrite_default['temperature'] == 0.2 + def test_struct_provided_fields(): class EnvLLM(openllm.LLMConfig): __config__ = {'default_id': 'asdfasdf', 'model_ids': ['asdf', 'asdfasdfads'], 'architecture': 'PreTrainedModel',} @@ -104,6 +114,7 @@ def test_struct_provided_fields(): sent = EnvLLM.model_construct_env(field1=20, temperature=0.4) assert sent.field1 == 20 assert sent.generation_config.temperature == 0.4 + def test_struct_envvar_with_overwrite_provided_env(monkeypatch: pytest.MonkeyPatch): with monkeypatch.context() as mk: mk.setenv(field_env_key('overwrite_with_env_available', 'field1'), str(4.0)) @@ -115,11 +126,13 @@ def test_struct_envvar_with_overwrite_provided_env(monkeypatch: pytest.MonkeyPat ).model_construct_env(field1=20.0, temperature=0.4) assert sent.generation_config.temperature == 0.4 assert sent.field1 == 20.0 + @given(model_settings()) @pytest.mark.parametrize(('return_dict', 'typ'), [(True, dict), (False, transformers.GenerationConfig)]) def test_conversion_to_transformers(return_dict: bool, typ: type[t.Any], gen_settings: ModelSettings): cl_ = make_llm_config('ConversionLLM', gen_settings) assert isinstance(cl_().to_generation_config(return_as_dict=return_dict), typ) + @given(model_settings()) def test_click_conversion(gen_settings: ModelSettings): # currently our conversion omit Union type. @@ -131,6 +144,7 @@ def test_click_conversion(gen_settings: ModelSettings): filtered = {k for k, v in cl_.__openllm_hints__.items() if t.get_origin(v) is not t.Union} click_options_filtered = [i for i in wrapped.__click_params__ if i.name and not i.name.startswith('fake_')] assert len(filtered) == len(click_options_filtered) + @pytest.mark.parametrize('model_name', openllm.CONFIG_MAPPING.keys()) def test_configuration_dict_protocol(model_name: str): config = openllm.AutoConfig.for_model(model_name) diff --git a/openllm-python/tests/conftest.py b/openllm-python/tests/conftest.py index be9b812f..02655e11 100644 --- a/openllm-python/tests/conftest.py +++ b/openllm-python/tests/conftest.py @@ -10,6 +10,7 @@ if t.TYPE_CHECKING: from openllm_core._typing_compat import LiteralRuntime _FRAMEWORK_MAPPING = {'flan_t5': 'google/flan-t5-small', 'opt': 'facebook/opt-125m', 'baichuan': 'baichuan-inc/Baichuan-7B',} _PROMPT_MAPPING = {'qa': 'Answer the following yes/no question by reasoning step-by-step. Can you write a whole Haiku in a single tweet?',} + def parametrise_local_llm(model: str,) -> t.Generator[tuple[str, openllm.LLMRunner[t.Any, t.Any] | openllm.LLM[t.Any, t.Any]], None, None]: if model not in _FRAMEWORK_MAPPING: pytest.skip(f"'{model}' is not yet supported in framework testing.") runtime_impl: tuple[LiteralRuntime, ...] = tuple() @@ -19,10 +20,12 @@ def parametrise_local_llm(model: str,) -> t.Generator[tuple[str, openllm.LLMRunn for framework, prompt in itertools.product(runtime_impl, _PROMPT_MAPPING.keys()): llm = openllm.Runner(model, model_id=_FRAMEWORK_MAPPING[model], ensure_available=True, implementation=framework, init_local=True,) yield prompt, llm + def pytest_generate_tests(metafunc: pytest.Metafunc) -> None: if os.getenv('GITHUB_ACTIONS') is None: if 'prompt' in metafunc.fixturenames and 'llm' in metafunc.fixturenames: metafunc.parametrize('prompt,llm', [(p, llm) for p, llm in parametrise_local_llm(metafunc.function.__name__[5:-15])]) + def pytest_sessionfinish(session: pytest.Session, exitstatus: int): # If no tests are collected, pytest exists with code 5, which makes the CI fail. if exitstatus == 5: session.exitstatus = 0 diff --git a/openllm-python/tests/models/conftest.py b/openllm-python/tests/models/conftest.py index 47e1a40e..e63a255e 100644 --- a/openllm-python/tests/models/conftest.py +++ b/openllm-python/tests/models/conftest.py @@ -29,6 +29,7 @@ if t.TYPE_CHECKING: from openllm._configuration import GenerationConfig from openllm.client import BaseAsyncClient + class ResponseComparator(JSONSnapshotExtension): def serialize(self, data: SerializableData, *, exclude: PropertyFilter | None = None, matcher: PropertyMatcher | None = None,) -> SerializedData: if openllm.utils.LazyType(ListAny).isinstance(data): @@ -66,9 +67,11 @@ class ResponseComparator(JSONSnapshotExtension): return (len(s.responses) == len(t.responses) and all([_s == _t for _s, _t in zip(s.responses, t.responses)]) and eq_config(s.marshaled_config, t.marshaled_config)) return len(serialized_data) == len(snapshot_data) and all([eq_output(s, t) for s, t in zip(serialized_data, snapshot_data)]) + @pytest.fixture() def response_snapshot(snapshot: SnapshotAssertion): return snapshot.use_extension(ResponseComparator) + @attr.define(init=False) class _Handle(ABC): port: int @@ -100,6 +103,7 @@ class _Handle(ABC): except Exception: time.sleep(1) raise RuntimeError(f'Handle failed to initialise within {timeout} seconds.') + @attr.define(init=False) class LocalHandle(_Handle): process: subprocess.Popen[bytes] @@ -109,10 +113,12 @@ class LocalHandle(_Handle): def status(self) -> bool: return self.process.poll() is None + class HandleProtocol(t.Protocol): @contextlib.contextmanager def __call__(*, model: str, model_id: str, image_tag: str, quantize: t.AnyStr | None = None,) -> t.Generator[_Handle, None, None]: ... + @attr.define(init=False) class DockerHandle(_Handle): container_name: str @@ -124,6 +130,7 @@ class DockerHandle(_Handle): def status(self) -> bool: container = self.docker_client.containers.get(self.container_name) return container.status in ['running', 'created'] + @contextlib.contextmanager def _local_handle( model: str, model_id: str, image_tag: str, deployment_mode: t.Literal['container', 'local'], quantize: t.Literal['int8', 'int4', 'gptq'] | None = None, *, _serve_grpc: bool = False, @@ -146,6 +153,7 @@ def _local_handle( proc.stdout.close() if proc.stderr: proc.stderr.close() + @contextlib.contextmanager def _container_handle( model: str, model_id: str, image_tag: str, deployment_mode: t.Literal['container', 'local'], quantize: t.Literal['int8', 'int4', 'gptq'] | None = None, *, _serve_grpc: bool = False, @@ -192,19 +200,23 @@ def _container_handle( print(container_output, file=sys.stderr) container.remove() + @pytest.fixture(scope='session', autouse=True) def clean_context() -> t.Generator[contextlib.ExitStack, None, None]: stack = contextlib.ExitStack() yield stack stack.close() + @pytest.fixture(scope='module') def el() -> t.Generator[asyncio.AbstractEventLoop, None, None]: loop = asyncio.get_event_loop() yield loop loop.close() + @pytest.fixture(params=['container', 'local'], scope='session') def deployment_mode(request: pytest.FixtureRequest) -> str: return request.param + @pytest.fixture(scope='module') def handler(el: asyncio.AbstractEventLoop, deployment_mode: t.Literal['container', 'local']): if deployment_mode == 'container': diff --git a/openllm-python/tests/models/flan_t5_test.py b/openllm-python/tests/models/flan_t5_test.py index fd3c6d22..85fe83f8 100644 --- a/openllm-python/tests/models/flan_t5_test.py +++ b/openllm-python/tests/models/flan_t5_test.py @@ -10,15 +10,18 @@ if t.TYPE_CHECKING: from .conftest import HandleProtocol, ResponseComparator, _Handle model = 'flan_t5' model_id = 'google/flan-t5-small' + @pytest.fixture(scope='module') def flan_t5_handle(handler: HandleProtocol, deployment_mode: t.Literal['container', 'local'], clean_context: contextlib.ExitStack,): with openllm.testing.prepare(model, model_id=model_id, deployment_mode=deployment_mode, clean_context=clean_context) as image_tag: with handler(model=model, model_id=model_id, image_tag=image_tag) as handle: yield handle + @pytest.fixture(scope='module') async def flan_t5(flan_t5_handle: _Handle): await flan_t5_handle.health(240) return flan_t5_handle.client + @pytest.mark.asyncio() async def test_flan_t5(flan_t5: t.Awaitable[openllm.client.AsyncHTTPClient], response_snapshot: ResponseComparator): client = await flan_t5 diff --git a/openllm-python/tests/models/opt_test.py b/openllm-python/tests/models/opt_test.py index 3be257b4..b6db0798 100644 --- a/openllm-python/tests/models/opt_test.py +++ b/openllm-python/tests/models/opt_test.py @@ -10,15 +10,18 @@ if t.TYPE_CHECKING: from .conftest import HandleProtocol, ResponseComparator, _Handle model = 'opt' model_id = 'facebook/opt-125m' + @pytest.fixture(scope='module') def opt_125m_handle(handler: HandleProtocol, deployment_mode: t.Literal['container', 'local'], clean_context: contextlib.ExitStack,): with openllm.testing.prepare(model, model_id=model_id, deployment_mode=deployment_mode, clean_context=clean_context) as image_tag: with handler(model=model, model_id=model_id, image_tag=image_tag) as handle: yield handle + @pytest.fixture(scope='module') async def opt_125m(opt_125m_handle: _Handle): await opt_125m_handle.health(240) return opt_125m_handle.client + @pytest.mark.asyncio() async def test_opt_125m(opt_125m: t.Awaitable[openllm.client.AsyncHTTPClient], response_snapshot: ResponseComparator): client = await opt_125m diff --git a/openllm-python/tests/models_test.py b/openllm-python/tests/models_test.py index 7ffd56e4..9d7b7800 100644 --- a/openllm-python/tests/models_test.py +++ b/openllm-python/tests/models_test.py @@ -4,16 +4,19 @@ import typing as t import pytest if t.TYPE_CHECKING: import openllm + @pytest.mark.skipif(os.getenv('GITHUB_ACTIONS') is not None, reason='Model is too large for CI') def test_flan_t5_implementation(prompt: str, llm: openllm.LLM[t.Any, t.Any]): assert llm(prompt) assert llm(prompt, temperature=0.8, top_p=0.23) + @pytest.mark.skipif(os.getenv('GITHUB_ACTIONS') is not None, reason='Model is too large for CI') def test_opt_implementation(prompt: str, llm: openllm.LLM[t.Any, t.Any]): assert llm(prompt) assert llm(prompt, temperature=0.9, top_k=8) + @pytest.mark.skipif(os.getenv('GITHUB_ACTIONS') is not None, reason='Model is too large for CI') def test_baichuan_implementation(prompt: str, llm: openllm.LLM[t.Any, t.Any]): assert llm(prompt) diff --git a/openllm-python/tests/package_test.py b/openllm-python/tests/package_test.py index 4f16dd4b..291fa3b2 100644 --- a/openllm-python/tests/package_test.py +++ b/openllm-python/tests/package_test.py @@ -14,6 +14,7 @@ HF_INTERNAL_T5_TESTING = 'hf-internal-testing/tiny-random-t5' actions_xfail = functools.partial( pytest.mark.xfail, condition=os.getenv('GITHUB_ACTIONS') is not None, reason='Marking GitHub Actions to xfail due to flakiness and building environment not isolated.', ) + @actions_xfail def test_general_build_with_internal_testing(): bento_store = BentoMLContainer.bento_store.get() @@ -26,6 +27,7 @@ def test_general_build_with_internal_testing(): bento = openllm.build('flan-t5', model_id=HF_INTERNAL_T5_TESTING) assert len(bento_store.list(bento.tag)) == 1 + @actions_xfail def test_general_build_from_local(tmp_path_factory: pytest.TempPathFactory): local_path = tmp_path_factory.mktemp('local_t5') @@ -37,11 +39,13 @@ def test_general_build_from_local(tmp_path_factory: pytest.TempPathFactory): llm.save_pretrained(local_path) assert openllm.build('flan-t5', model_id=local_path.resolve().__fspath__(), model_version='local') + @pytest.fixture() def dockerfile_template(tmp_path_factory: pytest.TempPathFactory): file = tmp_path_factory.mktemp('dockerfiles') / 'Dockerfile.template' file.write_text("{% extends bento_base_template %}\n{% block SETUP_BENTO_ENTRYPOINT %}\n{{ super() }}\nRUN echo 'sanity from custom dockerfile'\n{% endblock %}") return file + @pytest.mark.usefixtures('dockerfile_template') @actions_xfail def test_build_with_custom_dockerfile(dockerfile_template: Path): diff --git a/openllm-python/tests/strategies_test.py b/openllm-python/tests/strategies_test.py index da2d34c7..b0d40761 100644 --- a/openllm-python/tests/strategies_test.py +++ b/openllm-python/tests/strategies_test.py @@ -8,6 +8,7 @@ import bentoml from openllm_core import _strategies as strategy from openllm_core._strategies import CascadingResourceStrategy, NvidiaGpuResource, get_resource if t.TYPE_CHECKING: from _pytest.monkeypatch import MonkeyPatch + def test_nvidia_gpu_resource_from_env(monkeypatch: pytest.MonkeyPatch): with monkeypatch.context() as mcls: mcls.setenv('CUDA_VISIBLE_DEVICES', '0,1') @@ -15,6 +16,7 @@ def test_nvidia_gpu_resource_from_env(monkeypatch: pytest.MonkeyPatch): assert len(resource) == 2 assert resource == ['0', '1'] mcls.delenv('CUDA_VISIBLE_DEVICES') + def test_nvidia_gpu_cutoff_minus(monkeypatch: pytest.MonkeyPatch): with monkeypatch.context() as mcls: mcls.setenv('CUDA_VISIBLE_DEVICES', '0,2,-1,1') @@ -22,6 +24,7 @@ def test_nvidia_gpu_cutoff_minus(monkeypatch: pytest.MonkeyPatch): assert len(resource) == 2 assert resource == ['0', '2'] mcls.delenv('CUDA_VISIBLE_DEVICES') + def test_nvidia_gpu_neg_val(monkeypatch: pytest.MonkeyPatch): with monkeypatch.context() as mcls: mcls.setenv('CUDA_VISIBLE_DEVICES', '-1') @@ -29,6 +32,7 @@ def test_nvidia_gpu_neg_val(monkeypatch: pytest.MonkeyPatch): assert len(resource) == 0 assert resource == [] mcls.delenv('CUDA_VISIBLE_DEVICES') + def test_nvidia_gpu_parse_literal(monkeypatch: pytest.MonkeyPatch): with monkeypatch.context() as mcls: mcls.setenv('CUDA_VISIBLE_DEVICES', 'GPU-5ebe9f43-ac33420d4628') @@ -54,6 +58,7 @@ def test_nvidia_gpu_parse_literal(monkeypatch: pytest.MonkeyPatch): assert len(resource) == 1 assert resource == ['MIG-GPU-5ebe9f43-ac33420d4628'] mcls.delenv('CUDA_VISIBLE_DEVICES') + @pytest.mark.skipif(os.getenv('GITHUB_ACTIONS') is not None, reason='skip GPUs test on CI') def test_nvidia_gpu_validate(monkeypatch: pytest.MonkeyPatch): with monkeypatch.context() as mcls: @@ -64,6 +69,7 @@ def test_nvidia_gpu_validate(monkeypatch: pytest.MonkeyPatch): assert pytest.raises(ValueError, NvidiaGpuResource.validate, [*NvidiaGpuResource.from_system(), 1],).match('Input list should be all string type.') assert pytest.raises(ValueError, NvidiaGpuResource.validate, [-2]).match('Input list should be all string type.') assert pytest.raises(ValueError, NvidiaGpuResource.validate, ['GPU-5ebe9f43', 'GPU-ac33420d4628']).match('Failed to parse available GPUs UUID') + def test_nvidia_gpu_from_spec(monkeypatch: pytest.MonkeyPatch): with monkeypatch.context() as mcls: # to make this tests works with system that has GPU @@ -90,10 +96,13 @@ def test_nvidia_gpu_from_spec(monkeypatch: pytest.MonkeyPatch): NvidiaGpuResource.from_spec(1.5) with pytest.raises(ValueError): assert NvidiaGpuResource.from_spec(-2) + class GPURunnable(bentoml.Runnable): SUPPORTED_RESOURCES = ('nvidia.com/gpu', 'amd.com/gpu') + def unvalidated_get_resource(x: dict[str, t.Any], y: str, validate: bool = False): return get_resource(x, y, validate=validate) + @pytest.mark.parametrize('gpu_type', ['nvidia.com/gpu', 'amd.com/gpu']) def test_cascade_strategy_worker_count(monkeypatch: MonkeyPatch, gpu_type: str): monkeypatch.setattr(strategy, 'get_resource', unvalidated_get_resource) @@ -104,6 +113,7 @@ def test_cascade_strategy_worker_count(monkeypatch: MonkeyPatch, gpu_type: str): assert CascadingResourceStrategy.get_worker_count(GPURunnable, {gpu_type: [2, 7, 9]}, 0.5) == 1 assert CascadingResourceStrategy.get_worker_count(GPURunnable, {gpu_type: [2, 7, 8, 9]}, 0.5) == 1 assert CascadingResourceStrategy.get_worker_count(GPURunnable, {gpu_type: [2, 5, 7, 8, 9]}, 0.4) == 1 + @pytest.mark.parametrize('gpu_type', ['nvidia.com/gpu', 'amd.com/gpu']) def test_cascade_strategy_worker_env(monkeypatch: MonkeyPatch, gpu_type: str): monkeypatch.setattr(strategy, 'get_resource', unvalidated_get_resource) @@ -142,6 +152,7 @@ def test_cascade_strategy_worker_env(monkeypatch: MonkeyPatch, gpu_type: str): assert envs.get('CUDA_VISIBLE_DEVICES') == '7,8' envs = CascadingResourceStrategy.get_worker_env(GPURunnable, {gpu_type: [2, 6, 7, 8, 9]}, 0.4, 2) assert envs.get('CUDA_VISIBLE_DEVICES') == '9' + @pytest.mark.parametrize('gpu_type', ['nvidia.com/gpu', 'amd.com/gpu']) def test_cascade_strategy_disabled_via_env(monkeypatch: MonkeyPatch, gpu_type: str): monkeypatch.setattr(strategy, 'get_resource', unvalidated_get_resource) diff --git a/pyproject.toml b/pyproject.toml index 94940909..d7f2d881 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -265,14 +265,14 @@ ALLOW_MULTILINE_LAMBDAS = false ALLOW_SPLIT_BEFORE_DEFAULT_OR_NAMED_ASSIGNS = false ALLOW_SPLIT_BEFORE_DICT_VALUE = false ARITHMETIC_PRECEDENCE_INDICATION = true -BLANK_LINES_AROUND_TOP_LEVEL_DEFINITION = 0 +BLANK_LINES_AROUND_TOP_LEVEL_DEFINITION = 1 BLANK_LINES_BETWEEN_TOP_LEVEL_IMPORTS_AND_VARIABLES = 0 BLANK_LINE_BEFORE_CLASS_DOCSTRING = false BLANK_LINE_BEFORE_MODULE_DOCSTRING = false BLANK_LINE_BEFORE_NESTED_CLASS_OR_DEF = false COALESCE_BRACKETS = true COLUMN_LIMIT = 192 -CONTINUATION_ALIGN_STYLE = "VALIGN-RIGHT" +CONTINUATION_ALIGN_STYLE = "SPACE" DEDENT_CLOSING_BRACKETS = true DISABLE_ENDING_COMMA_HEURISTIC = true EACH_DICT_ENTRY_ON_SEPARATE_LINE = true diff --git a/tools/dependencies.py b/tools/dependencies.py index af0470b9..3e895f39 100755 --- a/tools/dependencies.py +++ b/tools/dependencies.py @@ -9,6 +9,7 @@ sys.path.insert(0, os.path.join(ROOT, 'openllm-python', 'src')) import openllm _OWNER, _REPO = 'bentoml', 'openllm' + @dataclasses.dataclass(frozen=True) class Classifier: identifier: t.Dict[str, str] = dataclasses.field( @@ -53,6 +54,7 @@ class Classifier: @staticmethod def create_status_classifier(level: int) -> str: return Classifier.create_classifier('status', Classifier.status()[level]) + @dataclasses.dataclass(frozen=True) class Dependencies: name: str @@ -95,6 +97,7 @@ class Dependencies: @classmethod def from_tuple(cls, *decls: t.Any) -> Dependencies: return cls(*decls) + lower_bentoml_constraint = '1.1.2' _BENTOML_EXT = ['io'] _TRANSFORMERS_EXT = ['torch', 'tokenizers', 'accelerate'] @@ -138,8 +141,10 @@ _base_requirements.update({v: _locals.get(f'{inflection.underscore(v).upper()}_D _base_requirements = {k: v for k, v in sorted(_base_requirements.items())} fname = f'{os.path.basename(os.path.dirname(__file__))}/{os.path.basename(__file__)}' + def correct_style(it: t.Any) -> t.Any: return it + def create_classifiers() -> Array: arr = correct_style(tomlkit.array()) arr.extend([ @@ -159,6 +164,7 @@ def create_classifiers() -> Array: *Classifier.create_python_classifier(), ]) return arr.multiline(True) + def create_optional_table() -> Table: all_array = tomlkit.array() all_array.append(f"openllm[{','.join(_base_requirements)}]") @@ -169,6 +175,7 @@ def create_optional_table() -> Table: table.add(tomlkit.nl()) return table + def create_url_table(_info: t.Any) -> Table: table = tomlkit.table() _urls = { @@ -183,6 +190,7 @@ def create_url_table(_info: t.Any) -> Table: } table.update({k: v for k, v in sorted(_urls.items())}) return table + def build_system() -> Table: table = tomlkit.table() table.add('build-backend', 'hatchling.build') @@ -190,11 +198,13 @@ def build_system() -> Table: requires_array.extend(['hatchling==1.18.0', 'hatch-vcs==0.3.0', 'hatch-fancy-pypi-readme==23.1.0']) table.add('requires', requires_array.multiline(True)) return table + def authors() -> Array: arr = correct_style(tomlkit.array()) arr.append(dict(name='Aaron Pham', email='aarnphm@bentoml.com')) arr.append(dict(name='BentoML Team', email='contact@bentoml.com')) return arr.multiline(True) + def keywords() -> Array: arr = correct_style(tomlkit.array()) arr.extend([ @@ -217,6 +227,7 @@ def keywords() -> Array: 'Transformers' ]) return arr.multiline(True) + def build_cli_extensions() -> Table: table = tomlkit.table() ext: dict[str, str] = {'openllm': 'openllm.cli.entrypoint:cli'} @@ -228,6 +239,7 @@ def build_cli_extensions() -> Table: }) table.update(ext) return table + def main() -> int: api = GhApi(owner=_OWNER, repo=_REPO, authenticate=False) _info = api.repos.get() @@ -258,4 +270,5 @@ def main() -> int: with open(os.path.join(ROOT, 'openllm-python', 'pyproject.toml'), 'w') as f: f.write(tomlkit.dumps(pyproject)) return 0 + if __name__ == '__main__': raise SystemExit(main()) diff --git a/tools/generate-coverage.py b/tools/generate-coverage.py index d3e845c4..4bf3d9ed 100755 --- a/tools/generate-coverage.py +++ b/tools/generate-coverage.py @@ -8,6 +8,7 @@ from lxml import etree ROOT = Path(__file__).resolve().parent.parent PACKAGES = {'openllm-python/src/openllm/': 'openllm'} + def main() -> int: coverage_report = ROOT / 'coverage.xml' root = etree.fromstring(coverage_report.read_text()) @@ -42,4 +43,5 @@ def main() -> int: coverage_summary = ROOT / 'coverage-summary.json' coverage_summary.write_text(orjson.dumps(coverage_data, option=orjson.OPT_INDENT_2).decode(), encoding='utf-8') return 0 + if __name__ == '__main__': raise SystemExit(main()) diff --git a/tools/update-brew-tap.py b/tools/update-brew-tap.py index fc94a3a1..2e5c9756 100755 --- a/tools/update-brew-tap.py +++ b/tools/update-brew-tap.py @@ -18,12 +18,15 @@ _REPO = 'openllm' _gz_strategies: dict[t.Literal['macos_arm', 'macos_intel', 'linux_intel'], str] = { 'macos_arm': 'aarch64-apple-darwin', 'macos_intel': 'x86_64-apple-darwin', 'linux_intel': 'x86_64-unknown-linux-musl' } + def determine_release_url(svn_url: str, tag: str, target: t.Literal['macos_arm', 'macos_intel', 'linux_intel', 'archive']) -> str: if target == 'archive': return f'{svn_url}/archive/{tag}.tar.gz' return f"{svn_url}/releases/download/{tag}/openllm-{tag.replace('v', '')}-{_gz_strategies[target]}.tar.gz" + # curl -sSL /archive/refs/tags/.tar.gz | shasum -a256 | cut -d'' -f1 def get_release_hash_command(svn_url: str, tag: str) -> Pipeline: return curl['-sSL', svn_url] | shasum['-a256'] | cut['-d', ' ', '-f1'] + def main() -> int: api = GhApi(owner=_OWNER, repo=_REPO, authenticate=False) _info = api.repos.get() @@ -54,4 +57,5 @@ def main() -> int: ) f.write('\n') return 0 + if __name__ == '__main__': raise SystemExit(main()) diff --git a/tools/update-config-stubs.py b/tools/update-config-stubs.py index bc849797..5f57797c 100755 --- a/tools/update-config-stubs.py +++ b/tools/update-config-stubs.py @@ -17,10 +17,12 @@ _TARGET_FILE = ROOT / 'openllm-core' / 'src' / 'openllm_core' / '_configuration. sys.path.insert(0, (ROOT / 'openllm-core' / 'src').__fspath__()) from openllm_core._configuration import GenerationConfig, ModelSettings, PeftType, SamplingParams from openllm_core.utils import codegen + def process_annotations(annotations: str) -> str: if 'NotRequired' in annotations: return annotations[len('NotRequired['):-1] elif 'Required' in annotations: return annotations[len('Required['):-1] else: return annotations + _value_docstring = { 'default_id': '''Return the default model to use when using 'openllm start '. This could be one of the keys in 'self.model_ids' or custom users model. @@ -81,6 +83,7 @@ _value_docstring = { } _transformed = {'fine_tune_strategies': 't.Dict[AdapterType, FineTuneConfig]'} + def main() -> int: with _TARGET_FILE.open('r') as f: processed = f.readlines() @@ -135,4 +138,5 @@ def main() -> int: with _TARGET_FILE.open('w') as f: f.writelines(processed) return 0 + if __name__ == '__main__': raise SystemExit(main()) diff --git a/tools/update-dummy.py b/tools/update-dummy.py index b610812e..20ddf0af 100755 --- a/tools/update-dummy.py +++ b/tools/update-dummy.py @@ -15,12 +15,16 @@ if t.TYPE_CHECKING: from collections import OrderedDict config_requirements = {k: [_.replace('-', '_') for _ in v.__openllm_requirements__] if v.__openllm_requirements__ else None for k, v in CONFIG_MAPPING.items()} _dependencies: dict[LiteralRuntime, str] = {k: v for k, v in zip(LiteralRuntime.__args__, ('torch', 'tensorflow', 'flax', 'vllm'))} _auto: dict[str, str] = {k: v for k, v in zip(LiteralRuntime.__args__, ('AutoLLM', 'AutoTFLLM', 'AutoFlaxLLM', 'AutoVLLM'))} + def get_target_dummy_file(framework: LiteralRuntime) -> Path: return _ROOT / 'openllm-python' / 'src' / 'openllm' / 'utils' / f'dummy_{framework}_objects.py' + def mapping_names(framework: LiteralRuntime): return 'MODEL_MAPPING_NAMES' if framework == 'pt' else f'MODEL_{framework.upper()}_MAPPING_NAMES' + def get_mapping(framework: LiteralRuntime) -> OrderedDict[t.Any, t.Any]: return getattr(auto, mapping_names(framework)) + def make_class_stub(model_name: str, framework: LiteralRuntime, indentation: int = 2, auto: bool = False) -> list[str]: _dep_list: list[str] = [ f'"{v}"' for v in [_dependencies[framework], *(t.cast(t.List[str], config_requirements[model_name]) if model_name != '__default__' and config_requirements[model_name] else [])] @@ -33,6 +37,7 @@ def make_class_stub(model_name: str, framework: LiteralRuntime, indentation: int ' '*indentation + f"def __init__(self,*param_decls:_t.Any,**attrs: _t.Any):_require_backends(self,[{','.join(_dep_list)}])" ] return lines + def write_stub(framework: LiteralRuntime, _path: str) -> list[str]: base = [ f'# This file is generated by {_path}. DO NOT EDIT MANUALLY!', @@ -48,10 +53,12 @@ def write_stub(framework: LiteralRuntime, _path: str) -> list[str]: _imports = [f'"{v}"' for v in get_mapping(framework).values()] base += [f'{mapping_names(framework)}:_t.Any=None', f"__all__:list[str]=[\"{mapping_names(framework)}\",\"{_auto[framework]}\",{','.join(_imports)}]\n"] return base + def main() -> int: _path = os.path.join(os.path.basename(os.path.dirname(__file__)), os.path.basename(__file__)) for framework in _dependencies: with get_target_dummy_file(framework).open('w') as f: f.write('\n'.join(write_stub(framework, _path))) return 0 + if __name__ == '__main__': raise SystemExit(main()) diff --git a/tools/update-models-import.py b/tools/update-models-import.py index 098e3147..97f8b2b7 100755 --- a/tools/update-models-import.py +++ b/tools/update-models-import.py @@ -3,9 +3,11 @@ from __future__ import annotations import os from pathlib import Path _TARGET_FILE = Path(__file__).parent.parent / 'openllm-python' / 'src' / 'openllm' / 'models' / '__init__.py' + def create_module_import() -> str: r = [f'"{p.name}"' for p in _TARGET_FILE.parent.glob('*/') if p.name not in ['__pycache__', '__init__.py', '.DS_Store']] return f"_MODELS:set[str]={{{', '.join(sorted(r))}}}" + def create_stubs_import() -> list[str]: return [ 'if t.TYPE_CHECKING:from . import ' + ','.join([f'{p.name} as {p.name}' for p in sorted(_TARGET_FILE.parent.glob('*/')) if p.name not in {'__pycache__', '__init__.py', '.DS_Store'}]), @@ -14,6 +16,7 @@ def create_stubs_import() -> list[str]: '__dir__=__lazy.__dir__', '__getattr__=__lazy.__getattr__\n' ] + def main() -> int: _path = os.path.join(os.path.basename(os.path.dirname(__file__)), os.path.basename(__file__)) with _TARGET_FILE.open('w') as f: @@ -29,4 +32,5 @@ def main() -> int: ]) ) return 0 + if __name__ == '__main__': raise SystemExit(main()) diff --git a/tools/update-readme.py b/tools/update-readme.py index 398002f7..38d92278 100755 --- a/tools/update-readme.py +++ b/tools/update-readme.py @@ -8,6 +8,7 @@ END_COMMENT = f'\n' ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) sys.path.insert(0, os.path.join(ROOT, 'openllm-python', 'src')) import openllm + def main() -> int: with open(os.path.join(ROOT, 'openllm-python', 'pyproject.toml'), 'r') as f: deps = tomlkit.parse(f.read()).value['project']['optional-dependencies'] @@ -56,4 +57,5 @@ def main() -> int: with open(os.path.join(ROOT, 'README.md'), 'w') as f: f.writelines(readme) return 0 + if __name__ == '__main__': raise SystemExit(main()) diff --git a/tools/write-coverage-report.py b/tools/write-coverage-report.py index 53edac46..f24cd9c1 100755 --- a/tools/write-coverage-report.py +++ b/tools/write-coverage-report.py @@ -6,6 +6,7 @@ import orjson PRECISION = Decimal('.01') ROOT = Path(__file__).resolve().parent.parent + def main() -> int: coverage_summary = ROOT / 'coverage-summary.json' @@ -35,4 +36,5 @@ def main() -> int: with coverage_report.open('w', encoding='utf-8') as f: f.write(''.join(lines)) return 0 + if __name__ == '__main__': raise SystemExit(main())