From 7e1fb35a718fab6c9f522d6ee7b6ccb1ab41a16f Mon Sep 17 00:00:00 2001 From: Aaron Pham <29749331+aarnphm@users.noreply.github.com> Date: Sun, 12 Nov 2023 14:55:37 -0500 Subject: [PATCH] chore(llm): expose quantise and lazy load heavy imports (#617) * chore(llm): expose quantise and lazy load heavy imports Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com> * chore: move transformers to TYPE_CHECKING block Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com> --------- Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com> --- openllm-python/src/openllm/_llm.py | 198 ++++++++++++------ openllm-python/src/openllm/_runners.py | 13 +- openllm-python/src/openllm/bundle/_package.py | 2 +- openllm-python/src/openllm/cli/entrypoint.py | 12 +- .../src/openllm/entrypoints/_openapi.py | 45 ++-- openllm-python/src/openllm/entrypoints/hf.py | 10 +- .../src/openllm/entrypoints/openai.py | 17 +- .../serialisation/transformers/__init__.py | 8 +- 8 files changed, 189 insertions(+), 116 deletions(-) diff --git a/openllm-python/src/openllm/_llm.py b/openllm-python/src/openllm/_llm.py index 14eaa6ba..abf1aee8 100644 --- a/openllm-python/src/openllm/_llm.py +++ b/openllm-python/src/openllm/_llm.py @@ -9,8 +9,6 @@ import typing as t import attr import inflection import orjson -import torch -import transformers from huggingface_hub import hf_hub_download @@ -58,6 +56,7 @@ from .serialisation.constants import PEFT_CONFIG_NAME if t.TYPE_CHECKING: import peft + import transformers from bentoml._internal.runner.runnable import RunnableMethod from bentoml._internal.runner.runner import RunnerMethod @@ -124,8 +123,8 @@ class LLM(t.Generic[M, T], ReprMixin): _quantization_config: transformers.BitsAndBytesConfig | transformers.GPTQConfig | transformers.AwqConfig | None _quantise: LiteralQuantise | None _model_decls: TupleAny - _model_attrs: DictStrAny - _tokenizer_attrs: DictStrAny + __model_attrs: DictStrAny + __tokenizer_attrs: DictStrAny _tag: bentoml.Tag _adapter_map: AdapterMap | None _serialisation: LiteralSerialisation @@ -133,7 +132,6 @@ class LLM(t.Generic[M, T], ReprMixin): _prompt_template: PromptTemplate | None _system_message: str | None - _bentomodel: bentoml.Model = attr.field(init=False) __llm_config__: LLMConfig | None = None __llm_backend__: LiteralBackend = None # type: ignore __llm_quantization_config__: transformers.BitsAndBytesConfig | transformers.GPTQConfig | transformers.AwqConfig | None = None @@ -168,16 +166,11 @@ class LLM(t.Generic[M, T], ReprMixin): _local = False if validate_is_path(model_id): model_id, _local = resolve_filepath(model_id), True - backend = t.cast( - LiteralBackend, - first_not_none( - backend, os.getenv('OPENLLM_BACKEND'), default='vllm' if openllm.utils.is_vllm_available() else 'pt' - ), - ) - quantize = first_not_none( - quantize, t.cast(t.Optional[LiteralQuantise], os.getenv('OPENLLM_QUANTIZE')), default=None + backend = first_not_none( + backend, os.getenv('OPENLLM_BACKEND'), default='vllm' if openllm.utils.is_vllm_available() else 'pt' ) + quantize = first_not_none(quantize, os.getenv('OPENLLM_QUANTIZE'), default=None) # elif quantization_config is None and quantize is not None: # quantization_config, attrs = infer_quantisation_config(self, quantize, **attrs) attrs.update({'low_cpu_mem_usage': low_cpu_mem_usage}) @@ -199,17 +192,17 @@ class LLM(t.Generic[M, T], ReprMixin): self.__attrs_init__( model_id=model_id, revision=model_version, - tag=bentoml.Tag.from_taglike(t.cast(t.Union[str, bentoml.Tag], model_tag)), + 
tag=bentoml.Tag.from_taglike(model_tag), quantization_config=quantization_config, quantise=quantize, model_decls=args, - model_attrs=dict(**self.import_kwargs[0], **model_attrs), - tokenizer_attrs=dict(**self.import_kwargs[-1], **tokenizer_attrs), adapter_map=resolve_peft_config_type(adapter_map) if adapter_map is not None else None, serialisation=serialisation, local=_local, prompt_template=prompt_template, system_message=system_message, + LLM__model_attrs=model_attrs, + LLM__tokenizer_attrs=tokenizer_attrs, llm_backend__=backend, llm_config__=llm_config, llm_trust_remote_code__=trust_remote_code, @@ -221,7 +214,6 @@ class LLM(t.Generic[M, T], ReprMixin): model = openllm.serialisation.import_model(self, trust_remote_code=self.trust_remote_code) # resolve the tag self._tag = model.tag - self._bentomodel = model @apply(lambda val: tuple(str.lower(i) if i else i for i in val)) def _make_tag_components(self, model_id, model_version, backend) -> tuple[str, str | None]: @@ -241,72 +233,141 @@ class LLM(t.Generic[M, T], ReprMixin): ) return f'{backend}-{normalise_model_name(model_id)}', model_version - # yapf: disable - def __setattr__(self,attr,value): - if attr in _reserved_namespace:raise ForbiddenAttributeError(f'{attr} should not be set during runtime.') - super().__setattr__(attr,value) + def __setattr__(self, attr, value): + if attr in _reserved_namespace: + raise ForbiddenAttributeError(f'{attr} should not be set during runtime.') + super().__setattr__(attr, value) + @property - def __repr_keys__(self): return {'model_id', 'revision', 'backend', 'type'} + def _model_attrs(self) -> dict[str, t.Any]: + return {**self.import_kwargs[0], **self.__model_attrs} + + @property + def _tokenizer_attrs(self) -> dict[str, t.Any]: + return {**self.import_kwargs[1], **self.__tokenizer_attrs} + + @property + def __repr_keys__(self): + return {'model_id', 'revision', 'backend', 'type'} + def __repr_args__(self): - yield 'model_id',self._model_id if not self._local else self.tag.name - yield 'revision',self._revision if self._revision else self.tag.version - yield 'backend',self.__llm_backend__ - yield 'type',self.llm_type + yield 'model_id', self._model_id if not self._local else self.tag.name + yield 'revision', self._revision if self._revision else self.tag.version + yield 'backend', self.__llm_backend__ + yield 'type', self.llm_type + @property - def import_kwargs(self)->tuple[dict[str, t.Any],dict[str, t.Any]]: return {'device_map': 'auto' if torch.cuda.is_available() else None, 'torch_dtype': torch.float16 if torch.cuda.is_available() else torch.float32}, {'padding_side': 'left', 'truncation_side': 'left'} + def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: + import torch + + return { + 'device_map': 'auto' if torch.cuda.is_available() else None, + 'torch_dtype': torch.float16 if torch.cuda.is_available() else torch.float32, + }, {'padding_side': 'left', 'truncation_side': 'left'} + @property - def trust_remote_code(self)->bool:return first_not_none(check_bool_env('TRUST_REMOTE_CODE',False),default=self.__llm_trust_remote_code__) + def trust_remote_code(self) -> bool: + return first_not_none(check_bool_env('TRUST_REMOTE_CODE', False), default=self.__llm_trust_remote_code__) + @property - def runner_name(self)->str:return f"llm-{self.config['start_name']}-runner" + def runner_name(self) -> str: + return f"llm-{self.config['start_name']}-runner" + @property - def model_id(self)->str:return self._model_id + def model_id(self) -> str: + return self._model_id + @property - def 
revision(self)->str:return t.cast(str, self._revision) + def revision(self) -> str: + return t.cast(str, self._revision) + @property - def tag(self)->bentoml.Tag:return self._tag + def tag(self) -> bentoml.Tag: + return self._tag + @property - def bentomodel(self)->bentoml.Model:return openllm.serialisation.get(self) + def bentomodel(self) -> bentoml.Model: + return openllm.serialisation.get(self) + @property - def quantization_config(self)->transformers.BitsAndBytesConfig|transformers.GPTQConfig|transformers.AwqConfig: + def quantization_config(self) -> transformers.BitsAndBytesConfig | transformers.GPTQConfig | transformers.AwqConfig: if self.__llm_quantization_config__ is None: - if self._quantization_config is not None:self.__llm_quantization_config__=self._quantization_config - elif self._quantise is not None:self.__llm_quantization_config__,self._model_attrs=infer_quantisation_config(self, self._quantise, **self._model_attrs) - else:raise ValueError("Either 'quantization_config' or 'quantise' must be specified.") + if self._quantization_config is not None: + self.__llm_quantization_config__ = self._quantization_config + elif self._quantise is not None: + self.__llm_quantization_config__, self._model_attrs = infer_quantisation_config( + self, self._quantise, **self._model_attrs + ) + else: + raise ValueError("Either 'quantization_config' or 'quantise' must be specified.") return self.__llm_quantization_config__ + @property - def has_adapters(self)->bool:return self._adapter_map is not None + def has_adapters(self) -> bool: + return self._adapter_map is not None + @property - def local(self)->bool:return self._local + def local(self) -> bool: + return self._local + + @property + def quantise(self) -> LiteralQuantise | None: + return self._quantise + # NOTE: The section below defines a loose contract with langchain's LLM interface. @property - def llm_type(self)->str:return normalise_model_name(self._model_id) + def llm_type(self) -> str: + return normalise_model_name(self._model_id) + @property - def identifying_params(self)->DictStrAny:return {'configuration': self.config.model_dump_json().decode(),'model_ids': orjson.dumps(self.config['model_ids']).decode(),'model_id': self.model_id} + def identifying_params(self) -> DictStrAny: + return { + 'configuration': self.config.model_dump_json().decode(), + 'model_ids': orjson.dumps(self.config['model_ids']).decode(), + 'model_id': self.model_id, + } + @property - def llm_parameters(self)->tuple[tuple[tuple[t.Any,...],DictStrAny],DictStrAny]:return (self._model_decls,self._model_attrs),self._tokenizer_attrs + def llm_parameters(self) -> tuple[tuple[tuple[t.Any, ...], DictStrAny], DictStrAny]: + return (self._model_decls, self._model_attrs), self._tokenizer_attrs + # NOTE: This section is the actual model, tokenizer, and config reference here. 
@property - def config(self)->LLMConfig: - if self.__llm_config__ is None:self.__llm_config__=openllm.AutoConfig.infer_class_from_llm(self).model_construct_env(**self._model_attrs) + def config(self) -> LLMConfig: + if self.__llm_config__ is None: + self.__llm_config__ = openllm.AutoConfig.infer_class_from_llm(self).model_construct_env(**self._model_attrs) return self.__llm_config__ + @property - def tokenizer(self)->T: - if self.__llm_tokenizer__ is None:self.__llm_tokenizer__=openllm.serialisation.load_tokenizer(self,**self.llm_parameters[-1]) + def tokenizer(self) -> T: + if self.__llm_tokenizer__ is None: + self.__llm_tokenizer__ = openllm.serialisation.load_tokenizer(self, **self.llm_parameters[-1]) return self.__llm_tokenizer__ + @property - def runner(self)->LLMRunner[M, T]: - if self.__llm_runner__ is None:self.__llm_runner__=_RunnerFactory(self) + def runner(self) -> LLMRunner[M, T]: + if self.__llm_runner__ is None: + self.__llm_runner__ = _RunnerFactory(self) return self.__llm_runner__ + @property - def model(self)->M: + def model(self) -> M: if self.__llm_model__ is None: - model=openllm.serialisation.load_model(self,*self._model_decls,**self._model_attrs) + model = openllm.serialisation.load_model(self, *self._model_decls, **self._model_attrs) # If OOM, then it is probably you don't have enough VRAM to run this model. if self.__llm_backend__ == 'pt': - loaded_in_kbit = getattr(model,'is_loaded_in_8bit',False) or getattr(model,'is_loaded_in_4bit',False) or getattr(model,'is_quantized',False) + import torch + + loaded_in_kbit = ( + getattr(model, 'is_loaded_in_8bit', False) + or getattr(model, 'is_loaded_in_4bit', False) + or getattr(model, 'is_quantized', False) + ) if torch.cuda.is_available() and torch.cuda.device_count() == 1 and not loaded_in_kbit: - try: model = model.to('cuda') - except Exception as err: raise OpenLLMException(f'Failed to load model into GPU: {err}\n. See https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more information.') from err + try: + model = model.to('cuda') + except Exception as err: + raise OpenLLMException(f'Failed to load model into GPU: {err}.\n') from err if self.has_adapters: logger.debug('Applying the following adapters: %s', self.adapter_map) for adapter_dict in self.adapter_map.values(): @@ -314,23 +375,29 @@ class LLM(t.Generic[M, T], ReprMixin): model.load_adapter(peft_model_id, adapter_name, peft_config=peft_config) self.__llm_model__ = model return self.__llm_model__ + @property def adapter_map(self) -> ResolvedAdapterMap: try: import peft as _ # noqa: F401 except ImportError as err: - raise MissingDependencyError("Failed to import 'peft'. Make sure to do 'pip install \"openllm[fine-tune]\"'") from err - if not self.has_adapters: raise AttributeError('Adapter map is not available.') + raise MissingDependencyError( + "Failed to import 'peft'. 
Make sure to do 'pip install \"openllm[fine-tune]\"'" + ) from err + if not self.has_adapters: + raise AttributeError('Adapter map is not available.') assert self._adapter_map is not None if self.__llm_adapter_map__ is None: _map: ResolvedAdapterMap = {k: {} for k in self._adapter_map} for adapter_type, adapter_tuple in self._adapter_map.items(): - base = first_not_none(self.config['fine_tune_strategies'].get(adapter_type), default=self.config.make_fine_tune_config(adapter_type)) + base = first_not_none( + self.config['fine_tune_strategies'].get(adapter_type), + default=self.config.make_fine_tune_config(adapter_type), + ) for adapter in adapter_tuple: _map[adapter_type][adapter.name] = (base.with_config(**adapter.config).build(), adapter.adapter_id) self.__llm_adapter_map__ = _map return self.__llm_adapter_map__ - # yapf: enable def prepare_for_training( self, adapter_type: AdapterType = 'lora', use_gradient_checking: bool = True, **attrs: t.Any @@ -475,15 +542,24 @@ def _RunnerFactory( else: system_message = None - # yapf: disable - def _wrapped_repr_keys(_: LLMRunner[M, T]) -> set[str]: return {'config', 'llm_type', 'runner_methods', 'backend', 'llm_tag'} + def _wrapped_repr_keys(_: LLMRunner[M, T]) -> set[str]: + return {'config', 'llm_type', 'runner_methods', 'backend', 'llm_tag'} + def _wrapped_repr_args(_: LLMRunner[M, T]) -> ReprArgs: - yield 'runner_methods', {method.name: {'batchable': method.config.batchable, 'batch_dim': method.config.batch_dim if method.config.batchable else None} for method in _.runner_methods} + yield ( + 'runner_methods', + { + method.name: { + 'batchable': method.config.batchable, + 'batch_dim': method.config.batch_dim if method.config.batchable else None, + } + for method in _.runner_methods + }, + ) yield 'config', self.config.model_dump(flatten=True) yield 'llm_type', self.llm_type yield 'backend', backend yield 'llm_tag', self.tag - # yapf: enable return types.new_class( self.__class__.__name__ + 'Runner', diff --git a/openllm-python/src/openllm/_runners.py b/openllm-python/src/openllm/_runners.py index 36a597fb..d4b0622d 100644 --- a/openllm-python/src/openllm/_runners.py +++ b/openllm-python/src/openllm/_runners.py @@ -50,8 +50,8 @@ class vLLMRunnable(bentoml.Runnable): if dev >= 2: num_gpus = min(dev // 2 * 2, dev) quantization = None - if llm._quantise and llm._quantise in {'awq', 'squeezellm'}: - quantization = llm._quantise + if llm.quantise and llm.quantise in {'awq', 'squeezellm'}: + quantization = llm.quantise try: self.model = vllm.AsyncLLMEngine.from_engine_args( vllm.AsyncEngineArgs( @@ -111,7 +111,6 @@ class PyTorchRunnable(bentoml.Runnable): self.model = llm.model self.tokenizer = llm.tokenizer self.config = llm.config - self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') @bentoml.Runnable.method(batchable=False) async def generate_iterator( @@ -155,17 +154,17 @@ class PyTorchRunnable(bentoml.Runnable): finish_reason: t.Optional[FinishReason] = None for i in range(config['max_new_tokens']): if i == 0: # prefill - out = self.model(torch.as_tensor([prompt_token_ids], device=self.device), use_cache=True) + out = self.model(torch.as_tensor([prompt_token_ids], device=self.model.device), use_cache=True) else: # decoding out = self.model( - torch.as_tensor([[token]], device=self.device), use_cache=True, past_key_values=past_key_values + torch.as_tensor([[token]], device=self.model.device), use_cache=True, past_key_values=past_key_values ) logits = out.logits past_key_values = out.past_key_values if logits_processor: if 
config['repetition_penalty'] > 1.0: - tmp_output_ids: t.Any = torch.as_tensor([output_token_ids], device=self.device) + tmp_output_ids: t.Any = torch.as_tensor([output_token_ids], device=self.model.device) else: tmp_output_ids = None last_token_logits = logits_processor(tmp_output_ids, logits[:, -1, :])[0] @@ -173,7 +172,7 @@ class PyTorchRunnable(bentoml.Runnable): last_token_logits = logits[0, -1, :] # Switch to CPU by avoiding some bugs in mps backend. - if self.device.type == 'mps': + if self.model.device.type == 'mps': last_token_logits = last_token_logits.float().to('cpu') if config['temperature'] < 1e-5 or config['top_p'] < 1e-8: # greedy diff --git a/openllm-python/src/openllm/bundle/_package.py b/openllm-python/src/openllm/bundle/_package.py index 2b52230b..86b61c60 100644 --- a/openllm-python/src/openllm/bundle/_package.py +++ b/openllm-python/src/openllm/bundle/_package.py @@ -148,7 +148,7 @@ def construct_docker_options( if llm._prompt_template: env_dict['OPENLLM_PROMPT_TEMPLATE'] = repr(llm._prompt_template.to_string()) if quantize: - env_dict['OPENLLM_QUANTISE'] = str(quantize) + env_dict['OPENLLM_QUANTIZE'] = str(quantize) return DockerOptions( base_image=f'{oci.CONTAINER_NAMES[container_registry]}:{oci.get_base_container_tag(container_version_strategy)}', env=env_dict, diff --git a/openllm-python/src/openllm/cli/entrypoint.py b/openllm-python/src/openllm/cli/entrypoint.py index ae08090e..bd83e4da 100644 --- a/openllm-python/src/openllm/cli/entrypoint.py +++ b/openllm-python/src/openllm/cli/entrypoint.py @@ -621,8 +621,8 @@ def process_environ( 'OPENLLM_CONFIG': config.model_dump_json(flatten=True).decode(), } ) - if llm._quantise: - environ['OPENLLM_QUANTIZE'] = str(llm._quantise) + if llm.quantise: + environ['OPENLLM_QUANTIZE'] = str(llm.quantise) if system_message: environ['OPENLLM_SYSTEM_MESSAGE'] = system_message if prompt_template: @@ -650,8 +650,8 @@ def process_workers_per_resource(wpr: str | float | int, device: tuple[str, ...] def build_bento_instruction(llm, model_id, serialisation, adapter_map): cmd_name = f'openllm build {model_id}' - if llm._quantise: - cmd_name += f' --quantize {llm._quantise}' + if llm.quantise: + cmd_name += f' --quantize {llm.quantise}' cmd_name += f' --serialization {serialisation}' if adapter_map is not None: cmd_name += ' ' + ' '.join( @@ -994,8 +994,8 @@ def build_command( 'OPENLLM_MODEL_ID': llm.model_id, } ) - if llm._quantise: - os.environ['OPENLLM_QUANTIZE'] = str(llm._quantise) + if llm.quantise: + os.environ['OPENLLM_QUANTIZE'] = str(llm.quantise) if system_message: os.environ['OPENLLM_SYSTEM_MESSAGE'] = system_message if prompt_template: diff --git a/openllm-python/src/openllm/entrypoints/_openapi.py b/openllm-python/src/openllm/entrypoints/_openapi.py index 882a6e42..8ebdc62f 100644 --- a/openllm-python/src/openllm/entrypoints/_openapi.py +++ b/openllm-python/src/openllm/entrypoints/_openapi.py @@ -24,7 +24,7 @@ if t.TYPE_CHECKING: P = ParamSpec('P') OPENAPI_VERSION, API_VERSION = '3.0.2', '1.0' # NOTE: OpenAI schema -LIST_MODEL_SCHEMA = """\ +LIST_MODELS_SCHEMA = """\ --- consumes: - application/json @@ -55,14 +55,14 @@ responses: schema: $ref: '#/components/schemas/ModelList' """ -CHAT_COMPLETION_SCHEMA = """\ +CHAT_COMPLETIONS_SCHEMA = """\ --- consumes: - application/json description: >- Given a list of messages comprising a conversation, the model will return a response. 
-operationId: openai__create_chat_completions +operationId: openai__chat_completions produces: - application/json tags: @@ -193,7 +193,7 @@ responses: } description: Bad Request """ -COMPLETION_SCHEMA = """\ +COMPLETIONS_SCHEMA = """\ --- consumes: - application/json @@ -201,7 +201,7 @@ description: >- Given a prompt, the model will return one or more predicted completions, and can also return the probabilities of alternative tokens at each position. We recommend most users use our Chat completions API. -operationId: openai__create_completions +operationId: openai__completions produces: - application/json tags: @@ -423,15 +423,17 @@ responses: description: Not Found """ +_SCHEMAS = {k[:-7].lower(): v for k, v in locals().items() if k.endswith('_SCHEMA')} -def add_schema_definitions(append_str: str) -> t.Callable[[t.Callable[P, t.Any]], t.Callable[P, t.Any]]: - def docstring_decorator(func: t.Callable[P, t.Any]) -> t.Callable[P, t.Any]: - if func.__doc__ is None: - func.__doc__ = '' - func.__doc__ = func.__doc__.strip() + '\n\n' + append_str.strip() + +def add_schema_definitions(func: t.Callable[P, t.Any]) -> t.Callable[P, t.Any]: + append_str = _SCHEMAS.get(func.__name__.lower(), '') + if not append_str: return func - - return docstring_decorator + if func.__doc__ is None: + func.__doc__ = '' + func.__doc__ = func.__doc__.strip() + '\n\n' + append_str.strip() + return func class OpenLLMSchemaGenerator(SchemaGenerator): @@ -558,7 +560,7 @@ def append_schemas( # HACK: Dirty hack to append schemas to existing service. We def need to support mounting Starlette app OpenAPI spec. from bentoml._internal.service.openapi.specification import OpenAPISpecification - svc_schema: t.Any = svc.openapi_spec + svc_schema = svc.openapi_spec if isinstance(svc_schema, (OpenAPISpecification, MKSchema)): svc_schema = svc_schema.asdict() if 'tags' in generated_schema: @@ -572,14 +574,15 @@ def append_schemas( svc_schema['components']['schemas'].update(generated_schema['components']['schemas']) svc_schema['paths'].update(generated_schema['paths']) - from bentoml._internal.service import ( - openapi, # HACK: mk this attribute until we have a better way to add starlette schemas. - ) + # HACK: mk this attribute until we have a better way to add starlette schemas. 
+ from bentoml._internal.service import openapi - # yapf: disable - def mk_generate_spec(svc:bentoml.Service,openapi_version:str=OPENAPI_VERSION)->MKSchema:return MKSchema(svc_schema) - def mk_asdict(self:OpenAPISpecification)->dict[str,t.Any]:return svc_schema - openapi.generate_spec=mk_generate_spec + def mk_generate_spec(svc, openapi_version=OPENAPI_VERSION): + return MKSchema(svc_schema) + + def mk_asdict(self): + return svc_schema + + openapi.generate_spec = mk_generate_spec OpenAPISpecification.asdict = mk_asdict - # yapf: disable return svc diff --git a/openllm-python/src/openllm/entrypoints/hf.py b/openllm-python/src/openllm/entrypoints/hf.py index be21af8a..c5743920 100644 --- a/openllm-python/src/openllm/entrypoints/hf.py +++ b/openllm-python/src/openllm/entrypoints/hf.py @@ -14,8 +14,6 @@ from starlette.routing import Route from openllm_core.utils import converter -from ._openapi import HF_ADAPTERS_SCHEMA -from ._openapi import HF_AGENT_SCHEMA from ._openapi import add_schema_definitions from ._openapi import append_schemas from ._openapi import get_generator @@ -54,7 +52,7 @@ def mount_to_svc(svc: bentoml.Service, llm: openllm.LLM[M, T]) -> bentoml.Servic debug=True, routes=[ Route('/agent', endpoint=functools.partial(hf_agent, llm=llm), name='hf_agent', methods=['POST']), - Route('/adapters', endpoint=functools.partial(adapters_map, llm=llm), name='adapters', methods=['GET']), + Route('/adapters', endpoint=functools.partial(hf_adapters, llm=llm), name='adapters', methods=['GET']), Route('/schema', endpoint=openapi_schema, include_in_schema=False), ], ) @@ -71,7 +69,7 @@ def error_response(status_code: HTTPStatus, message: str) -> JSONResponse: ) -@add_schema_definitions(HF_AGENT_SCHEMA) +@add_schema_definitions async def hf_agent(req: Request, llm: openllm.LLM[M, T]) -> Response: json_str = await req.body() try: @@ -92,8 +90,8 @@ async def hf_agent(req: Request, llm: openllm.LLM[M, T]) -> Response: return error_response(HTTPStatus.INTERNAL_SERVER_ERROR, 'Error while generating (Check server log).') -@add_schema_definitions(HF_ADAPTERS_SCHEMA) -def adapters_map(req: Request, llm: openllm.LLM[M, T]) -> Response: +@add_schema_definitions +def hf_adapters(req: Request, llm: openllm.LLM[M, T]) -> Response: if not llm.has_adapters: return error_response(HTTPStatus.NOT_FOUND, 'No adapters found.') return JSONResponse( diff --git a/openllm-python/src/openllm/entrypoints/openai.py b/openllm-python/src/openllm/entrypoints/openai.py index a951ef4e..909a2bec 100644 --- a/openllm-python/src/openllm/entrypoints/openai.py +++ b/openllm-python/src/openllm/entrypoints/openai.py @@ -18,9 +18,6 @@ from openllm_core._schemas import SampleLogprobs from openllm_core.utils import converter from openllm_core.utils import gen_random_uuid -from ._openapi import CHAT_COMPLETION_SCHEMA -from ._openapi import COMPLETION_SCHEMA -from ._openapi import LIST_MODEL_SCHEMA from ._openapi import add_schema_definitions from ._openapi import append_schemas from ._openapi import get_generator @@ -127,8 +124,8 @@ def mount_to_svc(svc: bentoml.Service, llm: openllm.LLM[M, T]) -> bentoml.Servic debug=True, routes=[ Route('/models', functools.partial(list_models, llm=llm), methods=['GET']), - Route('/completions', functools.partial(create_completions, llm=llm), methods=['POST']), - Route('/chat/completions', functools.partial(create_chat_completions, llm=llm), methods=['POST']), + Route('/completions', functools.partial(completions, llm=llm), methods=['POST']), + Route('/chat/completions', 
functools.partial(chat_completions, llm=llm), methods=['POST']), ], ) mount_path = '/v1' @@ -138,7 +135,7 @@ def mount_to_svc(svc: bentoml.Service, llm: openllm.LLM[M, T]) -> bentoml.Servic # GET /v1/models -@add_schema_definitions(LIST_MODEL_SCHEMA) +@add_schema_definitions def list_models(_: Request, llm: openllm.LLM[M, T]) -> Response: return JSONResponse( converter.unstructure(ModelList(data=[ModelCard(id=llm.llm_type)])), status_code=HTTPStatus.OK.value @@ -146,8 +143,8 @@ def list_models(_: Request, llm: openllm.LLM[M, T]) -> Response: # POST /v1/chat/completions -@add_schema_definitions(CHAT_COMPLETION_SCHEMA) -async def create_chat_completions(req: Request, llm: openllm.LLM[M, T]) -> Response: +@add_schema_definitions +async def chat_completions(req: Request, llm: openllm.LLM[M, T]) -> Response: # TODO: Check for length based on model context_length json_str = await req.body() try: @@ -263,8 +260,8 @@ async def create_chat_completions(req: Request, llm: openllm.LLM[M, T]) -> Respo # POST /v1/completions -@add_schema_definitions(COMPLETION_SCHEMA) -async def create_completions(req: Request, llm: openllm.LLM[M, T]) -> Response: +@add_schema_definitions +async def completions(req: Request, llm: openllm.LLM[M, T]) -> Response: # TODO: Check for length based on model context_length json_str = await req.body() try: diff --git a/openllm-python/src/openllm/serialisation/transformers/__init__.py b/openllm-python/src/openllm/serialisation/transformers/__init__.py index 599aa6af..88ce1117 100644 --- a/openllm-python/src/openllm/serialisation/transformers/__init__.py +++ b/openllm-python/src/openllm/serialisation/transformers/__init__.py @@ -62,7 +62,7 @@ def import_model(llm, *decls, trust_remote_code, _model_store=Provide[BentoMLCon config, hub_attrs, attrs = process_config(llm.model_id, trust_remote_code, **attrs) _patch_correct_tag(llm, config) _, tokenizer_attrs = llm.llm_parameters - quantize = llm._quantise + quantize = llm.quantise safe_serialisation = openllm.utils.first_not_none( attrs.get('safe_serialization'), default=llm._serialisation == 'safetensors' ) @@ -132,7 +132,7 @@ def import_model(llm, *decls, trust_remote_code, _model_store=Provide[BentoMLCon try: bentomodel.enter_cloudpickle_context(external_modules, imported_modules) tokenizer.save_pretrained(bentomodel.path) - if llm._quantization_config or (llm._quantise and llm._quantise not in {'squeezellm', 'awq'}): + if llm._quantization_config or (llm.quantise and llm.quantise not in {'squeezellm', 'awq'}): attrs['quantization_config'] = llm.quantization_config if quantize == 'gptq': from optimum.gptq.constants import GPTQ_CONFIG @@ -205,7 +205,7 @@ def check_unintialised_params(model): def load_model(llm, *decls, **attrs): - if llm._quantise in {'awq', 'squeezellm'}: + if llm.quantise in {'awq', 'squeezellm'}: raise RuntimeError('AWQ is not yet supported with PyTorch backend.') config, attrs = transformers.AutoConfig.from_pretrained( llm.bentomodel.path, return_unused_kwargs=True, trust_remote_code=llm.trust_remote_code, **attrs @@ -217,7 +217,7 @@ def load_model(llm, *decls, **attrs): device_map = 'auto' elif torch.cuda.device_count() == 1: device_map = 'cuda:0' - if llm._quantise in {'int8', 'int4'}: + if llm.quantise in {'int8', 'int4'}: attrs['quantization_config'] = llm.quantization_config if '_quantize' in llm.bentomodel.info.metadata:
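The pattern this patch applies throughout _llm.py — moving the heavy torch/transformers imports into TYPE_CHECKING blocks or into the call sites that actually need them, and exposing the private _quantise field through a read-only quantise property — can be shown with a minimal standalone sketch. The names below (DemoLLM and its members) are illustrative only, not the actual OpenLLM implementation:

    from __future__ import annotations

    import typing as t

    if t.TYPE_CHECKING:
        import torch  # only needed by type checkers; never imported when the module loads

    class DemoLLM:
        def __init__(self, quantise: str | None = None) -> None:
            self._quantise = quantise

        @property
        def quantise(self) -> str | None:
            # Read-only accessor, so callers (runners, CLI, serialisation) no longer
            # reach for the private attribute directly.
            return self._quantise

        @property
        def import_kwargs(self) -> dict[str, t.Any]:
            import torch  # lazy: importing this module stays cheap when torch is not used

            return {
                'device_map': 'auto' if torch.cuda.is_available() else None,
                'torch_dtype': torch.float16 if torch.cuda.is_available() else torch.float32,
            }

    # DemoLLM(quantise='int8').quantise == 'int8' without torch ever being imported.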
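The _openapi.py hunk swaps the parameterised decorator for one that resolves the schema from the endpoint's function name via a module-level _SCHEMAS map. A minimal sketch of that lookup-by-name docstring decorator, with placeholder strings standing in for the real OpenAPI YAML:

    import typing as t

    CHAT_COMPLETIONS_SCHEMA = '...OpenAPI YAML for /chat/completions...'
    COMPLETIONS_SCHEMA = '...OpenAPI YAML for /completions...'

    # 'CHAT_COMPLETIONS_SCHEMA'[:-7].lower() -> 'chat_completions', matching the handler name.
    _SCHEMAS = {k[:-7].lower(): v for k, v in dict(globals()).items() if k.endswith('_SCHEMA')}

    def add_schema_definitions(func: t.Callable[..., t.Any]) -> t.Callable[..., t.Any]:
        append_str = _SCHEMAS.get(func.__name__.lower(), '')
        if not append_str:
            return func
        func.__doc__ = (func.__doc__ or '').strip() + '\n\n' + append_str.strip()
        return func

    @add_schema_definitions
    def chat_completions():
        '''Handler for POST /v1/chat/completions.'''

This is also why the handlers in hf.py and openai.py are renamed (create_chat_completions -> chat_completions, create_completions -> completions, adapters_map -> hf_adapters): the schema key is derived from the function name, so the handler names must line up with the *_SCHEMA constants.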