Mirror of https://github.com/bentoml/OpenLLM.git (synced 2026-03-02 21:56:10 -05:00)
fix(gptq): use upstream integration (#297)
* wip
* feat: GPTQ transformers integration
* fix: only load if variable is available and add changelog
* chore: remove boilerplate check

Signed-off-by: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com>
@@ -39,10 +39,11 @@ classifiers = [
 ]
 dependencies = [
   "bentoml[io]>=1.1.2",
-  "transformers[torch,tokenizers,accelerate]>=4.29.0",
+  "transformers[torch,tokenizers,accelerate]>=4.32.1",
   "openllm-client",
   "safetensors",
-  "optimum",
+  "optimum>=1.12.0",
+  "accelerate",
   "ghapi",
   "tabulate[widechars]>=0.9.0",
   "click>=8.1.3",
@@ -99,13 +100,13 @@ all = ["openllm[full]"]
 baichuan = ["cpm-kernels", "sentencepiece"]
 chatglm = ["cpm-kernels", "sentencepiece"]
 falcon = ["einops", "xformers"]
-fine-tune = ["peft>=0.4.0", "bitsandbytes", "datasets", "accelerate", "trl"]
+fine-tune = ["peft>=0.5.0", "bitsandbytes", "datasets", "accelerate", "trl"]
 flan-t5 = ["flax>=0.7", "jax", "jaxlib", "tensorflow", "keras"]
 full = [
-  "openllm[agents,baichuan,chatglm,falcon,fine-tune,flan-t5,ggml,gptq,grpc,llama,mpt,openai,opt,playground,starcoder,vllm]",
+  "openllm[agents,baichuan,chatglm,falcon,fine-tune,flan-t5,ggml,gptq,grpc,llama,mpt,openai,opt,playground,starcoder,vllm]",
 ]
 ggml = ["ctransformers"]
-gptq = ["auto-gptq[triton]"]
+gptq = ["auto-gptq[triton]>=0.4.2", "optimum>=1.12.0"]
 grpc = ["openllm-client[grpc]"]
 llama = ["fairscale", "sentencepiece"]
 mpt = ["triton", "einops"]
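The `gptq` extra now pins `auto-gptq>=0.4.2` alongside `optimum>=1.12.0`, matching the first transformers release (4.32.1) whose native GPTQ loading path works against those versions. A hedged sketch of an environment probe in the spirit of the `is_autogptq_available()`/`is_optimum_supports_gptq()` helpers used later in this diff (the helper name and exact version cut-offs below are illustrative, not the library's actual implementation):

```python
# Illustrative only: check that the environment satisfies the new GPTQ pins.
import importlib.metadata

def _supports_upstream_gptq() -> bool:
  try:
    import transformers  # needs >=4.32.1 for transformers.GPTQConfig
    optimum_version = tuple(int(p) for p in importlib.metadata.version('optimum').split('.')[:2])
    return hasattr(transformers, 'GPTQConfig') and optimum_version >= (1, 12)
  except (ImportError, importlib.metadata.PackageNotFoundError):
    return False
```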
@@ -150,7 +151,7 @@ dependencies = [
   # avoid https://github.com/pallets/click/issues/2558
   "click==8.1.3",
   "bentoml==1.1.2",
-  "transformers>=4.31.0",
+  "transformers>=4.32.1",
   "pandas-stubs",
   "types-psutil",
   "types-tabulate",

@@ -28,6 +28,7 @@ from openllm_core._typing_compat import AdaptersTuple
 from openllm_core._typing_compat import AdapterType
 from openllm_core._typing_compat import DictStrAny
 from openllm_core._typing_compat import LiteralBackend
+from openllm_core._typing_compat import LiteralQuantise
 from openllm_core._typing_compat import LiteralString
 from openllm_core._typing_compat import LLMRunnable
 from openllm_core._typing_compat import LLMRunner
@@ -63,7 +64,6 @@ from .utils import infer_auto_class

 if t.TYPE_CHECKING:
-  import auto_gptq as autogptq
   import peft
   import torch
   import transformers
@@ -71,7 +71,6 @@ if t.TYPE_CHECKING:
   from openllm_core._configuration import PeftType
   from openllm_core.utils.representation import ReprArgs
 else:
-  autogptq = LazyLoader('autogptq', globals(), 'auto_gptq')
   transformers = LazyLoader('transformers', globals(), 'transformers')
   torch = LazyLoader('torch', globals(), 'torch')
   peft = LazyLoader('peft', globals(), 'peft')
@@ -80,6 +79,8 @@ ResolvedAdaptersMapping = t.Dict[AdapterType, t.Dict[str, t.Tuple['peft.PeftConf

 logger = logging.getLogger(__name__)

+_object_setattr = object.__setattr__
+
 def normalise_model_name(name: str) -> str:
   if validate_is_path(name): return os.path.basename(resolve_filepath(name))
   name = name.replace('/', '--')
@@ -280,7 +281,8 @@ class LLM(LLMInterface[M, T], ReprMixin):

   def __attrs_init__(self,
                      config: LLMConfig,
-                     quantization_config: t.Optional[t.Union[transformers.BitsAndBytesConfig, autogptq.BaseQuantizeConfig]],
+                     quantize: t.Optional[LiteralQuantise],
+                     quantization_config: t.Optional[t.Union[transformers.BitsAndBytesConfig, transformers.GPTQConfig]],
                      model_id: str,
                      model_decls: TupleAny,
                      model_attrs: DictStrAny,
@@ -288,17 +290,16 @@ class LLM(LLMInterface[M, T], ReprMixin):
                      tag: bentoml.Tag,
                      adapters_mapping: t.Optional[AdaptersMapping],
                      model_version: t.Optional[str],
-                     quantize_method: t.Optional[t.Literal['int8', 'int4', 'gptq']],
-                     serialisation_format: t.Literal['safetensors', 'legacy'],
+                     serialisation: t.Literal['safetensors', 'legacy'],
                      _local: bool,
                      **attrs: t.Any) -> None:
     '''Generated __attrs_init__ for openllm.LLM.'''

   config: LLMConfig
   '''The config instance to use for this LLM. This will be created based on config_class and available when initialising the LLM.'''
-  quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None
+  quantization_config: transformers.BitsAndBytesConfig | transformers.GPTQConfig | None
   '''Quantisation config for quantised model on the fly.'''

+  _quantize: LiteralQuantise | None
   _model_id: str
   _model_decls: TupleAny
   _model_attrs: DictStrAny
@@ -306,8 +307,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
   _tag: bentoml.Tag
   _adapters_mapping: AdaptersMapping | None
   _model_version: str
-  _quantize_method: t.Literal['int8', 'int4', 'gptq'] | None
-  _serialisation_format: t.Literal['safetensors', 'legacy']
+  _serialisation: t.Literal['safetensors', 'legacy']
   _local: bool

   def __init_subclass__(cls: type[LLM[M, T]]) -> None:
@@ -376,11 +376,11 @@ class LLM(LLMInterface[M, T], ReprMixin):
                       model_version: str | None = None,
                       llm_config: LLMConfig | None = None,
                       *args: t.Any,
-                      quantize: t.Literal['int8', 'int4', 'gptq'] | None = None,
+                      quantize: LiteralQuantise | None = None,
                       adapter_id: str | None = None,
                       adapter_name: str | None = None,
                       adapter_map: dict[str, str | None] | None = None,
-                      quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None = None,
+                      quantization_config: transformers.BitsAndBytesConfig | transformers.GPTQConfig | None = None,
                       serialisation: t.Literal['safetensors', 'legacy'] = 'safetensors',
                       **attrs: t.Any) -> LLM[M, T]:
     '''Instantiate a pretrained LLM.
@@ -403,9 +403,6 @@ class LLM(LLMInterface[M, T], ReprMixin):
     model = openllm.AutoLLM.from_pretrained("opt", quantize='int8', llm_int8_enable_fp32_cpu_offload=False)
     ```

-    For all GPTQ-related options, it accepts all value prefixed with `gptq_*`. The parsed value then could be parsed
-    to ``auto_gptq.BaseQuantizeConfig``.
-
     ### Adapter options:

     > This is used in conjunction with the fine-tuning features
@@ -427,7 +424,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
         will use `config_class` to construct default configuration.
       quantize: The quantization to use for this LLM. Defaults to None. Possible values
                 include int8, int4 and gptq.
-      quantization_config: The quantization config (`transformers.BitsAndBytesConfig` | `autogtpq.BaseQuantizeConfig`) to use. Note that this is mutually exclusive with `quantize`
+      quantization_config: The quantization config (`transformers.BitsAndBytesConfig` | `transformers.GPTQConfig`) to use. Note that this is mutually exclusive with `quantize`
       serialisation: Type of model format to save to local store. If set to 'safetensors', then OpenLLM will save model using safetensors.
                      Default behaviour is similar to ``safe_serialization=False``.
       adapter_id: The [LoRA](https://arxiv.org/pdf/2106.09685.pdf) pretrained id or local path to use for this LLM. Defaults to None.
@@ -440,13 +437,15 @@ class LLM(LLMInterface[M, T], ReprMixin):
     _local = False
     _model_id: str = first_not_none(model_id, os.environ.get(cfg_cls.__openllm_env__['model_id']), default=cfg_cls.__openllm_default_id__)
     if validate_is_path(_model_id): _model_id, _local = resolve_filepath(_model_id), True
-    quantize = first_not_none(quantize, t.cast(t.Optional[t.Literal['int8', 'int4', 'gptq']], os.environ.get(cfg_cls.__openllm_env__['quantize'])), default=None)
+    quantize = first_not_none(quantize, t.cast(t.Optional[LiteralQuantise], os.environ.get(cfg_cls.__openllm_env__['quantize'])), default=None)

     # quantization setup
     if quantization_config and quantize:
       raise ValueError("'quantization_config' and 'quantize' are mutually exclusive. Either customise your quantization_config or use the 'quantize' argument.")
     if quantization_config is None and quantize is not None:
-      quantization_config, attrs = infer_quantisation_config(cls, quantize, **attrs)
+      # in case users input `tokenizer` to __init__, default to the _model_id
+      _gptq_tokenizer = attrs.pop('tokenizer', _model_id)
+      quantization_config, attrs = infer_quantisation_config(cls, quantize, tokenizer=_gptq_tokenizer, **attrs)
     if quantize == 'gptq': serialisation = 'safetensors'
     elif cls.__llm_backend__ == 'vllm': serialisation = 'legacy' # Currently working-in-progress

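The GPTQ calibration tokenizer now rides along automatically: if the caller does not pass `tokenizer=` explicitly, `from_pretrained` forwards the resolved model id itself into `infer_quantisation_config`. A hedged usage sketch (model ids illustrative):

```python
import openllm

# tokenizer defaults to the resolved model_id
llm = openllm.AutoLLM.from_pretrained('opt', quantize='gptq')
# or point the calibration tokenizer somewhere else explicitly
llm = openllm.AutoLLM.from_pretrained('opt', model_id='facebook/opt-2.7b', quantize='gptq', tokenizer='facebook/opt-2.7b')
```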
@@ -476,10 +475,10 @@ class LLM(LLMInterface[M, T], ReprMixin):
                model_id=_model_id,
                llm_config=llm_config,
                quantization_config=quantization_config,
-               _quantize_method=quantize,
+               _quantize=quantize,
                _model_version=_tag.version,
                _tag=_tag,
-               _serialisation_format=serialisation,
+               _serialisation=serialisation,
                _local=_local,
                _adapters_mapping=resolve_peft_config_type(adapter_map) if adapter_map is not None else None,
                **attrs)
@@ -534,12 +533,12 @@ class LLM(LLMInterface[M, T], ReprMixin):
                *args: t.Any,
                model_id: str,
                llm_config: LLMConfig,
-               quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None,
+               quantization_config: transformers.BitsAndBytesConfig | transformers.GPTQConfig | None,
                _adapters_mapping: AdaptersMapping | None,
                _tag: bentoml.Tag,
-               _quantize_method: t.Literal['int8', 'int4', 'gptq'] | None,
+               _quantize: LiteralQuantise | None,
                _model_version: str,
-               _serialisation_format: t.Literal['safetensors', 'legacy'],
+               _serialisation: t.Literal['safetensors', 'legacy'],
                _local: bool,
                **attrs: t.Any,
   ):
@@ -641,6 +640,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
     # NOTE: Save the args and kwargs for latter load
     self.__attrs_init__(llm_config,
                         quantization_config,
+                        _quantize,
                         model_id,
                         args, {
                             **model_kwds, **normalized_model_kwds
@@ -650,8 +650,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
                         _tag,
                         _adapters_mapping,
                         _model_version,
-                        _quantize_method,
-                        _serialisation_format,
+                        _serialisation,
                         _local)

     self.llm_post_init()
@@ -672,7 +671,7 @@ class LLM(LLMInterface[M, T], ReprMixin):

   @adapters_mapping.setter
   def adapters_mapping(self, value: AdaptersMapping) -> None:
-    self._adapters_mapping = value
+    _object_setattr(self, '_adapters_mapping', value)

   @property
   def __repr_keys__(self) -> set[str]:
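The setter switches to the module-level `_object_setattr` alias added earlier in this diff. That is the usual escape hatch for attrs classes whose generated `__setattr__` is restricted (frozen or otherwise guarded): `object.__setattr__` writes the attribute directly, bypassing the class override. A minimal stand-alone sketch of the pattern (the `Box` class here is hypothetical):

```python
import attr

_object_setattr = object.__setattr__

@attr.define(frozen=True)
class Box:
  value: int

b = Box(1)
# b.value = 2 would raise attr.exceptions.FrozenInstanceError
_object_setattr(b, 'value', 2)  # bypasses the frozen __setattr__ guard
print(b.value)  # 2
```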
@@ -709,13 +708,13 @@ class LLM(LLMInterface[M, T], ReprMixin):
   def tag(self) -> bentoml.Tag:
     return self._tag

-  def ensure_model_id_exists(self) -> bentoml.Model:
+  def save_pretrained(self) -> bentoml.Model:
     return openllm.import_model(self.config['start_name'],
                                 model_id=self.model_id,
                                 model_version=self._model_version,
                                 backend=self.__llm_backend__,
-                                quantize=self._quantize_method,
-                                serialisation_format=self._serialisation_format)
+                                quantize=self._quantize,
+                                serialisation=self._serialisation)

   @property
   def _bentomodel(self) -> bentoml.Model:
@@ -1085,11 +1084,11 @@ def Runner(model_name: str,
            model_id: str | None = ...,
            model_version: str | None = ...,
            llm_config: LLMConfig | None = ...,
-           quantize: t.Literal['int8', 'int4', 'gptq'] | None = ...,
+           quantize: LiteralQuantise | None = ...,
            adapter_id: str | None = ...,
            adapter_name: str | None = ...,
            adapter_map: dict[str, str | None] | None = ...,
-           quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None = None,
+           quantization_config: transformers.BitsAndBytesConfig | transformers.GPTQConfig | None = None,
            serialisation: t.Literal['safetensors', 'legacy'] = ...,
            **attrs: t.Any) -> LLMRunner[t.Any, t.Any]:
   ...
@@ -1270,7 +1269,7 @@ def llm_runner_class(self: LLM[M, T]) -> type[LLMRunner[M, T]]:
       'config': self.config,
       'backend': self.__llm_backend__,
       'peft_adapters': property(fget=available_adapters),
-      'download_model': self.ensure_model_id_exists,
+      'download_model': self.save_pretrained,
       '__call__': _wrapped_generate_run,
       'embed': _wrapped_embeddings_run,
       '__module__': self.__module__,

@@ -3,47 +3,68 @@ from __future__ import annotations
 import logging
 import typing as t

+import torch
+import transformers
+
+from openllm_core._typing_compat import LiteralQuantise
 from openllm_core._typing_compat import overload
-from openllm_core.utils import LazyLoader
 from openllm_core.utils import is_autogptq_available
 from openllm_core.utils import is_bitsandbytes_available
-from openllm_core.utils import is_transformers_supports_kbit
-from openllm_core.utils import pkg
+from openllm_core.utils import is_optimum_supports_gptq

 if t.TYPE_CHECKING:
   from openllm_core._typing_compat import DictStrAny

   from ._llm import LLM

-autogptq, torch, transformers = LazyLoader('autogptq', globals(), 'auto_gptq'), LazyLoader('torch', globals(), 'torch'), LazyLoader('transformers', globals(), 'transformers')

 logger = logging.getLogger(__name__)

-QuantiseMode = t.Literal['int8', 'int4', 'gptq']
-
 @overload
 def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: t.Literal['int8', 'int4'], **attrs: t.Any) -> tuple[transformers.BitsAndBytesConfig, DictStrAny]:
   ...

 @overload
-def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: t.Literal['gptq'], **attrs: t.Any) -> tuple[autogptq.BaseQuantizeConfig, DictStrAny]:
+def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: t.Literal['gptq'], **attrs: t.Any) -> tuple[transformers.GPTQConfig, DictStrAny]:
   ...

-def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: QuantiseMode, **attrs: t.Any) -> tuple[transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig, DictStrAny]:
+def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: LiteralQuantise, **attrs: t.Any) -> tuple[transformers.BitsAndBytesConfig | transformers.GPTQConfig, DictStrAny]:
   # 8 bit configuration
   int8_threshold = attrs.pop('llm_int8_threshhold', 6.0)
   int8_enable_fp32_cpu_offload = attrs.pop('llm_int8_enable_fp32_cpu_offload', False)
   int8_skip_modules: list[str] | None = attrs.pop('llm_int8_skip_modules', None)
   int8_has_fp16_weight = attrs.pop('llm_int8_has_fp16_weight', False)

-  autogptq_attrs: DictStrAny = {
-      'bits': attrs.pop('gptq_bits', 4),
-      'group_size': attrs.pop('gptq_group_size', -1),
-      'damp_percent': attrs.pop('gptq_damp_percent', 0.01),
-      'desc_act': attrs.pop('gptq_desc_act', True),
-      'sym': attrs.pop('gptq_sym', True),
-      'true_sequential': attrs.pop('gptq_true_sequential', True),
-  }
+  def create_gptq_config() -> transformers.GPTQConfig:
+    gptq_bits = attrs.pop('bits', 4)
+    gptq_tokenizer = attrs.pop('tokenizer', None)
+    gptq_dataset = attrs.pop('dataset', 'c4')
+    gptq_group_size = attrs.pop('group_size', 128)
+    gptq_damp_percent = attrs.pop('damp_percent', 0.1)
+    gptq_desc_act = attrs.pop('desc_act', False)
+    gptq_sym = attrs.pop('sym', True)
+    gptq_true_sequential = attrs.pop('true_sequential', True)
+    gptq_use_cuda_fp16 = attrs.pop('use_cuda_fp16', True if torch.cuda.is_available() else False)
+    gptq_model_seqlen = attrs.pop('model_seqlen', None)
+    gptq_block_name_to_quantize = attrs.pop('block_name_to_quantize', None)
+    gptq_module_name_preceding_first_block = attrs.pop('module_name_preceding_first_block', None)
+    gptq_batch_size = attrs.pop('batch_size', 1)
+    gptq_pad_token_id = attrs.pop('pad_token_id', None)
+    gptq_disable_exllama = attrs.pop('disable_exllama', False)
+    return transformers.GPTQConfig(bits=gptq_bits,
+                                   tokenizer=gptq_tokenizer,
+                                   dataset=gptq_dataset,
+                                   group_size=gptq_group_size,
+                                   damp_percent=gptq_damp_percent,
+                                   desc_act=gptq_desc_act,
+                                   sym=gptq_sym,
+                                   true_sequential=gptq_true_sequential,
+                                   use_cuda_fp16=gptq_use_cuda_fp16,
+                                   model_seqlen=gptq_model_seqlen,
+                                   block_name_to_quantize=gptq_block_name_to_quantize,
+                                   module_name_preceding_first_block=gptq_module_name_preceding_first_block,
+                                   batch_size=gptq_batch_size,
+                                   pad_token_id=gptq_pad_token_id,
+                                   disable_exllama=gptq_disable_exllama)

   def create_int8_config(int8_skip_modules: list[str] | None) -> transformers.BitsAndBytesConfig:
     if int8_skip_modules is None: int8_skip_modules = []
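`create_gptq_config()` simply forwards user attributes into `transformers.GPTQConfig`, the object the upstream transformers GPTQ integration consumes. A hedged stand-alone sketch of the equivalent call with the defaults above (model and tokenizer ids are illustrative; assumes `transformers>=4.32.1`, `optimum>=1.12.0` and `auto-gptq>=0.4.2` are installed):

```python
import transformers

# mirrors create_gptq_config() with all defaults left in place
gptq_config = transformers.GPTQConfig(bits=4, tokenizer='facebook/opt-125m', dataset='c4',
                                      group_size=128, damp_percent=0.1, desc_act=False)
# passing the config at load time triggers quantisation via optimum/auto-gptq
model = transformers.AutoModelForCausalLM.from_pretrained('facebook/opt-125m',
                                                          quantization_config=gptq_config,
                                                          device_map='auto')
```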
@@ -69,24 +90,18 @@ def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: QuantiseMo
     raise RuntimeError("Quantization requires bitsandbytes to be installed. Make sure to install OpenLLM with 'pip install \"openllm[fine-tune]\"'")
   if quantise == 'int8': quantisation_config = create_int8_config(int8_skip_modules)
   elif quantise == 'int4':
-    if is_transformers_supports_kbit():
-      quantisation_config = transformers.BitsAndBytesConfig(load_in_4bit=True,
-                                                            bnb_4bit_compute_dtype=int4_compute_dtype,
-                                                            bnb_4bit_quant_type=int4_quant_type,
-                                                            bnb_4bit_use_double_quant=int4_use_double_quant)
-    else:
-      logger.warning(
-          "'quantize' is set to int4, while the current transformers version %s does not support k-bit quantization. k-bit quantization is supported since transformers 4.30, therefore make sure to install the latest version of transformers either via PyPI or from git source: 'pip install git+https://github.com/huggingface/transformers'. Fallback to int8 quantisation.",
-          pkg.pkg_version_info('transformers'))
-      quantisation_config = create_int8_config(int8_skip_modules)
+    quantisation_config = transformers.BitsAndBytesConfig(load_in_4bit=True,
+                                                          bnb_4bit_compute_dtype=int4_compute_dtype,
+                                                          bnb_4bit_quant_type=int4_quant_type,
+                                                          bnb_4bit_use_double_quant=int4_use_double_quant)
   elif quantise == 'gptq':
-    if not is_autogptq_available():
+    if not is_autogptq_available() or not is_optimum_supports_gptq():
       logger.warning(
-          "'quantize=\"gptq\"' requires 'auto-gptq' to be installed (not available with local environment). Make sure to have 'auto-gptq' available locally: 'pip install \"openllm[gptq]\"'. OpenLLM will fallback to int8 with bitsandbytes."
+          "'quantize=\"gptq\"' requires 'auto-gptq' and 'optimum>=0.12' to be installed (not available with local environment). Make sure to have 'auto-gptq' available locally: 'pip install \"openllm[gptq]\"'. OpenLLM will fallback to int8 with bitsandbytes."
       )
       quantisation_config = create_int8_config(int8_skip_modules)
     else:
-      quantisation_config = autogptq.BaseQuantizeConfig(**autogptq_attrs)
+      quantisation_config = create_gptq_config()
   else:
     raise ValueError(f"'quantize' must be one of ['int8', 'int4', 'gptq'], got {quantise} instead.")
   return quantisation_config, attrs

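With transformers now pinned at >=4.32.1, the k-bit capability probe is redundant (4-bit support landed in 4.30), so the int4 branch builds the `BitsAndBytesConfig` unconditionally. A hedged sketch of the resulting int4 path with illustrative values (the real values come from the `int4_*` keyword pops, which sit outside this hunk):

```python
import torch
import transformers

# illustrative values; the function derives these from int4_* keyword attrs
int4_config = transformers.BitsAndBytesConfig(load_in_4bit=True,
                                              bnb_4bit_compute_dtype=torch.bfloat16,
                                              bnb_4bit_quant_type='nf4',
                                              bnb_4bit_use_double_quant=True)
```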
@@ -109,12 +109,11 @@ def metadata_v1(_: str) -> openllm.MetadataOutput:
     ],
     'num_tokens': 20
 }))
-async def embeddings_v1(phrases: list[str]) -> openllm.EmbeddingsOutput:
+async def embeddings_v1(phrases: list[str]) -> list[openllm.EmbeddingsOutput]:
   embed_call: _EmbeddingMethod = runner.embeddings if runner.supports_embeddings else generic_embedding_runner.encode # type: ignore[type-arg,assignment,valid-type]
-  responses = (await embed_call.async_run(phrases))[0]
-  return openllm.EmbeddingsOutput(embeddings=responses['embeddings'], num_tokens=responses['num_tokens'])
+  return await embed_call.async_run(phrases)

-if runner.supports_hf_agent and openllm.utils.is_transformers_supports_agent():
+if runner.supports_hf_agent:

   async def hf_agent(request: Request) -> Response:
     json_str = await request.body()

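Note the contract change: the embeddings endpoint now returns the runner output as a list instead of unwrapping element `[0]` into a single `EmbeddingsOutput`. A hedged client-side sketch (endpoint path and port assumed from the usual OpenLLM HTTP service layout, not stated in this diff):

```python
import httpx

resp = httpx.post('http://localhost:3000/v1/embeddings', json=['hello', 'world'])
resp.raise_for_status()
payload = resp.json()  # previously a single object; now a list of embedding outputs
```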
@@ -128,19 +128,19 @@ def construct_docker_options(llm: openllm.LLM[t.Any, t.Any],
                              quantize: LiteralString | None,
                              adapter_map: dict[str, str | None] | None,
                              dockerfile_template: str | None,
-                             serialisation_format: t.Literal['safetensors', 'legacy'],
+                             serialisation: t.Literal['safetensors', 'legacy'],
                              container_registry: LiteralContainerRegistry,
                              container_version_strategy: LiteralContainerVersionStrategy) -> DockerOptions:
   from openllm.cli._factory import parse_config_options
   environ = parse_config_options(llm.config, llm.config['timeout'], workers_per_resource, None, True, os.environ.copy())
   env: openllm_core.utils.EnvVarMixin = llm.config['env']
-  if env['backend_value'] == 'vllm': serialisation_format = 'legacy'
+  if env['backend_value'] == 'vllm': serialisation = 'legacy'
   env_dict = {
       env.backend: env['backend_value'],
       env.config: f"'{llm.config.model_dump_json().decode()}'",
       env.model_id: f'/home/bentoml/bento/models/{llm.tag.path()}',
       'OPENLLM_MODEL': llm.config['model_name'],
-      'OPENLLM_SERIALIZATION': serialisation_format,
+      'OPENLLM_SERIALIZATION': serialisation,
       'OPENLLM_ADAPTER_MAP': f"'{orjson.dumps(adapter_map).decode()}'",
       'BENTOML_DEBUG': str(True),
       'BENTOML_QUIET': str(False),
@@ -207,7 +207,7 @@ def create_bento(bento_tag: bentoml.Tag,
                  dockerfile_template: str | None,
                  adapter_map: dict[str, str | None] | None = None,
                  extra_dependencies: tuple[str, ...] | None = None,
-                 serialisation_format: t.Literal['safetensors', 'legacy'] = 'safetensors',
+                 serialisation: t.Literal['safetensors', 'legacy'] = 'safetensors',
                  container_registry: LiteralContainerRegistry = 'ecr',
                  container_version_strategy: LiteralContainerVersionStrategy = 'release',
                  _bento_store: BentoStore = Provide[BentoMLContainer.bento_store],
@@ -246,7 +246,7 @@ def create_bento(bento_tag: bentoml.Tag,
                  quantize,
                  adapter_map,
                  dockerfile_template,
-                 serialisation_format,
+                 serialisation,
                  container_registry,
                  container_version_strategy))

@@ -22,6 +22,7 @@ from bentoml._internal.configuration.containers import BentoMLContainer
 from openllm_core._typing_compat import Concatenate
 from openllm_core._typing_compat import DictStrAny
 from openllm_core._typing_compat import LiteralBackend
+from openllm_core._typing_compat import LiteralQuantise
 from openllm_core._typing_compat import LiteralString
 from openllm_core._typing_compat import ParamSpec
 from openllm_core._typing_compat import get_literal_args
@@ -131,15 +132,15 @@ Available official model_id(s): [default: {llm_config['default_id']}]
                  model_version: str | None,
                  workers_per_resource: t.Literal['conserved', 'round_robin'] | LiteralString,
                  device: t.Tuple[str, ...],
-                 quantize: t.Literal['int8', 'int4', 'gptq'] | None,
+                 quantize: LiteralQuantise | None,
                  backend: LiteralBackend,
-                 serialisation_format: t.Literal['safetensors', 'legacy'],
+                 serialisation: t.Literal['safetensors', 'legacy'],
                  cors: bool,
                  adapter_id: str | None,
                  return_process: bool,
                  **attrs: t.Any,
   ) -> LLMConfig | subprocess.Popen[bytes]:
-    if serialisation_format == 'safetensors' and quantize is not None and openllm_core.utils.check_bool_env('OPENLLM_SERIALIZATION_WARNING'):
+    if serialisation == 'safetensors' and quantize is not None and openllm_core.utils.check_bool_env('OPENLLM_SERIALIZATION_WARNING'):
       termui.echo(
           f"'--quantize={quantize}' might not work with 'safetensors' serialisation format. Use with caution!. To silence this warning, set \"OPENLLM_SERIALIZATION_WARNING=False\"\nNote: You can always fallback to '--serialisation legacy' when running quantisation.",
           fg='yellow')
@@ -184,11 +185,11 @@ Available official model_id(s): [default: {llm_config['default_id']}]
         'BENTOML_DEBUG': str(openllm.utils.get_debug_mode()),
         'BENTOML_HOME': os.environ.get('BENTOML_HOME', BentoMLContainer.bentoml_home.get()),
         'OPENLLM_ADAPTER_MAP': orjson.dumps(adapter_map).decode(),
-        'OPENLLM_SERIALIZATION': serialisation_format,
-        env.backend: env['backend_value']
+        'OPENLLM_SERIALIZATION': serialisation,
+        env.backend: env['backend_value'],
     })
     if env['model_id_value']: start_env[env.model_id] = str(env['model_id_value'])
-    if quantize is not None: start_env[env.quantize] = str(t.cast(str, env['quantize_value']))
+    if env['quantize_value']: start_env[env.quantize] = str(env['quantize_value'])

     llm = openllm.utils.infer_auto_class(env['backend_value']).for_model(model,
                                                                          model_id=start_env[env.model_id],
@@ -196,7 +197,8 @@ Available official model_id(s): [default: {llm_config['default_id']}]
                                                                          llm_config=config,
                                                                          ensure_available=True,
                                                                          adapter_map=adapter_map,
-                                                                         serialisation=serialisation_format)
+                                                                         quantize=env['quantize_value'],
+                                                                         serialisation=serialisation)
     start_env.update({env.config: llm.config.model_dump_json().decode()})

     server = bentoml.GrpcServer('_service:svc', **server_attrs) if _serve_grpc else bentoml.HTTPServer('_service:svc', **server_attrs)
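The start path now sources the quantisation mode from the model's environment variable rather than only from the parsed CLI flag, so a value exported before `openllm start` is honoured and forwarded into `for_model(..., quantize=...)`. A hedged sketch (the exact variable name is generated by `EnvVarMixin`; the one below is an assumed spelling for OPT):

```python
import os

# assumed EnvVarMixin naming scheme for the opt model
os.environ['OPENLLM_OPT_QUANTIZE'] = 'int8'
# a subsequent `openllm start opt` would then pick up quantize='int8'
```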
@@ -262,8 +264,7 @@ def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callab

           - DeepSpeed Inference: [link](https://www.deepspeed.ai/inference/)
           - GGML: Fast inference on [bare metal](https://github.com/ggerganov/ggml)
-          ''',
-      ),
+          '''),
       quantize_option(factory=cog.optgroup),
       serialisation_option(factory=cog.optgroup),
       cog.optgroup.option('--device',
@@ -457,7 +458,7 @@ def workers_per_resource_option(f: _AnyCallable | None = None, *, build: bool =
 def serialisation_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
   return cli_option('--serialisation',
                     '--serialization',
-                    'serialisation_format',
+                    'serialisation',
                     type=click.Choice(['safetensors', 'legacy']),
                     default='safetensors',
                     show_default=True,

@@ -26,6 +26,7 @@ if t.TYPE_CHECKING:
   from openllm_core._typing_compat import LiteralBackend
   from openllm_core._typing_compat import LiteralContainerRegistry
   from openllm_core._typing_compat import LiteralContainerVersionStrategy
+  from openllm_core._typing_compat import LiteralQuantise
   from openllm_core._typing_compat import LiteralString

 logger = logging.getLogger(__name__)
@@ -37,7 +38,7 @@ def _start(model_name: str,
            timeout: int = 30,
            workers_per_resource: t.Literal['conserved', 'round_robin'] | float | None = None,
            device: tuple[str, ...] | t.Literal['all'] | None = None,
-           quantize: t.Literal['int8', 'int4', 'gptq'] | None = None,
+           quantize: LiteralQuantise | None = None,
            adapter_map: dict[LiteralString, str | None] | None = None,
            backend: LiteralBackend | None = None,
            additional_args: list[str] | None = None,
@@ -109,7 +110,7 @@ def _build(model_name: str,
            model_id: str | None = None,
            model_version: str | None = None,
            bento_version: str | None = None,
-           quantize: t.Literal['int8', 'int4', 'gptq'] | None = None,
+           quantize: LiteralQuantise | None = None,
            adapter_map: dict[str, str | None] | None = None,
            build_ctx: str | None = None,
            enable_features: tuple[str, ...] | None = None,
@@ -120,7 +121,7 @@ def _build(model_name: str,
            container_version_strategy: LiteralContainerVersionStrategy | None = None,
            push: bool = False,
            containerize: bool = False,
-           serialisation_format: t.Literal['safetensors', 'legacy'] = 'safetensors',
+           serialisation: t.Literal['safetensors', 'legacy'] = 'safetensors',
            additional_args: list[str] | None = None,
            bento_store: BentoStore = Provide[BentoMLContainer.bento_store]) -> bentoml.Bento:
   """Package a LLM into a Bento.
@@ -160,14 +161,14 @@ def _build(model_name: str,
-    container_registry: Container registry to choose the base OpenLLM container image to build from. Default to ECR.
+    container_registry: Container registry to choose the base OpenLLM container image to build from. Default to ECR.
     container_version_strategy: The container version strategy. Default to the latest release of OpenLLM.
-    serialisation_format: Serialisation for saving models. Default to 'safetensors', which is equivalent to `safe_serialization=True`
+    serialisation: Serialisation for saving models. Default to 'safetensors', which is equivalent to `safe_serialization=True`
     additional_args: Additional arguments to pass to ``openllm build``.
     bento_store: Optional BentoStore for saving this BentoLLM. Default to the default BentoML local store.

   Returns:
       ``bentoml.Bento | str``: BentoLLM instance. This can be used to serve the LLM or can be pushed to BentoCloud.
   """
-  args: list[str] = [sys.executable, '-m', 'openllm', 'build', model_name, '--machine', '--serialisation', serialisation_format]
+  args: list[str] = [sys.executable, '-m', 'openllm', 'build', model_name, '--machine', '--serialisation', serialisation]
   if quantize: args.extend(['--quantize', quantize])
   if containerize and push: raise OpenLLMException("'containerize' and 'push' are currently mutually exclusive.")
   if push: args.extend(['--push'])
@@ -203,8 +204,8 @@ def _import_model(model_name: str,
                   model_id: str | None = None,
                   model_version: str | None = None,
                   backend: LiteralBackend = 'pt',
-                  quantize: t.Literal['int8', 'int4', 'gptq'] | None = None,
-                  serialisation_format: t.Literal['legacy', 'safetensors'] = 'safetensors',
+                  quantize: LiteralQuantise | None = None,
+                  serialisation: t.Literal['legacy', 'safetensors'] = 'safetensors',
                   additional_args: t.Sequence[str] | None = None) -> bentoml.Model:
   """Import a LLM into local store.

@@ -228,7 +229,7 @@ def _import_model(model_name: str,
       - int8: Quantize the model with 8bit (bitsandbytes required)
       - int4: Quantize the model with 4bit (bitsandbytes required)
       - gptq: Quantize the model with GPTQ (auto-gptq required)
-    serialisation_format: Type of model format to save to local store. If set to 'safetensors', then OpenLLM will save model using safetensors.
+    serialisation: Type of model format to save to local store. If set to 'safetensors', then OpenLLM will save model using safetensors.
                    Default behaviour is similar to ``safe_serialization=False``.
     additional_args: Additional arguments to pass to ``openllm import``.

@@ -236,7 +237,7 @@ def _import_model(model_name: str,
       ``bentoml.Model``:BentoModel of the given LLM. This can be used to serve the LLM or can be pushed to BentoCloud.
   """
   from .entrypoint import import_command
-  args = [model_name, '--backend', backend, '--machine', '--serialisation', serialisation_format]
+  args = [model_name, '--backend', backend, '--machine', '--serialisation', serialisation]
   if model_id is not None: args.append(model_id)
   if model_version is not None: args.extend(['--model-version', str(model_version)])
   if additional_args is not None: args.extend(additional_args)

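For callers of the public SDK helpers, the only visible change is the keyword rename. A hedged usage sketch (model ids illustrative):

```python
import openllm

# was: openllm.import_model('opt', ..., serialisation_format='safetensors')
model = openllm.import_model('opt', model_id='facebook/opt-2.7b', serialisation='safetensors')
bento = openllm.build('opt', serialisation='safetensors')
```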
@@ -54,7 +54,6 @@ import openllm
 from bentoml._internal.configuration.containers import BentoMLContainer
 from bentoml._internal.models.model import ModelStore
 from openllm import bundle
-from openllm import serialisation
 from openllm.exceptions import OpenLLMException
 from openllm.models.auto import CONFIG_MAPPING
 from openllm.models.auto import MODEL_FLAX_MAPPING_NAMES
@@ -67,6 +66,7 @@ from openllm.utils import infer_auto_class
 from openllm_core._typing_compat import Concatenate
 from openllm_core._typing_compat import DictStrAny
 from openllm_core._typing_compat import LiteralBackend
+from openllm_core._typing_compat import LiteralQuantise
 from openllm_core._typing_compat import LiteralString
 from openllm_core._typing_compat import ParamSpec
 from openllm_core._typing_compat import Self
@@ -84,7 +84,6 @@ from openllm_core.utils import first_not_none
 from openllm_core.utils import get_debug_mode
 from openllm_core.utils import get_quiet_mode
 from openllm_core.utils import is_torch_available
-from openllm_core.utils import is_transformers_supports_agent
 from openllm_core.utils import resolve_user_filepath
 from openllm_core.utils import set_debug_mode
 from openllm_core.utils import set_quiet_mode
@@ -343,8 +342,8 @@ def import_command(
     output: LiteralOutput,
     machine: bool,
     backend: LiteralBackend,
-    quantize: t.Literal['int8', 'int4', 'gptq'] | None,
-    serialisation_format: t.Literal['safetensors', 'legacy'],
+    quantize: LiteralQuantise | None,
+    serialisation: t.Literal['safetensors', 'legacy'],
 ) -> bentoml.Model:
   """Setup LLM interactively.

@@ -369,7 +368,7 @@ def import_command(

   \b
   ```bash
-  $ openllm download opt facebook/opt-2.7b
+  $ openllm import opt facebook/opt-2.7b
   ```

   \b
@@ -400,17 +399,19 @@ def import_command(
   env = EnvVarMixin(model_name, backend=llm_config.default_backend(), model_id=model_id, quantize=quantize)
   backend = first_not_none(backend, default=env['backend_value'])
   llm = infer_auto_class(backend).for_model(
-      model_name, model_id=env['model_id_value'], llm_config=llm_config, model_version=model_version, ensure_available=False, serialisation=serialisation_format
+      model_name, model_id=env['model_id_value'], llm_config=llm_config, model_version=model_version, ensure_available=False,
+      quantize=env['quantize_value'],
+      serialisation=serialisation
   )
   _previously_saved = False
   try:
-    _ref = serialisation.get(llm)
+    _ref = openllm.serialisation.get(llm)
     _previously_saved = True
   except openllm.exceptions.OpenLLMException:
     if not machine and output == 'pretty':
       msg = f"'{model_name}' {'with model_id='+ model_id if model_id is not None else ''} does not exists in local store for backend {llm.__llm_backend__}. Saving to BENTOML_HOME{' (path=' + os.environ.get('BENTOML_HOME', BentoMLContainer.bentoml_home.get()) + ')' if get_debug_mode() else ''}..."
       termui.echo(msg, fg='yellow', nl=True)
-    _ref = serialisation.get(llm, auto_import=True)
+    _ref = openllm.serialisation.get(llm, auto_import=True)
   if backend == 'pt' and is_torch_available() and torch.cuda.is_available(): torch.cuda.empty_cache()
   if machine: return _ref
   elif output == 'pretty':
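A side effect of renaming the `serialisation_format` parameter to `serialisation` is that it would shadow the `openllm.serialisation` module inside `import_command`, which is presumably why the bare `from openllm import serialisation` import was dropped above and both call sites now use the fully qualified `openllm.serialisation.get(...)`.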
@@ -472,7 +473,7 @@ def build_command(
     bento_version: str | None,
     overwrite: bool,
     output: LiteralOutput,
-    quantize: t.Literal['int8', 'int4', 'gptq'] | None,
+    quantize: LiteralQuantise | None,
     enable_features: tuple[str, ...] | None,
     workers_per_resource: float | None,
     adapter_id: tuple[str, ...],
@@ -483,7 +484,7 @@ def build_command(
     dockerfile_template: t.TextIO | None,
     containerize: bool,
     push: bool,
-    serialisation_format: t.Literal['safetensors', 'legacy'],
+    serialisation: t.Literal['safetensors', 'legacy'],
     container_registry: LiteralContainerRegistry,
     container_version_strategy: LiteralContainerVersionStrategy,
     force_push: bool,
@@ -517,12 +518,12 @@ def build_command(
   # NOTE: We set this environment variable so that our service.py logic won't raise RuntimeError
   # during build. This is a current limitation of bentoml build where we actually import the service.py into sys.path
   try:
-    os.environ.update({'OPENLLM_MODEL': inflection.underscore(model_name), 'OPENLLM_SERIALIZATION': serialisation_format, 'OPENLLM_BACKEND': env['backend_value']})
+    os.environ.update({'OPENLLM_MODEL': inflection.underscore(model_name), 'OPENLLM_SERIALIZATION': serialisation, env.backend: env['backend_value']})
     if env['model_id_value']: os.environ[env.model_id] = str(env['model_id_value'])
     if env['quantize_value']: os.environ[env.quantize] = str(env['quantize_value'])

     llm = infer_auto_class(env['backend_value']).for_model(
-        model_name, model_id=env['model_id_value'], llm_config=llm_config, ensure_available=True, model_version=model_version, serialisation=serialisation_format, **attrs
+        model_name, model_id=env['model_id_value'], llm_config=llm_config, ensure_available=True, model_version=model_version, quantize=env['quantize_value'], serialisation=serialisation, **attrs
     )

     labels = dict(llm.identifying_params)
@@ -798,7 +799,6 @@ def instruct_command(endpoint: str, timeout: int, agent: LiteralString, output:
   except http.client.BadStatusLine:
     raise click.ClickException(f'{endpoint} is neither a HTTP server nor reachable.') from None
   if agent == 'hf':
-    if not is_transformers_supports_agent(): raise click.UsageError("Transformers version should be at least 4.29 to support HfAgent. Upgrade with 'pip install -U transformers'")
     _memoized = {k: v[0] for k, v in _memoized.items() if v}
     client._hf_agent.set_stream(logger.info)
     if output != 'porcelain': termui.echo(f"Sending the following prompt ('{task}') with the following vars: {_memoized}", fg='magenta')
@@ -49,7 +49,7 @@ class BaseAutoLLMClass:
     ```
     '''
     llm = cls.infer_class_from_name(model).from_pretrained(model_id=model_id, model_version=model_version, llm_config=llm_config, **attrs)
-    if ensure_available: llm.ensure_model_id_exists()
+    if ensure_available: llm.save_pretrained()
     return llm

   @classmethod

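Downstream code that relied on the old method name needs a one-line change; the runner's `download_model` attribute keeps working since it now points at the renamed method. A hedged migration sketch:

```python
import openllm

llm = openllm.AutoLLM.from_pretrained('opt')
bentomodel = llm.save_pretrained()  # was: llm.ensure_model_id_exists()
```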
@@ -37,6 +37,7 @@ def load_tokenizer(llm: openllm.LLM[t.Any, T], **tokenizer_attrs: t.Any) -> T:
   from .transformers._helpers import process_config

+  config, *_ = process_config(llm._bentomodel.path, llm.trust_remote_code)

   bentomodel_fs = fs.open_fs(llm._bentomodel.path)
   if bentomodel_fs.isfile(CUSTOM_OBJECTS_FILENAME):
     with bentomodel_fs.open(CUSTOM_OBJECTS_FILENAME, 'rb') as cofile:

@@ -14,13 +14,14 @@ import openllm

 from bentoml._internal.configuration.containers import BentoMLContainer
 from bentoml._internal.models.model import ModelOptions
+from openllm_core._typing_compat import M
+from openllm_core._typing_compat import T

 from ._helpers import check_unintialised_params
 from ._helpers import infer_autoclass_from_llm
 from ._helpers import infer_tokenizers_from_llm
 from ._helpers import make_model_signatures
 from ._helpers import process_config
-from ._helpers import update_model
 from .weights import HfIgnore

 if t.TYPE_CHECKING:
@@ -32,8 +33,6 @@ if t.TYPE_CHECKING:

   from bentoml._internal.models import ModelStore
   from openllm_core._typing_compat import DictStrAny
-  from openllm_core._typing_compat import M
-  from openllm_core._typing_compat import T
 else:
-  autogptq = openllm.utils.LazyLoader('autogptq', globals(), 'auto_gptq')
   torch = openllm.utils.LazyLoader('torch', globals(), 'torch')
@@ -63,16 +62,23 @@ def import_model(llm: openllm.LLM[M, T], *decls: t.Any, trust_remote_code: bool,
   """
   config, hub_attrs, attrs = process_config(llm.model_id, trust_remote_code, **attrs)
   _, tokenizer_attrs = llm.llm_parameters
-  quantize_method = llm._quantize_method
-  safe_serialisation = openllm.utils.first_not_none(attrs.get('safe_serialization'), default=llm._serialisation_format == 'safetensors')
+  quantize = llm._quantize
+  safe_serialisation = openllm.utils.first_not_none(attrs.get('safe_serialization'), default=llm._serialisation == 'safetensors')
   # Disable safe serialization with vLLM
   if llm.__llm_backend__ == 'vllm': safe_serialisation = False
-  metadata: DictStrAny = {'safe_serialisation': safe_serialisation, '_quantize': quantize_method is not None and quantize_method}
+  metadata: DictStrAny = {'safe_serialisation': safe_serialisation}
+  if quantize: metadata['_quantize'] = quantize
+  architectures = getattr(config, 'architectures', [])
+  if not architectures: raise RuntimeError('Failed to determine the architecture for this model. Make sure the `config.json` is valid and can be loaded with `transformers.AutoConfig`')
+  metadata['_pretrained_class'] = architectures[0]

   signatures: DictStrAny = {}

-  if quantize_method == 'gptq':
-    if not openllm.utils.is_autogptq_available():
-      raise openllm.exceptions.OpenLLMException("GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'")
+  if quantize == 'gptq':
+    if not openllm.utils.is_autogptq_available() or not openllm.utils.is_optimum_supports_gptq():
+      raise openllm.exceptions.OpenLLMException(
+          "GPTQ quantisation requires 'auto-gptq' and 'optimum' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\" --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/'"
+      )
     if llm.config['model_type'] != 'causal_lm':
       raise openllm.exceptions.OpenLLMException(f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})")
     signatures['generate'] = {'batchable': False}
@@ -82,7 +88,8 @@ def import_model(llm: openllm.LLM[M, T], *decls: t.Any, trust_remote_code: bool,
   if 'quantization_config' in attrs and getattr(attrs['quantization_config'], 'load_in_4bit', False):
     attrs.pop('quantization_config')
   if llm.__llm_backend__ != 'flax': attrs['use_safetensors'] = safe_serialisation
-  metadata['_framework'] = 'pt' if llm.__llm_backend__ == 'vllm' else llm.__llm_backend__
+  metadata['_framework'] = llm.__llm_backend__
+  signatures.update(make_model_signatures(llm))

   tokenizer = infer_tokenizers_from_llm(llm).from_pretrained(llm.model_id, trust_remote_code=trust_remote_code, **hub_attrs, **tokenizer_attrs)
   if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token
@@ -95,42 +102,22 @@ def import_model(llm: openllm.LLM[M, T], *decls: t.Any, trust_remote_code: bool,
                               options=ModelOptions(),
                               context=openllm.utils.generate_context(framework_name='openllm'),
                               labels=openllm.utils.generate_labels(llm),
-                              signatures=signatures if signatures else make_model_signatures(llm))
+                              metadata=metadata,
+                              signatures=signatures)
   with openllm.utils.analytics.set_bentoml_tracking():
     try:
       bentomodel.enter_cloudpickle_context(external_modules, imported_modules)
       tokenizer.save_pretrained(bentomodel.path)
-      if quantize_method == 'gptq':
-        if not openllm.utils.is_autogptq_available():
-          raise openllm.exceptions.OpenLLMException("GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'")
-        if llm.config['model_type'] != 'causal_lm':
-          raise openllm.exceptions.OpenLLMException(f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})")
-        logger.debug('Saving model with GPTQ quantisation will require loading model into memory.')
-        model = autogptq.AutoGPTQForCausalLM.from_quantized(llm.model_id,
-                                                            *decls,
-                                                            quantize_config=t.cast('autogptq.BaseQuantizeConfig', llm.quantization_config),
-                                                            trust_remote_code=trust_remote_code,
-                                                            use_safetensors=safe_serialisation,
-                                                            **hub_attrs,
-                                                            **attrs)
-        update_model(bentomodel, metadata={'_pretrained_class': model.__class__.__name__, '_framework': model.model.framework})
-        model.save_quantized(bentomodel.path, use_safetensors=safe_serialisation)
+      if llm._local:
+        # possible local path
+        logger.debug('Model will be loaded into memory to save to target store as it is from local path.')
+        model = infer_autoclass_from_llm(llm, config).from_pretrained(llm.model_id, *decls, config=config, trust_remote_code=trust_remote_code, **hub_attrs, **attrs)
+        # for trust_remote_code to work
+        bentomodel.enter_cloudpickle_context([importlib.import_module(model.__module__)], imported_modules)
+        model.save_pretrained(bentomodel.path, max_shard_size='5GB', safe_serialization=safe_serialisation)
       else:
-        architectures = getattr(config, 'architectures', [])
-        if not architectures:
-          raise RuntimeError('Failed to determine the architecture for this model. Make sure the `config.json` is valid and can be loaded with `transformers.AutoConfig`')
-        architecture = architectures[0]
-        update_model(bentomodel, metadata={'_pretrained_class': architecture})
-        if llm._local:
-          # possible local path
-          logger.debug('Model will be loaded into memory to save to target store as it is from local path.')
-          model = infer_autoclass_from_llm(llm, config).from_pretrained(llm.model_id, *decls, config=config, trust_remote_code=trust_remote_code, **hub_attrs, **attrs)
-          # for trust_remote_code to work
-          bentomodel.enter_cloudpickle_context([importlib.import_module(model.__module__)], imported_modules)
-          model.save_pretrained(bentomodel.path, max_shard_size='5GB', safe_serialization=safe_serialisation)
-        else:
-          # we will clone the all tings into the bentomodel path without loading model into memory
-          snapshot_download(llm.model_id, local_dir=bentomodel.path, local_dir_use_symlinks=False, ignore_patterns=HfIgnore.ignore_patterns(llm))
+        # we will clone the all tings into the bentomodel path without loading model into memory
+        snapshot_download(llm.model_id, local_dir=bentomodel.path, local_dir_use_symlinks=False, ignore_patterns=HfIgnore.ignore_patterns(llm))
     except Exception:
       raise
     else:
@@ -165,29 +152,27 @@ def get(llm: openllm.LLM[M, T], auto_import: bool = False) -> bentoml.Model:

 def load_model(llm: openllm.LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M:
   config, hub_attrs, attrs = process_config(llm.model_id, llm.trust_remote_code, **attrs)
-  safe_serialization = openllm.utils.first_not_none(t.cast(t.Optional[bool], llm._bentomodel.info.metadata.get('safe_serialisation', None)),
-                                                    attrs.pop('safe_serialization', None),
-                                                    default=llm._serialisation_format == 'safetensors')
-  if '_quantize' in llm._bentomodel.info.metadata and llm._bentomodel.info.metadata['_quantize'] == 'gptq':
-    if not openllm.utils.is_autogptq_available():
-      raise openllm.exceptions.OpenLLMException("GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'")
-    if llm.config['model_type'] != 'causal_lm':
-      raise openllm.exceptions.OpenLLMException(f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})")
-    return autogptq.AutoGPTQForCausalLM.from_quantized(llm._bentomodel.path,
-                                                       *decls,
-                                                       quantize_config=t.cast('autogptq.BaseQuantizeConfig', llm.quantization_config),
-                                                       trust_remote_code=llm.trust_remote_code,
-                                                       use_safetensors=safe_serialization,
-                                                       **hub_attrs,
-                                                       **attrs)
+  auto_class = infer_autoclass_from_llm(llm, config)

-  device_map: str | None = attrs.pop('device_map', 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None)
-  model = infer_autoclass_from_llm(llm, config).from_pretrained(llm._bentomodel.path,
-                                                                *decls,
-                                                                config=config,
-                                                                trust_remote_code=llm.trust_remote_code,
-                                                                device_map=device_map,
-                                                                **hub_attrs,
-                                                                **attrs).eval()
-  if llm.__llm_backend__ in {'pt', 'vllm'}: check_unintialised_params(model)
+  device_map = attrs.pop('device_map', 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None)
+  if '_quantize' in llm._bentomodel.info.metadata and llm._bentomodel.info.metadata['_quantize'] == 'gptq':
+    if not openllm.utils.is_autogptq_available() or not openllm.utils.is_optimum_supports_gptq():
+      raise openllm.exceptions.OpenLLMException(
+          "GPTQ quantisation requires 'auto-gptq' and 'optimum' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\" --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/'"
+      )
+    if llm.config['model_type'] != 'causal_lm': raise openllm.exceptions.OpenLLMException(f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})")

+    model = auto_class.from_pretrained(llm._bentomodel.path, device_map='auto', **hub_attrs, **attrs)
+    # TODO: Use the below logic once TheBloke finished migration to new GPTQConfig from transformers
+    # from accelerate import init_empty_weights
+    # from optimum.gptq import load_quantized_model
+    # # disable exllama if gptq is loaded on CPU
+    # disable_exllama = not torch.cuda.is_available()
+    # with init_empty_weights():
+    #   empty = auto_class.from_pretrained(llm.model_id, torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32, device_map='auto')
+    # empty.tie_weights()
+    # model = load_quantized_model(empty, save_folder=llm._bentomodel.path, device_map='auto', disable_exllama=disable_exllama)
+  else:
+    model = auto_class.from_pretrained(llm._bentomodel.path, *decls, config=config, trust_remote_code=llm.trust_remote_code, device_map=device_map, **hub_attrs, **attrs).eval()
+    if llm.__llm_backend__ in {'pt', 'vllm'}: check_unintialised_params(model)
   return t.cast('M', model)

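Loading a GPTQ checkpoint now goes through the plain transformers auto class, since transformers reads the `quantization_config` embedded in the checkpoint itself; the optimum `load_quantized_model` path stays parked in the TODO until popular hubs finish migrating to the new config format. A hedged stand-alone sketch of the new load path (repository id illustrative):

```python
import transformers

# the quantisation settings are picked up from the checkpoint's own
# quantization_config entry; no auto_gptq-specific loader is needed
model = transformers.AutoModelForCausalLM.from_pretrained('TheBloke/Llama-2-7B-GPTQ', device_map='auto')
```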
@@ -5,7 +5,6 @@ import typing as t
 import openllm
 import openllm_core

-from bentoml._internal.models.model import ModelInfo
 from bentoml._internal.models.model import ModelSignature
 from openllm.serialisation.constants import FRAMEWORK_TO_AUTOCLASS_MAPPING
 from openllm.serialisation.constants import HUB_ATTRS
@@ -16,8 +15,6 @@ if t.TYPE_CHECKING:

   from transformers.models.auto.auto_factory import _BaseAutoModelClass

-  import bentoml
-
   from bentoml._internal.models.model import ModelSignaturesType
   from openllm_core._typing_compat import DictStrAny
   from openllm_core._typing_compat import M
@@ -25,8 +22,6 @@ if t.TYPE_CHECKING:
 else:
   transformers, torch = openllm_core.utils.LazyLoader('transformers', globals(), 'transformers'), openllm_core.utils.LazyLoader('torch', globals(), 'torch')

-_object_setattr = object.__setattr__
-
 def process_config(model_id: str, trust_remote_code: bool, **attrs: t.Any) -> tuple[transformers.PretrainedConfig, DictStrAny, DictStrAny]:
   '''A helper function that correctly parse config and attributes for transformers.PretrainedConfig.
@@ -73,24 +68,6 @@ def check_unintialised_params(model: torch.nn.Module) -> None:
   if len(unintialized) > 0:
     raise RuntimeError(f'Found the following unintialized parameters in {model}: {unintialized}')

-def update_model(bentomodel: bentoml.Model, metadata: DictStrAny) -> bentoml.Model:
-  based: DictStrAny = copy.deepcopy(bentomodel.info.metadata)
-  based.update(metadata)
-  _object_setattr(
-      bentomodel,
-      '_info',
-      ModelInfo( # type: ignore[call-arg] # XXX: remove me once upstream is merged
-          tag=bentomodel.info.tag,
-          module=bentomodel.info.module,
-          labels=bentomodel.info.labels,
-          options=bentomodel.info.options.to_dict(),
-          signatures=bentomodel.info.signatures,
-          context=bentomodel.info.context,
-          api_version=bentomodel.info.api_version,
-          creation_time=bentomodel.info.creation_time,
-          metadata=based))
-  return bentomodel
-
 # NOTE: sync with bentoml/_internal/frameworks/transformers.py#make_default_signatures
 def make_model_signatures(llm: openllm.LLM[M, T]) -> ModelSignaturesType:
   infer_fn: tuple[str, ...] = ('__call__',)

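`update_model` existed only to rebuild the frozen `ModelInfo` after creation via `object.__setattr__`. It is dead code now because `import_model` (earlier in this diff) computes `_pretrained_class` and `_quantize` before calling `bentoml.models.create(...)` and passes `metadata=metadata` up front, so the model's metadata never needs to be patched afterwards; the `ModelInfo` import and the `_object_setattr` alias in this helper module go with it.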
@@ -11,11 +11,12 @@ import openllm

 if t.TYPE_CHECKING:
   from openllm_core._typing_compat import LiteralBackend
+  from openllm_core._typing_compat import LiteralQuantise

 logger = logging.getLogger(__name__)

 @contextlib.contextmanager
-def build_bento(model: str, model_id: str | None = None, quantize: t.Literal['int4', 'int8', 'gptq'] | None = None, cleanup: bool = False) -> t.Iterator[bentoml.Bento]:
+def build_bento(model: str, model_id: str | None = None, quantize: LiteralQuantise | None = None, cleanup: bool = False) -> t.Iterator[bentoml.Bento]:
   logger.info('Building BentoML for %s', model)
   bento = openllm.build(model, model_id=model_id, quantize=quantize)
   yield bento

@@ -19,13 +19,7 @@ if t.TYPE_CHECKING:
   from openllm_core._typing_compat import LiteralBackend

 def generate_labels(llm: openllm.LLM[t.Any, t.Any]) -> dict[str, t.Any]:
-  return {
-      'backend': llm.__llm_backend__,
-      'framework': 'openllm',
-      'model_name': llm.config['model_name'],
-      'architecture': llm.config['architecture'],
-      'serialisation_format': llm._serialisation_format
-  }
+  return {'backend': llm.__llm_backend__, 'framework': 'openllm', 'model_name': llm.config['model_name'], 'architecture': llm.config['architecture'], 'serialisation': llm._serialisation}

 def infer_auto_class(backend: LiteralBackend) -> type[openllm.AutoLLM | openllm.AutoTFLLM | openllm.AutoFlaxLLM | openllm.AutoVLLM]:
   import openllm

@@ -24,6 +24,7 @@ import openllm
 from openllm._llm import normalise_model_name
 from openllm_core._typing_compat import DictStrAny
 from openllm_core._typing_compat import ListAny
+from openllm_core._typing_compat import LiteralQuantise

 logger = logging.getLogger(__name__)

@@ -141,14 +142,7 @@ class DockerHandle(_Handle):
     return container.status in ['running', 'created']

 @contextlib.contextmanager
-def _local_handle(model: str,
-                  model_id: str,
-                  image_tag: str,
-                  deployment_mode: t.Literal['container', 'local'],
-                  quantize: t.Literal['int8', 'int4', 'gptq'] | None = None,
-                  *,
-                  _serve_grpc: bool = False,
-                  ):
+def _local_handle(model: str, model_id: str, image_tag: str, deployment_mode: t.Literal['container', 'local'], quantize: LiteralQuantise | None = None, *, _serve_grpc: bool = False):
   with openllm.utils.reserve_free_port() as port:
     pass

@@ -169,14 +163,7 @@ def _local_handle(model: str,
   proc.stderr.close()

 @contextlib.contextmanager
-def _container_handle(model: str,
-                      model_id: str,
-                      image_tag: str,
-                      deployment_mode: t.Literal['container', 'local'],
-                      quantize: t.Literal['int8', 'int4', 'gptq'] | None = None,
-                      *,
-                      _serve_grpc: bool = False,
-                      ):
+def _container_handle(model: str, model_id: str, image_tag: str, deployment_mode: t.Literal['container', 'local'], quantize: LiteralQuantise | None = None, *, _serve_grpc: bool = False):
   envvar = openllm.utils.EnvVarMixin(model)

   with openllm.utils.reserve_free_port() as port, openllm.utils.reserve_free_port() as prom_port: