diff --git a/changelog.d/297.refactor.md b/changelog.d/297.refactor.md new file mode 100644 index 00000000..d88ff541 --- /dev/null +++ b/changelog.d/297.refactor.md @@ -0,0 +1 @@ +Refactor GPTQ to use official implementation from transformers>=4.32 diff --git a/openllm-client/pyproject.toml b/openllm-client/pyproject.toml index f4466905..573f73cf 100644 --- a/openllm-client/pyproject.toml +++ b/openllm-client/pyproject.toml @@ -105,7 +105,7 @@ dependencies = [ # avoid https://github.com/pallets/click/issues/2558 "click==8.1.3", "bentoml==1.1.2", - "transformers>=4.31.0", + "transformers>=4.32.1", "pandas-stubs", "types-psutil", "types-tabulate", diff --git a/openllm-client/src/openllm_client/_base.py b/openllm-client/src/openllm_client/_base.py index ae453ec2..5188c187 100644 --- a/openllm-client/src/openllm_client/_base.py +++ b/openllm-client/src/openllm_client/_base.py @@ -19,7 +19,6 @@ from openllm_core._typing_compat import overload from openllm_core.utils import bentoml_cattr from openllm_core.utils import ensure_exec_coro from openllm_core.utils import is_transformers_available -from openllm_core.utils import is_transformers_supports_agent from .benmin import AsyncClient as AsyncBentoClient from .benmin import Client as BentoClient @@ -94,8 +93,6 @@ class _ClientAttr: raise RuntimeError("transformers is required to use HF agent. Install with 'pip install \"openllm-client[agents]\"'.") if not self.supports_hf_agent: raise RuntimeError(f'{self.model_name} ({self.backend}) does not support running HF agent.') - if not is_transformers_supports_agent(): - raise RuntimeError("Current 'transformers' does not support Agent. Make sure to upgrade to at least 4.29: 'pip install -U \"transformers>=4.29\"'") import transformers return transformers.HfAgent(urljoin(self._address, '/hf/agent')) @@ -215,8 +212,6 @@ class _AsyncClient(_ClientAttr): else: raise RuntimeError(f"Unknown 'agent_type={agent_type}'") async def _run_hf_agent(self, *args: t.Any, **kwargs: t.Any) -> t.Any: - if not is_transformers_supports_agent(): - raise RuntimeError('This version of transformers does not support agent.run. 
Make sure to upgrade to transformers>4.30.0') if len(args) > 1: raise ValueError("'args' should only take one positional argument.") from transformers.tools.agents import clean_code_for_run from transformers.tools.agents import get_tool_creation_code diff --git a/openllm-core/pyproject.toml b/openllm-core/pyproject.toml index f519700a..b025e889 100644 --- a/openllm-core/pyproject.toml +++ b/openllm-core/pyproject.toml @@ -118,7 +118,7 @@ dependencies = [ # avoid https://github.com/pallets/click/issues/2558 "click==8.1.3", "bentoml==1.1.2", - "transformers>=4.31.0", + "transformers>=4.32.1", "pandas-stubs", "types-psutil", "types-tabulate", diff --git a/openllm-core/src/openllm_core/_typing_compat.py b/openllm-core/src/openllm_core/_typing_compat.py index 21939026..1054a233 100644 --- a/openllm-core/src/openllm_core/_typing_compat.py +++ b/openllm-core/src/openllm_core/_typing_compat.py @@ -11,7 +11,6 @@ import bentoml from bentoml._internal.types import ModelSignatureDict as ModelSignatureDict if t.TYPE_CHECKING: - import auto_gptq as autogptq import peft import transformers import vllm @@ -26,11 +25,7 @@ if t.TYPE_CHECKING: from .utils.lazy import VersionInfo -M = t.TypeVar( - 'M', - bound= - 't.Union[transformers.PreTrainedModel, transformers.Pipeline, transformers.TFPreTrainedModel, transformers.FlaxPreTrainedModel, vllm.LLMEngine, peft.PeftModel, autogptq.modeling.BaseGPTQForCausalLM]' -) +M = t.TypeVar('M', bound='t.Union[transformers.PreTrainedModel, transformers.Pipeline, transformers.TFPreTrainedModel, transformers.FlaxPreTrainedModel, vllm.LLMEngine, peft.PeftModel]') T = t.TypeVar('T', bound='t.Union[transformers.PreTrainedTokenizerFast, transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerBase]') def get_literal_args(typ: t.Any) -> tuple[str, ...]: @@ -43,6 +38,7 @@ ListStr = t.List[str] TupleAny = t.Tuple[t.Any, ...] 
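The `_typing_compat.py` hunk here drops `auto_gptq` from the `M` TypeVar bound and introduces a single `LiteralQuantise = t.Literal['int8', 'int4', 'gptq']` alias (added just below) in place of the inline literal that was previously repeated across signatures. A minimal sketch of how such an alias is typically consumed — the `load` helper is illustrative only and not part of this patch:

```python
import typing as t

# Mirrors the alias added to openllm_core/_typing_compat.py in this patch.
LiteralQuantise = t.Literal['int8', 'int4', 'gptq']

def load(model_id: str, quantize: t.Optional[LiteralQuantise] = None) -> None:
    # Type checkers flag unsupported values at every call site that reuses the
    # alias; at runtime the accepted values can be recovered with get_args().
    if quantize is not None and quantize not in t.get_args(LiteralQuantise):
        raise ValueError(f'unsupported quantisation scheme: {quantize!r}')
    print(f'loading {model_id} with quantize={quantize}')

load('facebook/opt-125m', quantize='gptq')
```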
At = t.TypeVar('At', bound=attr.AttrsInstance) +LiteralQuantise = t.Literal['int8', 'int4', 'gptq'] LiteralBackend = t.Literal['pt', 'tf', 'flax', 'vllm', 'ggml', 'mlc'] AdapterType = t.Literal['lora', 'adalora', 'adaption_prompt', 'prefix_tuning', 'p_tuning', 'prompt_tuning', 'ia3'] diff --git a/openllm-core/src/openllm_core/utils/__init__.py b/openllm-core/src/openllm_core/utils/__init__.py index bdf4e20a..dfe76ea6 100644 --- a/openllm-core/src/openllm_core/utils/__init__.py +++ b/openllm-core/src/openllm_core/utils/__init__.py @@ -332,8 +332,6 @@ _import_structure: dict[str, list[str]] = { 'is_bitsandbytes_available', 'is_peft_available', 'is_datasets_available', - 'is_transformers_supports_kbit', - 'is_transformers_supports_agent', 'is_jupyter_available', 'is_jupytext_available', 'is_notebook_available', @@ -344,7 +342,8 @@ _import_structure: dict[str, list[str]] = { 'is_fairscale_available', 'is_grpc_available', 'is_grpc_health_available', - 'is_transformers_available' + 'is_transformers_available', + 'is_optimum_supports_gptq', ] } @@ -368,13 +367,12 @@ if t.TYPE_CHECKING: from .import_utils import is_jupyter_available as is_jupyter_available from .import_utils import is_jupytext_available as is_jupytext_available from .import_utils import is_notebook_available as is_notebook_available + from .import_utils import is_optimum_supports_gptq as is_optimum_supports_gptq from .import_utils import is_peft_available as is_peft_available from .import_utils import is_sentencepiece_available as is_sentencepiece_available from .import_utils import is_tf_available as is_tf_available from .import_utils import is_torch_available as is_torch_available from .import_utils import is_transformers_available as is_transformers_available - from .import_utils import is_transformers_supports_agent as is_transformers_supports_agent - from .import_utils import is_transformers_supports_kbit as is_transformers_supports_kbit from .import_utils import is_triton_available as is_triton_available from .import_utils import is_vllm_available as is_vllm_available from .import_utils import is_xformers_available as is_xformers_available diff --git a/openllm-core/src/openllm_core/utils/import_utils.py b/openllm-core/src/openllm_core/utils/import_utils.py index fe5d44c6..e0a4aef2 100644 --- a/openllm-core/src/openllm_core/utils/import_utils.py +++ b/openllm-core/src/openllm_core/utils/import_utils.py @@ -74,11 +74,8 @@ def is_grpc_available() -> bool: def is_grpc_health_available() -> bool: return _grpc_health_available -def is_transformers_supports_kbit() -> bool: - return pkg.pkg_version_info('transformers')[:2] >= (4, 30) - -def is_transformers_supports_agent() -> bool: - return pkg.pkg_version_info('transformers')[:2] >= (4, 29) +def is_optimum_supports_gptq() -> bool: + return pkg.pkg_version_info('optimum')[:2] >= (0, 12) def is_jupyter_available() -> bool: return _jupyter_available diff --git a/openllm-python/pyproject.toml b/openllm-python/pyproject.toml index 129f5075..c2d225b1 100644 --- a/openllm-python/pyproject.toml +++ b/openllm-python/pyproject.toml @@ -39,10 +39,11 @@ classifiers = [ ] dependencies = [ "bentoml[io]>=1.1.2", - "transformers[torch,tokenizers,accelerate]>=4.29.0", + "transformers[torch,tokenizers,accelerate]>=4.32.1", "openllm-client", "safetensors", - "optimum", + "optimum>=1.12.0", + "accelerate", "ghapi", "tabulate[widechars]>=0.9.0", "click>=8.1.3", @@ -99,13 +100,13 @@ all = ["openllm[full]"] baichuan = ["cpm-kernels", "sentencepiece"] chatglm = ["cpm-kernels", "sentencepiece"] falcon = 
["einops", "xformers"] -fine-tune = ["peft>=0.4.0", "bitsandbytes", "datasets", "accelerate", "trl"] +fine-tune = ["peft>=0.5.0", "bitsandbytes", "datasets", "accelerate", "trl"] flan-t5 = ["flax>=0.7", "jax", "jaxlib", "tensorflow", "keras"] full = [ - "openllm[agents,baichuan,chatglm,falcon,fine-tune,flan-t5,ggml,gptq,grpc,llama,mpt,openai,opt,playground,starcoder,vllm]", + "openllm[agents,baichuan,chatglm,falcon,fine-tune,flan-t5,ggml,gptq,grpc,llama,mpt,openai,opt,playground,starcoder,vllm]", ] ggml = ["ctransformers"] -gptq = ["auto-gptq[triton]"] +gptq = ["auto-gptq[triton]>=0.4.2", "optimum>=1.12.0"] grpc = ["openllm-client[grpc]"] llama = ["fairscale", "sentencepiece"] mpt = ["triton", "einops"] @@ -150,7 +151,7 @@ dependencies = [ # avoid https://github.com/pallets/click/issues/2558 "click==8.1.3", "bentoml==1.1.2", - "transformers>=4.31.0", + "transformers>=4.32.1", "pandas-stubs", "types-psutil", "types-tabulate", diff --git a/openllm-python/src/openllm/_llm.py b/openllm-python/src/openllm/_llm.py index 89160e3c..7e0ad43f 100644 --- a/openllm-python/src/openllm/_llm.py +++ b/openllm-python/src/openllm/_llm.py @@ -28,6 +28,7 @@ from openllm_core._typing_compat import AdaptersTuple from openllm_core._typing_compat import AdapterType from openllm_core._typing_compat import DictStrAny from openllm_core._typing_compat import LiteralBackend +from openllm_core._typing_compat import LiteralQuantise from openllm_core._typing_compat import LiteralString from openllm_core._typing_compat import LLMRunnable from openllm_core._typing_compat import LLMRunner @@ -63,7 +64,6 @@ from .utils import infer_auto_class if t.TYPE_CHECKING: - import auto_gptq as autogptq import peft import torch import transformers @@ -71,7 +71,6 @@ if t.TYPE_CHECKING: from openllm_core._configuration import PeftType from openllm_core.utils.representation import ReprArgs else: - autogptq = LazyLoader('autogptq', globals(), 'auto_gptq') transformers = LazyLoader('transformers', globals(), 'transformers') torch = LazyLoader('torch', globals(), 'torch') peft = LazyLoader('peft', globals(), 'peft') @@ -80,6 +79,8 @@ ResolvedAdaptersMapping = t.Dict[AdapterType, t.Dict[str, t.Tuple['peft.PeftConf logger = logging.getLogger(__name__) +_object_setattr = object.__setattr__ + def normalise_model_name(name: str) -> str: if validate_is_path(name): return os.path.basename(resolve_filepath(name)) name = name.replace('/', '--') @@ -280,7 +281,8 @@ class LLM(LLMInterface[M, T], ReprMixin): def __attrs_init__(self, config: LLMConfig, - quantization_config: t.Optional[t.Union[transformers.BitsAndBytesConfig, autogptq.BaseQuantizeConfig]], + quantize: t.Optional[LiteralQuantise], + quantization_config: t.Optional[t.Union[transformers.BitsAndBytesConfig, transformers.GPTQConfig]], model_id: str, model_decls: TupleAny, model_attrs: DictStrAny, @@ -288,17 +290,16 @@ class LLM(LLMInterface[M, T], ReprMixin): tag: bentoml.Tag, adapters_mapping: t.Optional[AdaptersMapping], model_version: t.Optional[str], - quantize_method: t.Optional[t.Literal['int8', 'int4', 'gptq']], - serialisation_format: t.Literal['safetensors', 'legacy'], + serialisation: t.Literal['safetensors', 'legacy'], _local: bool, **attrs: t.Any) -> None: '''Generated __attrs_init__ for openllm.LLM.''' config: LLMConfig '''The config instance to use for this LLM. 
This will be created based on config_class and available when initialising the LLM.''' - quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None + quantization_config: transformers.BitsAndBytesConfig | transformers.GPTQConfig | None '''Quantisation config for quantised model on the fly.''' - + _quantize: LiteralQuantise | None _model_id: str _model_decls: TupleAny _model_attrs: DictStrAny @@ -306,8 +307,7 @@ class LLM(LLMInterface[M, T], ReprMixin): _tag: bentoml.Tag _adapters_mapping: AdaptersMapping | None _model_version: str - _quantize_method: t.Literal['int8', 'int4', 'gptq'] | None - _serialisation_format: t.Literal['safetensors', 'legacy'] + _serialisation: t.Literal['safetensors', 'legacy'] _local: bool def __init_subclass__(cls: type[LLM[M, T]]) -> None: @@ -376,11 +376,11 @@ class LLM(LLMInterface[M, T], ReprMixin): model_version: str | None = None, llm_config: LLMConfig | None = None, *args: t.Any, - quantize: t.Literal['int8', 'int4', 'gptq'] | None = None, + quantize: LiteralQuantise | None = None, adapter_id: str | None = None, adapter_name: str | None = None, adapter_map: dict[str, str | None] | None = None, - quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None = None, + quantization_config: transformers.BitsAndBytesConfig | transformers.GPTQConfig | None = None, serialisation: t.Literal['safetensors', 'legacy'] = 'safetensors', **attrs: t.Any) -> LLM[M, T]: '''Instantiate a pretrained LLM. @@ -403,9 +403,6 @@ class LLM(LLMInterface[M, T], ReprMixin): model = openllm.AutoLLM.from_pretrained("opt", quantize='int8', llm_int8_enable_fp32_cpu_offload=False) ``` - For all GPTQ-related options, it accepts all value prefixed with `gptq_*`. The parsed value then could be parsed - to ``auto_gptq.BaseQuantizeConfig``. - ### Adapter options: > This is used in conjunction with the fine-tuning features @@ -427,7 +424,7 @@ class LLM(LLMInterface[M, T], ReprMixin): will use `config_class` to construct default configuration. quantize: The quantization to use for this LLM. Defaults to None. Possible values include int8, int4 and gptq. - quantization_config: The quantization config (`transformers.BitsAndBytesConfig` | `autogtpq.BaseQuantizeConfig`) to use. Note that this is mutually exclusive with `quantize` + quantization_config: The quantization config (`transformers.BitsAndBytesConfig` | `transformers.GPTQConfig`) to use. Note that this is mutually exclusive with `quantize` serialisation: Type of model format to save to local store. If set to 'safetensors', then OpenLLM will save model using safetensors. Default behaviour is similar to ``safe_serialization=False``. adapter_id: The [LoRA](https://arxiv.org/pdf/2106.09685.pdf) pretrained id or local path to use for this LLM. Defaults to None. @@ -440,13 +437,15 @@ class LLM(LLMInterface[M, T], ReprMixin): _local = False _model_id: str = first_not_none(model_id, os.environ.get(cfg_cls.__openllm_env__['model_id']), default=cfg_cls.__openllm_default_id__) if validate_is_path(_model_id): _model_id, _local = resolve_filepath(_model_id), True - quantize = first_not_none(quantize, t.cast(t.Optional[t.Literal['int8', 'int4', 'gptq']], os.environ.get(cfg_cls.__openllm_env__['quantize'])), default=None) + quantize = first_not_none(quantize, t.cast(t.Optional[LiteralQuantise], os.environ.get(cfg_cls.__openllm_env__['quantize'])), default=None) # quantization setup if quantization_config and quantize: raise ValueError("'quantization_config' and 'quantize' are mutually exclusive. 
Either customise your quantization_config or use the 'quantize' argument.") if quantization_config is None and quantize is not None: - quantization_config, attrs = infer_quantisation_config(cls, quantize, **attrs) + # in case users input `tokenizer` to __init__, default to the _model_id + _gptq_tokenizer = attrs.pop('tokenizer', _model_id) + quantization_config, attrs = infer_quantisation_config(cls, quantize, tokenizer=_gptq_tokenizer, **attrs) if quantize == 'gptq': serialisation = 'safetensors' elif cls.__llm_backend__ == 'vllm': serialisation = 'legacy' # Currently working-in-progress @@ -476,10 +475,10 @@ class LLM(LLMInterface[M, T], ReprMixin): model_id=_model_id, llm_config=llm_config, quantization_config=quantization_config, - _quantize_method=quantize, + _quantize=quantize, _model_version=_tag.version, _tag=_tag, - _serialisation_format=serialisation, + _serialisation=serialisation, _local=_local, _adapters_mapping=resolve_peft_config_type(adapter_map) if adapter_map is not None else None, **attrs) @@ -534,12 +533,12 @@ class LLM(LLMInterface[M, T], ReprMixin): *args: t.Any, model_id: str, llm_config: LLMConfig, - quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None, + quantization_config: transformers.BitsAndBytesConfig | transformers.GPTQConfig | None, _adapters_mapping: AdaptersMapping | None, _tag: bentoml.Tag, - _quantize_method: t.Literal['int8', 'int4', 'gptq'] | None, + _quantize: LiteralQuantise | None, _model_version: str, - _serialisation_format: t.Literal['safetensors', 'legacy'], + _serialisation: t.Literal['safetensors', 'legacy'], _local: bool, **attrs: t.Any, ): @@ -641,6 +640,7 @@ class LLM(LLMInterface[M, T], ReprMixin): # NOTE: Save the args and kwargs for latter load self.__attrs_init__(llm_config, quantization_config, + _quantize, model_id, args, { **model_kwds, **normalized_model_kwds @@ -650,8 +650,7 @@ class LLM(LLMInterface[M, T], ReprMixin): _tag, _adapters_mapping, _model_version, - _quantize_method, - _serialisation_format, + _serialisation, _local) self.llm_post_init() @@ -672,7 +671,7 @@ class LLM(LLMInterface[M, T], ReprMixin): @adapters_mapping.setter def adapters_mapping(self, value: AdaptersMapping) -> None: - self._adapters_mapping = value + _object_setattr(self, '_adapters_mapping', value) @property def __repr_keys__(self) -> set[str]: @@ -709,13 +708,13 @@ class LLM(LLMInterface[M, T], ReprMixin): def tag(self) -> bentoml.Tag: return self._tag - def ensure_model_id_exists(self) -> bentoml.Model: + def save_pretrained(self) -> bentoml.Model: return openllm.import_model(self.config['start_name'], model_id=self.model_id, model_version=self._model_version, backend=self.__llm_backend__, - quantize=self._quantize_method, - serialisation_format=self._serialisation_format) + quantize=self._quantize, + serialisation=self._serialisation) @property def _bentomodel(self) -> bentoml.Model: @@ -1085,11 +1084,11 @@ def Runner(model_name: str, model_id: str | None = ..., model_version: str | None = ..., llm_config: LLMConfig | None = ..., - quantize: t.Literal['int8', 'int4', 'gptq'] | None = ..., + quantize: LiteralQuantise | None = ..., adapter_id: str | None = ..., adapter_name: str | None = ..., adapter_map: dict[str, str | None] | None = ..., - quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None = None, + quantization_config: transformers.BitsAndBytesConfig | transformers.GPTQConfig | None = None, serialisation: t.Literal['safetensors', 'legacy'] = ..., **attrs: t.Any) -> 
LLMRunner[t.Any, t.Any]: ... @@ -1270,7 +1269,7 @@ def llm_runner_class(self: LLM[M, T]) -> type[LLMRunner[M, T]]: 'config': self.config, 'backend': self.__llm_backend__, 'peft_adapters': property(fget=available_adapters), - 'download_model': self.ensure_model_id_exists, + 'download_model': self.save_pretrained, '__call__': _wrapped_generate_run, 'embed': _wrapped_embeddings_run, '__module__': self.__module__, diff --git a/openllm-python/src/openllm/_quantisation.py b/openllm-python/src/openllm/_quantisation.py index 83258aac..16f02893 100644 --- a/openllm-python/src/openllm/_quantisation.py +++ b/openllm-python/src/openllm/_quantisation.py @@ -3,47 +3,68 @@ from __future__ import annotations import logging import typing as t +import torch +import transformers + +from openllm_core._typing_compat import LiteralQuantise from openllm_core._typing_compat import overload -from openllm_core.utils import LazyLoader from openllm_core.utils import is_autogptq_available from openllm_core.utils import is_bitsandbytes_available -from openllm_core.utils import is_transformers_supports_kbit -from openllm_core.utils import pkg +from openllm_core.utils import is_optimum_supports_gptq if t.TYPE_CHECKING: from openllm_core._typing_compat import DictStrAny from ._llm import LLM -autogptq, torch, transformers = LazyLoader('autogptq', globals(), 'auto_gptq'), LazyLoader('torch', globals(), 'torch'), LazyLoader('transformers', globals(), 'transformers') - logger = logging.getLogger(__name__) -QuantiseMode = t.Literal['int8', 'int4', 'gptq'] - @overload def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: t.Literal['int8', 'int4'], **attrs: t.Any) -> tuple[transformers.BitsAndBytesConfig, DictStrAny]: ... @overload -def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: t.Literal['gptq'], **attrs: t.Any) -> tuple[autogptq.BaseQuantizeConfig, DictStrAny]: +def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: t.Literal['gptq'], **attrs: t.Any) -> tuple[transformers.GPTQConfig, DictStrAny]: ... 
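The overloads above now return `transformers.GPTQConfig` when `quantise='gptq'`, delegating GPTQ to the official integration that transformers 4.32 exposes through optimum. For reference, a minimal sketch of that upstream API — the model id is only an example, quantising this way assumes `auto-gptq>=0.4.2` and `optimum>=1.12.0` on a CUDA machine, and none of this is part of the patch itself:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig

model_id = 'facebook/opt-125m'  # example model, not mandated by the patch
tokenizer = AutoTokenizer.from_pretrained(model_id)

# The kwargs mirror the defaults used by create_gptq_config() further below.
gptq_config = GPTQConfig(bits=4, dataset='c4', tokenizer=tokenizer, group_size=128, damp_percent=0.1, desc_act=False)

# from_pretrained() performs the quantisation, delegating to optimum + auto-gptq.
model = AutoModelForCausalLM.from_pretrained(model_id, device_map='auto', quantization_config=gptq_config)

# Once saved, the quantised checkpoint reloads without passing a GPTQConfig:
model.save_pretrained('opt-125m-gptq')
model = AutoModelForCausalLM.from_pretrained('opt-125m-gptq', device_map='auto')
```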
-def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: QuantiseMode, **attrs: t.Any) -> tuple[transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig, DictStrAny]: +def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: LiteralQuantise, **attrs: t.Any) -> tuple[transformers.BitsAndBytesConfig | transformers.GPTQConfig, DictStrAny]: # 8 bit configuration int8_threshold = attrs.pop('llm_int8_threshhold', 6.0) int8_enable_fp32_cpu_offload = attrs.pop('llm_int8_enable_fp32_cpu_offload', False) int8_skip_modules: list[str] | None = attrs.pop('llm_int8_skip_modules', None) int8_has_fp16_weight = attrs.pop('llm_int8_has_fp16_weight', False) - autogptq_attrs: DictStrAny = { - 'bits': attrs.pop('gptq_bits', 4), - 'group_size': attrs.pop('gptq_group_size', -1), - 'damp_percent': attrs.pop('gptq_damp_percent', 0.01), - 'desc_act': attrs.pop('gptq_desc_act', True), - 'sym': attrs.pop('gptq_sym', True), - 'true_sequential': attrs.pop('gptq_true_sequential', True), - } + def create_gptq_config() -> transformers.GPTQConfig: + gptq_bits = attrs.pop('bits', 4) + gptq_tokenizer = attrs.pop('tokenizer', None) + gptq_dataset = attrs.pop('dataset', 'c4') + gptq_group_size = attrs.pop('group_size', 128) + gptq_damp_percent = attrs.pop('damp_percent', 0.1) + gptq_desc_act = attrs.pop('desc_act', False) + gptq_sym = attrs.pop('sym', True) + gptq_true_sequential = attrs.pop('true_sequential', True) + gptq_use_cuda_fp16 = attrs.pop('use_cuda_fp16', True if torch.cuda.is_available() else False) + gptq_model_seqlen = attrs.pop('model_seqlen', None) + gptq_block_name_to_quantize = attrs.pop('block_name_to_quantize', None) + gptq_module_name_preceding_first_block = attrs.pop('module_name_preceding_first_block', None) + gptq_batch_size = attrs.pop('batch_size', 1) + gptq_pad_token_id = attrs.pop('pad_token_id', None) + gptq_disable_exllama = attrs.pop('disable_exllama', False) + return transformers.GPTQConfig(bits=gptq_bits, + tokenizer=gptq_tokenizer, + dataset=gptq_dataset, + group_size=gptq_group_size, + damp_percent=gptq_damp_percent, + desc_act=gptq_desc_act, + sym=gptq_sym, + true_sequential=gptq_true_sequential, + use_cuda_fp16=gptq_use_cuda_fp16, + model_seqlen=gptq_model_seqlen, + block_name_to_quantize=gptq_block_name_to_quantize, + module_name_preceding_first_block=gptq_module_name_preceding_first_block, + batch_size=gptq_batch_size, + pad_token_id=gptq_pad_token_id, + disable_exllama=gptq_disable_exllama) def create_int8_config(int8_skip_modules: list[str] | None) -> transformers.BitsAndBytesConfig: if int8_skip_modules is None: int8_skip_modules = [] @@ -69,24 +90,18 @@ def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: QuantiseMo raise RuntimeError("Quantization requires bitsandbytes to be installed. Make sure to install OpenLLM with 'pip install \"openllm[fine-tune]\"'") if quantise == 'int8': quantisation_config = create_int8_config(int8_skip_modules) elif quantise == 'int4': - if is_transformers_supports_kbit(): - quantisation_config = transformers.BitsAndBytesConfig(load_in_4bit=True, - bnb_4bit_compute_dtype=int4_compute_dtype, - bnb_4bit_quant_type=int4_quant_type, - bnb_4bit_use_double_quant=int4_use_double_quant) - else: - logger.warning( - "'quantize' is set to int4, while the current transformers version %s does not support k-bit quantization. 
k-bit quantization is supported since transformers 4.30, therefore make sure to install the latest version of transformers either via PyPI or from git source: 'pip install git+https://github.com/huggingface/transformers'. Fallback to int8 quantisation.", - pkg.pkg_version_info('transformers')) - quantisation_config = create_int8_config(int8_skip_modules) + quantisation_config = transformers.BitsAndBytesConfig(load_in_4bit=True, + bnb_4bit_compute_dtype=int4_compute_dtype, + bnb_4bit_quant_type=int4_quant_type, + bnb_4bit_use_double_quant=int4_use_double_quant) elif quantise == 'gptq': - if not is_autogptq_available(): + if not is_autogptq_available() or not is_optimum_supports_gptq(): logger.warning( - "'quantize=\"gptq\"' requires 'auto-gptq' to be installed (not available with local environment). Make sure to have 'auto-gptq' available locally: 'pip install \"openllm[gptq]\"'. OpenLLM will fallback to int8 with bitsandbytes." + "'quantize=\"gptq\"' requires 'auto-gptq' and 'optimum>=0.12' to be installed (not available with local environment). Make sure to have 'auto-gptq' available locally: 'pip install \"openllm[gptq]\"'. OpenLLM will fallback to int8 with bitsandbytes." ) quantisation_config = create_int8_config(int8_skip_modules) else: - quantisation_config = autogptq.BaseQuantizeConfig(**autogptq_attrs) + quantisation_config = create_gptq_config() else: raise ValueError(f"'quantize' must be one of ['int8', 'int4', 'gptq'], got {quantise} instead.") return quantisation_config, attrs diff --git a/openllm-python/src/openllm/_service.py b/openllm-python/src/openllm/_service.py index 70fd1608..fc2be132 100644 --- a/openllm-python/src/openllm/_service.py +++ b/openllm-python/src/openllm/_service.py @@ -109,12 +109,11 @@ def metadata_v1(_: str) -> openllm.MetadataOutput: ], 'num_tokens': 20 })) -async def embeddings_v1(phrases: list[str]) -> openllm.EmbeddingsOutput: +async def embeddings_v1(phrases: list[str]) -> list[openllm.EmbeddingsOutput]: embed_call: _EmbeddingMethod = runner.embeddings if runner.supports_embeddings else generic_embedding_runner.encode # type: ignore[type-arg,assignment,valid-type] - responses = (await embed_call.async_run(phrases))[0] - return openllm.EmbeddingsOutput(embeddings=responses['embeddings'], num_tokens=responses['num_tokens']) + return await embed_call.async_run(phrases) -if runner.supports_hf_agent and openllm.utils.is_transformers_supports_agent(): +if runner.supports_hf_agent: async def hf_agent(request: Request) -> Response: json_str = await request.body() diff --git a/openllm-python/src/openllm/bundle/_package.py b/openllm-python/src/openllm/bundle/_package.py index 5cb23f20..262d5730 100644 --- a/openllm-python/src/openllm/bundle/_package.py +++ b/openllm-python/src/openllm/bundle/_package.py @@ -128,19 +128,19 @@ def construct_docker_options(llm: openllm.LLM[t.Any, t.Any], quantize: LiteralString | None, adapter_map: dict[str, str | None] | None, dockerfile_template: str | None, - serialisation_format: t.Literal['safetensors', 'legacy'], + serialisation: t.Literal['safetensors', 'legacy'], container_registry: LiteralContainerRegistry, container_version_strategy: LiteralContainerVersionStrategy) -> DockerOptions: from openllm.cli._factory import parse_config_options environ = parse_config_options(llm.config, llm.config['timeout'], workers_per_resource, None, True, os.environ.copy()) env: openllm_core.utils.EnvVarMixin = llm.config['env'] - if env['backend_value'] == 'vllm': serialisation_format = 'legacy' + if env['backend_value'] == 'vllm': 
serialisation = 'legacy' env_dict = { env.backend: env['backend_value'], env.config: f"'{llm.config.model_dump_json().decode()}'", env.model_id: f'/home/bentoml/bento/models/{llm.tag.path()}', 'OPENLLM_MODEL': llm.config['model_name'], - 'OPENLLM_SERIALIZATION': serialisation_format, + 'OPENLLM_SERIALIZATION': serialisation, 'OPENLLM_ADAPTER_MAP': f"'{orjson.dumps(adapter_map).decode()}'", 'BENTOML_DEBUG': str(True), 'BENTOML_QUIET': str(False), @@ -207,7 +207,7 @@ def create_bento(bento_tag: bentoml.Tag, dockerfile_template: str | None, adapter_map: dict[str, str | None] | None = None, extra_dependencies: tuple[str, ...] | None = None, - serialisation_format: t.Literal['safetensors', 'legacy'] = 'safetensors', + serialisation: t.Literal['safetensors', 'legacy'] = 'safetensors', container_registry: LiteralContainerRegistry = 'ecr', container_version_strategy: LiteralContainerVersionStrategy = 'release', _bento_store: BentoStore = Provide[BentoMLContainer.bento_store], @@ -246,7 +246,7 @@ def create_bento(bento_tag: bentoml.Tag, quantize, adapter_map, dockerfile_template, - serialisation_format, + serialisation, container_registry, container_version_strategy)) diff --git a/openllm-python/src/openllm/cli/_factory.py b/openllm-python/src/openllm/cli/_factory.py index b1d1b5af..085e0ed1 100644 --- a/openllm-python/src/openllm/cli/_factory.py +++ b/openllm-python/src/openllm/cli/_factory.py @@ -22,6 +22,7 @@ from bentoml._internal.configuration.containers import BentoMLContainer from openllm_core._typing_compat import Concatenate from openllm_core._typing_compat import DictStrAny from openllm_core._typing_compat import LiteralBackend +from openllm_core._typing_compat import LiteralQuantise from openllm_core._typing_compat import LiteralString from openllm_core._typing_compat import ParamSpec from openllm_core._typing_compat import get_literal_args @@ -131,15 +132,15 @@ Available official model_id(s): [default: {llm_config['default_id']}] model_version: str | None, workers_per_resource: t.Literal['conserved', 'round_robin'] | LiteralString, device: t.Tuple[str, ...], - quantize: t.Literal['int8', 'int4', 'gptq'] | None, + quantize: LiteralQuantise | None, backend: LiteralBackend, - serialisation_format: t.Literal['safetensors', 'legacy'], + serialisation: t.Literal['safetensors', 'legacy'], cors: bool, adapter_id: str | None, return_process: bool, **attrs: t.Any, ) -> LLMConfig | subprocess.Popen[bytes]: - if serialisation_format == 'safetensors' and quantize is not None and openllm_core.utils.check_bool_env('OPENLLM_SERIALIZATION_WARNING'): + if serialisation == 'safetensors' and quantize is not None and openllm_core.utils.check_bool_env('OPENLLM_SERIALIZATION_WARNING'): termui.echo( f"'--quantize={quantize}' might not work with 'safetensors' serialisation format. Use with caution!. 
To silence this warning, set \"OPENLLM_SERIALIZATION_WARNING=False\"\nNote: You can always fallback to '--serialisation legacy' when running quantisation.", fg='yellow') @@ -184,11 +185,11 @@ Available official model_id(s): [default: {llm_config['default_id']}] 'BENTOML_DEBUG': str(openllm.utils.get_debug_mode()), 'BENTOML_HOME': os.environ.get('BENTOML_HOME', BentoMLContainer.bentoml_home.get()), 'OPENLLM_ADAPTER_MAP': orjson.dumps(adapter_map).decode(), - 'OPENLLM_SERIALIZATION': serialisation_format, - env.backend: env['backend_value'] + 'OPENLLM_SERIALIZATION': serialisation, + env.backend: env['backend_value'], }) if env['model_id_value']: start_env[env.model_id] = str(env['model_id_value']) - if quantize is not None: start_env[env.quantize] = str(t.cast(str, env['quantize_value'])) + if env['quantize_value']: start_env[env.quantize] = str(env['quantize_value']) llm = openllm.utils.infer_auto_class(env['backend_value']).for_model(model, model_id=start_env[env.model_id], @@ -196,7 +197,8 @@ Available official model_id(s): [default: {llm_config['default_id']}] llm_config=config, ensure_available=True, adapter_map=adapter_map, - serialisation=serialisation_format) + quantize=env['quantize_value'], + serialisation=serialisation) start_env.update({env.config: llm.config.model_dump_json().decode()}) server = bentoml.GrpcServer('_service:svc', **server_attrs) if _serve_grpc else bentoml.HTTPServer('_service:svc', **server_attrs) @@ -262,8 +264,7 @@ def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callab - DeepSpeed Inference: [link](https://www.deepspeed.ai/inference/) - GGML: Fast inference on [bare metal](https://github.com/ggerganov/ggml) - ''', - ), + '''), quantize_option(factory=cog.optgroup), serialisation_option(factory=cog.optgroup), cog.optgroup.option('--device', @@ -457,7 +458,7 @@ def workers_per_resource_option(f: _AnyCallable | None = None, *, build: bool = def serialisation_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]: return cli_option('--serialisation', '--serialization', - 'serialisation_format', + 'serialisation', type=click.Choice(['safetensors', 'legacy']), default='safetensors', show_default=True, diff --git a/openllm-python/src/openllm/cli/_sdk.py b/openllm-python/src/openllm/cli/_sdk.py index c981f126..3812eb23 100644 --- a/openllm-python/src/openllm/cli/_sdk.py +++ b/openllm-python/src/openllm/cli/_sdk.py @@ -26,6 +26,7 @@ if t.TYPE_CHECKING: from openllm_core._typing_compat import LiteralBackend from openllm_core._typing_compat import LiteralContainerRegistry from openllm_core._typing_compat import LiteralContainerVersionStrategy + from openllm_core._typing_compat import LiteralQuantise from openllm_core._typing_compat import LiteralString logger = logging.getLogger(__name__) @@ -37,7 +38,7 @@ def _start(model_name: str, timeout: int = 30, workers_per_resource: t.Literal['conserved', 'round_robin'] | float | None = None, device: tuple[str, ...] 
| t.Literal['all'] | None = None, - quantize: t.Literal['int8', 'int4', 'gptq'] | None = None, + quantize: LiteralQuantise | None = None, adapter_map: dict[LiteralString, str | None] | None = None, backend: LiteralBackend | None = None, additional_args: list[str] | None = None, @@ -109,7 +110,7 @@ def _build(model_name: str, model_id: str | None = None, model_version: str | None = None, bento_version: str | None = None, - quantize: t.Literal['int8', 'int4', 'gptq'] | None = None, + quantize: LiteralQuantise | None = None, adapter_map: dict[str, str | None] | None = None, build_ctx: str | None = None, enable_features: tuple[str, ...] | None = None, @@ -120,7 +121,7 @@ def _build(model_name: str, container_version_strategy: LiteralContainerVersionStrategy | None = None, push: bool = False, containerize: bool = False, - serialisation_format: t.Literal['safetensors', 'legacy'] = 'safetensors', + serialisation: t.Literal['safetensors', 'legacy'] = 'safetensors', additional_args: list[str] | None = None, bento_store: BentoStore = Provide[BentoMLContainer.bento_store]) -> bentoml.Bento: """Package a LLM into a Bento. @@ -160,14 +161,14 @@ def _build(model_name: str, container_registry: Container registry to choose the base OpenLLM container image to build from. Default to ECR. container_registry: Container registry to choose the base OpenLLM container image to build from. Default to ECR. container_version_strategy: The container version strategy. Default to the latest release of OpenLLM. - serialisation_format: Serialisation for saving models. Default to 'safetensors', which is equivalent to `safe_serialization=True` + serialisation: Serialisation for saving models. Default to 'safetensors', which is equivalent to `safe_serialization=True` additional_args: Additional arguments to pass to ``openllm build``. bento_store: Optional BentoStore for saving this BentoLLM. Default to the default BentoML local store. Returns: ``bentoml.Bento | str``: BentoLLM instance. This can be used to serve the LLM or can be pushed to BentoCloud. """ - args: list[str] = [sys.executable, '-m', 'openllm', 'build', model_name, '--machine', '--serialisation', serialisation_format] + args: list[str] = [sys.executable, '-m', 'openllm', 'build', model_name, '--machine', '--serialisation', serialisation] if quantize: args.extend(['--quantize', quantize]) if containerize and push: raise OpenLLMException("'containerize' and 'push' are currently mutually exclusive.") if push: args.extend(['--push']) @@ -203,8 +204,8 @@ def _import_model(model_name: str, model_id: str | None = None, model_version: str | None = None, backend: LiteralBackend = 'pt', - quantize: t.Literal['int8', 'int4', 'gptq'] | None = None, - serialisation_format: t.Literal['legacy', 'safetensors'] = 'safetensors', + quantize: LiteralQuantise | None = None, + serialisation: t.Literal['legacy', 'safetensors'] = 'safetensors', additional_args: t.Sequence[str] | None = None) -> bentoml.Model: """Import a LLM into local store. @@ -228,7 +229,7 @@ def _import_model(model_name: str, - int8: Quantize the model with 8bit (bitsandbytes required) - int4: Quantize the model with 4bit (bitsandbytes required) - gptq: Quantize the model with GPTQ (auto-gptq required) - serialisation_format: Type of model format to save to local store. If set to 'safetensors', then OpenLLM will save model using safetensors. + serialisation: Type of model format to save to local store. If set to 'safetensors', then OpenLLM will save model using safetensors. 
Default behaviour is similar to ``safe_serialization=False``. additional_args: Additional arguments to pass to ``openllm import``. @@ -236,7 +237,7 @@ def _import_model(model_name: str, ``bentoml.Model``:BentoModel of the given LLM. This can be used to serve the LLM or can be pushed to BentoCloud. """ from .entrypoint import import_command - args = [model_name, '--backend', backend, '--machine', '--serialisation', serialisation_format] + args = [model_name, '--backend', backend, '--machine', '--serialisation', serialisation] if model_id is not None: args.append(model_id) if model_version is not None: args.extend(['--model-version', str(model_version)]) if additional_args is not None: args.extend(additional_args) diff --git a/openllm-python/src/openllm/cli/entrypoint.py b/openllm-python/src/openllm/cli/entrypoint.py index 93445c34..f3c60091 100644 --- a/openllm-python/src/openllm/cli/entrypoint.py +++ b/openllm-python/src/openllm/cli/entrypoint.py @@ -54,7 +54,6 @@ import openllm from bentoml._internal.configuration.containers import BentoMLContainer from bentoml._internal.models.model import ModelStore from openllm import bundle -from openllm import serialisation from openllm.exceptions import OpenLLMException from openllm.models.auto import CONFIG_MAPPING from openllm.models.auto import MODEL_FLAX_MAPPING_NAMES @@ -67,6 +66,7 @@ from openllm.utils import infer_auto_class from openllm_core._typing_compat import Concatenate from openllm_core._typing_compat import DictStrAny from openllm_core._typing_compat import LiteralBackend +from openllm_core._typing_compat import LiteralQuantise from openllm_core._typing_compat import LiteralString from openllm_core._typing_compat import ParamSpec from openllm_core._typing_compat import Self @@ -84,7 +84,6 @@ from openllm_core.utils import first_not_none from openllm_core.utils import get_debug_mode from openllm_core.utils import get_quiet_mode from openllm_core.utils import is_torch_available -from openllm_core.utils import is_transformers_supports_agent from openllm_core.utils import resolve_user_filepath from openllm_core.utils import set_debug_mode from openllm_core.utils import set_quiet_mode @@ -343,8 +342,8 @@ def import_command( output: LiteralOutput, machine: bool, backend: LiteralBackend, - quantize: t.Literal['int8', 'int4', 'gptq'] | None, - serialisation_format: t.Literal['safetensors', 'legacy'], + quantize: LiteralQuantise | None, + serialisation: t.Literal['safetensors', 'legacy'], ) -> bentoml.Model: """Setup LLM interactively. 
@@ -369,7 +368,7 @@ def import_command( \b ```bash - $ openllm download opt facebook/opt-2.7b + $ openllm import opt facebook/opt-2.7b ``` \b @@ -400,17 +399,19 @@ def import_command( env = EnvVarMixin(model_name, backend=llm_config.default_backend(), model_id=model_id, quantize=quantize) backend = first_not_none(backend, default=env['backend_value']) llm = infer_auto_class(backend).for_model( - model_name, model_id=env['model_id_value'], llm_config=llm_config, model_version=model_version, ensure_available=False, serialisation=serialisation_format + model_name, model_id=env['model_id_value'], llm_config=llm_config, model_version=model_version, ensure_available=False, + quantize=env['quantize_value'], +serialisation=serialisation ) _previously_saved = False try: - _ref = serialisation.get(llm) + _ref = openllm.serialisation.get(llm) _previously_saved = True except openllm.exceptions.OpenLLMException: if not machine and output == 'pretty': msg = f"'{model_name}' {'with model_id='+ model_id if model_id is not None else ''} does not exists in local store for backend {llm.__llm_backend__}. Saving to BENTOML_HOME{' (path=' + os.environ.get('BENTOML_HOME', BentoMLContainer.bentoml_home.get()) + ')' if get_debug_mode() else ''}..." termui.echo(msg, fg='yellow', nl=True) - _ref = serialisation.get(llm, auto_import=True) + _ref = openllm.serialisation.get(llm, auto_import=True) if backend == 'pt' and is_torch_available() and torch.cuda.is_available(): torch.cuda.empty_cache() if machine: return _ref elif output == 'pretty': @@ -472,7 +473,7 @@ def build_command( bento_version: str | None, overwrite: bool, output: LiteralOutput, - quantize: t.Literal['int8', 'int4', 'gptq'] | None, + quantize: LiteralQuantise | None, enable_features: tuple[str, ...] | None, workers_per_resource: float | None, adapter_id: tuple[str, ...], @@ -483,7 +484,7 @@ def build_command( dockerfile_template: t.TextIO | None, containerize: bool, push: bool, - serialisation_format: t.Literal['safetensors', 'legacy'], + serialisation: t.Literal['safetensors', 'legacy'], container_registry: LiteralContainerRegistry, container_version_strategy: LiteralContainerVersionStrategy, force_push: bool, @@ -517,12 +518,12 @@ def build_command( # NOTE: We set this environment variable so that our service.py logic won't raise RuntimeError # during build. 
This is a current limitation of bentoml build where we actually import the service.py into sys.path try: - os.environ.update({'OPENLLM_MODEL': inflection.underscore(model_name), 'OPENLLM_SERIALIZATION': serialisation_format, 'OPENLLM_BACKEND': env['backend_value']}) + os.environ.update({'OPENLLM_MODEL': inflection.underscore(model_name), 'OPENLLM_SERIALIZATION': serialisation, env.backend: env['backend_value']}) if env['model_id_value']: os.environ[env.model_id] = str(env['model_id_value']) if env['quantize_value']: os.environ[env.quantize] = str(env['quantize_value']) llm = infer_auto_class(env['backend_value']).for_model( - model_name, model_id=env['model_id_value'], llm_config=llm_config, ensure_available=True, model_version=model_version, serialisation=serialisation_format, **attrs + model_name, model_id=env['model_id_value'], llm_config=llm_config, ensure_available=True, model_version=model_version, quantize=env['quantize_value'], serialisation=serialisation, **attrs ) labels = dict(llm.identifying_params) @@ -798,7 +799,6 @@ def instruct_command(endpoint: str, timeout: int, agent: LiteralString, output: except http.client.BadStatusLine: raise click.ClickException(f'{endpoint} is neither a HTTP server nor reachable.') from None if agent == 'hf': - if not is_transformers_supports_agent(): raise click.UsageError("Transformers version should be at least 4.29 to support HfAgent. Upgrade with 'pip install -U transformers'") _memoized = {k: v[0] for k, v in _memoized.items() if v} client._hf_agent.set_stream(logger.info) if output != 'porcelain': termui.echo(f"Sending the following prompt ('{task}') with the following vars: {_memoized}", fg='magenta') diff --git a/openllm-python/src/openllm/models/auto/factory.py b/openllm-python/src/openllm/models/auto/factory.py index d04d7bef..d309f423 100644 --- a/openllm-python/src/openllm/models/auto/factory.py +++ b/openllm-python/src/openllm/models/auto/factory.py @@ -49,7 +49,7 @@ class BaseAutoLLMClass: ``` ''' llm = cls.infer_class_from_name(model).from_pretrained(model_id=model_id, model_version=model_version, llm_config=llm_config, **attrs) - if ensure_available: llm.ensure_model_id_exists() + if ensure_available: llm.save_pretrained() return llm @classmethod diff --git a/openllm-python/src/openllm/serialisation/__init__.py b/openllm-python/src/openllm/serialisation/__init__.py index caabd0d6..b7933680 100644 --- a/openllm-python/src/openllm/serialisation/__init__.py +++ b/openllm-python/src/openllm/serialisation/__init__.py @@ -37,6 +37,7 @@ def load_tokenizer(llm: openllm.LLM[t.Any, T], **tokenizer_attrs: t.Any) -> T: from .transformers._helpers import process_config config, *_ = process_config(llm._bentomodel.path, llm.trust_remote_code) + bentomodel_fs = fs.open_fs(llm._bentomodel.path) if bentomodel_fs.isfile(CUSTOM_OBJECTS_FILENAME): with bentomodel_fs.open(CUSTOM_OBJECTS_FILENAME, 'rb') as cofile: diff --git a/openllm-python/src/openllm/serialisation/transformers/__init__.py b/openllm-python/src/openllm/serialisation/transformers/__init__.py index 44f36576..cb712ec5 100644 --- a/openllm-python/src/openllm/serialisation/transformers/__init__.py +++ b/openllm-python/src/openllm/serialisation/transformers/__init__.py @@ -14,13 +14,14 @@ import openllm from bentoml._internal.configuration.containers import BentoMLContainer from bentoml._internal.models.model import ModelOptions +from openllm_core._typing_compat import M +from openllm_core._typing_compat import T from ._helpers import check_unintialised_params from ._helpers import 
infer_autoclass_from_llm from ._helpers import infer_tokenizers_from_llm from ._helpers import make_model_signatures from ._helpers import process_config -from ._helpers import update_model from .weights import HfIgnore if t.TYPE_CHECKING: @@ -32,8 +33,6 @@ if t.TYPE_CHECKING: from bentoml._internal.models import ModelStore from openllm_core._typing_compat import DictStrAny - from openllm_core._typing_compat import M - from openllm_core._typing_compat import T else: autogptq = openllm.utils.LazyLoader('autogptq', globals(), 'auto_gptq') torch = openllm.utils.LazyLoader('torch', globals(), 'torch') @@ -63,16 +62,23 @@ def import_model(llm: openllm.LLM[M, T], *decls: t.Any, trust_remote_code: bool, """ config, hub_attrs, attrs = process_config(llm.model_id, trust_remote_code, **attrs) _, tokenizer_attrs = llm.llm_parameters - quantize_method = llm._quantize_method - safe_serialisation = openllm.utils.first_not_none(attrs.get('safe_serialization'), default=llm._serialisation_format == 'safetensors') + quantize = llm._quantize + safe_serialisation = openllm.utils.first_not_none(attrs.get('safe_serialization'), default=llm._serialisation == 'safetensors') # Disable safe serialization with vLLM if llm.__llm_backend__ == 'vllm': safe_serialisation = False - metadata: DictStrAny = {'safe_serialisation': safe_serialisation, '_quantize': quantize_method is not None and quantize_method} + metadata: DictStrAny = {'safe_serialisation': safe_serialisation} + if quantize: metadata['_quantize'] = quantize + architectures = getattr(config, 'architectures', []) + if not architectures: raise RuntimeError('Failed to determine the architecture for this model. Make sure the `config.json` is valid and can be loaded with `transformers.AutoConfig`') + metadata['_pretrained_class'] = architectures[0] + signatures: DictStrAny = {} - if quantize_method == 'gptq': - if not openllm.utils.is_autogptq_available(): - raise openllm.exceptions.OpenLLMException("GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'") + if quantize == 'gptq': + if not openllm.utils.is_autogptq_available() or not openllm.utils.is_optimum_supports_gptq(): + raise openllm.exceptions.OpenLLMException( + "GPTQ quantisation requires 'auto-gptq' and 'optimum' (Not found in local environment). 
Install it with 'pip install \"openllm[gptq]\" --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/'" + ) if llm.config['model_type'] != 'causal_lm': raise openllm.exceptions.OpenLLMException(f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})") signatures['generate'] = {'batchable': False} @@ -82,7 +88,8 @@ def import_model(llm: openllm.LLM[M, T], *decls: t.Any, trust_remote_code: bool, if 'quantization_config' in attrs and getattr(attrs['quantization_config'], 'load_in_4bit', False): attrs.pop('quantization_config') if llm.__llm_backend__ != 'flax': attrs['use_safetensors'] = safe_serialisation - metadata['_framework'] = 'pt' if llm.__llm_backend__ == 'vllm' else llm.__llm_backend__ + metadata['_framework'] = llm.__llm_backend__ + signatures.update(make_model_signatures(llm)) tokenizer = infer_tokenizers_from_llm(llm).from_pretrained(llm.model_id, trust_remote_code=trust_remote_code, **hub_attrs, **tokenizer_attrs) if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token @@ -95,42 +102,22 @@ def import_model(llm: openllm.LLM[M, T], *decls: t.Any, trust_remote_code: bool, options=ModelOptions(), context=openllm.utils.generate_context(framework_name='openllm'), labels=openllm.utils.generate_labels(llm), - signatures=signatures if signatures else make_model_signatures(llm)) + metadata=metadata, + signatures=signatures) with openllm.utils.analytics.set_bentoml_tracking(): try: bentomodel.enter_cloudpickle_context(external_modules, imported_modules) tokenizer.save_pretrained(bentomodel.path) - if quantize_method == 'gptq': - if not openllm.utils.is_autogptq_available(): - raise openllm.exceptions.OpenLLMException("GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'") - if llm.config['model_type'] != 'causal_lm': - raise openllm.exceptions.OpenLLMException(f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})") - logger.debug('Saving model with GPTQ quantisation will require loading model into memory.') - model = autogptq.AutoGPTQForCausalLM.from_quantized(llm.model_id, - *decls, - quantize_config=t.cast('autogptq.BaseQuantizeConfig', llm.quantization_config), - trust_remote_code=trust_remote_code, - use_safetensors=safe_serialisation, - **hub_attrs, - **attrs) - update_model(bentomodel, metadata={'_pretrained_class': model.__class__.__name__, '_framework': model.model.framework}) - model.save_quantized(bentomodel.path, use_safetensors=safe_serialisation) + if llm._local: + # possible local path + logger.debug('Model will be loaded into memory to save to target store as it is from local path.') + model = infer_autoclass_from_llm(llm, config).from_pretrained(llm.model_id, *decls, config=config, trust_remote_code=trust_remote_code, **hub_attrs, **attrs) + # for trust_remote_code to work + bentomodel.enter_cloudpickle_context([importlib.import_module(model.__module__)], imported_modules) + model.save_pretrained(bentomodel.path, max_shard_size='5GB', safe_serialization=safe_serialisation) else: - architectures = getattr(config, 'architectures', []) - if not architectures: - raise RuntimeError('Failed to determine the architecture for this model. 
Make sure the `config.json` is valid and can be loaded with `transformers.AutoConfig`') - architecture = architectures[0] - update_model(bentomodel, metadata={'_pretrained_class': architecture}) - if llm._local: - # possible local path - logger.debug('Model will be loaded into memory to save to target store as it is from local path.') - model = infer_autoclass_from_llm(llm, config).from_pretrained(llm.model_id, *decls, config=config, trust_remote_code=trust_remote_code, **hub_attrs, **attrs) - # for trust_remote_code to work - bentomodel.enter_cloudpickle_context([importlib.import_module(model.__module__)], imported_modules) - model.save_pretrained(bentomodel.path, max_shard_size='5GB', safe_serialization=safe_serialisation) - else: - # we will clone the all tings into the bentomodel path without loading model into memory - snapshot_download(llm.model_id, local_dir=bentomodel.path, local_dir_use_symlinks=False, ignore_patterns=HfIgnore.ignore_patterns(llm)) + # we will clone the all tings into the bentomodel path without loading model into memory + snapshot_download(llm.model_id, local_dir=bentomodel.path, local_dir_use_symlinks=False, ignore_patterns=HfIgnore.ignore_patterns(llm)) except Exception: raise else: @@ -165,29 +152,27 @@ def get(llm: openllm.LLM[M, T], auto_import: bool = False) -> bentoml.Model: def load_model(llm: openllm.LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M: config, hub_attrs, attrs = process_config(llm.model_id, llm.trust_remote_code, **attrs) - safe_serialization = openllm.utils.first_not_none(t.cast(t.Optional[bool], llm._bentomodel.info.metadata.get('safe_serialisation', None)), - attrs.pop('safe_serialization', None), - default=llm._serialisation_format == 'safetensors') - if '_quantize' in llm._bentomodel.info.metadata and llm._bentomodel.info.metadata['_quantize'] == 'gptq': - if not openllm.utils.is_autogptq_available(): - raise openllm.exceptions.OpenLLMException("GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'") - if llm.config['model_type'] != 'causal_lm': - raise openllm.exceptions.OpenLLMException(f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})") - return autogptq.AutoGPTQForCausalLM.from_quantized(llm._bentomodel.path, - *decls, - quantize_config=t.cast('autogptq.BaseQuantizeConfig', llm.quantization_config), - trust_remote_code=llm.trust_remote_code, - use_safetensors=safe_serialization, - **hub_attrs, - **attrs) + auto_class = infer_autoclass_from_llm(llm, config) + device_map: str | None = attrs.pop('device_map', 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None) - device_map = attrs.pop('device_map', 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None) - model = infer_autoclass_from_llm(llm, config).from_pretrained(llm._bentomodel.path, - *decls, - config=config, - trust_remote_code=llm.trust_remote_code, - device_map=device_map, - **hub_attrs, - **attrs).eval() - if llm.__llm_backend__ in {'pt', 'vllm'}: check_unintialised_params(model) + if '_quantize' in llm._bentomodel.info.metadata and llm._bentomodel.info.metadata['_quantize'] == 'gptq': + if not openllm.utils.is_autogptq_available() or not openllm.utils.is_optimum_supports_gptq(): + raise openllm.exceptions.OpenLLMException( + "GPTQ quantisation requires 'auto-gptq' and 'optimum' (Not found in local environment). 
Install it with 'pip install \"openllm[gptq]\" --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/'" + ) + if llm.config['model_type'] != 'causal_lm': raise openllm.exceptions.OpenLLMException(f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})") + + model = auto_class.from_pretrained(llm._bentomodel.path, device_map='auto', **hub_attrs, **attrs) + # TODO: Use the below logic once TheBloke finished migration to new GPTQConfig from transformers + # from accelerate import init_empty_weights + # from optimum.gptq import load_quantized_model + # # disable exllama if gptq is loaded on CPU + # disable_exllama = not torch.cuda.is_available() + # with init_empty_weights(): + # empty = auto_class.from_pretrained(llm.model_id, torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32, device_map='auto') + # empty.tie_weights() + # model = load_quantized_model(empty, save_folder=llm._bentomodel.path, device_map='auto', disable_exllama=disable_exllama) + else: + model = auto_class.from_pretrained(llm._bentomodel.path, *decls, config=config, trust_remote_code=llm.trust_remote_code, device_map=device_map, **hub_attrs, **attrs).eval() + if llm.__llm_backend__ in {'pt', 'vllm'}: check_unintialised_params(model) return t.cast('M', model) diff --git a/openllm-python/src/openllm/serialisation/transformers/_helpers.py b/openllm-python/src/openllm/serialisation/transformers/_helpers.py index 29fcf8af..57774913 100644 --- a/openllm-python/src/openllm/serialisation/transformers/_helpers.py +++ b/openllm-python/src/openllm/serialisation/transformers/_helpers.py @@ -5,7 +5,6 @@ import typing as t import openllm import openllm_core -from bentoml._internal.models.model import ModelInfo from bentoml._internal.models.model import ModelSignature from openllm.serialisation.constants import FRAMEWORK_TO_AUTOCLASS_MAPPING from openllm.serialisation.constants import HUB_ATTRS @@ -16,8 +15,6 @@ if t.TYPE_CHECKING: from transformers.models.auto.auto_factory import _BaseAutoModelClass - import bentoml - from bentoml._internal.models.model import ModelSignaturesType from openllm_core._typing_compat import DictStrAny from openllm_core._typing_compat import M @@ -25,8 +22,6 @@ if t.TYPE_CHECKING: else: transformers, torch = openllm_core.utils.LazyLoader('transformers', globals(), 'transformers'), openllm_core.utils.LazyLoader('torch', globals(), 'torch') -_object_setattr = object.__setattr__ - def process_config(model_id: str, trust_remote_code: bool, **attrs: t.Any) -> tuple[transformers.PretrainedConfig, DictStrAny, DictStrAny]: '''A helper function that correctly parse config and attributes for transformers.PretrainedConfig. 
@@ -73,24 +68,6 @@ def check_unintialised_params(model: torch.nn.Module) -> None: if len(unintialized) > 0: raise RuntimeError(f'Found the following unintialized parameters in {model}: {unintialized}') -def update_model(bentomodel: bentoml.Model, metadata: DictStrAny) -> bentoml.Model: - based: DictStrAny = copy.deepcopy(bentomodel.info.metadata) - based.update(metadata) - _object_setattr( - bentomodel, - '_info', - ModelInfo( # type: ignore[call-arg] # XXX: remove me once upstream is merged - tag=bentomodel.info.tag, - module=bentomodel.info.module, - labels=bentomodel.info.labels, - options=bentomodel.info.options.to_dict(), - signatures=bentomodel.info.signatures, - context=bentomodel.info.context, - api_version=bentomodel.info.api_version, - creation_time=bentomodel.info.creation_time, - metadata=based)) - return bentomodel - # NOTE: sync with bentoml/_internal/frameworks/transformers.py#make_default_signatures def make_model_signatures(llm: openllm.LLM[M, T]) -> ModelSignaturesType: infer_fn: tuple[str, ...] = ('__call__',) diff --git a/openllm-python/src/openllm/testing.py b/openllm-python/src/openllm/testing.py index 90043c9b..b0d8cdf6 100644 --- a/openllm-python/src/openllm/testing.py +++ b/openllm-python/src/openllm/testing.py @@ -11,11 +11,12 @@ import openllm if t.TYPE_CHECKING: from openllm_core._typing_compat import LiteralBackend + from openllm_core._typing_compat import LiteralQuantise logger = logging.getLogger(__name__) @contextlib.contextmanager -def build_bento(model: str, model_id: str | None = None, quantize: t.Literal['int4', 'int8', 'gptq'] | None = None, cleanup: bool = False) -> t.Iterator[bentoml.Bento]: +def build_bento(model: str, model_id: str | None = None, quantize: LiteralQuantise | None = None, cleanup: bool = False) -> t.Iterator[bentoml.Bento]: logger.info('Building BentoML for %s', model) bento = openllm.build(model, model_id=model_id, quantize=quantize) yield bento diff --git a/openllm-python/src/openllm/utils/__init__.py b/openllm-python/src/openllm/utils/__init__.py index 75cf83e8..f8a7bf29 100644 --- a/openllm-python/src/openllm/utils/__init__.py +++ b/openllm-python/src/openllm/utils/__init__.py @@ -19,13 +19,7 @@ if t.TYPE_CHECKING: from openllm_core._typing_compat import LiteralBackend def generate_labels(llm: openllm.LLM[t.Any, t.Any]) -> dict[str, t.Any]: - return { - 'backend': llm.__llm_backend__, - 'framework': 'openllm', - 'model_name': llm.config['model_name'], - 'architecture': llm.config['architecture'], - 'serialisation_format': llm._serialisation_format - } + return {'backend': llm.__llm_backend__, 'framework': 'openllm', 'model_name': llm.config['model_name'], 'architecture': llm.config['architecture'], 'serialisation': llm._serialisation} def infer_auto_class(backend: LiteralBackend) -> type[openllm.AutoLLM | openllm.AutoTFLLM | openllm.AutoFlaxLLM | openllm.AutoVLLM]: import openllm diff --git a/openllm-python/tests/models/conftest.py b/openllm-python/tests/models/conftest.py index 3d722661..24b29eb6 100644 --- a/openllm-python/tests/models/conftest.py +++ b/openllm-python/tests/models/conftest.py @@ -24,6 +24,7 @@ import openllm from openllm._llm import normalise_model_name from openllm_core._typing_compat import DictStrAny from openllm_core._typing_compat import ListAny +from openllm_core._typing_compat import LiteralQuantise logger = logging.getLogger(__name__) @@ -141,14 +142,7 @@ class DockerHandle(_Handle): return container.status in ['running', 'created'] @contextlib.contextmanager -def _local_handle(model: str, - model_id: 
str, - image_tag: str, - deployment_mode: t.Literal['container', 'local'], - quantize: t.Literal['int8', 'int4', 'gptq'] | None = None, - *, - _serve_grpc: bool = False, - ): +def _local_handle(model: str, model_id: str, image_tag: str, deployment_mode: t.Literal['container', 'local'], quantize: LiteralQuantise | None = None, *, _serve_grpc: bool = False): with openllm.utils.reserve_free_port() as port: pass @@ -169,14 +163,7 @@ def _local_handle(model: str, proc.stderr.close() @contextlib.contextmanager -def _container_handle(model: str, - model_id: str, - image_tag: str, - deployment_mode: t.Literal['container', 'local'], - quantize: t.Literal['int8', 'int4', 'gptq'] | None = None, - *, - _serve_grpc: bool = False, - ): +def _container_handle(model: str, model_id: str, image_tag: str, deployment_mode: t.Literal['container', 'local'], quantize: LiteralQuantise | None = None, *, _serve_grpc: bool = False): envvar = openllm.utils.EnvVarMixin(model) with openllm.utils.reserve_free_port() as port, openllm.utils.reserve_free_port() as prom_port: diff --git a/tools/dependencies.py b/tools/dependencies.py index 9c50a26a..116c6386 100755 --- a/tools/dependencies.py +++ b/tools/dependencies.py @@ -110,10 +110,11 @@ _TRANSFORMERS_EXT = ['torch', 'tokenizers', 'accelerate'] _BASE_DEPENDENCIES = [ Dependencies(name='bentoml', extensions=_BENTOML_EXT, lower_constraint=lower_bentoml_constraint), - Dependencies(name='transformers', extensions=_TRANSFORMERS_EXT, lower_constraint='4.29.0'), + Dependencies(name='transformers', extensions=_TRANSFORMERS_EXT, lower_constraint='4.32.1'), Dependencies(name='openllm-client'), Dependencies(name='safetensors'), - Dependencies(name='optimum'), + Dependencies(name='optimum', lower_constraint="1.12.0"), + Dependencies(name='accelerate'), Dependencies(name='ghapi'), Dependencies(name='tabulate', extensions=['widechars'], lower_constraint='0.9.0'), Dependencies(name='click', lower_constraint='8.1.3'), @@ -122,7 +123,7 @@ _BASE_DEPENDENCIES = [ ] _ALL_RUNTIME_DEPS = ['flax>=0.7', 'jax', 'jaxlib', 'tensorflow', 'keras'] -FINE_TUNE_DEPS = ['peft>=0.4.0', 'bitsandbytes', 'datasets', 'accelerate', 'trl'] +FINE_TUNE_DEPS = ['peft>=0.5.0', 'bitsandbytes', 'datasets', 'accelerate', 'trl'] FLAN_T5_DEPS = _ALL_RUNTIME_DEPS OPT_DEPS = _ALL_RUNTIME_DEPS GRPC_DEPS = ['openllm-client[grpc]'] @@ -130,7 +131,7 @@ OPENAI_DEPS = ['openai', 'tiktoken'] AGENTS_DEPS = ['transformers[agents]>=4.30', 'diffusers', 'soundfile'] PLAYGROUND_DEPS = ['jupyter', 'notebook', 'ipython', 'jupytext', 'nbformat'] GGML_DEPS = ['ctransformers'] -GPTQ_DEPS = ['auto-gptq[triton]'] +GPTQ_DEPS = ['auto-gptq[triton]>=0.4.2', 'optimum>=1.12.0'] VLLM_DEPS = ['vllm>=0.1.4', 'ray'] _base_requirements: dict[str, t.Any] = {