From 8530a067ea1d189c8a860cf65b1330fed7931954 Mon Sep 17 00:00:00 2001
From: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com>
Date: Thu, 7 Sep 2023 16:50:25 +0000
Subject: [PATCH] chore(serialisation): dump quantization_config.json to
 conform with optimum load

Signed-off-by: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com>
---
 .../src/openllm/serialisation/transformers/__init__.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/openllm-python/src/openllm/serialisation/transformers/__init__.py b/openllm-python/src/openllm/serialisation/transformers/__init__.py
index cb712ec5..9f0c1b07 100644
--- a/openllm-python/src/openllm/serialisation/transformers/__init__.py
+++ b/openllm-python/src/openllm/serialisation/transformers/__init__.py
@@ -4,6 +4,8 @@
 import importlib
 import logging
 import typing as t
+import orjson
+
 from huggingface_hub import snapshot_download
 from packaging.version import Version
 from simple_di import Provide
@@ -108,6 +110,10 @@ def import_model(llm: openllm.LLM[M, T], *decls: t.Any, trust_remote_code: bool,
   try:
     bentomodel.enter_cloudpickle_context(external_modules, imported_modules)
     tokenizer.save_pretrained(bentomodel.path)
+    if quantize == 'gptq':
+      from optimum.gptq.constants import GPTQ_CONFIG
+      with open(bentomodel.path_of(GPTQ_CONFIG), 'w', encoding='utf-8') as f:
+        f.write(orjson.dumps(config.quantization_config, option=orjson.OPT_INDENT_2 | orjson.OPT_SORT_KEYS).decode())
     if llm._local:
       # possible local path
       logger.debug('Model will be loaded into memory to save to target store as it is from local path.')
@@ -163,7 +169,9 @@ def load_model(llm: openllm.LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M:
     if llm.config['model_type'] != 'causal_lm':
       raise openllm.exceptions.OpenLLMException(f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})")
     model = auto_class.from_pretrained(llm._bentomodel.path, device_map='auto', **hub_attrs, **attrs)
-    # TODO: Use the below logic once TheBloke finished migration to new GPTQConfig from transformers
+    # XXX: Use the below logic once TheBloke finished migration to new GPTQConfig from transformers
+    # Seems like the logic below requires to add support for safetensors on accelerate
+    #
     # from accelerate import init_empty_weights
     # from optimum.gptq import load_quantized_model
     # # disable exllama if gptq is loaded on CPU
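
For context on the XXX note: optimum's documented GPTQ loading flow reads the dumped quantization_config.json (the GPTQ_CONFIG filename constant) from the save folder, which is why import_model now writes it alongside the tokenizer files. Below is a rough sketch of that flow, not the eventual openllm implementation; the save_folder path is illustrative, and the exllama toggle follows the commented-out block, so exact load_quantized_model keywords may differ across optimum versions.

# Rough sketch only: mirrors the optimum loading flow the XXX comment refers to.
# save_folder is hypothetical here; in openllm it would correspond to llm._bentomodel.path.
import torch
import transformers
from accelerate import init_empty_weights
from optimum.gptq import load_quantized_model

save_folder = '/path/to/bentomodel'  # hypothetical folder containing quantization_config.json
config = transformers.AutoConfig.from_pretrained(save_folder)

# Disable exllama kernels when no GPU is available, as in the commented-out block.
disable_exllama = not torch.cuda.is_available()

with init_empty_weights():
  empty_model = transformers.AutoModelForCausalLM.from_config(config)
empty_model.tie_weights()

# load_quantized_model picks up the GPTQ settings from quantization_config.json in save_folder.
model = load_quantized_model(empty_model, save_folder=save_folder, device_map='auto', disable_exllama=disable_exllama)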