fix(gptq): use upstream integration (#297)

* wip

Signed-off-by: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com>

* feat: GPTQ transformers integration

Signed-off-by: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com>

* fix: only load if variable is available and add changelog

Signed-off-by: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com>

* chore: remove boilerplate check

Signed-off-by: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com>

---------

Signed-off-by: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com>
2026-08-01 10:38:50 -04:00 · 2023-09-04 14:05:50 -04:00
parent 3da869e728
commit 956b3a53bc
23 changed files with 197 additions and 248 deletions
--- a/openllm-python/tests/models/conftest.py
+++ b/openllm-python/tests/models/conftest.py
@@ -24,6 +24,7 @@ import openllm
 from openllm._llm import normalise_model_name
 from openllm_core._typing_compat import DictStrAny
 from openllm_core._typing_compat import ListAny
+from openllm_core._typing_compat import LiteralQuantise

 logger = logging.getLogger(__name__)

@@ -141,14 +142,7 @@ class DockerHandle(_Handle):
    return container.status in ['running', 'created']

@contextlib.contextmanager
-def _local_handle(model: str,
-                  model_id: str,
-                  image_tag: str,
-                  deployment_mode: t.Literal['container', 'local'],
-                  quantize: t.Literal['int8', 'int4', 'gptq'] | None = None,
-                  *,
-                  _serve_grpc: bool = False,
-                  ):
+def _local_handle(model: str, model_id: str, image_tag: str, deployment_mode: t.Literal['container', 'local'], quantize: LiteralQuantise | None = None, *, _serve_grpc: bool = False):
  with openllm.utils.reserve_free_port() as port:
    pass

@@ -169,14 +163,7 @@ def _local_handle(model: str,
    proc.stderr.close()

@contextlib.contextmanager
-def _container_handle(model: str,
-                      model_id: str,
-                      image_tag: str,
-                      deployment_mode: t.Literal['container', 'local'],
-                      quantize: t.Literal['int8', 'int4', 'gptq'] | None = None,
-                      *,
-                      _serve_grpc: bool = False,
-                      ):
+def _container_handle(model: str, model_id: str, image_tag: str, deployment_mode: t.Literal['container', 'local'], quantize: LiteralQuantise | None = None, *, _serve_grpc: bool = False):
  envvar = openllm.utils.EnvVarMixin(model)

  with openllm.utils.reserve_free_port() as port, openllm.utils.reserve_free_port() as prom_port: