fix(gptq): use upstream integration (#297)

* wip

Signed-off-by: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com>

* feat: GPTQ transformers integration

Signed-off-by: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com>

* fix: only load if the variable is available, and add a changelog entry

Signed-off-by: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com>

* chore: remove boilerplate check

Signed-off-by: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com>

---------

Signed-off-by: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com>
This commit is contained in:
Authored by Aaron Pham on 2023-09-04 14:05:50 -04:00; committed by GitHub
parent 3da869e728
commit 956b3a53bc
23 changed files with 197 additions and 248 deletions

View File

@@ -24,6 +24,7 @@ import openllm
from openllm._llm import normalise_model_name
from openllm_core._typing_compat import DictStrAny
from openllm_core._typing_compat import ListAny
from openllm_core._typing_compat import LiteralQuantise
logger = logging.getLogger(__name__)
@@ -141,14 +142,7 @@ class DockerHandle(_Handle):
return container.status in ['running', 'created']
@contextlib.contextmanager
def _local_handle(model: str,
model_id: str,
image_tag: str,
deployment_mode: t.Literal['container', 'local'],
quantize: t.Literal['int8', 'int4', 'gptq'] | None = None,
*,
_serve_grpc: bool = False,
):
def _local_handle(model: str, model_id: str, image_tag: str, deployment_mode: t.Literal['container', 'local'], quantize: LiteralQuantise | None = None, *, _serve_grpc: bool = False):
with openllm.utils.reserve_free_port() as port:
pass
@@ -169,14 +163,7 @@ def _local_handle(model: str,
proc.stderr.close()
@contextlib.contextmanager
def _container_handle(model: str,
model_id: str,
image_tag: str,
deployment_mode: t.Literal['container', 'local'],
quantize: t.Literal['int8', 'int4', 'gptq'] | None = None,
*,
_serve_grpc: bool = False,
):
def _container_handle(model: str, model_id: str, image_tag: str, deployment_mode: t.Literal['container', 'local'], quantize: LiteralQuantise | None = None, *, _serve_grpc: bool = False):
envvar = openllm.utils.EnvVarMixin(model)
with openllm.utils.reserve_free_port() as port, openllm.utils.reserve_free_port() as prom_port: