mirror of
https://github.com/bentoml/OpenLLM.git
synced 2026-05-19 05:57:39 -04:00
fix(gptq): use upstream integration (#297)
* wip Signed-off-by: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com> * feat: GPTQ transformers integration Signed-off-by: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com> * fix: only load if variable is available and add changelog Signed-off-by: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com> * chore: remove boilerplate check Signed-off-by: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com> --------- Signed-off-by: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com>
This commit is contained in:
@@ -24,6 +24,7 @@ import openllm
|
||||
from openllm._llm import normalise_model_name
|
||||
from openllm_core._typing_compat import DictStrAny
|
||||
from openllm_core._typing_compat import ListAny
|
||||
from openllm_core._typing_compat import LiteralQuantise
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -141,14 +142,7 @@ class DockerHandle(_Handle):
|
||||
return container.status in ['running', 'created']
|
||||
|
||||
@contextlib.contextmanager
|
||||
def _local_handle(model: str,
|
||||
model_id: str,
|
||||
image_tag: str,
|
||||
deployment_mode: t.Literal['container', 'local'],
|
||||
quantize: t.Literal['int8', 'int4', 'gptq'] | None = None,
|
||||
*,
|
||||
_serve_grpc: bool = False,
|
||||
):
|
||||
def _local_handle(model: str, model_id: str, image_tag: str, deployment_mode: t.Literal['container', 'local'], quantize: LiteralQuantise | None = None, *, _serve_grpc: bool = False):
|
||||
with openllm.utils.reserve_free_port() as port:
|
||||
pass
|
||||
|
||||
@@ -169,14 +163,7 @@ def _local_handle(model: str,
|
||||
proc.stderr.close()
|
||||
|
||||
@contextlib.contextmanager
|
||||
def _container_handle(model: str,
|
||||
model_id: str,
|
||||
image_tag: str,
|
||||
deployment_mode: t.Literal['container', 'local'],
|
||||
quantize: t.Literal['int8', 'int4', 'gptq'] | None = None,
|
||||
*,
|
||||
_serve_grpc: bool = False,
|
||||
):
|
||||
def _container_handle(model: str, model_id: str, image_tag: str, deployment_mode: t.Literal['container', 'local'], quantize: LiteralQuantise | None = None, *, _serve_grpc: bool = False):
|
||||
envvar = openllm.utils.EnvVarMixin(model)
|
||||
|
||||
with openllm.utils.reserve_free_port() as port, openllm.utils.reserve_free_port() as prom_port:
|
||||
|
||||
Reference in New Issue
Block a user