feat(client): support authentication token and shim implementation (#605)
* chore: synch generate_iterator to be the same as server
* --wip--
* wip
* feat: cleanup shim implementation
* ci: auto fixes from pre-commit.ci (see https://pre-commit.ci)
* chore: fix pre-commit
* chore: update changelog
* chore: update check with tuple

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
@@ -66,7 +66,7 @@ else:
 _import_structure: dict[str, list[str]] = {
   'exceptions': [],
-  'client': [],
+  'client': ['HTTPClient', 'AsyncHTTPClient'],
   'bundle': [],
   'playground': [],
   'testing': [],
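Together with the `TYPE_CHECKING` hunk below, this makes both clients importable from the package root. A quick sketch of what that enables (the address is illustrative and assumes a server is already listening):

```python
from openllm import HTTPClient, AsyncHTTPClient

client = HTTPClient('http://localhost:3000')  # illustrative local server address
```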
@@ -98,6 +98,8 @@ if _t.TYPE_CHECKING:
   from . import serialisation as serialisation
   from . import testing as testing
   from . import utils as utils
+  from .client import HTTPClient as HTTPClient
+  from .client import AsyncHTTPClient as AsyncHTTPClient
   from ._deprecated import Runner as Runner
   from ._generation import LogitsProcessorList as LogitsProcessorList
   from ._generation import StopOnTokens as StopOnTokens
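For context, these two hunks have to move in lockstep because of the lazy-import pattern: `_import_structure` drives runtime resolution, while the `TYPE_CHECKING` imports exist only so type checkers see the same names. A minimal self-contained sketch of such a loader (illustrative names, not OpenLLM's actual implementation):

```python
# Minimal sketch of a PEP 562 lazy loader over an _import_structure map,
# as it would sit inside a package's __init__.py.
import importlib
import typing as t

_import_structure: dict[str, list[str]] = {'client': ['HTTPClient', 'AsyncHTTPClient']}
# Reverse map: exported attribute -> owning submodule.
_attr_to_module = {attr: mod for mod, attrs in _import_structure.items() for attr in attrs}

def __getattr__(name: str) -> t.Any:
  # Import the submodule only when one of its attributes is first accessed.
  if name in _attr_to_module:
    submodule = importlib.import_module(f'.{_attr_to_module[name]}', __package__)
    return getattr(submodule, name)
  raise AttributeError(f'module {__name__!r} has no attribute {name!r}')
```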
@@ -7,9 +7,6 @@ To start any OpenLLM model:
 openllm start <model_name> --options ...
 """
-
 from __future__ import annotations
-
-
 if __name__ == '__main__':
   from openllm.cli.entrypoint import cli
@@ -71,7 +71,7 @@ def is_sentence_complete(output: str) -> bool:
 
 def is_partial_stop(output: str, stop_str: str) -> bool:
   """Check whether the output contains a partial stop str."""
-  for i in range(0, min(len(output), len(stop_str))):
+  for i in range(min(len(output), len(stop_str))):
     if stop_str.startswith(output[-i:]):
       return True
   return False
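The loop change is purely cosmetic: `range(0, n)` and `range(n)` are identical. Functionally, this helper is what lets a streaming generator hold back text that might still grow into a stop string. A standalone copy with a couple of checks (`</s>` is just an example stop string):

```python
def is_partial_stop(output: str, stop_str: str) -> bool:
  """Return True if the output ends with some prefix of stop_str."""
  for i in range(min(len(output), len(stop_str))):
    if stop_str.startswith(output[-i:]):
      return True
  return False

assert is_partial_stop('Hello </', '</s>')         # '</' could grow into '</s>'
assert not is_partial_stop('Hello world', '</s>')  # no suffix matches any prefix
```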
@@ -235,7 +235,7 @@ def _validate(cls: type[DynResource], val: list[t.Any]) -> None:
     raise RuntimeError('Failed to initialise CUDA runtime binding.')
   # correctly parse handle
   for el in val:
-    if el.startswith('GPU-') or el.startswith('MIG-'):
+    if el.startswith(('GPU-', 'MIG-')):
       uuids = _raw_device_uuid_nvml()
       if uuids is None:
         raise ValueError('Failed to parse available GPUs UUID')
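`str.startswith` accepts a tuple of prefixes, so the rewrite collapses the two chained checks into one call with identical semantics (the device handles below are illustrative):

```python
# The chained form and the tuple form always agree.
for handle in ('GPU-5ebe9f43', 'MIG-8c1d2a77', '0'):
  assert handle.startswith(('GPU-', 'MIG-')) == (handle.startswith('GPU-') or handle.startswith('MIG-'))
```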
@@ -580,6 +580,6 @@ def append_schemas(
   def mk_generate_spec(svc:bentoml.Service,openapi_version:str=OPENAPI_VERSION)->MKSchema:return MKSchema(svc_schema)
   def mk_asdict(self:OpenAPISpecification)->dict[str,t.Any]:return svc_schema
   openapi.generate_spec=mk_generate_spec
-  setattr(OpenAPISpecification, 'asdict', mk_asdict)
+  OpenAPISpecification.asdict = mk_asdict
   # yapf: disable
   return svc
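Same idea here: assigning through a literal attribute name is equivalent to `setattr` and reads better. A standalone sketch of the monkey-patch (the class and payload are stand-ins, not bentoml's real objects):

```python
class OpenAPISpec:  # stand-in for OpenAPISpecification
  pass

def mk_asdict(self) -> dict:
  return {'openapi': '3.0.2'}  # illustrative payload

OpenAPISpec.asdict = mk_asdict  # identical to setattr(OpenAPISpec, 'asdict', mk_asdict)
assert OpenAPISpec().asdict() == {'openapi': '3.0.2'}  # bound like a normal method
```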
@@ -1,30 +1,13 @@
 """Serialisation related implementation for GGML-based implementation.
 
 This requires ctransformers to be installed.
 """
 from __future__ import annotations
-import typing as t
-
-if t.TYPE_CHECKING:
-  import bentoml
-  import openllm
-
-  from openllm_core._typing_compat import M
-
 _conversion_strategy = {'pt': 'ggml'}
 
-
-def import_model(
-  llm: openllm.LLM[t.Any, t.Any], *decls: t.Any, trust_remote_code: bool = True, **attrs: t.Any
-) -> bentoml.Model:
+def import_model(llm, *decls, trust_remote_code=True, **attrs):
   raise NotImplementedError('Currently work in progress.')
 
-
-def get(llm: openllm.LLM[t.Any, t.Any]) -> bentoml.Model:
+def get(llm):
   raise NotImplementedError('Currently work in progress.')
 
-
-def load_model(llm: openllm.LLM[M, t.Any], *decls: t.Any, **attrs: t.Any) -> M:
+def load_model(llm, *decls, **attrs):
   raise NotImplementedError('Currently work in progress.')
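The stubs drop their runtime annotations but keep the full public surface. One reason to keep the trio intact even while unfinished: callers can dispatch to a serialisation backend by name and treat `NotImplementedError` as "backend not ready". A hypothetical dispatcher sketch (not OpenLLM's actual code; assumes openllm is installed):

```python
import importlib

def load_with_backend(llm, backend: str):
  # e.g. backend='ggml' resolves to openllm.serialisation.ggml
  mod = importlib.import_module(f'openllm.serialisation.{backend}')
  return mod.load_model(llm)  # the GGML backend currently raises NotImplementedError
```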
@@ -1,5 +1,3 @@
 """Serialisation related implementation for Transformers-based implementation."""
-
 from __future__ import annotations
 import importlib
-import logging
@@ -6,6 +6,7 @@ we won't ensure backward compatibility for these functions. So use with caution.
 
 from __future__ import annotations
 import functools
+import importlib.metadata
 import typing as t
 
 import openllm_core
@@ -72,6 +73,7 @@ def generate_labels(llm: openllm.LLM[t.Any, t.Any]) -> dict[str, t.Any]:
     'model_name': llm.config['model_name'],
     'architecture': llm.config['architecture'],
     'serialisation': llm._serialisation,
+    **{package: importlib.metadata.version(package) for package in {'openllm', 'openllm-core', 'openllm-client'}},
   }
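The new dict-splat (which is why `importlib.metadata` was imported in the previous hunk) stamps the installed distribution versions into every model's labels. What it evaluates to in isolation, assuming the three packages are installed:

```python
import importlib.metadata

versions = {pkg: importlib.metadata.version(pkg) for pkg in {'openllm', 'openllm-core', 'openllm-client'}}
print(versions)  # e.g. {'openllm': '<version>', 'openllm-core': '<version>', 'openllm-client': '<version>'}
```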