Mirror of https://github.com/bentoml/OpenLLM.git, synced 2026-01-20 13:29:35 -05:00
infra: update docs on serving fine-tuning layers (#567)
Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
@@ -396,6 +396,7 @@ def _RunnerFactory(self: openllm.LLM[M, T],
     yield 'llm_type', self.llm_type
     yield 'backend', backend
     yield 'llm_tag', self.tag
+  def _get_adapter_map(_: LLMRunner[M, T]) -> ResolvedAdapterMap: return converter.unstructure(self.adapter_map)
   # yapf: enable

   return types.new_class(self.__class__.__name__ + 'Runner', (bentoml.Runner,),
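
The added `_get_adapter_map` line exposes the resolved adapter map as a runner method by unstructuring it into plain dicts. As a rough illustration of what a cattrs-style `converter.unstructure` does here (the `AdapterEntry` class and its values are hypothetical stand-ins, not OpenLLM types):

import attr
import cattrs

converter = cattrs.Converter()

@attr.define
class AdapterEntry:  # hypothetical stand-in for a resolved adapter record
  adapter_name: str
  adapter_type: str

adapter_map = {'aarnphm/opt-6-7b-quotes': AdapterEntry('default', 'LORA')}
# attrs instances become plain dicts, ready for JSON serialization
print(converter.unstructure(adapter_map))
# -> {'aarnphm/opt-6-7b-quotes': {'adapter_name': 'default', 'adapter_type': 'LORA'}}
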
@@ -40,7 +40,7 @@ tags:
 - OpenAI
 x-bentoml-name: list_models
 responses:
-  '200':
+  200:
     description: The Model object
     content:
       application/json:
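
The `'200':` to `200:` change repeated throughout these hunks is more than cosmetic: the schema docstrings are parsed as YAML, where an unquoted `200` becomes an integer key while `'200'` stays a string. A quick check with PyYAML (assuming a PyYAML-style loader, which is what Starlette's schema tooling uses on docstrings) shows the difference:

import yaml

# quoted key stays a string
print(yaml.safe_load("responses:\n  '200':\n    description: OK"))
# -> {'responses': {'200': {'description': 'OK'}}}

# unquoted key parses as an integer
print(yaml.safe_load("responses:\n  200:\n    description: OK"))
# -> {'responses': {200: {'description': 'OK'}}}
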
@@ -105,7 +105,7 @@ requestBody:
     schema:
       $ref: '#/components/schemas/ChatCompletionRequest'
 responses:
-  '200':
+  200:
     description: OK
     content:
       application/json:
@@ -120,7 +120,7 @@ responses:
         summary: One-shot output example
         value: >
           {"id": "chatcmpl-123", "object": "chat.completion", "created": 1677652288, "model": "gpt-3.5-turbo-0613", "choices": [{"index": 0, "message": {"role": "assistant", "content": "Hello there, how may I assist you today?"}, "finish_reason": "stop"}], "usage": {"prompt_tokens": 9, "completion_tokens": 12, "total_tokens": 21}}
-  '404':
+  404:
     content:
       application/json:
         schema:
@@ -139,7 +139,7 @@ responses:
             }
           }
     description: NotFound
-  '500':
+  500:
     content:
       application/json:
         schema:
@@ -158,7 +158,7 @@ responses:
             }
           }
     description: Internal Server Error
-  '400':
+  400:
     content:
       application/json:
         schema:
@@ -238,7 +238,7 @@ requestBody:
         - "\\n"
         - "<|endoftext|>"
 responses:
-  '200':
+  200:
     description: OK
     content:
       application/json:
@@ -273,7 +273,7 @@ responses:
             logprobs: null
             finish_reason: null
           model: gpt-3.5-turbo-instruct
-  '404':
+  404:
     content:
       application/json:
         schema:
@@ -292,7 +292,7 @@ responses:
             }
           }
     description: NotFound
-  '500':
+  500:
     content:
       application/json:
         schema:
@@ -311,7 +311,7 @@ responses:
             }
           }
     description: Internal Server Error
-  '400':
+  400:
     content:
       application/json:
         schema:
@@ -379,13 +379,44 @@ responses:
     content:
       application/json:
         schema:
-          $ref: '#/components/schemas/AgentErrorResponse'
+          $ref: '#/components/schemas/HFErrorResponse'
     description: Bad Request
   500:
     content:
       application/json:
         schema:
-          $ref: '#/components/schemas/AgentErrorResponse'
+          $ref: '#/components/schemas/HFErrorResponse'
     description: Not Found
 '''
+HF_ADAPTERS_SCHEMA = '''\
+---
+consumes:
+- application/json
+description: Return current list of adapters for given LLM.
+operationId: hf__adapters_map
+produces:
+- application/json
+summary: Describes a model offering that can be used with the API.
+tags:
+- HF
+x-bentoml-name: adapters_map
+responses:
+  200:
+    description: Return list of LoRA adapters.
+    content:
+      application/json:
+        example:
+          aarnphm/opt-6-7b-quotes:
+            adapter_name: default
+            adapter_type: LORA
+          aarnphm/opt-6-7b-dolly:
+            adapter_name: dolly
+            adapter_type: LORA
+  500:
+    content:
+      application/json:
+        schema:
+          $ref: '#/components/schemas/HFErrorResponse'
+    description: Not Found
+'''
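
With the new `HF_ADAPTERS_SCHEMA` registered, a served model exposes its adapter listing under the `/hf` mount. A minimal client-side sketch, assuming an OpenLLM server running locally on the default port 3000:

import requests

resp = requests.get('http://localhost:3000/hf/adapters')
print(resp.status_code)
print(resp.json())
# expected shape, per the schema example above:
# {'aarnphm/opt-6-7b-quotes': {'adapter_name': 'default', 'adapter_type': 'LORA'}, ...}
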
@@ -3,6 +3,7 @@ import functools
 import logging
 import typing as t

+from enum import Enum
 from http import HTTPStatus

 import orjson
@@ -13,24 +14,27 @@ from starlette.routing import Route

 from openllm_core.utils import converter

+from ._openapi import HF_ADAPTERS_SCHEMA
 from ._openapi import HF_AGENT_SCHEMA
 from ._openapi import add_schema_definitions
 from ._openapi import append_schemas
 from ._openapi import get_generator
-from ..protocol.hf import AgentErrorResponse
 from ..protocol.hf import AgentRequest
 from ..protocol.hf import AgentResponse
+from ..protocol.hf import HFErrorResponse

 schemas = get_generator('hf',
-                        components=[AgentRequest, AgentResponse, AgentErrorResponse],
+                        components=[AgentRequest, AgentResponse, HFErrorResponse],
                         tags=[{
                             'name': 'HF',
-                            'description': 'Includes HF Agent support',
+                            'description': 'HF integration, including Agent and others schema endpoints.',
                             'externalDocs': 'https://huggingface.co/docs/transformers/main_classes/agent'
                         }])
 logger = logging.getLogger(__name__)

 if t.TYPE_CHECKING:

+  from peft.config import PeftConfig
   from starlette.requests import Request
   from starlette.responses import Response

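
The rename matters because each `$ref` string in the YAML schemas must match a component name registered with the generator; once the class is renamed, a leftover `#/components/schemas/AgentErrorResponse` reference would point at nothing. A toy consistency check illustrating the invariant this hunk restores:

# names registered via components=[...] above
components = {'AgentRequest', 'AgentResponse', 'HFErrorResponse'}
ref = '#/components/schemas/HFErrorResponse'
name = ref.rsplit('/', 1)[-1]
assert name in components, f'dangling $ref: {ref}'
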
@@ -41,17 +45,19 @@ if t.TYPE_CHECKING:
   from openllm_core._typing_compat import T

 def mount_to_svc(svc: bentoml.Service, llm: openllm.LLM[M, T]) -> bentoml.Service:
-  app = Starlette(
-      debug=True,
-      routes=[Route('/agent', endpoint=functools.partial(hf_agent, llm=llm), name='hf_agent', methods=['POST']),
-              Route('/schema', endpoint=openapi_schema, include_in_schema=False)])
+  app = Starlette(debug=True,
+                  routes=[
+                      Route('/agent', endpoint=functools.partial(hf_agent, llm=llm), name='hf_agent', methods=['POST']),
+                      Route('/adapters', endpoint=functools.partial(adapters_map, llm=llm), name='adapters', methods=['GET']),
+                      Route('/schema', endpoint=openapi_schema, include_in_schema=False)
+                  ])
   mount_path = '/hf'
   generated_schema = schemas.get_schema(routes=app.routes, mount_path=mount_path)
   svc.mount_asgi_app(app, path=mount_path)
   return append_schemas(svc, generated_schema, tags_order='append')

 def error_response(status_code: HTTPStatus, message: str) -> JSONResponse:
-  return JSONResponse(converter.unstructure(AgentErrorResponse(message=message, error_code=status_code.value)), status_code=status_code.value)
+  return JSONResponse(converter.unstructure(HFErrorResponse(message=message, error_code=status_code.value)), status_code=status_code.value)

 @add_schema_definitions(HF_AGENT_SCHEMA)
 async def hf_agent(req: Request, llm: openllm.LLM[M, T]) -> Response:
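
For context, `mount_asgi_app` hangs an arbitrary ASGI app off a BentoML service, which is how these HF routes end up reachable under `/hf/...`. A minimal self-contained sketch of the same mechanism (service name and endpoint are illustrative, following the BentoML 1.x `Service` API):

import bentoml
from starlette.applications import Starlette
from starlette.responses import JSONResponse
from starlette.routing import Route

async def ping(request):
  # trivial stand-in endpoint; the real app wires hf_agent and adapters_map
  return JSONResponse({'ok': True})

svc = bentoml.Service('demo')
svc.mount_asgi_app(Starlette(routes=[Route('/ping', ping)]), path='/hf')
# the endpoint is then served at /hf/ping alongside the service's own APIs
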
@@ -71,5 +77,17 @@ async def hf_agent(req: Request, llm: openllm.LLM[M, T]) -> Response:
     logger.error('Error while generating: %s', err)
     return error_response(HTTPStatus.INTERNAL_SERVER_ERROR, 'Error while generating (Check server log).')

+@add_schema_definitions(HF_ADAPTERS_SCHEMA)
+def adapters_map(req: Request, llm: openllm.LLM[M, T]) -> Response:
+  if not llm.has_adapters: return error_response(HTTPStatus.NOT_FOUND, 'No adapters found.')
+  return JSONResponse(
+      {
+          adapter_tuple[1]: {
+              'adapter_name': k,
+              'adapter_type': t.cast(Enum, adapter_tuple[0].peft_type).value
+          } for k, adapter_tuple in t.cast(t.Dict[str, t.Tuple['PeftConfig', str]], dict(*llm.adapter_map.values())).items()
+      },
+      status_code=HTTPStatus.OK.value)
+
 def openapi_schema(req: Request) -> Response:
   return schemas.OpenAPIResponse(req)
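
The comprehension in `adapters_map` inverts the resolved mapping of adapter name -> (PeftConfig, model id) into model id -> {adapter_name, adapter_type}, matching the schema example above. A pure-Python sketch of the same transformation, with hypothetical stand-ins for the peft types:

from enum import Enum

class PeftType(Enum):  # hypothetical stand-in for peft's PeftType enum
  LORA = 'LORA'

class StubPeftConfig:  # hypothetical stand-in for peft.config.PeftConfig
  peft_type = PeftType.LORA

resolved = {
    'default': (StubPeftConfig(), 'aarnphm/opt-6-7b-quotes'),
    'dolly': (StubPeftConfig(), 'aarnphm/opt-6-7b-dolly'),
}
payload = {model_id: {'adapter_name': name, 'adapter_type': cfg.peft_type.value}
           for name, (cfg, model_id) in resolved.items()}
print(payload)
# {'aarnphm/opt-6-7b-quotes': {'adapter_name': 'default', 'adapter_type': 'LORA'},
#  'aarnphm/opt-6-7b-dolly': {'adapter_name': 'dolly', 'adapter_type': 'LORA'}}
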
@@ -13,6 +13,6 @@ class AgentResponse:
   generated_text: str

 @attr.define
-class AgentErrorResponse:
+class HFErrorResponse:
   error_code: int
   message: str
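
Finally, the protocol-level rename: the same attrs class, now named for the whole HF surface rather than just the Agent endpoint. A quick round-trip of the payload shape, using a default cattrs converter as a stand-in for OpenLLM's configured one:

import attr
import cattrs

@attr.define
class HFErrorResponse:
  error_code: int
  message: str

body = cattrs.Converter().unstructure(HFErrorResponse(error_code=404, message='No adapters found.'))
print(body)  # {'error_code': 404, 'message': 'No adapters found.'}
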