infra: update docs on serving fine-tuning layers (#567)

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
Aaron Pham
2023-11-06 21:34:44 -05:00
committed by GitHub
parent b158609e95
commit 8fade070f3
5 changed files with 95 additions and 31 deletions

View File

@@ -791,36 +791,50 @@ openllm start falcon --model-id TheBloke/falcon-40b-instruct-GPTQ --quantize gpt
> first to install the dependency. From the GPTQ paper, it is recommended to quantize the weights before serving.
> See [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) for more information on GPTQ quantization.
## 🛠️ Fine-tuning support (Experimental)
## 🛠️ Serving fine-tuning layers
[PEFT](https://huggingface.co/docs/peft/index), or Parameter-Efficient Fine-Tuning, is a methodology designed to fine-tune pre-trained models more efficiently. Instead of adjusting all model parameters, PEFT focuses on tuning only a subset, reducing computational and storage costs. [LoRA](https://huggingface.co/docs/peft/conceptual_guides/lora) (Low-Rank Adaptation) is one of the techniques supported by PEFT. It streamlines fine-tuning by using low-rank decomposition to represent weight updates, thereby drastically reducing the number of trainable parameters.
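To make the low-rank idea concrete, below is a minimal sketch of attaching a LoRA adapter to a base model with PEFT. The model ID and hyperparameters are illustrative placeholders, not OpenLLM defaults:
```python
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model

# Any causal LM supported by PEFT works here; opt-350m keeps the example small.
base = AutoModelForCausalLM.from_pretrained("facebook/opt-350m")

# LoRA represents each weight update as a product of two rank-r matrices,
# so only those small matrices are trained instead of the full weight.
config = LoraConfig(
    r=8,                                  # rank of the decomposition
    lora_alpha=16,                        # scaling factor for the update
    target_modules=["q_proj", "v_proj"],  # attention projections in OPT
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
)

model = get_peft_model(base, config)
model.print_trainable_parameters()  # a tiny fraction of the base parameters
```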
With OpenLLM, you can serve models together with any PEFT-compatible adapter layers using the `--adapter-id` option. For example:
```bash
openllm start opt --model-id facebook/opt-6.7b --adapter-id aarnphm/opt-6-7b-quotes
openllm start opt --model-id facebook/opt-6.7b --adapter-id aarnphm/opt-6-7b-quotes:default
```
OpenLLM also supports loading adapters from local file paths:
```bash
openllm start opt --model-id facebook/opt-6.7b --adapter-id /path/to/adapters
openllm start opt --model-id facebook/opt-6.7b --adapter-id /path/to/adapters:local_adapter
```
To serve multiple adapters, pass `--adapter-id` multiple times, optionally naming each adapter after a colon:
```bash
openllm start opt --model-id facebook/opt-6.7b --adapter-id aarnphm/opt-6.7b-lora --adapter-id aarnphm/opt-6.7b-lora:french_lora
openllm start opt --model-id facebook/opt-6.7b --adapter-id aarnphm/opt-6.7b-lora:default --adapter-id aarnphm/opt-6.7b-french:french_lora
```
By default, the first specified `adapter-id` is the default LoRA layer, but optionally you can specify a different LoRA layer for inference using the `/v1/adapters` endpoint:
By default, all adapters are injected into the model during startup, and the adapter to use can be selected per request via `adapter_name`:
```bash
curl -X POST http://localhost:3000/v1/adapters --json '{"adapter_name": "vn_lora"}'
curl -X 'POST' \
'http://localhost:3000/v1/generate' \
-H 'accept: application/json' \
-H 'Content-Type: application/json' \
-d '{
"prompt": "What is the meaning of life?",
"stop": [
"philosopher"
],
"llm_config": {
"max_new_tokens": 256,
"temperature": 0.75,
"top_k": 15,
"top_p": 1
},
"adapter_name": "default"
}'
```
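The same request can also be sent from Python. A small sketch using `requests`, assuming the server is reachable at the default `localhost:3000` address:
```python
import requests

payload = {
    "prompt": "What is the meaning of life?",
    "stop": ["philosopher"],
    "llm_config": {"max_new_tokens": 256, "temperature": 0.75, "top_k": 15, "top_p": 1},
    # must match one of the adapter names passed via --adapter-id at startup
    "adapter_name": "default",
}
resp = requests.post("http://localhost:3000/v1/generate", json=payload, timeout=60)
resp.raise_for_status()
print(resp.json())
```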
Note that when using multiple adapter names and IDs, it is recommended to set the default adapter before sending inference requests to avoid any performance degradation.
To include the adapters in the Bento, specify the `--adapter-id` option when running `openllm build`:
```bash
@@ -833,9 +847,9 @@ If you use a relative path for `--adapter-id`, you need to add `--build-ctx`.
openllm build opt --adapter-id ./path/to/adapter_id --build-ctx .
```
> [!NOTE]
> We will gradually roll out support for fine-tuning all models.
> Currently, the models supporting fine-tuning with OpenLLM include: OPT, Falcon, and LLaMA.
> [!IMPORTANT]
> Fine-tuning support is still experimental and currently only works with the PyTorch backend. vLLM support is coming soon.
## 🥅 Playground and Chat UI

View File

@@ -396,6 +396,7 @@ def _RunnerFactory(self: openllm.LLM[M, T],
yield 'llm_type', self.llm_type
yield 'backend', backend
yield 'llm_tag', self.tag
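# expose the resolved adapter map (unstructured into plain dicts) so consumers such as the /hf/adapters endpoint can introspect loaded adapters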
def _get_adapter_map(_: LLMRunner[M, T]) -> ResolvedAdapterMap: return converter.unstructure(self.adapter_map)
# yapf: enable
return types.new_class(self.__class__.__name__ + 'Runner', (bentoml.Runner,),

View File

@@ -40,7 +40,7 @@ tags:
- OpenAI
x-bentoml-name: list_models
responses:
'200':
200:
description: The Model object
content:
application/json:
@@ -105,7 +105,7 @@ requestBody:
schema:
$ref: '#/components/schemas/ChatCompletionRequest'
responses:
'200':
200:
description: OK
content:
application/json:
@@ -120,7 +120,7 @@ responses:
summary: One-shot output example
value: >
{"id": "chatcmpl-123", "object": "chat.completion", "created": 1677652288, "model": "gpt-3.5-turbo-0613", "choices": [{"index": 0, "message": {"role": "assistant", "content": "Hello there, how may I assist you today?"}, "finish_reason": "stop"}], "usage": {"prompt_tokens": 9, "completion_tokens": 12, "total_tokens": 21}}
'404':
404:
content:
application/json:
schema:
@@ -139,7 +139,7 @@ responses:
}
}
description: NotFound
'500':
500:
content:
application/json:
schema:
@@ -158,7 +158,7 @@ responses:
}
}
description: Internal Server Error
'400':
400:
content:
application/json:
schema:
@@ -238,7 +238,7 @@ requestBody:
- "\\n"
- "<|endoftext|>"
responses:
'200':
200:
description: OK
content:
application/json:
@@ -273,7 +273,7 @@ responses:
logprobs: null
finish_reason: null
model: gpt-3.5-turbo-instruct
'404':
404:
content:
application/json:
schema:
@@ -292,7 +292,7 @@ responses:
}
}
description: NotFound
'500':
500:
content:
application/json:
schema:
@@ -311,7 +311,7 @@ responses:
}
}
description: Internal Server Error
'400':
400:
content:
application/json:
schema:
@@ -379,13 +379,44 @@ responses:
content:
application/json:
schema:
$ref: '#/components/schemas/AgentErrorResponse'
$ref: '#/components/schemas/HFErrorResponse'
description: Bad Request
500:
content:
application/json:
schema:
$ref: '#/components/schemas/AgentErrorResponse'
$ref: '#/components/schemas/HFErrorResponse'
description: Internal Server Error
'''
HF_ADAPTERS_SCHEMA = '''\
---
consumes:
- application/json
description: Return the current list of adapters for the given LLM.
operationId: hf__adapters_map
produces:
- application/json
summary: Describes the adapters currently loaded for this LLM.
tags:
- HF
x-bentoml-name: adapters_map
responses:
200:
description: Return the list of loaded LoRA adapters.
content:
application/json:
example:
aarnphm/opt-6-7b-quotes:
adapter_name: default
adapter_type: LORA
aarnphm/opt-6-7b-dolly:
adapter_name: dolly
adapter_type: LORA
500:
content:
application/json:
schema:
$ref: '#/components/schemas/HFErrorResponse'
description: Internal Server Error
'''
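For reference, once a server is started with one or more `--adapter-id` flags, the endpoint described by this schema can be queried directly. A hedged sketch, assuming the default `localhost:3000` address:
```python
import requests

# The adapters route is mounted under the /hf prefix (see mount_to_svc below).
resp = requests.get("http://localhost:3000/hf/adapters")
resp.raise_for_status()
# Example shape, mirroring the schema above:
# {"aarnphm/opt-6-7b-quotes": {"adapter_name": "default", "adapter_type": "LORA"}}
print(resp.json())
```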

View File

@@ -3,6 +3,7 @@ import functools
import logging
import typing as t
from enum import Enum
from http import HTTPStatus
import orjson
@@ -13,24 +14,27 @@ from starlette.routing import Route
from openllm_core.utils import converter
from ._openapi import HF_ADAPTERS_SCHEMA
from ._openapi import HF_AGENT_SCHEMA
from ._openapi import add_schema_definitions
from ._openapi import append_schemas
from ._openapi import get_generator
from ..protocol.hf import AgentErrorResponse
from ..protocol.hf import AgentRequest
from ..protocol.hf import AgentResponse
from ..protocol.hf import HFErrorResponse
schemas = get_generator('hf',
components=[AgentRequest, AgentResponse, AgentErrorResponse],
components=[AgentRequest, AgentResponse, HFErrorResponse],
tags=[{
'name': 'HF',
'description': 'Includes HF Agent support',
'description': 'HF integration, including Agent and other schema endpoints.',
'externalDocs': 'https://huggingface.co/docs/transformers/main_classes/agent'
}])
logger = logging.getLogger(__name__)
if t.TYPE_CHECKING:
from peft.config import PeftConfig
from starlette.requests import Request
from starlette.responses import Response
@@ -41,17 +45,19 @@ if t.TYPE_CHECKING:
from openllm_core._typing_compat import T
def mount_to_svc(svc: bentoml.Service, llm: openllm.LLM[M, T]) -> bentoml.Service:
app = Starlette(
debug=True,
routes=[Route('/agent', endpoint=functools.partial(hf_agent, llm=llm), name='hf_agent', methods=['POST']),
Route('/schema', endpoint=openapi_schema, include_in_schema=False)])
app = Starlette(debug=True,
routes=[
Route('/agent', endpoint=functools.partial(hf_agent, llm=llm), name='hf_agent', methods=['POST']),
Route('/adapters', endpoint=functools.partial(adapters_map, llm=llm), name='adapters', methods=['GET']),
Route('/schema', endpoint=openapi_schema, include_in_schema=False)
])
mount_path = '/hf'
generated_schema = schemas.get_schema(routes=app.routes, mount_path=mount_path)
svc.mount_asgi_app(app, path=mount_path)
return append_schemas(svc, generated_schema, tags_order='append')
def error_response(status_code: HTTPStatus, message: str) -> JSONResponse:
return JSONResponse(converter.unstructure(AgentErrorResponse(message=message, error_code=status_code.value)), status_code=status_code.value)
return JSONResponse(converter.unstructure(HFErrorResponse(message=message, error_code=status_code.value)), status_code=status_code.value)
@add_schema_definitions(HF_AGENT_SCHEMA)
async def hf_agent(req: Request, llm: openllm.LLM[M, T]) -> Response:
@@ -71,5 +77,17 @@ async def hf_agent(req: Request, llm: openllm.LLM[M, T]) -> Response:
logger.error('Error while generating: %s', err)
return error_response(HTTPStatus.INTERNAL_SERVER_ERROR, 'Error while generating (Check server log).')
@add_schema_definitions(HF_ADAPTERS_SCHEMA)
def adapters_map(req: Request, llm: openllm.LLM[M, T]) -> Response:
if not llm.has_adapters: return error_response(HTTPStatus.NOT_FOUND, 'No adapters found.')
return JSONResponse(
{
adapter_tuple[1]: {
'adapter_name': k,
'adapter_type': t.cast(Enum, adapter_tuple[0].peft_type).value
} for k, adapter_tuple in t.cast(t.Dict[str, t.Tuple['PeftConfig', str]], dict(*llm.adapter_map.values())).items()
},
status_code=HTTPStatus.OK.value)
def openapi_schema(req: Request) -> Response:
return schemas.OpenAPIResponse(req)

View File

@@ -13,6 +13,6 @@ class AgentResponse:
generated_text: str
@attr.define
class AgentErrorResponse:
class HFErrorResponse:
error_code: int
message: str