From 8fade070f3f6167a8d15bf4f2d0088b48a6730c2 Mon Sep 17 00:00:00 2001 From: Aaron Pham <29749331+aarnphm@users.noreply.github.com> Date: Mon, 6 Nov 2023 21:34:44 -0500 Subject: [PATCH] infra: update docs on serving fine-tuning layers (#567) Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com> --- README.md | 36 +++++++++---- openllm-python/src/openllm/_llm.py | 1 + .../src/openllm/entrypoints/_openapi.py | 53 +++++++++++++++---- openllm-python/src/openllm/entrypoints/hf.py | 34 +++++++++--- openllm-python/src/openllm/protocol/hf.py | 2 +- 5 files changed, 95 insertions(+), 31 deletions(-) diff --git a/README.md b/README.md index cbf512c7..cffc5584 100644 --- a/README.md +++ b/README.md @@ -791,36 +791,50 @@ openllm start falcon --model-id TheBloke/falcon-40b-instruct-GPTQ --quantize gpt > first to install the dependency. From the GPTQ paper, it is recommended to quantized the weights before serving. > See [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) for more information on GPTQ quantization. -## 🛠️ Fine-tuning support (Experimental) +## 🛠️ Serving fine-tuning layers [PEFT](https://huggingface.co/docs/peft/index), or Parameter-Efficient Fine-Tuning, is a methodology designed to fine-tune pre-trained models more efficiently. Instead of adjusting all model parameters, PEFT focuses on tuning only a subset, reducing computational and storage costs. [LoRA](https://huggingface.co/docs/peft/conceptual_guides/lora) (Low-Rank Adaptation) is one of the techniques supported by PEFT. It streamlines fine-tuning by using low-rank decomposition to represent weight updates, thereby drastically reducing the number of trainable parameters. With OpenLLM, you can take advantage of the fine-tuning feature by serving models with any PEFT-compatible layers using the `--adapter-id` option. For example: ```bash -openllm start opt --model-id facebook/opt-6.7b --adapter-id aarnphm/opt-6-7b-quotes +openllm start opt --model-id facebook/opt-6.7b --adapter-id aarnphm/opt-6-7b-quotes:default ``` OpenLLM also provides flexibility by supporting adapters from custom file paths: ```bash -openllm start opt --model-id facebook/opt-6.7b --adapter-id /path/to/adapters +openllm start opt --model-id facebook/opt-6.7b --adapter-id /path/to/adapters:local_adapter ``` To use multiple adapters, use the following format: ```bash -openllm start opt --model-id facebook/opt-6.7b --adapter-id aarnphm/opt-6.7b-lora --adapter-id aarnphm/opt-6.7b-lora:french_lora +openllm start opt --model-id facebook/opt-6.7b --adapter-id aarnphm/opt-6.7b-lora:default --adapter-id aarnphm/opt-6.7b-french:french_lora ``` -By default, the first specified `adapter-id` is the default LoRA layer, but optionally you can specify a different LoRA layer for inference using the `/v1/adapters` endpoint: +By default, all adapters will be injected into the models during startup. Adapters can be specified per request via `adapter_name`: ```bash -curl -X POST http://localhost:3000/v1/adapters --json '{"adapter_name": "vn_lora"}' +curl -X 'POST' \ + 'http://localhost:3000/v1/generate' \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "prompt": "What is the meaning of life?", + "stop": [ + "philosopher" + ], + "llm_config": { + "max_new_tokens": 256, + "temperature": 0.75, + "top_k": 15, + "top_p": 1 + }, + "adapter_name": "default" +}' ``` -Note that if you are using multiple adapter names and IDs, it is recommended to set the default adapter before sending the inference to avoid any performance degradation. 
- To include this into the Bento, you can specify the `--adapter-id` option when using the `openllm build` command: ```bash @@ -833,9 +847,9 @@ If you use a relative path for `--adapter-id`, you need to add `--build-ctx`. openllm build opt --adapter-id ./path/to/adapter_id --build-ctx . ``` -> [!NOTE] -> We will gradually roll out support for fine-tuning all models. -> Currently, the models supporting fine-tuning with OpenLLM include: OPT, Falcon, and LlaMA. +> [!IMPORTANT] +> Fine-tuning support is still experimental and currently only works with PyTorch backend. vLLM support is coming soon. + ## 🥅 Playground and Chat UI diff --git a/openllm-python/src/openllm/_llm.py b/openllm-python/src/openllm/_llm.py index c1af32ef..2702df8b 100644 --- a/openllm-python/src/openllm/_llm.py +++ b/openllm-python/src/openllm/_llm.py @@ -396,6 +396,7 @@ def _RunnerFactory(self: openllm.LLM[M, T], yield 'llm_type', self.llm_type yield 'backend', backend yield 'llm_tag', self.tag + def _get_adapter_map(_: LLMRunner[M, T]) -> ResolvedAdapterMap: return converter.unstructure(self.adapter_map) # yapf: enable return types.new_class(self.__class__.__name__ + 'Runner', (bentoml.Runner,), diff --git a/openllm-python/src/openllm/entrypoints/_openapi.py b/openllm-python/src/openllm/entrypoints/_openapi.py index fc548ff1..54150a3e 100644 --- a/openllm-python/src/openllm/entrypoints/_openapi.py +++ b/openllm-python/src/openllm/entrypoints/_openapi.py @@ -40,7 +40,7 @@ tags: - OpenAI x-bentoml-name: list_models responses: - '200': + 200: description: The Model object content: application/json: @@ -105,7 +105,7 @@ requestBody: schema: $ref: '#/components/schemas/ChatCompletionRequest' responses: - '200': + 200: description: OK content: application/json: @@ -120,7 +120,7 @@ responses: summary: One-shot output example value: > {"id": "chatcmpl-123", "object": "chat.completion", "created": 1677652288, "model": "gpt-3.5-turbo-0613", "choices": [{"index": 0, "message": {"role": "assistant", "content": "Hello there, how may I assist you today?"}, "finish_reason": "stop"}], "usage": {"prompt_tokens": 9, "completion_tokens": 12, "total_tokens": 21}} - '404': + 404: content: application/json: schema: @@ -139,7 +139,7 @@ responses: } } description: NotFound - '500': + 500: content: application/json: schema: @@ -158,7 +158,7 @@ responses: } } description: Internal Server Error - '400': + 400: content: application/json: schema: @@ -238,7 +238,7 @@ requestBody: - "\\n" - "<|endoftext|>" responses: - '200': + 200: description: OK content: application/json: @@ -273,7 +273,7 @@ responses: logprobs: null finish_reason: null model: gpt-3.5-turbo-instruct - '404': + 404: content: application/json: schema: @@ -292,7 +292,7 @@ responses: } } description: NotFound - '500': + 500: content: application/json: schema: @@ -311,7 +311,7 @@ responses: } } description: Internal Server Error - '400': + 400: content: application/json: schema: @@ -379,13 +379,44 @@ responses: content: application/json: schema: - $ref: '#/components/schemas/AgentErrorResponse' + $ref: '#/components/schemas/HFErrorResponse' description: Bad Request 500: content: application/json: schema: - $ref: '#/components/schemas/AgentErrorResponse' + $ref: '#/components/schemas/HFErrorResponse' + description: Not Found +''' +HF_ADAPTERS_SCHEMA = '''\ +--- +consumes: +- application/json +description: Return current list of adapters for given LLM. +operationId: hf__adapters_map +produces: + - application/json +summary: Describes a model offering that can be used with the API. 
+tags: + - HF +x-bentoml-name: adapters_map +responses: + 200: + description: Return list of LoRA adapters. + content: + application/json: + example: + aarnphm/opt-6-7b-quotes: + adapter_name: default + adapter_type: LORA + aarnphm/opt-6-7b-dolly: + adapter_name: dolly + adapter_type: LORA + 500: + content: + application/json: + schema: + $ref: '#/components/schemas/HFErrorResponse' description: Not Found ''' diff --git a/openllm-python/src/openllm/entrypoints/hf.py b/openllm-python/src/openllm/entrypoints/hf.py index 8dcaee11..c92ad6bc 100644 --- a/openllm-python/src/openllm/entrypoints/hf.py +++ b/openllm-python/src/openllm/entrypoints/hf.py @@ -3,6 +3,7 @@ import functools import logging import typing as t +from enum import Enum from http import HTTPStatus import orjson @@ -13,24 +14,27 @@ from starlette.routing import Route from openllm_core.utils import converter +from ._openapi import HF_ADAPTERS_SCHEMA from ._openapi import HF_AGENT_SCHEMA from ._openapi import add_schema_definitions from ._openapi import append_schemas from ._openapi import get_generator -from ..protocol.hf import AgentErrorResponse from ..protocol.hf import AgentRequest from ..protocol.hf import AgentResponse +from ..protocol.hf import HFErrorResponse schemas = get_generator('hf', - components=[AgentRequest, AgentResponse, AgentErrorResponse], + components=[AgentRequest, AgentResponse, HFErrorResponse], tags=[{ 'name': 'HF', - 'description': 'Includes HF Agent support', + 'description': 'HF integration, including Agent and others schema endpoints.', 'externalDocs': 'https://huggingface.co/docs/transformers/main_classes/agent' }]) logger = logging.getLogger(__name__) if t.TYPE_CHECKING: + + from peft.config import PeftConfig from starlette.requests import Request from starlette.responses import Response @@ -41,17 +45,19 @@ if t.TYPE_CHECKING: from openllm_core._typing_compat import T def mount_to_svc(svc: bentoml.Service, llm: openllm.LLM[M, T]) -> bentoml.Service: - app = Starlette( - debug=True, - routes=[Route('/agent', endpoint=functools.partial(hf_agent, llm=llm), name='hf_agent', methods=['POST']), - Route('/schema', endpoint=openapi_schema, include_in_schema=False)]) + app = Starlette(debug=True, + routes=[ + Route('/agent', endpoint=functools.partial(hf_agent, llm=llm), name='hf_agent', methods=['POST']), + Route('/adapters', endpoint=functools.partial(adapters_map, llm=llm), name='adapters', methods=['GET']), + Route('/schema', endpoint=openapi_schema, include_in_schema=False) + ]) mount_path = '/hf' generated_schema = schemas.get_schema(routes=app.routes, mount_path=mount_path) svc.mount_asgi_app(app, path=mount_path) return append_schemas(svc, generated_schema, tags_order='append') def error_response(status_code: HTTPStatus, message: str) -> JSONResponse: - return JSONResponse(converter.unstructure(AgentErrorResponse(message=message, error_code=status_code.value)), status_code=status_code.value) + return JSONResponse(converter.unstructure(HFErrorResponse(message=message, error_code=status_code.value)), status_code=status_code.value) @add_schema_definitions(HF_AGENT_SCHEMA) async def hf_agent(req: Request, llm: openllm.LLM[M, T]) -> Response: @@ -71,5 +77,17 @@ async def hf_agent(req: Request, llm: openllm.LLM[M, T]) -> Response: logger.error('Error while generating: %s', err) return error_response(HTTPStatus.INTERNAL_SERVER_ERROR, 'Error while generating (Check server log).') +@add_schema_definitions(HF_ADAPTERS_SCHEMA) +def adapters_map(req: Request, llm: openllm.LLM[M, T]) -> Response: + if not 
llm.has_adapters: return error_response(HTTPStatus.NOT_FOUND, 'No adapters found.') + return JSONResponse( + { + adapter_tuple[1]: { + 'adapter_name': k, + 'adapter_type': t.cast(Enum, adapter_tuple[0].peft_type).value + } for k, adapter_tuple in t.cast(t.Dict[str, t.Tuple['PeftConfig', str]], dict(*llm.adapter_map.values())).items() + }, + status_code=HTTPStatus.OK.value) + def openapi_schema(req: Request) -> Response: return schemas.OpenAPIResponse(req) diff --git a/openllm-python/src/openllm/protocol/hf.py b/openllm-python/src/openllm/protocol/hf.py index bf3d1dea..85657187 100644 --- a/openllm-python/src/openllm/protocol/hf.py +++ b/openllm-python/src/openllm/protocol/hf.py @@ -13,6 +13,6 @@ class AgentResponse: generated_text: str @attr.define -class AgentErrorResponse: +class HFErrorResponse: error_code: int message: str
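
A quick way to verify the behavior documented above: once a server is started with one or more `--adapter-id` options, the new `GET /hf/adapters` endpoint reports which LoRA layers were injected and under which `adapter_name`, so clients know which names are valid for the `adapter_name` field of `/v1/generate`. A minimal sketch, assuming the server from the README examples is running locally on the default port 3000:

```bash
# List the adapters currently attached to the running LLM.
# The response follows the example given in HF_ADAPTERS_SCHEMA, e.g.:
#   {"aarnphm/opt-6-7b-quotes": {"adapter_name": "default", "adapter_type": "LORA"}}
curl -X 'GET' \
  'http://localhost:3000/hf/adapters' \
  -H 'accept: application/json'
```

If the server was started without any adapters, the endpoint returns a 404 with the message "No adapters found.", matching the `error_response` branch in `adapters_map`.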