From 8fade070f3f6167a8d15bf4f2d0088b48a6730c2 Mon Sep 17 00:00:00 2001 From: Aaron Pham <29749331+aarnphm@users.noreply.github.com> Date: Mon, 6 Nov 2023 21:34:44 -0500 Subject: [PATCH] infra: update docs on serving fine-tuning layers (#567) Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com> --- README.md | 36 +++++++++---- openllm-python/src/openllm/_llm.py | 1 + .../src/openllm/entrypoints/_openapi.py | 53 +++++++++++++++---- openllm-python/src/openllm/entrypoints/hf.py | 34 +++++++++--- openllm-python/src/openllm/protocol/hf.py | 2 +- 5 files changed, 95 insertions(+), 31 deletions(-) diff --git a/README.md b/README.md index cbf512c7..cffc5584 100644 --- a/README.md +++ b/README.md @@ -791,36 +791,50 @@ openllm start falcon --model-id TheBloke/falcon-40b-instruct-GPTQ --quantize gpt > first to install the dependency. From the GPTQ paper, it is recommended to quantized the weights before serving. > See [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) for more information on GPTQ quantization. -## 🛠️ Fine-tuning support (Experimental) +## 🛠️ Serving fine-tuning layers [PEFT](https://huggingface.co/docs/peft/index), or Parameter-Efficient Fine-Tuning, is a methodology designed to fine-tune pre-trained models more efficiently. Instead of adjusting all model parameters, PEFT focuses on tuning only a subset, reducing computational and storage costs. [LoRA](https://huggingface.co/docs/peft/conceptual_guides/lora) (Low-Rank Adaptation) is one of the techniques supported by PEFT. It streamlines fine-tuning by using low-rank decomposition to represent weight updates, thereby drastically reducing the number of trainable parameters. With OpenLLM, you can take advantage of the fine-tuning feature by serving models with any PEFT-compatible layers using the `--adapter-id` option. For example: ```bash -openllm start opt --model-id facebook/opt-6.7b --adapter-id aarnphm/opt-6-7b-quotes +openllm start opt --model-id facebook/opt-6.7b --adapter-id aarnphm/opt-6-7b-quotes:default ``` OpenLLM also provides flexibility by supporting adapters from custom file paths: ```bash -openllm start opt --model-id facebook/opt-6.7b --adapter-id /path/to/adapters +openllm start opt --model-id facebook/opt-6.7b --adapter-id /path/to/adapters:local_adapter ``` To use multiple adapters, use the following format: ```bash -openllm start opt --model-id facebook/opt-6.7b --adapter-id aarnphm/opt-6.7b-lora --adapter-id aarnphm/opt-6.7b-lora:french_lora +openllm start opt --model-id facebook/opt-6.7b --adapter-id aarnphm/opt-6.7b-lora:default --adapter-id aarnphm/opt-6.7b-french:french_lora ``` -By default, the first specified `adapter-id` is the default LoRA layer, but optionally you can specify a different LoRA layer for inference using the `/v1/adapters` endpoint: +By default, all adapters will be injected into the models during startup. Adapters can be specified per request via `adapter_name`: ```bash -curl -X POST http://localhost:3000/v1/adapters --json '{"adapter_name": "vn_lora"}' +curl -X 'POST' \ + 'http://localhost:3000/v1/generate' \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "prompt": "What is the meaning of life?", + "stop": [ + "philosopher" + ], + "llm_config": { + "max_new_tokens": 256, + "temperature": 0.75, + "top_k": 15, + "top_p": 1 + }, + "adapter_name": "default" +}' ``` -Note that if you are using multiple adapter names and IDs, it is recommended to set the default adapter before sending the inference to avoid any performance degradation. 
- To include this into the Bento, you can specify the `--adapter-id` option when using the `openllm build` command: ```bash @@ -833,9 +847,9 @@ If you use a relative path for `--adapter-id`, you need to add `--build-ctx`. openllm build opt --adapter-id ./path/to/adapter_id --build-ctx . ``` -> [!NOTE] -> We will gradually roll out support for fine-tuning all models. -> Currently, the models supporting fine-tuning with OpenLLM include: OPT, Falcon, and LlaMA. +> [!IMPORTANT] +> Fine-tuning support is still experimental and currently only works with PyTorch backend. vLLM support is coming soon. + ## 🥅 Playground and Chat UI diff --git a/openllm-python/src/openllm/_llm.py b/openllm-python/src/openllm/_llm.py index c1af32ef..2702df8b 100644 --- a/openllm-python/src/openllm/_llm.py +++ b/openllm-python/src/openllm/_llm.py @@ -396,6 +396,7 @@ def _RunnerFactory(self: openllm.LLM[M, T], yield 'llm_type', self.llm_type yield 'backend', backend yield 'llm_tag', self.tag + def _get_adapter_map(_: LLMRunner[M, T]) -> ResolvedAdapterMap: return converter.unstructure(self.adapter_map) # yapf: enable return types.new_class(self.__class__.__name__ + 'Runner', (bentoml.Runner,), diff --git a/openllm-python/src/openllm/entrypoints/_openapi.py b/openllm-python/src/openllm/entrypoints/_openapi.py index fc548ff1..54150a3e 100644 --- a/openllm-python/src/openllm/entrypoints/_openapi.py +++ b/openllm-python/src/openllm/entrypoints/_openapi.py @@ -40,7 +40,7 @@ tags: - OpenAI x-bentoml-name: list_models responses: - '200': + 200: description: The Model object content: application/json: @@ -105,7 +105,7 @@ requestBody: schema: $ref: '#/components/schemas/ChatCompletionRequest' responses: - '200': + 200: description: OK content: application/json: @@ -120,7 +120,7 @@ responses: summary: One-shot output example value: > {"id": "chatcmpl-123", "object": "chat.completion", "created": 1677652288, "model": "gpt-3.5-turbo-0613", "choices": [{"index": 0, "message": {"role": "assistant", "content": "Hello there, how may I assist you today?"}, "finish_reason": "stop"}], "usage": {"prompt_tokens": 9, "completion_tokens": 12, "total_tokens": 21}} - '404': + 404: content: application/json: schema: @@ -139,7 +139,7 @@ responses: } } description: NotFound - '500': + 500: content: application/json: schema: @@ -158,7 +158,7 @@ responses: } } description: Internal Server Error - '400': + 400: content: application/json: schema: @@ -238,7 +238,7 @@ requestBody: - "\\n" - "<|endoftext|>" responses: - '200': + 200: description: OK content: application/json: @@ -273,7 +273,7 @@ responses: logprobs: null finish_reason: null model: gpt-3.5-turbo-instruct - '404': + 404: content: application/json: schema: @@ -292,7 +292,7 @@ responses: } } description: NotFound - '500': + 500: content: application/json: schema: @@ -311,7 +311,7 @@ responses: } } description: Internal Server Error - '400': + 400: content: application/json: schema: @@ -379,13 +379,44 @@ responses: content: application/json: schema: - $ref: '#/components/schemas/AgentErrorResponse' + $ref: '#/components/schemas/HFErrorResponse' description: Bad Request 500: content: application/json: schema: - $ref: '#/components/schemas/AgentErrorResponse' + $ref: '#/components/schemas/HFErrorResponse' + description: Not Found +''' +HF_ADAPTERS_SCHEMA = '''\ +--- +consumes: +- application/json +description: Return current list of adapters for given LLM. +operationId: hf__adapters_map +produces: + - application/json +summary: Describes a model offering that can be used with the API. 
+tags: + - HF +x-bentoml-name: adapters_map +responses: + 200: + description: Return list of LoRA adapters. + content: + application/json: + example: + aarnphm/opt-6-7b-quotes: + adapter_name: default + adapter_type: LORA + aarnphm/opt-6-7b-dolly: + adapter_name: dolly + adapter_type: LORA + 500: + content: + application/json: + schema: + $ref: '#/components/schemas/HFErrorResponse' description: Not Found ''' diff --git a/openllm-python/src/openllm/entrypoints/hf.py b/openllm-python/src/openllm/entrypoints/hf.py index 8dcaee11..c92ad6bc 100644 --- a/openllm-python/src/openllm/entrypoints/hf.py +++ b/openllm-python/src/openllm/entrypoints/hf.py @@ -3,6 +3,7 @@ import functools import logging import typing as t +from enum import Enum from http import HTTPStatus import orjson @@ -13,24 +14,27 @@ from starlette.routing import Route from openllm_core.utils import converter +from ._openapi import HF_ADAPTERS_SCHEMA from ._openapi import HF_AGENT_SCHEMA from ._openapi import add_schema_definitions from ._openapi import append_schemas from ._openapi import get_generator -from ..protocol.hf import AgentErrorResponse from ..protocol.hf import AgentRequest from ..protocol.hf import AgentResponse +from ..protocol.hf import HFErrorResponse schemas = get_generator('hf', - components=[AgentRequest, AgentResponse, AgentErrorResponse], + components=[AgentRequest, AgentResponse, HFErrorResponse], tags=[{ 'name': 'HF', - 'description': 'Includes HF Agent support', + 'description': 'HF integration, including Agent and others schema endpoints.', 'externalDocs': 'https://huggingface.co/docs/transformers/main_classes/agent' }]) logger = logging.getLogger(__name__) if t.TYPE_CHECKING: + + from peft.config import PeftConfig from starlette.requests import Request from starlette.responses import Response @@ -41,17 +45,19 @@ if t.TYPE_CHECKING: from openllm_core._typing_compat import T def mount_to_svc(svc: bentoml.Service, llm: openllm.LLM[M, T]) -> bentoml.Service: - app = Starlette( - debug=True, - routes=[Route('/agent', endpoint=functools.partial(hf_agent, llm=llm), name='hf_agent', methods=['POST']), - Route('/schema', endpoint=openapi_schema, include_in_schema=False)]) + app = Starlette(debug=True, + routes=[ + Route('/agent', endpoint=functools.partial(hf_agent, llm=llm), name='hf_agent', methods=['POST']), + Route('/adapters', endpoint=functools.partial(adapters_map, llm=llm), name='adapters', methods=['GET']), + Route('/schema', endpoint=openapi_schema, include_in_schema=False) + ]) mount_path = '/hf' generated_schema = schemas.get_schema(routes=app.routes, mount_path=mount_path) svc.mount_asgi_app(app, path=mount_path) return append_schemas(svc, generated_schema, tags_order='append') def error_response(status_code: HTTPStatus, message: str) -> JSONResponse: - return JSONResponse(converter.unstructure(AgentErrorResponse(message=message, error_code=status_code.value)), status_code=status_code.value) + return JSONResponse(converter.unstructure(HFErrorResponse(message=message, error_code=status_code.value)), status_code=status_code.value) @add_schema_definitions(HF_AGENT_SCHEMA) async def hf_agent(req: Request, llm: openllm.LLM[M, T]) -> Response: @@ -71,5 +77,17 @@ async def hf_agent(req: Request, llm: openllm.LLM[M, T]) -> Response: logger.error('Error while generating: %s', err) return error_response(HTTPStatus.INTERNAL_SERVER_ERROR, 'Error while generating (Check server log).') +@add_schema_definitions(HF_ADAPTERS_SCHEMA) +def adapters_map(req: Request, llm: openllm.LLM[M, T]) -> Response: + if not 
llm.has_adapters: return error_response(HTTPStatus.NOT_FOUND, 'No adapters found.') + return JSONResponse( + { + adapter_tuple[1]: { + 'adapter_name': k, + 'adapter_type': t.cast(Enum, adapter_tuple[0].peft_type).value + } for k, adapter_tuple in t.cast(t.Dict[str, t.Tuple['PeftConfig', str]], dict(*llm.adapter_map.values())).items() + }, + status_code=HTTPStatus.OK.value) + def openapi_schema(req: Request) -> Response: return schemas.OpenAPIResponse(req) diff --git a/openllm-python/src/openllm/protocol/hf.py b/openllm-python/src/openllm/protocol/hf.py index bf3d1dea..85657187 100644 --- a/openllm-python/src/openllm/protocol/hf.py +++ b/openllm-python/src/openllm/protocol/hf.py @@ -13,6 +13,6 @@ class AgentResponse: generated_text: str @attr.define -class AgentErrorResponse: +class HFErrorResponse: error_code: int message: str
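
A quick way to verify the behavior documented above: once a server is started with one or more `--adapter-id` options, the new `GET /hf/adapters` endpoint reports which LoRA layers were injected and under which `adapter_name`, so clients know which names are valid for the `adapter_name` field of `/v1/generate`. A minimal sketch, assuming the server from the README examples is running locally on the default port 3000:

```bash
# List the adapters currently attached to the running LLM.
# The response follows the example given in HF_ADAPTERS_SCHEMA, e.g.:
#   {"aarnphm/opt-6-7b-quotes": {"adapter_name": "default", "adapter_type": "LORA"}}
curl -X 'GET' \
  'http://localhost:3000/hf/adapters' \
  -H 'accept: application/json'
```

If the server was started without any adapters, the endpoint returns a 404 with the message "No adapters found.", matching the `error_response` branch in `adapters_map`.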