infra: update docs on serving fine-tuning layers (#567)

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
Aaron Pham
2023-11-06 21:34:44 -05:00
committed by GitHub
parent b158609e95
commit 8fade070f3
5 changed files with 95 additions and 31 deletions

View File

@@ -791,36 +791,50 @@ openllm start falcon --model-id TheBloke/falcon-40b-instruct-GPTQ --quantize gpt
> first to install the dependency. From the GPTQ paper, it is recommended to quantize the weights before serving.
> See [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) for more information on GPTQ quantization.
## 🛠️ Fine-tuning support (Experimental)
## 🛠️ Serving fine-tuning layers
[PEFT](https://huggingface.co/docs/peft/index), or Parameter-Efficient Fine-Tuning, is a methodology designed to fine-tune pre-trained models more efficiently. Instead of adjusting all model parameters, PEFT focuses on tuning only a subset, reducing computational and storage costs. [LoRA](https://huggingface.co/docs/peft/conceptual_guides/lora) (Low-Rank Adaptation) is one of the techniques supported by PEFT. It streamlines fine-tuning by using low-rank decomposition to represent weight updates, thereby drastically reducing the number of trainable parameters.
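To make the low-rank idea concrete, below is a minimal sketch of attaching a LoRA adapter to a base model with PEFT. The model ID and hyperparameters are illustrative placeholders, not OpenLLM defaults:
```python
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model

# Any causal LM supported by PEFT works here; opt-350m keeps the example small.
base = AutoModelForCausalLM.from_pretrained("facebook/opt-350m")

# LoRA represents each weight update as a product of two rank-r matrices,
# so only those small matrices are trained instead of the full weight.
config = LoraConfig(
    r=8,                                  # rank of the decomposition
    lora_alpha=16,                        # scaling factor for the update
    target_modules=["q_proj", "v_proj"],  # attention projections in OPT
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
)

model = get_peft_model(base, config)
model.print_trainable_parameters()  # a tiny fraction of the base parameters
```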
With OpenLLM, you can serve models together with any PEFT-compatible adapter layers using the `--adapter-id` option. For example:
```bash
openllm start opt --model-id facebook/opt-6.7b --adapter-id aarnphm/opt-6-7b-quotes
openllm start opt --model-id facebook/opt-6.7b --adapter-id aarnphm/opt-6-7b-quotes:default
```
OpenLLM also supports loading adapters from local file paths:
```bash
openllm start opt --model-id facebook/opt-6.7b --adapter-id /path/to/adapters
openllm start opt --model-id facebook/opt-6.7b --adapter-id /path/to/adapters:local_adapter
```
To serve multiple adapters, pass `--adapter-id` multiple times, optionally naming each adapter after a colon:
```bash
openllm start opt --model-id facebook/opt-6.7b --adapter-id aarnphm/opt-6.7b-lora --adapter-id aarnphm/opt-6.7b-lora:french_lora
openllm start opt --model-id facebook/opt-6.7b --adapter-id aarnphm/opt-6.7b-lora:default --adapter-id aarnphm/opt-6.7b-french:french_lora
```
By default, the first specified `adapter-id` is the default LoRA layer, but optionally you can specify a different LoRA layer for inference using the `/v1/adapters` endpoint:
By default, all adapters are injected into the model during startup, and the adapter to use can be selected per request via `adapter_name`:
```bash
curl -X POST http://localhost:3000/v1/adapters --json '{"adapter_name": "vn_lora"}'
curl -X 'POST' \
'http://localhost:3000/v1/generate' \
-H 'accept: application/json' \
-H 'Content-Type: application/json' \
-d '{
"prompt": "What is the meaning of life?",
"stop": [
"philosopher"
],
"llm_config": {
"max_new_tokens": 256,
"temperature": 0.75,
"top_k": 15,
"top_p": 1
},
"adapter_name": "default"
}'
```
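The same request can also be sent from Python. A small sketch using `requests`, assuming the server is reachable at the default `localhost:3000` address:
```python
import requests

payload = {
    "prompt": "What is the meaning of life?",
    "stop": ["philosopher"],
    "llm_config": {"max_new_tokens": 256, "temperature": 0.75, "top_k": 15, "top_p": 1},
    # must match one of the adapter names passed via --adapter-id at startup
    "adapter_name": "default",
}
resp = requests.post("http://localhost:3000/v1/generate", json=payload, timeout=60)
resp.raise_for_status()
print(resp.json())
```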
Note that when using multiple adapter names and IDs, it is recommended to set the default adapter before sending inference requests to avoid any performance degradation.
To include the adapters in the Bento, specify the `--adapter-id` option when running `openllm build`:
```bash
@@ -833,9 +847,9 @@ If you use a relative path for `--adapter-id`, you need to add `--build-ctx`.
openllm build opt --adapter-id ./path/to/adapter_id --build-ctx .
```
> [!NOTE]
> We will gradually roll out support for fine-tuning all models.
> Currently, the models supporting fine-tuning with OpenLLM include: OPT, Falcon, and LLaMA.
> [!IMPORTANT]
> Fine-tuning support is still experimental and currently only works with the PyTorch backend. vLLM support is coming soon.
## 🥅 Playground and Chat UI

View File

@@ -396,6 +396,7 @@ def _RunnerFactory(self: openllm.LLM[M, T],
yield 'llm_type', self.llm_type
yield 'backend', backend
yield 'llm_tag', self.tag
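# expose the resolved adapter map (unstructured into plain dicts) so consumers such as the /hf/adapters endpoint can introspect loaded adapters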
def _get_adapter_map(_: LLMRunner[M, T]) -> ResolvedAdapterMap: return converter.unstructure(self.adapter_map)
# yapf: enable
return types.new_class(self.__class__.__name__ + 'Runner', (bentoml.Runner,),

View File

@@ -40,7 +40,7 @@ tags:
- OpenAI
x-bentoml-name: list_models
responses:
'200':
200:
description: The Model object
content:
application/json:
@@ -105,7 +105,7 @@ requestBody:
schema:
$ref: '#/components/schemas/ChatCompletionRequest'
responses:
'200':
200:
description: OK
content:
application/json:
@@ -120,7 +120,7 @@ responses:
summary: One-shot output example
value: >
{"id": "chatcmpl-123", "object": "chat.completion", "created": 1677652288, "model": "gpt-3.5-turbo-0613", "choices": [{"index": 0, "message": {"role": "assistant", "content": "Hello there, how may I assist you today?"}, "finish_reason": "stop"}], "usage": {"prompt_tokens": 9, "completion_tokens": 12, "total_tokens": 21}}
'404':
404:
content:
application/json:
schema:
@@ -139,7 +139,7 @@ responses:
}
}
description: NotFound
'500':
500:
content:
application/json:
schema:
@@ -158,7 +158,7 @@ responses:
}
}
description: Internal Server Error
'400':
400:
content:
application/json:
schema:
@@ -238,7 +238,7 @@ requestBody:
- "\\n"
- "<|endoftext|>"
responses:
'200':
200:
description: OK
content:
application/json:
@@ -273,7 +273,7 @@ responses:
logprobs: null
finish_reason: null
model: gpt-3.5-turbo-instruct
'404':
404:
content:
application/json:
schema:
@@ -292,7 +292,7 @@ responses:
}
}
description: NotFound
'500':
500:
content:
application/json:
schema:
@@ -311,7 +311,7 @@ responses:
}
}
description: Internal Server Error
'400':
400:
content:
application/json:
schema:
@@ -379,13 +379,44 @@ responses:
content:
application/json:
schema:
$ref: '#/components/schemas/AgentErrorResponse'
$ref: '#/components/schemas/HFErrorResponse'
description: Bad Request
500:
content:
application/json:
schema:
$ref: '#/components/schemas/AgentErrorResponse'
$ref: '#/components/schemas/HFErrorResponse'
description: Internal Server Error
'''
HF_ADAPTERS_SCHEMA = '''\
---
consumes:
- application/json
description: Return the current list of adapters for the given LLM.
operationId: hf__adapters_map
produces:
- application/json
summary: Describes the adapters currently loaded for this LLM.
tags:
- HF
x-bentoml-name: adapters_map
responses:
200:
description: Return the list of loaded LoRA adapters.
content:
application/json:
example:
aarnphm/opt-6-7b-quotes:
adapter_name: default
adapter_type: LORA
aarnphm/opt-6-7b-dolly:
adapter_name: dolly
adapter_type: LORA
500:
content:
application/json:
schema:
$ref: '#/components/schemas/HFErrorResponse'
description: Internal Server Error
'''
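For reference, once a server is started with one or more `--adapter-id` flags, the endpoint described by this schema can be queried directly. A hedged sketch, assuming the default `localhost:3000` address:
```python
import requests

# The adapters route is mounted under the /hf prefix (see mount_to_svc below).
resp = requests.get("http://localhost:3000/hf/adapters")
resp.raise_for_status()
# Example shape, mirroring the schema above:
# {"aarnphm/opt-6-7b-quotes": {"adapter_name": "default", "adapter_type": "LORA"}}
print(resp.json())
```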

View File

@@ -3,6 +3,7 @@ import functools
import logging
import typing as t
from enum import Enum
from http import HTTPStatus
import orjson
@@ -13,24 +14,27 @@ from starlette.routing import Route
from openllm_core.utils import converter
from ._openapi import HF_ADAPTERS_SCHEMA
from ._openapi import HF_AGENT_SCHEMA
from ._openapi import add_schema_definitions
from ._openapi import append_schemas
from ._openapi import get_generator
from ..protocol.hf import AgentErrorResponse
from ..protocol.hf import AgentRequest
from ..protocol.hf import AgentResponse
from ..protocol.hf import HFErrorResponse
schemas = get_generator('hf',
components=[AgentRequest, AgentResponse, AgentErrorResponse],
components=[AgentRequest, AgentResponse, HFErrorResponse],
tags=[{
'name': 'HF',
'description': 'Includes HF Agent support',
'description': 'HF integration, including Agent and other schema endpoints.',
'externalDocs': 'https://huggingface.co/docs/transformers/main_classes/agent'
}])
logger = logging.getLogger(__name__)
if t.TYPE_CHECKING:
from peft.config import PeftConfig
from starlette.requests import Request
from starlette.responses import Response
@@ -41,17 +45,19 @@ if t.TYPE_CHECKING:
from openllm_core._typing_compat import T
def mount_to_svc(svc: bentoml.Service, llm: openllm.LLM[M, T]) -> bentoml.Service:
app = Starlette(
debug=True,
routes=[Route('/agent', endpoint=functools.partial(hf_agent, llm=llm), name='hf_agent', methods=['POST']),
Route('/schema', endpoint=openapi_schema, include_in_schema=False)])
app = Starlette(debug=True,
routes=[
Route('/agent', endpoint=functools.partial(hf_agent, llm=llm), name='hf_agent', methods=['POST']),
Route('/adapters', endpoint=functools.partial(adapters_map, llm=llm), name='adapters', methods=['GET']),
Route('/schema', endpoint=openapi_schema, include_in_schema=False)
])
mount_path = '/hf'
generated_schema = schemas.get_schema(routes=app.routes, mount_path=mount_path)
svc.mount_asgi_app(app, path=mount_path)
return append_schemas(svc, generated_schema, tags_order='append')
def error_response(status_code: HTTPStatus, message: str) -> JSONResponse:
return JSONResponse(converter.unstructure(AgentErrorResponse(message=message, error_code=status_code.value)), status_code=status_code.value)
return JSONResponse(converter.unstructure(HFErrorResponse(message=message, error_code=status_code.value)), status_code=status_code.value)
@add_schema_definitions(HF_AGENT_SCHEMA)
async def hf_agent(req: Request, llm: openllm.LLM[M, T]) -> Response:
@@ -71,5 +77,17 @@ async def hf_agent(req: Request, llm: openllm.LLM[M, T]) -> Response:
logger.error('Error while generating: %s', err)
return error_response(HTTPStatus.INTERNAL_SERVER_ERROR, 'Error while generating (Check server log).')
@add_schema_definitions(HF_ADAPTERS_SCHEMA)
def adapters_map(req: Request, llm: openllm.LLM[M, T]) -> Response:
if not llm.has_adapters: return error_response(HTTPStatus.NOT_FOUND, 'No adapters found.')
return JSONResponse(
{
adapter_tuple[1]: {
'adapter_name': k,
'adapter_type': t.cast(Enum, adapter_tuple[0].peft_type).value
} for k, adapter_tuple in t.cast(t.Dict[str, t.Tuple['PeftConfig', str]], dict(*llm.adapter_map.values())).items()
},
status_code=HTTPStatus.OK.value)
def openapi_schema(req: Request) -> Response:
return schemas.OpenAPIResponse(req)

View File

@@ -13,6 +13,6 @@ class AgentResponse:
generated_text: str
@attr.define
class AgentErrorResponse:
class HFErrorResponse:
error_code: int
message: str