infra: update docs on serving fine-tuning layers (#567)

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
This commit is contained in:
Aaron Pham
2023-11-06 21:34:44 -05:00
committed by GitHub
parent b158609e95
commit 8fade070f3
5 changed files with 95 additions and 31 deletions

View File

@@ -396,6 +396,7 @@ def _RunnerFactory(self: openllm.LLM[M, T],
yield 'llm_type', self.llm_type
yield 'backend', backend
yield 'llm_tag', self.tag
def _get_adapter_map(_: LLMRunner[M, T]) -> ResolvedAdapterMap: return converter.unstructure(self.adapter_map)
# yapf: enable
return types.new_class(self.__class__.__name__ + 'Runner', (bentoml.Runner,),

View File

@@ -40,7 +40,7 @@ tags:
- OpenAI
x-bentoml-name: list_models
responses:
'200':
200:
description: The Model object
content:
application/json:
@@ -105,7 +105,7 @@ requestBody:
schema:
$ref: '#/components/schemas/ChatCompletionRequest'
responses:
'200':
200:
description: OK
content:
application/json:
@@ -120,7 +120,7 @@ responses:
summary: One-shot output example
value: >
{"id": "chatcmpl-123", "object": "chat.completion", "created": 1677652288, "model": "gpt-3.5-turbo-0613", "choices": [{"index": 0, "message": {"role": "assistant", "content": "Hello there, how may I assist you today?"}, "finish_reason": "stop"}], "usage": {"prompt_tokens": 9, "completion_tokens": 12, "total_tokens": 21}}
'404':
404:
content:
application/json:
schema:
@@ -139,7 +139,7 @@ responses:
}
}
description: NotFound
'500':
500:
content:
application/json:
schema:
@@ -158,7 +158,7 @@ responses:
}
}
description: Internal Server Error
'400':
400:
content:
application/json:
schema:
@@ -238,7 +238,7 @@ requestBody:
- "\\n"
- "<|endoftext|>"
responses:
'200':
200:
description: OK
content:
application/json:
@@ -273,7 +273,7 @@ responses:
logprobs: null
finish_reason: null
model: gpt-3.5-turbo-instruct
'404':
404:
content:
application/json:
schema:
@@ -292,7 +292,7 @@ responses:
}
}
description: NotFound
'500':
500:
content:
application/json:
schema:
@@ -311,7 +311,7 @@ responses:
}
}
description: Internal Server Error
'400':
400:
content:
application/json:
schema:
@@ -379,13 +379,44 @@ responses:
content:
application/json:
schema:
$ref: '#/components/schemas/AgentErrorResponse'
$ref: '#/components/schemas/HFErrorResponse'
description: Bad Request
500:
content:
application/json:
schema:
$ref: '#/components/schemas/AgentErrorResponse'
$ref: '#/components/schemas/HFErrorResponse'
description: Internal Server Error
'''
HF_ADAPTERS_SCHEMA = '''\
---
consumes:
- application/json
description: Return the current list of adapters for the given LLM.
operationId: hf__adapters_map
produces:
- application/json
summary: Describes the current adapter map (LoRA adapters) available for this LLM.
tags:
- HF
x-bentoml-name: adapters_map
responses:
200:
description: Return list of LoRA adapters.
content:
application/json:
example:
aarnphm/opt-6-7b-quotes:
adapter_name: default
adapter_type: LORA
aarnphm/opt-6-7b-dolly:
adapter_name: dolly
adapter_type: LORA
500:
content:
application/json:
schema:
$ref: '#/components/schemas/HFErrorResponse'
description: Internal Server Error
'''

View File

@@ -3,6 +3,7 @@ import functools
import logging
import typing as t
from enum import Enum
from http import HTTPStatus
import orjson
@@ -13,24 +14,27 @@ from starlette.routing import Route
from openllm_core.utils import converter
from ._openapi import HF_ADAPTERS_SCHEMA
from ._openapi import HF_AGENT_SCHEMA
from ._openapi import add_schema_definitions
from ._openapi import append_schemas
from ._openapi import get_generator
from ..protocol.hf import AgentErrorResponse
from ..protocol.hf import AgentRequest
from ..protocol.hf import AgentResponse
from ..protocol.hf import HFErrorResponse
schemas = get_generator('hf',
components=[AgentRequest, AgentResponse, AgentErrorResponse],
components=[AgentRequest, AgentResponse, HFErrorResponse],
tags=[{
'name': 'HF',
'description': 'Includes HF Agent support',
'description': 'HF integration, including Agent and others schema endpoints.',
'externalDocs': 'https://huggingface.co/docs/transformers/main_classes/agent'
}])
logger = logging.getLogger(__name__)
if t.TYPE_CHECKING:
from peft.config import PeftConfig
from starlette.requests import Request
from starlette.responses import Response
@@ -41,17 +45,19 @@ if t.TYPE_CHECKING:
from openllm_core._typing_compat import T
def mount_to_svc(svc: bentoml.Service, llm: openllm.LLM[M, T]) -> bentoml.Service:
app = Starlette(
debug=True,
routes=[Route('/agent', endpoint=functools.partial(hf_agent, llm=llm), name='hf_agent', methods=['POST']),
Route('/schema', endpoint=openapi_schema, include_in_schema=False)])
app = Starlette(debug=True,
routes=[
Route('/agent', endpoint=functools.partial(hf_agent, llm=llm), name='hf_agent', methods=['POST']),
Route('/adapters', endpoint=functools.partial(adapters_map, llm=llm), name='adapters', methods=['GET']),
Route('/schema', endpoint=openapi_schema, include_in_schema=False)
])
mount_path = '/hf'
generated_schema = schemas.get_schema(routes=app.routes, mount_path=mount_path)
svc.mount_asgi_app(app, path=mount_path)
return append_schemas(svc, generated_schema, tags_order='append')
def error_response(status_code: HTTPStatus, message: str) -> JSONResponse:
return JSONResponse(converter.unstructure(AgentErrorResponse(message=message, error_code=status_code.value)), status_code=status_code.value)
return JSONResponse(converter.unstructure(HFErrorResponse(message=message, error_code=status_code.value)), status_code=status_code.value)
@add_schema_definitions(HF_AGENT_SCHEMA)
async def hf_agent(req: Request, llm: openllm.LLM[M, T]) -> Response:
@@ -71,5 +77,17 @@ async def hf_agent(req: Request, llm: openllm.LLM[M, T]) -> Response:
logger.error('Error while generating: %s', err)
return error_response(HTTPStatus.INTERNAL_SERVER_ERROR, 'Error while generating (Check server log).')
@add_schema_definitions(HF_ADAPTERS_SCHEMA)
def adapters_map(req: Request, llm: openllm.LLM[M, T]) -> Response:
if not llm.has_adapters: return error_response(HTTPStatus.NOT_FOUND, 'No adapters found.')
return JSONResponse(
{
adapter_tuple[1]: {
'adapter_name': k,
'adapter_type': t.cast(Enum, adapter_tuple[0].peft_type).value
} for k, adapter_tuple in t.cast(t.Dict[str, t.Tuple['PeftConfig', str]], dict(*llm.adapter_map.values())).items()
},
status_code=HTTPStatus.OK.value)
def openapi_schema(req: Request) -> Response:
return schemas.OpenAPIResponse(req)

View File

@@ -13,6 +13,6 @@ class AgentResponse:
generated_text: str
@attr.define
class AgentErrorResponse:
class HFErrorResponse:
error_code: int
message: str