Mirror of https://github.com/bentoml/OpenLLM.git, synced 2026-01-20 13:29:35 -05:00
infra: update docs on serving fine-tuning layers (#567)
Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
@@ -396,6 +396,7 @@ def _RunnerFactory(self: openllm.LLM[M, T],
     yield 'llm_type', self.llm_type
     yield 'backend', backend
     yield 'llm_tag', self.tag
+  def _get_adapter_map(_: LLMRunner[M, T]) -> ResolvedAdapterMap: return converter.unstructure(self.adapter_map)
   # yapf: enable

   return types.new_class(self.__class__.__name__ + 'Runner', (bentoml.Runner,),
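
The added `_get_adapter_map` line exposes the resolved adapter map as a runner method by unstructuring it into plain dicts. As a rough illustration of what a cattrs-style `converter.unstructure` does here (the `AdapterEntry` class and its values are hypothetical stand-ins, not OpenLLM types):

import attr
import cattrs

converter = cattrs.Converter()

@attr.define
class AdapterEntry:  # hypothetical stand-in for a resolved adapter record
  adapter_name: str
  adapter_type: str

adapter_map = {'aarnphm/opt-6-7b-quotes': AdapterEntry('default', 'LORA')}
# attrs instances become plain dicts, ready for JSON serialization
print(converter.unstructure(adapter_map))
# -> {'aarnphm/opt-6-7b-quotes': {'adapter_name': 'default', 'adapter_type': 'LORA'}}
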
@@ -40,7 +40,7 @@ tags:
 - OpenAI
 x-bentoml-name: list_models
 responses:
-  '200':
+  200:
     description: The Model object
     content:
       application/json:
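
The `'200':` to `200:` change repeated throughout these hunks is more than cosmetic: the schema docstrings are parsed as YAML, where an unquoted `200` becomes an integer key while `'200'` stays a string. A quick check with PyYAML (assuming a PyYAML-style loader, which is what Starlette's schema tooling uses on docstrings) shows the difference:

import yaml

# quoted key stays a string
print(yaml.safe_load("responses:\n  '200':\n    description: OK"))
# -> {'responses': {'200': {'description': 'OK'}}}

# unquoted key parses as an integer
print(yaml.safe_load("responses:\n  200:\n    description: OK"))
# -> {'responses': {200: {'description': 'OK'}}}
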
@@ -105,7 +105,7 @@ requestBody:
     schema:
       $ref: '#/components/schemas/ChatCompletionRequest'
 responses:
-  '200':
+  200:
     description: OK
     content:
       application/json:
@@ -120,7 +120,7 @@ responses:
         summary: One-shot output example
         value: >
           {"id": "chatcmpl-123", "object": "chat.completion", "created": 1677652288, "model": "gpt-3.5-turbo-0613", "choices": [{"index": 0, "message": {"role": "assistant", "content": "Hello there, how may I assist you today?"}, "finish_reason": "stop"}], "usage": {"prompt_tokens": 9, "completion_tokens": 12, "total_tokens": 21}}
-  '404':
+  404:
     content:
       application/json:
         schema:
@@ -139,7 +139,7 @@ responses:
             }
           }
     description: NotFound
-  '500':
+  500:
     content:
       application/json:
         schema:
@@ -158,7 +158,7 @@ responses:
             }
           }
     description: Internal Server Error
-  '400':
+  400:
     content:
       application/json:
         schema:
@@ -238,7 +238,7 @@ requestBody:
         - "\\n"
         - "<|endoftext|>"
 responses:
-  '200':
+  200:
     description: OK
     content:
       application/json:
@@ -273,7 +273,7 @@ responses:
             logprobs: null
             finish_reason: null
           model: gpt-3.5-turbo-instruct
-  '404':
+  404:
     content:
       application/json:
         schema:
@@ -292,7 +292,7 @@ responses:
             }
           }
     description: NotFound
-  '500':
+  500:
     content:
       application/json:
         schema:
@@ -311,7 +311,7 @@ responses:
             }
           }
     description: Internal Server Error
-  '400':
+  400:
     content:
       application/json:
         schema:
@@ -379,13 +379,44 @@ responses:
     content:
       application/json:
         schema:
-          $ref: '#/components/schemas/AgentErrorResponse'
+          $ref: '#/components/schemas/HFErrorResponse'
     description: Bad Request
   500:
     content:
       application/json:
         schema:
-          $ref: '#/components/schemas/AgentErrorResponse'
+          $ref: '#/components/schemas/HFErrorResponse'
     description: Not Found
 '''
+HF_ADAPTERS_SCHEMA = '''\
+---
+consumes:
+- application/json
+description: Return current list of adapters for given LLM.
+operationId: hf__adapters_map
+produces:
+- application/json
+summary: Describes a model offering that can be used with the API.
+tags:
+- HF
+x-bentoml-name: adapters_map
+responses:
+  200:
+    description: Return list of LoRA adapters.
+    content:
+      application/json:
+        example:
+          aarnphm/opt-6-7b-quotes:
+            adapter_name: default
+            adapter_type: LORA
+          aarnphm/opt-6-7b-dolly:
+            adapter_name: dolly
+            adapter_type: LORA
+  500:
+    content:
+      application/json:
+        schema:
+          $ref: '#/components/schemas/HFErrorResponse'
+    description: Not Found
+'''
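
With the new `HF_ADAPTERS_SCHEMA` registered, a served model exposes its adapter listing under the `/hf` mount. A minimal client-side sketch, assuming an OpenLLM server running locally on the default port 3000:

import requests

resp = requests.get('http://localhost:3000/hf/adapters')
print(resp.status_code)
print(resp.json())
# expected shape, per the schema example above:
# {'aarnphm/opt-6-7b-quotes': {'adapter_name': 'default', 'adapter_type': 'LORA'}, ...}
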
@@ -3,6 +3,7 @@ import functools
 import logging
 import typing as t

+from enum import Enum
 from http import HTTPStatus

 import orjson
@@ -13,24 +14,27 @@ from starlette.routing import Route

 from openllm_core.utils import converter

+from ._openapi import HF_ADAPTERS_SCHEMA
 from ._openapi import HF_AGENT_SCHEMA
 from ._openapi import add_schema_definitions
 from ._openapi import append_schemas
 from ._openapi import get_generator
-from ..protocol.hf import AgentErrorResponse
 from ..protocol.hf import AgentRequest
 from ..protocol.hf import AgentResponse
+from ..protocol.hf import HFErrorResponse

 schemas = get_generator('hf',
-                        components=[AgentRequest, AgentResponse, AgentErrorResponse],
+                        components=[AgentRequest, AgentResponse, HFErrorResponse],
                         tags=[{
                             'name': 'HF',
-                            'description': 'Includes HF Agent support',
+                            'description': 'HF integration, including Agent and others schema endpoints.',
                             'externalDocs': 'https://huggingface.co/docs/transformers/main_classes/agent'
                         }])
 logger = logging.getLogger(__name__)

 if t.TYPE_CHECKING:

+  from peft.config import PeftConfig
   from starlette.requests import Request
   from starlette.responses import Response

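
The rename matters because each `$ref` string in the YAML schemas must match a component name registered with the generator; once the class is renamed, a leftover `#/components/schemas/AgentErrorResponse` reference would point at nothing. A toy consistency check illustrating the invariant this hunk restores:

# names registered via components=[...] above
components = {'AgentRequest', 'AgentResponse', 'HFErrorResponse'}
ref = '#/components/schemas/HFErrorResponse'
name = ref.rsplit('/', 1)[-1]
assert name in components, f'dangling $ref: {ref}'
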
@@ -41,17 +45,19 @@ if t.TYPE_CHECKING:
   from openllm_core._typing_compat import T

 def mount_to_svc(svc: bentoml.Service, llm: openllm.LLM[M, T]) -> bentoml.Service:
-  app = Starlette(
-      debug=True,
-      routes=[Route('/agent', endpoint=functools.partial(hf_agent, llm=llm), name='hf_agent', methods=['POST']),
-              Route('/schema', endpoint=openapi_schema, include_in_schema=False)])
+  app = Starlette(debug=True,
+                  routes=[
+                      Route('/agent', endpoint=functools.partial(hf_agent, llm=llm), name='hf_agent', methods=['POST']),
+                      Route('/adapters', endpoint=functools.partial(adapters_map, llm=llm), name='adapters', methods=['GET']),
+                      Route('/schema', endpoint=openapi_schema, include_in_schema=False)
+                  ])
   mount_path = '/hf'
   generated_schema = schemas.get_schema(routes=app.routes, mount_path=mount_path)
   svc.mount_asgi_app(app, path=mount_path)
   return append_schemas(svc, generated_schema, tags_order='append')

 def error_response(status_code: HTTPStatus, message: str) -> JSONResponse:
-  return JSONResponse(converter.unstructure(AgentErrorResponse(message=message, error_code=status_code.value)), status_code=status_code.value)
+  return JSONResponse(converter.unstructure(HFErrorResponse(message=message, error_code=status_code.value)), status_code=status_code.value)

 @add_schema_definitions(HF_AGENT_SCHEMA)
 async def hf_agent(req: Request, llm: openllm.LLM[M, T]) -> Response:
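
For context, `mount_asgi_app` hangs an arbitrary ASGI app off a BentoML service, which is how these HF routes end up reachable under `/hf/...`. A minimal self-contained sketch of the same mechanism (service name and endpoint are illustrative, following the BentoML 1.x `Service` API):

import bentoml
from starlette.applications import Starlette
from starlette.responses import JSONResponse
from starlette.routing import Route

async def ping(request):
  # trivial stand-in endpoint; the real app wires hf_agent and adapters_map
  return JSONResponse({'ok': True})

svc = bentoml.Service('demo')
svc.mount_asgi_app(Starlette(routes=[Route('/ping', ping)]), path='/hf')
# the endpoint is then served at /hf/ping alongside the service's own APIs
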
@@ -71,5 +77,17 @@ async def hf_agent(req: Request, llm: openllm.LLM[M, T]) -> Response:
     logger.error('Error while generating: %s', err)
     return error_response(HTTPStatus.INTERNAL_SERVER_ERROR, 'Error while generating (Check server log).')

+@add_schema_definitions(HF_ADAPTERS_SCHEMA)
+def adapters_map(req: Request, llm: openllm.LLM[M, T]) -> Response:
+  if not llm.has_adapters: return error_response(HTTPStatus.NOT_FOUND, 'No adapters found.')
+  return JSONResponse(
+      {
+          adapter_tuple[1]: {
+              'adapter_name': k,
+              'adapter_type': t.cast(Enum, adapter_tuple[0].peft_type).value
+          } for k, adapter_tuple in t.cast(t.Dict[str, t.Tuple['PeftConfig', str]], dict(*llm.adapter_map.values())).items()
+      },
+      status_code=HTTPStatus.OK.value)
+
 def openapi_schema(req: Request) -> Response:
   return schemas.OpenAPIResponse(req)
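
The comprehension in `adapters_map` inverts the resolved mapping of adapter name -> (PeftConfig, model id) into model id -> {adapter_name, adapter_type}, matching the schema example above. A pure-Python sketch of the same transformation, with hypothetical stand-ins for the peft types:

from enum import Enum

class PeftType(Enum):  # hypothetical stand-in for peft's PeftType enum
  LORA = 'LORA'

class StubPeftConfig:  # hypothetical stand-in for peft.config.PeftConfig
  peft_type = PeftType.LORA

resolved = {
    'default': (StubPeftConfig(), 'aarnphm/opt-6-7b-quotes'),
    'dolly': (StubPeftConfig(), 'aarnphm/opt-6-7b-dolly'),
}
payload = {model_id: {'adapter_name': name, 'adapter_type': cfg.peft_type.value}
           for name, (cfg, model_id) in resolved.items()}
print(payload)
# {'aarnphm/opt-6-7b-quotes': {'adapter_name': 'default', 'adapter_type': 'LORA'},
#  'aarnphm/opt-6-7b-dolly': {'adapter_name': 'dolly', 'adapter_type': 'LORA'}}
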
@@ -13,6 +13,6 @@ class AgentResponse:
   generated_text: str

 @attr.define
-class AgentErrorResponse:
+class HFErrorResponse:
   error_code: int
   message: str
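
Finally, the protocol-level rename: the same attrs class, now named for the whole HF surface rather than just the Agent endpoint. A quick round-trip of the payload shape, using a default cattrs converter as a stand-in for OpenLLM's configured one:

import attr
import cattrs

@attr.define
class HFErrorResponse:
  error_code: int
  message: str

body = cattrs.Converter().unstructure(HFErrorResponse(error_code=404, message='No adapters found.'))
print(body)  # {'error_code': 404, 'message': 'No adapters found.'}
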