import functools
import inspect
import types
import typing as t

import attr
from starlette.routing import Host, Mount, Route
from starlette.schemas import EndpointInfo, SchemaGenerator

from openllm_core.utils import first_not_none

OPENAPI_VERSION, API_VERSION = '3.0.2', '1.0'
# NOTE: OpenAI schema
LIST_MODELS_SCHEMA = '''\
---
consumes:
- application/json
description: >
  List and describe the various models available in the API.

  You can refer to the available supported models with `openllm models` for more
  information.
operationId: openai__list_models
produces:
- application/json
summary: Describes a model offering that can be used with the API.
tags:
- OpenAI
x-bentoml-name: list_models
responses:
  200:
    description: The Model object
    content:
      application/json:
        example:
          object: 'list'
          data:
          - id: __model_id__
            object: model
            created: 1686935002
            owned_by: 'na'
        schema:
          $ref: '#/components/schemas/ModelList'
'''
CHAT_COMPLETIONS_SCHEMA = '''\
---
consumes:
- application/json
description: >-
  Given a list of messages comprising a conversation, the model will return a
  response.
operationId: openai__chat_completions
produces:
- application/json
tags:
- OpenAI
x-bentoml-name: create_chat_completions
summary: Creates a model response for the given chat conversation.
requestBody:
  required: true
  content:
    application/json:
      examples:
        one-shot:
          summary: One-shot input example
          value:
            messages: __chat_messages__
            model: __model_id__
            max_tokens: 256
            temperature: 0.7
            top_p: 0.43
            n: 1
            stream: false
            chat_template: __chat_template__
            add_generation_prompt: __add_generation_prompt__
            echo: false
        streaming:
          summary: Streaming input example
          value:
            messages:
            - role: system
              content: You are a helpful assistant.
            - role: user
              content: Hello, I'm looking for a chatbot that can help me with my work.
            model: __model_id__
            max_tokens: 256
            temperature: 0.7
            top_p: 0.43
            n: 1
            stream: true
            stop:
            - "\\n"
            - "<|endoftext|>"
            chat_template: __chat_template__
            add_generation_prompt: __add_generation_prompt__
            echo: false
      schema:
        $ref: '#/components/schemas/ChatCompletionRequest'
responses:
  200:
    description: OK
    content:
      application/json:
        schema:
          $ref: '#/components/schemas/ChatCompletionResponse'
        examples:
          streaming:
            summary: Streaming output example
            value: >
              {"id":"chatcmpl-123","object":"chat.completion.chunk","created":1694268190,"model":"gpt-3.5-turbo-0613","choices":[{"index":0,"delta":{"role":"assistant","content":""},"finish_reason":null}]}
          one-shot:
            summary: One-shot output example
            value: >
              {"id": "chatcmpl-123", "object": "chat.completion", "created": 1677652288, "model": "gpt-3.5-turbo-0613", "choices": [{"index": 0, "message": {"role": "assistant", "content": "Hello there, how may I assist you today?"}, "finish_reason": "stop"}], "usage": {"prompt_tokens": 9, "completion_tokens": 12, "total_tokens": 21}}
  404:
    description: Not Found
    content:
      application/json:
        schema:
          $ref: '#/components/schemas/ErrorResponse'
        examples:
          wrong-model:
            summary: Wrong model
            value: >
              {
                "error": {
                  "message": "Model 'meta-llama--Llama-2-13b-chat-hf' does not exist. Try 'GET /v1/models' to see available models.\\nTip: If you are migrating from OpenAI, make sure to update your 'model' parameters in the request.",
                  "type": "invalid_request_error",
                  "object": "error",
                  "param": null,
                  "code": 404
                }
              }
  500:
    description: Internal Server Error
    content:
      application/json:
        schema:
          $ref: '#/components/schemas/ErrorResponse'
        examples:
          invalid-parameters:
            summary: Invalid parameters
            value: >
              {
                "error": {
                  "message": "`top_p` has to be a float > 0 and < 1, but is 4.0",
                  "type": "invalid_request_error",
                  "object": "error",
                  "param": null,
                  "code": 500
                }
              }
  400:
    description: Bad Request
    content:
      application/json:
        schema:
          $ref: '#/components/schemas/ErrorResponse'
        examples:
          invalid-json:
            summary: Invalid JSON sent
            value: >
              {
                "error": {
                  "message": "Invalid JSON input received (Check server log).",
                  "type": "invalid_request_error",
                  "object": "error",
                  "param": null,
                  "code": 400
                }
              }
          invalid-prompt:
            summary: Invalid prompt
            value: >
              {
                "error": {
                  "message": "Please provide a prompt.",
                  "type": "invalid_request_error",
                  "object": "error",
                  "param": null,
                  "code": 400
                }
              }
'''
COMPLETIONS_SCHEMA = '''\
---
consumes:
- application/json
description: >-
  Given a prompt, the model will return one or more predicted completions, and
  can also return the probabilities of alternative tokens at each position. We
  recommend most users use our Chat completions API.
operationId: openai__completions
produces:
- application/json
tags:
- OpenAI
x-bentoml-name: create_completions
summary: Creates a completion for the provided prompt and parameters.
requestBody:
  required: true
  content:
    application/json:
      schema:
        $ref: '#/components/schemas/CompletionRequest'
      examples:
        one-shot:
          summary: One-shot input example
          value:
            prompt: This is a test
            model: __model_id__
            max_tokens: 256
            temperature: 0.7
            logprobs: 1
            top_p: 0.43
            n: 1
            stream: false
        streaming:
          summary: Streaming input example
          value:
            prompt: This is a test
            model: __model_id__
            max_tokens: 256
            temperature: 0.7
            top_p: 0.43
            logprobs: 1
            n: 1
            stream: true
            stop:
            - "\\n"
            - "<|endoftext|>"
responses:
  200:
    description: OK
    content:
      application/json:
        schema:
          $ref: '#/components/schemas/CompletionResponse'
        examples:
          one-shot:
            summary: One-shot output example
            value:
              id: cmpl-uqkvlQyYK7bGYrRHQ0eXlWi7
              object: text_completion
              created: 1589478378
              model: __model_id__
              choices:
              - text: This is indeed a test
                index: 0
                logprobs: null
                finish_reason: length
              usage:
                prompt_tokens: 5
                completion_tokens: 7
                total_tokens: 12
          streaming:
            summary: Streaming output example
            value:
              id: cmpl-7iA7iJjj8V2zOkCGvWF2hAkDWBQZe
              object: text_completion
              created: 1690759702
              choices:
              - text: This
                index: 0
                logprobs: null
                finish_reason: null
              model: gpt-3.5-turbo-instruct
  404:
    description: Not Found
    content:
      application/json:
        schema:
          $ref: '#/components/schemas/ErrorResponse'
        examples:
          wrong-model:
            summary: Wrong model
            value: >
              {
                "error": {
                  "message": "Model 'meta-llama--Llama-2-13b-chat-hf' does not exist. Try 'GET /v1/models' to see available models.\\nTip: If you are migrating from OpenAI, make sure to update your 'model' parameters in the request.",
                  "type": "invalid_request_error",
                  "object": "error",
                  "param": null,
                  "code": 404
                }
              }
  500:
    description: Internal Server Error
    content:
      application/json:
        schema:
          $ref: '#/components/schemas/ErrorResponse'
        examples:
          invalid-parameters:
            summary: Invalid parameters
            value: >
              {
                "error": {
                  "message": "`top_p` has to be a float > 0 and < 1, but is 4.0",
                  "type": "invalid_request_error",
                  "object": "error",
                  "param": null,
                  "code": 500
                }
              }
  400:
    description: Bad Request
    content:
      application/json:
        schema:
          $ref: '#/components/schemas/ErrorResponse'
        examples:
          invalid-json:
            summary: Invalid JSON sent
            value: >
              {
                "error": {
                  "message": "Invalid JSON input received (Check server log).",
                  "type": "invalid_request_error",
                  "object": "error",
                  "param": null,
                  "code": 400
                }
              }
          invalid-prompt:
            summary: Invalid prompt
            value: >
              {
                "error": {
                  "message": "Please provide a prompt.",
                  "type": "invalid_request_error",
                  "object": "error",
                  "param": null,
                  "code": 400
                }
              }
'''
HF_AGENT_SCHEMA = '''\
---
consumes:
- application/json
description: Generate instructions for the given HF Agent chain; available for all OpenLLM supported models.
operationId: hf__agent
summary: Generate instructions for the given HF Agent.
tags:
- HF
x-bentoml-name: hf_agent
produces:
- application/json
requestBody:
  content:
    application/json:
      schema:
        $ref: '#/components/schemas/AgentRequest'
      example:
        inputs: "Is the following `text` positive or negative?"
        parameters:
          text: "This is a positive text."
          stop: ["\\n"]
  required: true
responses:
  200:
    description: Successfully generated instruction.
    content:
      application/json:
        example:
        - generated_text: "This is a generated instruction."
        schema:
          $ref: '#/components/schemas/AgentResponse'
  400:
    description: Bad Request
    content:
      application/json:
        schema:
          $ref: '#/components/schemas/HFErrorResponse'
  500:
    description: Internal Server Error
    content:
      application/json:
        schema:
          $ref: '#/components/schemas/HFErrorResponse'
'''
HF_ADAPTERS_SCHEMA = '''\
---
consumes:
- application/json
description: Returns the current list of adapters for the given LLM.
operationId: hf__adapters_map
produces:
- application/json
summary: Describes the LoRA adapters currently attached to the running LLM.
tags:
- HF
x-bentoml-name: hf_adapters
responses:
  200:
    description: Returns the list of LoRA adapters.
    content:
      application/json:
        example:
          aarnphm/opt-6-7b-quotes:
            adapter_name: default
            adapter_type: LORA
          aarnphm/opt-6-7b-dolly:
            adapter_name: dolly
            adapter_type: LORA
  500:
    description: Internal Server Error
    content:
      application/json:
        schema:
          $ref: '#/components/schemas/HFErrorResponse'
'''
COHERE_GENERATE_SCHEMA = '''\
---
consumes:
- application/json
description: >-
  Given a prompt, the model will return one or more predicted completions, and
  can also return the probabilities of alternative tokens at each position.
operationId: cohere__generate
produces:
- application/json
tags:
- Cohere
x-bentoml-name: cohere_generate
summary: Creates a completion for the provided prompt and parameters.
requestBody:
  required: true
  content:
    application/json:
      schema:
        $ref: '#/components/schemas/CohereGenerateRequest'
      examples:
        one-shot:
          summary: One-shot input example
          value:
            prompt: This is a test
            max_tokens: 256
            temperature: 0.7
            p: 0.43
            k: 12
            num_generations: 2
            stream: false
        streaming:
          summary: Streaming input example
          value:
            prompt: This is a test
            max_tokens: 256
            temperature: 0.7
            p: 0.43
            k: 12
            num_generations: 2
            stream: true
            stop_sequences:
            - "\\n"
            - "<|endoftext|>"
'''
COHERE_CHAT_SCHEMA = '''\
---
consumes:
- application/json
description: >-
  Given a list of messages comprising a conversation, the model will return a response.
operationId: cohere__chat
produces:
- application/json
tags:
- Cohere
x-bentoml-name: cohere_chat
summary: Creates a model response for the given chat conversation.
'''

# Map e.g. 'LIST_MODELS_SCHEMA' -> 'list_models' so schemas can be looked up by handler name.
_SCHEMAS = {k[:-7].lower(): v for k, v in locals().items() if k.endswith('_SCHEMA')}


def apply_schema(func, **attrs):
  # Substitute placeholder tokens (e.g. '__model_id__') in the handler's docstring.
  for k, v in attrs.items():
    func.__doc__ = func.__doc__.replace(k, v)
  return func


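# Illustrative sketch (not part of the public API): `apply_schema` fills in the
# `__model_id__`-style placeholders used by the schema strings above. The handler
# and model id below are hypothetical.
def _example_apply_schema():
  def chat_handler():
    """model: __model_id__"""

  patched = apply_schema(chat_handler, __model_id__='facebook/opt-1.3b')
  assert patched.__doc__ == 'model: facebook/opt-1.3b'

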
def add_schema_definitions(func):
  # Look up the schema for this handler by function name and append it to the docstring.
  append_str = _SCHEMAS.get(func.__name__.lower(), '')
  if not append_str:
    return func
  if func.__doc__ is None:
    func.__doc__ = ''
  func.__doc__ = func.__doc__.strip() + '\n\n' + append_str.strip()
  return func


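# Illustrative sketch: `add_schema_definitions` keys into `_SCHEMAS` by function
# name, so a handler must be named after its schema constant (LIST_MODELS_SCHEMA
# -> 'list_models'). The bare handler below is hypothetical.
def _example_add_schema_definitions():
  @add_schema_definitions
  def list_models():
    ...

  assert list_models.__doc__.strip().startswith('---')

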
class OpenLLMSchemaGenerator(SchemaGenerator):
  def get_endpoints(self, routes):
    endpoints_info = []
    for route in routes:
      if isinstance(route, (Mount, Host)):
        # Recurse into mounted sub-applications, prefixing their paths.
        sub_routes = route.routes or []
        path = self._remove_converter(route.path) if isinstance(route, Mount) else ''
        sub_endpoints = [
          EndpointInfo(path=f'{path}{sub_endpoint.path}', http_method=sub_endpoint.http_method, func=sub_endpoint.func)
          for sub_endpoint in self.get_endpoints(sub_routes)
        ]
        endpoints_info.extend(sub_endpoints)
      elif not isinstance(route, Route) or not route.include_in_schema:
        continue
      elif (
        inspect.isfunction(route.endpoint)
        or inspect.ismethod(route.endpoint)
        or isinstance(route.endpoint, functools.partial)
      ):
        endpoint = route.endpoint.func if isinstance(route.endpoint, functools.partial) else route.endpoint
        path = self._remove_converter(route.path)
        for method in route.methods or ['GET']:
          if method == 'HEAD':
            continue
          endpoints_info.append(EndpointInfo(path, method.lower(), endpoint))
      else:
        # Class-based endpoints: collect each implemented HTTP method handler.
        path = self._remove_converter(route.path)
        for method in ['get', 'post', 'put', 'patch', 'delete', 'options']:
          if not hasattr(route.endpoint, method):
            continue
          func = getattr(route.endpoint, method)
          endpoints_info.append(EndpointInfo(path, method.lower(), func))
    return endpoints_info

  def get_schema(self, routes, mount_path=None):
    schema = dict(self.base_schema)
    schema.setdefault('paths', {})
    endpoints_info = self.get_endpoints(routes)
    if mount_path:
      mount_path = f'/{mount_path}' if not mount_path.startswith('/') else mount_path

    for endpoint in endpoints_info:
      parsed = self.parse_docstring(endpoint.func)
      if not parsed:
        continue

      path = endpoint.path if mount_path is None else mount_path + endpoint.path
      if path not in schema['paths']:
        schema['paths'][path] = {}
      schema['paths'][path][endpoint.http_method] = parsed

    return schema


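# Illustrative sketch: the generator parses YAML docstrings on Starlette endpoints
# (including endpoints nested under a Mount) into an OpenAPI 'paths' mapping; like
# Starlette's SchemaGenerator, this relies on pyyaml. The route and handler below
# are hypothetical.
def _example_get_schema():
  from starlette.responses import JSONResponse

  async def models(request):
    """
    ---
    responses:
      200:
        description: OK
    """
    return JSONResponse([])

  generator = OpenLLMSchemaGenerator({'info': {'title': 'demo', 'version': API_VERSION}, 'version': OPENAPI_VERSION})
  routes = [Mount('/v1', routes=[Route('/models', models, methods=['GET'])])]
  # Yields {'paths': {'/v1/models': {'get': {'responses': {200: {'description': 'OK'}}}}}, ...}
  return generator.get_schema(routes)

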
def get_generator(title, components=None, tags=None, inject=True):
  base_schema = {'info': {'title': title, 'version': API_VERSION}, 'version': OPENAPI_VERSION}
  if components and inject:
    base_schema['components'] = {'schemas': {c.__name__: component_schema_generator(c) for c in components}}
  if tags and inject:
    base_schema['tags'] = tags
  return OpenLLMSchemaGenerator(base_schema)


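# Illustrative sketch: injecting component schemas and tags into the base document.
# `ModelCard` is a hypothetical attrs class standing in for the real protocol classes.
def _example_get_generator():
  @attr.define
  class ModelCard:
    id: str
    created: int = 0

  generator = get_generator('OpenAI compatible demo', components=[ModelCard], tags=[{'name': 'OpenAI'}])
  # base_schema now contains components['schemas']['ModelCard'] plus the given tags.
  return generator.base_schema

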
def component_schema_generator(attr_cls, description=None):
  schema = {'type': 'object', 'required': [], 'properties': {}, 'title': attr_cls.__name__}
  schema['description'] = first_not_none(
    getattr(attr_cls, '__doc__', None), description, default=f'Generated components for {attr_cls.__name__}'
  )
  for field in attr.fields(attr.resolve_types(attr_cls)):
    attr_type = field.type
    origin_type = t.get_origin(attr_type)
    args_type = t.get_args(attr_type)

    # Map Python types to OpenAPI schema types. Branches that need more than a bare
    # 'type' build `prop_schema` directly; the sentinel avoids the `locals()` check
    # that previously leaked `prop_schema` across loop iterations.
    prop_schema = None
    if attr_type == str:
      schema_type = 'string'
    elif attr_type == int:
      schema_type = 'integer'
    elif attr_type == float:
      schema_type = 'number'
    elif attr_type == bool:
      schema_type = 'boolean'
    elif origin_type is list or origin_type is tuple:
      schema_type = 'array'
    elif origin_type is dict:
      schema_type = 'object'
      # Assuming string keys for simplicity, and handling Any type for values.
      prop_schema = {'type': 'object', 'additionalProperties': True if args_type[1] is t.Any else {'type': 'string'}}
    elif attr_type == t.Optional[str]:
      schema_type = 'string'
    elif origin_type is t.Union and t.Any in args_type:
      schema_type = 'object'
      prop_schema = {'type': 'object', 'additionalProperties': True}
    else:
      schema_type = 'string'

    if prop_schema is None:
      prop_schema = {'type': schema_type}
    if field.default is not attr.NOTHING and not isinstance(field.default, attr.Factory):
      prop_schema['default'] = field.default
    if field.default is attr.NOTHING and not isinstance(attr_type, type(t.Optional)):
      schema['required'].append(field.name)
    schema['properties'][field.name] = prop_schema

  return schema


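# Illustrative sketch: container fields map onto OpenAPI object/array schemas, and a
# t.Dict[str, t.Any] value becomes an open-ended object. The class is hypothetical.
def _example_component_schema():
  @attr.define
  class GenerationInput:
    prompt: str
    stop: t.List[str] = attr.field(factory=list)
    extra: t.Dict[str, t.Any] = attr.field(factory=dict)

  schema = component_schema_generator(GenerationInput, description='Sampling parameters.')
  assert schema['properties']['extra'] == {'type': 'object', 'additionalProperties': True}
  assert schema['required'] == ['prompt']
  return schema

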
# Minimal duck-typed stand-in for bentoml's OpenAPISpecification; only `asdict` is needed.
_SimpleSchema = types.new_class(
  '_SimpleSchema',
  (object,),
  {},
  lambda ns: ns.update({'__init__': lambda self, it: setattr(self, 'it', it), 'asdict': lambda self: self.it}),
)


def append_schemas(svc, generated_schema, tags_order='prepend', inject=True):
  # HACK: Dirty hack to append schemas to an existing service. We definitely need to
  # support mounting Starlette app OpenAPI specs properly.
  from bentoml._internal.service.openapi.specification import OpenAPISpecification

  if not inject:
    return svc

  svc_schema = svc.openapi_spec
  if isinstance(svc_schema, (OpenAPISpecification, _SimpleSchema)):
    svc_schema = svc_schema.asdict()
  if 'tags' in generated_schema:
    if tags_order == 'prepend':
      svc_schema['tags'] = generated_schema['tags'] + svc_schema['tags']
    elif tags_order == 'append':
      svc_schema['tags'].extend(generated_schema['tags'])
    else:
      raise ValueError(f'Invalid tags_order: {tags_order}')
  if 'components' in generated_schema:
    svc_schema['components']['schemas'].update(generated_schema['components']['schemas'])
  svc_schema['paths'].update(generated_schema['paths'])

  # HACK: Patch these attributes until we have a better way to add Starlette schemas.
  from bentoml._internal.service import openapi

  def _generate_spec(svc, openapi_version=OPENAPI_VERSION):
    return _SimpleSchema(svc_schema)

  def asdict(self):
    return svc_schema

  openapi.generate_spec = _generate_spec
  OpenAPISpecification.asdict = asdict
  return svc
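

# Illustrative sketch of the intended wiring: generate a schema for a mounted app,
# then merge it into a bentoml.Service's OpenAPI spec. `svc` is a hypothetical
# service instance; the tag payload is an assumption, not OpenLLM's actual metadata.
def _example_append_schemas(svc):
  generator = get_generator('OpenAI', tags=[{'name': 'OpenAI', 'description': 'OpenAI compatible endpoints'}])
  generated = generator.get_schema(routes=[], mount_path='/v1')
  return append_schemas(svc, generated, tags_order='prepend')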