mirror of
https://github.com/bentoml/OpenLLM.git
synced 2026-04-21 23:47:23 -04:00
chore(releases): remove deadcode
Signed-off-by: Aaron Pham (mbp16) <29749331+aarnphm@users.noreply.github.com>
This commit is contained in:
@@ -27,14 +27,11 @@ __lazy = utils.LazyModule( # NOTE: update this to sys.modules[__name__] once my
|
||||
'exceptions': [],
|
||||
'client': ['HTTPClient', 'AsyncHTTPClient'],
|
||||
'bundle': [],
|
||||
'testing': [],
|
||||
'utils': ['api'],
|
||||
'entrypoints': ['mount_entrypoints'],
|
||||
'serialisation': ['ggml', 'transformers', 'vllm'],
|
||||
'_llm': ['LLM'],
|
||||
'_deprecated': ['Runner'],
|
||||
'_runners': ['runner'],
|
||||
'_quantisation': ['infer_quantisation_config'],
|
||||
'_strategies': ['CascadingResourceStrategy', 'get_resource'],
|
||||
},
|
||||
extra_objects={'COMPILED': COMPILED},
|
||||
@@ -44,7 +41,7 @@ __all__, __dir__ = __lazy.__all__, __lazy.__dir__
|
||||
_BREAKING_INTERNAL = ['_service', '_service_vars']
|
||||
_NEW_IMPL = ['LLM', *_BREAKING_INTERNAL]
|
||||
|
||||
if (_BENTOML_VERSION := utils.pkg.pkg_version_info('bentoml')) > (1, 2):
|
||||
if utils.pkg.pkg_version_info('bentoml') > (1, 2):
|
||||
import _openllm_tiny as _tiny
|
||||
else:
|
||||
_tiny = None
|
||||
@@ -58,7 +55,7 @@ def __getattr__(name: str) -> _t.Any:
|
||||
f'"{name}" is an internal implementation and considered breaking with older OpenLLM. Please migrate your code if you depend on this.'
|
||||
)
|
||||
_warnings.warn(
|
||||
f'"{name}" is considered deprecated implementation and will be removed in the future. Make sure to upgrade to OpenLLM 0.5.x',
|
||||
f'"{name}" is considered deprecated implementation and could be breaking. See https://github.com/bentoml/OpenLLM for more information on upgrading instruction.',
|
||||
DeprecationWarning,
|
||||
stacklevel=3,
|
||||
)
|
||||
|
||||
@@ -26,7 +26,6 @@ from . import (
|
||||
exceptions as exceptions,
|
||||
serialisation as serialisation,
|
||||
utils as utils,
|
||||
entrypoints as entrypoints,
|
||||
)
|
||||
from .serialisation import ggml as ggml, transformers as transformers, vllm as vllm
|
||||
from ._deprecated import Runner as Runner
|
||||
|
||||
@@ -1,16 +0,0 @@
|
||||
import importlib
|
||||
from openllm_core.utils import LazyModule
|
||||
|
||||
# Entrypoint submodules; each exposes a ``mount_to_svc`` hook.
_import_structure = {'openai': [], 'hf': []}


def mount_entrypoints(svc, llm):
  # Mount every registered third-party entrypoint app (OpenAI, HF) onto the
  # service, threading the (possibly wrapped) service through each mount.
  for entry in _import_structure:
    module = importlib.import_module(f'.{entry}', __name__)
    svc = module.mount_to_svc(svc, llm)
  return svc


# Defer submodule imports until first attribute access via the lazy module shim.
__lazy = LazyModule(
  __name__, globals()['__file__'], _import_structure, extra_objects={'mount_entrypoints': mount_entrypoints}
)
__all__, __dir__, __getattr__ = __lazy.__all__, __lazy.__dir__, __lazy.__getattr__
|
||||
@@ -1,17 +0,0 @@
|
||||
"""Entrypoint for all third-party apps.
|
||||
|
||||
Currently support OpenAI compatible API.
|
||||
|
||||
Each module should implement the following API:
|
||||
|
||||
- `mount_to_svc(svc: bentoml.Service, llm: openllm.LLM[M, T]) -> bentoml.Service: ...`
|
||||
"""
|
||||
|
||||
from typing import Any
|
||||
from _bentoml_sdk import Service
|
||||
from openllm_core._typing_compat import M, T
|
||||
|
||||
from . import hf as hf, openai as openai
|
||||
from .._llm import LLM
|
||||
|
||||
def mount_entrypoints(svc: Service[Any], llm: LLM[M, T]) -> Service: ...
|
||||
@@ -1,641 +0,0 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import functools
|
||||
import inspect
|
||||
import types
|
||||
import typing as t
|
||||
|
||||
import attr
|
||||
from starlette.routing import Host, Mount, Route
|
||||
from starlette.schemas import EndpointInfo, SchemaGenerator
|
||||
|
||||
from openllm_core.utils import first_not_none
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
import pydantic
|
||||
|
||||
OPENAPI_VERSION, API_VERSION = '3.0.2', '1.0'
|
||||
# NOTE: OpenAI schema
|
||||
LIST_MODELS_SCHEMA = """\
|
||||
---
|
||||
consumes:
|
||||
- application/json
|
||||
description: >
|
||||
List and describe the various models available in the API.
|
||||
|
||||
You can refer to the available supported models with `openllm models` for more
|
||||
information.
|
||||
operationId: openai__list_models
|
||||
produces:
|
||||
- application/json
|
||||
summary: Describes a model offering that can be used with the API.
|
||||
tags:
|
||||
- OpenAI
|
||||
x-bentoml-name: list_models
|
||||
responses:
|
||||
200:
|
||||
description: The Model object
|
||||
content:
|
||||
application/json:
|
||||
example:
|
||||
object: 'list'
|
||||
data:
|
||||
- id: __model_id__
|
||||
object: model
|
||||
created: 1686935002
|
||||
owned_by: 'na'
|
||||
schema:
|
||||
$ref: '#/components/schemas/ModelList'
|
||||
"""
|
||||
CHAT_COMPLETIONS_SCHEMA = """\
|
||||
---
|
||||
consumes:
|
||||
- application/json
|
||||
description: >-
|
||||
Given a list of messages comprising a conversation, the model will return a
|
||||
response.
|
||||
operationId: openai__chat_completions
|
||||
produces:
|
||||
- application/json
|
||||
tags:
|
||||
- OpenAI
|
||||
x-bentoml-name: create_chat_completions
|
||||
summary: Creates a model response for the given chat conversation.
|
||||
requestBody:
|
||||
required: true
|
||||
content:
|
||||
application/json:
|
||||
examples:
|
||||
one-shot:
|
||||
summary: One-shot input example
|
||||
value:
|
||||
messages: __chat_messages__
|
||||
model: __model_id__
|
||||
max_tokens: 256
|
||||
temperature: 0.7
|
||||
top_p: 0.43
|
||||
n: 1
|
||||
stream: false
|
||||
chat_template: __chat_template__
|
||||
add_generation_prompt: __add_generation_prompt__
|
||||
echo: false
|
||||
streaming:
|
||||
summary: Streaming input example
|
||||
value:
|
||||
messages:
|
||||
- role: system
|
||||
content: You are a helpful assistant.
|
||||
- role: user
|
||||
content: Hello, I'm looking for a chatbot that can help me with my work.
|
||||
model: __model_id__
|
||||
max_tokens: 256
|
||||
temperature: 0.7
|
||||
top_p: 0.43
|
||||
n: 1
|
||||
stream: true
|
||||
stop:
|
||||
- "<|endoftext|>"
|
||||
chat_template: __chat_template__
|
||||
add_generation_prompt: __add_generation_prompt__
|
||||
echo: false
|
||||
schema:
|
||||
$ref: '#/components/schemas/ChatCompletionRequest'
|
||||
responses:
|
||||
200:
|
||||
description: OK
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/ChatCompletionResponse'
|
||||
examples:
|
||||
streaming:
|
||||
summary: Streaming output example
|
||||
value: >
|
||||
{"id":"chatcmpl-123","object":"chat.completion.chunk","created":1694268190,"model":"gpt-3.5-turbo-0613","choices":[{"index":0,"delta":{"role":"assistant","content":""},"finish_reason":null}]}
|
||||
one-shot:
|
||||
summary: One-shot output example
|
||||
value: >
|
||||
{"id": "chatcmpl-123", "object": "chat.completion", "created": 1677652288, "model": "gpt-3.5-turbo-0613", "choices": [{"index": 0, "message": {"role": "assistant", "content": "Hello there, how may I assist you today?"}, "finish_reason": "stop"}], "usage": {"prompt_tokens": 9, "completion_tokens": 12, "total_tokens": 21}}
|
||||
404:
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/ErrorResponse'
|
||||
examples:
|
||||
wrong-model:
|
||||
summary: Wrong model
|
||||
value: >
|
||||
{
|
||||
"error": {
|
||||
"message": "Model 'meta-llama--Llama-2-13b-chat-hf' does not exists. Try 'GET /v1/models' to see available models.\\nTip: If you are migrating from OpenAI, make sure to update your 'model' parameters in the request.",
|
||||
"type": "invalid_request_error",
|
||||
"object": "error",
|
||||
"param": null,
|
||||
"code": 404
|
||||
}
|
||||
}
|
||||
description: NotFound
|
||||
500:
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/ErrorResponse'
|
||||
examples:
|
||||
invalid-parameters:
|
||||
summary: Invalid parameters
|
||||
value: >
|
||||
{
|
||||
"error": {
|
||||
"message": "`top_p` has to be a float > 0 and < 1, but is 4.0",
|
||||
"type": "invalid_request_error",
|
||||
"object": "error",
|
||||
"param": null,
|
||||
"code": 500
|
||||
}
|
||||
}
|
||||
description: Internal Server Error
|
||||
400:
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/ErrorResponse'
|
||||
examples:
|
||||
invalid-json:
|
||||
summary: Invalid JSON sent
|
||||
value: >
|
||||
{
|
||||
"error": {
|
||||
"message": "Invalid JSON input received (Check server log).",
|
||||
"type": "invalid_request_error",
|
||||
"object": "error",
|
||||
"param": null,
|
||||
"code": 400
|
||||
}
|
||||
}
|
||||
invalid-prompt:
|
||||
summary: Invalid prompt
|
||||
value: >
|
||||
{
|
||||
"error": {
|
||||
"message": "Please provide a prompt.",
|
||||
"type": "invalid_request_error",
|
||||
"object": "error",
|
||||
"param": null,
|
||||
"code": 400
|
||||
}
|
||||
}
|
||||
description: Bad Request
|
||||
"""
|
||||
COMPLETIONS_SCHEMA = """\
|
||||
---
|
||||
consumes:
|
||||
- application/json
|
||||
description: >-
|
||||
Given a prompt, the model will return one or more predicted completions, and can also return the probabilities of alternative tokens at each position. We recommend most users use our Chat completions API.
|
||||
operationId: openai__completions
|
||||
produces:
|
||||
- application/json
|
||||
tags:
|
||||
- OpenAI
|
||||
x-bentoml-name: create_completions
|
||||
summary: Creates a completion for the provided prompt and parameters.
|
||||
requestBody:
|
||||
required: true
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/CompletionRequest'
|
||||
examples:
|
||||
one-shot:
|
||||
summary: One-shot input example
|
||||
value:
|
||||
prompt: This is a test
|
||||
model: __model_id__
|
||||
max_tokens: 256
|
||||
temperature: 0.7
|
||||
logprobs: null
|
||||
top_p: 0.43
|
||||
n: 1
|
||||
stream: false
|
||||
streaming:
|
||||
summary: Streaming input example
|
||||
value:
|
||||
prompt: This is a test
|
||||
model: __model_id__
|
||||
max_tokens: 256
|
||||
temperature: 0.7
|
||||
top_p: 0.43
|
||||
logprobs: null
|
||||
n: 1
|
||||
stream: true
|
||||
stop:
|
||||
- "<|endoftext|>"
|
||||
responses:
|
||||
200:
|
||||
description: OK
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/CompletionResponse'
|
||||
examples:
|
||||
one-shot:
|
||||
summary: One-shot output example
|
||||
value:
|
||||
id: cmpl-uqkvlQyYK7bGYrRHQ0eXlWi7
|
||||
object: text_completion
|
||||
created: 1589478378
|
||||
model: VAR_model_id
|
||||
choices:
|
||||
- text: This is indeed a test
|
||||
index: 0
|
||||
logprobs: null
|
||||
finish_reason: length
|
||||
usage:
|
||||
prompt_tokens: 5
|
||||
completion_tokens: 7
|
||||
total_tokens: 12
|
||||
streaming:
|
||||
summary: Streaming output example
|
||||
value:
|
||||
id: cmpl-7iA7iJjj8V2zOkCGvWF2hAkDWBQZe
|
||||
object: text_completion
|
||||
created: 1690759702
|
||||
choices:
|
||||
- text: This
|
||||
index: 0
|
||||
logprobs: null
|
||||
finish_reason: null
|
||||
model: gpt-3.5-turbo-instruct
|
||||
404:
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/ErrorResponse'
|
||||
examples:
|
||||
wrong-model:
|
||||
summary: Wrong model
|
||||
value: >
|
||||
{
|
||||
"error": {
|
||||
"message": "Model 'meta-llama--Llama-2-13b-chat-hf' does not exists. Try 'GET /v1/models' to see available models.\\nTip: If you are migrating from OpenAI, make sure to update your 'model' parameters in the request.",
|
||||
"type": "invalid_request_error",
|
||||
"object": "error",
|
||||
"param": null,
|
||||
"code": 404
|
||||
}
|
||||
}
|
||||
description: NotFound
|
||||
500:
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/ErrorResponse'
|
||||
examples:
|
||||
invalid-parameters:
|
||||
summary: Invalid parameters
|
||||
value: >
|
||||
{
|
||||
"error": {
|
||||
"message": "`top_p` has to be a float > 0 and < 1, but is 4.0",
|
||||
"type": "invalid_request_error",
|
||||
"object": "error",
|
||||
"param": null,
|
||||
"code": 500
|
||||
}
|
||||
}
|
||||
description: Internal Server Error
|
||||
400:
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/ErrorResponse'
|
||||
examples:
|
||||
invalid-json:
|
||||
summary: Invalid JSON sent
|
||||
value: >
|
||||
{
|
||||
"error": {
|
||||
"message": "Invalid JSON input received (Check server log).",
|
||||
"type": "invalid_request_error",
|
||||
"object": "error",
|
||||
"param": null,
|
||||
"code": 400
|
||||
}
|
||||
}
|
||||
invalid-prompt:
|
||||
summary: Invalid prompt
|
||||
value: >
|
||||
{
|
||||
"error": {
|
||||
"message": "Please provide a prompt.",
|
||||
"type": "invalid_request_error",
|
||||
"object": "error",
|
||||
"param": null,
|
||||
"code": 400
|
||||
}
|
||||
}
|
||||
description: Bad Request
|
||||
"""
|
||||
HF_AGENT_SCHEMA = """\
|
||||
---
|
||||
consumes:
|
||||
- application/json
|
||||
description: Generate instruction for given HF Agent chain for all OpenLLM supported models.
|
||||
operationId: hf__agent
|
||||
summary: Generate instruction for given HF Agent.
|
||||
tags:
|
||||
- HF
|
||||
x-bentoml-name: hf_agent
|
||||
produces:
|
||||
- application/json
|
||||
requestBody:
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/AgentRequest'
|
||||
example:
|
||||
inputs: "Is the following `text` positive or negative?"
|
||||
parameters:
|
||||
text: "This is a positive text."
|
||||
stop: []
|
||||
required: true
|
||||
responses:
|
||||
200:
|
||||
description: Successfull generated instruction.
|
||||
content:
|
||||
application/json:
|
||||
example:
|
||||
- generated_text: "This is a generated instruction."
|
||||
schema:
|
||||
$ref: '#/components/schemas/AgentResponse'
|
||||
400:
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/HFErrorResponse'
|
||||
description: Bad Request
|
||||
500:
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/HFErrorResponse'
|
||||
description: Not Found
|
||||
"""
|
||||
HF_ADAPTERS_SCHEMA = """\
|
||||
---
|
||||
consumes:
|
||||
- application/json
|
||||
description: Return current list of adapters for given LLM.
|
||||
operationId: hf__adapters_map
|
||||
produces:
|
||||
- application/json
|
||||
summary: Describes a model offering that can be used with the API.
|
||||
tags:
|
||||
- HF
|
||||
x-bentoml-name: hf_adapters
|
||||
responses:
|
||||
200:
|
||||
description: Return list of LoRA adapters.
|
||||
content:
|
||||
application/json:
|
||||
example:
|
||||
aarnphm/opt-6-7b-quotes:
|
||||
adapter_name: default
|
||||
adapter_type: LORA
|
||||
aarnphm/opt-6-7b-dolly:
|
||||
adapter_name: dolly
|
||||
adapter_type: LORA
|
||||
500:
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/HFErrorResponse'
|
||||
description: Not Found
|
||||
"""
|
||||
COHERE_GENERATE_SCHEMA = """\
|
||||
---
|
||||
consumes:
|
||||
- application/json
|
||||
description: >-
|
||||
Given a prompt, the model will return one or more predicted completions, and
|
||||
can also return the probabilities of alternative tokens at each position.
|
||||
operationId: cohere__generate
|
||||
produces:
|
||||
- application/json
|
||||
tags:
|
||||
- Cohere
|
||||
x-bentoml-name: cohere_generate
|
||||
summary: Creates a completion for the provided prompt and parameters.
|
||||
requestBody:
|
||||
required: true
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/CohereGenerateRequest'
|
||||
examples:
|
||||
one-shot:
|
||||
summary: One-shot input example
|
||||
value:
|
||||
prompt: This is a test
|
||||
max_tokens: 256
|
||||
temperature: 0.7
|
||||
p: 0.43
|
||||
k: 12
|
||||
num_generations: 2
|
||||
stream: false
|
||||
streaming:
|
||||
summary: Streaming input example
|
||||
value:
|
||||
prompt: This is a test
|
||||
max_tokens: 256
|
||||
temperature: 0.7
|
||||
p: 0.43
|
||||
k: 12
|
||||
num_generations: 2
|
||||
stream: true
|
||||
stop_sequences:
|
||||
- "<|endoftext|>"
|
||||
"""
|
||||
COHERE_CHAT_SCHEMA = """\
|
||||
---
|
||||
consumes:
|
||||
- application/json
|
||||
description: >-
|
||||
Given a list of messages comprising a conversation, the model will return a response.
|
||||
operationId: cohere__chat
|
||||
produces:
|
||||
- application/json
|
||||
tags:
|
||||
- Cohere
|
||||
x-bentoml-name: cohere_chat
|
||||
summary: Creates a model response for the given chat conversation.
|
||||
"""
|
||||
|
||||
_SCHEMAS = {k[:-7].lower(): v for k, v in locals().items() if k.endswith('_SCHEMA')}
|
||||
|
||||
|
||||
def apply_schema(func, **attrs):
  """Substitute placeholder tokens in ``func.__doc__`` in place and return *func*.

  Each keyword name (e.g. ``__model_id__``) is replaced with its string value
  inside the docstring, which the schema generator later parses as OpenAPI YAML.

  Fix: previously this raised ``AttributeError`` when *func* had no docstring
  (``None.replace``); now a docstring-less function is returned untouched, the
  same guard its sibling ``add_schema_definitions`` already applies.
  """
  if func.__doc__ is None:  # nothing to substitute into
    return func
  for placeholder, value in attrs.items():
    func.__doc__ = func.__doc__.replace(placeholder, value)
  return func
|
||||
|
||||
|
||||
def add_schema_definitions(func):
  """Append the OpenAPI YAML schema matching ``func.__name__`` to its docstring.

  Schemas are looked up in the module-level ``_SCHEMAS`` mapping; functions
  without a registered schema are returned unchanged.
  """
  schema = _SCHEMAS.get(func.__name__.lower(), '')
  if not schema:
    return func
  existing = func.__doc__ or ''
  func.__doc__ = existing.strip() + '\n\n' + schema.strip()
  return func
|
||||
|
||||
|
||||
class OpenLLMSchemaGenerator(SchemaGenerator):
  """Starlette ``SchemaGenerator`` that also descends into mounted sub-apps."""

  def get_endpoints(self, routes):
    # Flatten *routes* — including children of Mount/Host — into EndpointInfo
    # entries, skipping anything excluded from the schema.
    infos = []
    for route in routes:
      if isinstance(route, (Mount, Host)):
        child_routes = route.routes or []
        # Host routes carry no path prefix; Mount routes do.
        prefix = self._remove_converter(route.path) if isinstance(route, Mount) else ''
        infos.extend(
          EndpointInfo(path=f'{prefix}{child.path}', http_method=child.http_method, func=child.func)
          for child in self.get_endpoints(child_routes)
        )
      elif not isinstance(route, Route) or not route.include_in_schema:
        continue
      elif (
        inspect.isfunction(route.endpoint)
        or inspect.ismethod(route.endpoint)
        or isinstance(route.endpoint, functools.partial)
      ):
        # Function-style endpoint: unwrap partials and emit one entry per verb.
        handler = route.endpoint.func if isinstance(route.endpoint, functools.partial) else route.endpoint
        path = self._remove_converter(route.path)
        infos.extend(
          EndpointInfo(path, verb.lower(), handler) for verb in (route.methods or ['GET']) if verb != 'HEAD'
        )
      else:
        # Class-based endpoint: probe for conventional HTTP method attributes.
        path = self._remove_converter(route.path)
        for verb in ('get', 'post', 'put', 'patch', 'delete', 'options'):
          if hasattr(route.endpoint, verb):
            infos.append(EndpointInfo(path, verb, getattr(route.endpoint, verb)))
    return infos

  def get_schema(self, routes, mount_path=None):
    # Build the OpenAPI ``paths`` mapping for *routes*, optionally prefixing
    # every path with *mount_path* (normalised to start with '/').
    schema = dict(self.base_schema)
    schema.setdefault('paths', {})
    if mount_path and not mount_path.startswith('/'):
      mount_path = f'/{mount_path}'

    for endpoint in self.get_endpoints(routes):
      parsed = self.parse_docstring(endpoint.func)
      if not parsed:
        continue
      path = endpoint.path if mount_path is None else mount_path + endpoint.path
      schema['paths'].setdefault(path, {})[endpoint.http_method] = parsed

    return schema
|
||||
|
||||
|
||||
def get_generator(title, components=None, tags=None, inject=True):
  """Construct an ``OpenLLMSchemaGenerator`` for *title*.

  When *inject* is true, component classes are turned into OpenAPI component
  schemas and *tags* (if any) are attached to the base document.
  """
  base = {'info': {'title': title, 'version': API_VERSION}, 'version': OPENAPI_VERSION}
  if inject:
    if components:
      base['components'] = {'schemas': {c.__name__: component_schema_generator(c) for c in components}}
    if tags:
      base['tags'] = tags
  return OpenLLMSchemaGenerator(base)
|
||||
|
||||
|
||||
def component_schema_generator(attr_cls: pydantic.BaseModel, description=None):
  """Generate a minimal OpenAPI object schema for a pydantic model class.

  Maps each field annotation to an OpenAPI type, falling back to ``string``
  for anything unrecognised. Returns the schema dict.

  Fixes over the previous revision:
  - ``isinstance(attr_type, str)`` (etc.) compared a *type object* against
    instance-hood and was always False, so scalar annotations always fell
    through to ``string``; identity comparisons are used instead.
  - ``locals().pop('prop_schema', None)`` is a no-op on a CPython frame, so a
    dict-valued ``prop_schema`` leaked into (and was mutated by) later field
    iterations; it is now reset explicitly per field.
  """
  schema = {'type': 'object', 'required': [], 'properties': {}, 'title': attr_cls.__name__}
  schema['description'] = first_not_none(
    getattr(attr_cls, '__doc__', None), description, default=f'Generated components for {attr_cls.__name__}'
  )
  for name, field in attr_cls.model_fields.items():
    attr_type = field.annotation
    origin_type = t.get_origin(attr_type)
    args_type = t.get_args(attr_type)
    prop_schema = None  # reset per field; set only for the composite cases below

    # Map Python annotations to OpenAPI schema types.
    if attr_type is str:
      schema_type = 'string'
    elif attr_type is int:
      schema_type = 'integer'
    elif attr_type is float:
      schema_type = 'number'
    elif attr_type is bool:
      schema_type = 'boolean'
    elif origin_type is list or origin_type is tuple:
      schema_type = 'array'
    elif origin_type is dict:
      schema_type = 'object'
      # Assuming string keys for simplicity, and handling Any type for values
      prop_schema = {'type': 'object', 'additionalProperties': True if args_type[1] is t.Any else {'type': 'string'}}
    elif attr_type == t.Optional[str]:
      schema_type = 'string'
    elif origin_type is t.Union and t.Any in args_type:
      schema_type = 'object'
      prop_schema = {'type': 'object', 'additionalProperties': True}
    else:
      schema_type = 'string'

    if prop_schema is None:
      prop_schema = {'type': schema_type}
    # NOTE(review): these sentinels come from ``attr``, but ``model_fields``
    # is the pydantic v2 API (missing defaults are ``PydanticUndefined``) —
    # confirm which sentinel callers actually rely on.
    if field.default is not attr.NOTHING and not isinstance(field.default, attr.Factory):
      prop_schema['default'] = field.default
    if field.default is attr.NOTHING and not isinstance(attr_type, type(t.Optional)):
      schema['required'].append(name)
    schema['properties'][name] = prop_schema

  return schema
|
||||
|
||||
|
||||
_SimpleSchema = types.new_class(
|
||||
'_SimpleSchema',
|
||||
(object,),
|
||||
{},
|
||||
lambda ns: ns.update({'__init__': lambda self, it: setattr(self, 'it', it), 'asdict': lambda self: self.it}),
|
||||
)
|
||||
|
||||
|
||||
def append_schemas(svc, generated_schema, tags_order='prepend', inject=True):
  """Merge *generated_schema* (tags, components, paths) into *svc*'s OpenAPI spec.

  HACK: Dirty hack to append schemas to existing service. We def need to
  support mounting Starlette app OpenAPI spec — until then this patches
  BentoML's spec generation to serve the merged document.
  """
  from bentoml._internal.service.openapi.specification import OpenAPISpecification

  if not inject:
    return svc

  merged = svc.openapi_spec
  if isinstance(merged, (OpenAPISpecification, _SimpleSchema)):
    merged = merged.asdict()

  if 'tags' in generated_schema:
    if tags_order == 'prepend':
      merged['tags'] = generated_schema['tags'] + merged['tags']
    elif tags_order == 'append':
      merged['tags'].extend(generated_schema['tags'])
    else:
      raise ValueError(f'Invalid tags_order: {tags_order}')
  if 'components' in generated_schema:
    merged['components']['schemas'].update(generated_schema['components']['schemas'])
  merged['paths'].update(generated_schema['paths'])

  # HACK: mk this attribute until we have a better way to add starlette schemas.
  from bentoml._internal.service import openapi

  def _patched_generate_spec(svc, openapi_version=OPENAPI_VERSION):
    return _SimpleSchema(merged)

  def _patched_asdict(self):
    return merged

  # Monkeypatch module-global behaviour so the merged spec is what gets served.
  openapi.generate_spec = _patched_generate_spec
  OpenAPISpecification.asdict = _patched_asdict
  return svc
|
||||
@@ -1,29 +0,0 @@
|
||||
from typing import Any, Callable, Dict, List, Literal, Optional, Type
|
||||
|
||||
from attr import AttrsInstance
|
||||
from starlette.routing import BaseRoute
|
||||
from starlette.schemas import EndpointInfo
|
||||
|
||||
from bentoml import Service
|
||||
from openllm_core._typing_compat import ParamSpec
|
||||
|
||||
P = ParamSpec('P')
|
||||
|
||||
class OpenLLMSchemaGenerator:
|
||||
base_schema: Dict[str, Any]
|
||||
def get_endpoints(self, routes: list[BaseRoute]) -> list[EndpointInfo]: ...
|
||||
def get_schema(self, routes: list[BaseRoute], mount_path: Optional[str] = ...) -> Dict[str, Any]: ...
|
||||
def parse_docstring(self, func_or_method: Callable[P, Any]) -> Dict[str, Any]: ...
|
||||
|
||||
def apply_schema(func: Callable[P, Any], **attrs: Any) -> Callable[P, Any]: ...
|
||||
def add_schema_definitions(func: Callable[P, Any]) -> Callable[P, Any]: ...
|
||||
def append_schemas(
|
||||
svc: Service, generated_schema: Dict[str, Any], tags_order: Literal['prepend', 'append'] = ..., inject: bool = ...
|
||||
) -> Service: ...
|
||||
def component_schema_generator(attr_cls: Type[AttrsInstance], description: Optional[str] = ...) -> Dict[str, Any]: ...
|
||||
def get_generator(
|
||||
title: str,
|
||||
components: Optional[List[Type[AttrsInstance]]] = ...,
|
||||
tags: Optional[List[Dict[str, Any]]] = ...,
|
||||
inject: bool = ...,
|
||||
) -> OpenLLMSchemaGenerator: ...
|
||||
@@ -1,63 +0,0 @@
|
||||
import functools, logging
|
||||
from http import HTTPStatus
|
||||
import orjson
|
||||
from starlette.applications import Starlette
|
||||
from starlette.responses import JSONResponse
|
||||
from starlette.routing import Route
|
||||
from openllm_core.utils import converter
|
||||
from ._openapi import add_schema_definitions, append_schemas, get_generator
|
||||
from ..protocol.hf import AgentRequest, AgentResponse, HFErrorResponse
|
||||
|
||||
schemas = get_generator(
|
||||
'hf',
|
||||
components=[AgentRequest, AgentResponse, HFErrorResponse],
|
||||
tags=[
|
||||
{
|
||||
'name': 'HF',
|
||||
'description': 'HF integration, including Agent and others schema endpoints.',
|
||||
'externalDocs': 'https://huggingface.co/docs/transformers/main_classes/agent',
|
||||
}
|
||||
],
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def mount_to_svc(svc, llm):
  """Mount the HF Starlette app under ``/hf`` and merge its OpenAPI schema into *svc*."""
  hf_routes = [
    Route('/agent', endpoint=functools.partial(hf_agent, llm=llm), name='hf_agent', methods=['POST']),
    # Raw schema endpoint; excluded from the generated OpenAPI document itself.
    Route('/schema', endpoint=lambda req: schemas.OpenAPIResponse(req), include_in_schema=False),
  ]
  app = Starlette(debug=True, routes=hf_routes)
  svc.mount_asgi_app(app, path='/hf')
  return append_schemas(svc, schemas.get_schema(routes=app.routes, mount_path='/hf'), tags_order='append')
|
||||
|
||||
|
||||
def error_response(status_code, message):
  """Build a JSONResponse wrapping an ``HFErrorResponse`` for *status_code*."""
  code = status_code.value
  payload = converter.unstructure(HFErrorResponse(message=message, error_code=code))
  return JSONResponse(payload, status_code=code)
|
||||
|
||||
|
||||
# POST /hf/agent — no docstring here on purpose: the decorator installs the
# OpenAPI YAML as __doc__, which the schema generator parses at runtime.
@add_schema_definitions
async def hf_agent(req, llm):
  raw_body = await req.body()
  try:
    request = converter.structure(orjson.loads(raw_body), AgentRequest)
  except orjson.JSONDecodeError as err:
    logger.debug('Sent body: %s', raw_body)
    logger.error('Invalid JSON input received: %s', err)
    return error_response(HTTPStatus.BAD_REQUEST, 'Invalid JSON input received (Check server log).')

  # ``stop`` travels separately from the remaining generation parameters.
  stop = request.parameters.pop('stop', [])
  try:
    result = await llm.generate(request.inputs, stop=stop, **request.parameters)
    return JSONResponse(
      converter.unstructure([AgentResponse(generated_text=result.outputs[0].text)]),
      status_code=HTTPStatus.OK.value,
    )
  except Exception as err:
    logger.error('Error while generating: %s', err)
    return error_response(HTTPStatus.INTERNAL_SERVER_ERROR, 'Error while generating (Check server log).')
|
||||
@@ -1,14 +0,0 @@
|
||||
from http import HTTPStatus
|
||||
|
||||
from starlette.requests import Request
|
||||
from starlette.responses import JSONResponse, Response
|
||||
|
||||
from bentoml import Service
|
||||
from openllm_core._typing_compat import M, T
|
||||
|
||||
from .._llm import LLM
|
||||
|
||||
def mount_to_svc(svc: Service, llm: LLM[M, T]) -> Service: ...
|
||||
def error_response(status_code: HTTPStatus, message: str) -> JSONResponse: ...
|
||||
async def hf_agent(req: Request, llm: LLM[M, T]) -> Response: ...
|
||||
def hf_adapters(req: Request, llm: LLM[M, T]) -> Response: ...
|
||||
@@ -1,448 +0,0 @@
|
||||
import functools
|
||||
import logging
|
||||
import time
|
||||
import traceback
|
||||
from http import HTTPStatus
|
||||
|
||||
import orjson
|
||||
from starlette.applications import Starlette
|
||||
from starlette.responses import JSONResponse, StreamingResponse
|
||||
from starlette.routing import Route
|
||||
|
||||
from openllm_core.utils import converter, gen_random_uuid
|
||||
|
||||
from ._openapi import add_schema_definitions, append_schemas, apply_schema, get_generator
|
||||
from openllm_core.protocol.openai import (
|
||||
ChatCompletionRequest,
|
||||
ChatCompletionResponse,
|
||||
ChatCompletionResponseChoice,
|
||||
ChatCompletionResponseStreamChoice,
|
||||
ChatCompletionStreamResponse,
|
||||
ChatMessage,
|
||||
CompletionRequest,
|
||||
CompletionResponse,
|
||||
CompletionResponseChoice,
|
||||
CompletionResponseStreamChoice,
|
||||
CompletionStreamResponse,
|
||||
Delta,
|
||||
ErrorResponse,
|
||||
LogProbs,
|
||||
ModelCard,
|
||||
ModelList,
|
||||
UsageInfo,
|
||||
)
|
||||
|
||||
schemas = get_generator(
|
||||
'openai',
|
||||
components=[
|
||||
ErrorResponse,
|
||||
ModelList,
|
||||
ChatCompletionResponse,
|
||||
ChatCompletionRequest,
|
||||
ChatCompletionStreamResponse,
|
||||
CompletionRequest,
|
||||
CompletionResponse,
|
||||
CompletionStreamResponse,
|
||||
],
|
||||
tags=[
|
||||
{
|
||||
'name': 'OpenAI',
|
||||
'description': 'OpenAI Compatible API support',
|
||||
'externalDocs': 'https://platform.openai.com/docs/api-reference/completions/object',
|
||||
}
|
||||
],
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def jsonify_attr(obj):
  """Serialise a cattrs-structured object to a JSON string."""
  unstructured = converter.unstructure(obj)
  return orjson.dumps(unstructured).decode()
|
||||
|
||||
|
||||
def error_response(status_code, message):
  """Return an OpenAI-style ``{'error': ...}`` body wrapped in a JSONResponse."""
  body = ErrorResponse(message=message, type='invalid_request_error', code=str(status_code.value))
  return JSONResponse({'error': converter.unstructure(body)}, status_code=status_code.value)
|
||||
|
||||
|
||||
async def check_model(request, model):  # noqa
  """Return ``None`` when *request.model* matches *model*, else a 404 error response."""
  if request.model != model:
    return error_response(
      HTTPStatus.NOT_FOUND,
      f"Model '{request.model}' does not exists. Try 'GET /v1/models' to see available models.\nTip: If you are migrating from OpenAI, make sure to update your 'model' parameters in the request.",
    )
  return None
|
||||
|
||||
|
||||
def create_logprobs(token_ids, top_logprobs, num_output_top_logprobs=None, initial_text_offset=0, *, llm):
  """Convert per-token logprob data into an OpenAI-style ``LogProbs`` object.

  ``top_logprobs`` is indexed in lockstep with ``token_ids``; each step is
  either ``None`` or a mapping from token id to logprob. Text offsets are
  accumulated from the decoded token lengths, starting at
  ``initial_text_offset``.
  """
  logprobs = LogProbs()
  prev_token_len = 0
  if num_output_top_logprobs:
    logprobs.top_logprobs = []
  for idx, token_id in enumerate(token_ids):
    step_top = top_logprobs[idx]
    token_logprob = step_top[token_id] if step_top is not None else None
    token = llm.tokenizer.convert_ids_to_tokens(token_id)
    logprobs.tokens.append(token)
    logprobs.token_logprobs.append(token_logprob)
    # First token starts at the caller-provided offset; later ones advance by
    # the previous token's decoded length.
    if logprobs.text_offset:
      logprobs.text_offset.append(logprobs.text_offset[-1] + prev_token_len)
    else:
      logprobs.text_offset.append(initial_text_offset)
    prev_token_len = len(token)
    if num_output_top_logprobs:
      logprobs.top_logprobs.append(
        {llm.tokenizer.convert_ids_to_tokens(i): p for i, p in step_top.items()} if step_top else None
      )
  return logprobs
|
||||
|
||||
|
||||
def mount_to_svc(svc, llm):
  """Mount the OpenAI-compatible endpoints onto *svc* under the '/v1' prefix."""
  # Substitute the model-id placeholder in each endpoint docstring so the
  # generated OpenAPI schema shows the concrete model.
  for endpoint in (list_models, completions, chat_completions):
    endpoint.__doc__ = endpoint.__doc__.replace('__model_id__', llm.llm_type)
  routes = [
    Route('/models', functools.partial(apply_schema(list_models, __model_id__=llm.llm_type), llm=llm), methods=['GET']),
    Route(
      '/completions',
      functools.partial(apply_schema(completions, __model_id__=llm.llm_type), llm=llm),
      methods=['POST'],
    ),
    Route(
      '/chat/completions',
      functools.partial(
        apply_schema(
          chat_completions,
          __model_id__=llm.llm_type,
          __chat_template__=orjson.dumps(llm.config.chat_template).decode(),
          __chat_messages__=orjson.dumps(llm.config.chat_messages).decode(),
          __add_generation_prompt__=str(True) if llm.config.chat_messages is not None else str(False),
        ),
        llm=llm,
      ),
      methods=['POST'],
    ),
    Route('/schema', endpoint=lambda req: schemas.OpenAPIResponse(req), include_in_schema=False),
  ]
  app = Starlette(debug=True, routes=routes)
  svc.mount_asgi_app(app, path='/v1')
  return append_schemas(svc, schemas.get_schema(routes=app.routes, mount_path='/v1'))
|
||||
|
||||
|
||||
# GET /v1/models
@add_schema_definitions
def list_models(_, llm):
  # Single-model service: respond with exactly one ModelCard for the running model.
  # NOTE: no docstring here — __doc__ is managed by add_schema_definitions/mount_to_svc.
  card = ModelCard(id=llm.llm_type)
  return JSONResponse(converter.unstructure(ModelList(data=[card])), status_code=HTTPStatus.OK.value)
|
||||
|
||||
|
||||
# POST /v1/chat/completions
@add_schema_definitions
async def chat_completions(req, llm):
  # Parses an OpenAI ChatCompletionRequest from the body, renders the chat
  # template, then either streams SSE chunks or returns a single JSON response.
  # TODO: Check for length based on model context_length
  json_str = await req.body()
  try:
    request = converter.structure(orjson.loads(json_str), ChatCompletionRequest)
  except orjson.JSONDecodeError as err:
    logger.debug('Sent body: %s', json_str)
    logger.error('Invalid JSON input received: %s', err)
    return error_response(HTTPStatus.BAD_REQUEST, 'Invalid JSON input received (Check server log).')
  logger.debug('Received chat completion request: %s', request)
  err_check = await check_model(request, llm.llm_type)
  if err_check is not None:
    return err_check

  if request.logit_bias is not None and len(request.logit_bias) > 0:
    return error_response(HTTPStatus.BAD_REQUEST, "'logit_bias' is not yet supported.")

  model_name, request_id = request.model, gen_random_uuid('chatcmpl')
  # FIX: OpenAI's 'created' field is a Unix epoch timestamp; time.monotonic()
  # has an arbitrary reference point and must not be used here.
  created_time = int(time.time())
  prompt = llm.tokenizer.apply_chat_template(
    request.messages,
    tokenize=False,
    chat_template=request.chat_template if request.chat_template != 'None' else None,
    add_generation_prompt=request.add_generation_prompt,
  )
  logger.debug('Prompt: %r', prompt)
  config = llm.config.compatible_options(request)

  def get_role() -> str:
    return (
      request.messages[-1]['role'] if not request.add_generation_prompt else 'assistant'
    )  # TODO: Support custom role here.

  try:
    result_generator = llm.generate_iterator(prompt, request_id=request_id, **config)
  except Exception as err:
    traceback.print_exc()
    logger.error('Error generating completion: %s', err)
    return error_response(HTTPStatus.INTERNAL_SERVER_ERROR, f'Exception: {err!s} (check server log)')

  def create_stream_response_json(index, text, finish_reason=None, usage=None):
    # One SSE data chunk for choice *index* carrying *text* as the delta.
    response = ChatCompletionStreamResponse(
      id=request_id,
      created=created_time,
      model=model_name,
      choices=[
        ChatCompletionResponseStreamChoice(index=index, delta=Delta(content=text), finish_reason=finish_reason)
      ],
    )
    if usage is not None:
      response.usage = usage
    return jsonify_attr(response)

  async def completion_stream_generator():
    # first chunk with role
    role = get_role()
    for i in range(config['n']):
      yield f'data: {jsonify_attr(ChatCompletionStreamResponse(id=request_id, created=created_time, choices=[ChatCompletionResponseStreamChoice(index=i, delta=Delta(role=role), finish_reason=None)], model=model_name))}\n\n'

    if request.echo:
      last_message, last_content = request.messages[-1], ''
      if last_message.get('content') and last_message.get('role') == role:
        last_content = last_message['content']
      if last_content:
        for i in range(config['n']):
          yield f'data: {jsonify_attr(ChatCompletionStreamResponse(id=request_id, created=created_time, choices=[ChatCompletionResponseStreamChoice(index=i, delta=Delta(content=last_content), finish_reason=None)], model=model_name))}\n\n'

    previous_num_tokens = [0] * config['n']
    finish_reason_sent = [False] * config['n']
    async for res in result_generator:
      for output in res.outputs:
        if finish_reason_sent[output.index]:
          continue
        yield f'data: {create_stream_response_json(output.index, output.text)}\n\n'
        previous_num_tokens[output.index] += len(output.token_ids)
        if output.finish_reason is not None:
          prompt_tokens = len(res.prompt_token_ids)
          # FIX: look up the completion-token count by this output's index,
          # not the stale loop variable 'i' left over from the role/echo
          # chunks above (wrong usage whenever n > 1).
          num_tokens = previous_num_tokens[output.index]
          usage = UsageInfo(prompt_tokens, num_tokens, prompt_tokens + num_tokens)
          yield f'data: {create_stream_response_json(output.index, "", output.finish_reason, usage)}\n\n'
          finish_reason_sent[output.index] = True
    yield 'data: [DONE]\n\n'

  try:
    # Streaming case
    if request.stream:
      return StreamingResponse(completion_stream_generator(), media_type='text/event-stream')
    # Non-streaming case
    # FIX: '[[]] * n' repeats one shared list n times, so every choice
    # accumulated into the same buffer; build independent lists per choice.
    final_result = None
    texts = [[] for _ in range(config['n'])]
    token_ids = [[] for _ in range(config['n'])]
    async for res in result_generator:
      if await req.is_disconnected():
        return error_response(HTTPStatus.BAD_REQUEST, 'Client disconnected.')
      for output in res.outputs:
        texts[output.index].append(output.text)
        token_ids[output.index].extend(output.token_ids)
      final_result = res
    if final_result is None:
      return error_response(HTTPStatus.BAD_REQUEST, 'No response from model.')
    # Rebuild outputs with the fully accumulated text/token ids.
    final_result = final_result.model_copy(
      update=dict(
        outputs=[
          output.model_copy(update=dict(text=''.join(texts[output.index]), token_ids=token_ids[output.index]))
          for output in final_result.outputs
        ]
      )
    )

    role = get_role()
    choices = [
      ChatCompletionResponseChoice(
        index=output.index, message=ChatMessage(role=role, content=output.text), finish_reason=output.finish_reason
      )
      for output in final_result.outputs
    ]
    if request.echo:
      last_message, last_content = request.messages[-1], ''
      if last_message.get('content') and last_message.get('role') == role:
        last_content = last_message['content']
      for choice in choices:
        full_message = last_content + choice.message.content
        choice.message.content = full_message

    num_prompt_tokens = len(final_result.prompt_token_ids)
    num_generated_tokens = sum(len(output.token_ids) for output in final_result.outputs)
    usage = UsageInfo(num_prompt_tokens, num_generated_tokens, num_prompt_tokens + num_generated_tokens)
    response = ChatCompletionResponse(
      id=request_id, created=created_time, model=model_name, usage=usage, choices=choices
    )
    return JSONResponse(converter.unstructure(response), status_code=HTTPStatus.OK.value)
  except Exception as err:
    traceback.print_exc()
    logger.error('Error generating completion: %s', err)
    return error_response(HTTPStatus.INTERNAL_SERVER_ERROR, f'Exception: {err!s} (check server log)')
|
||||
|
||||
|
||||
# POST /v1/completions
@add_schema_definitions
async def completions(req, llm):
  # Legacy OpenAI completions endpoint: parses a CompletionRequest, then
  # either streams SSE chunks or returns a single JSON response.
  # TODO: Check for length based on model context_length
  json_str = await req.body()
  try:
    request = converter.structure(orjson.loads(json_str), CompletionRequest)
  except orjson.JSONDecodeError as err:
    logger.debug('Sent body: %s', json_str)
    logger.error('Invalid JSON input received: %s', err)
    return error_response(HTTPStatus.BAD_REQUEST, 'Invalid JSON input received (Check server log).')
  logger.debug('Received legacy completion request: %s', request)
  err_check = await check_model(request, llm.llm_type)
  if err_check is not None:
    return err_check

  # OpenAI API supports echoing the prompt when max_tokens is 0.
  echo_without_generation = request.echo and request.max_tokens == 0
  if echo_without_generation:
    request.max_tokens = 1  # XXX: Hack to make sure we get the prompt back.

  if request.suffix is not None:
    return error_response(HTTPStatus.BAD_REQUEST, "'suffix' is not yet supported.")
  if request.logit_bias is not None and len(request.logit_bias) > 0:
    return error_response(HTTPStatus.BAD_REQUEST, "'logit_bias' is not yet supported.")

  if not request.prompt:
    return error_response(HTTPStatus.BAD_REQUEST, 'Please provide a prompt.')
  prompt = request.prompt
  # TODO: Support multiple prompts

  model_name, request_id = request.model, gen_random_uuid('cmpl')
  # FIX: OpenAI's 'created' field is a Unix epoch timestamp; time.monotonic()
  # has an arbitrary reference point and must not be used here.
  created_time = int(time.time())
  config = llm.config.compatible_options(request)

  try:
    result_generator = llm.generate_iterator(prompt, request_id=request_id, **config)
  except Exception as err:
    traceback.print_exc()
    logger.error('Error generating completion: %s', err)
    return error_response(HTTPStatus.INTERNAL_SERVER_ERROR, f'Exception: {err!s} (check server log)')

  # best_of != n then we don't stream
  # TODO: support use_beam_search
  stream = request.stream and (config['best_of'] is None or config['n'] == config['best_of'])

  def create_stream_response_json(index, text, logprobs=None, finish_reason=None, usage=None):
    # One SSE data chunk for choice *index* carrying *text*.
    response = CompletionStreamResponse(
      id=request_id,
      created=created_time,
      model=model_name,
      choices=[CompletionResponseStreamChoice(index=index, text=text, logprobs=logprobs, finish_reason=finish_reason)],
    )
    if usage:
      response.usage = usage
    return jsonify_attr(response)

  async def completion_stream_generator():
    previous_num_tokens = [0] * config['n']
    previous_texts = [''] * config['n']
    previous_echo = [False] * config['n']
    async for res in result_generator:
      for output in res.outputs:
        i = output.index
        delta_text = output.text
        token_ids = output.token_ids
        logprobs = None
        top_logprobs = None
        if request.logprobs is not None:
          top_logprobs = output.logprobs[previous_num_tokens[i] :]

        if request.echo and not previous_echo[i]:
          # Prepend the prompt (text/tokens/logprobs) exactly once per choice.
          if not echo_without_generation:
            delta_text = res.prompt + delta_text
            token_ids = res.prompt_token_ids + token_ids
            if top_logprobs:
              top_logprobs = res.prompt_logprobs + top_logprobs
          else:
            delta_text = res.prompt
            token_ids = res.prompt_token_ids
            if top_logprobs:
              top_logprobs = res.prompt_logprobs
          previous_echo[i] = True
        if request.logprobs is not None:
          logprobs = create_logprobs(
            output.token_ids,
            output.logprobs[previous_num_tokens[i] :],
            request.logprobs,
            len(previous_texts[i]),
            llm=llm,
          )
        previous_num_tokens[i] += len(output.token_ids)
        previous_texts[i] += output.text
        yield f'data: {create_stream_response_json(index=i, text=output.text, logprobs=logprobs, finish_reason=output.finish_reason)}\n\n'
        if output.finish_reason is not None:
          logprobs = LogProbs() if request.logprobs is not None else None
          prompt_tokens = len(res.prompt_token_ids)
          usage = UsageInfo(prompt_tokens, previous_num_tokens[i], prompt_tokens + previous_num_tokens[i])
          yield f'data: {create_stream_response_json(i, "", logprobs, output.finish_reason, usage)}\n\n'
    yield 'data: [DONE]\n\n'

  try:
    # Streaming case
    if stream:
      return StreamingResponse(completion_stream_generator(), media_type='text/event-stream')
    # Non-streaming case
    # FIX: '[[]] * n' repeats one shared list n times, so every choice
    # accumulated into the same buffer; build independent lists per choice.
    final_result = None
    texts = [[] for _ in range(config['n'])]
    token_ids = [[] for _ in range(config['n'])]
    async for res in result_generator:
      if await req.is_disconnected():
        return error_response(HTTPStatus.BAD_REQUEST, 'Client disconnected.')
      for output in res.outputs:
        texts[output.index].append(output.text)
        token_ids[output.index].extend(output.token_ids)
      final_result = res
    if final_result is None:
      return error_response(HTTPStatus.BAD_REQUEST, 'No response from model.')
    # Rebuild outputs with the fully accumulated text/token ids.
    final_result = final_result.model_copy(
      update=dict(
        outputs=[
          output.model_copy(update=dict(text=''.join(texts[output.index]), token_ids=token_ids[output.index]))
          for output in final_result.outputs
        ]
      )
    )

    choices = []
    prompt_token_ids = final_result.prompt_token_ids
    prompt_logprobs = final_result.prompt_logprobs
    prompt_text = final_result.prompt
    for output in final_result.outputs:
      logprobs = None
      if request.logprobs is not None:
        if not echo_without_generation:
          token_ids, top_logprobs = output.token_ids, output.logprobs
          if request.echo:
            token_ids, top_logprobs = prompt_token_ids + token_ids, prompt_logprobs + top_logprobs
        else:
          token_ids, top_logprobs = prompt_token_ids, prompt_logprobs
        logprobs = create_logprobs(token_ids, top_logprobs, request.logprobs, llm=llm)
      if not echo_without_generation:
        output_text = output.text
        if request.echo:
          output_text = prompt_text + output_text
      else:
        output_text = prompt_text
      choice_data = CompletionResponseChoice(
        index=output.index, text=output_text, logprobs=logprobs, finish_reason=output.finish_reason
      )
      choices.append(choice_data)

    num_prompt_tokens = len(final_result.prompt_token_ids)
    num_generated_tokens = sum(len(output.token_ids) for output in final_result.outputs)
    usage = UsageInfo(num_prompt_tokens, num_generated_tokens, num_prompt_tokens + num_generated_tokens)
    response = CompletionResponse(id=request_id, created=created_time, model=model_name, usage=usage, choices=choices)
    return JSONResponse(converter.unstructure(response), status_code=HTTPStatus.OK.value)
  except Exception as err:
    traceback.print_exc()
    logger.error('Error generating completion: %s', err)
    return error_response(HTTPStatus.INTERNAL_SERVER_ERROR, f'Exception: {err!s} (check server log)')
|
||||
@@ -1,30 +0,0 @@
|
||||
# Type stubs for the OpenAI-compatible entrypoints module.
# Signatures only — the implementations live in the runtime module.
from http import HTTPStatus
from typing import Dict, List, Optional, Union

from attr import AttrsInstance
from starlette.requests import Request
from starlette.responses import JSONResponse, Response

from bentoml import Service
from openllm_core._typing_compat import M, T

from .._llm import LLM
from ..protocol.openai import ChatCompletionRequest, CompletionRequest, LogProbs

# Mounts the OpenAI-compatible ASGI routes onto the given service.
def mount_to_svc(svc: Service, llm: LLM[M, T]) -> Service: ...

# Serialises an attrs instance to a JSON string.
def jsonify_attr(obj: AttrsInstance) -> str: ...

# Builds an OpenAI-style error JSON response.
def error_response(status_code: HTTPStatus, message: str) -> JSONResponse: ...

# Returns None if the requested model matches, otherwise an error response.
async def check_model(
    request: Union[CompletionRequest, ChatCompletionRequest], model: str
) -> Optional[JSONResponse]: ...

# Converts per-token logprobs into OpenAI-style LogProbs.
def create_logprobs(
    token_ids: List[int],
    top_logprobs: List[Dict[int, float]],  #
    num_output_top_logprobs: Optional[int] = ...,
    initial_text_offset: int = ...,
    *,
    llm: LLM[M, T],
) -> LogProbs: ...

# Endpoint handlers (GET /v1/models, POST /v1/chat/completions, POST /v1/completions).
def list_models(req: Request, llm: LLM[M, T]) -> Response: ...
async def chat_completions(req: Request, llm: LLM[M, T]) -> Response: ...
async def completions(req: Request, llm: LLM[M, T]) -> Response: ...
|
||||
Reference in New Issue
Block a user