chore(releases): remove dead code

Signed-off-by: Aaron Pham (mbp16) <29749331+aarnphm@users.noreply.github.com>
This commit is contained in:
Aaron Pham (mbp16)
2024-05-27 12:37:50 -04:00
parent da42c269c9
commit f4f7f16e81
38 changed files with 97 additions and 3291 deletions

View File

@@ -27,14 +27,11 @@ __lazy = utils.LazyModule( # NOTE: update this to sys.modules[__name__] once my
'exceptions': [],
'client': ['HTTPClient', 'AsyncHTTPClient'],
'bundle': [],
'testing': [],
'utils': ['api'],
'entrypoints': ['mount_entrypoints'],
'serialisation': ['ggml', 'transformers', 'vllm'],
'_llm': ['LLM'],
'_deprecated': ['Runner'],
'_runners': ['runner'],
'_quantisation': ['infer_quantisation_config'],
'_strategies': ['CascadingResourceStrategy', 'get_resource'],
},
extra_objects={'COMPILED': COMPILED},
@@ -44,7 +41,7 @@ __all__, __dir__ = __lazy.__all__, __lazy.__dir__
_BREAKING_INTERNAL = ['_service', '_service_vars']
_NEW_IMPL = ['LLM', *_BREAKING_INTERNAL]
if (_BENTOML_VERSION := utils.pkg.pkg_version_info('bentoml')) > (1, 2):
if utils.pkg.pkg_version_info('bentoml') > (1, 2):
import _openllm_tiny as _tiny
else:
_tiny = None
@@ -58,7 +55,7 @@ def __getattr__(name: str) -> _t.Any:
f'"{name}" is an internal implementation and considered breaking with older OpenLLM. Please migrate your code if you depend on this.'
)
_warnings.warn(
f'"{name}" is considered deprecated implementation and will be removed in the future. Make sure to upgrade to OpenLLM 0.5.x',
f'"{name}" is considered deprecated implementation and could be breaking. See https://github.com/bentoml/OpenLLM for more information on upgrading instruction.',
DeprecationWarning,
stacklevel=3,
)

View File

@@ -26,7 +26,6 @@ from . import (
exceptions as exceptions,
serialisation as serialisation,
utils as utils,
entrypoints as entrypoints,
)
from .serialisation import ggml as ggml, transformers as transformers, vllm as vllm
from ._deprecated import Runner as Runner

View File

@@ -1,16 +0,0 @@
import importlib
from openllm_core.utils import LazyModule
_import_structure = {'openai': [], 'hf': []}
def mount_entrypoints(svc, llm):
    """Mount every known third-party integration (keys of ``_import_structure``) onto ``svc``."""
    for name in _import_structure:
        module = importlib.import_module(f'.{name}', __name__)
        svc = module.mount_to_svc(svc, llm)
    return svc
__lazy = LazyModule(
__name__, globals()['__file__'], _import_structure, extra_objects={'mount_entrypoints': mount_entrypoints}
)
__all__, __dir__, __getattr__ = __lazy.__all__, __lazy.__dir__, __lazy.__getattr__

View File

@@ -1,17 +0,0 @@
"""Entrypoint for all third-party apps.
Currently support OpenAI compatible API.
Each module should implement the following API:
- `mount_to_svc(svc: bentoml.Service, llm: openllm.LLM[M, T]) -> bentoml.Service: ...`
"""
from typing import Any
from _bentoml_sdk import Service
from openllm_core._typing_compat import M, T
from . import hf as hf, openai as openai
from .._llm import LLM
# Stub only: the runtime implementation mounts each third-party integration onto the service.
def mount_entrypoints(svc: Service[Any], llm: LLM[M, T]) -> Service: ...

View File

@@ -1,641 +0,0 @@
from __future__ import annotations
import functools
import inspect
import types
import typing as t
import attr
from starlette.routing import Host, Mount, Route
from starlette.schemas import EndpointInfo, SchemaGenerator
from openllm_core.utils import first_not_none
if t.TYPE_CHECKING:
import pydantic
OPENAPI_VERSION, API_VERSION = '3.0.2', '1.0'
# NOTE: OpenAI schema
LIST_MODELS_SCHEMA = """\
---
consumes:
- application/json
description: >
List and describe the various models available in the API.
You can refer to the available supported models with `openllm models` for more
information.
operationId: openai__list_models
produces:
- application/json
summary: Describes a model offering that can be used with the API.
tags:
- OpenAI
x-bentoml-name: list_models
responses:
200:
description: The Model object
content:
application/json:
example:
object: 'list'
data:
- id: __model_id__
object: model
created: 1686935002
owned_by: 'na'
schema:
$ref: '#/components/schemas/ModelList'
"""
CHAT_COMPLETIONS_SCHEMA = """\
---
consumes:
- application/json
description: >-
Given a list of messages comprising a conversation, the model will return a
response.
operationId: openai__chat_completions
produces:
- application/json
tags:
- OpenAI
x-bentoml-name: create_chat_completions
summary: Creates a model response for the given chat conversation.
requestBody:
required: true
content:
application/json:
examples:
one-shot:
summary: One-shot input example
value:
messages: __chat_messages__
model: __model_id__
max_tokens: 256
temperature: 0.7
top_p: 0.43
n: 1
stream: false
chat_template: __chat_template__
add_generation_prompt: __add_generation_prompt__
echo: false
streaming:
summary: Streaming input example
value:
messages:
- role: system
content: You are a helpful assistant.
- role: user
content: Hello, I'm looking for a chatbot that can help me with my work.
model: __model_id__
max_tokens: 256
temperature: 0.7
top_p: 0.43
n: 1
stream: true
stop:
- "<|endoftext|>"
chat_template: __chat_template__
add_generation_prompt: __add_generation_prompt__
echo: false
schema:
$ref: '#/components/schemas/ChatCompletionRequest'
responses:
200:
description: OK
content:
application/json:
schema:
$ref: '#/components/schemas/ChatCompletionResponse'
examples:
streaming:
summary: Streaming output example
value: >
{"id":"chatcmpl-123","object":"chat.completion.chunk","created":1694268190,"model":"gpt-3.5-turbo-0613","choices":[{"index":0,"delta":{"role":"assistant","content":""},"finish_reason":null}]}
one-shot:
summary: One-shot output example
value: >
{"id": "chatcmpl-123", "object": "chat.completion", "created": 1677652288, "model": "gpt-3.5-turbo-0613", "choices": [{"index": 0, "message": {"role": "assistant", "content": "Hello there, how may I assist you today?"}, "finish_reason": "stop"}], "usage": {"prompt_tokens": 9, "completion_tokens": 12, "total_tokens": 21}}
404:
content:
application/json:
schema:
$ref: '#/components/schemas/ErrorResponse'
examples:
wrong-model:
summary: Wrong model
value: >
{
"error": {
"message": "Model 'meta-llama--Llama-2-13b-chat-hf' does not exists. Try 'GET /v1/models' to see available models.\\nTip: If you are migrating from OpenAI, make sure to update your 'model' parameters in the request.",
"type": "invalid_request_error",
"object": "error",
"param": null,
"code": 404
}
}
description: NotFound
500:
content:
application/json:
schema:
$ref: '#/components/schemas/ErrorResponse'
examples:
invalid-parameters:
summary: Invalid parameters
value: >
{
"error": {
"message": "`top_p` has to be a float > 0 and < 1, but is 4.0",
"type": "invalid_request_error",
"object": "error",
"param": null,
"code": 500
}
}
description: Internal Server Error
400:
content:
application/json:
schema:
$ref: '#/components/schemas/ErrorResponse'
examples:
invalid-json:
summary: Invalid JSON sent
value: >
{
"error": {
"message": "Invalid JSON input received (Check server log).",
"type": "invalid_request_error",
"object": "error",
"param": null,
"code": 400
}
}
invalid-prompt:
summary: Invalid prompt
value: >
{
"error": {
"message": "Please provide a prompt.",
"type": "invalid_request_error",
"object": "error",
"param": null,
"code": 400
}
}
description: Bad Request
"""
COMPLETIONS_SCHEMA = """\
---
consumes:
- application/json
description: >-
Given a prompt, the model will return one or more predicted completions, and can also return the probabilities of alternative tokens at each position. We recommend most users use our Chat completions API.
operationId: openai__completions
produces:
- application/json
tags:
- OpenAI
x-bentoml-name: create_completions
summary: Creates a completion for the provided prompt and parameters.
requestBody:
required: true
content:
application/json:
schema:
$ref: '#/components/schemas/CompletionRequest'
examples:
one-shot:
summary: One-shot input example
value:
prompt: This is a test
model: __model_id__
max_tokens: 256
temperature: 0.7
logprobs: null
top_p: 0.43
n: 1
stream: false
streaming:
summary: Streaming input example
value:
prompt: This is a test
model: __model_id__
max_tokens: 256
temperature: 0.7
top_p: 0.43
logprobs: null
n: 1
stream: true
stop:
- "<|endoftext|>"
responses:
200:
description: OK
content:
application/json:
schema:
$ref: '#/components/schemas/CompletionResponse'
examples:
one-shot:
summary: One-shot output example
value:
id: cmpl-uqkvlQyYK7bGYrRHQ0eXlWi7
object: text_completion
created: 1589478378
model: VAR_model_id
choices:
- text: This is indeed a test
index: 0
logprobs: null
finish_reason: length
usage:
prompt_tokens: 5
completion_tokens: 7
total_tokens: 12
streaming:
summary: Streaming output example
value:
id: cmpl-7iA7iJjj8V2zOkCGvWF2hAkDWBQZe
object: text_completion
created: 1690759702
choices:
- text: This
index: 0
logprobs: null
finish_reason: null
model: gpt-3.5-turbo-instruct
404:
content:
application/json:
schema:
$ref: '#/components/schemas/ErrorResponse'
examples:
wrong-model:
summary: Wrong model
value: >
{
"error": {
"message": "Model 'meta-llama--Llama-2-13b-chat-hf' does not exists. Try 'GET /v1/models' to see available models.\\nTip: If you are migrating from OpenAI, make sure to update your 'model' parameters in the request.",
"type": "invalid_request_error",
"object": "error",
"param": null,
"code": 404
}
}
description: NotFound
500:
content:
application/json:
schema:
$ref: '#/components/schemas/ErrorResponse'
examples:
invalid-parameters:
summary: Invalid parameters
value: >
{
"error": {
"message": "`top_p` has to be a float > 0 and < 1, but is 4.0",
"type": "invalid_request_error",
"object": "error",
"param": null,
"code": 500
}
}
description: Internal Server Error
400:
content:
application/json:
schema:
$ref: '#/components/schemas/ErrorResponse'
examples:
invalid-json:
summary: Invalid JSON sent
value: >
{
"error": {
"message": "Invalid JSON input received (Check server log).",
"type": "invalid_request_error",
"object": "error",
"param": null,
"code": 400
}
}
invalid-prompt:
summary: Invalid prompt
value: >
{
"error": {
"message": "Please provide a prompt.",
"type": "invalid_request_error",
"object": "error",
"param": null,
"code": 400
}
}
description: Bad Request
"""
HF_AGENT_SCHEMA = """\
---
consumes:
- application/json
description: Generate instruction for given HF Agent chain for all OpenLLM supported models.
operationId: hf__agent
summary: Generate instruction for given HF Agent.
tags:
- HF
x-bentoml-name: hf_agent
produces:
- application/json
requestBody:
content:
application/json:
schema:
$ref: '#/components/schemas/AgentRequest'
example:
inputs: "Is the following `text` positive or negative?"
parameters:
text: "This is a positive text."
stop: []
required: true
responses:
200:
description: Successfull generated instruction.
content:
application/json:
example:
- generated_text: "This is a generated instruction."
schema:
$ref: '#/components/schemas/AgentResponse'
400:
content:
application/json:
schema:
$ref: '#/components/schemas/HFErrorResponse'
description: Bad Request
500:
content:
application/json:
schema:
$ref: '#/components/schemas/HFErrorResponse'
description: Not Found
"""
HF_ADAPTERS_SCHEMA = """\
---
consumes:
- application/json
description: Return current list of adapters for given LLM.
operationId: hf__adapters_map
produces:
- application/json
summary: Describes a model offering that can be used with the API.
tags:
- HF
x-bentoml-name: hf_adapters
responses:
200:
description: Return list of LoRA adapters.
content:
application/json:
example:
aarnphm/opt-6-7b-quotes:
adapter_name: default
adapter_type: LORA
aarnphm/opt-6-7b-dolly:
adapter_name: dolly
adapter_type: LORA
500:
content:
application/json:
schema:
$ref: '#/components/schemas/HFErrorResponse'
description: Not Found
"""
COHERE_GENERATE_SCHEMA = """\
---
consumes:
- application/json
description: >-
Given a prompt, the model will return one or more predicted completions, and
can also return the probabilities of alternative tokens at each position.
operationId: cohere__generate
produces:
- application/json
tags:
- Cohere
x-bentoml-name: cohere_generate
summary: Creates a completion for the provided prompt and parameters.
requestBody:
required: true
content:
application/json:
schema:
$ref: '#/components/schemas/CohereGenerateRequest'
examples:
one-shot:
summary: One-shot input example
value:
prompt: This is a test
max_tokens: 256
temperature: 0.7
p: 0.43
k: 12
num_generations: 2
stream: false
streaming:
summary: Streaming input example
value:
prompt: This is a test
max_tokens: 256
temperature: 0.7
p: 0.43
k: 12
num_generations: 2
stream: true
stop_sequences:
- "<|endoftext|>"
"""
COHERE_CHAT_SCHEMA = """\
---
consumes:
- application/json
description: >-
Given a list of messages comprising a conversation, the model will return a response.
operationId: cohere__chat
produces:
- application/json
tags:
- Cohere
x-bentoml-name: cohere_chat
summary: Creates a model response for the given chat conversation.
"""
_SCHEMAS = {k[:-7].lower(): v for k, v in locals().items() if k.endswith('_SCHEMA')}
def apply_schema(func, **attrs):
    """Substitute placeholder tokens in ``func``'s docstring in place and return ``func``.

    Each keyword name is treated as a literal placeholder (e.g. ``__model_id__``)
    and replaced by its value everywhere it occurs in the docstring.
    """
    doc = func.__doc__
    for placeholder, value in attrs.items():
        doc = doc.replace(placeholder, value)
    func.__doc__ = doc
    return func
def add_schema_definitions(func):
    """Append the OpenAPI YAML schema registered under ``func``'s name (if any) to its docstring."""
    schema = _SCHEMAS.get(func.__name__.lower(), '')
    if not schema:
        return func
    existing = func.__doc__ or ''
    func.__doc__ = existing.strip() + '\n\n' + schema.strip()
    return func
class OpenLLMSchemaGenerator(SchemaGenerator):
    """SchemaGenerator that also walks ``Mount``/``Host`` sub-routes and supports a mount-path prefix."""

    def get_endpoints(self, routes):
        """Collect ``EndpointInfo`` for every schema-visible route, recursing into mounted apps.

        Mirrors Starlette's implementation, but keeps sub-application routes by
        prefixing the mount path onto each sub-endpoint path.
        """
        endpoints_info = []
        for route in routes:
            if isinstance(route, (Mount, Host)):
                # NOTE: rebinds the parameter `routes` to the sub-routes before recursing.
                routes = route.routes or []
                # Host routes carry no path prefix; Mount routes do.
                path = self._remove_converter(route.path) if isinstance(route, Mount) else ''
                sub_endpoints = [
                    EndpointInfo(path=f'{path}{sub_endpoint.path}', http_method=sub_endpoint.http_method, func=sub_endpoint.func)
                    for sub_endpoint in self.get_endpoints(routes)
                ]
                endpoints_info.extend(sub_endpoints)
            elif not isinstance(route, Route) or not route.include_in_schema:
                # Skip non-Route entries and routes explicitly excluded from the schema.
                continue
            elif (
                inspect.isfunction(route.endpoint)
                or inspect.ismethod(route.endpoint)
                or isinstance(route.endpoint, functools.partial)
            ):
                # Function-based endpoint: one entry per HTTP method (HEAD is implicit — skip it).
                endpoint = route.endpoint.func if isinstance(route.endpoint, functools.partial) else route.endpoint
                path = self._remove_converter(route.path)
                for method in route.methods or ['GET']:
                    if method == 'HEAD':
                        continue
                    endpoints_info.append(EndpointInfo(path, method.lower(), endpoint))
            else:
                # Class-based endpoint: probe for lower-case HTTP method handler attributes.
                path = self._remove_converter(route.path)
                for method in ['get', 'post', 'put', 'patch', 'delete', 'options']:
                    if not hasattr(route.endpoint, method):
                        continue
                    func = getattr(route.endpoint, method)
                    endpoints_info.append(EndpointInfo(path, method.lower(), func))
        return endpoints_info

    def get_schema(self, routes, mount_path=None):
        """Build an OpenAPI dict for ``routes``, parsed from each endpoint's docstring.

        ``mount_path``, when given, is normalized to a leading '/' and prefixed
        onto every path key.
        """
        schema = dict(self.base_schema)
        schema.setdefault('paths', {})
        endpoints_info = self.get_endpoints(routes)
        if mount_path:
            mount_path = f'/{mount_path}' if not mount_path.startswith('/') else mount_path
        for endpoint in endpoints_info:
            # parse_docstring returns the YAML block after '---' in the docstring; empty -> skip.
            parsed = self.parse_docstring(endpoint.func)
            if not parsed:
                continue
            path = endpoint.path if mount_path is None else mount_path + endpoint.path
            if path not in schema['paths']:
                schema['paths'][path] = {}
            schema['paths'][path][endpoint.http_method] = parsed
        return schema
def get_generator(title, components=None, tags=None, inject=True):
    """Build an OpenLLMSchemaGenerator whose base schema is titled ``title``.

    ``components`` (pydantic models) and ``tags`` are folded into the base
    schema only when ``inject`` is True.
    """
    base = {'info': {'title': title, 'version': API_VERSION}, 'version': OPENAPI_VERSION}
    if inject:
        if components:
            base['components'] = {'schemas': {c.__name__: component_schema_generator(c) for c in components}}
        if tags:
            base['tags'] = tags
    return OpenLLMSchemaGenerator(base)
def component_schema_generator(attr_cls: pydantic.BaseModel, description=None):
    """Build a minimal OpenAPI 'object' schema from a pydantic model's fields.

    Parameters:
        attr_cls: the pydantic model class; its ``model_fields`` are inspected.
        description: optional fallback description when the class has no docstring.

    Returns a dict with ``type``/``required``/``properties``/``title``/``description``.
    """
    schema = {'type': 'object', 'required': [], 'properties': {}, 'title': attr_cls.__name__}
    schema['description'] = first_not_none(
        getattr(attr_cls, '__doc__', None), description, default=f'Generated components for {attr_cls.__name__}'
    )
    for name, field in attr_cls.model_fields.items():
        attr_type = field.annotation
        origin_type = t.get_origin(attr_type)
        args_type = t.get_args(attr_type)
        # Reset per field. The previous implementation probed `locals()` and
        # popped from it, which does not delete a local in CPython, so a dict/union
        # prop_schema leaked into every subsequent field.
        prop_schema = None
        # Map Python types to OpenAPI schema types.
        # Compare annotation objects with `is`: `isinstance(attr_type, str)` tested
        # whether the annotation *object* was a str instance, which is never true
        # for the type `str` itself, so int/float/bool all fell through to 'string'.
        if attr_type is str:
            schema_type = 'string'
        elif attr_type is int:
            schema_type = 'integer'
        elif attr_type is float:
            schema_type = 'number'
        elif attr_type is bool:
            schema_type = 'boolean'
        elif origin_type is list or origin_type is tuple:
            schema_type = 'array'
        elif origin_type is dict:
            schema_type = 'object'
            # Assuming string keys for simplicity, and handling Any type for values
            prop_schema = {'type': 'object', 'additionalProperties': True if args_type[1] is t.Any else {'type': 'string'}}
        elif attr_type == t.Optional[str]:
            schema_type = 'string'
        elif origin_type is t.Union and t.Any in args_type:
            schema_type = 'object'
            prop_schema = {'type': 'object', 'additionalProperties': True}
        else:
            schema_type = 'string'
        if prop_schema is None:
            prop_schema = {'type': schema_type}
        # NOTE(review): these sentinels come from `attr`, but the fields are pydantic's —
        # pydantic marks unset defaults with PydanticUndefined, not attr.NOTHING, so this
        # default/required detection looks unreliable; confirm against the models passed in.
        if field.default is not attr.NOTHING and not isinstance(field.default, attr.Factory):
            prop_schema['default'] = field.default
        if field.default is attr.NOTHING and not isinstance(attr_type, type(t.Optional)):
            schema['required'].append(name)
        schema['properties'][name] = prop_schema
    return schema
_SimpleSchema = types.new_class(
'_SimpleSchema',
(object,),
{},
lambda ns: ns.update({'__init__': lambda self, it: setattr(self, 'it', it), 'asdict': lambda self: self.it}),
)
def append_schemas(svc, generated_schema, tags_order='prepend', inject=True):
    # HACK: Dirty hack to append schemas to existing service. We def need to support mounting Starlette app OpenAPI spec.
    """Merge ``generated_schema``'s tags/components/paths into ``svc``'s OpenAPI spec.

    tags_order: 'prepend' or 'append' — where the generated tags land relative to
        the service's existing tags; any other value raises ValueError.
    inject: when False, return ``svc`` untouched.

    Side effect: monkey-patches bentoml's ``generate_spec`` and
    ``OpenAPISpecification.asdict`` so the merged schema is what gets served.
    """
    from bentoml._internal.service.openapi.specification import OpenAPISpecification
    if not inject:
        return svc
    svc_schema = svc.openapi_spec
    # Normalize to a plain dict before merging.
    if isinstance(svc_schema, (OpenAPISpecification, _SimpleSchema)):
        svc_schema = svc_schema.asdict()
    if 'tags' in generated_schema:
        if tags_order == 'prepend':
            svc_schema['tags'] = generated_schema['tags'] + svc_schema['tags']
        elif tags_order == 'append':
            svc_schema['tags'].extend(generated_schema['tags'])
        else:
            raise ValueError(f'Invalid tags_order: {tags_order}')
    if 'components' in generated_schema:
        svc_schema['components']['schemas'].update(generated_schema['components']['schemas'])
    svc_schema['paths'].update(generated_schema['paths'])
    # HACK: mk this attribute until we have a better way to add starlette schemas.
    from bentoml._internal.service import openapi

    def _generate_spec(svc, openapi_version=OPENAPI_VERSION):
        # Replacement for bentoml's generate_spec: always return the merged schema.
        return _SimpleSchema(svc_schema)

    def asdict(self):
        # Replacement OpenAPISpecification.asdict — closes over the merged dict.
        return svc_schema

    openapi.generate_spec = _generate_spec
    OpenAPISpecification.asdict = asdict
    return svc

View File

@@ -1,29 +0,0 @@
from typing import Any, Callable, Dict, List, Literal, Optional, Type
from attr import AttrsInstance
from starlette.routing import BaseRoute
from starlette.schemas import EndpointInfo
from bentoml import Service
from openllm_core._typing_compat import ParamSpec
P = ParamSpec('P')
# Type stubs mirroring the runtime implementations in _openapi.py.
class OpenLLMSchemaGenerator:
    # Base OpenAPI document the generator extends per-route.
    base_schema: Dict[str, Any]
    def get_endpoints(self, routes: list[BaseRoute]) -> list[EndpointInfo]: ...
    def get_schema(self, routes: list[BaseRoute], mount_path: Optional[str] = ...) -> Dict[str, Any]: ...
    def parse_docstring(self, func_or_method: Callable[P, Any]) -> Dict[str, Any]: ...

def apply_schema(func: Callable[P, Any], **attrs: Any) -> Callable[P, Any]: ...
def add_schema_definitions(func: Callable[P, Any]) -> Callable[P, Any]: ...
def append_schemas(
    svc: Service, generated_schema: Dict[str, Any], tags_order: Literal['prepend', 'append'] = ..., inject: bool = ...
) -> Service: ...
def component_schema_generator(attr_cls: Type[AttrsInstance], description: Optional[str] = ...) -> Dict[str, Any]: ...
def get_generator(
    title: str,
    components: Optional[List[Type[AttrsInstance]]] = ...,
    tags: Optional[List[Dict[str, Any]]] = ...,
    inject: bool = ...,
) -> OpenLLMSchemaGenerator: ...

View File

@@ -1,63 +0,0 @@
import functools, logging
from http import HTTPStatus
import orjson
from starlette.applications import Starlette
from starlette.responses import JSONResponse
from starlette.routing import Route
from openllm_core.utils import converter
from ._openapi import add_schema_definitions, append_schemas, get_generator
from ..protocol.hf import AgentRequest, AgentResponse, HFErrorResponse
schemas = get_generator(
'hf',
components=[AgentRequest, AgentResponse, HFErrorResponse],
tags=[
{
'name': 'HF',
'description': 'HF integration, including Agent and others schema endpoints.',
'externalDocs': 'https://huggingface.co/docs/transformers/main_classes/agent',
}
],
)
logger = logging.getLogger(__name__)
def mount_to_svc(svc, llm):
    """Mount the HF integration ASGI app under /hf and merge its OpenAPI schema into ``svc``."""
    routes = [
        Route('/agent', endpoint=functools.partial(hf_agent, llm=llm), name='hf_agent', methods=['POST']),
        Route('/schema', endpoint=lambda req: schemas.OpenAPIResponse(req), include_in_schema=False),
    ]
    app = Starlette(debug=True, routes=routes)
    prefix = '/hf'
    svc.mount_asgi_app(app, path=prefix)
    generated = schemas.get_schema(routes=app.routes, mount_path=prefix)
    return append_schemas(svc, generated, tags_order='append')
def error_response(status_code, message):
    """Serialize an HFErrorResponse carrying ``message`` into a JSONResponse with the given HTTP status."""
    payload = HFErrorResponse(message=message, error_code=status_code.value)
    return JSONResponse(converter.unstructure(payload), status_code=status_code.value)
@add_schema_definitions
async def hf_agent(req, llm):
    """POST /hf/agent — run HF-Agent-style generation for the JSON request body."""
    raw = await req.body()
    try:
        request = converter.structure(orjson.loads(raw), AgentRequest)
    except orjson.JSONDecodeError as err:
        logger.debug('Sent body: %s', raw)
        logger.error('Invalid JSON input received: %s', err)
        return error_response(HTTPStatus.BAD_REQUEST, 'Invalid JSON input received (Check server log).')
    # `stop` is pulled out of the free-form parameters and passed explicitly.
    stop = request.parameters.pop('stop', [])
    try:
        result = await llm.generate(request.inputs, stop=stop, **request.parameters)
        payload = [AgentResponse(generated_text=result.outputs[0].text)]
        return JSONResponse(converter.unstructure(payload), status_code=HTTPStatus.OK.value)
    except Exception as err:
        logger.error('Error while generating: %s', err)
        return error_response(HTTPStatus.INTERNAL_SERVER_ERROR, 'Error while generating (Check server log).')

View File

@@ -1,14 +0,0 @@
from http import HTTPStatus
from starlette.requests import Request
from starlette.responses import JSONResponse, Response
from bentoml import Service
from openllm_core._typing_compat import M, T
from .._llm import LLM
# Type stubs for the HF integration endpoints (runtime implementation lives in hf.py).
def mount_to_svc(svc: Service, llm: LLM[M, T]) -> Service: ...
def error_response(status_code: HTTPStatus, message: str) -> JSONResponse: ...
async def hf_agent(req: Request, llm: LLM[M, T]) -> Response: ...
def hf_adapters(req: Request, llm: LLM[M, T]) -> Response: ...

View File

@@ -1,448 +0,0 @@
import functools
import logging
import time
import traceback
from http import HTTPStatus
import orjson
from starlette.applications import Starlette
from starlette.responses import JSONResponse, StreamingResponse
from starlette.routing import Route
from openllm_core.utils import converter, gen_random_uuid
from ._openapi import add_schema_definitions, append_schemas, apply_schema, get_generator
from openllm_core.protocol.openai import (
ChatCompletionRequest,
ChatCompletionResponse,
ChatCompletionResponseChoice,
ChatCompletionResponseStreamChoice,
ChatCompletionStreamResponse,
ChatMessage,
CompletionRequest,
CompletionResponse,
CompletionResponseChoice,
CompletionResponseStreamChoice,
CompletionStreamResponse,
Delta,
ErrorResponse,
LogProbs,
ModelCard,
ModelList,
UsageInfo,
)
schemas = get_generator(
'openai',
components=[
ErrorResponse,
ModelList,
ChatCompletionResponse,
ChatCompletionRequest,
ChatCompletionStreamResponse,
CompletionRequest,
CompletionResponse,
CompletionStreamResponse,
],
tags=[
{
'name': 'OpenAI',
'description': 'OpenAI Compatible API support',
'externalDocs': 'https://platform.openai.com/docs/api-reference/completions/object',
}
],
)
logger = logging.getLogger(__name__)
def jsonify_attr(obj):
    """Unstructure ``obj`` via the shared converter and return it as a JSON string."""
    unstructured = converter.unstructure(obj)
    return orjson.dumps(unstructured).decode()
def error_response(status_code, message):
    """Wrap ``message`` in an OpenAI-style error envelope with the given HTTP status."""
    err = ErrorResponse(message=message, type='invalid_request_error', code=str(status_code.value))
    body = {'error': converter.unstructure(err)}
    return JSONResponse(body, status_code=status_code.value)
async def check_model(request, model):  # noqa
    """Return None when the request targets ``model``, otherwise a 404 error response."""
    if request.model != model:
        return error_response(
            HTTPStatus.NOT_FOUND,
            f"Model '{request.model}' does not exists. Try 'GET /v1/models' to see available models.\nTip: If you are migrating from OpenAI, make sure to update your 'model' parameters in the request.",
        )
    return None
def create_logprobs(token_ids, top_logprobs, num_output_top_logprobs=None, initial_text_offset=0, *, llm):
    # Create OpenAI-style logprobs.
    """Convert per-step token ids and logprob maps into an OpenAI-style LogProbs object.

    token_ids: generated token ids, one per decoding step.
    top_logprobs: per-step mapping of token id -> logprob (entries may be None).
    num_output_top_logprobs: when truthy, also populate ``logprobs.top_logprobs``
        with per-step {token-string: logprob} dicts.
    initial_text_offset: character offset assigned to the first token.
    llm: used only for its tokenizer (token id -> token string).
    """
    logprobs = LogProbs()
    last_token_len = 0
    if num_output_top_logprobs:
        logprobs.top_logprobs = []
    for i, token_id in enumerate(token_ids):
        step_top_logprobs = top_logprobs[i]
        token_logprob = None
        if step_top_logprobs is not None:
            token_logprob = step_top_logprobs[token_id]
        token = llm.tokenizer.convert_ids_to_tokens(token_id)
        logprobs.tokens.append(token)
        logprobs.token_logprobs.append(token_logprob)
        # Running character offsets: each token starts where the previous one ended.
        if len(logprobs.text_offset) == 0:
            logprobs.text_offset.append(initial_text_offset)
        else:
            logprobs.text_offset.append(logprobs.text_offset[-1] + last_token_len)
        last_token_len = len(token)
        if num_output_top_logprobs:
            # NOTE: the comprehension's `i` shadows the outer loop index (its own scope in Py3).
            logprobs.top_logprobs.append(
                {llm.tokenizer.convert_ids_to_tokens(i): p for i, p in step_top_logprobs.items()}
                if step_top_logprobs
                else None
            )
    return logprobs
def mount_to_svc(svc, llm):
    """Mount the OpenAI-compatible ASGI app under /v1 and merge its OpenAPI schema into ``svc``."""
    # NOTE(review): these three replace() calls duplicate what apply_schema() does
    # below for __model_id__; once the placeholder is substituted here the second
    # pass is a no-op. Looks like dead weight — confirm before removing.
    list_models.__doc__ = list_models.__doc__.replace('__model_id__', llm.llm_type)
    completions.__doc__ = completions.__doc__.replace('__model_id__', llm.llm_type)
    chat_completions.__doc__ = chat_completions.__doc__.replace('__model_id__', llm.llm_type)
    app = Starlette(
        debug=True,
        routes=[
            Route(
                '/models', functools.partial(apply_schema(list_models, __model_id__=llm.llm_type), llm=llm), methods=['GET']
            ),
            Route(
                '/completions',
                functools.partial(apply_schema(completions, __model_id__=llm.llm_type), llm=llm),
                methods=['POST'],
            ),
            Route(
                '/chat/completions',
                functools.partial(
                    # Fill all chat-specific placeholders from the model's config.
                    apply_schema(
                        chat_completions,
                        __model_id__=llm.llm_type,
                        __chat_template__=orjson.dumps(llm.config.chat_template).decode(),
                        __chat_messages__=orjson.dumps(llm.config.chat_messages).decode(),
                        __add_generation_prompt__=str(True) if llm.config.chat_messages is not None else str(False),
                    ),
                    llm=llm,
                ),
                methods=['POST'],
            ),
            # Raw schema endpoint, excluded from the schema itself.
            Route('/schema', endpoint=lambda req: schemas.OpenAPIResponse(req), include_in_schema=False),
        ],
    )
    svc.mount_asgi_app(app, path='/v1')
    return append_schemas(svc, schemas.get_schema(routes=app.routes, mount_path='/v1'))
# GET /v1/models
@add_schema_definitions
def list_models(_, llm):
    """Return the ModelList describing the single model served by this instance."""
    body = converter.unstructure(ModelList(data=[ModelCard(id=llm.llm_type)]))
    return JSONResponse(body, status_code=HTTPStatus.OK.value)
# POST /v1/chat/completions
@add_schema_definitions
async def chat_completions(req, llm):
    """Handle an OpenAI-style chat completion request, streaming or non-streaming.

    Returns a StreamingResponse of SSE chunks when ``request.stream`` is true,
    otherwise a single JSONResponse; errors come back via ``error_response``.
    """
    # TODO: Check for length based on model context_length
    json_str = await req.body()
    try:
        request = converter.structure(orjson.loads(json_str), ChatCompletionRequest)
    except orjson.JSONDecodeError as err:
        logger.debug('Sent body: %s', json_str)
        logger.error('Invalid JSON input received: %s', err)
        return error_response(HTTPStatus.BAD_REQUEST, 'Invalid JSON input received (Check server log).')
    logger.debug('Received chat completion request: %s', request)
    err_check = await check_model(request, llm.llm_type)
    if err_check is not None:
        return err_check
    if request.logit_bias is not None and len(request.logit_bias) > 0:
        return error_response(HTTPStatus.BAD_REQUEST, "'logit_bias' is not yet supported.")
    model_name, request_id = request.model, gen_random_uuid('chatcmpl')
    # BUG FIX: OpenAI's `created` field is a Unix timestamp; time.monotonic() is
    # relative to an arbitrary reference point, so clients saw nonsense times.
    created_time = int(time.time())
    prompt = llm.tokenizer.apply_chat_template(
        request.messages,
        tokenize=False,
        chat_template=request.chat_template if request.chat_template != 'None' else None,
        add_generation_prompt=request.add_generation_prompt,
    )
    logger.debug('Prompt: %r', prompt)
    config = llm.config.compatible_options(request)

    def get_role() -> str:
        # TODO: Support custom role here.
        return request.messages[-1]['role'] if not request.add_generation_prompt else 'assistant'

    try:
        result_generator = llm.generate_iterator(prompt, request_id=request_id, **config)
    except Exception as err:
        traceback.print_exc()
        logger.error('Error generating completion: %s', err)
        return error_response(HTTPStatus.INTERNAL_SERVER_ERROR, f'Exception: {err!s} (check server log)')

    def create_stream_response_json(index, text, finish_reason=None, usage=None):
        # One SSE chunk payload in ChatCompletionStreamResponse shape.
        response = ChatCompletionStreamResponse(
            id=request_id,
            created=created_time,
            model=model_name,
            choices=[
                ChatCompletionResponseStreamChoice(index=index, delta=Delta(content=text), finish_reason=finish_reason)
            ],
        )
        if usage is not None:
            response.usage = usage
        return jsonify_attr(response)

    async def completion_stream_generator():
        # First chunk per choice carries only the role.
        role = get_role()
        for i in range(config['n']):
            yield f'data: {jsonify_attr(ChatCompletionStreamResponse(id=request_id, created=created_time, choices=[ChatCompletionResponseStreamChoice(index=i, delta=Delta(role=role), finish_reason=None)], model=model_name))}\n\n'
        if request.echo:
            last_message, last_content = request.messages[-1], ''
            if last_message.get('content') and last_message.get('role') == role:
                last_content = last_message['content']
            if last_content:
                for i in range(config['n']):
                    yield f'data: {jsonify_attr(ChatCompletionStreamResponse(id=request_id, created=created_time, choices=[ChatCompletionResponseStreamChoice(index=i, delta=Delta(content=last_content), finish_reason=None)], model=model_name))}\n\n'
        previous_num_tokens = [0] * config['n']
        finish_reason_sent = [False] * config['n']
        async for res in result_generator:
            for output in res.outputs:
                if finish_reason_sent[output.index]:
                    continue
                yield f'data: {create_stream_response_json(output.index, output.text)}\n\n'
                previous_num_tokens[output.index] += len(output.token_ids)
                if output.finish_reason is not None:
                    prompt_tokens = len(res.prompt_token_ids)
                    # BUG FIX: the token count for *this* choice is indexed by
                    # output.index; the old code used `i`, a stale loop variable
                    # left over from the role-chunk loop above.
                    completion_tokens = previous_num_tokens[output.index]
                    usage = UsageInfo(prompt_tokens, completion_tokens, prompt_tokens + completion_tokens)
                    yield f'data: {create_stream_response_json(output.index, "", output.finish_reason, usage)}\n\n'
                    finish_reason_sent[output.index] = True
        yield 'data: [DONE]\n\n'

    try:
        # Streaming case
        if request.stream:
            return StreamingResponse(completion_stream_generator(), media_type='text/event-stream')
        # Non-streaming case
        # BUG FIX: `[[]] * n` aliases ONE list n times, so every choice shared the
        # same text/token buffers. Build independent lists per choice instead.
        final_result = None
        texts = [[] for _ in range(config['n'])]
        token_ids = [[] for _ in range(config['n'])]
        async for res in result_generator:
            if await req.is_disconnected():
                return error_response(HTTPStatus.BAD_REQUEST, 'Client disconnected.')
            for output in res.outputs:
                texts[output.index].append(output.text)
                token_ids[output.index].extend(output.token_ids)
            final_result = res
        if final_result is None:
            return error_response(HTTPStatus.BAD_REQUEST, 'No response from model.')
        # Rebuild the final result with each choice's full accumulated text/tokens.
        final_result = final_result.model_copy(
            update=dict(
                outputs=[
                    output.model_copy(update=dict(text=''.join(texts[output.index]), token_ids=token_ids[output.index]))
                    for output in final_result.outputs
                ]
            )
        )
        role = get_role()
        choices = [
            ChatCompletionResponseChoice(
                index=output.index, message=ChatMessage(role=role, content=output.text), finish_reason=output.finish_reason
            )
            for output in final_result.outputs
        ]
        if request.echo:
            last_message, last_content = request.messages[-1], ''
            if last_message.get('content') and last_message.get('role') == role:
                last_content = last_message['content']
            for choice in choices:
                full_message = last_content + choice.message.content
                choice.message.content = full_message
        num_prompt_tokens = len(final_result.prompt_token_ids)
        num_generated_tokens = sum(len(output.token_ids) for output in final_result.outputs)
        usage = UsageInfo(num_prompt_tokens, num_generated_tokens, num_prompt_tokens + num_generated_tokens)
        response = ChatCompletionResponse(
            id=request_id, created=created_time, model=model_name, usage=usage, choices=choices
        )
        return JSONResponse(converter.unstructure(response), status_code=HTTPStatus.OK.value)
    except Exception as err:
        traceback.print_exc()
        logger.error('Error generating completion: %s', err)
        return error_response(HTTPStatus.INTERNAL_SERVER_ERROR, f'Exception: {err!s} (check server log)')
# POST /v1/completions
@add_schema_definitions
async def completions(req, llm):
  """OpenAI-compatible legacy Completions endpoint.

  Parses the request body into a ``CompletionRequest``, validates it, then either
  streams SSE chunks (``text/event-stream``) when streaming is requested and
  possible, or collects the whole generation and returns one JSON
  ``CompletionResponse``. Invalid or unsupported inputs get a 400 response;
  generation failures get a 500 response.
  """
  # TODO: Check for length based on model context_length
  json_str = await req.body()
  try:
    request = converter.structure(orjson.loads(json_str), CompletionRequest)
  except orjson.JSONDecodeError as err:
    logger.debug('Sent body: %s', json_str)
    logger.error('Invalid JSON input received: %s', err)
    return error_response(HTTPStatus.BAD_REQUEST, 'Invalid JSON input received (Check server log).')
  logger.debug('Received legacy completion request: %s', request)
  err_check = await check_model(request, llm.llm_type)
  if err_check is not None:
    return err_check
  # OpenAI API supports echoing the prompt when max_tokens is 0.
  echo_without_generation = request.echo and request.max_tokens == 0
  if echo_without_generation:
    request.max_tokens = 1  # XXX: Hack to make sure we get the prompt back.
  if request.suffix is not None:
    return error_response(HTTPStatus.BAD_REQUEST, "'suffix' is not yet supported.")
  if request.logit_bias is not None and len(request.logit_bias) > 0:
    return error_response(HTTPStatus.BAD_REQUEST, "'logit_bias' is not yet supported.")
  if not request.prompt:
    return error_response(HTTPStatus.BAD_REQUEST, 'Please provide a prompt.')
  prompt = request.prompt
  # TODO: Support multiple prompts
  model_name, request_id = request.model, gen_random_uuid('cmpl')
  # FIX: the OpenAI 'created' field is a Unix timestamp; time.monotonic() has an
  # arbitrary reference point and is meaningless to API clients.
  created_time = int(time.time())
  config = llm.config.compatible_options(request)
  try:
    result_generator = llm.generate_iterator(prompt, request_id=request_id, **config)
  except Exception as err:
    traceback.print_exc()
    logger.error('Error generating completion: %s', err)
    return error_response(HTTPStatus.INTERNAL_SERVER_ERROR, f'Exception: {err!s} (check server log)')

  # best_of != n then we don't stream
  # TODO: support use_beam_search
  stream = request.stream and (config['best_of'] is None or config['n'] == config['best_of'])

  def create_stream_response_json(index, text, logprobs=None, finish_reason=None, usage=None):
    # Serialise a single SSE chunk payload for one choice index.
    response = CompletionStreamResponse(
      id=request_id,
      created=created_time,
      model=model_name,
      choices=[CompletionResponseStreamChoice(index=index, text=text, logprobs=logprobs, finish_reason=finish_reason)],
    )
    if usage:
      response.usage = usage
    return jsonify_attr(response)

  async def completion_stream_generator():
    # Per-sequence accumulators, indexed by output.index.
    previous_num_tokens = [0] * config['n']
    previous_texts = [''] * config['n']
    previous_echo = [False] * config['n']
    async for res in result_generator:
      for output in res.outputs:
        i = output.index
        delta_text = output.text
        token_ids = output.token_ids
        logprobs = None
        top_logprobs = None
        if request.logprobs is not None:
          top_logprobs = output.logprobs[previous_num_tokens[i] :]
        if request.echo and not previous_echo[i]:
          # Prepend the prompt exactly once per sequence.
          if not echo_without_generation:
            delta_text = res.prompt + delta_text
            token_ids = res.prompt_token_ids + token_ids
            if top_logprobs:
              top_logprobs = res.prompt_logprobs + top_logprobs
          else:
            delta_text = res.prompt
            token_ids = res.prompt_token_ids
            if top_logprobs:
              top_logprobs = res.prompt_logprobs
          previous_echo[i] = True
        if request.logprobs is not None:
          logprobs = create_logprobs(
            output.token_ids,
            output.logprobs[previous_num_tokens[i] :],
            request.logprobs,
            len(previous_texts[i]),
            llm=llm,
          )
        previous_num_tokens[i] += len(output.token_ids)
        previous_texts[i] += output.text
        yield f'data: {create_stream_response_json(index=i, text=output.text, logprobs=logprobs, finish_reason=output.finish_reason)}\n\n'
        if output.finish_reason is not None:
          # Final chunk for this sequence carries usage accounting.
          logprobs = LogProbs() if request.logprobs is not None else None
          prompt_tokens = len(res.prompt_token_ids)
          usage = UsageInfo(prompt_tokens, previous_num_tokens[i], prompt_tokens + previous_num_tokens[i])
          yield f'data: {create_stream_response_json(i, "", logprobs, output.finish_reason, usage)}\n\n'
    yield 'data: [DONE]\n\n'

  try:
    # Streaming case
    if stream:
      return StreamingResponse(completion_stream_generator(), media_type='text/event-stream')
    # Non-streaming case
    # FIX: '[[]] * n' aliases one shared list n times, so with n > 1 every
    # sequence's text/token ids were merged together. Build independent lists.
    final_result = None
    texts = [[] for _ in range(config['n'])]
    token_ids = [[] for _ in range(config['n'])]
    async for res in result_generator:
      if await req.is_disconnected():
        return error_response(HTTPStatus.BAD_REQUEST, 'Client disconnected.')
      for output in res.outputs:
        texts[output.index].append(output.text)
        token_ids[output.index].extend(output.token_ids)
      final_result = res
    if final_result is None:
      return error_response(HTTPStatus.BAD_REQUEST, 'No response from model.')
    # Fold the accumulated deltas into a single final result object.
    final_result = final_result.model_copy(
      update=dict(
        outputs=[
          output.model_copy(update=dict(text=''.join(texts[output.index]), token_ids=token_ids[output.index]))
          for output in final_result.outputs
        ]
      )
    )
    choices = []
    prompt_token_ids = final_result.prompt_token_ids
    prompt_logprobs = final_result.prompt_logprobs
    prompt_text = final_result.prompt
    for output in final_result.outputs:
      logprobs = None
      if request.logprobs is not None:
        # Renamed from 'token_ids' to avoid shadowing the accumulator above.
        if not echo_without_generation:
          out_token_ids, top_logprobs = output.token_ids, output.logprobs
          if request.echo:
            out_token_ids, top_logprobs = prompt_token_ids + out_token_ids, prompt_logprobs + top_logprobs
        else:
          out_token_ids, top_logprobs = prompt_token_ids, prompt_logprobs
        logprobs = create_logprobs(out_token_ids, top_logprobs, request.logprobs, llm=llm)
      if not echo_without_generation:
        output_text = output.text
        if request.echo:
          output_text = prompt_text + output_text
      else:
        output_text = prompt_text
      choice_data = CompletionResponseChoice(
        index=output.index, text=output_text, logprobs=logprobs, finish_reason=output.finish_reason
      )
      choices.append(choice_data)
    num_prompt_tokens = len(final_result.prompt_token_ids)
    num_generated_tokens = sum(len(output.token_ids) for output in final_result.outputs)
    usage = UsageInfo(num_prompt_tokens, num_generated_tokens, num_prompt_tokens + num_generated_tokens)
    response = CompletionResponse(id=request_id, created=created_time, model=model_name, usage=usage, choices=choices)
    return JSONResponse(converter.unstructure(response), status_code=HTTPStatus.OK.value)
  except Exception as err:
    traceback.print_exc()
    logger.error('Error generating completion: %s', err)
    return error_response(HTTPStatus.INTERNAL_SERVER_ERROR, f'Exception: {err!s} (check server log)')

View File

@@ -1,30 +0,0 @@
from http import HTTPStatus
from typing import Dict, List, Optional, Union
from attr import AttrsInstance
from starlette.requests import Request
from starlette.responses import JSONResponse, Response
from bentoml import Service
from openllm_core._typing_compat import M, T
from .._llm import LLM
from ..protocol.openai import ChatCompletionRequest, CompletionRequest, LogProbs
# Type stubs for the OpenAI-compatible HTTP entrypoints; implementations live in
# the runtime module (signatures only, no behavior here).

# Attach the OpenAI-compatible routes for `llm` onto an existing BentoML service.
def mount_to_svc(svc: Service, llm: LLM[M, T]) -> Service: ...

# Serialise an attrs instance to a JSON string.
def jsonify_attr(obj: AttrsInstance) -> str: ...

# Build a JSON error payload carrying `message` with the given HTTP status.
def error_response(status_code: HTTPStatus, message: str) -> JSONResponse: ...

# Validate the requested model name; returns an error response on mismatch,
# None when the request may proceed.
async def check_model(
  request: Union[CompletionRequest, ChatCompletionRequest], model: str
) -> Optional[JSONResponse]: ...

# Assemble an OpenAI `LogProbs` object from token ids and per-token logprob maps.
# NOTE(review): exact offset semantics of `initial_text_offset` not visible here — confirm in implementation.
def create_logprobs(
  token_ids: List[int],
  top_logprobs: List[Dict[int, float]],  #
  num_output_top_logprobs: Optional[int] = ...,
  initial_text_offset: int = ...,
  *,
  llm: LLM[M, T],
) -> LogProbs: ...

# GET /v1/models — list the models served by this instance.
def list_models(req: Request, llm: LLM[M, T]) -> Response: ...

# POST /v1/chat/completions handler.
async def chat_completions(req: Request, llm: LLM[M, T]) -> Response: ...

# POST /v1/completions handler.
async def completions(req: Request, llm: LLM[M, T]) -> Response: ...