perf: unify LLM interface (#518)

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com> Signed-off-by: paperspace <29749331+aarnphm@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2026-06-12 10:29:36 -04:00 · 2023-11-06 20:39:43 -05:00
parent f2639879af
commit e2029c934b
136 changed files with 9646 additions and 11244 deletions
--- a/openllm-python/src/openllm/entrypoints/init.py
+++ b/openllm-python/src/openllm/entrypoints/init.py
@@ -0,0 +1,29 @@
+'''Entrypoint for all third-party apps.
+
+Currently support OpenAI compatible API.
+
+Each module should implement the following API:
+
+- `mount_to_svc(svc: bentoml.Service, llm: openllm.LLM[M, T]) -> bentoml.Service: ...`
+'''
+from __future__ import annotations
+import typing as t
+
+from openllm_core.utils import LazyModule
+
+from . import hf as hf
+from . import openai as openai
+
+if t.TYPE_CHECKING:
+  import bentoml
+  import openllm
+
+_import_structure: dict[str, list[str]] = {'openai': [], 'hf': []}
+
+def mount_entrypoints(svc: bentoml.Service, llm: openllm.LLM[t.Any, t.Any]) -> bentoml.Service:
+  return openai.mount_to_svc(hf.mount_to_svc(svc, llm), llm)
+
+__lazy = LazyModule(__name__, globals()['__file__'], _import_structure, extra_objects={'mount_entrypoints': mount_entrypoints})
+__all__ = __lazy.__all__
+__dir__ = __lazy.__dir__
+__getattr__ = __lazy.__getattr__
--- a/openllm-python/src/openllm/entrypoints/_openapi.py
+++ b/openllm-python/src/openllm/entrypoints/_openapi.py
@@ -0,0 +1,518 @@
+from __future__ import annotations
+import functools
+import inspect
+import typing as t
+
+import attr
+
+from starlette.routing import BaseRoute
+from starlette.routing import Host
+from starlette.routing import Mount
+from starlette.routing import Route
+from starlette.schemas import EndpointInfo
+from starlette.schemas import SchemaGenerator
+
+from openllm_core._typing_compat import ParamSpec
+from openllm_core.utils import first_not_none
+
+if t.TYPE_CHECKING:
+  from attr import AttrsInstance
+
+  import bentoml
+
+P = ParamSpec('P')
+OPENAPI_VERSION, API_VERSION = '3.0.2', '1.0'
+# NOTE: OpenAI schema
+LIST_MODEL_SCHEMA = '''\
+---
+consumes:
+- application/json
+description: >
+  List and describe the various models available in the API.
+
+  You can refer to the available supported models with `openllm models` for more
+  information.
+operationId: openai__list_models
+produces:
+  - application/json
+summary: Describes a model offering that can be used with the API.
+tags:
+  - OpenAI
+x-bentoml-name: list_models
+responses:
+  '200':
+    description: The Model object
+    content:
+      application/json:
+        example:
+          id: davinci
+          object: model
+          created: 1686935002
+          owned_by: openai
+        schema:
+          $ref: '#/components/schemas/ModelList'
+'''
+CHAT_COMPLETION_SCHEMA = '''\
+---
+consumes:
+- application/json
+description: >-
+  Given a list of messages comprising a conversation, the model will return a
+  response.
+operationId: openai__create_chat_completions
+produces:
+  - application/json
+tags:
+  - OpenAI
+x-bentoml-name: create_chat_completions
+summary: Creates a model response for the given chat conversation.
+requestBody:
+  required: true
+  content:
+    application/json:
+      examples:
+        one-shot:
+          summary: One-shot input example
+          value:
+            messages:
+              - role: system
+                content: You are a helpful assistant.
+              - role: user
+                content: Hello, I'm looking for a chatbot that can help me with my work.
+            model: meta-llama--Llama-2-13-chat-hf
+            max_tokens: 256
+            temperature: 0.7
+            top_p: 0.43
+            n: 1
+            stream: false
+        streaming:
+          summary: Streaming input example
+          value:
+            messages:
+              - role: system
+                content: You are a helpful assistant.
+              - role: user
+                content: Hello, I'm looking for a chatbot that can help me with my work.
+            model: meta-llama--Llama-2-13-chat-hf
+            max_tokens: 256
+            temperature: 0.7
+            top_p: 0.43
+            n: 1
+            stream: true
+            stop:
+              - "\\n"
+              - "<|endoftext|>"
+      schema:
+        $ref: '#/components/schemas/ChatCompletionRequest'
+responses:
+  '200':
+    description: OK
+    content:
+      application/json:
+        schema:
+          $ref: '#/components/schemas/ChatCompletionResponse'
+        examples:
+          streaming:
+            summary: Streaming output example
+            value: >
+              {"id":"chatcmpl-123","object":"chat.completion.chunk","created":1694268190,"model":"gpt-3.5-turbo-0613","choices":[{"index":0,"delta":{"role":"assistant","content":""},"finish_reason":null}]}
+          one-shot:
+            summary: One-shot output example
+            value: >
+              {"id": "chatcmpl-123", "object": "chat.completion", "created": 1677652288, "model": "gpt-3.5-turbo-0613", "choices": [{"index": 0, "message": {"role": "assistant", "content": "Hello there, how may I assist you today?"}, "finish_reason": "stop"}], "usage": {"prompt_tokens": 9, "completion_tokens": 12, "total_tokens": 21}}
+  '404':
+    content:
+      application/json:
+        schema:
+          $ref: '#/components/schemas/ErrorResponse'
+        examples:
+          wrong-model:
+            summary: Wrong model
+            value: >
+              {
+                "error": {
+                  "message": "Model 'meta-llama--Llama-2-13-chat-hf' does not exists. Try 'GET /v1/models' to see available models.\\nTip: If you are migrating from OpenAI, make sure to update your 'model' parameters in the request.",
+                  "type": "invalid_request_error",
+                  "object": "error",
+                  "param": null,
+                  "code": 404
+                }
+              }
+    description: NotFound
+  '500':
+    content:
+      application/json:
+        schema:
+          $ref: '#/components/schemas/ErrorResponse'
+        examples:
+          invalid-parameters:
+            summary: Invalid parameters
+            value: >
+              {
+                "error": {
+                  "message": "`top_p` has to be a float > 0 and < 1, but is 4.0",
+                  "type": "invalid_request_error",
+                  "object": "error",
+                  "param": null,
+                  "code": 500
+                }
+              }
+    description: Internal Server Error
+  '400':
+    content:
+      application/json:
+        schema:
+          $ref: '#/components/schemas/ErrorResponse'
+        examples:
+          invalid-json:
+            summary: Invalid JSON sent
+            value: >
+              {
+                "error": {
+                  "message": "Invalid JSON input received (Check server log).",
+                  "type": "invalid_request_error",
+                  "object": "error",
+                  "param": null,
+                  "code": 400
+                }
+              }
+          invalid-prompt:
+            summary: Invalid prompt
+            value: >
+              {
+                "error": {
+                  "message": "Please provide a prompt.",
+                  "type": "invalid_request_error",
+                  "object": "error",
+                  "param": null,
+                  "code": 400
+                }
+              }
+    description: Bad Request
+'''
+COMPLETION_SCHEMA = '''\
+---
+consumes:
+  - application/json
+description: >-
+  Given a prompt, the model will return one or more predicted completions, and
+  can also return the probabilities of alternative tokens at each position. We
+  recommend most users use our Chat completions API.
+operationId: openai__create_completions
+produces:
+  - application/json
+tags:
+  - OpenAI
+x-bentoml-name: create_completions
+summary: Creates a completion for the provided prompt and parameters.
+requestBody:
+  required: true
+  content:
+    application/json:
+      schema:
+        $ref: '#/components/schemas/CompletionRequest'
+      examples:
+        one-shot:
+          summary: One-shot input example
+          value:
+            prompt: This is a test
+            model: meta-llama--Llama-2-13-chat-hf
+            max_tokens: 256
+            temperature: 0.7
+            logprobs: 1
+            top_p: 0.43
+            n: 1
+            stream: false
+        streaming:
+          summary: Streaming input example
+          value:
+            prompt: This is a test
+            model: meta-llama--Llama-2-13-chat-hf
+            max_tokens: 256
+            temperature: 0.7
+            top_p: 0.43
+            logprobs: 1
+            n: 1
+            stream: true
+            stop:
+              - "\\n"
+              - "<|endoftext|>"
+responses:
+  '200':
+    description: OK
+    content:
+      application/json:
+        schema:
+          $ref: '#/components/schemas/CompletionResponse'
+        examples:
+          one-shot:
+            summary: One-shot output example
+            value:
+              id: cmpl-uqkvlQyYK7bGYrRHQ0eXlWi7
+              object: text_completion
+              created: 1589478378
+              model: VAR_model_id
+              choices:
+                - text: This is indeed a test
+                  index: 0
+                  logprobs: null
+                  finish_reason: length
+              usage:
+                prompt_tokens: 5
+                completion_tokens: 7
+                total_tokens: 12
+          streaming:
+            summary: Streaming output example
+            value:
+              id: cmpl-7iA7iJjj8V2zOkCGvWF2hAkDWBQZe
+              object: text_completion
+              created: 1690759702
+              choices:
+                - text: This
+                  index: 0
+                  logprobs: null
+                  finish_reason: null
+              model: gpt-3.5-turbo-instruct
+  '404':
+    content:
+      application/json:
+        schema:
+          $ref: '#/components/schemas/ErrorResponse'
+        examples:
+          wrong-model:
+            summary: Wrong model
+            value: >
+              {
+                "error": {
+                  "message": "Model 'meta-llama--Llama-2-13-chat-hf' does not exists. Try 'GET /v1/models' to see available models.\\nTip: If you are migrating from OpenAI, make sure to update your 'model' parameters in the request.",
+                  "type": "invalid_request_error",
+                  "object": "error",
+                  "param": null,
+                  "code": 404
+                }
+              }
+    description: NotFound
+  '500':
+    content:
+      application/json:
+        schema:
+          $ref: '#/components/schemas/ErrorResponse'
+        examples:
+          invalid-parameters:
+            summary: Invalid parameters
+            value: >
+              {
+                "error": {
+                  "message": "`top_p` has to be a float > 0 and < 1, but is 4.0",
+                  "type": "invalid_request_error",
+                  "object": "error",
+                  "param": null,
+                  "code": 500
+                }
+              }
+    description: Internal Server Error
+  '400':
+    content:
+      application/json:
+        schema:
+          $ref: '#/components/schemas/ErrorResponse'
+        examples:
+          invalid-json:
+            summary: Invalid JSON sent
+            value: >
+              {
+                "error": {
+                  "message": "Invalid JSON input received (Check server log).",
+                  "type": "invalid_request_error",
+                  "object": "error",
+                  "param": null,
+                  "code": 400
+                }
+              }
+          invalid-prompt:
+            summary: Invalid prompt
+            value: >
+              {
+                "error": {
+                  "message": "Please provide a prompt.",
+                  "type": "invalid_request_error",
+                  "object": "error",
+                  "param": null,
+                  "code": 400
+                }
+              }
+    description: Bad Request
+'''
+HF_AGENT_SCHEMA = '''\
+---
+consumes:
+  - application/json
+description: Generate instruction for given HF Agent chain for all OpenLLM supported models.
+operationId: hf__agent
+summary: Generate instruction for given HF Agent.
+tags:
+  - HF
+x-bentoml-name: hf_agent
+produces:
+  - application/json
+requestBody:
+  content:
+    application/json:
+      schema:
+        $ref: '#/components/schemas/AgentRequest'
+      example:
+        inputs: "Is the following `text` positive or negative?"
+        parameters:
+          text: "This is a positive text."
+          stop: ["\n"]
+  required: true
+responses:
+  200:
+    description: Successfull generated instruction.
+    content:
+      application/json:
+        example:
+          - generated_text: "This is a generated instruction."
+        schema:
+          $ref: '#/components/schemas/AgentResponse'
+  400:
+    content:
+      application/json:
+        schema:
+          $ref: '#/components/schemas/AgentErrorResponse'
+    description: Bad Request
+  500:
+    content:
+      application/json:
+        schema:
+          $ref: '#/components/schemas/AgentErrorResponse'
+    description: Not Found
+'''
+
+def add_schema_definitions(append_str: str) -> t.Callable[[t.Callable[P, t.Any]], t.Callable[P, t.Any]]:
+  def docstring_decorator(func: t.Callable[P, t.Any]) -> t.Callable[P, t.Any]:
+    if func.__doc__ is None: func.__doc__ = ''
+    func.__doc__ = func.__doc__.strip() + '\n\n' + append_str.strip()
+    return func
+
+  return docstring_decorator
+
+class OpenLLMSchemaGenerator(SchemaGenerator):
+  def get_endpoints(self, routes: list[BaseRoute]) -> list[EndpointInfo]:
+    endpoints_info: list[EndpointInfo] = []
+    for route in routes:
+      if isinstance(route, (Mount, Host)):
+        routes = route.routes or []
+        path = self._remove_converter(route.path) if isinstance(route, Mount) else ''
+        sub_endpoints = [EndpointInfo(path=f'{path}{sub_endpoint.path}', http_method=sub_endpoint.http_method, func=sub_endpoint.func) for sub_endpoint in self.get_endpoints(routes)]
+        endpoints_info.extend(sub_endpoints)
+      elif not isinstance(route, Route) or not route.include_in_schema:
+        continue
+      elif inspect.isfunction(route.endpoint) or inspect.ismethod(route.endpoint) or isinstance(route.endpoint, functools.partial):
+        endpoint = route.endpoint.func if isinstance(route.endpoint, functools.partial) else route.endpoint
+        path = self._remove_converter(route.path)
+        for method in route.methods or ['GET']:
+          if method == 'HEAD': continue
+          endpoints_info.append(EndpointInfo(path, method.lower(), endpoint))
+      else:
+        path = self._remove_converter(route.path)
+        for method in ['get', 'post', 'put', 'patch', 'delete', 'options']:
+          if not hasattr(route.endpoint, method): continue
+          func = getattr(route.endpoint, method)
+          endpoints_info.append(EndpointInfo(path, method.lower(), func))
+    return endpoints_info
+
+  def get_schema(self, routes: list[BaseRoute], mount_path: str | None = None) -> dict[str, t.Any]:
+    schema = dict(self.base_schema)
+    schema.setdefault('paths', {})
+    endpoints_info = self.get_endpoints(routes)
+    if mount_path: mount_path = f'/{mount_path}' if not mount_path.startswith('/') else mount_path
+
+    for endpoint in endpoints_info:
+      parsed = self.parse_docstring(endpoint.func)
+      if not parsed: continue
+
+      path = endpoint.path if mount_path is None else mount_path + endpoint.path
+      if path not in schema['paths']: schema['paths'][path] = {}
+      schema['paths'][path][endpoint.http_method] = parsed
+
+    return schema
+
+def get_generator(title: str, components: list[type[AttrsInstance]] | None = None, tags: list[dict[str, t.Any]] | None = None) -> OpenLLMSchemaGenerator:
+  base_schema: dict[str, t.Any] = dict(info={'title': title, 'version': API_VERSION}, version=OPENAPI_VERSION)
+  if components: base_schema['components'] = {'schemas': {c.__name__: component_schema_generator(c) for c in components}}
+  if tags is not None and tags: base_schema['tags'] = tags
+  return OpenLLMSchemaGenerator(base_schema)
+
+def component_schema_generator(attr_cls: type[AttrsInstance], description: str | None = None) -> dict[str, t.Any]:
+  schema: dict[str, t.Any] = {'type': 'object', 'required': [], 'properties': {}, 'title': attr_cls.__name__}
+  schema['description'] = first_not_none(getattr(attr_cls, '__doc__', None), description, default=f'Generated components for {attr_cls.__name__}')
+  for field in attr.fields(attr.resolve_types(attr_cls)):  # type: ignore[misc]
+    attr_type = field.type
+    origin_type = t.get_origin(attr_type)
+    args_type = t.get_args(attr_type)
+
+    # Map Python types to OpenAPI schema types
+    if attr_type == str: schema_type = 'string'
+    elif attr_type == int: schema_type = 'integer'
+    elif attr_type == float: schema_type = 'number'
+    elif attr_type == bool: schema_type = 'boolean'
+    elif origin_type is list or origin_type is tuple:
+      schema_type = 'array'
+    elif origin_type is dict:
+      schema_type = 'object'
+      # Assuming string keys for simplicity, and handling Any type for values
+      prop_schema = {
+          'type': 'object',
+          'additionalProperties':
+              True if args_type[1] is t.Any else {
+                  'type': 'string'
+              }  # Simplified
+      }
+    elif attr_type == t.Optional[str]:
+      schema_type = 'string'
+    elif origin_type is t.Union and t.Any in args_type:
+      schema_type = 'object'
+      prop_schema = {
+          'type': 'object',
+          'additionalProperties': True  # Allows any type of values
+      }
+    else:
+      schema_type = 'string'
+
+    if 'prop_schema' not in locals(): prop_schema = {'type': schema_type}
+    if field.default is not attr.NOTHING and not isinstance(field.default, attr.Factory): prop_schema['default'] = field.default  # type: ignore[arg-type]
+    if field.default is attr.NOTHING and not isinstance(attr_type, type(t.Optional)): schema['required'].append(field.name)
+    schema['properties'][field.name] = prop_schema
+    locals().pop('prop_schema', None)
+
+  return schema
+
+class MKSchema:
+  def __init__(self, it: dict[str, t.Any]) -> None:
+    self.it = it
+
+  def asdict(self) -> dict[str, t.Any]:
+    return self.it
+
+def append_schemas(svc: bentoml.Service, generated_schema: dict[str, t.Any], tags_order: t.Literal['prepend', 'append'] = 'prepend') -> bentoml.Service:
+  # HACK: Dirty hack to append schemas to existing service. We def need to support mounting Starlette app OpenAPI spec.
+  from bentoml._internal.service.openapi.specification import OpenAPISpecification
+  svc_schema: t.Any = svc.openapi_spec
+  if isinstance(svc_schema, (OpenAPISpecification, MKSchema)): svc_schema = svc_schema.asdict()
+  if 'tags' in generated_schema:
+    if tags_order == 'prepend': svc_schema['tags'] = generated_schema['tags'] + svc_schema['tags']
+    elif tags_order == 'append': svc_schema['tags'].extend(generated_schema['tags'])
+    else: raise ValueError(f'Invalid tags_order: {tags_order}')
+  if 'components' in generated_schema: svc_schema['components']['schemas'].update(generated_schema['components']['schemas'])
+  svc_schema['paths'].update(generated_schema['paths'])
+
+  from bentoml._internal.service import openapi  # HACK: mk this attribute until we have a better way to add starlette schemas.
+
+  # yapf: disable
+  def mk_generate_spec(svc:bentoml.Service,openapi_version:str=OPENAPI_VERSION)->MKSchema:return MKSchema(svc_schema)
+  def mk_asdict(self:OpenAPISpecification)->dict[str,t.Any]:return svc_schema
+  openapi.generate_spec=mk_generate_spec
+  setattr(OpenAPISpecification, 'asdict', mk_asdict)
+  # yapf: disable
+  return svc
--- a/openllm-python/src/openllm/entrypoints/hf.py
+++ b/openllm-python/src/openllm/entrypoints/hf.py
@@ -0,0 +1,75 @@
+from __future__ import annotations
+import functools
+import logging
+import typing as t
+
+from http import HTTPStatus
+
+import orjson
+
+from starlette.applications import Starlette
+from starlette.responses import JSONResponse
+from starlette.routing import Route
+
+from openllm_core.utils import converter
+
+from ._openapi import HF_AGENT_SCHEMA
+from ._openapi import add_schema_definitions
+from ._openapi import append_schemas
+from ._openapi import get_generator
+from ..protocol.hf import AgentErrorResponse
+from ..protocol.hf import AgentRequest
+from ..protocol.hf import AgentResponse
+
+schemas = get_generator('hf',
+                        components=[AgentRequest, AgentResponse, AgentErrorResponse],
+                        tags=[{
+                            'name': 'HF',
+                            'description': 'Includes HF Agent support',
+                            'externalDocs': 'https://huggingface.co/docs/transformers/main_classes/agent'
+                        }])
+logger = logging.getLogger(__name__)
+
+if t.TYPE_CHECKING:
+  from starlette.requests import Request
+  from starlette.responses import Response
+
+  import bentoml
+  import openllm
+
+  from openllm_core._typing_compat import M
+  from openllm_core._typing_compat import T
+
+def mount_to_svc(svc: bentoml.Service, llm: openllm.LLM[M, T]) -> bentoml.Service:
+  app = Starlette(
+      debug=True,
+      routes=[Route('/agent', endpoint=functools.partial(hf_agent, llm=llm), name='hf_agent', methods=['POST']),
+              Route('/schema', endpoint=openapi_schema, include_in_schema=False)])
+  mount_path = '/hf'
+  generated_schema = schemas.get_schema(routes=app.routes, mount_path=mount_path)
+  svc.mount_asgi_app(app, path=mount_path)
+  return append_schemas(svc, generated_schema, tags_order='append')
+
+def error_response(status_code: HTTPStatus, message: str) -> JSONResponse:
+  return JSONResponse(converter.unstructure(AgentErrorResponse(message=message, error_code=status_code.value)), status_code=status_code.value)
+
+@add_schema_definitions(HF_AGENT_SCHEMA)
+async def hf_agent(req: Request, llm: openllm.LLM[M, T]) -> Response:
+  json_str = await req.body()
+  try:
+    request = converter.structure(orjson.loads(json_str), AgentRequest)
+  except orjson.JSONDecodeError as err:
+    logger.debug('Sent body: %s', json_str)
+    logger.error('Invalid JSON input received: %s', err)
+    return error_response(HTTPStatus.BAD_REQUEST, 'Invalid JSON input received (Check server log).')
+
+  stop = request.parameters.pop('stop', ['\n'])
+  try:
+    result = await llm.generate(request.inputs, stop=stop, **request.parameters)
+    return JSONResponse(converter.unstructure([AgentResponse(generated_text=result.outputs[0].text)]), status_code=HTTPStatus.OK.value)
+  except Exception as err:
+    logger.error('Error while generating: %s', err)
+    return error_response(HTTPStatus.INTERNAL_SERVER_ERROR, 'Error while generating (Check server log).')
+
+def openapi_schema(req: Request) -> Response:
+  return schemas.OpenAPIResponse(req)
--- a/openllm-python/src/openllm/entrypoints/openai.py
+++ b/openllm-python/src/openllm/entrypoints/openai.py
@@ -0,0 +1,305 @@
+from __future__ import annotations
+import functools
+import logging
+import time
+import traceback
+import typing as t
+
+from http import HTTPStatus
+
+import orjson
+
+from starlette.applications import Starlette
+from starlette.responses import JSONResponse
+from starlette.responses import StreamingResponse
+from starlette.routing import Route
+
+from openllm_core._schemas import SampleLogprobs
+from openllm_core.utils import converter
+from openllm_core.utils import gen_random_uuid
+
+from ._openapi import CHAT_COMPLETION_SCHEMA
+from ._openapi import COMPLETION_SCHEMA
+from ._openapi import LIST_MODEL_SCHEMA
+from ._openapi import add_schema_definitions
+from ._openapi import append_schemas
+from ._openapi import get_generator
+from ..protocol.openai import ChatCompletionRequest
+from ..protocol.openai import ChatCompletionResponse
+from ..protocol.openai import ChatCompletionResponseChoice
+from ..protocol.openai import ChatCompletionResponseStreamChoice
+from ..protocol.openai import ChatCompletionStreamResponse
+from ..protocol.openai import ChatMessage
+from ..protocol.openai import CompletionRequest
+from ..protocol.openai import CompletionResponse
+from ..protocol.openai import CompletionResponseChoice
+from ..protocol.openai import CompletionResponseStreamChoice
+from ..protocol.openai import CompletionStreamResponse
+from ..protocol.openai import Delta
+from ..protocol.openai import ErrorResponse
+from ..protocol.openai import LogProbs
+from ..protocol.openai import ModelCard
+from ..protocol.openai import ModelList
+from ..protocol.openai import UsageInfo
+from ..protocol.openai import get_conversation_prompt
+
+schemas = get_generator(
+    'openai',
+    components=[ErrorResponse, ModelList, ChatCompletionResponse, ChatCompletionRequest, ChatCompletionStreamResponse, CompletionRequest, CompletionResponse, CompletionStreamResponse],
+    tags=[{
+        'name': 'OpenAI',
+        'description': 'OpenAI Compatible API support',
+        'externalDocs': 'https://platform.openai.com/docs/api-reference/completions/object'
+    }])
+logger = logging.getLogger(__name__)
+
+if t.TYPE_CHECKING:
+  from attr import AttrsInstance
+  from starlette.requests import Request
+  from starlette.responses import Response
+
+  import bentoml
+  import openllm
+
+  from openllm_core._schemas import GenerationOutput
+  from openllm_core._typing_compat import M
+  from openllm_core._typing_compat import T
+
+def jsonify_attr(obj: AttrsInstance) -> str:
+  return orjson.dumps(converter.unstructure(obj)).decode()
+
+def error_response(status_code: HTTPStatus, message: str) -> JSONResponse:
+  return JSONResponse({'error': converter.unstructure(ErrorResponse(message=message, type='invalid_request_error', code=str(status_code.value)))}, status_code=status_code.value)
+
+async def check_model(request: CompletionRequest | ChatCompletionRequest, model: str) -> JSONResponse | None:
+  if request.model == model: return None
+  return error_response(
+      HTTPStatus.NOT_FOUND,
+      f"Model '{request.model}' does not exists. Try 'GET /v1/models' to see available models.\nTip: If you are migrating from OpenAI, make sure to update your 'model' parameters in the request."
+  )
+
+def create_logprobs(token_ids: list[int], id_logprobs: list[dict[int, float]], initial_text_offset: int = 0, *, llm: openllm.LLM[M, T]) -> LogProbs:
+  # Create OpenAI-style logprobs.
+  logprobs = LogProbs()
+  last_token_len = 0
+  for token_id, id_logprob in zip(token_ids, id_logprobs):
+    token = llm.tokenizer.convert_ids_to_tokens(token_id)
+    logprobs.tokens.append(token)
+    logprobs.token_logprobs.append(id_logprob[token_id])
+    if len(logprobs.text_offset) == 0:
+      logprobs.text_offset.append(initial_text_offset)
+    else:
+      logprobs.text_offset.append(logprobs.text_offset[-1] + last_token_len)
+    last_token_len = len(token)
+
+    logprobs.top_logprobs.append({llm.tokenizer.convert_ids_to_tokens(i): p for i, p in id_logprob.items()})
+  return logprobs
+
+def mount_to_svc(svc: bentoml.Service, llm: openllm.LLM[M, T]) -> bentoml.Service:
+  app = Starlette(debug=True,
+                  routes=[
+                      Route('/models', functools.partial(list_models, llm=llm), methods=['GET']),
+                      Route('/completions', functools.partial(create_completions, llm=llm), methods=['POST']),
+                      Route('/chat/completions', functools.partial(create_chat_completions, llm=llm), methods=['POST'])
+                  ])
+  mount_path = '/v1'
+  generated_schema = schemas.get_schema(routes=app.routes, mount_path=mount_path)
+  svc.mount_asgi_app(app, path=mount_path)
+  return append_schemas(svc, generated_schema)
+
+# GET /v1/models
+@add_schema_definitions(LIST_MODEL_SCHEMA)
+def list_models(_: Request, llm: openllm.LLM[M, T]) -> Response:
+  return JSONResponse(converter.unstructure(ModelList(data=[ModelCard(id=llm.llm_type)])), status_code=HTTPStatus.OK.value)
+
+# POST /v1/chat/completions
+@add_schema_definitions(CHAT_COMPLETION_SCHEMA)
+async def create_chat_completions(req: Request, llm: openllm.LLM[M, T]) -> Response:
+  # TODO: Check for length based on model context_length
+  json_str = await req.body()
+  try:
+    request = converter.structure(orjson.loads(json_str), ChatCompletionRequest)
+  except orjson.JSONDecodeError as err:
+    logger.debug('Sent body: %s', json_str)
+    logger.error('Invalid JSON input received: %s', err)
+    return error_response(HTTPStatus.BAD_REQUEST, 'Invalid JSON input received (Check server log).')
+  logger.debug('Received chat completion request: %s', request)
+  err_check = await check_model(request, llm.llm_type)
+  if err_check is not None: return err_check
+
+  model_name, request_id = request.model, gen_random_uuid('chatcmpl')
+  created_time = int(time.monotonic())
+  prompt = await get_conversation_prompt(request, llm.config)
+  config = llm.config.with_openai_request(request)
+
+  try:
+    result_generator = llm.generate_iterator(prompt, request_id=request_id, **config)
+  except Exception as err:
+    traceback.print_exc()
+    logger.error('Error generating completion: %s', err)
+    return error_response(HTTPStatus.INTERNAL_SERVER_ERROR, f'Exception: {err!s} (check server log)')
+
+  def create_stream_response_json(index: int, text: str, finish_reason: str | None = None) -> str:
+    return jsonify_attr(
+        ChatCompletionStreamResponse(id=request_id,
+                                     created=created_time,
+                                     model=model_name,
+                                     choices=[ChatCompletionResponseStreamChoice(index=index, delta=Delta(content=text), finish_reason=finish_reason)]))
+
+  async def completion_stream_generator() -> t.AsyncGenerator[str, None]:
+    # first chunk with role
+    for i in range(config['n']):
+      yield f"data: {jsonify_attr(ChatCompletionStreamResponse(id=request_id, choices=[ChatCompletionResponseStreamChoice(index=i, delta=Delta(role='assistant'), finish_reason=None)], model=model_name))}\n\n"
+
+    async for res in result_generator:
+      for output in res.outputs:
+        yield f'data: {create_stream_response_json(output.index, output.text)}\n\n'
+        if output.finish_reason is not None:
+          yield f'data: {create_stream_response_json(output.index, "", output.finish_reason)}\n\n'
+    yield 'data: [DONE]\n\n'
+
+  try:
+    # Streaming case
+    if request.stream: return StreamingResponse(completion_stream_generator(), media_type='text/event-stream')
+    # Non-streaming case
+    final_result: GenerationOutput | None = None
+    texts: list[list[str]] = [[]] * config['n']
+    token_ids: list[list[int]] = [[]] * config['n']
+    async for res in result_generator:
+      if await req.is_disconnected(): return error_response(HTTPStatus.BAD_REQUEST, 'Client disconnected.')
+      for output in res.outputs:
+        texts[output.index].append(output.text)
+        token_ids[output.index].extend(output.token_ids)
+      final_result = res
+    if final_result is None: return error_response(HTTPStatus.BAD_REQUEST, 'No response from model.')
+    final_result = final_result.with_options(outputs=[output.with_options(text=''.join(texts[output.index]), token_ids=token_ids[output.index]) for output in final_result.outputs])
+    choices = [
+        ChatCompletionResponseChoice(index=output.index, message=ChatMessage(role='assistant', content=output.text), finish_reason=output.finish_reason) for output in final_result.outputs
+    ]
+    num_prompt_tokens, num_generated_tokens = len(t.cast(t.List[int], final_result.prompt_token_ids)), sum(len(output.token_ids) for output in final_result.outputs)
+    usage = UsageInfo(prompt_tokens=num_prompt_tokens, completion_tokens=num_generated_tokens, total_tokens=num_prompt_tokens + num_generated_tokens)
+    response = ChatCompletionResponse(id=request_id, created=created_time, model=model_name, usage=usage, choices=choices)
+
+    if request.stream:  # type: ignore[unreachable]
+      # When user requests streaming but we don't stream, we still need to
+      # return a streaming response with a single event.
+      async def fake_stream_generator() -> t.AsyncGenerator[str, None]:  # type: ignore[unreachable]
+        yield f'data: {jsonify_attr(response)}\n\n'
+        yield 'data: [DONE]\n\n'
+
+      return StreamingResponse(fake_stream_generator(), media_type='text/event-stream', status_code=HTTPStatus.OK.value)
+
+    return JSONResponse(converter.unstructure(response), status_code=HTTPStatus.OK.value)
+  except Exception as err:
+    traceback.print_exc()
+    logger.error('Error generating completion: %s', err)
+    return error_response(HTTPStatus.INTERNAL_SERVER_ERROR, f'Exception: {err!s} (check server log)')
+
+# POST /v1/completions
+@add_schema_definitions(COMPLETION_SCHEMA)
+async def create_completions(req: Request, llm: openllm.LLM[M, T]) -> Response:
+  # TODO: Check for length based on model context_length
+  json_str = await req.body()
+  try:
+    request = converter.structure(orjson.loads(json_str), CompletionRequest)
+  except orjson.JSONDecodeError as err:
+    logger.debug('Sent body: %s', json_str)
+    logger.error('Invalid JSON input received: %s', err)
+    return error_response(HTTPStatus.BAD_REQUEST, 'Invalid JSON input received (Check server log).')
+  logger.debug('Received legacy completion request: %s', request)
+  err_check = await check_model(request, llm.llm_type)
+  if err_check is not None: return err_check
+
+  if request.echo: return error_response(HTTPStatus.BAD_REQUEST, "'echo' is not yet supported.")
+  if request.suffix is not None: return error_response(HTTPStatus.BAD_REQUEST, "'suffix' is not yet supported.")
+  if request.logit_bias is not None and len(request.logit_bias) > 0: return error_response(HTTPStatus.BAD_REQUEST, "'logit_bias' is not yet supported.")
+
+  if not request.prompt: return error_response(HTTPStatus.BAD_REQUEST, 'Please provide a prompt.')
+  prompt = request.prompt
+  # TODO: Support multiple prompts
+
+  if request.logprobs is not None and llm.__llm_backend__ == 'pt':  # TODO: support logprobs generation for PyTorch
+    return error_response(HTTPStatus.BAD_REQUEST, "'logprobs' is not yet supported for PyTorch models. Make sure to unset `logprobs`.")
+
+  model_name, request_id = request.model, gen_random_uuid('cmpl')
+  created_time = int(time.monotonic())
+  config = llm.config.with_openai_request(request)
+
+  try:
+    result_generator = llm.generate_iterator(prompt, request_id=request_id, **config)
+  except Exception as err:
+    traceback.print_exc()
+    logger.error('Error generating completion: %s', err)
+    return error_response(HTTPStatus.INTERNAL_SERVER_ERROR, f'Exception: {err!s} (check server log)')
+
+  # best_of != n then we don't stream
+  # TODO: support use_beam_search
+  stream = request.stream and (config['best_of'] is None or config['n'] == config['best_of'])
+
+  def create_stream_response_json(index: int, text: str, logprobs: LogProbs | None = None, finish_reason: str | None = None) -> str:
+    return jsonify_attr(
+        CompletionStreamResponse(id=request_id,
+                                 created=created_time,
+                                 model=model_name,
+                                 choices=[CompletionResponseStreamChoice(index=index, text=text, logprobs=logprobs, finish_reason=finish_reason)]))
+
+  async def completion_stream_generator() -> t.AsyncGenerator[str, None]:
+    previous_num_tokens = [0] * config['n']
+    async for res in result_generator:
+      for output in res.outputs:
+        i = output.index
+        if request.logprobs is not None:
+          logprobs = create_logprobs(token_ids=output.token_ids, id_logprobs=t.cast(SampleLogprobs, output.logprobs)[previous_num_tokens[i]:], llm=llm)
+        else:
+          logprobs = None
+        previous_num_tokens[i] += len(output.token_ids)
+        yield f'data: {create_stream_response_json(index=i, text=output.text, logprobs=logprobs)}\n\n'
+        if output.finish_reason is not None:
+          logprobs = LogProbs() if request.logprobs is not None else None
+          yield f'data: {create_stream_response_json(index=i, text="", logprobs=logprobs, finish_reason=output.finish_reason)}\n\n'
+    yield 'data: [DONE]\n\n'
+
+  try:
+    # Streaming case
+    if stream: return StreamingResponse(completion_stream_generator(), media_type='text/event-stream')
+    # Non-streaming case
+    final_result: GenerationOutput | None = None
+    texts: list[list[str]] = [[]] * config['n']
+    token_ids: list[list[int]] = [[]] * config['n']
+    async for res in result_generator:
+      if await req.is_disconnected(): return error_response(HTTPStatus.BAD_REQUEST, 'Client disconnected.')
+      for output in res.outputs:
+        texts[output.index].append(output.text)
+        token_ids[output.index].extend(output.token_ids)
+      final_result = res
+    if final_result is None: return error_response(HTTPStatus.BAD_REQUEST, 'No response from model.')
+    final_result = final_result.with_options(outputs=[output.with_options(text=''.join(texts[output.index]), token_ids=token_ids[output.index]) for output in final_result.outputs])
+
+    choices: list[CompletionResponseChoice] = []
+    for output in final_result.outputs:
+      if request.logprobs is not None:
+        logprobs = create_logprobs(token_ids=output.token_ids, id_logprobs=t.cast(SampleLogprobs, output.logprobs), llm=llm)
+      else:
+        logprobs = None
+      choice_data = CompletionResponseChoice(index=output.index, text=output.text, logprobs=logprobs, finish_reason=output.finish_reason)
+      choices.append(choice_data)
+
+    num_prompt_tokens = len(t.cast(t.List[int], final_result.prompt_token_ids))  # XXX: We will always return prompt_token_ids, so this won't be None
+    num_generated_tokens = sum(len(output.token_ids) for output in final_result.outputs)
+    usage = UsageInfo(prompt_tokens=num_prompt_tokens, completion_tokens=num_generated_tokens, total_tokens=num_prompt_tokens + num_generated_tokens)
+    response = CompletionResponse(id=request_id, created=created_time, model=model_name, usage=usage, choices=choices)
+
+    if request.stream:
+      # When user requests streaming but we don't stream, we still need to
+      # return a streaming response with a single event.
+      async def fake_stream_generator() -> t.AsyncGenerator[str, None]:
+        yield f'data: {jsonify_attr(response)}\n\n'
+        yield 'data: [DONE]\n\n'
+
+      return StreamingResponse(fake_stream_generator(), media_type='text/event-stream', status_code=HTTPStatus.OK.value)
+
+    return JSONResponse(converter.unstructure(response), status_code=HTTPStatus.OK.value)
+  except Exception as err:
+    traceback.print_exc()
+    logger.error('Error generating completion: %s', err)
+    return error_response(HTTPStatus.INTERNAL_SERVER_ERROR, f'Exception: {err!s} (check server log)')