chore(releases): remove dead code

Signed-off-by: Aaron Pham (mbp16) <29749331+aarnphm@users.noreply.github.com>
This commit is contained in:
Aaron Pham (mbp16)
2024-05-27 12:37:50 -04:00
parent da42c269c9
commit f4f7f16e81
38 changed files with 97 additions and 3291 deletions

View File

@@ -27,14 +27,11 @@ __lazy = utils.LazyModule( # NOTE: update this to sys.modules[__name__] once my
'exceptions': [],
'client': ['HTTPClient', 'AsyncHTTPClient'],
'bundle': [],
'testing': [],
'utils': ['api'],
'entrypoints': ['mount_entrypoints'],
'serialisation': ['ggml', 'transformers', 'vllm'],
'_llm': ['LLM'],
'_deprecated': ['Runner'],
'_runners': ['runner'],
'_quantisation': ['infer_quantisation_config'],
'_strategies': ['CascadingResourceStrategy', 'get_resource'],
},
extra_objects={'COMPILED': COMPILED},
@@ -44,7 +41,7 @@ __all__, __dir__ = __lazy.__all__, __lazy.__dir__
_BREAKING_INTERNAL = ['_service', '_service_vars']
_NEW_IMPL = ['LLM', *_BREAKING_INTERNAL]
if (_BENTOML_VERSION := utils.pkg.pkg_version_info('bentoml')) > (1, 2):
if utils.pkg.pkg_version_info('bentoml') > (1, 2):
import _openllm_tiny as _tiny
else:
_tiny = None
@@ -58,7 +55,7 @@ def __getattr__(name: str) -> _t.Any:
f'"{name}" is an internal implementation and considered breaking with older OpenLLM. Please migrate your code if you depend on this.'
)
_warnings.warn(
f'"{name}" is considered deprecated implementation and will be removed in the future. Make sure to upgrade to OpenLLM 0.5.x',
f'"{name}" is considered deprecated implementation and could be breaking. See https://github.com/bentoml/OpenLLM for more information on upgrading instruction.',
DeprecationWarning,
stacklevel=3,
)

View File

@@ -26,7 +26,6 @@ from . import (
exceptions as exceptions,
serialisation as serialisation,
utils as utils,
entrypoints as entrypoints,
)
from .serialisation import ggml as ggml, transformers as transformers, vllm as vllm
from ._deprecated import Runner as Runner

View File

@@ -1,16 +0,0 @@
import importlib
from openllm_core.utils import LazyModule
_import_structure = {'openai': [], 'hf': []}
def mount_entrypoints(svc, llm):
    """Mount every known third-party integration (keys of ``_import_structure``) onto ``svc``."""
    for name in _import_structure:
        module = importlib.import_module(f'.{name}', __name__)
        svc = module.mount_to_svc(svc, llm)
    return svc
__lazy = LazyModule(
__name__, globals()['__file__'], _import_structure, extra_objects={'mount_entrypoints': mount_entrypoints}
)
__all__, __dir__, __getattr__ = __lazy.__all__, __lazy.__dir__, __lazy.__getattr__

View File

@@ -1,17 +0,0 @@
"""Entrypoint for all third-party apps.
Currently support OpenAI compatible API.
Each module should implement the following API:
- `mount_to_svc(svc: bentoml.Service, llm: openllm.LLM[M, T]) -> bentoml.Service: ...`
"""
from typing import Any
from _bentoml_sdk import Service
from openllm_core._typing_compat import M, T
from . import hf as hf, openai as openai
from .._llm import LLM
# Stub only: the runtime implementation mounts each third-party integration onto the service.
def mount_entrypoints(svc: Service[Any], llm: LLM[M, T]) -> Service: ...

View File

@@ -1,641 +0,0 @@
from __future__ import annotations
import functools
import inspect
import types
import typing as t
import attr
from starlette.routing import Host, Mount, Route
from starlette.schemas import EndpointInfo, SchemaGenerator
from openllm_core.utils import first_not_none
if t.TYPE_CHECKING:
import pydantic
OPENAPI_VERSION, API_VERSION = '3.0.2', '1.0'
# NOTE: OpenAI schema
LIST_MODELS_SCHEMA = """\
---
consumes:
- application/json
description: >
List and describe the various models available in the API.
You can refer to the available supported models with `openllm models` for more
information.
operationId: openai__list_models
produces:
- application/json
summary: Describes a model offering that can be used with the API.
tags:
- OpenAI
x-bentoml-name: list_models
responses:
200:
description: The Model object
content:
application/json:
example:
object: 'list'
data:
- id: __model_id__
object: model
created: 1686935002
owned_by: 'na'
schema:
$ref: '#/components/schemas/ModelList'
"""
CHAT_COMPLETIONS_SCHEMA = """\
---
consumes:
- application/json
description: >-
Given a list of messages comprising a conversation, the model will return a
response.
operationId: openai__chat_completions
produces:
- application/json
tags:
- OpenAI
x-bentoml-name: create_chat_completions
summary: Creates a model response for the given chat conversation.
requestBody:
required: true
content:
application/json:
examples:
one-shot:
summary: One-shot input example
value:
messages: __chat_messages__
model: __model_id__
max_tokens: 256
temperature: 0.7
top_p: 0.43
n: 1
stream: false
chat_template: __chat_template__
add_generation_prompt: __add_generation_prompt__
echo: false
streaming:
summary: Streaming input example
value:
messages:
- role: system
content: You are a helpful assistant.
- role: user
content: Hello, I'm looking for a chatbot that can help me with my work.
model: __model_id__
max_tokens: 256
temperature: 0.7
top_p: 0.43
n: 1
stream: true
stop:
- "<|endoftext|>"
chat_template: __chat_template__
add_generation_prompt: __add_generation_prompt__
echo: false
schema:
$ref: '#/components/schemas/ChatCompletionRequest'
responses:
200:
description: OK
content:
application/json:
schema:
$ref: '#/components/schemas/ChatCompletionResponse'
examples:
streaming:
summary: Streaming output example
value: >
{"id":"chatcmpl-123","object":"chat.completion.chunk","created":1694268190,"model":"gpt-3.5-turbo-0613","choices":[{"index":0,"delta":{"role":"assistant","content":""},"finish_reason":null}]}
one-shot:
summary: One-shot output example
value: >
{"id": "chatcmpl-123", "object": "chat.completion", "created": 1677652288, "model": "gpt-3.5-turbo-0613", "choices": [{"index": 0, "message": {"role": "assistant", "content": "Hello there, how may I assist you today?"}, "finish_reason": "stop"}], "usage": {"prompt_tokens": 9, "completion_tokens": 12, "total_tokens": 21}}
404:
content:
application/json:
schema:
$ref: '#/components/schemas/ErrorResponse'
examples:
wrong-model:
summary: Wrong model
value: >
{
"error": {
"message": "Model 'meta-llama--Llama-2-13b-chat-hf' does not exists. Try 'GET /v1/models' to see available models.\\nTip: If you are migrating from OpenAI, make sure to update your 'model' parameters in the request.",
"type": "invalid_request_error",
"object": "error",
"param": null,
"code": 404
}
}
description: NotFound
500:
content:
application/json:
schema:
$ref: '#/components/schemas/ErrorResponse'
examples:
invalid-parameters:
summary: Invalid parameters
value: >
{
"error": {
"message": "`top_p` has to be a float > 0 and < 1, but is 4.0",
"type": "invalid_request_error",
"object": "error",
"param": null,
"code": 500
}
}
description: Internal Server Error
400:
content:
application/json:
schema:
$ref: '#/components/schemas/ErrorResponse'
examples:
invalid-json:
summary: Invalid JSON sent
value: >
{
"error": {
"message": "Invalid JSON input received (Check server log).",
"type": "invalid_request_error",
"object": "error",
"param": null,
"code": 400
}
}
invalid-prompt:
summary: Invalid prompt
value: >
{
"error": {
"message": "Please provide a prompt.",
"type": "invalid_request_error",
"object": "error",
"param": null,
"code": 400
}
}
description: Bad Request
"""
COMPLETIONS_SCHEMA = """\
---
consumes:
- application/json
description: >-
Given a prompt, the model will return one or more predicted completions, and can also return the probabilities of alternative tokens at each position. We recommend most users use our Chat completions API.
operationId: openai__completions
produces:
- application/json
tags:
- OpenAI
x-bentoml-name: create_completions
summary: Creates a completion for the provided prompt and parameters.
requestBody:
required: true
content:
application/json:
schema:
$ref: '#/components/schemas/CompletionRequest'
examples:
one-shot:
summary: One-shot input example
value:
prompt: This is a test
model: __model_id__
max_tokens: 256
temperature: 0.7
logprobs: null
top_p: 0.43
n: 1
stream: false
streaming:
summary: Streaming input example
value:
prompt: This is a test
model: __model_id__
max_tokens: 256
temperature: 0.7
top_p: 0.43
logprobs: null
n: 1
stream: true
stop:
- "<|endoftext|>"
responses:
200:
description: OK
content:
application/json:
schema:
$ref: '#/components/schemas/CompletionResponse'
examples:
one-shot:
summary: One-shot output example
value:
id: cmpl-uqkvlQyYK7bGYrRHQ0eXlWi7
object: text_completion
created: 1589478378
model: VAR_model_id
choices:
- text: This is indeed a test
index: 0
logprobs: null
finish_reason: length
usage:
prompt_tokens: 5
completion_tokens: 7
total_tokens: 12
streaming:
summary: Streaming output example
value:
id: cmpl-7iA7iJjj8V2zOkCGvWF2hAkDWBQZe
object: text_completion
created: 1690759702
choices:
- text: This
index: 0
logprobs: null
finish_reason: null
model: gpt-3.5-turbo-instruct
404:
content:
application/json:
schema:
$ref: '#/components/schemas/ErrorResponse'
examples:
wrong-model:
summary: Wrong model
value: >
{
"error": {
"message": "Model 'meta-llama--Llama-2-13b-chat-hf' does not exists. Try 'GET /v1/models' to see available models.\\nTip: If you are migrating from OpenAI, make sure to update your 'model' parameters in the request.",
"type": "invalid_request_error",
"object": "error",
"param": null,
"code": 404
}
}
description: NotFound
500:
content:
application/json:
schema:
$ref: '#/components/schemas/ErrorResponse'
examples:
invalid-parameters:
summary: Invalid parameters
value: >
{
"error": {
"message": "`top_p` has to be a float > 0 and < 1, but is 4.0",
"type": "invalid_request_error",
"object": "error",
"param": null,
"code": 500
}
}
description: Internal Server Error
400:
content:
application/json:
schema:
$ref: '#/components/schemas/ErrorResponse'
examples:
invalid-json:
summary: Invalid JSON sent
value: >
{
"error": {
"message": "Invalid JSON input received (Check server log).",
"type": "invalid_request_error",
"object": "error",
"param": null,
"code": 400
}
}
invalid-prompt:
summary: Invalid prompt
value: >
{
"error": {
"message": "Please provide a prompt.",
"type": "invalid_request_error",
"object": "error",
"param": null,
"code": 400
}
}
description: Bad Request
"""
HF_AGENT_SCHEMA = """\
---
consumes:
- application/json
description: Generate instruction for given HF Agent chain for all OpenLLM supported models.
operationId: hf__agent
summary: Generate instruction for given HF Agent.
tags:
- HF
x-bentoml-name: hf_agent
produces:
- application/json
requestBody:
content:
application/json:
schema:
$ref: '#/components/schemas/AgentRequest'
example:
inputs: "Is the following `text` positive or negative?"
parameters:
text: "This is a positive text."
stop: []
required: true
responses:
200:
description: Successfull generated instruction.
content:
application/json:
example:
- generated_text: "This is a generated instruction."
schema:
$ref: '#/components/schemas/AgentResponse'
400:
content:
application/json:
schema:
$ref: '#/components/schemas/HFErrorResponse'
description: Bad Request
500:
content:
application/json:
schema:
$ref: '#/components/schemas/HFErrorResponse'
description: Not Found
"""
HF_ADAPTERS_SCHEMA = """\
---
consumes:
- application/json
description: Return current list of adapters for given LLM.
operationId: hf__adapters_map
produces:
- application/json
summary: Describes a model offering that can be used with the API.
tags:
- HF
x-bentoml-name: hf_adapters
responses:
200:
description: Return list of LoRA adapters.
content:
application/json:
example:
aarnphm/opt-6-7b-quotes:
adapter_name: default
adapter_type: LORA
aarnphm/opt-6-7b-dolly:
adapter_name: dolly
adapter_type: LORA
500:
content:
application/json:
schema:
$ref: '#/components/schemas/HFErrorResponse'
description: Not Found
"""
COHERE_GENERATE_SCHEMA = """\
---
consumes:
- application/json
description: >-
Given a prompt, the model will return one or more predicted completions, and
can also return the probabilities of alternative tokens at each position.
operationId: cohere__generate
produces:
- application/json
tags:
- Cohere
x-bentoml-name: cohere_generate
summary: Creates a completion for the provided prompt and parameters.
requestBody:
required: true
content:
application/json:
schema:
$ref: '#/components/schemas/CohereGenerateRequest'
examples:
one-shot:
summary: One-shot input example
value:
prompt: This is a test
max_tokens: 256
temperature: 0.7
p: 0.43
k: 12
num_generations: 2
stream: false
streaming:
summary: Streaming input example
value:
prompt: This is a test
max_tokens: 256
temperature: 0.7
p: 0.43
k: 12
num_generations: 2
stream: true
stop_sequences:
- "<|endoftext|>"
"""
COHERE_CHAT_SCHEMA = """\
---
consumes:
- application/json
description: >-
Given a list of messages comprising a conversation, the model will return a response.
operationId: cohere__chat
produces:
- application/json
tags:
- Cohere
x-bentoml-name: cohere_chat
summary: Creates a model response for the given chat conversation.
"""
_SCHEMAS = {k[:-7].lower(): v for k, v in locals().items() if k.endswith('_SCHEMA')}
def apply_schema(func, **attrs):
    """Substitute placeholder tokens in ``func``'s docstring in place and return ``func``.

    Each keyword name is treated as a literal placeholder (e.g. ``__model_id__``)
    and replaced by its value everywhere it occurs in the docstring.
    """
    doc = func.__doc__
    for placeholder, value in attrs.items():
        doc = doc.replace(placeholder, value)
    func.__doc__ = doc
    return func
def add_schema_definitions(func):
    """Append the OpenAPI YAML schema registered under ``func``'s name (if any) to its docstring."""
    schema = _SCHEMAS.get(func.__name__.lower(), '')
    if not schema:
        return func
    existing = func.__doc__ or ''
    func.__doc__ = existing.strip() + '\n\n' + schema.strip()
    return func
class OpenLLMSchemaGenerator(SchemaGenerator):
    """SchemaGenerator that also walks ``Mount``/``Host`` sub-routes and supports a mount-path prefix."""

    def get_endpoints(self, routes):
        """Collect ``EndpointInfo`` for every schema-visible route, recursing into mounted apps.

        Mirrors Starlette's implementation, but keeps sub-application routes by
        prefixing the mount path onto each sub-endpoint path.
        """
        endpoints_info = []
        for route in routes:
            if isinstance(route, (Mount, Host)):
                # NOTE: rebinds the parameter `routes` to the sub-routes before recursing.
                routes = route.routes or []
                # Host routes carry no path prefix; Mount routes do.
                path = self._remove_converter(route.path) if isinstance(route, Mount) else ''
                sub_endpoints = [
                    EndpointInfo(path=f'{path}{sub_endpoint.path}', http_method=sub_endpoint.http_method, func=sub_endpoint.func)
                    for sub_endpoint in self.get_endpoints(routes)
                ]
                endpoints_info.extend(sub_endpoints)
            elif not isinstance(route, Route) or not route.include_in_schema:
                # Skip non-Route entries and routes explicitly excluded from the schema.
                continue
            elif (
                inspect.isfunction(route.endpoint)
                or inspect.ismethod(route.endpoint)
                or isinstance(route.endpoint, functools.partial)
            ):
                # Function-based endpoint: one entry per HTTP method (HEAD is implicit — skip it).
                endpoint = route.endpoint.func if isinstance(route.endpoint, functools.partial) else route.endpoint
                path = self._remove_converter(route.path)
                for method in route.methods or ['GET']:
                    if method == 'HEAD':
                        continue
                    endpoints_info.append(EndpointInfo(path, method.lower(), endpoint))
            else:
                # Class-based endpoint: probe for lower-case HTTP method handler attributes.
                path = self._remove_converter(route.path)
                for method in ['get', 'post', 'put', 'patch', 'delete', 'options']:
                    if not hasattr(route.endpoint, method):
                        continue
                    func = getattr(route.endpoint, method)
                    endpoints_info.append(EndpointInfo(path, method.lower(), func))
        return endpoints_info

    def get_schema(self, routes, mount_path=None):
        """Build an OpenAPI dict for ``routes``, parsed from each endpoint's docstring.

        ``mount_path``, when given, is normalized to a leading '/' and prefixed
        onto every path key.
        """
        schema = dict(self.base_schema)
        schema.setdefault('paths', {})
        endpoints_info = self.get_endpoints(routes)
        if mount_path:
            mount_path = f'/{mount_path}' if not mount_path.startswith('/') else mount_path
        for endpoint in endpoints_info:
            # parse_docstring returns the YAML block after '---' in the docstring; empty -> skip.
            parsed = self.parse_docstring(endpoint.func)
            if not parsed:
                continue
            path = endpoint.path if mount_path is None else mount_path + endpoint.path
            if path not in schema['paths']:
                schema['paths'][path] = {}
            schema['paths'][path][endpoint.http_method] = parsed
        return schema
def get_generator(title, components=None, tags=None, inject=True):
    """Build an OpenLLMSchemaGenerator whose base schema is titled ``title``.

    ``components`` (pydantic models) and ``tags`` are folded into the base
    schema only when ``inject`` is True.
    """
    base = {'info': {'title': title, 'version': API_VERSION}, 'version': OPENAPI_VERSION}
    if inject:
        if components:
            base['components'] = {'schemas': {c.__name__: component_schema_generator(c) for c in components}}
        if tags:
            base['tags'] = tags
    return OpenLLMSchemaGenerator(base)
def component_schema_generator(attr_cls: pydantic.BaseModel, description=None):
    """Build a minimal OpenAPI 'object' schema from a pydantic model's fields.

    Parameters:
        attr_cls: the pydantic model class; its ``model_fields`` are inspected.
        description: optional fallback description when the class has no docstring.

    Returns a dict with ``type``/``required``/``properties``/``title``/``description``.
    """
    schema = {'type': 'object', 'required': [], 'properties': {}, 'title': attr_cls.__name__}
    schema['description'] = first_not_none(
        getattr(attr_cls, '__doc__', None), description, default=f'Generated components for {attr_cls.__name__}'
    )
    for name, field in attr_cls.model_fields.items():
        attr_type = field.annotation
        origin_type = t.get_origin(attr_type)
        args_type = t.get_args(attr_type)
        # Reset per field. The previous implementation probed `locals()` and
        # popped from it, which does not delete a local in CPython, so a dict/union
        # prop_schema leaked into every subsequent field.
        prop_schema = None
        # Map Python types to OpenAPI schema types.
        # Compare annotation objects with `is`: `isinstance(attr_type, str)` tested
        # whether the annotation *object* was a str instance, which is never true
        # for the type `str` itself, so int/float/bool all fell through to 'string'.
        if attr_type is str:
            schema_type = 'string'
        elif attr_type is int:
            schema_type = 'integer'
        elif attr_type is float:
            schema_type = 'number'
        elif attr_type is bool:
            schema_type = 'boolean'
        elif origin_type is list or origin_type is tuple:
            schema_type = 'array'
        elif origin_type is dict:
            schema_type = 'object'
            # Assuming string keys for simplicity, and handling Any type for values
            prop_schema = {'type': 'object', 'additionalProperties': True if args_type[1] is t.Any else {'type': 'string'}}
        elif attr_type == t.Optional[str]:
            schema_type = 'string'
        elif origin_type is t.Union and t.Any in args_type:
            schema_type = 'object'
            prop_schema = {'type': 'object', 'additionalProperties': True}
        else:
            schema_type = 'string'
        if prop_schema is None:
            prop_schema = {'type': schema_type}
        # NOTE(review): these sentinels come from `attr`, but the fields are pydantic's —
        # pydantic marks unset defaults with PydanticUndefined, not attr.NOTHING, so this
        # default/required detection looks unreliable; confirm against the models passed in.
        if field.default is not attr.NOTHING and not isinstance(field.default, attr.Factory):
            prop_schema['default'] = field.default
        if field.default is attr.NOTHING and not isinstance(attr_type, type(t.Optional)):
            schema['required'].append(name)
        schema['properties'][name] = prop_schema
    return schema
_SimpleSchema = types.new_class(
'_SimpleSchema',
(object,),
{},
lambda ns: ns.update({'__init__': lambda self, it: setattr(self, 'it', it), 'asdict': lambda self: self.it}),
)
def append_schemas(svc, generated_schema, tags_order='prepend', inject=True):
    # HACK: Dirty hack to append schemas to existing service. We def need to support mounting Starlette app OpenAPI spec.
    """Merge ``generated_schema``'s tags/components/paths into ``svc``'s OpenAPI spec.

    tags_order: 'prepend' or 'append' — where the generated tags land relative to
        the service's existing tags; any other value raises ValueError.
    inject: when False, return ``svc`` untouched.

    Side effect: monkey-patches bentoml's ``generate_spec`` and
    ``OpenAPISpecification.asdict`` so the merged schema is what gets served.
    """
    from bentoml._internal.service.openapi.specification import OpenAPISpecification
    if not inject:
        return svc
    svc_schema = svc.openapi_spec
    # Normalize to a plain dict before merging.
    if isinstance(svc_schema, (OpenAPISpecification, _SimpleSchema)):
        svc_schema = svc_schema.asdict()
    if 'tags' in generated_schema:
        if tags_order == 'prepend':
            svc_schema['tags'] = generated_schema['tags'] + svc_schema['tags']
        elif tags_order == 'append':
            svc_schema['tags'].extend(generated_schema['tags'])
        else:
            raise ValueError(f'Invalid tags_order: {tags_order}')
    if 'components' in generated_schema:
        svc_schema['components']['schemas'].update(generated_schema['components']['schemas'])
    svc_schema['paths'].update(generated_schema['paths'])
    # HACK: mk this attribute until we have a better way to add starlette schemas.
    from bentoml._internal.service import openapi

    def _generate_spec(svc, openapi_version=OPENAPI_VERSION):
        # Replacement for bentoml's generate_spec: always return the merged schema.
        return _SimpleSchema(svc_schema)

    def asdict(self):
        # Replacement OpenAPISpecification.asdict — closes over the merged dict.
        return svc_schema

    openapi.generate_spec = _generate_spec
    OpenAPISpecification.asdict = asdict
    return svc

View File

@@ -1,29 +0,0 @@
from typing import Any, Callable, Dict, List, Literal, Optional, Type
from attr import AttrsInstance
from starlette.routing import BaseRoute
from starlette.schemas import EndpointInfo
from bentoml import Service
from openllm_core._typing_compat import ParamSpec
P = ParamSpec('P')
# Type stubs mirroring the runtime implementations in _openapi.py.
class OpenLLMSchemaGenerator:
    # Base OpenAPI document the generator extends per-route.
    base_schema: Dict[str, Any]
    def get_endpoints(self, routes: list[BaseRoute]) -> list[EndpointInfo]: ...
    def get_schema(self, routes: list[BaseRoute], mount_path: Optional[str] = ...) -> Dict[str, Any]: ...
    def parse_docstring(self, func_or_method: Callable[P, Any]) -> Dict[str, Any]: ...

def apply_schema(func: Callable[P, Any], **attrs: Any) -> Callable[P, Any]: ...
def add_schema_definitions(func: Callable[P, Any]) -> Callable[P, Any]: ...
def append_schemas(
    svc: Service, generated_schema: Dict[str, Any], tags_order: Literal['prepend', 'append'] = ..., inject: bool = ...
) -> Service: ...
def component_schema_generator(attr_cls: Type[AttrsInstance], description: Optional[str] = ...) -> Dict[str, Any]: ...
def get_generator(
    title: str,
    components: Optional[List[Type[AttrsInstance]]] = ...,
    tags: Optional[List[Dict[str, Any]]] = ...,
    inject: bool = ...,
) -> OpenLLMSchemaGenerator: ...

View File

@@ -1,63 +0,0 @@
import functools, logging
from http import HTTPStatus
import orjson
from starlette.applications import Starlette
from starlette.responses import JSONResponse
from starlette.routing import Route
from openllm_core.utils import converter
from ._openapi import add_schema_definitions, append_schemas, get_generator
from ..protocol.hf import AgentRequest, AgentResponse, HFErrorResponse
schemas = get_generator(
'hf',
components=[AgentRequest, AgentResponse, HFErrorResponse],
tags=[
{
'name': 'HF',
'description': 'HF integration, including Agent and others schema endpoints.',
'externalDocs': 'https://huggingface.co/docs/transformers/main_classes/agent',
}
],
)
logger = logging.getLogger(__name__)
def mount_to_svc(svc, llm):
    """Mount the HF integration ASGI app under /hf and merge its OpenAPI schema into ``svc``."""
    routes = [
        Route('/agent', endpoint=functools.partial(hf_agent, llm=llm), name='hf_agent', methods=['POST']),
        Route('/schema', endpoint=lambda req: schemas.OpenAPIResponse(req), include_in_schema=False),
    ]
    app = Starlette(debug=True, routes=routes)
    prefix = '/hf'
    svc.mount_asgi_app(app, path=prefix)
    generated = schemas.get_schema(routes=app.routes, mount_path=prefix)
    return append_schemas(svc, generated, tags_order='append')
def error_response(status_code, message):
    """Serialize an HFErrorResponse carrying ``message`` into a JSONResponse with the given HTTP status."""
    payload = HFErrorResponse(message=message, error_code=status_code.value)
    return JSONResponse(converter.unstructure(payload), status_code=status_code.value)
@add_schema_definitions
async def hf_agent(req, llm):
    """POST /hf/agent — run HF-Agent-style generation for the JSON request body."""
    raw = await req.body()
    try:
        request = converter.structure(orjson.loads(raw), AgentRequest)
    except orjson.JSONDecodeError as err:
        logger.debug('Sent body: %s', raw)
        logger.error('Invalid JSON input received: %s', err)
        return error_response(HTTPStatus.BAD_REQUEST, 'Invalid JSON input received (Check server log).')
    # `stop` is pulled out of the free-form parameters and passed explicitly.
    stop = request.parameters.pop('stop', [])
    try:
        result = await llm.generate(request.inputs, stop=stop, **request.parameters)
        payload = [AgentResponse(generated_text=result.outputs[0].text)]
        return JSONResponse(converter.unstructure(payload), status_code=HTTPStatus.OK.value)
    except Exception as err:
        logger.error('Error while generating: %s', err)
        return error_response(HTTPStatus.INTERNAL_SERVER_ERROR, 'Error while generating (Check server log).')

View File

@@ -1,14 +0,0 @@
from http import HTTPStatus
from starlette.requests import Request
from starlette.responses import JSONResponse, Response
from bentoml import Service
from openllm_core._typing_compat import M, T
from .._llm import LLM
# Type stubs for the HF integration endpoints (runtime implementation lives in hf.py).
def mount_to_svc(svc: Service, llm: LLM[M, T]) -> Service: ...
def error_response(status_code: HTTPStatus, message: str) -> JSONResponse: ...
async def hf_agent(req: Request, llm: LLM[M, T]) -> Response: ...
def hf_adapters(req: Request, llm: LLM[M, T]) -> Response: ...

View File

@@ -1,448 +0,0 @@
import functools
import logging
import time
import traceback
from http import HTTPStatus
import orjson
from starlette.applications import Starlette
from starlette.responses import JSONResponse, StreamingResponse
from starlette.routing import Route
from openllm_core.utils import converter, gen_random_uuid
from ._openapi import add_schema_definitions, append_schemas, apply_schema, get_generator
from openllm_core.protocol.openai import (
ChatCompletionRequest,
ChatCompletionResponse,
ChatCompletionResponseChoice,
ChatCompletionResponseStreamChoice,
ChatCompletionStreamResponse,
ChatMessage,
CompletionRequest,
CompletionResponse,
CompletionResponseChoice,
CompletionResponseStreamChoice,
CompletionStreamResponse,
Delta,
ErrorResponse,
LogProbs,
ModelCard,
ModelList,
UsageInfo,
)
schemas = get_generator(
'openai',
components=[
ErrorResponse,
ModelList,
ChatCompletionResponse,
ChatCompletionRequest,
ChatCompletionStreamResponse,
CompletionRequest,
CompletionResponse,
CompletionStreamResponse,
],
tags=[
{
'name': 'OpenAI',
'description': 'OpenAI Compatible API support',
'externalDocs': 'https://platform.openai.com/docs/api-reference/completions/object',
}
],
)
logger = logging.getLogger(__name__)
def jsonify_attr(obj):
    """Unstructure ``obj`` via the shared converter and return it as a JSON string."""
    unstructured = converter.unstructure(obj)
    return orjson.dumps(unstructured).decode()
def error_response(status_code, message):
    """Wrap ``message`` in an OpenAI-style error envelope with the given HTTP status."""
    err = ErrorResponse(message=message, type='invalid_request_error', code=str(status_code.value))
    body = {'error': converter.unstructure(err)}
    return JSONResponse(body, status_code=status_code.value)
async def check_model(request, model):  # noqa
    """Return None when the request targets ``model``, otherwise a 404 error response."""
    if request.model != model:
        return error_response(
            HTTPStatus.NOT_FOUND,
            f"Model '{request.model}' does not exists. Try 'GET /v1/models' to see available models.\nTip: If you are migrating from OpenAI, make sure to update your 'model' parameters in the request.",
        )
    return None
def create_logprobs(token_ids, top_logprobs, num_output_top_logprobs=None, initial_text_offset=0, *, llm):
    # Create OpenAI-style logprobs.
    """Convert per-step token ids and logprob maps into an OpenAI-style LogProbs object.

    token_ids: generated token ids, one per decoding step.
    top_logprobs: per-step mapping of token id -> logprob (entries may be None).
    num_output_top_logprobs: when truthy, also populate ``logprobs.top_logprobs``
        with per-step {token-string: logprob} dicts.
    initial_text_offset: character offset assigned to the first token.
    llm: used only for its tokenizer (token id -> token string).
    """
    logprobs = LogProbs()
    last_token_len = 0
    if num_output_top_logprobs:
        logprobs.top_logprobs = []
    for i, token_id in enumerate(token_ids):
        step_top_logprobs = top_logprobs[i]
        token_logprob = None
        if step_top_logprobs is not None:
            token_logprob = step_top_logprobs[token_id]
        token = llm.tokenizer.convert_ids_to_tokens(token_id)
        logprobs.tokens.append(token)
        logprobs.token_logprobs.append(token_logprob)
        # Running character offsets: each token starts where the previous one ended.
        if len(logprobs.text_offset) == 0:
            logprobs.text_offset.append(initial_text_offset)
        else:
            logprobs.text_offset.append(logprobs.text_offset[-1] + last_token_len)
        last_token_len = len(token)
        if num_output_top_logprobs:
            # NOTE: the comprehension's `i` shadows the outer loop index (its own scope in Py3).
            logprobs.top_logprobs.append(
                {llm.tokenizer.convert_ids_to_tokens(i): p for i, p in step_top_logprobs.items()}
                if step_top_logprobs
                else None
            )
    return logprobs
def mount_to_svc(svc, llm):
    """Mount the OpenAI-compatible ASGI app under /v1 and merge its OpenAPI schema into ``svc``."""
    # NOTE(review): these three replace() calls duplicate what apply_schema() does
    # below for __model_id__; once the placeholder is substituted here the second
    # pass is a no-op. Looks like dead weight — confirm before removing.
    list_models.__doc__ = list_models.__doc__.replace('__model_id__', llm.llm_type)
    completions.__doc__ = completions.__doc__.replace('__model_id__', llm.llm_type)
    chat_completions.__doc__ = chat_completions.__doc__.replace('__model_id__', llm.llm_type)
    app = Starlette(
        debug=True,
        routes=[
            Route(
                '/models', functools.partial(apply_schema(list_models, __model_id__=llm.llm_type), llm=llm), methods=['GET']
            ),
            Route(
                '/completions',
                functools.partial(apply_schema(completions, __model_id__=llm.llm_type), llm=llm),
                methods=['POST'],
            ),
            Route(
                '/chat/completions',
                functools.partial(
                    # Fill all chat-specific placeholders from the model's config.
                    apply_schema(
                        chat_completions,
                        __model_id__=llm.llm_type,
                        __chat_template__=orjson.dumps(llm.config.chat_template).decode(),
                        __chat_messages__=orjson.dumps(llm.config.chat_messages).decode(),
                        __add_generation_prompt__=str(True) if llm.config.chat_messages is not None else str(False),
                    ),
                    llm=llm,
                ),
                methods=['POST'],
            ),
            # Raw schema endpoint, excluded from the schema itself.
            Route('/schema', endpoint=lambda req: schemas.OpenAPIResponse(req), include_in_schema=False),
        ],
    )
    svc.mount_asgi_app(app, path='/v1')
    return append_schemas(svc, schemas.get_schema(routes=app.routes, mount_path='/v1'))
# GET /v1/models
@add_schema_definitions
def list_models(_, llm):
    """Return the ModelList describing the single model served by this instance."""
    body = converter.unstructure(ModelList(data=[ModelCard(id=llm.llm_type)]))
    return JSONResponse(body, status_code=HTTPStatus.OK.value)
# POST /v1/chat/completions
@add_schema_definitions
async def chat_completions(req, llm):
    """Handle an OpenAI-style chat completion request, streaming or non-streaming.

    Returns a StreamingResponse of SSE chunks when ``request.stream`` is true,
    otherwise a single JSONResponse; errors come back via ``error_response``.
    """
    # TODO: Check for length based on model context_length
    json_str = await req.body()
    try:
        request = converter.structure(orjson.loads(json_str), ChatCompletionRequest)
    except orjson.JSONDecodeError as err:
        logger.debug('Sent body: %s', json_str)
        logger.error('Invalid JSON input received: %s', err)
        return error_response(HTTPStatus.BAD_REQUEST, 'Invalid JSON input received (Check server log).')
    logger.debug('Received chat completion request: %s', request)
    err_check = await check_model(request, llm.llm_type)
    if err_check is not None:
        return err_check
    if request.logit_bias is not None and len(request.logit_bias) > 0:
        return error_response(HTTPStatus.BAD_REQUEST, "'logit_bias' is not yet supported.")
    model_name, request_id = request.model, gen_random_uuid('chatcmpl')
    # BUG FIX: OpenAI's `created` field is a Unix timestamp; time.monotonic() is
    # relative to an arbitrary reference point, so clients saw nonsense times.
    created_time = int(time.time())
    prompt = llm.tokenizer.apply_chat_template(
        request.messages,
        tokenize=False,
        chat_template=request.chat_template if request.chat_template != 'None' else None,
        add_generation_prompt=request.add_generation_prompt,
    )
    logger.debug('Prompt: %r', prompt)
    config = llm.config.compatible_options(request)

    def get_role() -> str:
        # TODO: Support custom role here.
        return request.messages[-1]['role'] if not request.add_generation_prompt else 'assistant'

    try:
        result_generator = llm.generate_iterator(prompt, request_id=request_id, **config)
    except Exception as err:
        traceback.print_exc()
        logger.error('Error generating completion: %s', err)
        return error_response(HTTPStatus.INTERNAL_SERVER_ERROR, f'Exception: {err!s} (check server log)')

    def create_stream_response_json(index, text, finish_reason=None, usage=None):
        # One SSE chunk payload in ChatCompletionStreamResponse shape.
        response = ChatCompletionStreamResponse(
            id=request_id,
            created=created_time,
            model=model_name,
            choices=[
                ChatCompletionResponseStreamChoice(index=index, delta=Delta(content=text), finish_reason=finish_reason)
            ],
        )
        if usage is not None:
            response.usage = usage
        return jsonify_attr(response)

    async def completion_stream_generator():
        # First chunk per choice carries only the role.
        role = get_role()
        for i in range(config['n']):
            yield f'data: {jsonify_attr(ChatCompletionStreamResponse(id=request_id, created=created_time, choices=[ChatCompletionResponseStreamChoice(index=i, delta=Delta(role=role), finish_reason=None)], model=model_name))}\n\n'
        if request.echo:
            last_message, last_content = request.messages[-1], ''
            if last_message.get('content') and last_message.get('role') == role:
                last_content = last_message['content']
            if last_content:
                for i in range(config['n']):
                    yield f'data: {jsonify_attr(ChatCompletionStreamResponse(id=request_id, created=created_time, choices=[ChatCompletionResponseStreamChoice(index=i, delta=Delta(content=last_content), finish_reason=None)], model=model_name))}\n\n'
        previous_num_tokens = [0] * config['n']
        finish_reason_sent = [False] * config['n']
        async for res in result_generator:
            for output in res.outputs:
                if finish_reason_sent[output.index]:
                    continue
                yield f'data: {create_stream_response_json(output.index, output.text)}\n\n'
                previous_num_tokens[output.index] += len(output.token_ids)
                if output.finish_reason is not None:
                    prompt_tokens = len(res.prompt_token_ids)
                    # BUG FIX: the token count for *this* choice is indexed by
                    # output.index; the old code used `i`, a stale loop variable
                    # left over from the role-chunk loop above.
                    completion_tokens = previous_num_tokens[output.index]
                    usage = UsageInfo(prompt_tokens, completion_tokens, prompt_tokens + completion_tokens)
                    yield f'data: {create_stream_response_json(output.index, "", output.finish_reason, usage)}\n\n'
                    finish_reason_sent[output.index] = True
        yield 'data: [DONE]\n\n'

    try:
        # Streaming case
        if request.stream:
            return StreamingResponse(completion_stream_generator(), media_type='text/event-stream')
        # Non-streaming case
        # BUG FIX: `[[]] * n` aliases ONE list n times, so every choice shared the
        # same text/token buffers. Build independent lists per choice instead.
        final_result = None
        texts = [[] for _ in range(config['n'])]
        token_ids = [[] for _ in range(config['n'])]
        async for res in result_generator:
            if await req.is_disconnected():
                return error_response(HTTPStatus.BAD_REQUEST, 'Client disconnected.')
            for output in res.outputs:
                texts[output.index].append(output.text)
                token_ids[output.index].extend(output.token_ids)
            final_result = res
        if final_result is None:
            return error_response(HTTPStatus.BAD_REQUEST, 'No response from model.')
        # Rebuild the final result with each choice's full accumulated text/tokens.
        final_result = final_result.model_copy(
            update=dict(
                outputs=[
                    output.model_copy(update=dict(text=''.join(texts[output.index]), token_ids=token_ids[output.index]))
                    for output in final_result.outputs
                ]
            )
        )
        role = get_role()
        choices = [
            ChatCompletionResponseChoice(
                index=output.index, message=ChatMessage(role=role, content=output.text), finish_reason=output.finish_reason
            )
            for output in final_result.outputs
        ]
        if request.echo:
            last_message, last_content = request.messages[-1], ''
            if last_message.get('content') and last_message.get('role') == role:
                last_content = last_message['content']
            for choice in choices:
                full_message = last_content + choice.message.content
                choice.message.content = full_message
        num_prompt_tokens = len(final_result.prompt_token_ids)
        num_generated_tokens = sum(len(output.token_ids) for output in final_result.outputs)
        usage = UsageInfo(num_prompt_tokens, num_generated_tokens, num_prompt_tokens + num_generated_tokens)
        response = ChatCompletionResponse(
            id=request_id, created=created_time, model=model_name, usage=usage, choices=choices
        )
        return JSONResponse(converter.unstructure(response), status_code=HTTPStatus.OK.value)
    except Exception as err:
        traceback.print_exc()
        logger.error('Error generating completion: %s', err)
        return error_response(HTTPStatus.INTERNAL_SERVER_ERROR, f'Exception: {err!s} (check server log)')
# POST /v1/completions
@add_schema_definitions
async def completions(req, llm):
  """OpenAI-compatible legacy Completions endpoint.

  Parses the request body into a ``CompletionRequest``, validates it, then either
  streams SSE chunks (``text/event-stream``) when streaming is requested and
  possible, or collects the whole generation and returns one JSON
  ``CompletionResponse``. Invalid or unsupported inputs get a 400 response;
  generation failures get a 500 response.
  """
  # TODO: Check for length based on model context_length
  json_str = await req.body()
  try:
    request = converter.structure(orjson.loads(json_str), CompletionRequest)
  except orjson.JSONDecodeError as err:
    logger.debug('Sent body: %s', json_str)
    logger.error('Invalid JSON input received: %s', err)
    return error_response(HTTPStatus.BAD_REQUEST, 'Invalid JSON input received (Check server log).')
  logger.debug('Received legacy completion request: %s', request)
  err_check = await check_model(request, llm.llm_type)
  if err_check is not None:
    return err_check
  # OpenAI API supports echoing the prompt when max_tokens is 0.
  echo_without_generation = request.echo and request.max_tokens == 0
  if echo_without_generation:
    request.max_tokens = 1  # XXX: Hack to make sure we get the prompt back.
  if request.suffix is not None:
    return error_response(HTTPStatus.BAD_REQUEST, "'suffix' is not yet supported.")
  if request.logit_bias is not None and len(request.logit_bias) > 0:
    return error_response(HTTPStatus.BAD_REQUEST, "'logit_bias' is not yet supported.")
  if not request.prompt:
    return error_response(HTTPStatus.BAD_REQUEST, 'Please provide a prompt.')
  prompt = request.prompt
  # TODO: Support multiple prompts
  model_name, request_id = request.model, gen_random_uuid('cmpl')
  # FIX: the OpenAI 'created' field is a Unix timestamp; time.monotonic() has an
  # arbitrary reference point and is meaningless to API clients.
  created_time = int(time.time())
  config = llm.config.compatible_options(request)
  try:
    result_generator = llm.generate_iterator(prompt, request_id=request_id, **config)
  except Exception as err:
    traceback.print_exc()
    logger.error('Error generating completion: %s', err)
    return error_response(HTTPStatus.INTERNAL_SERVER_ERROR, f'Exception: {err!s} (check server log)')

  # best_of != n then we don't stream
  # TODO: support use_beam_search
  stream = request.stream and (config['best_of'] is None or config['n'] == config['best_of'])

  def create_stream_response_json(index, text, logprobs=None, finish_reason=None, usage=None):
    # Serialise a single SSE chunk payload for one choice index.
    response = CompletionStreamResponse(
      id=request_id,
      created=created_time,
      model=model_name,
      choices=[CompletionResponseStreamChoice(index=index, text=text, logprobs=logprobs, finish_reason=finish_reason)],
    )
    if usage:
      response.usage = usage
    return jsonify_attr(response)

  async def completion_stream_generator():
    # Per-sequence accumulators, indexed by output.index.
    previous_num_tokens = [0] * config['n']
    previous_texts = [''] * config['n']
    previous_echo = [False] * config['n']
    async for res in result_generator:
      for output in res.outputs:
        i = output.index
        delta_text = output.text
        token_ids = output.token_ids
        logprobs = None
        top_logprobs = None
        if request.logprobs is not None:
          top_logprobs = output.logprobs[previous_num_tokens[i] :]
        if request.echo and not previous_echo[i]:
          # Prepend the prompt exactly once per sequence.
          if not echo_without_generation:
            delta_text = res.prompt + delta_text
            token_ids = res.prompt_token_ids + token_ids
            if top_logprobs:
              top_logprobs = res.prompt_logprobs + top_logprobs
          else:
            delta_text = res.prompt
            token_ids = res.prompt_token_ids
            if top_logprobs:
              top_logprobs = res.prompt_logprobs
          previous_echo[i] = True
        if request.logprobs is not None:
          logprobs = create_logprobs(
            output.token_ids,
            output.logprobs[previous_num_tokens[i] :],
            request.logprobs,
            len(previous_texts[i]),
            llm=llm,
          )
        previous_num_tokens[i] += len(output.token_ids)
        previous_texts[i] += output.text
        yield f'data: {create_stream_response_json(index=i, text=output.text, logprobs=logprobs, finish_reason=output.finish_reason)}\n\n'
        if output.finish_reason is not None:
          # Final chunk for this sequence carries usage accounting.
          logprobs = LogProbs() if request.logprobs is not None else None
          prompt_tokens = len(res.prompt_token_ids)
          usage = UsageInfo(prompt_tokens, previous_num_tokens[i], prompt_tokens + previous_num_tokens[i])
          yield f'data: {create_stream_response_json(i, "", logprobs, output.finish_reason, usage)}\n\n'
    yield 'data: [DONE]\n\n'

  try:
    # Streaming case
    if stream:
      return StreamingResponse(completion_stream_generator(), media_type='text/event-stream')
    # Non-streaming case
    # FIX: '[[]] * n' aliases one shared list n times, so with n > 1 every
    # sequence's text/token ids were merged together. Build independent lists.
    final_result = None
    texts = [[] for _ in range(config['n'])]
    token_ids = [[] for _ in range(config['n'])]
    async for res in result_generator:
      if await req.is_disconnected():
        return error_response(HTTPStatus.BAD_REQUEST, 'Client disconnected.')
      for output in res.outputs:
        texts[output.index].append(output.text)
        token_ids[output.index].extend(output.token_ids)
      final_result = res
    if final_result is None:
      return error_response(HTTPStatus.BAD_REQUEST, 'No response from model.')
    # Fold the accumulated deltas into a single final result object.
    final_result = final_result.model_copy(
      update=dict(
        outputs=[
          output.model_copy(update=dict(text=''.join(texts[output.index]), token_ids=token_ids[output.index]))
          for output in final_result.outputs
        ]
      )
    )
    choices = []
    prompt_token_ids = final_result.prompt_token_ids
    prompt_logprobs = final_result.prompt_logprobs
    prompt_text = final_result.prompt
    for output in final_result.outputs:
      logprobs = None
      if request.logprobs is not None:
        # Renamed from 'token_ids' to avoid shadowing the accumulator above.
        if not echo_without_generation:
          out_token_ids, top_logprobs = output.token_ids, output.logprobs
          if request.echo:
            out_token_ids, top_logprobs = prompt_token_ids + out_token_ids, prompt_logprobs + top_logprobs
        else:
          out_token_ids, top_logprobs = prompt_token_ids, prompt_logprobs
        logprobs = create_logprobs(out_token_ids, top_logprobs, request.logprobs, llm=llm)
      if not echo_without_generation:
        output_text = output.text
        if request.echo:
          output_text = prompt_text + output_text
      else:
        output_text = prompt_text
      choice_data = CompletionResponseChoice(
        index=output.index, text=output_text, logprobs=logprobs, finish_reason=output.finish_reason
      )
      choices.append(choice_data)
    num_prompt_tokens = len(final_result.prompt_token_ids)
    num_generated_tokens = sum(len(output.token_ids) for output in final_result.outputs)
    usage = UsageInfo(num_prompt_tokens, num_generated_tokens, num_prompt_tokens + num_generated_tokens)
    response = CompletionResponse(id=request_id, created=created_time, model=model_name, usage=usage, choices=choices)
    return JSONResponse(converter.unstructure(response), status_code=HTTPStatus.OK.value)
  except Exception as err:
    traceback.print_exc()
    logger.error('Error generating completion: %s', err)
    return error_response(HTTPStatus.INTERNAL_SERVER_ERROR, f'Exception: {err!s} (check server log)')

View File

@@ -1,30 +0,0 @@
from http import HTTPStatus
from typing import Dict, List, Optional, Union
from attr import AttrsInstance
from starlette.requests import Request
from starlette.responses import JSONResponse, Response
from bentoml import Service
from openllm_core._typing_compat import M, T
from .._llm import LLM
from ..protocol.openai import ChatCompletionRequest, CompletionRequest, LogProbs
# Type stubs for the OpenAI-compatible HTTP entrypoints; implementations live in
# the runtime module (signatures only, no behavior here).

# Attach the OpenAI-compatible routes for `llm` onto an existing BentoML service.
def mount_to_svc(svc: Service, llm: LLM[M, T]) -> Service: ...

# Serialise an attrs instance to a JSON string.
def jsonify_attr(obj: AttrsInstance) -> str: ...

# Build a JSON error payload carrying `message` with the given HTTP status.
def error_response(status_code: HTTPStatus, message: str) -> JSONResponse: ...

# Validate the requested model name; returns an error response on mismatch,
# None when the request may proceed.
async def check_model(
  request: Union[CompletionRequest, ChatCompletionRequest], model: str
) -> Optional[JSONResponse]: ...

# Assemble an OpenAI `LogProbs` object from token ids and per-token logprob maps.
# NOTE(review): exact offset semantics of `initial_text_offset` not visible here — confirm in implementation.
def create_logprobs(
  token_ids: List[int],
  top_logprobs: List[Dict[int, float]],  #
  num_output_top_logprobs: Optional[int] = ...,
  initial_text_offset: int = ...,
  *,
  llm: LLM[M, T],
) -> LogProbs: ...

# GET /v1/models — list the models served by this instance.
def list_models(req: Request, llm: LLM[M, T]) -> Response: ...

# POST /v1/chat/completions handler.
async def chat_completions(req: Request, llm: LLM[M, T]) -> Response: ...

# POST /v1/completions handler.
async def completions(req: Request, llm: LLM[M, T]) -> Response: ...