mirror of
https://github.com/bentoml/OpenLLM.git
synced 2026-04-21 15:39:36 -04:00
perf: unify LLM interface (#518)
Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com> Signed-off-by: paperspace <29749331+aarnphm@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
This commit is contained in:
11
openllm-python/src/openllm/models/__init__.py
generated
11
openllm-python/src/openllm/models/__init__.py
generated
@@ -1,11 +0,0 @@
|
||||
# This file is generated by tools/update-models-import.py. DO NOT EDIT MANUALLY!
|
||||
# To update this, run ./tools/update-models-import.py
|
||||
from __future__ import annotations
|
||||
import typing as t
|
||||
from openllm_core.utils import LazyModule
|
||||
_MODELS:set[str]={"auto", "baichuan", "chatglm", "dolly_v2", "falcon", "flan_t5", "gpt_neox", "llama", "mpt", "opt", "stablelm", "starcoder"}
|
||||
if t.TYPE_CHECKING:from . import auto as auto,baichuan as baichuan,chatglm as chatglm,dolly_v2 as dolly_v2,falcon as falcon,flan_t5 as flan_t5,gpt_neox as gpt_neox,llama as llama,mpt as mpt,opt as opt,stablelm as stablelm,starcoder as starcoder
|
||||
__lazy=LazyModule(__name__, globals()["__file__"], {k: [] for k in _MODELS})
|
||||
__all__=__lazy.__all__
|
||||
__dir__=__lazy.__dir__
|
||||
__getattr__=__lazy.__getattr__
|
||||
@@ -1,66 +0,0 @@
|
||||
from __future__ import annotations
|
||||
import os
|
||||
import typing as t
|
||||
|
||||
import openllm
|
||||
from openllm_core.config import CONFIG_MAPPING as CONFIG_MAPPING
|
||||
from openllm_core.config import CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES
|
||||
from openllm_core.config import AutoConfig as AutoConfig
|
||||
from openllm_core.utils import LazyModule
|
||||
from openllm_core.utils import is_flax_available
|
||||
from openllm_core.utils import is_tf_available
|
||||
from openllm_core.utils import is_torch_available
|
||||
from openllm_core.utils import is_vllm_available
|
||||
|
||||
_import_structure: dict[str, list[str]] = {
|
||||
'modeling_auto': ['MODEL_MAPPING_NAMES'],
|
||||
'modeling_flax_auto': ['MODEL_FLAX_MAPPING_NAMES'],
|
||||
'modeling_tf_auto': ['MODEL_TF_MAPPING_NAMES'],
|
||||
'modeling_vllm_auto': ['MODEL_VLLM_MAPPING_NAMES']
|
||||
}
|
||||
if t.TYPE_CHECKING:
|
||||
from .modeling_auto import MODEL_MAPPING_NAMES as MODEL_MAPPING_NAMES
|
||||
from .modeling_flax_auto import MODEL_FLAX_MAPPING_NAMES as MODEL_FLAX_MAPPING_NAMES
|
||||
from .modeling_tf_auto import MODEL_TF_MAPPING_NAMES as MODEL_TF_MAPPING_NAMES
|
||||
from .modeling_vllm_auto import MODEL_VLLM_MAPPING_NAMES as MODEL_VLLM_MAPPING_NAMES
|
||||
try:
|
||||
if not is_torch_available(): raise openllm.exceptions.MissingDependencyError
|
||||
except openllm.exceptions.MissingDependencyError:
|
||||
pass
|
||||
else:
|
||||
_import_structure['modeling_auto'].extend(['AutoLLM', 'MODEL_MAPPING'])
|
||||
if t.TYPE_CHECKING: from .modeling_auto import MODEL_MAPPING as MODEL_MAPPING, AutoLLM as AutoLLM
|
||||
try:
|
||||
if not is_vllm_available(): raise openllm.exceptions.MissingDependencyError
|
||||
except openllm.exceptions.MissingDependencyError:
|
||||
pass
|
||||
else:
|
||||
_import_structure['modeling_vllm_auto'].extend(['AutoVLLM', 'MODEL_VLLM_MAPPING'])
|
||||
if t.TYPE_CHECKING: from .modeling_vllm_auto import MODEL_VLLM_MAPPING as MODEL_VLLM_MAPPING, AutoVLLM as AutoVLLM
|
||||
try:
|
||||
if not is_flax_available(): raise openllm.exceptions.MissingDependencyError
|
||||
except openllm.exceptions.MissingDependencyError:
|
||||
pass
|
||||
else:
|
||||
_import_structure['modeling_flax_auto'].extend(['AutoFlaxLLM', 'MODEL_FLAX_MAPPING'])
|
||||
if t.TYPE_CHECKING:
|
||||
from .modeling_flax_auto import MODEL_FLAX_MAPPING as MODEL_FLAX_MAPPING, AutoFlaxLLM as AutoFlaxLLM
|
||||
try:
|
||||
if not is_tf_available(): raise openllm.exceptions.MissingDependencyError
|
||||
except openllm.exceptions.MissingDependencyError:
|
||||
pass
|
||||
else:
|
||||
_import_structure['modeling_tf_auto'].extend(['AutoTFLLM', 'MODEL_TF_MAPPING'])
|
||||
if t.TYPE_CHECKING: from .modeling_tf_auto import MODEL_TF_MAPPING as MODEL_TF_MAPPING, AutoTFLLM as AutoTFLLM
|
||||
|
||||
__lazy = LazyModule(__name__,
|
||||
os.path.abspath('__file__'),
|
||||
_import_structure,
|
||||
extra_objects={
|
||||
'CONFIG_MAPPING': CONFIG_MAPPING,
|
||||
'CONFIG_MAPPING_NAMES': CONFIG_MAPPING_NAMES,
|
||||
'AutoConfig': AutoConfig,
|
||||
})
|
||||
__all__ = __lazy.__all__
|
||||
__dir__ = __lazy.__dir__
|
||||
__getattr__ = __lazy.__getattr__
|
||||
@@ -1,181 +0,0 @@
|
||||
# mypy: disable-error-code="type-arg"
|
||||
from __future__ import annotations
|
||||
import importlib
|
||||
import inspect
|
||||
import logging
|
||||
import typing as t
|
||||
from collections import OrderedDict
|
||||
|
||||
import inflection
|
||||
|
||||
import openllm
|
||||
from openllm_core.utils import ReprMixin
|
||||
if t.TYPE_CHECKING:
|
||||
import types
|
||||
from collections import _odict_items
|
||||
from collections import _odict_keys
|
||||
from collections import _odict_values
|
||||
|
||||
from _typeshed import SupportsIter
|
||||
|
||||
from openllm_core._typing_compat import LiteralString
|
||||
from openllm_core._typing_compat import LLMRunner
|
||||
ConfigModelKeysView = _odict_keys[type[openllm.LLMConfig], type[openllm.LLM[t.Any, t.Any]]]
|
||||
ConfigModelValuesView = _odict_values[type[openllm.LLMConfig], type[openllm.LLM[t.Any, t.Any]]]
|
||||
ConfigModelItemsView = _odict_items[type[openllm.LLMConfig], type[openllm.LLM[t.Any, t.Any]]]
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class BaseAutoLLMClass:
  '''Factory base class mapping OpenLLM config classes to LLM implementations.

  Subclasses (AutoLLM, AutoVLLM, ...) only set ``_model_mapping``; all the
  lookup/construction helpers live here. The class is never instantiated.
  '''
  _model_mapping: t.ClassVar[_LazyAutoMapping]

  def __init__(self, *args: t.Any, **attrs: t.Any):
    # Pure factory: instantiation is always an error.
    raise EnvironmentError(f"Cannot instantiate {self.__class__.__name__} directly. Please use '{self.__class__.__name__}.Runner(model_name)' instead.")

  @classmethod
  def for_model(cls,
                model: str,
                /,
                model_id: str | None = None,
                model_version: str | None = None,
                llm_config: openllm.LLMConfig | None = None,
                ensure_available: bool = False,
                **attrs: t.Any) -> openllm.LLM[t.Any, t.Any]:
    '''The lower level API for creating a LLM instance.

    Args:
      model: Model name used to resolve the implementation class.
      model_id: Optional pretrained model id forwarded to ``from_pretrained``.
      model_version: Optional model version forwarded to ``from_pretrained``.
      llm_config: Optional pre-built LLM configuration.
      ensure_available: When True, persist the model via ``save_pretrained``.
      **attrs: Extra keyword arguments forwarded to ``from_pretrained``.

    ```python
    >>> import openllm
    >>> llm = openllm.AutoLLM.for_model("flan-t5")
    ```
    '''
    llm = cls.infer_class_from_name(model).from_pretrained(model_id=model_id, model_version=model_version, llm_config=llm_config, **attrs)
    if ensure_available: llm.save_pretrained()
    return llm

  @classmethod
  def create_runner(cls, model: str, model_id: str | None = None, **attrs: t.Any) -> LLMRunner[t.Any, t.Any]:
    '''Create a LLM Runner for the given model name.

    Args:
      model: The model name to instantiate.
      model_id: The pretrained model name to instantiate.
      **attrs: Additional keyword arguments passed along to the specific configuration class.

    Returns:
      A LLM instance.
    '''
    # Split attrs: anything accepted by LLM.to_runner goes to the runner, the
    # rest to model construction.
    runner_kwargs_name = set(inspect.signature(openllm.LLM[t.Any, t.Any].to_runner).parameters)
    runner_attrs = {k: v for k, v in attrs.items() if k in runner_kwargs_name}
    for k in runner_attrs:
      del attrs[k]
    return cls.for_model(model, model_id=model_id, **attrs).to_runner(**runner_attrs)

  @classmethod
  def register(cls, config_class: type[openllm.LLMConfig], llm_class: type[openllm.LLM[t.Any, t.Any]]) -> None:
    '''Register a new model for this class.

    Args:
      config_class: The configuration corresponding to the model to register.
      llm_class: The runnable to register.

    Raises:
      ValueError: If ``llm_class.config_class`` disagrees with ``config_class``.
    '''
    if hasattr(llm_class, 'config_class') and llm_class.config_class is not config_class:
      # BUG FIX: the original message left "(model has ..." unbalanced — the
      # closing parenthesis was missing.
      raise ValueError(
          f'The model class you are passing has a `config_class` attribute that is not consistent with the config class you passed (model has {llm_class.config_class} and you passed {config_class}). Fix one of those so they match!'
      )
    cls._model_mapping.register(config_class, llm_class)

  @classmethod
  def infer_class_from_name(cls, name: str) -> type[openllm.LLM[t.Any, t.Any]]:
    '''Resolve ``name`` to its implementation class via AutoConfig.

    Raises:
      ValueError: If the inferred config class has no registered implementation.
    '''
    config_class = openllm.AutoConfig.infer_class_from_name(name)
    if config_class in cls._model_mapping: return cls._model_mapping[config_class]
    raise ValueError(
        f"Unrecognized configuration class ({config_class}) for {name}. Model name should be one of {', '.join(openllm.CONFIG_MAPPING.keys())} (Registered configuration class: {', '.join([i.__name__ for i in cls._model_mapping.keys()])})."
    )
|
||||
|
||||
def getattribute_from_module(module: types.ModuleType, attr: t.Any) -> t.Any:
  '''Resolve ``attr`` on ``module``, falling back to the top-level ``openllm`` package.

  ``attr`` may be None (returns None), a tuple of names (resolved element-wise),
  or a single attribute name.
  '''
  if attr is None:
    return None
  if isinstance(attr, tuple):
    return tuple(getattribute_from_module(module, name) for name in attr)
  if hasattr(module, attr):
    return getattr(module, attr)
  # Some of the mappings have entries model_type -> object of another model
  # type; in that case retry the lookup from the top-level package.
  openllm_module = importlib.import_module('openllm')
  if module != openllm_module:
    try:
      return getattribute_from_module(openllm_module, attr)
    except ValueError:
      raise ValueError(f'Could not find {attr} neither in {module} nor in {openllm_module}!') from None
  raise ValueError(f'Could not find {attr} in {openllm_module}!')
|
||||
|
||||
class _LazyAutoMapping(OrderedDict, ReprMixin):
  """Based on transformers.models.auto.configuration_auto._LazyAutoMapping.

  This OrderedDict values() and keys() returns the list instead, so you don't
  have to do list(mapping.values()) to get the list of values.
  """

  def __init__(self, config_mapping: OrderedDict[LiteralString, LiteralString], model_mapping: OrderedDict[LiteralString, LiteralString]):
    self._config_mapping = config_mapping
    # Reverse index: config class name -> model type identifier.
    self._reverse_config_mapping = {cfg_name: model_type for model_type, cfg_name in config_mapping.items()}
    self._model_mapping = model_mapping
    self._extra_content: dict[t.Any, t.Any] = {}
    # Cache of imported openllm.models.<model_type> submodules.
    self._modules: dict[str, types.ModuleType] = {}

  def __getitem__(self, key: 'type[openllm.LLMConfig]') -> 'type[openllm.LLM[t.Any, t.Any]]':
    if key in self._extra_content:
      return self._extra_content[key]
    model_type = self._reverse_config_mapping[key.__name__]
    if model_type in self._model_mapping:
      return self._load_attr_from_module(model_type, self._model_mapping[model_type])
    # Maybe there was several model types associated with this config.
    for candidate in (mtype for mtype, cfg_name in self._config_mapping.items() if cfg_name == key.__name__):
      if candidate in self._model_mapping:
        return self._load_attr_from_module(candidate, self._model_mapping[candidate])
    raise KeyError(key)

  def _load_attr_from_module(self, model_type: str, attr: str) -> t.Any:
    # Import (once) the submodule for this model type, then resolve the name.
    module_name = inflection.underscore(model_type)
    if module_name not in self._modules:
      self._modules[module_name] = importlib.import_module(f'.{module_name}', 'openllm.models')
    return getattribute_from_module(self._modules[module_name], attr)

  def __len__(self) -> int:
    shared = set(self._config_mapping.keys()).intersection(self._model_mapping.keys())
    return len(shared) + len(self._extra_content)

  @property
  def __repr_keys__(self) -> 'set[str]':
    return set(self._config_mapping.keys())

  def __repr__(self) -> str:
    return ReprMixin.__repr__(self)

  def __repr_args__(self) -> 't.Generator[tuple[str, tuple[str, str]], t.Any, t.Any]':
    for mtype, cfg_name in self._config_mapping.items():
      if mtype in self._model_mapping:
        yield mtype, (cfg_name, self._model_mapping[mtype])

  def __bool__(self) -> bool:
    return bool(self.keys())

  def keys(self) -> 'ConfigModelKeysView':
    loaded = [self._load_attr_from_module(mtype, cfg_name) for mtype, cfg_name in self._config_mapping.items() if mtype in self._model_mapping.keys()]
    return t.cast('ConfigModelKeysView', loaded + list(self._extra_content.keys()))

  def values(self) -> 'ConfigModelValuesView':
    loaded = [self._load_attr_from_module(mtype, impl_name) for mtype, impl_name in self._model_mapping.items() if mtype in self._config_mapping.keys()]
    return t.cast('ConfigModelValuesView', loaded + list(self._extra_content.values()))

  def items(self) -> 'ConfigModelItemsView':
    loaded = [(self._load_attr_from_module(mtype, self._config_mapping[mtype]), self._load_attr_from_module(mtype, self._model_mapping[mtype]))
              for mtype in self._model_mapping.keys()
              if mtype in self._config_mapping.keys()]
    return t.cast('ConfigModelItemsView', loaded + list(self._extra_content.items()))

  def __iter__(self) -> 't.Iterator[type[openllm.LLMConfig]]':
    return iter(t.cast('SupportsIter[t.Iterator[type[openllm.LLMConfig]]]', self.keys()))

  def __contains__(self, item: t.Any) -> bool:
    if item in self._extra_content:
      return True
    if not hasattr(item, '__name__') or item.__name__ not in self._reverse_config_mapping:
      return False
    return self._reverse_config_mapping[item.__name__] in self._model_mapping

  def register(self, key: t.Any, value: t.Any) -> None:
    # Reject keys that collide with a built-in model registration.
    if hasattr(key, '__name__') and key.__name__ in self._reverse_config_mapping:
      if self._reverse_config_mapping[key.__name__] in self._model_mapping.keys():
        raise ValueError(f"'{key}' is already used by a OpenLLM model.")
    self._extra_content[key] = value

__all__ = ['BaseAutoLLMClass', '_LazyAutoMapping']
|
||||
@@ -1,15 +0,0 @@
|
||||
from __future__ import annotations
|
||||
import typing as t
|
||||
from collections import OrderedDict
|
||||
|
||||
from openllm_core.config import CONFIG_MAPPING_NAMES
|
||||
|
||||
from .factory import BaseAutoLLMClass
|
||||
from .factory import _LazyAutoMapping
|
||||
|
||||
MODEL_MAPPING_NAMES = OrderedDict([('chatglm', 'ChatGLM'), ('dolly_v2', 'DollyV2'), ('falcon', 'Falcon'), ('flan_t5', 'FlanT5'), ('gpt_neox', 'GPTNeoX'), ('llama', 'Llama'), ('mpt', 'MPT'),
|
||||
('opt', 'OPT'), ('stablelm', 'StableLM'), ('starcoder', 'StarCoder'), ('baichuan', 'Baichuan')])
|
||||
MODEL_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_MAPPING_NAMES)
|
||||
|
||||
class AutoLLM(BaseAutoLLMClass):
|
||||
_model_mapping: t.ClassVar = MODEL_MAPPING
|
||||
@@ -1,14 +0,0 @@
|
||||
from __future__ import annotations
|
||||
import typing as t
|
||||
from collections import OrderedDict
|
||||
|
||||
from openllm_core.config import CONFIG_MAPPING_NAMES
|
||||
|
||||
from .factory import BaseAutoLLMClass
|
||||
from .factory import _LazyAutoMapping
|
||||
|
||||
MODEL_FLAX_MAPPING_NAMES = OrderedDict([('flan_t5', 'FlaxFlanT5'), ('opt', 'FlaxOPT')])
|
||||
MODEL_FLAX_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FLAX_MAPPING_NAMES)
|
||||
|
||||
class AutoFlaxLLM(BaseAutoLLMClass):
|
||||
_model_mapping: t.ClassVar = MODEL_FLAX_MAPPING
|
||||
@@ -1,14 +0,0 @@
|
||||
from __future__ import annotations
|
||||
import typing as t
|
||||
from collections import OrderedDict
|
||||
|
||||
from openllm_core.config import CONFIG_MAPPING_NAMES
|
||||
|
||||
from .factory import BaseAutoLLMClass
|
||||
from .factory import _LazyAutoMapping
|
||||
|
||||
MODEL_TF_MAPPING_NAMES = OrderedDict([('flan_t5', 'TFFlanT5'), ('opt', 'TFOPT')])
|
||||
MODEL_TF_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_TF_MAPPING_NAMES)
|
||||
|
||||
class AutoTFLLM(BaseAutoLLMClass):
|
||||
_model_mapping: t.ClassVar = MODEL_TF_MAPPING
|
||||
@@ -1,15 +0,0 @@
|
||||
from __future__ import annotations
|
||||
import typing as t
|
||||
from collections import OrderedDict
|
||||
|
||||
from openllm_core.config import CONFIG_MAPPING_NAMES
|
||||
|
||||
from .factory import BaseAutoLLMClass
|
||||
from .factory import _LazyAutoMapping
|
||||
|
||||
MODEL_VLLM_MAPPING_NAMES = OrderedDict([('baichuan', 'VLLMBaichuan'), ('dolly_v2', 'VLLMDollyV2'), ('falcon', 'VLLMFalcon'), ('gpt_neox', 'VLLMGPTNeoX'), ('mpt', 'VLLMMPT'),
|
||||
('opt', 'VLLMOPT'), ('stablelm', 'VLLMStableLM'), ('starcoder', 'VLLMStarCoder'), ('llama', 'VLLMLlama')])
|
||||
MODEL_VLLM_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_VLLM_MAPPING_NAMES)
|
||||
|
||||
class AutoVLLM(BaseAutoLLMClass):
|
||||
_model_mapping: t.ClassVar = MODEL_VLLM_MAPPING
|
||||
@@ -1,37 +0,0 @@
|
||||
from __future__ import annotations
|
||||
import sys
|
||||
import typing as t
|
||||
|
||||
from openllm.exceptions import MissingDependencyError
|
||||
from openllm.utils import LazyModule
|
||||
from openllm.utils import is_cpm_kernels_available
|
||||
from openllm.utils import is_torch_available
|
||||
from openllm.utils import is_vllm_available
|
||||
from openllm_core.config.configuration_baichuan import DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE
|
||||
from openllm_core.config.configuration_baichuan import START_BAICHUAN_COMMAND_DOCSTRING as START_BAICHUAN_COMMAND_DOCSTRING
|
||||
from openllm_core.config.configuration_baichuan import BaichuanConfig as BaichuanConfig
|
||||
|
||||
_import_structure: dict[str, list[str]] = {}
|
||||
try:
|
||||
if not is_torch_available() or not is_cpm_kernels_available(): raise MissingDependencyError
|
||||
except MissingDependencyError:
|
||||
pass
|
||||
else:
|
||||
_import_structure['modeling_baichuan'] = ['Baichuan']
|
||||
if t.TYPE_CHECKING: from .modeling_baichuan import Baichuan as Baichuan
|
||||
try:
|
||||
if not is_vllm_available(): raise MissingDependencyError
|
||||
except MissingDependencyError:
|
||||
pass
|
||||
else:
|
||||
_import_structure['modeling_vllm_baichuan'] = ['VLLMBaichuan']
|
||||
if t.TYPE_CHECKING: from .modeling_vllm_baichuan import VLLMBaichuan as VLLMBaichuan
|
||||
|
||||
sys.modules[__name__] = LazyModule(__name__,
|
||||
globals()['__file__'],
|
||||
_import_structure,
|
||||
extra_objects={
|
||||
'DEFAULT_PROMPT_TEMPLATE': DEFAULT_PROMPT_TEMPLATE,
|
||||
'START_BAICHUAN_COMMAND_DOCSTRING': START_BAICHUAN_COMMAND_DOCSTRING,
|
||||
'BaichuanConfig': BaichuanConfig
|
||||
})
|
||||
@@ -1,15 +0,0 @@
|
||||
from __future__ import annotations
|
||||
import typing as t
|
||||
|
||||
import openllm
|
||||
if t.TYPE_CHECKING: import transformers
|
||||
|
||||
class Baichuan(openllm.LLM['transformers.PreTrainedModel', 'transformers.PreTrainedTokenizerBase']):
  """PyTorch-backed Baichuan implementation (see class type parameters)."""
  __openllm_internal__ = True

  def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
    '''Tokenize ``prompt``, run model.generate under inference mode, and decode.'''
    import torch
    encoded = self.tokenizer(prompt, return_tensors='pt').to(self.device)
    # fp16 autocast for CUDA; inference_mode disables autograd bookkeeping.
    with torch.inference_mode(), torch.autocast('cuda', dtype=torch.float16):  # type: ignore[attr-defined]
      generated = self.model.generate(**encoded, generation_config=self.config.model_construct_env(**attrs).to_generation_config())
    return self.tokenizer.batch_decode(generated, skip_special_tokens=True)
|
||||
@@ -1,9 +0,0 @@
|
||||
from __future__ import annotations
|
||||
import typing as t
|
||||
|
||||
import openllm
|
||||
if t.TYPE_CHECKING: import vllm, transformers
|
||||
|
||||
class VLLMBaichuan(openllm.LLM['vllm.LLMEngine', 'transformers.PreTrainedTokenizerBase']):
  '''Baichuan implementation backed by a vLLM engine (see class type parameters).'''
  # Flag read by OpenLLM machinery for first-party implementations.
  # NOTE(review): semantics inferred from naming — confirm against openllm.LLM.
  __openllm_internal__ = True
  # 'local' — presumably load the tokenizer from the local model artifacts;
  # verify against openllm.LLM tokenizer resolution.
  tokenizer_id = 'local'
|
||||
@@ -1,29 +0,0 @@
|
||||
from __future__ import annotations
|
||||
import sys
|
||||
import typing as t
|
||||
|
||||
from openllm.exceptions import MissingDependencyError
|
||||
from openllm.utils import LazyModule
|
||||
from openllm.utils import is_cpm_kernels_available
|
||||
from openllm.utils import is_torch_available
|
||||
from openllm_core.config.configuration_chatglm import DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE
|
||||
from openllm_core.config.configuration_chatglm import START_CHATGLM_COMMAND_DOCSTRING as START_CHATGLM_COMMAND_DOCSTRING
|
||||
from openllm_core.config.configuration_chatglm import ChatGLMConfig as ChatGLMConfig
|
||||
|
||||
_import_structure: dict[str, list[str]] = {}
|
||||
try:
|
||||
if not is_torch_available() or not is_cpm_kernels_available(): raise MissingDependencyError
|
||||
except MissingDependencyError:
|
||||
pass
|
||||
else:
|
||||
_import_structure['modeling_chatglm'] = ['ChatGLM']
|
||||
if t.TYPE_CHECKING: from .modeling_chatglm import ChatGLM as ChatGLM
|
||||
|
||||
sys.modules[__name__] = LazyModule(__name__,
|
||||
globals()['__file__'],
|
||||
_import_structure,
|
||||
extra_objects={
|
||||
'DEFAULT_PROMPT_TEMPLATE': DEFAULT_PROMPT_TEMPLATE,
|
||||
'START_CHATGLM_COMMAND_DOCSTRING': START_CHATGLM_COMMAND_DOCSTRING,
|
||||
'ChatGLMConfig': ChatGLMConfig
|
||||
})
|
||||
@@ -1,17 +0,0 @@
|
||||
from __future__ import annotations
|
||||
import typing as t
|
||||
|
||||
import openllm
|
||||
if t.TYPE_CHECKING:
|
||||
import transformers
|
||||
|
||||
class ChatGLM(openllm.LLM['transformers.PreTrainedModel', 'transformers.PreTrainedTokenizerFast']):
  """PyTorch ChatGLM implementation driven through the model's chat() API."""
  __openllm_internal__ = True

  def generate(self, prompt: str, **attrs: t.Any) -> tuple[str, list[tuple[str, str]]]:
    '''Run the model's chat() under inference mode and return its result.'''
    import torch
    with torch.inference_mode():
      self.model.eval()
      # Only use half precision if the model is not yet quantized
      if self.config.use_half_precision:
        self.model.half()
      generation_config = self.config.model_construct_env(**attrs).to_generation_config()
      return self.model.chat(self.tokenizer, prompt, generation_config=generation_config)
|
||||
@@ -1,36 +0,0 @@
|
||||
from __future__ import annotations
|
||||
import sys
|
||||
import typing as t
|
||||
|
||||
from openllm.exceptions import MissingDependencyError
|
||||
from openllm.utils import LazyModule
|
||||
from openllm.utils import is_torch_available
|
||||
from openllm.utils import is_vllm_available
|
||||
from openllm_core.config.configuration_dolly_v2 import DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE
|
||||
from openllm_core.config.configuration_dolly_v2 import START_DOLLY_V2_COMMAND_DOCSTRING as START_DOLLY_V2_COMMAND_DOCSTRING
|
||||
from openllm_core.config.configuration_dolly_v2 import DollyV2Config as DollyV2Config
|
||||
|
||||
_import_structure: dict[str, list[str]] = {}
|
||||
try:
|
||||
if not is_torch_available(): raise MissingDependencyError
|
||||
except MissingDependencyError:
|
||||
pass
|
||||
else:
|
||||
_import_structure['modeling_dolly_v2'] = ['DollyV2']
|
||||
if t.TYPE_CHECKING: from .modeling_dolly_v2 import DollyV2 as DollyV2
|
||||
try:
|
||||
if not is_vllm_available(): raise MissingDependencyError
|
||||
except MissingDependencyError:
|
||||
pass
|
||||
else:
|
||||
_import_structure['modeling_vllm_dolly_v2'] = ['VLLMDollyV2']
|
||||
if t.TYPE_CHECKING: from .modeling_vllm_dolly_v2 import VLLMDollyV2 as VLLMDollyV2
|
||||
|
||||
sys.modules[__name__] = LazyModule(__name__,
|
||||
globals()['__file__'],
|
||||
_import_structure,
|
||||
extra_objects={
|
||||
'DEFAULT_PROMPT_TEMPLATE': DEFAULT_PROMPT_TEMPLATE,
|
||||
'START_DOLLY_V2_COMMAND_DOCSTRING': START_DOLLY_V2_COMMAND_DOCSTRING,
|
||||
'DollyV2Config': DollyV2Config
|
||||
})
|
||||
@@ -1,141 +0,0 @@
|
||||
from __future__ import annotations
|
||||
import logging
|
||||
import re
|
||||
import typing as t
|
||||
|
||||
import openllm
|
||||
from openllm_core._typing_compat import overload
|
||||
from openllm_core.config.configuration_dolly_v2 import DEFAULT_PROMPT_TEMPLATE
|
||||
from openllm_core.config.configuration_dolly_v2 import END_KEY
|
||||
from openllm_core.config.configuration_dolly_v2 import RESPONSE_KEY
|
||||
from openllm_core.config.configuration_dolly_v2 import get_special_token_id
|
||||
if t.TYPE_CHECKING: import torch, transformers, tensorflow as tf
|
||||
else:
|
||||
torch, transformers, tf = openllm.utils.LazyLoader('torch', globals(), 'torch'), openllm.utils.LazyLoader('transformers', globals(),
|
||||
'transformers'), openllm.utils.LazyLoader('tf', globals(), 'tensorflow')
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@overload
def get_pipeline(model: transformers.PreTrainedModel, tokenizer: transformers.PreTrainedTokenizer, _init: t.Literal[True] = True, **attrs: t.Any) -> transformers.Pipeline:
  ...

@overload
def get_pipeline(model: transformers.PreTrainedModel, tokenizer: transformers.PreTrainedTokenizer, _init: t.Literal[False] = ..., **attrs: t.Any) -> type[transformers.Pipeline]:
  ...

def get_pipeline(model: transformers.PreTrainedModel, tokenizer: transformers.PreTrainedTokenizer, _init: bool = False, **attrs: t.Any) -> type[transformers.Pipeline] | transformers.Pipeline:
  '''Build the Dolly-v2 instruction-following pipeline (or its class).

  The pipeline class is defined inside this function so that ``model`` and
  ``tokenizer`` are captured by closure and injected into every instance.

  Args:
    model: Loaded causal-LM model the pipeline will drive.
    tokenizer: Tokenizer matching ``model``.
    _init: When True return an instance, otherwise return the class itself.
    **attrs: NOTE(review): not referenced in this body — confirm intended.

  Returns:
    ``InstructionTextGenerationPipeline`` instance if ``_init`` else the class.
  '''
  # Lazy loading the pipeline. See databricks' implementation on HuggingFace for more information.
  class InstructionTextGenerationPipeline(transformers.Pipeline):
    def __init__(self, *args: t.Any, do_sample: bool = True, max_new_tokens: int = 256, top_p: float = 0.92, top_k: int = 0, **kwargs: t.Any):
      # model/tokenizer come from the enclosing get_pipeline() call.
      super().__init__(*args, model=model, tokenizer=tokenizer, do_sample=do_sample, max_new_tokens=max_new_tokens, top_p=top_p, top_k=top_k, **kwargs)

    def _sanitize_parameters(self, return_full_text: bool | None = None, **generate_kwargs: t.Any) -> tuple[dict[str, t.Any], dict[str, t.Any], dict[str, t.Any]]:
      # Split kwargs into the (preprocess, forward, postprocess) dicts required
      # by the transformers.Pipeline contract.
      if t.TYPE_CHECKING: assert self.tokenizer is not None
      preprocess_params: dict[str, t.Any] = {}
      # newer versions of the tokenizer configure the response key as a special token. newer versions still may
      # append a newline to yield a single token. find whatever token is configured for the response key.
      tokenizer_response_key = next((token for token in self.tokenizer.additional_special_tokens if token.startswith(RESPONSE_KEY)), None)
      response_key_token_id = None
      end_key_token_id = None
      if tokenizer_response_key:
        try:
          response_key_token_id = get_special_token_id(self.tokenizer, tokenizer_response_key)
          end_key_token_id = get_special_token_id(self.tokenizer, END_KEY)
          # Ensure generation stops once it generates "### End"
          generate_kwargs['eos_token_id'] = end_key_token_id
        except ValueError:
          # Special tokens unavailable; postprocess falls back to regex parsing.
          pass
      forward_params = generate_kwargs
      postprocess_params = {'response_key_token_id': response_key_token_id, 'end_key_token_id': end_key_token_id}
      if return_full_text is not None: postprocess_params['return_full_text'] = return_full_text
      return preprocess_params, forward_params, postprocess_params

    def preprocess(self, input_: str, **generate_kwargs: t.Any) -> t.Dict[str, t.Any]:
      # Wrap the raw instruction in the Dolly prompt template and tokenize it.
      if t.TYPE_CHECKING: assert self.tokenizer is not None
      prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=input_)
      inputs = self.tokenizer(prompt_text, return_tensors='pt')
      # Carry the raw texts through to _forward/postprocess.
      inputs['prompt_text'] = prompt_text
      inputs['instruction_text'] = input_
      return t.cast(t.Dict[str, t.Any], inputs)

    def _forward(self, input_tensors: dict[str, t.Any], **generate_kwargs: t.Any) -> transformers.utils.generic.ModelOutput:
      if t.TYPE_CHECKING: assert self.tokenizer is not None
      input_ids, attention_mask = input_tensors['input_ids'], input_tensors.get('attention_mask', None)
      # Empty prompt: run generate() unconditioned with a nominal batch of 1.
      if input_ids.shape[1] == 0: input_ids, attention_mask, in_b = None, None, 1
      else: in_b = input_ids.shape[0]
      generated_sequence = self.model.generate(input_ids=input_ids.to(self.model.device) if input_ids is not None else None,
                                               attention_mask=attention_mask.to(self.model.device) if attention_mask is not None else None,
                                               pad_token_id=self.tokenizer.pad_token_id,
                                               **generate_kwargs)
      out_b = generated_sequence.shape[0]
      # Regroup the flat output rows into (batch, num_return_sequences, ...).
      if self.framework == 'pt':
        generated_sequence = generated_sequence.reshape(in_b, out_b // in_b, *generated_sequence.shape[1:])
      elif self.framework == 'tf':
        generated_sequence = tf.reshape(generated_sequence, (in_b, out_b // in_b, *generated_sequence.shape[1:]))
      instruction_text = input_tensors.pop('instruction_text')
      return {'generated_sequence': generated_sequence, 'input_ids': input_ids, 'instruction_text': instruction_text}

    def postprocess(self, model_outputs: dict[str, t.Any], response_key_token_id: int, end_key_token_id: int, return_full_text: bool = False) -> list[dict[t.Literal['generated_text'], str]]:
      # Extract the text between "### Response:" and "### End" from each
      # generated sequence, preferring token-id positions over regex.
      if t.TYPE_CHECKING: assert self.tokenizer is not None
      _generated_sequence, instruction_text = model_outputs['generated_sequence'][0], model_outputs['instruction_text']
      generated_sequence: list[list[int]] = _generated_sequence.numpy().tolist()
      records: list[dict[t.Literal['generated_text'], str]] = []
      for sequence in generated_sequence:
        # The response will be set to this variable if we can identify it.
        decoded = None
        # If we have token IDs for the response and end, then we can find the tokens and only decode between them.
        if response_key_token_id and end_key_token_id:
          # Find where "### Response:" is first found in the generated tokens. Considering this is part of the
          # prompt, we should definitely find it. We will return the tokens found after this token.
          try:
            response_pos = sequence.index(response_key_token_id)
          except ValueError:
            response_pos = None
          if response_pos is None:
            logger.warning('Could not find response key %s in: %s', response_key_token_id, sequence)
          if response_pos:
            # Next find where "### End" is located. The model has been trained to end its responses with this
            # sequence (or actually, the token ID it maps to, since it is a special token). We may not find
            # this token, as the response could be truncated. If we don't find it then just return everything
            # to the end. Note that even though we set eos_token_id, we still see the this token at the end.
            try:
              end_pos = sequence.index(end_key_token_id)
            except ValueError:
              end_pos = None
            decoded = self.tokenizer.decode(sequence[response_pos + 1:end_pos]).strip()
        if not decoded:
          # Otherwise we'll decode everything and use a regex to find the response and end.
          fully_decoded = self.tokenizer.decode(sequence)
          # The response appears after "### Response:". The model has been trained to append "### End" at the
          # end.
          m = re.search(r'#+\s*Response:\s*(.+?)#+\s*End', fully_decoded, flags=re.DOTALL)
          if m: decoded = m.group(1).strip()
          else:
            # The model might not generate the "### End" sequence before reaching the max tokens. In this case,
            # return everything after "### Response:".
            m = re.search(r'#+\s*Response:\s*(.+)', fully_decoded, flags=re.DOTALL)
            if m: decoded = m.group(1).strip()
            else: logger.warning('Failed to find response in:\n%s', fully_decoded)
        # If the full text is requested, then append the decoded text to the original instruction.
        # This technically isn't the full text, as we format the instruction in the prompt the model has been
        # trained on, but to the client it will appear to be the full text.
        if return_full_text: decoded = f'{instruction_text}\n{decoded}'
        records.append({'generated_text': t.cast(str, decoded)})
      return records

  return InstructionTextGenerationPipeline() if _init else InstructionTextGenerationPipeline
|
||||
|
||||
class DollyV2(openllm.LLM['transformers.Pipeline', 'transformers.PreTrainedTokenizer']):
  """Dolly-v2 implementation wrapping the custom instruction pipeline."""
  __openllm_internal__ = True

  @property
  def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]:
    # Shard across GPUs only when more than one is visible; bf16 weights.
    multi_gpu = torch.cuda.is_available() and torch.cuda.device_count() > 1
    return {'device_map': 'auto' if multi_gpu else None, 'torch_dtype': torch.bfloat16}, {}

  def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.Pipeline:
    '''Load the causal LM from the bento store and wrap it in the pipeline.'''
    base_model = transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, *args, **attrs)
    return get_pipeline(base_model, self.tokenizer, _init=True, return_full_text=self.config.return_full_text)

  def generate(self, prompt: str, **attrs: t.Any) -> list[dict[t.Literal['generated_text'], str]]:
    '''Run the instruction pipeline on ``prompt`` under inference mode.'''
    env_config = self.config.model_construct_env(**attrs)
    with torch.inference_mode():
      return self.model(prompt, return_full_text=env_config.return_full_text, generation_config=env_config.to_generation_config())
|
||||
@@ -1,12 +0,0 @@
|
||||
from __future__ import annotations
|
||||
import logging
|
||||
import typing as t
|
||||
|
||||
import openllm
|
||||
if t.TYPE_CHECKING: import vllm, transformers
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class VLLMDollyV2(openllm.LLM['vllm.LLMEngine', 'transformers.PreTrainedTokenizer']):
  """vLLM implementation of Dolly-v2; generation is handled by the shared vLLM runner."""
  # Marks this as a first-party OpenLLM implementation.
  __openllm_internal__ = True
  # Load the tokenizer from the locally saved model artifact rather than the hub.
  tokenizer_id = 'local'
|
||||
@@ -1,36 +0,0 @@
|
||||
from __future__ import annotations
|
||||
import sys
|
||||
import typing as t
|
||||
|
||||
from openllm.exceptions import MissingDependencyError
|
||||
from openllm.utils import LazyModule
|
||||
from openllm.utils import is_torch_available
|
||||
from openllm.utils import is_vllm_available
|
||||
from openllm_core.config.configuration_falcon import DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE
|
||||
from openllm_core.config.configuration_falcon import START_FALCON_COMMAND_DOCSTRING as START_FALCON_COMMAND_DOCSTRING
|
||||
from openllm_core.config.configuration_falcon import FalconConfig as FalconConfig
|
||||
|
||||
# Register only the Falcon implementations whose backend is actually installed.
_import_structure: dict[str, list[str]] = {}
try:
  # PyTorch backend
  if not is_torch_available(): raise MissingDependencyError
except MissingDependencyError:
  pass  # torch missing: skip the PyTorch implementation
else:
  _import_structure['modeling_falcon'] = ['Falcon']
  if t.TYPE_CHECKING: from .modeling_falcon import Falcon as Falcon
try:
  # vLLM backend
  if not is_vllm_available(): raise MissingDependencyError
except MissingDependencyError:
  pass  # vllm missing: skip the vLLM implementation
else:
  _import_structure['modeling_vllm_falcon'] = ['VLLMFalcon']
  if t.TYPE_CHECKING: from .modeling_vllm_falcon import VLLMFalcon as VLLMFalcon

# Replace this module object so heavy backends are imported on first attribute access.
sys.modules[__name__] = LazyModule(__name__,
                                   globals()['__file__'],
                                   _import_structure,
                                   extra_objects={
                                       'DEFAULT_PROMPT_TEMPLATE': DEFAULT_PROMPT_TEMPLATE,
                                       'START_FALCON_COMMAND_DOCSTRING': START_FALCON_COMMAND_DOCSTRING,
                                       'FalconConfig': FalconConfig
                                   })
|
||||
@@ -1,22 +0,0 @@
|
||||
from __future__ import annotations
|
||||
import typing as t
|
||||
|
||||
import openllm
|
||||
if t.TYPE_CHECKING: import torch, transformers
|
||||
else:
|
||||
torch, transformers = openllm.utils.LazyLoader('torch', globals(), 'torch'), openllm.utils.LazyLoader('transformers', globals(), 'transformers')
|
||||
|
||||
class Falcon(openllm.LLM['transformers.PreTrainedModel', 'transformers.PreTrainedTokenizerBase']):
  """PyTorch implementation of Falcon."""
  # Marks this as a first-party OpenLLM implementation.
  __openllm_internal__ = True

  @property
  def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]:
    # bfloat16 weights; spread across GPUs only when more than one is present.
    multi_gpu = torch.cuda.is_available() and torch.cuda.device_count() > 1
    model_kwargs = {'torch_dtype': torch.bfloat16, 'device_map': 'auto' if multi_gpu else None}
    return model_kwargs, {}

  def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
    """Generate completions for ``prompt`` and return them with special tokens stripped."""
    eos_token_id = attrs.pop('eos_token_id', self.tokenizer.eos_token_id)
    encoded = self.tokenizer(prompt, return_tensors='pt').to(self.device)
    generation_config = self.config.model_construct_env(eos_token_id=eos_token_id, **attrs).to_generation_config()
    with torch.inference_mode(), torch.autocast('cuda', dtype=torch.float16):  # type: ignore[attr-defined]
      sequences = self.model.generate(input_ids=encoded['input_ids'],
                                      attention_mask=encoded['attention_mask'],
                                      generation_config=generation_config)
      return self.tokenizer.batch_decode(sequences, skip_special_tokens=True)
|
||||
@@ -1,12 +0,0 @@
|
||||
from __future__ import annotations
|
||||
import logging
|
||||
import typing as t
|
||||
|
||||
import openllm
|
||||
if t.TYPE_CHECKING: import vllm, transformers
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class VLLMFalcon(openllm.LLM['vllm.LLMEngine', 'transformers.PreTrainedTokenizerBase']):
  """vLLM implementation of Falcon; generation is handled by the shared vLLM runner."""
  # Marks this as a first-party OpenLLM implementation.
  __openllm_internal__ = True
  # Load the tokenizer from the locally saved model artifact rather than the hub.
  tokenizer_id = 'local'
|
||||
@@ -1,37 +0,0 @@
|
||||
from __future__ import annotations
|
||||
import sys
|
||||
import typing as t
|
||||
|
||||
from openllm.exceptions import MissingDependencyError
|
||||
from openllm.utils import LazyModule
|
||||
from openllm.utils import is_flax_available
|
||||
from openllm.utils import is_tf_available
|
||||
from openllm.utils import is_torch_available
|
||||
from openllm_core.config.configuration_flan_t5 import DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE
|
||||
from openllm_core.config.configuration_flan_t5 import START_FLAN_T5_COMMAND_DOCSTRING as START_FLAN_T5_COMMAND_DOCSTRING
|
||||
from openllm_core.config.configuration_flan_t5 import FlanT5Config as FlanT5Config
|
||||
|
||||
# Register only the FLAN-T5 implementations whose backend is actually installed.
_import_structure: dict[str, list[str]] = {}
try:
  # PyTorch backend
  if not is_torch_available(): raise MissingDependencyError
except MissingDependencyError:
  pass  # torch missing: skip the PyTorch implementation
else:
  _import_structure['modeling_flan_t5'] = ['FlanT5']
  if t.TYPE_CHECKING: from .modeling_flan_t5 import FlanT5 as FlanT5
try:
  # Flax backend
  if not is_flax_available(): raise MissingDependencyError
except MissingDependencyError:
  pass  # flax missing: skip the Flax implementation
else:
  _import_structure['modeling_flax_flan_t5'] = ['FlaxFlanT5']
  if t.TYPE_CHECKING: from .modeling_flax_flan_t5 import FlaxFlanT5 as FlaxFlanT5
try:
  # TensorFlow backend
  if not is_tf_available(): raise MissingDependencyError
except MissingDependencyError:
  pass  # tensorflow missing: skip the TF implementation
else:
  _import_structure['modeling_tf_flan_t5'] = ['TFFlanT5']
  if t.TYPE_CHECKING: from .modeling_tf_flan_t5 import TFFlanT5 as TFFlanT5

# Replace this module object so heavy backends are imported on first attribute access.
sys.modules[__name__] = LazyModule(__name__, globals()['__file__'], _import_structure)
|
||||
@@ -1,17 +0,0 @@
|
||||
from __future__ import annotations
|
||||
import typing as t
|
||||
|
||||
import openllm
|
||||
if t.TYPE_CHECKING:
|
||||
import transformers
|
||||
|
||||
class FlanT5(openllm.LLM['transformers.T5ForConditionalGeneration', 'transformers.T5TokenizerFast']):
  """PyTorch implementation of FLAN-T5."""
  # Marks this as a first-party OpenLLM implementation.
  __openllm_internal__ = True

  def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
    """Sample completions for ``prompt`` and return the decoded strings."""
    import torch
    encoded = self.tokenizer(prompt, return_tensors='pt').to(self.device)
    generation_config = self.config.model_construct_env(**attrs).to_generation_config()
    with torch.inference_mode():
      sequences = self.model.generate(**encoded, do_sample=True, generation_config=generation_config)
      return self.tokenizer.batch_decode(sequences, skip_special_tokens=True)
|
||||
@@ -1,40 +0,0 @@
|
||||
from __future__ import annotations
|
||||
import typing as t
|
||||
|
||||
import openllm
|
||||
from openllm_core._prompt import process_prompt
|
||||
from openllm_core.config.configuration_flan_t5 import DEFAULT_PROMPT_TEMPLATE
|
||||
if t.TYPE_CHECKING: import transformers
|
||||
|
||||
class FlaxFlanT5(openllm.LLM['transformers.FlaxT5ForConditionalGeneration', 'transformers.T5TokenizerFast']):
  """Flax implementation of FLAN-T5."""
  # Marks this as a first-party OpenLLM implementation.
  __openllm_internal__ = True

  def sanitize_parameters(self,
                          prompt: str,
                          max_new_tokens: int | None = None,
                          temperature: float | None = None,
                          top_k: int | None = None,
                          top_p: float | None = None,
                          repetition_penalty: float | None = None,
                          decoder_start_token_id: int | None = None,
                          use_default_prompt_template: bool = True,
                          **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
    """Normalize user inputs into (prompt, generate_kwargs, postprocess_kwargs)."""
    generate_kwargs = {
        'max_new_tokens': max_new_tokens,
        'temperature': temperature,
        'top_k': top_k,
        'top_p': top_p,
        'repetition_penalty': repetition_penalty,
        # 0 is the conventional decoder start token for the T5 family.
        'decoder_start_token_id': 0 if decoder_start_token_id is None else decoder_start_token_id
    }
    sanitized = process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs)
    return sanitized, generate_kwargs, {}

  def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
    """Sample completions for ``prompt`` with the Flax model and decode them."""
    # NOTE: decoder_start_token_id is extracted from https://huggingface.co/google/flan-t5-small/tree/main as it is required for encoder-decoder generation.
    start_token = attrs.pop('decoder_start_token_id', 0)
    input_ids = self.tokenizer(prompt, return_tensors='np')['input_ids']
    outputs = self.model.generate(input_ids,
                                  do_sample=True,
                                  generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
                                  decoder_start_token_id=start_token)
    return self.tokenizer.batch_decode(outputs.sequences, skip_special_tokens=True, clean_up_tokenization_spaces=True)
|
||||
@@ -1,14 +0,0 @@
|
||||
from __future__ import annotations
|
||||
import typing as t
|
||||
|
||||
import openllm
|
||||
if t.TYPE_CHECKING: import transformers
|
||||
|
||||
class TFFlanT5(openllm.LLM['transformers.TFT5ForConditionalGeneration', 'transformers.T5TokenizerFast']):
  """TensorFlow implementation of FLAN-T5."""
  # Marks this as a first-party OpenLLM implementation.
  __openllm_internal__ = True

  def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
    """Sample completions for ``prompt`` and return the decoded strings."""
    input_ids = self.tokenizer(prompt, return_tensors='tf').input_ids
    generation_config = self.config.model_construct_env(**attrs).to_generation_config()
    sequences = self.model.generate(input_ids, do_sample=True, generation_config=generation_config)
    return self.tokenizer.batch_decode(sequences, skip_special_tokens=True)
|
||||
@@ -1,36 +0,0 @@
|
||||
from __future__ import annotations
|
||||
import sys
|
||||
import typing as t
|
||||
|
||||
from openllm.exceptions import MissingDependencyError
|
||||
from openllm.utils import LazyModule
|
||||
from openllm.utils import is_torch_available
|
||||
from openllm.utils import is_vllm_available
|
||||
from openllm_core.config.configuration_gpt_neox import DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE
|
||||
from openllm_core.config.configuration_gpt_neox import START_GPT_NEOX_COMMAND_DOCSTRING as START_GPT_NEOX_COMMAND_DOCSTRING
|
||||
from openllm_core.config.configuration_gpt_neox import GPTNeoXConfig as GPTNeoXConfig
|
||||
|
||||
# Register only the GPT-NeoX implementations whose backend is actually installed.
_import_structure: dict[str, list[str]] = {}
try:
  # PyTorch backend
  if not is_torch_available(): raise MissingDependencyError
except MissingDependencyError:
  pass  # torch missing: skip the PyTorch implementation
else:
  _import_structure['modeling_gpt_neox'] = ['GPTNeoX']
  if t.TYPE_CHECKING: from .modeling_gpt_neox import GPTNeoX as GPTNeoX
try:
  # vLLM backend
  if not is_vllm_available(): raise MissingDependencyError
except MissingDependencyError:
  pass  # vllm missing: skip the vLLM implementation
else:
  _import_structure['modeling_vllm_gpt_neox'] = ['VLLMGPTNeoX']
  if t.TYPE_CHECKING: from .modeling_vllm_gpt_neox import VLLMGPTNeoX as VLLMGPTNeoX

# Replace this module object so heavy backends are imported on first attribute access.
sys.modules[__name__] = LazyModule(__name__,
                                   globals()['__file__'],
                                   _import_structure,
                                   extra_objects={
                                       'DEFAULT_PROMPT_TEMPLATE': DEFAULT_PROMPT_TEMPLATE,
                                       'START_GPT_NEOX_COMMAND_DOCSTRING': START_GPT_NEOX_COMMAND_DOCSTRING,
                                       'GPTNeoXConfig': GPTNeoXConfig
                                   })
|
||||
@@ -1,16 +0,0 @@
|
||||
from __future__ import annotations
|
||||
import logging
|
||||
import typing as t
|
||||
|
||||
import openllm
|
||||
if t.TYPE_CHECKING: import transformers
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class GPTNeoX(openllm.LLM['transformers.GPTNeoXForCausalLM', 'transformers.GPTNeoXTokenizerFast']):
  """PyTorch implementation of GPT-NeoX."""
  # Marks this as a first-party OpenLLM implementation.
  __openllm_internal__ = True

  @property
  def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]:
    # Spread weights across GPUs only when more than one is present.
    import torch
    multi_gpu = torch.cuda.is_available() and torch.cuda.device_count() > 1
    return {'device_map': 'auto' if multi_gpu else None}, {}
|
||||
@@ -1,9 +0,0 @@
|
||||
from __future__ import annotations
|
||||
import typing as t
|
||||
|
||||
import openllm
|
||||
if t.TYPE_CHECKING: import vllm, transformers
|
||||
|
||||
class VLLMGPTNeoX(openllm.LLM['vllm.LLMEngine', 'transformers.GPTNeoXTokenizerFast']):
  """vLLM implementation of GPT-NeoX; generation is handled by the shared vLLM runner."""
  # Marks this as a first-party OpenLLM implementation.
  __openllm_internal__ = True
  # Load the tokenizer from the locally saved model artifact rather than the hub.
  tokenizer_id = 'local'
|
||||
@@ -1,38 +0,0 @@
|
||||
from __future__ import annotations
|
||||
import sys
|
||||
import typing as t
|
||||
|
||||
from openllm.exceptions import MissingDependencyError
|
||||
from openllm.utils import LazyModule
|
||||
from openllm.utils import is_torch_available
|
||||
from openllm.utils import is_vllm_available
|
||||
from openllm_core.config.configuration_llama import DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE
|
||||
from openllm_core.config.configuration_llama import PROMPT_MAPPING as PROMPT_MAPPING
|
||||
from openllm_core.config.configuration_llama import START_LLAMA_COMMAND_DOCSTRING as START_LLAMA_COMMAND_DOCSTRING
|
||||
from openllm_core.config.configuration_llama import LlamaConfig as LlamaConfig
|
||||
|
||||
# Register only the LLaMA implementations whose backend is actually installed.
_import_structure: dict[str, list[str]] = {}
try:
  # vLLM backend
  if not is_vllm_available(): raise MissingDependencyError
except MissingDependencyError:
  pass  # vllm missing: skip the vLLM implementation
else:
  _import_structure['modeling_vllm_llama'] = ['VLLMLlama']
  if t.TYPE_CHECKING: from .modeling_vllm_llama import VLLMLlama as VLLMLlama
try:
  # PyTorch backend
  if not is_torch_available(): raise MissingDependencyError
except MissingDependencyError:
  pass  # torch missing: skip the PyTorch implementation
else:
  _import_structure['modeling_llama'] = ['Llama']
  if t.TYPE_CHECKING: from .modeling_llama import Llama as Llama

# Replace this module object so heavy backends are imported on first attribute access.
sys.modules[__name__] = LazyModule(__name__,
                                   globals()['__file__'],
                                   _import_structure,
                                   extra_objects={
                                       'DEFAULT_PROMPT_TEMPLATE': DEFAULT_PROMPT_TEMPLATE,
                                       'START_LLAMA_COMMAND_DOCSTRING': START_LLAMA_COMMAND_DOCSTRING,
                                       'LlamaConfig': LlamaConfig,
                                       'PROMPT_MAPPING': PROMPT_MAPPING
                                   })
|
||||
@@ -1,14 +0,0 @@
|
||||
from __future__ import annotations
|
||||
import typing as t
|
||||
|
||||
import openllm
|
||||
if t.TYPE_CHECKING:
|
||||
import transformers
|
||||
|
||||
class Llama(openllm.LLM['transformers.LlamaForCausalLM', 'transformers.LlamaTokenizerFast']):
  """PyTorch implementation of LLaMA."""
  # Marks this as a first-party OpenLLM implementation.
  __openllm_internal__ = True

  @property
  def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]:
    # Half precision on GPU, full precision on CPU.
    import torch
    dtype = torch.float16 if torch.cuda.is_available() else torch.float32
    return {'torch_dtype': dtype}, {}
|
||||
@@ -1,8 +0,0 @@
|
||||
from __future__ import annotations
|
||||
import typing as t
|
||||
|
||||
import openllm
|
||||
if t.TYPE_CHECKING: import vllm, transformers
|
||||
|
||||
class VLLMLlama(openllm.LLM['vllm.LLMEngine', 'transformers.LlamaTokenizerFast']):
  """vLLM implementation of LLaMA; generation is handled by the shared vLLM runner."""
  # Marks this as a first-party OpenLLM implementation.
  __openllm_internal__ = True
||||
@@ -1,38 +0,0 @@
|
||||
from __future__ import annotations
|
||||
import sys
|
||||
import typing as t
|
||||
|
||||
from openllm.exceptions import MissingDependencyError
|
||||
from openllm.utils import LazyModule
|
||||
from openllm.utils import is_torch_available
|
||||
from openllm.utils import is_vllm_available
|
||||
from openllm_core.config.configuration_mpt import DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE
|
||||
from openllm_core.config.configuration_mpt import PROMPT_MAPPING as PROMPT_MAPPING
|
||||
from openllm_core.config.configuration_mpt import START_MPT_COMMAND_DOCSTRING as START_MPT_COMMAND_DOCSTRING
|
||||
from openllm_core.config.configuration_mpt import MPTConfig as MPTConfig
|
||||
|
||||
# Register only the MPT implementations whose backend is actually installed.
_import_structure: dict[str, list[str]] = {}
try:
  # PyTorch backend
  if not is_torch_available(): raise MissingDependencyError
except MissingDependencyError:
  pass  # torch missing: skip the PyTorch implementation
else:
  _import_structure['modeling_mpt'] = ['MPT']
  if t.TYPE_CHECKING: from .modeling_mpt import MPT as MPT
try:
  # vLLM backend
  if not is_vllm_available(): raise MissingDependencyError
except MissingDependencyError:
  pass  # vllm missing: skip the vLLM implementation
else:
  _import_structure['modeling_vllm_mpt'] = ['VLLMMPT']
  if t.TYPE_CHECKING: from .modeling_vllm_mpt import VLLMMPT as VLLMMPT

# Replace this module object so heavy backends are imported on first attribute access.
sys.modules[__name__] = LazyModule(__name__,
                                   globals()['__file__'],
                                   _import_structure,
                                   extra_objects={
                                       'DEFAULT_PROMPT_TEMPLATE': DEFAULT_PROMPT_TEMPLATE,
                                       'START_MPT_COMMAND_DOCSTRING': START_MPT_COMMAND_DOCSTRING,
                                       'MPTConfig': MPTConfig,
                                       'PROMPT_MAPPING': PROMPT_MAPPING
                                   })
|
||||
@@ -1,88 +0,0 @@
|
||||
from __future__ import annotations
|
||||
import logging
|
||||
import typing as t
|
||||
|
||||
import bentoml
|
||||
import openllm
|
||||
from openllm.utils import generate_labels
|
||||
from openllm.utils import is_triton_available
|
||||
if t.TYPE_CHECKING:
|
||||
import torch
|
||||
import transformers
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def get_mpt_config(model_id_or_path: str,
                   max_sequence_length: int,
                   device: torch.device | str | int | None,
                   device_map: str | None = None,
                   trust_remote_code: bool = True) -> transformers.PretrainedConfig:
  """Load the HF config for an MPT checkpoint and adjust it for inference.

  Sets ``init_device`` so weights materialize directly on ``device`` (when no
  ``device_map`` is given), switches attention to the faster 'triton'
  implementation when triton is installed, and caps ``max_seq_len``.
  """
  import torch
  # BUG FIX: the module-level 'import transformers' is under t.TYPE_CHECKING only,
  # so it must be imported here at runtime (previously raised NameError).
  import transformers
  config = transformers.AutoConfig.from_pretrained(model_id_or_path, trust_remote_code=trust_remote_code)
  if hasattr(config, 'init_device') and device_map is None and isinstance(device, (str, torch.device)):
    config.init_device = str(device)
  if hasattr(config, 'attn_config') and is_triton_available(): config.attn_config['attn_impl'] = 'triton'
  else:
    logger.debug(
        "'triton' is not available, Flash Attention will use the default Torch implementation. For faster inference, make sure to install triton with 'pip install \"git+https://github.com/openai/triton.git#egg=triton&subdirectory=python\"'"
    )
  # setting max_seq_len
  config.max_seq_len = max_sequence_length
  return config
|
||||
|
||||
class MPT(openllm.LLM['transformers.PreTrainedModel', 'transformers.GPTNeoXTokenizerFast']):
  """PyTorch implementation of MosaicML's MPT."""
  # Marks this as a first-party OpenLLM implementation.
  __openllm_internal__ = True

  @property
  def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]:
    # Default (model_kwargs, tokenizer_kwargs) used when importing the checkpoint.
    import torch
    return {'device_map': 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None, 'torch_dtype': torch.bfloat16 if torch.cuda.is_available() else torch.float32}, {}

  def import_model(self, *args: t.Any, trust_remote_code: bool = True, **attrs: t.Any) -> bentoml.Model:
    """Download the checkpoint, pair it with its tokenizer and save both as a BentoML model."""
    import torch
    import transformers
    _, tokenizer_attrs = self.llm_parameters
    torch_dtype = attrs.pop('torch_dtype', torch.bfloat16 if torch.cuda.is_available() else torch.float32)
    device_map = attrs.pop('device_map', None)
    attrs.pop('low_cpu_mem_usage', None)  # not supported by MPT's remote code
    config = get_mpt_config(self.model_id, self.config.max_sequence_length, self.device, device_map=device_map, trust_remote_code=trust_remote_code)
    tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_id, **tokenizer_attrs)
    # MPT tokenizers ship without a pad token; fall back to EOS so batching works.
    if tokenizer.pad_token_id is None: tokenizer.pad_token = tokenizer.eos_token
    model = transformers.AutoModelForCausalLM.from_pretrained(self.model_id, config=config, torch_dtype=torch_dtype, trust_remote_code=trust_remote_code, device_map=device_map, **attrs)
    try:
      return bentoml.transformers.save_model(self.tag, model, custom_objects={'tokenizer': tokenizer}, labels=generate_labels(self))
    finally:
      # Release GPU memory held by the freshly-downloaded weights once saved.
      torch.cuda.empty_cache()

  def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.PreTrainedModel:
    """Load the saved model back from the Bento store."""
    # BUG FIX: 'torch' is referenced below but was never imported at runtime in this
    # method (the module-level import is under t.TYPE_CHECKING only) -> NameError.
    import torch
    import transformers
    torch_dtype = attrs.pop('torch_dtype', torch.bfloat16 if torch.cuda.is_available() else torch.float32)
    device_map = attrs.pop('device_map', None)
    trust_remote_code = attrs.pop('trust_remote_code', True)
    config = get_mpt_config(self._bentomodel.path, self.config.max_sequence_length, self.device, device_map=device_map, trust_remote_code=trust_remote_code,)
    model = transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path,
                                                              config=config,
                                                              trust_remote_code=trust_remote_code,
                                                              torch_dtype=torch_dtype,
                                                              device_map=device_map,
                                                              **attrs)
    model.tie_weights()
    return model

  def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
    """Generate completions for ``prompt`` and return the decoded strings."""
    import torch
    llm_config = self.config.model_construct_env(**attrs)
    inputs = self.tokenizer(prompt, return_tensors='pt').to(self.device)
    attrs = {
        'do_sample': False if llm_config['temperature'] == 0 else True,  # greedy decode at temperature 0
        'eos_token_id': self.tokenizer.eos_token_id,
        'pad_token_id': self.tokenizer.pad_token_id,
        'generation_config': llm_config.to_generation_config()
    }
    with torch.inference_mode():
      if torch.cuda.is_available():
        with torch.autocast('cuda', torch.float16):  # type: ignore[attr-defined]
          generated_tensors = self.model.generate(**inputs, **attrs)
      else:
        generated_tensors = self.model.generate(**inputs, **attrs)
    return self.tokenizer.batch_decode(generated_tensors, skip_special_tokens=True)
|
||||
@@ -1,9 +0,0 @@
|
||||
from __future__ import annotations
|
||||
import typing as t
|
||||
|
||||
import openllm
|
||||
if t.TYPE_CHECKING: import transformers, vllm
|
||||
|
||||
class VLLMMPT(openllm.LLM['vllm.LLMEngine', 'transformers.GPTNeoXTokenizerFast']):
  """vLLM implementation of MPT; generation is handled by the shared vLLM runner."""
  # Marks this as a first-party OpenLLM implementation.
  __openllm_internal__ = True
  # Load the tokenizer from the locally saved model artifact rather than the hub.
  tokenizer_id = 'local'
|
||||
@@ -1,52 +0,0 @@
|
||||
from __future__ import annotations
|
||||
import sys
|
||||
import typing as t
|
||||
|
||||
from openllm.exceptions import MissingDependencyError
|
||||
from openllm.utils import LazyModule
|
||||
from openllm.utils import is_flax_available
|
||||
from openllm.utils import is_tf_available
|
||||
from openllm.utils import is_torch_available
|
||||
from openllm.utils import is_vllm_available
|
||||
from openllm_core.config.configuration_opt import DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE
|
||||
from openllm_core.config.configuration_opt import START_OPT_COMMAND_DOCSTRING as START_OPT_COMMAND_DOCSTRING
|
||||
from openllm_core.config.configuration_opt import OPTConfig as OPTConfig
|
||||
|
||||
# Register only the OPT implementations whose backend is actually installed.
_import_structure: dict[str, list[str]] = {}
try:
  # PyTorch backend
  if not is_torch_available(): raise MissingDependencyError
except MissingDependencyError:
  pass  # torch missing: skip the PyTorch implementation
else:
  _import_structure['modeling_opt'] = ['OPT']
  if t.TYPE_CHECKING: from .modeling_opt import OPT as OPT
try:
  # Flax backend
  if not is_flax_available(): raise MissingDependencyError
except MissingDependencyError:
  pass  # flax missing: skip the Flax implementation
else:
  _import_structure['modeling_flax_opt'] = ['FlaxOPT']
  if t.TYPE_CHECKING: from .modeling_flax_opt import FlaxOPT as FlaxOPT
try:
  # vLLM backend
  if not is_vllm_available(): raise MissingDependencyError
except MissingDependencyError:
  pass  # vllm missing: skip the vLLM implementation
else:
  _import_structure['modeling_vllm_opt'] = ['VLLMOPT']
  if t.TYPE_CHECKING: from .modeling_vllm_opt import VLLMOPT as VLLMOPT
try:
  # TensorFlow backend
  if not is_tf_available(): raise MissingDependencyError
except MissingDependencyError:
  pass  # tensorflow missing: skip the TF implementation
else:
  _import_structure['modeling_tf_opt'] = ['TFOPT']
  if t.TYPE_CHECKING: from .modeling_tf_opt import TFOPT as TFOPT

# Replace this module object so heavy backends are imported on first attribute access.
sys.modules[__name__] = LazyModule(__name__,
                                   globals()['__file__'],
                                   _import_structure,
                                   extra_objects={
                                       'DEFAULT_PROMPT_TEMPLATE': DEFAULT_PROMPT_TEMPLATE,
                                       'START_OPT_COMMAND_DOCSTRING': START_OPT_COMMAND_DOCSTRING,
                                       'OPTConfig': OPTConfig,
                                   })
|
||||
@@ -1,47 +0,0 @@
|
||||
from __future__ import annotations
|
||||
import logging
|
||||
import typing as t
|
||||
|
||||
import bentoml
|
||||
import openllm
|
||||
from openllm_core.prompts import process_prompt
|
||||
from openllm.utils import generate_labels
|
||||
from openllm_core.config.configuration_opt import DEFAULT_PROMPT_TEMPLATE
|
||||
if t.TYPE_CHECKING: import transformers
|
||||
else: transformers = openllm.utils.LazyLoader('transformers', globals(), 'transformers')
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class FlaxOPT(openllm.LLM['transformers.FlaxOPTForCausalLM', 'transformers.GPT2Tokenizer']):
  """Flax implementation of OPT."""
  # BUG FIX: the model type parameter previously read 'transformers.TFOPTForCausalLM'
  # (copy-paste from the TF variant); import_model below saves a Flax model, so the
  # parameter is corrected to the Flax class.
  __openllm_internal__ = True

  def import_model(self, *args: t.Any, trust_remote_code: bool = False, **attrs: t.Any) -> bentoml.Model:
    """Download the Flax checkpoint with its tokenizer and save both as a BentoML model."""
    config, tokenizer = transformers.AutoConfig.from_pretrained(self.model_id), transformers.AutoTokenizer.from_pretrained(self.model_id, **self.llm_parameters[-1])
    # OPT tokenizers ship without a pad token; borrow the one declared in the model config.
    tokenizer.pad_token_id = config.pad_token_id
    return bentoml.transformers.save_model(self.tag,
                                           transformers.FlaxAutoModelForCausalLM.from_pretrained(self.model_id, **attrs),
                                           custom_objects={'tokenizer': tokenizer},
                                           labels=generate_labels(self))

  def sanitize_parameters(self,
                          prompt: str,
                          max_new_tokens: int | None = None,
                          temperature: float | None = None,
                          top_k: int | None = None,
                          num_return_sequences: int | None = None,
                          repetition_penalty: float | None = None,
                          use_default_prompt_template: bool = False,
                          **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
    """Normalize user inputs into (prompt, generate_kwargs, postprocess_kwargs)."""
    return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {
        'max_new_tokens': max_new_tokens,
        'temperature': temperature,
        'top_k': top_k,
        'num_return_sequences': num_return_sequences,
        'repetition_penalty': repetition_penalty
    }, {}

  def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
    """Sample completions for ``prompt`` with the Flax model and decode them."""
    return self.tokenizer.batch_decode(self.model.generate(**self.tokenizer(prompt, return_tensors='np'),
                                                           do_sample=True,
                                                           generation_config=self.config.model_construct_env(**attrs).to_generation_config()).sequences,
                                       skip_special_tokens=True)
|
||||
@@ -1,24 +0,0 @@
|
||||
from __future__ import annotations
|
||||
import logging
|
||||
import typing as t
|
||||
|
||||
import openllm
|
||||
if t.TYPE_CHECKING: import transformers
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class OPT(openllm.LLM['transformers.OPTForCausalLM', 'transformers.GPT2Tokenizer']):
  """PyTorch implementation of OPT."""
  # Marks this as a first-party OpenLLM implementation.
  __openllm_internal__ = True

  @property
  def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]:
    # Half precision on GPU, full precision on CPU.
    import torch
    dtype = torch.float16 if torch.cuda.is_available() else torch.float32
    return {'torch_dtype': dtype}, {}

  def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
    """Sample completions for ``prompt`` and return the decoded strings."""
    import torch
    encoded = self.tokenizer(prompt, return_tensors='pt').to(self.device)
    generation_config = self.config.model_construct_env(**attrs).to_generation_config()
    with torch.inference_mode():
      sequences = self.model.generate(**encoded, do_sample=True, generation_config=generation_config)
      return self.tokenizer.batch_decode(sequences, skip_special_tokens=True)
|
||||
@@ -1,25 +0,0 @@
|
||||
from __future__ import annotations
|
||||
import typing as t
|
||||
|
||||
import bentoml
|
||||
import openllm
|
||||
from openllm_core.utils import generate_labels
|
||||
if t.TYPE_CHECKING: import transformers
|
||||
|
||||
class TFOPT(openllm.LLM['transformers.TFOPTForCausalLM', 'transformers.GPT2Tokenizer']):
  """TensorFlow implementation of OPT."""
  # Marks this as a first-party OpenLLM implementation.
  __openllm_internal__ = True

  def import_model(self, *args: t.Any, trust_remote_code: bool = False, **attrs: t.Any) -> bentoml.Model:
    """Download the TF checkpoint with its tokenizer and save both as a BentoML model."""
    import transformers
    config = transformers.AutoConfig.from_pretrained(self.model_id)
    tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_id, **self.llm_parameters[-1])
    # OPT tokenizers ship without a pad token; borrow the one declared in the model config.
    tokenizer.pad_token_id = config.pad_token_id
    model = transformers.TFOPTForCausalLM.from_pretrained(self.model_id, trust_remote_code=trust_remote_code, **attrs)
    return bentoml.transformers.save_model(self.tag, model, custom_objects={'tokenizer': tokenizer}, labels=generate_labels(self))

  def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
    """Sample completions for ``prompt`` and return the decoded strings."""
    encoded = self.tokenizer(prompt, return_tensors='tf')
    generation_config = self.config.model_construct_env(**attrs).to_generation_config()
    sequences = self.model.generate(**encoded, do_sample=True, generation_config=generation_config)
    return self.tokenizer.batch_decode(sequences, skip_special_tokens=True)
|
||||
@@ -1,26 +0,0 @@
|
||||
from __future__ import annotations
|
||||
import typing as t
|
||||
|
||||
import openllm
|
||||
from openllm_core.prompts import process_prompt
|
||||
from openllm_core.config.configuration_opt import DEFAULT_PROMPT_TEMPLATE
|
||||
if t.TYPE_CHECKING: import vllm, transformers
|
||||
|
||||
class VLLMOPT(openllm.LLM['vllm.LLMEngine', 'transformers.GPT2Tokenizer']):
  """vLLM implementation of OPT."""
  # Marks this as a first-party OpenLLM implementation.
  __openllm_internal__ = True
  # Load the tokenizer from the locally saved model artifact rather than the hub.
  tokenizer_id = 'local'

  def sanitize_parameters(self,
                          prompt: str,
                          max_new_tokens: int | None = None,
                          temperature: float | None = None,
                          top_k: int | None = None,
                          num_return_sequences: int | None = None,
                          use_default_prompt_template: bool = True,
                          **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
    """Normalize user inputs into (prompt, generate_kwargs, postprocess_kwargs)."""
    generate_kwargs = {
        'max_new_tokens': max_new_tokens,
        'temperature': temperature,
        'top_k': top_k,
        'num_return_sequences': num_return_sequences
    }
    sanitized = process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs)
    return sanitized, generate_kwargs, {}
|
||||
@@ -1,36 +0,0 @@
|
||||
from __future__ import annotations
|
||||
import sys
|
||||
import typing as t
|
||||
|
||||
from openllm.exceptions import MissingDependencyError
|
||||
from openllm.utils import LazyModule
|
||||
from openllm.utils import is_torch_available
|
||||
from openllm.utils import is_vllm_available
|
||||
from openllm_core.config.configuration_stablelm import DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE
|
||||
from openllm_core.config.configuration_stablelm import START_STABLELM_COMMAND_DOCSTRING as START_STABLELM_COMMAND_DOCSTRING
|
||||
from openllm_core.config.configuration_stablelm import StableLMConfig as StableLMConfig
|
||||
|
||||
# Register only the StableLM implementations whose backend is actually installed.
_import_structure: dict[str, list[str]] = {}
try:
  # PyTorch backend
  if not is_torch_available(): raise MissingDependencyError
except MissingDependencyError:
  pass  # torch missing: skip the PyTorch implementation
else:
  _import_structure['modeling_stablelm'] = ['StableLM']
  if t.TYPE_CHECKING: from .modeling_stablelm import StableLM as StableLM
try:
  # vLLM backend
  if not is_vllm_available(): raise MissingDependencyError
except MissingDependencyError:
  pass  # vllm missing: skip the vLLM implementation
else:
  _import_structure['modeling_vllm_stablelm'] = ['VLLMStableLM']
  if t.TYPE_CHECKING: from .modeling_vllm_stablelm import VLLMStableLM as VLLMStableLM

# Replace this module object so heavy backends are imported on first attribute access.
sys.modules[__name__] = LazyModule(__name__,
                                   globals()['__file__'],
                                   _import_structure,
                                   extra_objects={
                                       'DEFAULT_PROMPT_TEMPLATE': DEFAULT_PROMPT_TEMPLATE,
                                       'START_STABLELM_COMMAND_DOCSTRING': START_STABLELM_COMMAND_DOCSTRING,
                                       'StableLMConfig': StableLMConfig,
                                   })
|
||||
@@ -1,26 +0,0 @@
|
||||
from __future__ import annotations
|
||||
import typing as t
|
||||
|
||||
import openllm
|
||||
if t.TYPE_CHECKING:
|
||||
import transformers
|
||||
|
||||
class StableLM(openllm.LLM['transformers.GPTNeoXForCausalLM', 'transformers.GPTNeoXTokenizerFast']):
  """PyTorch implementation of StableLM."""
  # Marks this as a first-party OpenLLM implementation.
  __openllm_internal__ = True

  @property
  def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]:
    # Half precision on GPU, full precision on CPU.
    import torch
    dtype = torch.float16 if torch.cuda.is_available() else torch.float32
    return {'torch_dtype': dtype}, {}

  def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
    """Generate one completion for ``prompt`` and return it as a single-element list."""
    import torch
    encoded = self.tokenizer(prompt, return_tensors='pt').to(self.device)
    generation_config = self.config.model_construct_env(**attrs).to_generation_config()
    # Stop on StableLM's dedicated stop tokens in addition to EOS.
    stopping = openllm.StoppingCriteriaList([openllm.StopOnTokens()])
    with torch.inference_mode():
      sequences = self.model.generate(**encoded,
                                      do_sample=True,
                                      generation_config=generation_config,
                                      pad_token_id=self.tokenizer.eos_token_id,
                                      stopping_criteria=stopping)
      return [self.tokenizer.decode(sequences[0], skip_special_tokens=True)]
|
||||
@@ -1,10 +0,0 @@
|
||||
from __future__ import annotations
|
||||
import logging
|
||||
import typing as t
|
||||
|
||||
import openllm
|
||||
if t.TYPE_CHECKING: import vllm, transformers
|
||||
|
||||
class VLLMStableLM(openllm.LLM['vllm.LLMEngine', 'transformers.GPTNeoXTokenizerFast']):
|
||||
__openllm_internal__ = True
|
||||
tokenizer_id = 'local'
|
||||
@@ -1,36 +0,0 @@
|
||||
from __future__ import annotations
|
||||
import sys
|
||||
import typing as t
|
||||
|
||||
from openllm.exceptions import MissingDependencyError
|
||||
from openllm.utils import LazyModule
|
||||
from openllm.utils import is_torch_available
|
||||
from openllm.utils import is_vllm_available
|
||||
from openllm_core.config.configuration_starcoder import DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE
|
||||
from openllm_core.config.configuration_starcoder import START_STARCODER_COMMAND_DOCSTRING as START_STARCODER_COMMAND_DOCSTRING
|
||||
from openllm_core.config.configuration_starcoder import StarCoderConfig as StarCoderConfig
|
||||
|
||||
_import_structure: dict[str, list[str]] = {}
|
||||
try:
|
||||
if not is_torch_available(): raise MissingDependencyError
|
||||
except MissingDependencyError:
|
||||
pass
|
||||
else:
|
||||
_import_structure['modeling_starcoder'] = ['StarCoder']
|
||||
if t.TYPE_CHECKING: from .modeling_starcoder import StarCoder as StarCoder
|
||||
try:
|
||||
if not is_vllm_available(): raise MissingDependencyError
|
||||
except MissingDependencyError:
|
||||
pass
|
||||
else:
|
||||
_import_structure['modeling_vllm_starcoder'] = ['VLLMStarCoder']
|
||||
if t.TYPE_CHECKING: from .modeling_vllm_starcoder import VLLMStarCoder as VLLMStarCoder
|
||||
|
||||
sys.modules[__name__] = LazyModule(__name__,
|
||||
globals()['__file__'],
|
||||
_import_structure,
|
||||
extra_objects={
|
||||
'DEFAULT_PROMPT_TEMPLATE': DEFAULT_PROMPT_TEMPLATE,
|
||||
'START_STARCODER_COMMAND_DOCSTRING': START_STARCODER_COMMAND_DOCSTRING,
|
||||
'StarCoderConfig': StarCoderConfig,
|
||||
})
|
||||
@@ -1,32 +0,0 @@
|
||||
from __future__ import annotations
|
||||
import typing as t
|
||||
|
||||
import bentoml
|
||||
import openllm
|
||||
from openllm.utils import generate_labels
|
||||
from openllm_core.config.configuration_starcoder import EOD
|
||||
from openllm_core.config.configuration_starcoder import FIM_MIDDLE
|
||||
from openllm_core.config.configuration_starcoder import FIM_PAD
|
||||
from openllm_core.config.configuration_starcoder import FIM_PREFIX
|
||||
from openllm_core.config.configuration_starcoder import FIM_SUFFIX
|
||||
if t.TYPE_CHECKING: import transformers
|
||||
|
||||
class StarCoder(openllm.LLM['transformers.GPTBigCodeForCausalLM', 'transformers.GPT2TokenizerFast']):
|
||||
__openllm_internal__ = True
|
||||
|
||||
@property
|
||||
def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]:
|
||||
import torch
|
||||
return {'device_map': 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None, 'torch_dtype': torch.float16 if torch.cuda.is_available() else torch.float32}, {}
|
||||
|
||||
def import_model(self, *args: t.Any, trust_remote_code: bool = False, **attrs: t.Any) -> bentoml.Model:
|
||||
import torch
|
||||
import transformers
|
||||
torch_dtype, device_map = attrs.pop('torch_dtype', torch.float16), attrs.pop('device_map', 'auto')
|
||||
tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_id, **self.llm_parameters[-1])
|
||||
tokenizer.add_special_tokens({'additional_special_tokens': [EOD, FIM_PREFIX, FIM_MIDDLE, FIM_SUFFIX, FIM_PAD], 'pad_token': EOD})
|
||||
model = transformers.AutoModelForCausalLM.from_pretrained(self.model_id, torch_dtype=torch_dtype, device_map=device_map, **attrs)
|
||||
try:
|
||||
return bentoml.transformers.save_model(self.tag, model, custom_objects={'tokenizer': tokenizer}, labels=generate_labels(self))
|
||||
finally:
|
||||
torch.cuda.empty_cache()
|
||||
@@ -1,10 +0,0 @@
|
||||
from __future__ import annotations
|
||||
import logging
|
||||
import typing as t
|
||||
|
||||
import openllm
|
||||
if t.TYPE_CHECKING: import vllm, transformers
|
||||
|
||||
class VLLMStarCoder(openllm.LLM['vllm.LLMEngine', 'transformers.GPT2TokenizerFast']):
|
||||
__openllm_internal__ = True
|
||||
tokenizer_id = 'local'
|
||||
Reference in New Issue
Block a user