From 7398ae04866dc4ba29305acd3bcc181f19cbee51 Mon Sep 17 00:00:00 2001
From: Aaron Pham <29749331+aarnphm@users.noreply.github.com>
Date: Wed, 8 Nov 2023 02:23:08 -0500
Subject: [PATCH] refactor(strategies): move logic into openllm-python (#578)

fix(strategies): move to openllm

Strategies shouldn't be a part of openllm-core

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
---
 openllm-core/pyproject.toml                   |   1 +
 .../src/openllm_core/_configuration.py        | 214 +-----------------
 openllm-core/src/openllm_core/utils/peft.py   | 100 ++++++++
 openllm-python/src/openllm/_llm.py            |   2 +-
 .../src/openllm}/_strategies.py               |  16 +-
 openllm-python/src/openllm/cli/entrypoint.py  |  19 --
 openllm-python/tests/strategies_test.py       |   8 +-
 7 files changed, 121 insertions(+), 239 deletions(-)
 create mode 100644 openllm-core/src/openllm_core/utils/peft.py
 rename {openllm-core/src/openllm_core => openllm-python/src/openllm}/_strategies.py (97%)

diff --git a/openllm-core/pyproject.toml b/openllm-core/pyproject.toml
index 41b6df50..8cc30297 100644
--- a/openllm-core/pyproject.toml
+++ b/openllm-core/pyproject.toml
@@ -62,6 +62,7 @@ dependencies = [
   "cattrs>=23.1.0",
   "orjson",
   "inflection",
+  "deepmerge",
   "typing_extensions",
   "mypy_extensions",
 ]
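
The new `deepmerge` dependency backs the `Merger` instance that the relocated `openllm_core.utils.peft` module uses to combine adapter configs. A minimal sketch of the merge semantics this relies on (the dict values here are illustrative, not taken from this patch):

```python
from deepmerge import Merger

# Same strategy spec as config_merger in utils/peft.py: nested dicts are
# merged key by key; any other conflict falls back to 'override'.
config_merger = Merger([(dict, 'merge')], ['override'], ['override'])

base = {'lora_alpha': 16, 'target_modules': ['query_key_value']}
config_merger.merge(base, {'lora_alpha': 32, 'lora_dropout': 0.1})
# merge() mutates and returns `base`:
# {'lora_alpha': 32, 'target_modules': ['query_key_value'], 'lora_dropout': 0.1}
```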
diff --git a/openllm-core/src/openllm_core/_configuration.py b/openllm-core/src/openllm_core/_configuration.py
index 005418b2..d993b5e5 100644
--- a/openllm-core/src/openllm_core/_configuration.py
+++ b/openllm-core/src/openllm_core/_configuration.py
@@ -1,38 +1,4 @@
 # mypy: disable-error-code="attr-defined,no-untyped-call,type-var,operator,arg-type,no-redef,misc"
-'''Configuration utilities for OpenLLM. All model configuration will inherit from ``openllm.LLMConfig``.
-
-Highlight feature: Each fields in ``openllm.LLMConfig`` will also automatically generate a environment
-variable based on its name field.
-
-For example, the following config class:
-
-```python
-class FlanT5Config(openllm.LLMConfig):
-  __config__ = {
-    "url": "https://huggingface.co/docs/transformers/model_doc/flan-t5",
-    "default_id": "google/flan-t5-large",
-    "model_ids": [
-      "google/flan-t5-small",
-      "google/flan-t5-base",
-      "google/flan-t5-large",
-      "google/flan-t5-xl",
-      "google/flan-t5-xxl",
-    ],
-  }
-
-  class GenerationConfig:
-    temperature: float = 0.9
-    max_new_tokens: int = 2048
-    top_k: int = 50
-    top_p: float = 0.4
-    repetition_penalty = 1.0
-```
-
-which generates the environment OPENLLM_FLAN_T5_GENERATION_TEMPERATURE for users to configure temperature
-dynamically during serve, ahead-of-serve or per requests.
-
-Refer to ``openllm.LLMConfig`` docstring for more information.
-'''
 from __future__ import annotations
 import copy
 import enum
@@ -82,20 +48,17 @@ from .utils import dantic
 from .utils import field_env_key
 from .utils import first_not_none
 from .utils import lenient_issubclass
+from .utils.peft import FineTuneConfig, PEFT_TASK_TYPE_TARGET_MAPPING, PeftType
 from .utils.import_utils import is_vllm_available

 if t.TYPE_CHECKING:
   import click
-  import peft
   import transformers
   import vllm

-  from transformers.generation.beam_constraints import Constraint
-
   from openllm.protocol.openai import ChatCompletionRequest
   from openllm.protocol.openai import CompletionRequest
 else:
-  Constraint = t.Any
   vllm = LazyLoader('vllm', globals(), 'vllm')
   transformers = LazyLoader('transformers', globals(), 'transformers')
   peft = LazyLoader('peft', globals(), 'peft')
@@ -104,130 +67,8 @@ __all__ = ['LLMConfig', 'GenerationConfig', 'SamplingParams', 'field_env_key']

 logger = logging.getLogger(__name__)
 config_merger = Merger([(dict, 'merge')], ['override'], ['override'])
-
-# case insensitive, but rename to conform with type
-class _PeftEnumMeta(enum.EnumMeta):
-  def __getitem__(self, __key: str | t.Any, /) -> t.Any:
-    if isinstance(__key, str): __key = inflection.underscore(__key).upper()
-    return self._member_map_[__key]
-
-# vendorred from peft.utils.config.PeftType since we don't have hard dependency on peft
-# see https://github.com/huggingface/peft/blob/main/src/peft/utils/config.py
-class PeftType(str, enum.Enum, metaclass=_PeftEnumMeta):
-  PROMPT_TUNING = 'PROMPT_TUNING'
-  MULTITASK_PROMPT_TUNING = 'MULTITASK_PROMPT_TUNING'
-  P_TUNING = 'P_TUNING'
-  PREFIX_TUNING = 'PREFIX_TUNING'
-  LORA = 'LORA'
-  ADALORA = 'ADALORA'
-  ADAPTION_PROMPT = 'ADAPTION_PROMPT'
-  IA3 = 'IA3'
-  LOHA = 'LOHA'
-  LOKR = 'LOKR'
-
-  @classmethod
-  def _missing_(cls, value: object) -> enum.Enum | None:
-    if isinstance(value, str):
-      normalized = inflection.underscore(value).upper()
-      if normalized in cls._member_map_: return cls._member_map_[normalized]
-    return None
-
-  @classmethod
-  def supported(cls) -> set[str]:
-    return {inflection.underscore(v.value) for v in cls}
-
-  def to_str(self) -> str:
-    return self.value
-
-  @staticmethod
-  def get(__key: str | t.Any, /) -> PeftType:
-    return PeftType[__key]  # type-safe getitem.
-
-_PEFT_TASK_TYPE_TARGET_MAPPING = {'causal_lm': 'CAUSAL_LM', 'seq2seq_lm': 'SEQ_2_SEQ_LM'}
-
 _object_setattr = object.__setattr__

-def _adapter_converter(value: AdapterType | str | PeftType | None) -> PeftType:
-  if value is None: raise ValueError("'AdapterType' cannot be None.")
-  if isinstance(value, PeftType): return value
-  if value not in PeftType.supported(): raise ValueError(f"Given '{value}' is not a supported adapter type.")
-  return PeftType.get(value)
-
-@attr.define(slots=True, init=True)
-class FineTuneConfig:
-  '''FineTuneConfig defines a default value for fine-tuning this any given LLM.
-
-  For example:
-
-  ```python
-  class FalconConfig(openllm.LLMConfig):
-    __config__ = {
-      "fine_tune_strategies": (
-        {
-          "adapter_type": "lora",
-          "r": 64,
-          "lora_alpha": 16,
-          "lora_dropout": 0.1,
-          "bias": "none",
-          "target_modules": ["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"],
-        },
-      ),
-    }
-  ```
-
-  This is a lower level API that leverage `peft` as well as openllm.LLMConfig to create default
-  and customization
-  '''
-
-  if t.TYPE_CHECKING and not MYPY:
-    # The following type stubs makes __init__ aware of attrs internal type converter.
-    @overload
-    def __init__(self, adapter_type: AdapterType = ..., adapter_config: dict[str, t.Any] = ..., inference_mode: bool = ..., llm_config_class: type[LLMConfig] = ...) -> None:
-      ...
-
-    @overload
-    def __init__(self, adapter_type: PeftType = ..., adapter_config: dict[str, t.Any] = ..., inference_mode: bool = ..., llm_config_class: type[LLMConfig] = ...) -> None:
-      ...
-
-    # The below should be generated via attrs. Only here to conform with pyright strict checking.
-    def __init__(self, *args: t.Any, **kwargs: t.Any) -> None:
-      ...
-
-  adapter_type: PeftType = dantic.Field('lora',
-                                        description=f"The type of adapter to use for fine-tuning. Available supported methods: {PeftType.supported()}, default to 'lora'",
-                                        use_default_converter=False,
-                                        converter=_adapter_converter)
-  adapter_config: t.Dict[str, t.Any] = dantic.Field(None,
-                                                    description='The configuration for the adapter. The content of the dict depends on the adapter type.',
-                                                    validator=attr.validators.optional(attr.validators.instance_of(dict)),
-                                                    converter=attr.converters.default_if_none(factory=dict),
-                                                    use_default_converter=False)
-  inference_mode: bool = dantic.Field(False, description='Whether to use this Adapter for inference', use_default_converter=False)
-  llm_config_class: type[LLMConfig] = dantic.Field(None, description='The reference class to openllm.LLMConfig', use_default_converter=False)
-
-  def build(self) -> peft.PeftConfig:  # type: ignore[name-defined]
-    adapter_config = self.adapter_config.copy()
-    # no need for peft_type since it is internally managed by OpenLLM and PEFT
-    if 'peft_type' in adapter_config: adapter_config.pop('peft_type')
-    for k in {'enable_lora', 'merge_weights'}:
-      if k in adapter_config: adapter_config.pop(k)  # this is an old key for older lora layer
-    # respect user set task_type if it is passed, otherwise use one managed by OpenLLM
-    task_type, inference_mode = adapter_config.pop('task_type', peft.TaskType[self.llm_config_class.peft_task_type()]), adapter_config.pop('inference_mode', self.inference_mode)
-    return peft.PEFT_TYPE_TO_CONFIG_MAPPING[self.adapter_type.to_str()](task_type=task_type, inference_mode=inference_mode, **adapter_config)
-
-  def train(self) -> FineTuneConfig:
-    _object_setattr(self, 'inference_mode', False)
-    return self
-
-  def eval(self) -> FineTuneConfig:
-    _object_setattr(self, 'inference_mode', True)
-    return self
-
-  def with_config(self, **attrs: t.Any) -> FineTuneConfig:
-    adapter_type, inference_mode = attrs.pop('adapter_type', self.adapter_type), attrs.get('inference_mode', self.inference_mode)
-    if 'llm_config_class' in attrs: raise ForbiddenAttributeError("'llm_config_class' should not be passed when using 'with_config'.")
-    return attr.evolve(self, adapter_type=adapter_type, inference_mode=inference_mode, adapter_config=config_merger.merge(self.adapter_config, attrs))
-
 @attr.frozen(slots=True, repr=False, init=False)
 class GenerationConfig(ReprMixin):
   """GenerationConfig is the attrs-compatible version of ``transformers.GenerationConfig``, with some additional validation and environment constructor.
@@ -304,10 +145,6 @@ class GenerationConfig(ReprMixin):
     description=
     "Whether to renormalize the logits after applying all the logits processors or warpers (including the custom ones). It's highly recommended to set this flag to `True` as the search algorithms suppose the score logits are normalized but some logit processors or warpers break the normalization. "
" ) - constraints: t.List[Constraint] = dantic.Field( - description= - 'Custom constraints that can be added to the generation to ensure that the output will contain the use of certain tokens as defined by ``Constraint`` objects, in the most sensible way possible.' - ) forced_bos_token_id: int = dantic.Field( description= 'The id of the token to force as the first generated token after the ``decoder_start_token_id``. Useful for multilingual models like [mBART](https://huggingface.co/docs/transformers/model_doc/mbart) where the first generated token needs to be the target language token. ' @@ -342,11 +179,6 @@ class GenerationConfig(ReprMixin): encoder_no_repeat_ngram_size: int = dantic.Field(0, description='If set to int > 0, all ngrams of that size that occur in the `encoder_input_ids` cannot occur in the `decoder_input_ids`.') decoder_start_token_id: int = dantic.Field(description='If an encoder-decoder model starts decoding with a different token than *bos*, the id of that token.') - if t.TYPE_CHECKING and not MYPY: - # stubs this for pyright as mypy already has a attr plugin builtin - def __attrs_init__(self, *args: t.Any, **attrs: t.Any) -> None: - ... - def __init__(self, *, _internal: bool = False, **attrs: t.Any): if not _internal: raise RuntimeError('GenerationConfig is not meant to be used directly, but you can access this via a LLMConfig.generation_config') @@ -405,9 +237,6 @@ class SamplingParams(ReprMixin): top_k: int top_p: float - def __attrs_init__(self, *args: t.Any, **attrs: t.Any) -> None: - ... - def __init__(self, *, _internal: bool = False, **attrs: t.Any): if not _internal: raise RuntimeError('SamplingParams is not meant to be used directly, but you can access this via a LLMConfig.sampling_config.') _object_setattr(self, 'max_tokens', attrs.pop('max_tokens', 16)) @@ -534,7 +363,6 @@ _transformed_type: DictStrAny = {'fine_tune_strategies': t.Dict[AdapterType, Fin description=f'ModelSettings field for {k}.')) for k, ann in t.get_type_hints(ModelSettings).items() ]) class _ModelSettingsAttr: - """Internal attrs representation of ModelSettings.""" def __getitem__(self, key: str) -> t.Any: if key in codegen.get_annotations(ModelSettings): return _object_getattribute(self, key) @@ -594,7 +422,7 @@ def structure_settings(cl_: type[LLMConfig], cls: type[_ModelSettingsAttr]) -> _ _cl_name = cl_.__name__.replace('Config', '') _settings_attr = cls.default() has_custom_name = all(i in cl_.__config__ for i in {'model_name', 'start_name'}) - _settings_attr = attr.evolve(_settings_attr, **cl_.__config__) # type: ignore[misc] + _settings_attr = attr.evolve(_settings_attr, **cl_.__config__) _final_value_dct: DictStrAny = {} if not has_custom_name: @@ -667,7 +495,7 @@ class _ConfigAttr: __config__: ModelSettings = Field(None) '''Internal configuration for this LLM model. Each of the field in here will be populated and prefixed with __openllm___''' - GenerationConfig: object = Field(None) + GenerationConfig: GenerationConfig = Field(None) '''Users can override this subclass of any given LLMConfig to provide GenerationConfig default value. For example: @@ -680,7 +508,7 @@ class _ConfigAttr: eos_token_id: int = 11 ``` ''' - SamplingParams: object = Field(None) + SamplingParams: SamplingParams = Field(None) '''Users can override this subclass of any given LLMConfig to provide SamplingParams default value. 

   For example:
@@ -793,26 +621,14 @@ class _ConfigAttr:
   # update-config-stubs.py: special stop

 class _ConfigBuilder:
-  """A modified version of attrs internal _ClassBuilder, and should only be called within __init_subclass__ of LLMConfig.
-
-  Where:
-  - has_custom_setattr=True
-  - getstate_setstate=None (config class will always be a slotted class.)
-  - slots=True
-  - auto_attribs=False (We should handle it before _ConfigBuilder is invoked)
-  - cache_hash=False (We don't need to cache the hash code of this object for now.)
-  - collect_by_mro=True (The correct behaviour to resolve inheritance)
-  - field_transformer=codegen.make_env_transformer (We need to transform the field to have env variable)
-
-  It takes `these` arguments as a fully parsed attr.Attribute[t.Any] from __init_subclass__
-  """
-
   __slots__ = ('_cls', '_cls_dict', '_attr_names', '_attrs', '_model_name', '_base_attr_map', '_base_names', '_has_pre_init', '_has_post_init')

   def __init__(self, cls: type[LLMConfig], these: dict[str, _CountingAttr], auto_attribs: bool = False, kw_only: bool = False, collect_by_mro: bool = True):
     attrs, base_attrs, base_attr_map = _transform_attrs(cls, these, auto_attribs, kw_only, collect_by_mro, field_transformer=codegen.make_env_transformer(cls, cls.__openllm_model_name__))
-    self._cls, self._model_name, self._cls_dict, self._attrs, self._base_names, self._base_attr_map = cls, cls.__openllm_model_name__, dict(cls.__dict__), attrs, {a.name for a in base_attrs
-                                                                                                                                                                    }, base_attr_map
+    self._cls, self._model_name, self._cls_dict = cls, cls.__openllm_model_name__, dict(cls.__dict__)
+    self._attrs = attrs
+    self._base_attr_map = base_attr_map
+    self._base_names = {a.name for a in base_attrs}
     self._attr_names = tuple(a.name for a in attrs)
     self._has_pre_init = bool(getattr(cls, '__attrs_pre_init__', False))
     self._has_post_init = bool(getattr(cls, '__attrs_post_init__', False))
@@ -1070,7 +886,7 @@ class LLMConfig(_ConfigAttr):
     # Finally, resolve the types
     if getattr(cls, '__attrs_types_resolved__', None) != cls:
       # NOTE: We will try to resolve type here, and cached it for faster use
-      globs: DictStrAny = {'t': t, 'typing': t, 'Constraint': Constraint}
+      globs: DictStrAny = {'t': t, 'typing': t}
       if cls.__module__ in sys.modules: globs.update(sys.modules[cls.__module__].__dict__)
       attr.resolve_types(cls.__openllm_generation_class__, globalns=globs)
       attr.resolve_types(cls.__openllm_sampling_class__, globalns=globs)
@@ -1207,8 +1023,6 @@ class LLMConfig(_ConfigAttr):
   @overload
   def __getitem__(self, item: t.Literal['renormalize_logits']) -> bool: ...
   @overload
-  def __getitem__(self, item: t.Literal['constraints']) -> t.List[Constraint]: ...
-  @overload
   def __getitem__(self, item: t.Literal['forced_bos_token_id']) -> int: ...
   @overload
   def __getitem__(self, item: t.Literal['forced_eos_token_id']) -> t.Union[int, t.List[int]]: ...
@@ -1514,20 +1328,12 @@ class LLMConfig(_ConfigAttr):
   # holds a mapping from self.__openllm_model_type__ to peft.TaskType
   @classmethod
   def peft_task_type(cls) -> str:
-    return _PEFT_TASK_TYPE_TARGET_MAPPING[cls.__openllm_model_type__]
+    return PEFT_TASK_TYPE_TARGET_MAPPING[cls.__openllm_model_type__]

 converter.register_unstructure_hook_factory(lambda cls: lenient_issubclass(cls, LLMConfig),
                                             lambda cls: make_dict_unstructure_fn(cls, converter, _cattrs_omit_if_default=False, _cattrs_use_linecache=True))

 def structure_llm_config(data: t.Any, cls: type[LLMConfig]) -> LLMConfig:
-  """Structure a dictionary to a LLMConfig object.
-
-  Essentially, if the given dictionary contains a 'generation_config' key, then we will
-  use it for LLMConfig.generation_config
-
-  Otherwise, we will filter out all keys are first in LLMConfig, parse it in, then
-  parse the remaining keys into LLMConfig.generation_config
-  """
   if not isinstance(data, dict): raise RuntimeError(f'Expected a dictionary, but got {type(data)}')
   cls_attrs = {k: v for k, v in data.items() if k in cls.__openllm_accepted_keys__}
   generation_cls_fields = attr.fields_dict(cls.__openllm_generation_class__)
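
With the vendored PEFT helpers out of `_configuration.py`, the single source of truth becomes the new `openllm_core/utils/peft.py` below, and the `from .utils.peft import ...` line added above is the only import change `_configuration.py` needs. A sketch of the resulting public import path (assuming `openllm-core` is installed):

```python
from openllm_core.utils.peft import PEFT_TASK_TYPE_TARGET_MAPPING, FineTuneConfig, PeftType

# _PeftEnumMeta makes member lookup case-insensitive:
assert PeftType['lora'] is PeftType.LORA
assert PeftType.get('Lora') is PeftType.LORA
assert 'lora' in PeftType.supported()
```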
diff --git a/openllm-core/src/openllm_core/utils/peft.py b/openllm-core/src/openllm_core/utils/peft.py
new file mode 100644
index 00000000..2c1f51ee
--- /dev/null
+++ b/openllm-core/src/openllm_core/utils/peft.py
@@ -0,0 +1,100 @@
+from __future__ import annotations
+import enum, typing as t, inflection, attr
+from deepmerge import Merger
+from . import dantic
+from ..exceptions import ForbiddenAttributeError
+
+config_merger = Merger([(dict, 'merge')], ['override'], ['override'])
+
+if t.TYPE_CHECKING:
+  from peft.config import PeftConfig
+  from .._typing_compat import AdapterType
+  from .._configuration import LLMConfig
+
+# case insensitive, but rename to conform with type
+class _PeftEnumMeta(enum.EnumMeta):
+  def __getitem__(self, __key: str | t.Any, /) -> t.Any:
+    if isinstance(__key, str): __key = inflection.underscore(__key).upper()
+    return self._member_map_[__key]
+
+# vendored from peft.utils.config.PeftType since we don't have a hard dependency on peft
+# see https://github.com/huggingface/peft/blob/main/src/peft/utils/config.py
+class PeftType(str, enum.Enum, metaclass=_PeftEnumMeta):
+  PROMPT_TUNING = 'PROMPT_TUNING'
+  MULTITASK_PROMPT_TUNING = 'MULTITASK_PROMPT_TUNING'
+  P_TUNING = 'P_TUNING'
+  PREFIX_TUNING = 'PREFIX_TUNING'
+  LORA = 'LORA'
+  ADALORA = 'ADALORA'
+  ADAPTION_PROMPT = 'ADAPTION_PROMPT'
+  IA3 = 'IA3'
+  LOHA = 'LOHA'
+  LOKR = 'LOKR'
+
+  @classmethod
+  def _missing_(cls, value: object) -> enum.Enum | None:
+    if isinstance(value, str):
+      normalized = inflection.underscore(value).upper()
+      if normalized in cls._member_map_: return cls._member_map_[normalized]
+    return None
+
+  @classmethod
+  def supported(cls) -> set[str]:
+    return {inflection.underscore(v.value) for v in cls}
+
+  @staticmethod
+  def get(__key: str | t.Any, /) -> PeftType:
+    return PeftType[__key]  # type-safe getitem.
+
+PEFT_TASK_TYPE_TARGET_MAPPING = {'causal_lm': 'CAUSAL_LM', 'seq2seq_lm': 'SEQ_2_SEQ_LM'}
+
+_object_setattr = object.__setattr__
+
+def _adapter_converter(value: AdapterType | str | PeftType | None) -> PeftType:
+  if value is None: raise ValueError("'AdapterType' cannot be None.")
+  if isinstance(value, PeftType): return value
+  if value not in PeftType.supported(): raise ValueError(f"Given '{value}' is not a supported adapter type.")
+  return PeftType.get(value)
+
+@attr.define(slots=True, init=True)
+class FineTuneConfig:
+  adapter_type: PeftType = dantic.Field('lora',
+                                        description=f"The type of adapter to use for fine-tuning. Available supported methods: {PeftType.supported()}, defaults to 'lora'",
+                                        use_default_converter=False,
+                                        converter=_adapter_converter)
+  adapter_config: t.Dict[str, t.Any] = dantic.Field(None,
+                                                    description='The configuration for the adapter. The content of the dict depends on the adapter type.',
+                                                    validator=attr.validators.optional(attr.validators.instance_of(dict)),
+                                                    converter=attr.converters.default_if_none(factory=dict),
+                                                    use_default_converter=False)
+  inference_mode: bool = dantic.Field(False, description='Whether to use this Adapter for inference', use_default_converter=False)
+  llm_config_class: type[LLMConfig] = dantic.Field(None, description='The reference class to openllm.LLMConfig', use_default_converter=False)
+
+  def build(self) -> PeftConfig:
+    try:
+      from peft import TaskType, get_peft_config
+    except ImportError:
+      raise ImportError('PEFT is not installed. Please install it via `pip install "openllm[fine-tune]"`.') from None
+    adapter_config = self.adapter_config.copy()
+    # no need for a user-provided peft_type; we inject it from adapter_type below
+    if 'peft_type' in adapter_config: adapter_config.pop('peft_type')
+    for k in {'enable_lora', 'merge_weights'}:  # these keys are from older PEFT and no longer valid.
+      if k in adapter_config: adapter_config.pop(k)
+    # respect user set task_type if it is passed, otherwise use one managed by OpenLLM
+    inference_mode = adapter_config.pop('inference_mode', self.inference_mode)
+    task_type = adapter_config.pop('task_type', TaskType[self.llm_config_class.peft_task_type()])
+    adapter_config = {'peft_type': self.adapter_type.value, 'task_type': task_type, 'inference_mode': inference_mode, **adapter_config}
+    return get_peft_config(adapter_config)
+
+  def train(self) -> FineTuneConfig:
+    _object_setattr(self, 'inference_mode', False)
+    return self
+
+  def eval(self) -> FineTuneConfig:
+    _object_setattr(self, 'inference_mode', True)
+    return self
+
+  def with_config(self, **attrs: t.Any) -> FineTuneConfig:
+    adapter_type, inference_mode = attrs.pop('adapter_type', self.adapter_type), attrs.get('inference_mode', self.inference_mode)
+    if 'llm_config_class' in attrs: raise ForbiddenAttributeError("'llm_config_class' should not be passed when using 'with_config'.")
+    return attr.evolve(self, adapter_type=adapter_type, inference_mode=inference_mode, adapter_config=config_merger.merge(self.adapter_config, attrs))
diff --git a/openllm-python/src/openllm/_llm.py b/openllm-python/src/openllm/_llm.py
index 080c3187..824e972c 100644
--- a/openllm-python/src/openllm/_llm.py
+++ b/openllm-python/src/openllm/_llm.py
@@ -19,7 +19,7 @@ from bentoml._internal.models.model import ModelSignature
 from bentoml._internal.runner.runner_handle import DummyRunnerHandle
 from openllm_core._schemas import CompletionChunk
 from openllm_core._schemas import GenerationOutput
-from openllm_core._strategies import CascadingResourceStrategy
+from ._strategies import CascadingResourceStrategy
 from openllm_core._typing_compat import AdapterMap
 from openllm_core._typing_compat import AdapterTuple
 from openllm_core._typing_compat import AdapterType
diff --git a/openllm-core/src/openllm_core/_strategies.py b/openllm-python/src/openllm/_strategies.py
similarity index 97%
rename from openllm-core/src/openllm_core/_strategies.py
rename to openllm-python/src/openllm/_strategies.py
index a35fd916..1dcb9c59 100644
--- a/openllm-core/src/openllm_core/_strategies.py
+++ b/openllm-python/src/openllm/_strategies.py
@@ -1,4 +1,3 @@
-# NOTE: This module depends on BentoML, whereas openllm_core doesn't necessary depends on BentoML.
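
A quick sketch of how the relocated `FineTuneConfig` is meant to be driven (the adapter values are hypothetical, and `peft` must be installed for `build()` to succeed; passing `task_type` directly sidesteps the `llm_config_class` lookup):

```python
from openllm_core.utils.peft import FineTuneConfig

ft = FineTuneConfig(adapter_type='lora', adapter_config={'r': 64, 'lora_alpha': 16, 'task_type': 'CAUSAL_LM'})
ft = ft.with_config(lora_dropout=0.1)  # deep-merged into adapter_config via config_merger
peft_config = ft.train().build()  # a peft.LoraConfig; raises ImportError with an install hint if peft is missing
```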
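
Since the module now lives in `openllm-python`, any downstream import of the strategy helpers has to move from the `openllm_core` namespace to `openllm`, mirroring the `_llm.py` change above and the test updates below:

```python
# before this patch (openllm-core):
from openllm_core._strategies import CascadingResourceStrategy

# after this patch (openllm-python):
from openllm._strategies import CascadingResourceStrategy
```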
diff --git a/openllm-python/src/openllm/cli/entrypoint.py b/openllm-python/src/openllm/cli/entrypoint.py
index 8274e094..7089fa30 100644
--- a/openllm-python/src/openllm/cli/entrypoint.py
+++ b/openllm-python/src/openllm/cli/entrypoint.py
@@ -369,25 +369,6 @@ def import_command(model_name: str, model_id: str | None, converter: str | None,
   > If ``quantize`` is passed, the model weights will be saved as quantized weights. You should
   > only use this option if you want the weight to be quantized by default. Note that OpenLLM also
   > support on-demand quantisation during initial startup.
-
-  \b
-  ## Conversion strategies [EXPERIMENTAL]
-
-  \b
-  Some models will include built-in conversion strategies for specific weights format.
-  It will be determined via the `CONVERTER` environment variable. Note that this envvar should only be use provisionally as it is not RECOMMENDED to export this
-  and save to a ``.env`` file.
-
-  The conversion strategies will have the following format and will be determined per architecture implementation:
-
-
-  \b
-  For example: the below convert LlaMA-2 model format to hf:
-
-  \b
-  ```bash
-  $ CONVERTER=llama2-hf openllm import llama /path/to/llama-2
-  ```
   """
   llm_config = openllm.AutoConfig.for_model(model_name)
   _serialisation = t.cast(LiteralSerialisation, first_not_none(serialisation, default=llm_config['serialisation']))
diff --git a/openllm-python/tests/strategies_test.py b/openllm-python/tests/strategies_test.py
index 97d72038..7cb3d3c6 100644
--- a/openllm-python/tests/strategies_test.py
+++ b/openllm-python/tests/strategies_test.py
@@ -6,10 +6,10 @@
 import pytest

 import bentoml
-from openllm_core import _strategies as strategy
-from openllm_core._strategies import CascadingResourceStrategy
-from openllm_core._strategies import NvidiaGpuResource
-from openllm_core._strategies import get_resource
+from openllm import _strategies as strategy
+from openllm._strategies import CascadingResourceStrategy
+from openllm._strategies import NvidiaGpuResource
+from openllm._strategies import get_resource

 if t.TYPE_CHECKING:
   from _pytest.monkeypatch import MonkeyPatch