From 05fa34f9e6e8a862730c23a892d49f656ddfbcb6 Mon Sep 17 00:00:00 2001 From: Aaron <29749331+aarnphm@users.noreply.github.com> Date: Sat, 10 Jun 2023 17:36:02 -0400 Subject: [PATCH] refactor: pretrained => model_id I think model_id makes more sense than calling it pretrained Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com> --- src/openllm/_llm.py | 78 ++++++++++--------- src/openllm/cli.py | 57 +++++++------- src/openllm/models/auto/factory.py | 21 +++-- .../models/chatglm/configuration_chatglm.py | 8 +- .../models/chatglm/modeling_chatglm.py | 15 ++-- .../models/dolly_v2/configuration_dolly_v2.py | 8 +- .../models/dolly_v2/modeling_dolly_v2.py | 10 +-- .../models/falcon/configuration_falcon.py | 8 +- src/openllm/models/falcon/modeling_falcon.py | 10 +-- .../models/flan_t5/configuration_flan_t5.py | 8 +- .../models/flan_t5/modeling_flan_t5.py | 4 +- .../models/flan_t5/modeling_flax_flan_t5.py | 4 +- .../models/flan_t5/modeling_tf_flan_t5.py | 4 +- .../models/stablelm/configuration_stablelm.py | 8 +- .../models/stablelm/modeling_stablelm.py | 6 +- .../starcoder/configuration_starcoder.py | 8 +- .../models/starcoder/modeling_starcoder.py | 10 +-- src/openllm/utils/__init__.py | 4 +- 18 files changed, 155 insertions(+), 116 deletions(-) diff --git a/src/openllm/_llm.py b/src/openllm/_llm.py index 258d3fa4..a0f7714c 100644 --- a/src/openllm/_llm.py +++ b/src/openllm/_llm.py @@ -92,14 +92,14 @@ class TaskType(enum.Enum, metaclass=TypeMeta): def import_model( - model_name: str, + model_id: str, tag: bentoml.Tag, _model_framework: str, *model_args: t.Any, tokenizer_kwds: dict[str, t.Any], **attrs: t.Any, ): - """Auto detect model type from given model_name and import it to bentoml's model store. + """Auto detect model type from given model_id and import it to bentoml's model store. For all kwargs, it will be parsed into `transformers.AutoConfig.from_pretrained` first, returning all of the unused kwargs. 
@@ -111,7 +111,7 @@ def import_model( Refer to Transformers documentation for more information about kwargs. Args: - model_name: Model name to be imported. use `openllm models` to see available entries + model_id: Model id to be imported. See `openllm models` for all supported models. tag: Tag to be used for the model. This is usually generated for you. model_args: Args to be passed into AutoModelForSeq2SeqLM or AutoModelForCausalLM (+ TF, Flax variants). **attrs: Kwargs to be passed into AutoModelForSeq2SeqLM or AutoModelForCausalLM (+ TF, Flax variants). @@ -139,7 +139,7 @@ def import_model( config, attrs = t.cast( "tuple[transformers.PretrainedConfig, dict[str, t.Any]]", transformers.AutoConfig.from_pretrained( - model_name, return_unused_kwargs=True, trust_remote_code=trust_remote_code, **hub_attrs, **copied_attrs + model_id, return_unused_kwargs=True, trust_remote_code=trust_remote_code, **hub_attrs, **copied_attrs ), ) @@ -156,13 +156,13 @@ def import_model( getattr( transformers, FRAMEWORK_TO_AUTOCLASS_MAPPING[_model_framework][TaskType[task_type].value - 1] ).from_pretrained( - model_name, *model_args, config=config, trust_remote_code=trust_remote_code, **hub_attrs, **attrs + model_id, *model_args, config=config, trust_remote_code=trust_remote_code, **hub_attrs, **attrs ), custom_objects={ "tokenizer": t.cast( "LLMTokenizer", transformers.AutoTokenizer.from_pretrained( - model_name, config=config, trust_remote_code=trust_remote_code, **hub_attrs, **tokenizer_kwds + model_id, config=config, trust_remote_code=trust_remote_code, **hub_attrs, **tokenizer_kwds ), ) }, @@ -179,7 +179,7 @@ def import_model( torch.cuda.empty_cache() -_required_namespace = {"default_model", "pretrained"} +_required_namespace = {"default_id", "model_ids"} _reserved_namespace = _required_namespace | { "config_class", @@ -192,11 +192,11 @@ _reserved_namespace = _required_namespace | { class LLMInterface(ABC): """This defines the loose contract for all openllm.LLM implementations.""" - 
default_model: str - """Return the default model to use when using 'openllm start '. - This could be one of the keys in 'self.pretrained' or custom users model.""" + default_id: str + """Return the default model to use when using 'openllm start '. + This could be one of the keys in 'self.model_ids' or custom users model.""" - pretrained: list[str] + model_ids: list[str] """A list of supported pretrained models tag for this given runnable. For example: @@ -253,7 +253,7 @@ class LLMInterface(ABC): pass def import_model( - self, pretrained: str, tag: bentoml.Tag, *args: t.Any, tokenizer_kwds: dict[str, t.Any], **attrs: t.Any + self, model_id: str, tag: bentoml.Tag, *args: t.Any, tokenizer_kwds: dict[str, t.Any], **attrs: t.Any ) -> bentoml.Model: """This function can be implemented if default import_model doesn't satisfy your needs.""" raise NotImplementedError @@ -275,6 +275,8 @@ class LLMMetaclass(ABCMeta): namespace["__annotations__"] = annotations_dict # NOTE: check for required attributes + if "__openllm_internal__" not in namespace: + _required_namespace.add("config_class") for k in _required_namespace: if k not in namespace: raise RuntimeError(f"Missing required key '{k}'. 
Make sure to define it within the LLM subclass.") @@ -378,13 +380,13 @@ class LLM(LLMInterface, metaclass=LLMMetaclass): @classmethod def from_pretrained( - cls, pretrained: str | None = None, llm_config: openllm.LLMConfig | None = None, *args: t.Any, **attrs: t.Any + cls, model_id: str | None = None, llm_config: openllm.LLMConfig | None = None, *args: t.Any, **attrs: t.Any ) -> LLM: - return cls(pretrained=pretrained, llm_config=llm_config, *args, **attrs) + return cls(model_id=model_id, llm_config=llm_config, *args, **attrs) def __init__( self, - pretrained: str | None = None, + model_id: str | None = None, llm_config: openllm.LLMConfig | None = None, *args: t.Any, **attrs: t.Any, @@ -408,7 +410,7 @@ class LLM(LLMInterface, metaclass=LLMMetaclass): ```python def import_model( self, - pretrained: str, + model_id: str, tag: bentoml.Tag, *args: t.Any, tokenizer_kwds: dict[str, t.Any], @@ -417,11 +419,11 @@ class LLM(LLMInterface, metaclass=LLMMetaclass): return bentoml.transformers.save_model( tag, transformers.AutoModelForCausalLM.from_pretrained( - pretrained, device_map="auto", torch_dtype=torch.bfloat16, **attrs + model_id, device_map="auto", torch_dtype=torch.bfloat16, **attrs ), custom_objects={ "tokenizer": transformers.AutoTokenizer.from_pretrained( - pretrained, padding_size="left", **tokenizer_kwds + model_id, padding_size="left", **tokenizer_kwds ) }, ) @@ -440,14 +442,14 @@ class LLM(LLMInterface, metaclass=LLMMetaclass): Note: If you implement your own `import_model`, then `import_kwargs` will be the default kwargs for every load. You can still override those via ``openllm.Runner``. - Note that this tag will be generated based on `self.default_model` or the given `pretrained` kwds. + Note that this tag will be generated based on `self.default_id` or the given `pretrained` kwds. passed from the __init__ constructor. ``llm_post_init`` can also be implemented if you need to do any additional initialization after everything is setup. 
Args: - pretrained: The pretrained model to use. Defaults to None. It will use 'self.default_model' if None. + model_id: The pretrained model to use. Defaults to None. If None, 'self.default_id' will be used. llm_config: The config to use for this LLM. Defaults to None. If not passed, we will use 'self.config_class' to construct default configuration. *args: The args to be passed to the model. @@ -462,14 +464,14 @@ class LLM(LLMInterface, metaclass=LLMMetaclass): # The rests of the kwargs that is not used by the config class should be stored into __openllm_extras__. attrs = self.config.__openllm_extras__ - if pretrained is None: - pretrained = os.environ.get(self.config.__openllm_env__.pretrained, None) - if not pretrained: - assert self.default_model, "A default model is required for any LLM." - pretrained = self.default_model + if model_id is None: + model_id = os.environ.get(self.config.__openllm_env__.model_id, None) + if not model_id: + assert self.default_id, "A default model is required for any LLM." + model_id = self.default_id # NOTE: This is the actual given path or pretrained weight for this LLM. - self._pretrained = pretrained + self._model_id = model_id # NOTE: Save the args and kwargs for latter load self._llm_args = args @@ -491,19 +493,19 @@ class LLM(LLMInterface, metaclass=LLMMetaclass): # NOTE: The section below defines a loose contract with langchain's LLM interface. 
@property def llm_type(self) -> str: - return convert_transformers_model_name(self._pretrained) + return convert_transformers_model_name(self._model_id) @property def identifying_params(self) -> dict[str, t.Any]: return { "configuration": self.config.model_dump_json().decode(), - "pretrained": orjson.dumps(self.pretrained).decode(), + "model_ids": orjson.dumps(self.model_ids).decode(), } @t.overload def make_tag( self, - model_name_or_path: str | None = None, + model_id: str | None = None, return_unused_kwargs: t.Literal[False] = ..., trust_remote_code: bool = ..., **attrs: t.Any, @@ -513,7 +515,7 @@ class LLM(LLMInterface, metaclass=LLMMetaclass): @t.overload def make_tag( self, - model_name_or_path: str | None = None, + model_id: str | None = None, return_unused_kwargs: t.Literal[True] = ..., trust_remote_code: bool = ..., **attrs: t.Any, @@ -522,7 +524,7 @@ class LLM(LLMInterface, metaclass=LLMMetaclass): def make_tag( self, - model_name_or_path: str | None = None, + model_id: str | None = None, return_unused_kwargs: bool = False, trust_remote_code: bool = False, **attrs: t.Any, @@ -543,8 +545,8 @@ class LLM(LLMInterface, metaclass=LLMMetaclass): Returns: A tuple of ``bentoml.Tag`` and a dict of unused kwargs. 
""" - if model_name_or_path is None: - model_name_or_path = self._pretrained + if model_id is None: + model_id = self._model_id if "return_unused_kwargs" in attrs: logger.debug("Ignoring 'return_unused_kwargs' in 'generate_tag_from_model_name'.") @@ -553,12 +555,12 @@ class LLM(LLMInterface, metaclass=LLMMetaclass): config, attrs = t.cast( "tuple[transformers.PretrainedConfig, dict[str, t.Any]]", transformers.AutoConfig.from_pretrained( - model_name_or_path, return_unused_kwargs=True, trust_remote_code=trust_remote_code, **attrs + model_id, return_unused_kwargs=True, trust_remote_code=trust_remote_code, **attrs ), ) - name = convert_transformers_model_name(model_name_or_path) + name = convert_transformers_model_name(model_id) - if os.path.exists(os.path.dirname(model_name_or_path)): + if os.path.exists(os.path.dirname(model_id)): # If the model_name_or_path is a path, we assume it's a local path, # then users must pass a version for this. model_version = attrs.pop("openllm_model_version", None) @@ -590,7 +592,7 @@ class LLM(LLMInterface, metaclass=LLMMetaclass): "Given %s from '%s' doesn't contain a commit hash. 
We will generate" " the tag without specific version.", t.cast("type[transformers.PretrainedConfig]", config.__class__), - model_name_or_path, + model_id, ) tag = bentoml.Tag.from_taglike(f"{self.__llm_implementation__}-{name}:{model_version}") @@ -621,7 +623,7 @@ class LLM(LLMInterface, metaclass=LLMMetaclass): } return self.import_model( - self._pretrained, + self._model_id, tag, *self._llm_args, tokenizer_kwds=tokenizer_kwds, diff --git a/src/openllm/cli.py b/src/openllm/cli.py index a9bb1d40..0f79efec 100644 --- a/src/openllm/cli.py +++ b/src/openllm/cli.py @@ -388,8 +388,8 @@ def start_model_command( docstring = f"""\ {ModelEnv.start_docstring} \b -The available pretrained models to use with '{model_name}' are: {for_doc.pretrained} [default: {for_doc.default_model}] -Tip: One can pass one of the aforementioned to '--pretrained' to use other pretrained weights. +Available model_id(s) to use with '{model_name}' are: {for_doc.model_ids} [default: {for_doc.default_id}] +Tip: One can pass one of the aforementioned to '--model-id' to use other pretrained weights. """ command_attrs: dict[str, t.Any] = { "name": ModelEnv.model_name, @@ -430,9 +430,7 @@ Tip: One can pass one of the aforementioned to '--pretrained' to use other pretr @llm_config.to_click_options @parse_serve_args(_serve_grpc) @click.option("--server-timeout", type=int, default=3600, help="Server timeout in seconds") - @click.option( - "--pretrained", type=click.STRING, default=None, help="Optional pretrained name or path to fine-tune weight." - ) + @model_id_option @click.option( "--device", type=tuple, @@ -444,18 +442,18 @@ Tip: One can pass one of the aforementioned to '--pretrained' to use other pretr ) def model_start( server_timeout: int, - pretrained: str | None, + model_id: str | None, device: tuple[str, ...] 
| None, **attrs: t.Any, ) -> openllm.LLMConfig: config, server_attrs = llm_config.model_validate_click(**attrs) if ModelEnv.get_framework_env() == "flax": - llm = openllm.AutoFlaxLLM.for_model(model_name, pretrained=pretrained, llm_config=config) + llm = openllm.AutoFlaxLLM.for_model(model_name, model_id=model_id, llm_config=config) elif ModelEnv.get_framework_env() == "tf": - llm = openllm.AutoTFLLM.for_model(model_name, pretrained=pretrained, llm_config=config) + llm = openllm.AutoTFLLM.for_model(model_name, model_id=model_id, llm_config=config) else: - llm = openllm.AutoLLM.for_model(model_name, pretrained=pretrained, llm_config=config) + llm = openllm.AutoLLM.for_model(model_name, model_id=model_id, llm_config=config) # NOTE: We need to initialize llm here first to check if the model is already downloaded to # avoid deadlock before the subprocess forking. @@ -580,6 +578,12 @@ output_option = click.option( default="pretty", help="Showing output type. Default to 'pretty'", ) +model_id_option = click.option( + "--model-id", + type=click.STRING, + default=None, + help="Optional model_id name or path for (fine-tune) weight.", +) def cli_factory() -> click.Group: @@ -626,16 +630,15 @@ def cli_factory() -> click.Group: @click.argument( "model_name", type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING.keys()]) ) - @click.option("--pretrained", default=None, help="Given pretrained model name for the given model name [Optional].") - @click.option("--overwrite", is_flag=True, help="Overwrite existing Bento for given LLM if it already exists.") + @model_id_option @output_option - def build(model_name: str, pretrained: str | None, overwrite: bool, output: OutputLiteral): + @click.option("--overwrite", is_flag=True, help="Overwrite existing Bento for given LLM if it already exists.") + def build(model_name: str, model_id: str | None, overwrite: bool, output: OutputLiteral): """Package a given models into a Bento. 
$ openllm build flan-t5 \b - NOTE: To run a container built from this Bento with GPU support, make sure to have https://github.com/NVIDIA/nvidia-container-toolkit install locally. """ @@ -645,7 +648,7 @@ def cli_factory() -> click.Group: bento, _previously_built = openllm.build( model_name, __cli__=True, - pretrained=pretrained, + model_id=model_id, _overwrite_existing_bento=overwrite, ) @@ -684,13 +687,13 @@ def cli_factory() -> click.Group: else: failed_initialized: list[tuple[str, Exception]] = [] - json_data: dict[str, dict[t.Literal["pretrained", "description"], t.Any]] = {} + json_data: dict[str, dict[t.Literal["model_id", "description"], t.Any]] = {} for m in models: try: model = openllm.AutoLLM.for_model(m) docs = inspect.cleandoc(model.config.__doc__ or "(No description)") - json_data[m] = {"pretrained": model.pretrained, "description": docs} + json_data[m] = {"model_id": model.model_ids, "description": docs} except Exception as err: failed_initialized.append((m, err)) @@ -701,7 +704,7 @@ def cli_factory() -> click.Group: data: list[str | tuple[str, str, list[str]]] = [] for m, v in json_data.items(): - data.extend([(m, v["description"], v["pretrained"])]) + data.extend([(m, v["description"], v["model_id"])]) column_widths = [int(COLUMNS / 6), int(COLUMNS / 3 * 2), int(COLUMNS / 6)] if len(data) == 0 and len(failed_initialized) > 0: @@ -714,7 +717,7 @@ def cli_factory() -> click.Group: table = tabulate.tabulate( data, tablefmt="fancy_grid", - headers=["LLM", "Description", "Pretrained"], + headers=["LLM", "Description", "Models Id"], maxcolwidths=column_widths, ) @@ -739,11 +742,9 @@ def cli_factory() -> click.Group: @click.argument( "model_name", type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING.keys()]) ) - @click.option( - "--pretrained", type=click.STRING, default=None, help="Optional pretrained name or path to fine-tune weight." 
- ) + @model_id_option @output_option - def download_models(model_name: str, pretrained: str | None, output: OutputLiteral): + def download_models(model_name: str, model_id: str | None, output: OutputLiteral): """Setup LLM interactively. Note: This is useful for development and setup for fine-tune. @@ -751,11 +752,11 @@ def cli_factory() -> click.Group: config = openllm.AutoConfig.for_model(model_name) env = config.__openllm_env__.get_framework_env() if env == "flax": - model = openllm.AutoFlaxLLM.for_model(model_name, pretrained=pretrained, llm_config=config) + model = openllm.AutoFlaxLLM.for_model(model_name, model_id=model_id, llm_config=config) elif env == "tf": - model = openllm.AutoTFLLM.for_model(model_name, pretrained=pretrained, llm_config=config) + model = openllm.AutoTFLLM.for_model(model_name, model_id=model_id, llm_config=config) else: - model = openllm.AutoLLM.for_model(model_name, pretrained=pretrained, llm_config=config) + model = openllm.AutoLLM.for_model(model_name, model_id=model_id, llm_config=config) tag = model.make_tag(trust_remote_code=config.__openllm_trust_remote_code__) @@ -829,7 +830,7 @@ def cli_factory() -> click.Group: ) @output_option @click.argument("query", type=click.STRING) - def query( + def query_( query: str, endpoint: str, timeout: int, @@ -838,7 +839,7 @@ def cli_factory() -> click.Group: ): """Ask a LLM interactively, from a terminal. - $ openllm query --endpoint http://12.323.2.1 "What is the meaning of life?" + $ openllm query --endpoint http://12.323.2.1:3000 "What is the meaning of life?" 
""" if server_type == "grpc": endpoint = re.sub(r"http://", "", endpoint) @@ -870,7 +871,7 @@ def cli_factory() -> click.Group: _echo(res["responses"], fg="white") if t.TYPE_CHECKING: - assert download_models and build and models and start and start_grpc and query and prune + assert download_models and build and models and start and start_grpc and query_ and prune if psutil.WINDOWS: sys.stdout.reconfigure(encoding="utf-8") # type: ignore diff --git a/src/openllm/models/auto/factory.py b/src/openllm/models/auto/factory.py index 48620bd5..f2af4d06 100644 --- a/src/openllm/models/auto/factory.py +++ b/src/openllm/models/auto/factory.py @@ -47,7 +47,7 @@ class _BaseAutoLLMClass: def for_model( cls, model_name: str, - pretrained: str | None = None, + model_id: str | None = None, return_runner_kwargs: t.Literal[False] = ..., llm_config: openllm.LLMConfig | None = ..., **attrs: t.Any, @@ -59,7 +59,7 @@ class _BaseAutoLLMClass: def for_model( cls, model_name: str, - pretrained: str | None = None, + model_id: str | None = None, return_runner_kwargs: t.Literal[True] = ..., llm_config: openllm.LLMConfig | None = ..., **attrs: t.Any, @@ -70,11 +70,18 @@ class _BaseAutoLLMClass: def for_model( cls, model_name: str, - pretrained: str | None = None, + model_id: str | None = None, return_runner_kwargs: bool = False, llm_config: openllm.LLMConfig | None = ..., **attrs: t.Any, ) -> openllm.LLM | tuple[openllm.LLM, dict[str, t.Any]]: + """The lower level API for creating a LLM instance. 
+ + ```python + >>> import openllm + >>> llm = openllm.AutoLLM.for_model("flan-t5") + ``` + """ runner_kwargs_name = [ "models", "max_batch_size", @@ -88,7 +95,7 @@ class _BaseAutoLLMClass: # The rest of kwargs is now passed to config llm_config = AutoConfig.for_model(model_name, **attrs) if type(llm_config) in cls._model_mapping.keys(): - llm = cls._model_mapping[type(llm_config)].from_pretrained(pretrained, llm_config=llm_config, **attrs) + llm = cls._model_mapping[type(llm_config)].from_pretrained(model_id, llm_config=llm_config, **attrs) if not return_runner_kwargs: return llm return llm, to_runner_attrs @@ -98,19 +105,19 @@ class _BaseAutoLLMClass: ) @classmethod - def create_runner(cls, model_name: str, pretrained: str | None = None, **attrs: t.Any) -> bentoml.Runner: + def create_runner(cls, model_name: str, model_id: str | None = None, **attrs: t.Any) -> bentoml.Runner: """ Create a LLM Runner for the given model name. Args: model_name: The model name to instantiate. - pretrained: The pretrained model name to instantiate. + model_id: The pretrained model name to instantiate. **attrs: Additional keyword arguments passed along to the specific configuration class. Returns: A LLM instance. """ - llm, runner_attrs = cls.for_model(model_name, pretrained, return_runner_kwargs=True, **attrs) + llm, runner_attrs = cls.for_model(model_name, model_id, return_runner_kwargs=True, **attrs) return llm.to_runner(**runner_attrs) @classmethod diff --git a/src/openllm/models/chatglm/configuration_chatglm.py b/src/openllm/models/chatglm/configuration_chatglm.py index 783b844c..d5230d2e 100644 --- a/src/openllm/models/chatglm/configuration_chatglm.py +++ b/src/openllm/models/chatglm/configuration_chatglm.py @@ -55,7 +55,7 @@ class ChatGLMConfig( START_CHATGLM_COMMAND_DOCSTRING = """\ -Run a LLMServer for ChatGLM model and pretrained. +Run a LLMServer for ChatGLM model. 
\b > See more information about ChatGLM at [THUDM/ChatGLM-6b](https://huggingface.co/thudm/chatglm-6b) @@ -67,7 +67,11 @@ Currently, ChatGLM only supports PyTorch. Make sure ``torch`` is available in yo \b ChatGLM Runner will use THUDM/ChatGLM-6b as the default model. To change any to any other ChatGLM -saved pretrained, or a fine-tune ChatGLM, provide ``OPENLLM_CHATGLM_PRETRAINED='thudm/chatglm-6b-int8'`` +saved pretrained, or a fine-tune ChatGLM, provide ``OPENLLM_CHATGLM_MODEL_ID='thudm/chatglm-6b-int8'`` +or provide `--model-id` flag when running ``openllm start chatglm``: + +\b +$ openllm start chatglm --model-id='thudm/chatglm-6b-int8' """ DEFAULT_PROMPT_TEMPLATE = """{instruction}""" diff --git a/src/openllm/models/chatglm/modeling_chatglm.py b/src/openllm/models/chatglm/modeling_chatglm.py index bce2f513..01b1154b 100644 --- a/src/openllm/models/chatglm/modeling_chatglm.py +++ b/src/openllm/models/chatglm/modeling_chatglm.py @@ -41,22 +41,27 @@ class InvalidScoreLogitsProcessor(LogitsProcessor): class ChatGLM(openllm.LLM): __openllm_internal__ = True - default_model = "thudm/chatglm-6b-int4" + default_id = "thudm/chatglm-6b-int4" - pretrained = ["thudm/chatglm-6b", "thudm/chatglm-6b-int8", "thudm/chatglm-6b-int4"] + model_ids = ["thudm/chatglm-6b", "thudm/chatglm-6b-int8", "thudm/chatglm-6b-int4"] device = torch.device("cuda") def import_model( - self, pretrained: str, tag: bentoml.Tag, *model_args: t.Any, tokenizer_kwds: dict[str, t.Any], **attrs: t.Any + self, + model_id: str, + tag: bentoml.Tag, + *model_args: t.Any, + tokenizer_kwds: dict[str, t.Any], + **attrs: t.Any, ) -> bentoml.Model: trust_remote_code = attrs.pop("trust_remote_code", True) return bentoml.transformers.save_model( tag, - transformers.AutoModel.from_pretrained(pretrained, trust_remote_code=trust_remote_code), + transformers.AutoModel.from_pretrained(model_id, trust_remote_code=trust_remote_code), custom_objects={ "tokenizer": transformers.AutoTokenizer.from_pretrained( - pretrained, 
trust_remote_code=trust_remote_code, **tokenizer_kwds + model_id, trust_remote_code=trust_remote_code, **tokenizer_kwds ) }, ) diff --git a/src/openllm/models/dolly_v2/configuration_dolly_v2.py b/src/openllm/models/dolly_v2/configuration_dolly_v2.py index 42a4b0bb..2b148bce 100644 --- a/src/openllm/models/dolly_v2/configuration_dolly_v2.py +++ b/src/openllm/models/dolly_v2/configuration_dolly_v2.py @@ -51,7 +51,7 @@ class DollyV2Config( START_DOLLY_V2_COMMAND_DOCSTRING = """\ -Run a LLMServer for dolly-v2 model and pretrained. +Run a LLMServer for dolly-v2 model. \b > See more information about dolly-v2 at [databricks/dolly-v2-3b](https://huggingface.co/databricks/dolly-v2-3b) @@ -63,7 +63,11 @@ Currently, dolly-v2 only supports PyTorch. Make sure ``torch`` is available in y \b Dolly-v2 Runner will use databricks/dolly-v2-3b as the default model. To change any to any other dolly-v2 -saved pretrained, or a fine-tune dolly-v2, provide ``OPENLLM_DOLLY_V2_PRETRAINED='databricks/dolly-v2-7b'`` +saved pretrained, or a fine-tune dolly-v2, provide ``OPENLLM_DOLLY_V2_MODEL_ID='databricks/dolly-v2-7b'`` +or provide `--model-id` flag when running ``openllm start dolly-v2``: + +\b +$ openllm start dolly-v2 --model-id databricks/dolly-v2-7b """ INSTRUCTION_KEY = "### Instruction:" diff --git a/src/openllm/models/dolly_v2/modeling_dolly_v2.py b/src/openllm/models/dolly_v2/modeling_dolly_v2.py index fc1434ee..d01b8cfe 100644 --- a/src/openllm/models/dolly_v2/modeling_dolly_v2.py +++ b/src/openllm/models/dolly_v2/modeling_dolly_v2.py @@ -38,9 +38,9 @@ class DollyV2(openllm.LLM): __openllm_internal__ = True - default_model = "databricks/dolly-v2-3b" + default_id = "databricks/dolly-v2-3b" - pretrained = ["databricks/dolly-v2-3b", "databricks/dolly-v2-7b", "databricks/dolly-v2-12b"] + model_ids = ["databricks/dolly-v2-3b", "databricks/dolly-v2-7b", "databricks/dolly-v2-12b"] import_kwargs = { "device_map": "auto" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else 
None, @@ -51,15 +51,15 @@ class DollyV2(openllm.LLM): device = torch.device("cuda" if torch.cuda.is_available() else "cpu") def import_model( - self, pretrained: str, tag: bentoml.Tag, *model_args: t.Any, tokenizer_kwds: dict[str, t.Any], **attrs: t.Any + self, model_id: str, tag: bentoml.Tag, *model_args: t.Any, tokenizer_kwds: dict[str, t.Any], **attrs: t.Any ) -> bentoml.Model: trust_remote_code = attrs.pop("trust_remote_code", True) torch_dtype = attrs.pop("torch_dtype", torch.bfloat16) device_map = attrs.pop("device_map", "auto") - tokenizer = transformers.AutoTokenizer.from_pretrained(pretrained, **tokenizer_kwds) + tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, **tokenizer_kwds) pipeline = transformers.pipeline( - model=pretrained, + model=model_id, tokenizer=tokenizer, trust_remote_code=trust_remote_code, torch_dtype=torch_dtype, diff --git a/src/openllm/models/falcon/configuration_falcon.py b/src/openllm/models/falcon/configuration_falcon.py index bdd2e453..a2704e25 100644 --- a/src/openllm/models/falcon/configuration_falcon.py +++ b/src/openllm/models/falcon/configuration_falcon.py @@ -40,7 +40,7 @@ class FalconConfig( START_FALCON_COMMAND_DOCSTRING = """\ -Run a LLMServer for FalconLM model and pretrained. +Run a LLMServer for FalconLM model. \b > See more information about falcon at [tiiuae/falcon-7b](https://huggingface.co/tiiuae/falcon-7b) @@ -52,7 +52,11 @@ Currently, FalconLM only supports PyTorch. Make sure ``torch`` is available in y \b FalconLM Runner will use tiiuae/falcon-7b as the default model. 
To change any to any other FalconLM -saved pretrained, or a fine-tune FalconLM, provide ``OPENLLM_FALCON_PRETRAINED='tiiuae/falcon-7b-instruct'`` +saved pretrained, or a fine-tune FalconLM, provide ``OPENLLM_FALCON_MODEL_ID='tiiuae/falcon-7b-instruct'`` +or provide `--model-id` flag when running ``openllm start falcon``: + +\b +$ openllm start falcon --model-id tiiuae/falcon-7b-instruct """ DEFAULT_PROMPT_TEMPLATE = """{context} diff --git a/src/openllm/models/falcon/modeling_falcon.py b/src/openllm/models/falcon/modeling_falcon.py index d44a0f84..41535d0f 100644 --- a/src/openllm/models/falcon/modeling_falcon.py +++ b/src/openllm/models/falcon/modeling_falcon.py @@ -34,9 +34,9 @@ else: class Falcon(openllm.LLM): __openllm_internal__ = True - default_model = "tiiuae/falcon-7b" + default_id = "tiiuae/falcon-7b" - pretrained = ["tiiuae/falcon-7b", "tiiuae/falcon-40b", "tiiuae/falcon-7b-instruct", "tiiuae/falcon-40b-instruct"] + model_ids = ["tiiuae/falcon-7b", "tiiuae/falcon-40b", "tiiuae/falcon-7b-instruct", "tiiuae/falcon-40b-instruct"] import_kwargs = { "torch_dtype": torch.bfloat16, @@ -44,15 +44,15 @@ class Falcon(openllm.LLM): } def import_model( - self, pretrained: str, tag: bentoml.Tag, *model_args: t.Any, tokenizer_kwds: dict[str, t.Any], **attrs: t.Any + self, model_id: str, tag: bentoml.Tag, *model_args: t.Any, tokenizer_kwds: dict[str, t.Any], **attrs: t.Any ) -> bentoml.Model: trust_remote_code = attrs.pop("trust_remote_code", True) torch_dtype = attrs.pop("torch_dtype", torch.bfloat16) device_map = attrs.pop("device_map", "auto") - tokenizer = transformers.AutoTokenizer.from_pretrained(pretrained) + tokenizer = transformers.AutoTokenizer.from_pretrained(model_id) model = transformers.AutoModelForCausalLM.from_pretrained( - pretrained, + model_id, trust_remote_code=trust_remote_code, torch_dtype=torch_dtype, device_map=device_map, diff --git a/src/openllm/models/flan_t5/configuration_flan_t5.py b/src/openllm/models/flan_t5/configuration_flan_t5.py index 
9f0584e8..d67f972c 100644 --- a/src/openllm/models/flan_t5/configuration_flan_t5.py +++ b/src/openllm/models/flan_t5/configuration_flan_t5.py @@ -16,7 +16,7 @@ from __future__ import annotations import openllm START_FLAN_T5_COMMAND_DOCSTRING = """\ -Run a LLMServer for FLAN-T5 model and pretrained. +Run a LLMServer for FLAN-T5 model. \b > See more information about FLAN-T5 at [huggingface/transformers](https://huggingface.co/docs/transformers/model_doc/flan-t5) @@ -34,7 +34,11 @@ By default, this model will use the PyTorch model for inference. However, this m \b FLAN-T5 Runner will use google/flan-t5-large as the default model. To change any to any other FLAN-T5 -saved pretrained, or a fine-tune FLAN-T5, provide ``OPENLLM_FLAN_T5_PRETRAINED='google/flan-t5-xxl'`` +saved pretrained, or a fine-tune FLAN-T5, provide ``OPENLLM_FLAN_T5_MODEL_ID='google/flan-t5-xxl'`` +or provide `--model-id` flag when running ``openllm start flan-t5``: + +\b +$ openllm start flan-t5 --model-id google/flan-t5-xxl """ DEFAULT_PROMPT_TEMPLATE = """Answer the following question:\nQuestion: {instruction}\nAnswer:""" diff --git a/src/openllm/models/flan_t5/modeling_flan_t5.py b/src/openllm/models/flan_t5/modeling_flan_t5.py index e748076e..658436e0 100644 --- a/src/openllm/models/flan_t5/modeling_flan_t5.py +++ b/src/openllm/models/flan_t5/modeling_flan_t5.py @@ -29,9 +29,9 @@ else: class FlanT5(openllm.LLM): __openllm_internal__ = True - default_model = "google/flan-t5-large" + default_id = "google/flan-t5-large" - pretrained = [ + model_ids = [ "google/flan-t5-small", "google/flan-t5-base", "google/flan-t5-large", diff --git a/src/openllm/models/flan_t5/modeling_flax_flan_t5.py b/src/openllm/models/flan_t5/modeling_flax_flan_t5.py index f7827c7b..5c08db5e 100644 --- a/src/openllm/models/flan_t5/modeling_flax_flan_t5.py +++ b/src/openllm/models/flan_t5/modeling_flax_flan_t5.py @@ -24,9 +24,9 @@ from .configuration_flan_t5 import DEFAULT_PROMPT_TEMPLATE class FlaxFlanT5(openllm.LLM): 
__openllm_internal__ = True - default_model: str = "google/flan-t5-large" + default_id: str = "google/flan-t5-large" - pretrained = [ + model_ids = [ "google/flan-t5-small", "google/flan-t5-base", "google/flan-t5-large", diff --git a/src/openllm/models/flan_t5/modeling_tf_flan_t5.py b/src/openllm/models/flan_t5/modeling_tf_flan_t5.py index 6ba86448..e950919c 100644 --- a/src/openllm/models/flan_t5/modeling_tf_flan_t5.py +++ b/src/openllm/models/flan_t5/modeling_tf_flan_t5.py @@ -24,9 +24,9 @@ from .configuration_flan_t5 import DEFAULT_PROMPT_TEMPLATE class TFFlanT5(openllm.LLM): __openllm_internal__ = True - default_model: str = "google/flan-t5-large" + default_id: str = "google/flan-t5-large" - pretrained = [ + model_ids = [ "google/flan-t5-small", "google/flan-t5-base", "google/flan-t5-large", diff --git a/src/openllm/models/stablelm/configuration_stablelm.py b/src/openllm/models/stablelm/configuration_stablelm.py index 7552578e..4dd777e1 100644 --- a/src/openllm/models/stablelm/configuration_stablelm.py +++ b/src/openllm/models/stablelm/configuration_stablelm.py @@ -38,7 +38,7 @@ class StableLMConfig(openllm.LLMConfig, name_type="lowercase", url="https://gith START_STABLELM_COMMAND_DOCSTRING = """\ -Run a LLMServer for StableLM model and pretrained. +Run a LLMServer for StableLM model. \b > See more information about StableLM at [stabilityai/stablelm-base-alpha-3b](https://huggingface.co/stabilityai/stablelm-base-alpha-3b) @@ -50,7 +50,11 @@ Currently, StableLM only supports PyTorch. Make sure ``torch`` is available in y \b StableLM Runner will use stabilityai/stablelm-base-alpha-3b as the default model. 
To change any to any other StableLM -saved pretrained, or a fine-tune StableLM, provide ``OPENLLM_STABLELM_PRETRAINED='stabilityai/stablelm-tuned-alpha-3b'`` +saved pretrained, or a fine-tune StableLM, provide ``OPENLLM_STABLELM_MODEL_ID='stabilityai/stablelm-tuned-alpha-3b'`` +or provide `--model-id` flag when running ``openllm start stablelm``: + +\b +$ openllm start stablelm --model-id 'stabilityai/stablelm-tuned-alpha-3b' """ SYSTEM_PROMPT = """<|SYSTEM|># StableLM Tuned (Alpha version) diff --git a/src/openllm/models/stablelm/modeling_stablelm.py b/src/openllm/models/stablelm/modeling_stablelm.py index e0e574e8..12817e2c 100644 --- a/src/openllm/models/stablelm/modeling_stablelm.py +++ b/src/openllm/models/stablelm/modeling_stablelm.py @@ -43,9 +43,9 @@ class StableLM(openllm.LLM): __openllm_internal__ = True load_in_mha = True - default_model = "stabilityai/stablelm-tuned-alpha-3b" + default_id = "stabilityai/stablelm-tuned-alpha-3b" - pretrained = [ + model_ids = [ "stabilityai/stablelm-tuned-alpha-3b", "stabilityai/stablelm-tuned-alpha-7b", "stabilityai/stablelm-base-alpha-3b", @@ -70,7 +70,7 @@ class StableLM(openllm.LLM): use_default_prompt_template: bool = True, **attrs: t.Any, ) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: - if "tuned" in self._pretrained and use_default_prompt_template: + if "tuned" in self._model_id and use_default_prompt_template: prompt_variables = { k: v for k, v in attrs.items() diff --git a/src/openllm/models/starcoder/configuration_starcoder.py b/src/openllm/models/starcoder/configuration_starcoder.py index cfc874ad..2dc9b307 100644 --- a/src/openllm/models/starcoder/configuration_starcoder.py +++ b/src/openllm/models/starcoder/configuration_starcoder.py @@ -42,7 +42,7 @@ class StarCoderConfig( START_STARCODER_COMMAND_DOCSTRING = """\ -Run a LLMServer for StarCoder model and pretrained. +Run a LLMServer for StarCoder model. 
\b > See more information about StarCoder at [bigcode/starcoder](https://huggingface.co/bigcode/starcoder) @@ -54,7 +54,11 @@ Currently, StarCoder only supports PyTorch. Make sure ``torch`` is available in \b StarCoder Runner will use bigcode/starcoder as the default model. To change any to any other StarCoder -saved pretrained, or a fine-tune StarCoder, provide ``OPENLLM_STARCODER_PRETRAINED='bigcode/starcoder'`` +saved pretrained, or a fine-tune StarCoder, provide ``OPENLLM_STARCODER_MODEL_ID='bigcode/starcoder'`` +or provide `--model-id` flag when running ``openllm start starcoder``: + +\b +$ openllm start starcoder --model-id 'bigcode/starcoder' """ DEFAULT_PROMPT_TEMPLATE = """{instruction}""" diff --git a/src/openllm/models/starcoder/modeling_starcoder.py b/src/openllm/models/starcoder/modeling_starcoder.py index c84ddf8f..00f4c639 100644 --- a/src/openllm/models/starcoder/modeling_starcoder.py +++ b/src/openllm/models/starcoder/modeling_starcoder.py @@ -40,21 +40,21 @@ FIM_INDICATOR = "" class StarCoder(openllm.LLM): __openllm_internal__ = True - default_model = "bigcode/starcoder" + default_id = "bigcode/starcoder" - pretrained = ["bigcode/starcoder", "bigcode/starcoderbase"] + model_ids = ["bigcode/starcoder", "bigcode/starcoderbase"] device = torch.device("cuda") def import_model( self, - pretrained: str, + model_id: str, tag: bentoml.Tag, *model_args: t.Any, tokenizer_kwds: dict[str, t.Any], **attrs: t.Any, ) -> bentoml.Model: - tokenizer = transformers.AutoTokenizer.from_pretrained(pretrained, **tokenizer_kwds) + tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, **tokenizer_kwds) tokenizer.add_special_tokens( { "additional_special_tokens": [EOD, FIM_PREFIX, FIM_MIDDLE, FIM_SUFFIX, FIM_PAD], @@ -62,7 +62,7 @@ class StarCoder(openllm.LLM): } ) - model = transformers.AutoModelForCausalLM.from_pretrained(pretrained, **attrs) + model = transformers.AutoModelForCausalLM.from_pretrained(model_id, **attrs) try: return 
bentoml.transformers.save_model(tag, model, custom_objects={"tokenizer": tokenizer}) diff --git a/src/openllm/utils/__init__.py b/src/openllm/utils/__init__.py index 3d883698..cf042b4a 100644 --- a/src/openllm/utils/__init__.py +++ b/src/openllm/utils/__init__.py @@ -91,8 +91,8 @@ class ModelEnv: return f"OPENLLM_{self.model_name.upper()}_CONFIG" @property - def pretrained(self) -> str: - return f"OPENLLM_{self.model_name.upper()}_PRETRAINED" + def model_id(self) -> str: + return f"OPENLLM_{self.model_name.upper()}_MODEL_ID" @property def bettertransformer(self) -> str: