mirror of
https://github.com/bentoml/OpenLLM.git
synced 2026-04-23 16:39:48 -04:00
refactor: pretrained => model_id
I think model_id makes more sense than calling it pretrained Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
This commit is contained in:
@@ -92,14 +92,14 @@ class TaskType(enum.Enum, metaclass=TypeMeta):
|
||||
|
||||
|
||||
def import_model(
|
||||
model_name: str,
|
||||
model_id: str,
|
||||
tag: bentoml.Tag,
|
||||
_model_framework: str,
|
||||
*model_args: t.Any,
|
||||
tokenizer_kwds: dict[str, t.Any],
|
||||
**attrs: t.Any,
|
||||
):
|
||||
"""Auto detect model type from given model_name and import it to bentoml's model store.
|
||||
"""Auto detect model type from given model_id and import it to bentoml's model store.
|
||||
|
||||
For all kwargs, it will be parsed into `transformers.AutoConfig.from_pretrained` first,
|
||||
returning all of the unused kwargs.
|
||||
@@ -111,7 +111,7 @@ def import_model(
|
||||
Refer to Transformers documentation for more information about kwargs.
|
||||
|
||||
Args:
|
||||
model_name: Model name to be imported. use `openllm models` to see available entries
|
||||
model_id: Model id to be imported. See `openllm models` for all supported models.
|
||||
tag: Tag to be used for the model. This is usually generated for you.
|
||||
model_args: Args to be passed into AutoModelForSeq2SeqLM or AutoModelForCausalLM (+ TF, Flax variants).
|
||||
**attrs: Kwargs to be passed into AutoModelForSeq2SeqLM or AutoModelForCausalLM (+ TF, Flax variants).
|
||||
@@ -139,7 +139,7 @@ def import_model(
|
||||
config, attrs = t.cast(
|
||||
"tuple[transformers.PretrainedConfig, dict[str, t.Any]]",
|
||||
transformers.AutoConfig.from_pretrained(
|
||||
model_name, return_unused_kwargs=True, trust_remote_code=trust_remote_code, **hub_attrs, **copied_attrs
|
||||
model_id, return_unused_kwargs=True, trust_remote_code=trust_remote_code, **hub_attrs, **copied_attrs
|
||||
),
|
||||
)
|
||||
|
||||
@@ -156,13 +156,13 @@ def import_model(
|
||||
getattr(
|
||||
transformers, FRAMEWORK_TO_AUTOCLASS_MAPPING[_model_framework][TaskType[task_type].value - 1]
|
||||
).from_pretrained(
|
||||
model_name, *model_args, config=config, trust_remote_code=trust_remote_code, **hub_attrs, **attrs
|
||||
model_id, *model_args, config=config, trust_remote_code=trust_remote_code, **hub_attrs, **attrs
|
||||
),
|
||||
custom_objects={
|
||||
"tokenizer": t.cast(
|
||||
"LLMTokenizer",
|
||||
transformers.AutoTokenizer.from_pretrained(
|
||||
model_name, config=config, trust_remote_code=trust_remote_code, **hub_attrs, **tokenizer_kwds
|
||||
model_id, config=config, trust_remote_code=trust_remote_code, **hub_attrs, **tokenizer_kwds
|
||||
),
|
||||
)
|
||||
},
|
||||
@@ -179,7 +179,7 @@ def import_model(
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
|
||||
_required_namespace = {"default_model", "pretrained"}
|
||||
_required_namespace = {"default_id", "model_ids"}
|
||||
|
||||
_reserved_namespace = _required_namespace | {
|
||||
"config_class",
|
||||
@@ -192,11 +192,11 @@ _reserved_namespace = _required_namespace | {
|
||||
class LLMInterface(ABC):
|
||||
"""This defines the loose contract for all openllm.LLM implementations."""
|
||||
|
||||
default_model: str
|
||||
"""Return the default model to use when using 'openllm start <model_name>'.
|
||||
This could be one of the keys in 'self.pretrained' or custom users model."""
|
||||
default_id: str
|
||||
"""Return the default model to use when using 'openllm start <model_id>'.
|
||||
This could be one of the keys in 'self.model_ids' or custom users model."""
|
||||
|
||||
pretrained: list[str]
|
||||
model_ids: list[str]
|
||||
"""A list of supported pretrained models tag for this given runnable.
|
||||
|
||||
For example:
|
||||
@@ -253,7 +253,7 @@ class LLMInterface(ABC):
|
||||
pass
|
||||
|
||||
def import_model(
|
||||
self, pretrained: str, tag: bentoml.Tag, *args: t.Any, tokenizer_kwds: dict[str, t.Any], **attrs: t.Any
|
||||
self, model_id: str, tag: bentoml.Tag, *args: t.Any, tokenizer_kwds: dict[str, t.Any], **attrs: t.Any
|
||||
) -> bentoml.Model:
|
||||
"""This function can be implemented if default import_model doesn't satisfy your needs."""
|
||||
raise NotImplementedError
|
||||
@@ -275,6 +275,8 @@ class LLMMetaclass(ABCMeta):
|
||||
namespace["__annotations__"] = annotations_dict
|
||||
|
||||
# NOTE: check for required attributes
|
||||
if "__openllm_internal__" not in namespace:
|
||||
_required_namespace.add("config_class")
|
||||
for k in _required_namespace:
|
||||
if k not in namespace:
|
||||
raise RuntimeError(f"Missing required key '{k}'. Make sure to define it within the LLM subclass.")
|
||||
@@ -378,13 +380,13 @@ class LLM(LLMInterface, metaclass=LLMMetaclass):
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(
|
||||
cls, pretrained: str | None = None, llm_config: openllm.LLMConfig | None = None, *args: t.Any, **attrs: t.Any
|
||||
cls, model_id: str | None = None, llm_config: openllm.LLMConfig | None = None, *args: t.Any, **attrs: t.Any
|
||||
) -> LLM:
|
||||
return cls(pretrained=pretrained, llm_config=llm_config, *args, **attrs)
|
||||
return cls(model_id=model_id, llm_config=llm_config, *args, **attrs)
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
pretrained: str | None = None,
|
||||
model_id: str | None = None,
|
||||
llm_config: openllm.LLMConfig | None = None,
|
||||
*args: t.Any,
|
||||
**attrs: t.Any,
|
||||
@@ -408,7 +410,7 @@ class LLM(LLMInterface, metaclass=LLMMetaclass):
|
||||
```python
|
||||
def import_model(
|
||||
self,
|
||||
pretrained: str,
|
||||
model_id: str,
|
||||
tag: bentoml.Tag,
|
||||
*args: t.Any,
|
||||
tokenizer_kwds: dict[str, t.Any],
|
||||
@@ -417,11 +419,11 @@ class LLM(LLMInterface, metaclass=LLMMetaclass):
|
||||
return bentoml.transformers.save_model(
|
||||
tag,
|
||||
transformers.AutoModelForCausalLM.from_pretrained(
|
||||
pretrained, device_map="auto", torch_dtype=torch.bfloat16, **attrs
|
||||
model_id, device_map="auto", torch_dtype=torch.bfloat16, **attrs
|
||||
),
|
||||
custom_objects={
|
||||
"tokenizer": transformers.AutoTokenizer.from_pretrained(
|
||||
pretrained, padding_size="left", **tokenizer_kwds
|
||||
model_id, padding_size="left", **tokenizer_kwds
|
||||
)
|
||||
},
|
||||
)
|
||||
@@ -440,14 +442,14 @@ class LLM(LLMInterface, metaclass=LLMMetaclass):
|
||||
Note: If you implement your own `import_model`, then `import_kwargs` will be the
|
||||
default kwargs for every load. You can still override those via ``openllm.Runner``.
|
||||
|
||||
Note that this tag will be generated based on `self.default_model` or the given `pretrained` kwds.
|
||||
Note that this tag will be generated based on `self.default_id` or the given `model_id` kwds.
|
||||
passed from the __init__ constructor.
|
||||
|
||||
``llm_post_init`` can also be implemented if you need to do any
|
||||
additional initialization after everything is setup.
|
||||
|
||||
Args:
|
||||
pretrained: The pretrained model to use. Defaults to None. It will use 'self.default_model' if None.
|
||||
model_id: The pretrained model to use. Defaults to None. If None, 'self.default_id' will be used.
|
||||
llm_config: The config to use for this LLM. Defaults to None. If not passed, we will use 'self.config_class'
|
||||
to construct default configuration.
|
||||
*args: The args to be passed to the model.
|
||||
@@ -462,14 +464,14 @@ class LLM(LLMInterface, metaclass=LLMMetaclass):
|
||||
# The rests of the kwargs that is not used by the config class should be stored into __openllm_extras__.
|
||||
attrs = self.config.__openllm_extras__
|
||||
|
||||
if pretrained is None:
|
||||
pretrained = os.environ.get(self.config.__openllm_env__.pretrained, None)
|
||||
if not pretrained:
|
||||
assert self.default_model, "A default model is required for any LLM."
|
||||
pretrained = self.default_model
|
||||
if model_id is None:
|
||||
model_id = os.environ.get(self.config.__openllm_env__.model_id, None)
|
||||
if not model_id:
|
||||
assert self.default_id, "A default model is required for any LLM."
|
||||
model_id = self.default_id
|
||||
|
||||
# NOTE: This is the actual given path or pretrained weight for this LLM.
|
||||
self._pretrained = pretrained
|
||||
self._model_id = model_id
|
||||
|
||||
# NOTE: Save the args and kwargs for latter load
|
||||
self._llm_args = args
|
||||
@@ -491,19 +493,19 @@ class LLM(LLMInterface, metaclass=LLMMetaclass):
|
||||
# NOTE: The section below defines a loose contract with langchain's LLM interface.
|
||||
@property
|
||||
def llm_type(self) -> str:
|
||||
return convert_transformers_model_name(self._pretrained)
|
||||
return convert_transformers_model_name(self._model_id)
|
||||
|
||||
@property
|
||||
def identifying_params(self) -> dict[str, t.Any]:
|
||||
return {
|
||||
"configuration": self.config.model_dump_json().decode(),
|
||||
"pretrained": orjson.dumps(self.pretrained).decode(),
|
||||
"model_ids": orjson.dumps(self.model_ids).decode(),
|
||||
}
|
||||
|
||||
@t.overload
|
||||
def make_tag(
|
||||
self,
|
||||
model_name_or_path: str | None = None,
|
||||
model_id: str | None = None,
|
||||
return_unused_kwargs: t.Literal[False] = ...,
|
||||
trust_remote_code: bool = ...,
|
||||
**attrs: t.Any,
|
||||
@@ -513,7 +515,7 @@ class LLM(LLMInterface, metaclass=LLMMetaclass):
|
||||
@t.overload
|
||||
def make_tag(
|
||||
self,
|
||||
model_name_or_path: str | None = None,
|
||||
model_id: str | None = None,
|
||||
return_unused_kwargs: t.Literal[True] = ...,
|
||||
trust_remote_code: bool = ...,
|
||||
**attrs: t.Any,
|
||||
@@ -522,7 +524,7 @@ class LLM(LLMInterface, metaclass=LLMMetaclass):
|
||||
|
||||
def make_tag(
|
||||
self,
|
||||
model_name_or_path: str | None = None,
|
||||
model_id: str | None = None,
|
||||
return_unused_kwargs: bool = False,
|
||||
trust_remote_code: bool = False,
|
||||
**attrs: t.Any,
|
||||
@@ -543,8 +545,8 @@ class LLM(LLMInterface, metaclass=LLMMetaclass):
|
||||
Returns:
|
||||
A tuple of ``bentoml.Tag`` and a dict of unused kwargs.
|
||||
"""
|
||||
if model_name_or_path is None:
|
||||
model_name_or_path = self._pretrained
|
||||
if model_id is None:
|
||||
model_id = self._model_id
|
||||
|
||||
if "return_unused_kwargs" in attrs:
|
||||
logger.debug("Ignoring 'return_unused_kwargs' in 'generate_tag_from_model_name'.")
|
||||
@@ -553,12 +555,12 @@ class LLM(LLMInterface, metaclass=LLMMetaclass):
|
||||
config, attrs = t.cast(
|
||||
"tuple[transformers.PretrainedConfig, dict[str, t.Any]]",
|
||||
transformers.AutoConfig.from_pretrained(
|
||||
model_name_or_path, return_unused_kwargs=True, trust_remote_code=trust_remote_code, **attrs
|
||||
model_id, return_unused_kwargs=True, trust_remote_code=trust_remote_code, **attrs
|
||||
),
|
||||
)
|
||||
name = convert_transformers_model_name(model_name_or_path)
|
||||
name = convert_transformers_model_name(model_id)
|
||||
|
||||
if os.path.exists(os.path.dirname(model_name_or_path)):
|
||||
if os.path.exists(os.path.dirname(model_id)):
|
||||
# If the model_name_or_path is a path, we assume it's a local path,
|
||||
# then users must pass a version for this.
|
||||
model_version = attrs.pop("openllm_model_version", None)
|
||||
@@ -590,7 +592,7 @@ class LLM(LLMInterface, metaclass=LLMMetaclass):
|
||||
"Given %s from '%s' doesn't contain a commit hash. We will generate"
|
||||
" the tag without specific version.",
|
||||
t.cast("type[transformers.PretrainedConfig]", config.__class__),
|
||||
model_name_or_path,
|
||||
model_id,
|
||||
)
|
||||
tag = bentoml.Tag.from_taglike(f"{self.__llm_implementation__}-{name}:{model_version}")
|
||||
|
||||
@@ -621,7 +623,7 @@ class LLM(LLMInterface, metaclass=LLMMetaclass):
|
||||
}
|
||||
|
||||
return self.import_model(
|
||||
self._pretrained,
|
||||
self._model_id,
|
||||
tag,
|
||||
*self._llm_args,
|
||||
tokenizer_kwds=tokenizer_kwds,
|
||||
|
||||
@@ -388,8 +388,8 @@ def start_model_command(
|
||||
docstring = f"""\
|
||||
{ModelEnv.start_docstring}
|
||||
\b
|
||||
The available pretrained models to use with '{model_name}' are: {for_doc.pretrained} [default: {for_doc.default_model}]
|
||||
Tip: One can pass one of the aforementioned to '--pretrained' to use other pretrained weights.
|
||||
Available model_id(s) to use with '{model_name}' are: {for_doc.model_ids} [default: {for_doc.default_id}]
|
||||
Tip: One can pass one of the aforementioned to '--model-id' to use other pretrained weights.
|
||||
"""
|
||||
command_attrs: dict[str, t.Any] = {
|
||||
"name": ModelEnv.model_name,
|
||||
@@ -430,9 +430,7 @@ Tip: One can pass one of the aforementioned to '--pretrained' to use other pretr
|
||||
@llm_config.to_click_options
|
||||
@parse_serve_args(_serve_grpc)
|
||||
@click.option("--server-timeout", type=int, default=3600, help="Server timeout in seconds")
|
||||
@click.option(
|
||||
"--pretrained", type=click.STRING, default=None, help="Optional pretrained name or path to fine-tune weight."
|
||||
)
|
||||
@model_id_option
|
||||
@click.option(
|
||||
"--device",
|
||||
type=tuple,
|
||||
@@ -444,18 +442,18 @@ Tip: One can pass one of the aforementioned to '--pretrained' to use other pretr
|
||||
)
|
||||
def model_start(
|
||||
server_timeout: int,
|
||||
pretrained: str | None,
|
||||
model_id: str | None,
|
||||
device: tuple[str, ...] | None,
|
||||
**attrs: t.Any,
|
||||
) -> openllm.LLMConfig:
|
||||
config, server_attrs = llm_config.model_validate_click(**attrs)
|
||||
|
||||
if ModelEnv.get_framework_env() == "flax":
|
||||
llm = openllm.AutoFlaxLLM.for_model(model_name, pretrained=pretrained, llm_config=config)
|
||||
llm = openllm.AutoFlaxLLM.for_model(model_name, model_id=model_id, llm_config=config)
|
||||
elif ModelEnv.get_framework_env() == "tf":
|
||||
llm = openllm.AutoTFLLM.for_model(model_name, pretrained=pretrained, llm_config=config)
|
||||
llm = openllm.AutoTFLLM.for_model(model_name, model_id=model_id, llm_config=config)
|
||||
else:
|
||||
llm = openllm.AutoLLM.for_model(model_name, pretrained=pretrained, llm_config=config)
|
||||
llm = openllm.AutoLLM.for_model(model_name, model_id=model_id, llm_config=config)
|
||||
|
||||
# NOTE: We need to initialize llm here first to check if the model is already downloaded to
|
||||
# avoid deadlock before the subprocess forking.
|
||||
@@ -580,6 +578,12 @@ output_option = click.option(
|
||||
default="pretty",
|
||||
help="Showing output type. Default to 'pretty'",
|
||||
)
|
||||
model_id_option = click.option(
|
||||
"--model-id",
|
||||
type=click.STRING,
|
||||
default=None,
|
||||
help="Optional model_id name or path for (fine-tune) weight.",
|
||||
)
|
||||
|
||||
|
||||
def cli_factory() -> click.Group:
|
||||
@@ -626,16 +630,15 @@ def cli_factory() -> click.Group:
|
||||
@click.argument(
|
||||
"model_name", type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING.keys()])
|
||||
)
|
||||
@click.option("--pretrained", default=None, help="Given pretrained model name for the given model name [Optional].")
|
||||
@click.option("--overwrite", is_flag=True, help="Overwrite existing Bento for given LLM if it already exists.")
|
||||
@model_id_option
|
||||
@output_option
|
||||
def build(model_name: str, pretrained: str | None, overwrite: bool, output: OutputLiteral):
|
||||
@click.option("--overwrite", is_flag=True, help="Overwrite existing Bento for given LLM if it already exists.")
|
||||
def build(model_name: str, model_id: str | None, overwrite: bool, output: OutputLiteral):
|
||||
"""Package a given models into a Bento.
|
||||
|
||||
$ openllm build flan-t5
|
||||
|
||||
\b
|
||||
|
||||
NOTE: To run a container built from this Bento with GPU support, make sure
|
||||
to have https://github.com/NVIDIA/nvidia-container-toolkit install locally.
|
||||
"""
|
||||
@@ -645,7 +648,7 @@ def cli_factory() -> click.Group:
|
||||
bento, _previously_built = openllm.build(
|
||||
model_name,
|
||||
__cli__=True,
|
||||
pretrained=pretrained,
|
||||
model_id=model_id,
|
||||
_overwrite_existing_bento=overwrite,
|
||||
)
|
||||
|
||||
@@ -684,13 +687,13 @@ def cli_factory() -> click.Group:
|
||||
else:
|
||||
failed_initialized: list[tuple[str, Exception]] = []
|
||||
|
||||
json_data: dict[str, dict[t.Literal["pretrained", "description"], t.Any]] = {}
|
||||
json_data: dict[str, dict[t.Literal["model_id", "description"], t.Any]] = {}
|
||||
|
||||
for m in models:
|
||||
try:
|
||||
model = openllm.AutoLLM.for_model(m)
|
||||
docs = inspect.cleandoc(model.config.__doc__ or "(No description)")
|
||||
json_data[m] = {"pretrained": model.pretrained, "description": docs}
|
||||
json_data[m] = {"model_id": model.model_ids, "description": docs}
|
||||
except Exception as err:
|
||||
failed_initialized.append((m, err))
|
||||
|
||||
@@ -701,7 +704,7 @@ def cli_factory() -> click.Group:
|
||||
|
||||
data: list[str | tuple[str, str, list[str]]] = []
|
||||
for m, v in json_data.items():
|
||||
data.extend([(m, v["description"], v["pretrained"])])
|
||||
data.extend([(m, v["description"], v["model_id"])])
|
||||
column_widths = [int(COLUMNS / 6), int(COLUMNS / 3 * 2), int(COLUMNS / 6)]
|
||||
|
||||
if len(data) == 0 and len(failed_initialized) > 0:
|
||||
@@ -714,7 +717,7 @@ def cli_factory() -> click.Group:
|
||||
table = tabulate.tabulate(
|
||||
data,
|
||||
tablefmt="fancy_grid",
|
||||
headers=["LLM", "Description", "Pretrained"],
|
||||
headers=["LLM", "Description", "Model Ids"],
|
||||
maxcolwidths=column_widths,
|
||||
)
|
||||
|
||||
@@ -739,11 +742,9 @@ def cli_factory() -> click.Group:
|
||||
@click.argument(
|
||||
"model_name", type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING.keys()])
|
||||
)
|
||||
@click.option(
|
||||
"--pretrained", type=click.STRING, default=None, help="Optional pretrained name or path to fine-tune weight."
|
||||
)
|
||||
@model_id_option
|
||||
@output_option
|
||||
def download_models(model_name: str, pretrained: str | None, output: OutputLiteral):
|
||||
def download_models(model_name: str, model_id: str | None, output: OutputLiteral):
|
||||
"""Setup LLM interactively.
|
||||
|
||||
Note: This is useful for development and setup for fine-tune.
|
||||
@@ -751,11 +752,11 @@ def cli_factory() -> click.Group:
|
||||
config = openllm.AutoConfig.for_model(model_name)
|
||||
env = config.__openllm_env__.get_framework_env()
|
||||
if env == "flax":
|
||||
model = openllm.AutoFlaxLLM.for_model(model_name, pretrained=pretrained, llm_config=config)
|
||||
model = openllm.AutoFlaxLLM.for_model(model_name, model_id=model_id, llm_config=config)
|
||||
elif env == "tf":
|
||||
model = openllm.AutoTFLLM.for_model(model_name, pretrained=pretrained, llm_config=config)
|
||||
model = openllm.AutoTFLLM.for_model(model_name, model_id=model_id, llm_config=config)
|
||||
else:
|
||||
model = openllm.AutoLLM.for_model(model_name, pretrained=pretrained, llm_config=config)
|
||||
model = openllm.AutoLLM.for_model(model_name, model_id=model_id, llm_config=config)
|
||||
|
||||
tag = model.make_tag(trust_remote_code=config.__openllm_trust_remote_code__)
|
||||
|
||||
@@ -829,7 +830,7 @@ def cli_factory() -> click.Group:
|
||||
)
|
||||
@output_option
|
||||
@click.argument("query", type=click.STRING)
|
||||
def query(
|
||||
def query_(
|
||||
query: str,
|
||||
endpoint: str,
|
||||
timeout: int,
|
||||
@@ -838,7 +839,7 @@ def cli_factory() -> click.Group:
|
||||
):
|
||||
"""Ask a LLM interactively, from a terminal.
|
||||
|
||||
$ openllm query --endpoint http://12.323.2.1 "What is the meaning of life?"
|
||||
$ openllm query --endpoint http://12.323.2.1:3000 "What is the meaning of life?"
|
||||
"""
|
||||
if server_type == "grpc":
|
||||
endpoint = re.sub(r"http://", "", endpoint)
|
||||
@@ -870,7 +871,7 @@ def cli_factory() -> click.Group:
|
||||
_echo(res["responses"], fg="white")
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
assert download_models and build and models and start and start_grpc and query and prune
|
||||
assert download_models and build and models and start and start_grpc and query_ and prune
|
||||
|
||||
if psutil.WINDOWS:
|
||||
sys.stdout.reconfigure(encoding="utf-8") # type: ignore
|
||||
|
||||
@@ -47,7 +47,7 @@ class _BaseAutoLLMClass:
|
||||
def for_model(
|
||||
cls,
|
||||
model_name: str,
|
||||
pretrained: str | None = None,
|
||||
model_id: str | None = None,
|
||||
return_runner_kwargs: t.Literal[False] = ...,
|
||||
llm_config: openllm.LLMConfig | None = ...,
|
||||
**attrs: t.Any,
|
||||
@@ -59,7 +59,7 @@ class _BaseAutoLLMClass:
|
||||
def for_model(
|
||||
cls,
|
||||
model_name: str,
|
||||
pretrained: str | None = None,
|
||||
model_id: str | None = None,
|
||||
return_runner_kwargs: t.Literal[True] = ...,
|
||||
llm_config: openllm.LLMConfig | None = ...,
|
||||
**attrs: t.Any,
|
||||
@@ -70,11 +70,18 @@ class _BaseAutoLLMClass:
|
||||
def for_model(
|
||||
cls,
|
||||
model_name: str,
|
||||
pretrained: str | None = None,
|
||||
model_id: str | None = None,
|
||||
return_runner_kwargs: bool = False,
|
||||
llm_config: openllm.LLMConfig | None = ...,
|
||||
**attrs: t.Any,
|
||||
) -> openllm.LLM | tuple[openllm.LLM, dict[str, t.Any]]:
|
||||
"""The lower level API for creating a LLM instance.
|
||||
|
||||
```python
|
||||
>>> import openllm
|
||||
>>> llm = openllm.AutoLLM.for_model("flan-t5")
|
||||
```
|
||||
"""
|
||||
runner_kwargs_name = [
|
||||
"models",
|
||||
"max_batch_size",
|
||||
@@ -88,7 +95,7 @@ class _BaseAutoLLMClass:
|
||||
# The rest of kwargs is now passed to config
|
||||
llm_config = AutoConfig.for_model(model_name, **attrs)
|
||||
if type(llm_config) in cls._model_mapping.keys():
|
||||
llm = cls._model_mapping[type(llm_config)].from_pretrained(pretrained, llm_config=llm_config, **attrs)
|
||||
llm = cls._model_mapping[type(llm_config)].from_pretrained(model_id, llm_config=llm_config, **attrs)
|
||||
if not return_runner_kwargs:
|
||||
return llm
|
||||
return llm, to_runner_attrs
|
||||
@@ -98,19 +105,19 @@ class _BaseAutoLLMClass:
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def create_runner(cls, model_name: str, pretrained: str | None = None, **attrs: t.Any) -> bentoml.Runner:
|
||||
def create_runner(cls, model_name: str, model_id: str | None = None, **attrs: t.Any) -> bentoml.Runner:
|
||||
"""
|
||||
Create a LLM Runner for the given model name.
|
||||
|
||||
Args:
|
||||
model_name: The model name to instantiate.
|
||||
pretrained: The pretrained model name to instantiate.
|
||||
model_id: The pretrained model name to instantiate.
|
||||
**attrs: Additional keyword arguments passed along to the specific configuration class.
|
||||
|
||||
Returns:
|
||||
A LLM instance.
|
||||
"""
|
||||
llm, runner_attrs = cls.for_model(model_name, pretrained, return_runner_kwargs=True, **attrs)
|
||||
llm, runner_attrs = cls.for_model(model_name, model_id, return_runner_kwargs=True, **attrs)
|
||||
return llm.to_runner(**runner_attrs)
|
||||
|
||||
@classmethod
|
||||
|
||||
@@ -55,7 +55,7 @@ class ChatGLMConfig(
|
||||
|
||||
|
||||
START_CHATGLM_COMMAND_DOCSTRING = """\
|
||||
Run a LLMServer for ChatGLM model and pretrained.
|
||||
Run a LLMServer for ChatGLM model.
|
||||
|
||||
\b
|
||||
> See more information about ChatGLM at [THUDM/ChatGLM-6b](https://huggingface.co/thudm/chatglm-6b)
|
||||
@@ -67,7 +67,11 @@ Currently, ChatGLM only supports PyTorch. Make sure ``torch`` is available in yo
|
||||
|
||||
\b
|
||||
ChatGLM Runner will use THUDM/ChatGLM-6b as the default model. To change any to any other ChatGLM
|
||||
saved pretrained, or a fine-tune ChatGLM, provide ``OPENLLM_CHATGLM_PRETRAINED='thudm/chatglm-6b-int8'``
|
||||
saved pretrained, or a fine-tune ChatGLM, provide ``OPENLLM_CHATGLM_MODEL_ID='thudm/chatglm-6b-int8'``
|
||||
or provide `--model-id` flag when running ``openllm start chatglm``:
|
||||
|
||||
\b
|
||||
$ openllm start chatglm --model-id='thudm/chatglm-6b-int8'
|
||||
"""
|
||||
|
||||
DEFAULT_PROMPT_TEMPLATE = """{instruction}"""
|
||||
|
||||
@@ -41,22 +41,27 @@ class InvalidScoreLogitsProcessor(LogitsProcessor):
|
||||
class ChatGLM(openllm.LLM):
|
||||
__openllm_internal__ = True
|
||||
|
||||
default_model = "thudm/chatglm-6b-int4"
|
||||
default_id = "thudm/chatglm-6b-int4"
|
||||
|
||||
pretrained = ["thudm/chatglm-6b", "thudm/chatglm-6b-int8", "thudm/chatglm-6b-int4"]
|
||||
model_ids = ["thudm/chatglm-6b", "thudm/chatglm-6b-int8", "thudm/chatglm-6b-int4"]
|
||||
|
||||
device = torch.device("cuda")
|
||||
|
||||
def import_model(
|
||||
self, pretrained: str, tag: bentoml.Tag, *model_args: t.Any, tokenizer_kwds: dict[str, t.Any], **attrs: t.Any
|
||||
self,
|
||||
model_id: str,
|
||||
tag: bentoml.Tag,
|
||||
*model_args: t.Any,
|
||||
tokenizer_kwds: dict[str, t.Any],
|
||||
**attrs: t.Any,
|
||||
) -> bentoml.Model:
|
||||
trust_remote_code = attrs.pop("trust_remote_code", True)
|
||||
return bentoml.transformers.save_model(
|
||||
tag,
|
||||
transformers.AutoModel.from_pretrained(pretrained, trust_remote_code=trust_remote_code),
|
||||
transformers.AutoModel.from_pretrained(model_id, trust_remote_code=trust_remote_code),
|
||||
custom_objects={
|
||||
"tokenizer": transformers.AutoTokenizer.from_pretrained(
|
||||
pretrained, trust_remote_code=trust_remote_code, **tokenizer_kwds
|
||||
model_id, trust_remote_code=trust_remote_code, **tokenizer_kwds
|
||||
)
|
||||
},
|
||||
)
|
||||
|
||||
@@ -51,7 +51,7 @@ class DollyV2Config(
|
||||
|
||||
|
||||
START_DOLLY_V2_COMMAND_DOCSTRING = """\
|
||||
Run a LLMServer for dolly-v2 model and pretrained.
|
||||
Run a LLMServer for dolly-v2 model.
|
||||
|
||||
\b
|
||||
> See more information about dolly-v2 at [databricks/dolly-v2-3b](https://huggingface.co/databricks/dolly-v2-3b)
|
||||
@@ -63,7 +63,11 @@ Currently, dolly-v2 only supports PyTorch. Make sure ``torch`` is available in y
|
||||
|
||||
\b
|
||||
Dolly-v2 Runner will use databricks/dolly-v2-3b as the default model. To change any to any other dolly-v2
|
||||
saved pretrained, or a fine-tune dolly-v2, provide ``OPENLLM_DOLLY_V2_PRETRAINED='databricks/dolly-v2-7b'``
|
||||
saved pretrained, or a fine-tune dolly-v2, provide ``OPENLLM_DOLLY_V2_MODEL_ID='databricks/dolly-v2-7b'``
|
||||
or provide `--model-id` flag when running ``openllm start dolly-v2``:
|
||||
|
||||
\b
|
||||
$ openllm start dolly-v2 --model-id databricks/dolly-v2-7b
|
||||
"""
|
||||
|
||||
INSTRUCTION_KEY = "### Instruction:"
|
||||
|
||||
@@ -38,9 +38,9 @@ class DollyV2(openllm.LLM):
|
||||
|
||||
__openllm_internal__ = True
|
||||
|
||||
default_model = "databricks/dolly-v2-3b"
|
||||
default_id = "databricks/dolly-v2-3b"
|
||||
|
||||
pretrained = ["databricks/dolly-v2-3b", "databricks/dolly-v2-7b", "databricks/dolly-v2-12b"]
|
||||
model_ids = ["databricks/dolly-v2-3b", "databricks/dolly-v2-7b", "databricks/dolly-v2-12b"]
|
||||
|
||||
import_kwargs = {
|
||||
"device_map": "auto" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None,
|
||||
@@ -51,15 +51,15 @@ class DollyV2(openllm.LLM):
|
||||
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||
|
||||
def import_model(
|
||||
self, pretrained: str, tag: bentoml.Tag, *model_args: t.Any, tokenizer_kwds: dict[str, t.Any], **attrs: t.Any
|
||||
self, model_id: str, tag: bentoml.Tag, *model_args: t.Any, tokenizer_kwds: dict[str, t.Any], **attrs: t.Any
|
||||
) -> bentoml.Model:
|
||||
trust_remote_code = attrs.pop("trust_remote_code", True)
|
||||
torch_dtype = attrs.pop("torch_dtype", torch.bfloat16)
|
||||
device_map = attrs.pop("device_map", "auto")
|
||||
|
||||
tokenizer = transformers.AutoTokenizer.from_pretrained(pretrained, **tokenizer_kwds)
|
||||
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, **tokenizer_kwds)
|
||||
pipeline = transformers.pipeline(
|
||||
model=pretrained,
|
||||
model=model_id,
|
||||
tokenizer=tokenizer,
|
||||
trust_remote_code=trust_remote_code,
|
||||
torch_dtype=torch_dtype,
|
||||
|
||||
@@ -40,7 +40,7 @@ class FalconConfig(
|
||||
|
||||
|
||||
START_FALCON_COMMAND_DOCSTRING = """\
|
||||
Run a LLMServer for FalconLM model and pretrained.
|
||||
Run a LLMServer for FalconLM model.
|
||||
|
||||
\b
|
||||
> See more information about falcon at [tiiuae/falcon-7b](https://huggingface.co/tiiuae/falcon-7b)
|
||||
@@ -52,7 +52,11 @@ Currently, FalconLM only supports PyTorch. Make sure ``torch`` is available in y
|
||||
|
||||
\b
|
||||
FalconLM Runner will use tiiuae/falcon-7b as the default model. To change any to any other FalconLM
|
||||
saved pretrained, or a fine-tune FalconLM, provide ``OPENLLM_FALCON_PRETRAINED='tiiuae/falcon-7b-instruct'``
|
||||
saved pretrained, or a fine-tune FalconLM, provide ``OPENLLM_FALCON_MODEL_ID='tiiuae/falcon-7b-instruct'``
|
||||
or provide `--model-id` flag when running ``openllm start falcon``:
|
||||
|
||||
\b
|
||||
$ openllm start falcon --model-id tiiuae/falcon-7b-instruct
|
||||
"""
|
||||
|
||||
DEFAULT_PROMPT_TEMPLATE = """{context}
|
||||
|
||||
@@ -34,9 +34,9 @@ else:
|
||||
class Falcon(openllm.LLM):
|
||||
__openllm_internal__ = True
|
||||
|
||||
default_model = "tiiuae/falcon-7b"
|
||||
default_id = "tiiuae/falcon-7b"
|
||||
|
||||
pretrained = ["tiiuae/falcon-7b", "tiiuae/falcon-40b", "tiiuae/falcon-7b-instruct", "tiiuae/falcon-40b-instruct"]
|
||||
model_ids = ["tiiuae/falcon-7b", "tiiuae/falcon-40b", "tiiuae/falcon-7b-instruct", "tiiuae/falcon-40b-instruct"]
|
||||
|
||||
import_kwargs = {
|
||||
"torch_dtype": torch.bfloat16,
|
||||
@@ -44,15 +44,15 @@ class Falcon(openllm.LLM):
|
||||
}
|
||||
|
||||
def import_model(
|
||||
self, pretrained: str, tag: bentoml.Tag, *model_args: t.Any, tokenizer_kwds: dict[str, t.Any], **attrs: t.Any
|
||||
self, model_id: str, tag: bentoml.Tag, *model_args: t.Any, tokenizer_kwds: dict[str, t.Any], **attrs: t.Any
|
||||
) -> bentoml.Model:
|
||||
trust_remote_code = attrs.pop("trust_remote_code", True)
|
||||
torch_dtype = attrs.pop("torch_dtype", torch.bfloat16)
|
||||
device_map = attrs.pop("device_map", "auto")
|
||||
|
||||
tokenizer = transformers.AutoTokenizer.from_pretrained(pretrained)
|
||||
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)
|
||||
model = transformers.AutoModelForCausalLM.from_pretrained(
|
||||
pretrained,
|
||||
model_id,
|
||||
trust_remote_code=trust_remote_code,
|
||||
torch_dtype=torch_dtype,
|
||||
device_map=device_map,
|
||||
|
||||
@@ -16,7 +16,7 @@ from __future__ import annotations
|
||||
import openllm
|
||||
|
||||
START_FLAN_T5_COMMAND_DOCSTRING = """\
|
||||
Run a LLMServer for FLAN-T5 model and pretrained.
|
||||
Run a LLMServer for FLAN-T5 model.
|
||||
|
||||
\b
|
||||
> See more information about FLAN-T5 at [huggingface/transformers](https://huggingface.co/docs/transformers/model_doc/flan-t5)
|
||||
@@ -34,7 +34,11 @@ By default, this model will use the PyTorch model for inference. However, this m
|
||||
|
||||
\b
|
||||
FLAN-T5 Runner will use google/flan-t5-large as the default model. To change any to any other FLAN-T5
|
||||
saved pretrained, or a fine-tune FLAN-T5, provide ``OPENLLM_FLAN_T5_PRETRAINED='google/flan-t5-xxl'``
|
||||
saved pretrained, or a fine-tune FLAN-T5, provide ``OPENLLM_FLAN_T5_MODEL_ID='google/flan-t5-xxl'``
|
||||
or provide `--model-id` flag when running ``openllm start flan-t5``:
|
||||
|
||||
\b
|
||||
$ openllm start flan-t5 --model-id google/flan-t5-xxl
|
||||
"""
|
||||
|
||||
DEFAULT_PROMPT_TEMPLATE = """Answer the following question:\nQuestion: {instruction}\nAnswer:"""
|
||||
|
||||
@@ -29,9 +29,9 @@ else:
|
||||
class FlanT5(openllm.LLM):
|
||||
__openllm_internal__ = True
|
||||
|
||||
default_model = "google/flan-t5-large"
|
||||
default_id = "google/flan-t5-large"
|
||||
|
||||
pretrained = [
|
||||
model_ids = [
|
||||
"google/flan-t5-small",
|
||||
"google/flan-t5-base",
|
||||
"google/flan-t5-large",
|
||||
|
||||
@@ -24,9 +24,9 @@ from .configuration_flan_t5 import DEFAULT_PROMPT_TEMPLATE
|
||||
class FlaxFlanT5(openllm.LLM):
|
||||
__openllm_internal__ = True
|
||||
|
||||
default_model: str = "google/flan-t5-large"
|
||||
default_id: str = "google/flan-t5-large"
|
||||
|
||||
pretrained = [
|
||||
model_ids = [
|
||||
"google/flan-t5-small",
|
||||
"google/flan-t5-base",
|
||||
"google/flan-t5-large",
|
||||
|
||||
@@ -24,9 +24,9 @@ from .configuration_flan_t5 import DEFAULT_PROMPT_TEMPLATE
|
||||
class TFFlanT5(openllm.LLM):
|
||||
__openllm_internal__ = True
|
||||
|
||||
default_model: str = "google/flan-t5-large"
|
||||
default_id: str = "google/flan-t5-large"
|
||||
|
||||
pretrained = [
|
||||
model_ids = [
|
||||
"google/flan-t5-small",
|
||||
"google/flan-t5-base",
|
||||
"google/flan-t5-large",
|
||||
|
||||
@@ -38,7 +38,7 @@ class StableLMConfig(openllm.LLMConfig, name_type="lowercase", url="https://gith
|
||||
|
||||
|
||||
START_STABLELM_COMMAND_DOCSTRING = """\
|
||||
Run a LLMServer for StableLM model and pretrained.
|
||||
Run a LLMServer for StableLM model.
|
||||
|
||||
\b
|
||||
> See more information about StableLM at [stabilityai/stablelm-base-alpha-3b](https://huggingface.co/stabilityai/stablelm-base-alpha-3b)
|
||||
@@ -50,7 +50,11 @@ Currently, StableLM only supports PyTorch. Make sure ``torch`` is available in y
|
||||
|
||||
\b
|
||||
StableLM Runner will use stabilityai/stablelm-base-alpha-3b as the default model. To change any to any other StableLM
|
||||
saved pretrained, or a fine-tune StableLM, provide ``OPENLLM_STABLELM_PRETRAINED='stabilityai/stablelm-tuned-alpha-3b'``
|
||||
saved pretrained, or a fine-tune StableLM, provide ``OPENLLM_STABLELM_MODEL_ID='stabilityai/stablelm-tuned-alpha-3b'``
|
||||
or provide `--model-id` flag when running ``openllm start stablelm``:
|
||||
|
||||
\b
|
||||
$ openllm start stablelm --model-id 'stabilityai/stablelm-tuned-alpha-3b'
|
||||
"""
|
||||
|
||||
SYSTEM_PROMPT = """<|SYSTEM|># StableLM Tuned (Alpha version)
|
||||
|
||||
@@ -43,9 +43,9 @@ class StableLM(openllm.LLM):
|
||||
__openllm_internal__ = True
|
||||
|
||||
load_in_mha = True
|
||||
default_model = "stabilityai/stablelm-tuned-alpha-3b"
|
||||
default_id = "stabilityai/stablelm-tuned-alpha-3b"
|
||||
|
||||
pretrained = [
|
||||
model_ids = [
|
||||
"stabilityai/stablelm-tuned-alpha-3b",
|
||||
"stabilityai/stablelm-tuned-alpha-7b",
|
||||
"stabilityai/stablelm-base-alpha-3b",
|
||||
@@ -70,7 +70,7 @@ class StableLM(openllm.LLM):
|
||||
use_default_prompt_template: bool = True,
|
||||
**attrs: t.Any,
|
||||
) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
|
||||
if "tuned" in self._pretrained and use_default_prompt_template:
|
||||
if "tuned" in self._model_id and use_default_prompt_template:
|
||||
prompt_variables = {
|
||||
k: v
|
||||
for k, v in attrs.items()
|
||||
|
||||
@@ -42,7 +42,7 @@ class StarCoderConfig(
|
||||
|
||||
|
||||
START_STARCODER_COMMAND_DOCSTRING = """\
|
||||
Run a LLMServer for StarCoder model and pretrained.
|
||||
Run a LLMServer for StarCoder model.
|
||||
|
||||
\b
|
||||
> See more information about StarCoder at [bigcode/starcoder](https://huggingface.co/bigcode/starcoder)
|
||||
@@ -54,7 +54,11 @@ Currently, StarCoder only supports PyTorch. Make sure ``torch`` is available in
|
||||
|
||||
\b
|
||||
StarCoder Runner will use bigcode/starcoder as the default model. To change any to any other StarCoder
|
||||
saved pretrained, or a fine-tune StarCoder, provide ``OPENLLM_STARCODER_PRETRAINED='bigcode/starcoder'``
|
||||
saved pretrained, or a fine-tune StarCoder, provide ``OPENLLM_STARCODER_MODEL_ID='bigcode/starcoder'``
|
||||
or provide `--model-id` flag when running ``openllm start starcoder``:
|
||||
|
||||
\b
|
||||
$ openllm start starcoder --model-id 'bigcode/starcoder'
|
||||
"""
|
||||
|
||||
DEFAULT_PROMPT_TEMPLATE = """{instruction}"""
|
||||
|
||||
@@ -40,21 +40,21 @@ FIM_INDICATOR = "<FILL_HERE>"
|
||||
class StarCoder(openllm.LLM):
|
||||
__openllm_internal__ = True
|
||||
|
||||
default_model = "bigcode/starcoder"
|
||||
default_id = "bigcode/starcoder"
|
||||
|
||||
pretrained = ["bigcode/starcoder", "bigcode/starcoderbase"]
|
||||
model_ids = ["bigcode/starcoder", "bigcode/starcoderbase"]
|
||||
|
||||
device = torch.device("cuda")
|
||||
|
||||
def import_model(
|
||||
self,
|
||||
pretrained: str,
|
||||
model_id: str,
|
||||
tag: bentoml.Tag,
|
||||
*model_args: t.Any,
|
||||
tokenizer_kwds: dict[str, t.Any],
|
||||
**attrs: t.Any,
|
||||
) -> bentoml.Model:
|
||||
tokenizer = transformers.AutoTokenizer.from_pretrained(pretrained, **tokenizer_kwds)
|
||||
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, **tokenizer_kwds)
|
||||
tokenizer.add_special_tokens(
|
||||
{
|
||||
"additional_special_tokens": [EOD, FIM_PREFIX, FIM_MIDDLE, FIM_SUFFIX, FIM_PAD],
|
||||
@@ -62,7 +62,7 @@ class StarCoder(openllm.LLM):
|
||||
}
|
||||
)
|
||||
|
||||
model = transformers.AutoModelForCausalLM.from_pretrained(pretrained, **attrs)
|
||||
model = transformers.AutoModelForCausalLM.from_pretrained(model_id, **attrs)
|
||||
|
||||
try:
|
||||
return bentoml.transformers.save_model(tag, model, custom_objects={"tokenizer": tokenizer})
|
||||
|
||||
@@ -91,8 +91,8 @@ class ModelEnv:
|
||||
return f"OPENLLM_{self.model_name.upper()}_CONFIG"
|
||||
|
||||
@property
|
||||
def pretrained(self) -> str:
|
||||
return f"OPENLLM_{self.model_name.upper()}_PRETRAINED"
|
||||
def model_id(self) -> str:
|
||||
return f"OPENLLM_{self.model_name.upper()}_MODEL_ID"
|
||||
|
||||
@property
|
||||
def bettertransformer(self) -> str:
|
||||
|
||||
Reference in New Issue
Block a user