mirror of
https://github.com/bentoml/OpenLLM.git
synced 2026-04-23 16:39:48 -04:00
refactor: pretrained => model_id
I think model_id makes more sense than calling it pretrained Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
This commit is contained in:
@@ -92,14 +92,14 @@ class TaskType(enum.Enum, metaclass=TypeMeta):
|
||||
|
||||
|
||||
def import_model(
|
||||
model_name: str,
|
||||
model_id: str,
|
||||
tag: bentoml.Tag,
|
||||
_model_framework: str,
|
||||
*model_args: t.Any,
|
||||
tokenizer_kwds: dict[str, t.Any],
|
||||
**attrs: t.Any,
|
||||
):
|
||||
"""Auto detect model type from given model_name and import it to bentoml's model store.
|
||||
"""Auto detect model type from given model_id and import it to bentoml's model store.
|
||||
|
||||
For all kwargs, it will be parsed into `transformers.AutoConfig.from_pretrained` first,
|
||||
returning all of the unused kwargs.
|
||||
@@ -111,7 +111,7 @@ def import_model(
|
||||
Refer to Transformers documentation for more information about kwargs.
|
||||
|
||||
Args:
|
||||
model_name: Model name to be imported. use `openllm models` to see available entries
|
||||
model_id: Model id to be imported. See `openllm models` for all supported models.
|
||||
tag: Tag to be used for the model. This is usually generated for you.
|
||||
model_args: Args to be passed into AutoModelForSeq2SeqLM or AutoModelForCausalLM (+ TF, Flax variants).
|
||||
**attrs: Kwargs to be passed into AutoModelForSeq2SeqLM or AutoModelForCausalLM (+ TF, Flax variants).
|
||||
@@ -139,7 +139,7 @@ def import_model(
|
||||
config, attrs = t.cast(
|
||||
"tuple[transformers.PretrainedConfig, dict[str, t.Any]]",
|
||||
transformers.AutoConfig.from_pretrained(
|
||||
model_name, return_unused_kwargs=True, trust_remote_code=trust_remote_code, **hub_attrs, **copied_attrs
|
||||
model_id, return_unused_kwargs=True, trust_remote_code=trust_remote_code, **hub_attrs, **copied_attrs
|
||||
),
|
||||
)
|
||||
|
||||
@@ -156,13 +156,13 @@ def import_model(
|
||||
getattr(
|
||||
transformers, FRAMEWORK_TO_AUTOCLASS_MAPPING[_model_framework][TaskType[task_type].value - 1]
|
||||
).from_pretrained(
|
||||
model_name, *model_args, config=config, trust_remote_code=trust_remote_code, **hub_attrs, **attrs
|
||||
model_id, *model_args, config=config, trust_remote_code=trust_remote_code, **hub_attrs, **attrs
|
||||
),
|
||||
custom_objects={
|
||||
"tokenizer": t.cast(
|
||||
"LLMTokenizer",
|
||||
transformers.AutoTokenizer.from_pretrained(
|
||||
model_name, config=config, trust_remote_code=trust_remote_code, **hub_attrs, **tokenizer_kwds
|
||||
model_id, config=config, trust_remote_code=trust_remote_code, **hub_attrs, **tokenizer_kwds
|
||||
),
|
||||
)
|
||||
},
|
||||
@@ -179,7 +179,7 @@ def import_model(
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
|
||||
_required_namespace = {"default_model", "pretrained"}
|
||||
_required_namespace = {"default_id", "model_ids"}
|
||||
|
||||
_reserved_namespace = _required_namespace | {
|
||||
"config_class",
|
||||
@@ -192,11 +192,11 @@ _reserved_namespace = _required_namespace | {
|
||||
class LLMInterface(ABC):
|
||||
"""This defines the loose contract for all openllm.LLM implementations."""
|
||||
|
||||
default_model: str
|
||||
"""Return the default model to use when using 'openllm start <model_name>'.
|
||||
This could be one of the keys in 'self.pretrained' or custom users model."""
|
||||
default_id: str
|
||||
"""Return the default model to use when using 'openllm start <model_id>'.
|
||||
This could be one of the keys in 'self.model_ids' or custom users model."""
|
||||
|
||||
pretrained: list[str]
|
||||
model_ids: list[str]
|
||||
"""A list of supported pretrained models tag for this given runnable.
|
||||
|
||||
For example:
|
||||
@@ -253,7 +253,7 @@ class LLMInterface(ABC):
|
||||
pass
|
||||
|
||||
def import_model(
|
||||
self, pretrained: str, tag: bentoml.Tag, *args: t.Any, tokenizer_kwds: dict[str, t.Any], **attrs: t.Any
|
||||
self, model_id: str, tag: bentoml.Tag, *args: t.Any, tokenizer_kwds: dict[str, t.Any], **attrs: t.Any
|
||||
) -> bentoml.Model:
|
||||
"""This function can be implemented if default import_model doesn't satisfy your needs."""
|
||||
raise NotImplementedError
|
||||
@@ -275,6 +275,8 @@ class LLMMetaclass(ABCMeta):
|
||||
namespace["__annotations__"] = annotations_dict
|
||||
|
||||
# NOTE: check for required attributes
|
||||
if "__openllm_internal__" not in namespace:
|
||||
_required_namespace.add("config_class")
|
||||
for k in _required_namespace:
|
||||
if k not in namespace:
|
||||
raise RuntimeError(f"Missing required key '{k}'. Make sure to define it within the LLM subclass.")
|
||||
@@ -378,13 +380,13 @@ class LLM(LLMInterface, metaclass=LLMMetaclass):
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(
|
||||
cls, pretrained: str | None = None, llm_config: openllm.LLMConfig | None = None, *args: t.Any, **attrs: t.Any
|
||||
cls, model_id: str | None = None, llm_config: openllm.LLMConfig | None = None, *args: t.Any, **attrs: t.Any
|
||||
) -> LLM:
|
||||
return cls(pretrained=pretrained, llm_config=llm_config, *args, **attrs)
|
||||
return cls(model_id=model_id, llm_config=llm_config, *args, **attrs)
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
pretrained: str | None = None,
|
||||
model_id: str | None = None,
|
||||
llm_config: openllm.LLMConfig | None = None,
|
||||
*args: t.Any,
|
||||
**attrs: t.Any,
|
||||
@@ -408,7 +410,7 @@ class LLM(LLMInterface, metaclass=LLMMetaclass):
|
||||
```python
|
||||
def import_model(
|
||||
self,
|
||||
pretrained: str,
|
||||
model_id: str,
|
||||
tag: bentoml.Tag,
|
||||
*args: t.Any,
|
||||
tokenizer_kwds: dict[str, t.Any],
|
||||
@@ -417,11 +419,11 @@ class LLM(LLMInterface, metaclass=LLMMetaclass):
|
||||
return bentoml.transformers.save_model(
|
||||
tag,
|
||||
transformers.AutoModelForCausalLM.from_pretrained(
|
||||
pretrained, device_map="auto", torch_dtype=torch.bfloat16, **attrs
|
||||
model_id, device_map="auto", torch_dtype=torch.bfloat16, **attrs
|
||||
),
|
||||
custom_objects={
|
||||
"tokenizer": transformers.AutoTokenizer.from_pretrained(
|
||||
pretrained, padding_size="left", **tokenizer_kwds
|
||||
model_id, padding_size="left", **tokenizer_kwds
|
||||
)
|
||||
},
|
||||
)
|
||||
@@ -440,14 +442,14 @@ class LLM(LLMInterface, metaclass=LLMMetaclass):
|
||||
Note: If you implement your own `import_model`, then `import_kwargs` will be the
|
||||
default kwargs for every load. You can still override those via ``openllm.Runner``.
|
||||
|
||||
Note that this tag will be generated based on `self.default_model` or the given `pretrained` kwds.
|
||||
Note that this tag will be generated based on `self.default_id` or the given `model_id` kwds.
|
||||
passed from the __init__ constructor.
|
||||
|
||||
``llm_post_init`` can also be implemented if you need to do any
|
||||
additional initialization after everything is setup.
|
||||
|
||||
Args:
|
||||
pretrained: The pretrained model to use. Defaults to None. It will use 'self.default_model' if None.
|
||||
model_id: The pretrained model to use. Defaults to None. If None, 'self.default_id' will be used.
|
||||
llm_config: The config to use for this LLM. Defaults to None. If not passed, we will use 'self.config_class'
|
||||
to construct default configuration.
|
||||
*args: The args to be passed to the model.
|
||||
@@ -462,14 +464,14 @@ class LLM(LLMInterface, metaclass=LLMMetaclass):
|
||||
# The rests of the kwargs that is not used by the config class should be stored into __openllm_extras__.
|
||||
attrs = self.config.__openllm_extras__
|
||||
|
||||
if pretrained is None:
|
||||
pretrained = os.environ.get(self.config.__openllm_env__.pretrained, None)
|
||||
if not pretrained:
|
||||
assert self.default_model, "A default model is required for any LLM."
|
||||
pretrained = self.default_model
|
||||
if model_id is None:
|
||||
model_id = os.environ.get(self.config.__openllm_env__.model_id, None)
|
||||
if not model_id:
|
||||
assert self.default_id, "A default model is required for any LLM."
|
||||
model_id = self.default_id
|
||||
|
||||
# NOTE: This is the actual given path or pretrained weight for this LLM.
|
||||
self._pretrained = pretrained
|
||||
self._model_id = model_id
|
||||
|
||||
# NOTE: Save the args and kwargs for latter load
|
||||
self._llm_args = args
|
||||
@@ -491,19 +493,19 @@ class LLM(LLMInterface, metaclass=LLMMetaclass):
|
||||
# NOTE: The section below defines a loose contract with langchain's LLM interface.
|
||||
@property
|
||||
def llm_type(self) -> str:
|
||||
return convert_transformers_model_name(self._pretrained)
|
||||
return convert_transformers_model_name(self._model_id)
|
||||
|
||||
@property
|
||||
def identifying_params(self) -> dict[str, t.Any]:
|
||||
return {
|
||||
"configuration": self.config.model_dump_json().decode(),
|
||||
"pretrained": orjson.dumps(self.pretrained).decode(),
|
||||
"model_ids": orjson.dumps(self.model_ids).decode(),
|
||||
}
|
||||
|
||||
@t.overload
|
||||
def make_tag(
|
||||
self,
|
||||
model_name_or_path: str | None = None,
|
||||
model_id: str | None = None,
|
||||
return_unused_kwargs: t.Literal[False] = ...,
|
||||
trust_remote_code: bool = ...,
|
||||
**attrs: t.Any,
|
||||
@@ -513,7 +515,7 @@ class LLM(LLMInterface, metaclass=LLMMetaclass):
|
||||
@t.overload
|
||||
def make_tag(
|
||||
self,
|
||||
model_name_or_path: str | None = None,
|
||||
model_id: str | None = None,
|
||||
return_unused_kwargs: t.Literal[True] = ...,
|
||||
trust_remote_code: bool = ...,
|
||||
**attrs: t.Any,
|
||||
@@ -522,7 +524,7 @@ class LLM(LLMInterface, metaclass=LLMMetaclass):
|
||||
|
||||
def make_tag(
|
||||
self,
|
||||
model_name_or_path: str | None = None,
|
||||
model_id: str | None = None,
|
||||
return_unused_kwargs: bool = False,
|
||||
trust_remote_code: bool = False,
|
||||
**attrs: t.Any,
|
||||
@@ -543,8 +545,8 @@ class LLM(LLMInterface, metaclass=LLMMetaclass):
|
||||
Returns:
|
||||
A tuple of ``bentoml.Tag`` and a dict of unused kwargs.
|
||||
"""
|
||||
if model_name_or_path is None:
|
||||
model_name_or_path = self._pretrained
|
||||
if model_id is None:
|
||||
model_id = self._model_id
|
||||
|
||||
if "return_unused_kwargs" in attrs:
|
||||
logger.debug("Ignoring 'return_unused_kwargs' in 'generate_tag_from_model_name'.")
|
||||
@@ -553,12 +555,12 @@ class LLM(LLMInterface, metaclass=LLMMetaclass):
|
||||
config, attrs = t.cast(
|
||||
"tuple[transformers.PretrainedConfig, dict[str, t.Any]]",
|
||||
transformers.AutoConfig.from_pretrained(
|
||||
model_name_or_path, return_unused_kwargs=True, trust_remote_code=trust_remote_code, **attrs
|
||||
model_id, return_unused_kwargs=True, trust_remote_code=trust_remote_code, **attrs
|
||||
),
|
||||
)
|
||||
name = convert_transformers_model_name(model_name_or_path)
|
||||
name = convert_transformers_model_name(model_id)
|
||||
|
||||
if os.path.exists(os.path.dirname(model_name_or_path)):
|
||||
if os.path.exists(os.path.dirname(model_id)):
|
||||
# If the model_name_or_path is a path, we assume it's a local path,
|
||||
# then users must pass a version for this.
|
||||
model_version = attrs.pop("openllm_model_version", None)
|
||||
@@ -590,7 +592,7 @@ class LLM(LLMInterface, metaclass=LLMMetaclass):
|
||||
"Given %s from '%s' doesn't contain a commit hash. We will generate"
|
||||
" the tag without specific version.",
|
||||
t.cast("type[transformers.PretrainedConfig]", config.__class__),
|
||||
model_name_or_path,
|
||||
model_id,
|
||||
)
|
||||
tag = bentoml.Tag.from_taglike(f"{self.__llm_implementation__}-{name}:{model_version}")
|
||||
|
||||
@@ -621,7 +623,7 @@ class LLM(LLMInterface, metaclass=LLMMetaclass):
|
||||
}
|
||||
|
||||
return self.import_model(
|
||||
self._pretrained,
|
||||
self._model_id,
|
||||
tag,
|
||||
*self._llm_args,
|
||||
tokenizer_kwds=tokenizer_kwds,
|
||||
|
||||
@@ -388,8 +388,8 @@ def start_model_command(
|
||||
docstring = f"""\
|
||||
{ModelEnv.start_docstring}
|
||||
\b
|
||||
The available pretrained models to use with '{model_name}' are: {for_doc.pretrained} [default: {for_doc.default_model}]
|
||||
Tip: One can pass one of the aforementioned to '--pretrained' to use other pretrained weights.
|
||||
Available model_id(s) to use with '{model_name}' are: {for_doc.model_ids} [default: {for_doc.default_id}]
|
||||
Tip: One can pass one of the aforementioned to '--model-id' to use other pretrained weights.
|
||||
"""
|
||||
command_attrs: dict[str, t.Any] = {
|
||||
"name": ModelEnv.model_name,
|
||||
@@ -430,9 +430,7 @@ Tip: One can pass one of the aforementioned to '--pretrained' to use other pretr
|
||||
@llm_config.to_click_options
|
||||
@parse_serve_args(_serve_grpc)
|
||||
@click.option("--server-timeout", type=int, default=3600, help="Server timeout in seconds")
|
||||
@click.option(
|
||||
"--pretrained", type=click.STRING, default=None, help="Optional pretrained name or path to fine-tune weight."
|
||||
)
|
||||
@model_id_option
|
||||
@click.option(
|
||||
"--device",
|
||||
type=tuple,
|
||||
@@ -444,18 +442,18 @@ Tip: One can pass one of the aforementioned to '--pretrained' to use other pretr
|
||||
)
|
||||
def model_start(
|
||||
server_timeout: int,
|
||||
pretrained: str | None,
|
||||
model_id: str | None,
|
||||
device: tuple[str, ...] | None,
|
||||
**attrs: t.Any,
|
||||
) -> openllm.LLMConfig:
|
||||
config, server_attrs = llm_config.model_validate_click(**attrs)
|
||||
|
||||
if ModelEnv.get_framework_env() == "flax":
|
||||
llm = openllm.AutoFlaxLLM.for_model(model_name, pretrained=pretrained, llm_config=config)
|
||||
llm = openllm.AutoFlaxLLM.for_model(model_name, model_id=model_id, llm_config=config)
|
||||
elif ModelEnv.get_framework_env() == "tf":
|
||||
llm = openllm.AutoTFLLM.for_model(model_name, pretrained=pretrained, llm_config=config)
|
||||
llm = openllm.AutoTFLLM.for_model(model_name, model_id=model_id, llm_config=config)
|
||||
else:
|
||||
llm = openllm.AutoLLM.for_model(model_name, pretrained=pretrained, llm_config=config)
|
||||
llm = openllm.AutoLLM.for_model(model_name, model_id=model_id, llm_config=config)
|
||||
|
||||
# NOTE: We need to initialize llm here first to check if the model is already downloaded to
|
||||
# avoid deadlock before the subprocess forking.
|
||||
@@ -580,6 +578,12 @@ output_option = click.option(
|
||||
default="pretty",
|
||||
help="Showing output type. Default to 'pretty'",
|
||||
)
|
||||
model_id_option = click.option(
|
||||
"--model-id",
|
||||
type=click.STRING,
|
||||
default=None,
|
||||
help="Optional model_id name or path for (fine-tune) weight.",
|
||||
)
|
||||
|
||||
|
||||
def cli_factory() -> click.Group:
|
||||
@@ -626,16 +630,15 @@ def cli_factory() -> click.Group:
|
||||
@click.argument(
|
||||
"model_name", type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING.keys()])
|
||||
)
|
||||
@click.option("--pretrained", default=None, help="Given pretrained model name for the given model name [Optional].")
|
||||
@click.option("--overwrite", is_flag=True, help="Overwrite existing Bento for given LLM if it already exists.")
|
||||
@model_id_option
|
||||
@output_option
|
||||
def build(model_name: str, pretrained: str | None, overwrite: bool, output: OutputLiteral):
|
||||
@click.option("--overwrite", is_flag=True, help="Overwrite existing Bento for given LLM if it already exists.")
|
||||
def build(model_name: str, model_id: str | None, overwrite: bool, output: OutputLiteral):
|
||||
"""Package a given models into a Bento.
|
||||
|
||||
$ openllm build flan-t5
|
||||
|
||||
\b
|
||||
|
||||
NOTE: To run a container built from this Bento with GPU support, make sure
|
||||
to have https://github.com/NVIDIA/nvidia-container-toolkit install locally.
|
||||
"""
|
||||
@@ -645,7 +648,7 @@ def cli_factory() -> click.Group:
|
||||
bento, _previously_built = openllm.build(
|
||||
model_name,
|
||||
__cli__=True,
|
||||
pretrained=pretrained,
|
||||
model_id=model_id,
|
||||
_overwrite_existing_bento=overwrite,
|
||||
)
|
||||
|
||||
@@ -684,13 +687,13 @@ def cli_factory() -> click.Group:
|
||||
else:
|
||||
failed_initialized: list[tuple[str, Exception]] = []
|
||||
|
||||
json_data: dict[str, dict[t.Literal["pretrained", "description"], t.Any]] = {}
|
||||
json_data: dict[str, dict[t.Literal["model_id", "description"], t.Any]] = {}
|
||||
|
||||
for m in models:
|
||||
try:
|
||||
model = openllm.AutoLLM.for_model(m)
|
||||
docs = inspect.cleandoc(model.config.__doc__ or "(No description)")
|
||||
json_data[m] = {"pretrained": model.pretrained, "description": docs}
|
||||
json_data[m] = {"model_id": model.model_ids, "description": docs}
|
||||
except Exception as err:
|
||||
failed_initialized.append((m, err))
|
||||
|
||||
@@ -701,7 +704,7 @@ def cli_factory() -> click.Group:
|
||||
|
||||
data: list[str | tuple[str, str, list[str]]] = []
|
||||
for m, v in json_data.items():
|
||||
data.extend([(m, v["description"], v["pretrained"])])
|
||||
data.extend([(m, v["description"], v["model_id"])])
|
||||
column_widths = [int(COLUMNS / 6), int(COLUMNS / 3 * 2), int(COLUMNS / 6)]
|
||||
|
||||
if len(data) == 0 and len(failed_initialized) > 0:
|
||||
@@ -714,7 +717,7 @@ def cli_factory() -> click.Group:
|
||||
table = tabulate.tabulate(
|
||||
data,
|
||||
tablefmt="fancy_grid",
|
||||
headers=["LLM", "Description", "Pretrained"],
|
||||
headers=["LLM", "Description", "Model Ids"],
|
||||
maxcolwidths=column_widths,
|
||||
)
|
||||
|
||||
@@ -739,11 +742,9 @@ def cli_factory() -> click.Group:
|
||||
@click.argument(
|
||||
"model_name", type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING.keys()])
|
||||
)
|
||||
@click.option(
|
||||
"--pretrained", type=click.STRING, default=None, help="Optional pretrained name or path to fine-tune weight."
|
||||
)
|
||||
@model_id_option
|
||||
@output_option
|
||||
def download_models(model_name: str, pretrained: str | None, output: OutputLiteral):
|
||||
def download_models(model_name: str, model_id: str | None, output: OutputLiteral):
|
||||
"""Setup LLM interactively.
|
||||
|
||||
Note: This is useful for development and setup for fine-tune.
|
||||
@@ -751,11 +752,11 @@ def cli_factory() -> click.Group:
|
||||
config = openllm.AutoConfig.for_model(model_name)
|
||||
env = config.__openllm_env__.get_framework_env()
|
||||
if env == "flax":
|
||||
model = openllm.AutoFlaxLLM.for_model(model_name, pretrained=pretrained, llm_config=config)
|
||||
model = openllm.AutoFlaxLLM.for_model(model_name, model_id=model_id, llm_config=config)
|
||||
elif env == "tf":
|
||||
model = openllm.AutoTFLLM.for_model(model_name, pretrained=pretrained, llm_config=config)
|
||||
model = openllm.AutoTFLLM.for_model(model_name, model_id=model_id, llm_config=config)
|
||||
else:
|
||||
model = openllm.AutoLLM.for_model(model_name, pretrained=pretrained, llm_config=config)
|
||||
model = openllm.AutoLLM.for_model(model_name, model_id=model_id, llm_config=config)
|
||||
|
||||
tag = model.make_tag(trust_remote_code=config.__openllm_trust_remote_code__)
|
||||
|
||||
@@ -829,7 +830,7 @@ def cli_factory() -> click.Group:
|
||||
)
|
||||
@output_option
|
||||
@click.argument("query", type=click.STRING)
|
||||
def query(
|
||||
def query_(
|
||||
query: str,
|
||||
endpoint: str,
|
||||
timeout: int,
|
||||
@@ -838,7 +839,7 @@ def cli_factory() -> click.Group:
|
||||
):
|
||||
"""Ask a LLM interactively, from a terminal.
|
||||
|
||||
$ openllm query --endpoint http://12.323.2.1 "What is the meaning of life?"
|
||||
$ openllm query --endpoint http://12.323.2.1:3000 "What is the meaning of life?"
|
||||
"""
|
||||
if server_type == "grpc":
|
||||
endpoint = re.sub(r"http://", "", endpoint)
|
||||
@@ -870,7 +871,7 @@ def cli_factory() -> click.Group:
|
||||
_echo(res["responses"], fg="white")
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
assert download_models and build and models and start and start_grpc and query and prune
|
||||
assert download_models and build and models and start and start_grpc and query_ and prune
|
||||
|
||||
if psutil.WINDOWS:
|
||||
sys.stdout.reconfigure(encoding="utf-8") # type: ignore
|
||||
|
||||
@@ -47,7 +47,7 @@ class _BaseAutoLLMClass:
|
||||
def for_model(
|
||||
cls,
|
||||
model_name: str,
|
||||
pretrained: str | None = None,
|
||||
model_id: str | None = None,
|
||||
return_runner_kwargs: t.Literal[False] = ...,
|
||||
llm_config: openllm.LLMConfig | None = ...,
|
||||
**attrs: t.Any,
|
||||
@@ -59,7 +59,7 @@ class _BaseAutoLLMClass:
|
||||
def for_model(
|
||||
cls,
|
||||
model_name: str,
|
||||
pretrained: str | None = None,
|
||||
model_id: str | None = None,
|
||||
return_runner_kwargs: t.Literal[True] = ...,
|
||||
llm_config: openllm.LLMConfig | None = ...,
|
||||
**attrs: t.Any,
|
||||
@@ -70,11 +70,18 @@ class _BaseAutoLLMClass:
|
||||
def for_model(
|
||||
cls,
|
||||
model_name: str,
|
||||
pretrained: str | None = None,
|
||||
model_id: str | None = None,
|
||||
return_runner_kwargs: bool = False,
|
||||
llm_config: openllm.LLMConfig | None = ...,
|
||||
**attrs: t.Any,
|
||||
) -> openllm.LLM | tuple[openllm.LLM, dict[str, t.Any]]:
|
||||
"""The lower level API for creating a LLM instance.
|
||||
|
||||
```python
|
||||
>>> import openllm
|
||||
>>> llm = openllm.AutoLLM.for_model("flan-t5")
|
||||
```
|
||||
"""
|
||||
runner_kwargs_name = [
|
||||
"models",
|
||||
"max_batch_size",
|
||||
@@ -88,7 +95,7 @@ class _BaseAutoLLMClass:
|
||||
# The rest of kwargs is now passed to config
|
||||
llm_config = AutoConfig.for_model(model_name, **attrs)
|
||||
if type(llm_config) in cls._model_mapping.keys():
|
||||
llm = cls._model_mapping[type(llm_config)].from_pretrained(pretrained, llm_config=llm_config, **attrs)
|
||||
llm = cls._model_mapping[type(llm_config)].from_pretrained(model_id, llm_config=llm_config, **attrs)
|
||||
if not return_runner_kwargs:
|
||||
return llm
|
||||
return llm, to_runner_attrs
|
||||
@@ -98,19 +105,19 @@ class _BaseAutoLLMClass:
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def create_runner(cls, model_name: str, pretrained: str | None = None, **attrs: t.Any) -> bentoml.Runner:
|
||||
def create_runner(cls, model_name: str, model_id: str | None = None, **attrs: t.Any) -> bentoml.Runner:
|
||||
"""
|
||||
Create a LLM Runner for the given model name.
|
||||
|
||||
Args:
|
||||
model_name: The model name to instantiate.
|
||||
pretrained: The pretrained model name to instantiate.
|
||||
model_id: The pretrained model name to instantiate.
|
||||
**attrs: Additional keyword arguments passed along to the specific configuration class.
|
||||
|
||||
Returns:
|
||||
A LLM instance.
|
||||
"""
|
||||
llm, runner_attrs = cls.for_model(model_name, pretrained, return_runner_kwargs=True, **attrs)
|
||||
llm, runner_attrs = cls.for_model(model_name, model_id, return_runner_kwargs=True, **attrs)
|
||||
return llm.to_runner(**runner_attrs)
|
||||
|
||||
@classmethod
|
||||
|
||||
@@ -55,7 +55,7 @@ class ChatGLMConfig(
|
||||
|
||||
|
||||
START_CHATGLM_COMMAND_DOCSTRING = """\
|
||||
Run a LLMServer for ChatGLM model and pretrained.
|
||||
Run a LLMServer for ChatGLM model.
|
||||
|
||||
\b
|
||||
> See more information about ChatGLM at [THUDM/ChatGLM-6b](https://huggingface.co/thudm/chatglm-6b)
|
||||
@@ -67,7 +67,11 @@ Currently, ChatGLM only supports PyTorch. Make sure ``torch`` is available in yo
|
||||
|
||||
\b
|
||||
ChatGLM Runner will use THUDM/ChatGLM-6b as the default model. To change any to any other ChatGLM
|
||||
saved pretrained, or a fine-tune ChatGLM, provide ``OPENLLM_CHATGLM_PRETRAINED='thudm/chatglm-6b-int8'``
|
||||
saved pretrained, or a fine-tune ChatGLM, provide ``OPENLLM_CHATGLM_MODEL_ID='thudm/chatglm-6b-int8'``
|
||||
or provide `--model-id` flag when running ``openllm start chatglm``:
|
||||
|
||||
\b
|
||||
$ openllm start chatglm --model-id='thudm/chatglm-6b-int8'
|
||||
"""
|
||||
|
||||
DEFAULT_PROMPT_TEMPLATE = """{instruction}"""
|
||||
|
||||
@@ -41,22 +41,27 @@ class InvalidScoreLogitsProcessor(LogitsProcessor):
|
||||
class ChatGLM(openllm.LLM):
|
||||
__openllm_internal__ = True
|
||||
|
||||
default_model = "thudm/chatglm-6b-int4"
|
||||
default_id = "thudm/chatglm-6b-int4"
|
||||
|
||||
pretrained = ["thudm/chatglm-6b", "thudm/chatglm-6b-int8", "thudm/chatglm-6b-int4"]
|
||||
model_ids = ["thudm/chatglm-6b", "thudm/chatglm-6b-int8", "thudm/chatglm-6b-int4"]
|
||||
|
||||
device = torch.device("cuda")
|
||||
|
||||
def import_model(
|
||||
self, pretrained: str, tag: bentoml.Tag, *model_args: t.Any, tokenizer_kwds: dict[str, t.Any], **attrs: t.Any
|
||||
self,
|
||||
model_id: str,
|
||||
tag: bentoml.Tag,
|
||||
*model_args: t.Any,
|
||||
tokenizer_kwds: dict[str, t.Any],
|
||||
**attrs: t.Any,
|
||||
) -> bentoml.Model:
|
||||
trust_remote_code = attrs.pop("trust_remote_code", True)
|
||||
return bentoml.transformers.save_model(
|
||||
tag,
|
||||
transformers.AutoModel.from_pretrained(pretrained, trust_remote_code=trust_remote_code),
|
||||
transformers.AutoModel.from_pretrained(model_id, trust_remote_code=trust_remote_code),
|
||||
custom_objects={
|
||||
"tokenizer": transformers.AutoTokenizer.from_pretrained(
|
||||
pretrained, trust_remote_code=trust_remote_code, **tokenizer_kwds
|
||||
model_id, trust_remote_code=trust_remote_code, **tokenizer_kwds
|
||||
)
|
||||
},
|
||||
)
|
||||
|
||||
@@ -51,7 +51,7 @@ class DollyV2Config(
|
||||
|
||||
|
||||
START_DOLLY_V2_COMMAND_DOCSTRING = """\
|
||||
Run a LLMServer for dolly-v2 model and pretrained.
|
||||
Run a LLMServer for dolly-v2 model.
|
||||
|
||||
\b
|
||||
> See more information about dolly-v2 at [databricks/dolly-v2-3b](https://huggingface.co/databricks/dolly-v2-3b)
|
||||
@@ -63,7 +63,11 @@ Currently, dolly-v2 only supports PyTorch. Make sure ``torch`` is available in y
|
||||
|
||||
\b
|
||||
Dolly-v2 Runner will use databricks/dolly-v2-3b as the default model. To change any to any other dolly-v2
|
||||
saved pretrained, or a fine-tune dolly-v2, provide ``OPENLLM_DOLLY_V2_PRETRAINED='databricks/dolly-v2-7b'``
|
||||
saved pretrained, or a fine-tune dolly-v2, provide ``OPENLLM_DOLLY_V2_MODEL_ID='databricks/dolly-v2-7b'``
|
||||
or provide `--model-id` flag when running ``openllm start dolly-v2``:
|
||||
|
||||
\b
|
||||
$ openllm start dolly-v2 --model-id databricks/dolly-v2-7b
|
||||
"""
|
||||
|
||||
INSTRUCTION_KEY = "### Instruction:"
|
||||
|
||||
@@ -38,9 +38,9 @@ class DollyV2(openllm.LLM):
|
||||
|
||||
__openllm_internal__ = True
|
||||
|
||||
default_model = "databricks/dolly-v2-3b"
|
||||
default_id = "databricks/dolly-v2-3b"
|
||||
|
||||
pretrained = ["databricks/dolly-v2-3b", "databricks/dolly-v2-7b", "databricks/dolly-v2-12b"]
|
||||
model_ids = ["databricks/dolly-v2-3b", "databricks/dolly-v2-7b", "databricks/dolly-v2-12b"]
|
||||
|
||||
import_kwargs = {
|
||||
"device_map": "auto" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None,
|
||||
@@ -51,15 +51,15 @@ class DollyV2(openllm.LLM):
|
||||
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||
|
||||
def import_model(
|
||||
self, pretrained: str, tag: bentoml.Tag, *model_args: t.Any, tokenizer_kwds: dict[str, t.Any], **attrs: t.Any
|
||||
self, model_id: str, tag: bentoml.Tag, *model_args: t.Any, tokenizer_kwds: dict[str, t.Any], **attrs: t.Any
|
||||
) -> bentoml.Model:
|
||||
trust_remote_code = attrs.pop("trust_remote_code", True)
|
||||
torch_dtype = attrs.pop("torch_dtype", torch.bfloat16)
|
||||
device_map = attrs.pop("device_map", "auto")
|
||||
|
||||
tokenizer = transformers.AutoTokenizer.from_pretrained(pretrained, **tokenizer_kwds)
|
||||
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, **tokenizer_kwds)
|
||||
pipeline = transformers.pipeline(
|
||||
model=pretrained,
|
||||
model=model_id,
|
||||
tokenizer=tokenizer,
|
||||
trust_remote_code=trust_remote_code,
|
||||
torch_dtype=torch_dtype,
|
||||
|
||||
@@ -40,7 +40,7 @@ class FalconConfig(
|
||||
|
||||
|
||||
START_FALCON_COMMAND_DOCSTRING = """\
|
||||
Run a LLMServer for FalconLM model and pretrained.
|
||||
Run a LLMServer for FalconLM model.
|
||||
|
||||
\b
|
||||
> See more information about falcon at [tiiuae/falcon-7b](https://huggingface.co/tiiuae/falcon-7b)
|
||||
@@ -52,7 +52,11 @@ Currently, FalconLM only supports PyTorch. Make sure ``torch`` is available in y
|
||||
|
||||
\b
|
||||
FalconLM Runner will use tiiuae/falcon-7b as the default model. To change any to any other FalconLM
|
||||
saved pretrained, or a fine-tune FalconLM, provide ``OPENLLM_FALCON_PRETRAINED='tiiuae/falcon-7b-instruct'``
|
||||
saved pretrained, or a fine-tune FalconLM, provide ``OPENLLM_FALCON_MODEL_ID='tiiuae/falcon-7b-instruct'``
|
||||
or provide `--model-id` flag when running ``openllm start falcon``:
|
||||
|
||||
\b
|
||||
$ openllm start falcon --model-id tiiuae/falcon-7b-instruct
|
||||
"""
|
||||
|
||||
DEFAULT_PROMPT_TEMPLATE = """{context}
|
||||
|
||||
@@ -34,9 +34,9 @@ else:
|
||||
class Falcon(openllm.LLM):
|
||||
__openllm_internal__ = True
|
||||
|
||||
default_model = "tiiuae/falcon-7b"
|
||||
default_id = "tiiuae/falcon-7b"
|
||||
|
||||
pretrained = ["tiiuae/falcon-7b", "tiiuae/falcon-40b", "tiiuae/falcon-7b-instruct", "tiiuae/falcon-40b-instruct"]
|
||||
model_ids = ["tiiuae/falcon-7b", "tiiuae/falcon-40b", "tiiuae/falcon-7b-instruct", "tiiuae/falcon-40b-instruct"]
|
||||
|
||||
import_kwargs = {
|
||||
"torch_dtype": torch.bfloat16,
|
||||
@@ -44,15 +44,15 @@ class Falcon(openllm.LLM):
|
||||
}
|
||||
|
||||
def import_model(
|
||||
self, pretrained: str, tag: bentoml.Tag, *model_args: t.Any, tokenizer_kwds: dict[str, t.Any], **attrs: t.Any
|
||||
self, model_id: str, tag: bentoml.Tag, *model_args: t.Any, tokenizer_kwds: dict[str, t.Any], **attrs: t.Any
|
||||
) -> bentoml.Model:
|
||||
trust_remote_code = attrs.pop("trust_remote_code", True)
|
||||
torch_dtype = attrs.pop("torch_dtype", torch.bfloat16)
|
||||
device_map = attrs.pop("device_map", "auto")
|
||||
|
||||
tokenizer = transformers.AutoTokenizer.from_pretrained(pretrained)
|
||||
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)
|
||||
model = transformers.AutoModelForCausalLM.from_pretrained(
|
||||
pretrained,
|
||||
model_id,
|
||||
trust_remote_code=trust_remote_code,
|
||||
torch_dtype=torch_dtype,
|
||||
device_map=device_map,
|
||||
|
||||
@@ -16,7 +16,7 @@ from __future__ import annotations
|
||||
import openllm
|
||||
|
||||
START_FLAN_T5_COMMAND_DOCSTRING = """\
|
||||
Run a LLMServer for FLAN-T5 model and pretrained.
|
||||
Run a LLMServer for FLAN-T5 model.
|
||||
|
||||
\b
|
||||
> See more information about FLAN-T5 at [huggingface/transformers](https://huggingface.co/docs/transformers/model_doc/flan-t5)
|
||||
@@ -34,7 +34,11 @@ By default, this model will use the PyTorch model for inference. However, this m
|
||||
|
||||
\b
|
||||
FLAN-T5 Runner will use google/flan-t5-large as the default model. To change any to any other FLAN-T5
|
||||
saved pretrained, or a fine-tune FLAN-T5, provide ``OPENLLM_FLAN_T5_PRETRAINED='google/flan-t5-xxl'``
|
||||
saved pretrained, or a fine-tune FLAN-T5, provide ``OPENLLM_FLAN_T5_MODEL_ID='google/flan-t5-xxl'``
|
||||
or provide `--model-id` flag when running ``openllm start flan-t5``:
|
||||
|
||||
\b
|
||||
$ openllm start flan-t5 --model-id google/flan-t5-xxl
|
||||
"""
|
||||
|
||||
DEFAULT_PROMPT_TEMPLATE = """Answer the following question:\nQuestion: {instruction}\nAnswer:"""
|
||||
|
||||
@@ -29,9 +29,9 @@ else:
|
||||
class FlanT5(openllm.LLM):
|
||||
__openllm_internal__ = True
|
||||
|
||||
default_model = "google/flan-t5-large"
|
||||
default_id = "google/flan-t5-large"
|
||||
|
||||
pretrained = [
|
||||
model_ids = [
|
||||
"google/flan-t5-small",
|
||||
"google/flan-t5-base",
|
||||
"google/flan-t5-large",
|
||||
|
||||
@@ -24,9 +24,9 @@ from .configuration_flan_t5 import DEFAULT_PROMPT_TEMPLATE
|
||||
class FlaxFlanT5(openllm.LLM):
|
||||
__openllm_internal__ = True
|
||||
|
||||
default_model: str = "google/flan-t5-large"
|
||||
default_id: str = "google/flan-t5-large"
|
||||
|
||||
pretrained = [
|
||||
model_ids = [
|
||||
"google/flan-t5-small",
|
||||
"google/flan-t5-base",
|
||||
"google/flan-t5-large",
|
||||
|
||||
@@ -24,9 +24,9 @@ from .configuration_flan_t5 import DEFAULT_PROMPT_TEMPLATE
|
||||
class TFFlanT5(openllm.LLM):
|
||||
__openllm_internal__ = True
|
||||
|
||||
default_model: str = "google/flan-t5-large"
|
||||
default_id: str = "google/flan-t5-large"
|
||||
|
||||
pretrained = [
|
||||
model_ids = [
|
||||
"google/flan-t5-small",
|
||||
"google/flan-t5-base",
|
||||
"google/flan-t5-large",
|
||||
|
||||
@@ -38,7 +38,7 @@ class StableLMConfig(openllm.LLMConfig, name_type="lowercase", url="https://gith
|
||||
|
||||
|
||||
START_STABLELM_COMMAND_DOCSTRING = """\
|
||||
Run a LLMServer for StableLM model and pretrained.
|
||||
Run a LLMServer for StableLM model.
|
||||
|
||||
\b
|
||||
> See more information about StableLM at [stabilityai/stablelm-base-alpha-3b](https://huggingface.co/stabilityai/stablelm-base-alpha-3b)
|
||||
@@ -50,7 +50,11 @@ Currently, StableLM only supports PyTorch. Make sure ``torch`` is available in y
|
||||
|
||||
\b
|
||||
StableLM Runner will use stabilityai/stablelm-base-alpha-3b as the default model. To change any to any other StableLM
|
||||
saved pretrained, or a fine-tune StableLM, provide ``OPENLLM_STABLELM_PRETRAINED='stabilityai/stablelm-tuned-alpha-3b'``
|
||||
saved pretrained, or a fine-tune StableLM, provide ``OPENLLM_STABLELM_MODEL_ID='stabilityai/stablelm-tuned-alpha-3b'``
|
||||
or provide `--model-id` flag when running ``openllm start stablelm``:
|
||||
|
||||
\b
|
||||
$ openllm start stablelm --model-id 'stabilityai/stablelm-tuned-alpha-3b'
|
||||
"""
|
||||
|
||||
SYSTEM_PROMPT = """<|SYSTEM|># StableLM Tuned (Alpha version)
|
||||
|
||||
@@ -43,9 +43,9 @@ class StableLM(openllm.LLM):
|
||||
__openllm_internal__ = True
|
||||
|
||||
load_in_mha = True
|
||||
default_model = "stabilityai/stablelm-tuned-alpha-3b"
|
||||
default_id = "stabilityai/stablelm-tuned-alpha-3b"
|
||||
|
||||
pretrained = [
|
||||
model_ids = [
|
||||
"stabilityai/stablelm-tuned-alpha-3b",
|
||||
"stabilityai/stablelm-tuned-alpha-7b",
|
||||
"stabilityai/stablelm-base-alpha-3b",
|
||||
@@ -70,7 +70,7 @@ class StableLM(openllm.LLM):
|
||||
use_default_prompt_template: bool = True,
|
||||
**attrs: t.Any,
|
||||
) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
|
||||
if "tuned" in self._pretrained and use_default_prompt_template:
|
||||
if "tuned" in self._model_id and use_default_prompt_template:
|
||||
prompt_variables = {
|
||||
k: v
|
||||
for k, v in attrs.items()
|
||||
|
||||
@@ -42,7 +42,7 @@ class StarCoderConfig(
|
||||
|
||||
|
||||
START_STARCODER_COMMAND_DOCSTRING = """\
|
||||
Run a LLMServer for StarCoder model and pretrained.
|
||||
Run a LLMServer for StarCoder model.
|
||||
|
||||
\b
|
||||
> See more information about StarCoder at [bigcode/starcoder](https://huggingface.co/bigcode/starcoder)
|
||||
@@ -54,7 +54,11 @@ Currently, StarCoder only supports PyTorch. Make sure ``torch`` is available in
|
||||
|
||||
\b
|
||||
StarCoder Runner will use bigcode/starcoder as the default model. To change any to any other StarCoder
|
||||
saved pretrained, or a fine-tune StarCoder, provide ``OPENLLM_STARCODER_PRETRAINED='bigcode/starcoder'``
|
||||
saved pretrained, or a fine-tune StarCoder, provide ``OPENLLM_STARCODER_MODEL_ID='bigcode/starcoder'``
|
||||
or provide `--model-id` flag when running ``openllm start starcoder``:
|
||||
|
||||
\b
|
||||
$ openllm start starcoder --model-id 'bigcode/starcoder'
|
||||
"""
|
||||
|
||||
DEFAULT_PROMPT_TEMPLATE = """{instruction}"""
|
||||
|
||||
@@ -40,21 +40,21 @@ FIM_INDICATOR = "<FILL_HERE>"
|
||||
class StarCoder(openllm.LLM):
|
||||
__openllm_internal__ = True
|
||||
|
||||
default_model = "bigcode/starcoder"
|
||||
default_id = "bigcode/starcoder"
|
||||
|
||||
pretrained = ["bigcode/starcoder", "bigcode/starcoderbase"]
|
||||
model_ids = ["bigcode/starcoder", "bigcode/starcoderbase"]
|
||||
|
||||
device = torch.device("cuda")
|
||||
|
||||
def import_model(
|
||||
self,
|
||||
pretrained: str,
|
||||
model_id: str,
|
||||
tag: bentoml.Tag,
|
||||
*model_args: t.Any,
|
||||
tokenizer_kwds: dict[str, t.Any],
|
||||
**attrs: t.Any,
|
||||
) -> bentoml.Model:
|
||||
tokenizer = transformers.AutoTokenizer.from_pretrained(pretrained, **tokenizer_kwds)
|
||||
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, **tokenizer_kwds)
|
||||
tokenizer.add_special_tokens(
|
||||
{
|
||||
"additional_special_tokens": [EOD, FIM_PREFIX, FIM_MIDDLE, FIM_SUFFIX, FIM_PAD],
|
||||
@@ -62,7 +62,7 @@ class StarCoder(openllm.LLM):
|
||||
}
|
||||
)
|
||||
|
||||
model = transformers.AutoModelForCausalLM.from_pretrained(pretrained, **attrs)
|
||||
model = transformers.AutoModelForCausalLM.from_pretrained(model_id, **attrs)
|
||||
|
||||
try:
|
||||
return bentoml.transformers.save_model(tag, model, custom_objects={"tokenizer": tokenizer})
|
||||
|
||||
@@ -91,8 +91,8 @@ class ModelEnv:
|
||||
return f"OPENLLM_{self.model_name.upper()}_CONFIG"
|
||||
|
||||
@property
|
||||
def pretrained(self) -> str:
|
||||
return f"OPENLLM_{self.model_name.upper()}_PRETRAINED"
|
||||
def model_id(self) -> str:
|
||||
return f"OPENLLM_{self.model_name.upper()}_MODEL_ID"
|
||||
|
||||
@property
|
||||
def bettertransformer(self) -> str:
|
||||
|
||||
Reference in New Issue
Block a user