diff --git a/changelog.d/223.feature.md b/changelog.d/223.feature.md new file mode 100644 index 00000000..dadc2b75 --- /dev/null +++ b/changelog.d/223.feature.md @@ -0,0 +1 @@ +Running vLLM with Falcon is now supported diff --git a/openllm-python/src/openllm/__init__.py b/openllm-python/src/openllm/__init__.py index e1d45aec..20e2245c 100644 --- a/openllm-python/src/openllm/__init__.py +++ b/openllm-python/src/openllm/__init__.py @@ -114,6 +114,7 @@ else: _import_structure["models.llama"].extend(["VLLMLlama"]) _import_structure["models.opt"].extend(["VLLMOPT"]) _import_structure["models.dolly_v2"].extend(["VLLMDollyV2"]) + _import_structure["models.falcon"].extend(["VLLMFalcon"]) _import_structure["models.gpt_neox"].extend(["VLLMGPTNeoX"]) _import_structure["models.mpt"].extend(["VLLMMPT"]) _import_structure["models.stablelm"].extend(["VLLMStableLM"]) @@ -124,6 +125,7 @@ else: from .models.baichuan import VLLMBaichuan as VLLMBaichuan from .models.dolly_v2 import VLLMDollyV2 as VLLMDollyV2 from .models.gpt_neox import VLLMGPTNeoX as VLLMGPTNeoX + from .models.falcon import VLLMFalcon as VLLMFalcon from .models.llama import VLLMLlama as VLLMLlama from .models.mpt import VLLMMPT as VLLMMPT from .models.opt import VLLMOPT as VLLMOPT diff --git a/openllm-python/src/openllm/models/auto/modeling_vllm_auto.py b/openllm-python/src/openllm/models/auto/modeling_vllm_auto.py index c510441e..12778a65 100644 --- a/openllm-python/src/openllm/models/auto/modeling_vllm_auto.py +++ b/openllm-python/src/openllm/models/auto/modeling_vllm_auto.py @@ -4,7 +4,7 @@ from collections import OrderedDict from .configuration_auto import CONFIG_MAPPING_NAMES from .factory import BaseAutoLLMClass, _LazyAutoMapping -MODEL_VLLM_MAPPING_NAMES = OrderedDict([("baichuan", "VLLMBaichuan"), ("dolly_v2", "VLLMDollyV2"), ("gpt_neox", "VLLMGPTNeoX"), ("mpt", "VLLMMPT"), ("opt", "VLLMOPT"), ("stablelm", "VLLMStableLM"), ("starcoder", "VLLMStarCoder"), ("llama", "VLLMLlama")]) +MODEL_VLLM_MAPPING_NAMES = 
OrderedDict([("baichuan", "VLLMBaichuan"), ("dolly_v2", "VLLMDollyV2"), ("falcon", "VLLMFalcon"), ("gpt_neox", "VLLMGPTNeoX"), ("mpt", "VLLMMPT"), ("opt", "VLLMOPT"), ("stablelm", "VLLMStableLM"), ("starcoder", "VLLMStarCoder"), ("llama", "VLLMLlama")]) MODEL_VLLM_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_VLLM_MAPPING_NAMES) class AutoVLLM(BaseAutoLLMClass): _model_mapping: t.ClassVar = MODEL_VLLM_MAPPING diff --git a/openllm-python/src/openllm/models/falcon/__init__.py b/openllm-python/src/openllm/models/falcon/__init__.py index bfc5341b..0d6f3138 100644 --- a/openllm-python/src/openllm/models/falcon/__init__.py +++ b/openllm-python/src/openllm/models/falcon/__init__.py @@ -1,7 +1,7 @@ from __future__ import annotations import sys, typing as t from openllm.exceptions import MissingDependencyError -from openllm.utils import LazyModule, is_torch_available +from openllm.utils import LazyModule, is_torch_available, is_vllm_available _import_structure: dict[str, list[str]] = {"configuration_falcon": ["FalconConfig", "START_FALCON_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"]} if t.TYPE_CHECKING: @@ -16,5 +16,11 @@ except MissingDependencyError: pass else: _import_structure["modeling_falcon"] = ["Falcon"] if t.TYPE_CHECKING: from .modeling_falcon import Falcon as Falcon +try: + if not is_vllm_available(): raise MissingDependencyError +except MissingDependencyError: pass +else: + _import_structure["modeling_vllm_falcon"] = ["VLLMFalcon"] + if t.TYPE_CHECKING: from .modeling_vllm_falcon import VLLMFalcon as VLLMFalcon sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure) diff --git a/openllm-python/src/openllm/models/falcon/configuration_falcon.py b/openllm-python/src/openllm/models/falcon/configuration_falcon.py index 6b1c90a2..07101fb1 100644 --- a/openllm-python/src/openllm/models/falcon/configuration_falcon.py +++ b/openllm-python/src/openllm/models/falcon/configuration_falcon.py @@ -27,7 +27,9 @@ Run a LLMServer for 
FalconLM model. \b ## Usage -Currently, FalconLM only supports PyTorch. Make sure ``torch`` is available in your system. +By default, this model will use the PyTorch model for inference. However, this model also supports vLLM. + +Note that if you use vLLM, an NVIDIA GPU is required. \b FalconLM Runner will use tiiuae/falcon-7b as the default model. To change to any other FalconLM diff --git a/openllm-python/src/openllm/models/falcon/modeling_vllm_falcon.py b/openllm-python/src/openllm/models/falcon/modeling_vllm_falcon.py new file mode 100644 index 00000000..14a5af2e --- /dev/null +++ b/openllm-python/src/openllm/models/falcon/modeling_vllm_falcon.py @@ -0,0 +1,11 @@ +from __future__ import annotations +import logging, typing as t, openllm +from openllm._prompt import process_prompt +from .configuration_falcon import DEFAULT_PROMPT_TEMPLATE +if t.TYPE_CHECKING: import vllm, transformers + +logger = logging.getLogger(__name__) +class VLLMFalcon(openllm.LLM["vllm.LLMEngine", "transformers.PreTrainedTokenizerBase"]): + __openllm_internal__ = True + tokenizer_id = "local" + def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, top_k: int | None = None, num_return_sequences: int | None = None, eos_token_id: int | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "top_k": top_k, "num_return_sequences": num_return_sequences, "eos_token_id": eos_token_id, **attrs}, {} diff --git a/openllm-python/src/openllm/utils/dummy_vllm_objects.py b/openllm-python/src/openllm/utils/dummy_vllm_objects.py index f227b3ff..d1ca58c8 100644 --- a/openllm-python/src/openllm/utils/dummy_vllm_objects.py +++ b/openllm-python/src/openllm/utils/dummy_vllm_objects.py @@ -9,6 +9,9 @@ class VLLMBaichuan(metaclass=_DummyMetaclass): class VLLMDollyV2(metaclass=_DummyMetaclass):
_backends=["vllm"] def __init__(self,*param_decls:_t.Any,**attrs: _t.Any):_require_backends(self,["vllm"]) +class VLLMFalcon(metaclass=_DummyMetaclass): + _backends=["vllm","einops","xformers"] + def __init__(self,*param_decls:_t.Any,**attrs: _t.Any):_require_backends(self,["vllm","einops","xformers"]) class VLLMGPTNeoX(metaclass=_DummyMetaclass): _backends=["vllm"] def __init__(self,*param_decls:_t.Any,**attrs: _t.Any):_require_backends(self,["vllm"]) @@ -31,4 +34,4 @@ class AutoVLLM(metaclass=_DummyMetaclass): _backends=["vllm"] def __init__(self,*param_decls:_t.Any,**attrs: _t.Any):_require_backends(self,["vllm"]) MODEL_VLLM_MAPPING_NAMES:_t.Any=None -__all__:list[str]=["MODEL_VLLM_MAPPING_NAMES","AutoVLLM","VLLMBaichuan","VLLMDollyV2","VLLMGPTNeoX","VLLMMPT","VLLMOPT","VLLMStableLM","VLLMStarCoder","VLLMLlama"] +__all__:list[str]=["MODEL_VLLM_MAPPING_NAMES","AutoVLLM","VLLMBaichuan","VLLMDollyV2","VLLMFalcon","VLLMGPTNeoX","VLLMMPT","VLLMOPT","VLLMStableLM","VLLMStarCoder","VLLMLlama"]