mirror of https://github.com/bentoml/OpenLLM.git (synced 2026-03-05 07:36:15 -05:00)
feat(models): add vLLM support for Falcon (#223)
@@ -114,6 +114,7 @@ else:
   _import_structure["models.llama"].extend(["VLLMLlama"])
   _import_structure["models.opt"].extend(["VLLMOPT"])
   _import_structure["models.dolly_v2"].extend(["VLLMDollyV2"])
+  _import_structure["models.falcon"].extend(["VLLMFalcon"])
   _import_structure["models.gpt_neox"].extend(["VLLMGPTNeoX"])
   _import_structure["models.mpt"].extend(["VLLMMPT"])
   _import_structure["models.stablelm"].extend(["VLLMStableLM"])
@@ -124,6 +125,7 @@ else:
   from .models.baichuan import VLLMBaichuan as VLLMBaichuan
   from .models.dolly_v2 import VLLMDollyV2 as VLLMDollyV2
   from .models.gpt_neox import VLLMGPTNeoX as VLLMGPTNeoX
+  from .models.falcon import VLLMFalcon as VLLMFalcon
   from .models.llama import VLLMLlama as VLLMLlama
   from .models.mpt import VLLMMPT as VLLMMPT
   from .models.opt import VLLMOPT as VLLMOPT
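
Both hunks above follow the codebase's lazy-export convention: the runtime branch only records where VLLMFalcon lives in _import_structure, deferring the actual import, while the t.TYPE_CHECKING branch re-exports the name eagerly so static type checkers can see it. A minimal sketch of that convention (names simplified; not the actual OpenLLM code -- the import itself is performed later by a LazyModule, sketched further below):

from __future__ import annotations
import typing as t

# Runtime: record which submodule defines each public symbol; nothing is
# imported yet. A LazyModule resolves the import on first attribute access.
_import_structure: dict[str, list[str]] = {"models.falcon": []}
_import_structure["models.falcon"].extend(["VLLMFalcon"])

if t.TYPE_CHECKING:
  # Type checkers only: a real import, with the "X as X" re-export marking
  # the name as part of the package's public API.
  from .models.falcon import VLLMFalcon as VLLMFalcon
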
@@ -4,7 +4,7 @@ from collections import OrderedDict
 from .configuration_auto import CONFIG_MAPPING_NAMES
 from .factory import BaseAutoLLMClass, _LazyAutoMapping
 
-MODEL_VLLM_MAPPING_NAMES = OrderedDict([("baichuan", "VLLMBaichuan"), ("dolly_v2", "VLLMDollyV2"), ("gpt_neox", "VLLMGPTNeoX"), ("mpt", "VLLMMPT"), ("opt", "VLLMOPT"), ("stablelm", "VLLMStableLM"), ("starcoder", "VLLMStarCoder"), ("llama", "VLLMLlama")])
+MODEL_VLLM_MAPPING_NAMES = OrderedDict([("baichuan", "VLLMBaichuan"), ("dolly_v2", "VLLMDollyV2"), ("falcon", "VLLMFalcon"), ("gpt_neox", "VLLMGPTNeoX"), ("mpt", "VLLMMPT"), ("opt", "VLLMOPT"), ("stablelm", "VLLMStableLM"), ("starcoder", "VLLMStarCoder"), ("llama", "VLLMLlama")])
 MODEL_VLLM_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_VLLM_MAPPING_NAMES)
 class AutoVLLM(BaseAutoLLMClass):
   _model_mapping: t.ClassVar = MODEL_VLLM_MAPPING
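
MODEL_VLLM_MAPPING wires the "falcon" config name to the VLLMFalcon class without importing it up front. A rough sketch of what a lazy lookup like _LazyAutoMapping can do under the hood (the openllm.models.<type> module path and the _load helper are assumptions for illustration, not the real implementation):

import importlib
from collections import OrderedDict

MODEL_VLLM_MAPPING_NAMES = OrderedDict([("falcon", "VLLMFalcon")])

def _load(model_type: str) -> type:
  # Resolve the class name registered for this model type, importing its
  # module only now, at first lookup (module path assumed for this sketch).
  module = importlib.import_module(f"openllm.models.{model_type}")
  return getattr(module, MODEL_VLLM_MAPPING_NAMES[model_type])
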
@@ -1,7 +1,7 @@
 from __future__ import annotations
 import sys, typing as t
 from openllm.exceptions import MissingDependencyError
-from openllm.utils import LazyModule, is_torch_available
+from openllm.utils import LazyModule, is_torch_available, is_vllm_available
 
 _import_structure: dict[str, list[str]] = {"configuration_falcon": ["FalconConfig", "START_FALCON_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"]}
 if t.TYPE_CHECKING:
@@ -16,5 +16,11 @@ except MissingDependencyError: pass
 else:
   _import_structure["modeling_falcon"] = ["Falcon"]
   if t.TYPE_CHECKING: from .modeling_falcon import Falcon as Falcon
+try:
+  if not is_vllm_available(): raise MissingDependencyError
+except MissingDependencyError: pass
+else:
+  _import_structure["modeling_vllm_falcon"] = ["VLLMFalcon"]
+  if t.TYPE_CHECKING: from .modeling_vllm_falcon import VLLMFalcon as VLLMFalcon
 
 sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure)
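
The sys.modules[__name__] = LazyModule(...) assignment at the bottom swaps the package's module object for one whose attribute access performs the deferred imports recorded in _import_structure. A simplified sketch of how such a module can be built (illustrative only; OpenLLM's real LazyModule carries more machinery):

import importlib, types

class LazyModule(types.ModuleType):
  def __init__(self, name: str, file: str, import_structure: dict[str, list[str]]):
    super().__init__(name)
    self.__file__ = file
    # Map each exported attribute back to the submodule that defines it.
    self._attr_to_module = {attr: mod for mod, attrs in import_structure.items() for attr in attrs}

  def __getattr__(self, attr: str):
    # Import the owning submodule only on first access, then cache the result.
    if attr not in self._attr_to_module: raise AttributeError(attr)
    module = importlib.import_module(f".{self._attr_to_module[attr]}", self.__name__)
    value = getattr(module, attr)
    setattr(self, attr, value)
    return value
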
@@ -27,7 +27,9 @@ Run a LLMServer for FalconLM model.
 \b
 ## Usage
 
-Currently, FalconLM only supports PyTorch. Make sure ``torch`` is available in your system.
+By default, this model will use the PyTorch model for inference. However, this model also supports vLLM.
+
+Note that if you use vLLM, an NVIDIA GPU is required.
 
 \b
 FalconLM Runner will use tiiuae/falcon-7b as the default model. To change to any other FalconLM
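
As a hedged usage sketch of what the docstring describes: assuming AutoVLLM inherits a for_model constructor from BaseAutoLLMClass like the codebase's other Auto classes (an assumption; the constructor is not shown in this diff), loading Falcon on the vLLM backend might look like:

import openllm

# Assumption: for_model(...) is provided by BaseAutoLLMClass. Requires
# vLLM and an NVIDIA GPU, per the docstring above.
llm = openllm.AutoVLLM.for_model("falcon", model_id="tiiuae/falcon-7b")
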
@@ -0,0 +1,11 @@
+from __future__ import annotations
+import logging, typing as t, openllm
+from openllm._prompt import process_prompt
+from .configuration_falcon import DEFAULT_PROMPT_TEMPLATE
+if t.TYPE_CHECKING: import vllm, transformers
+
+logger = logging.getLogger(__name__)
+class VLLMFalcon(openllm.LLM["vllm.LLMEngine", "transformers.PreTrainedTokenizerBase"]):
+  __openllm_internal__ = True
+  tokenizer_id = "local"
+  def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, top_k: int | None = None, num_return_sequences: int | None = None, eos_token_id: int | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "top_k": top_k, "num_return_sequences": num_return_sequences, "eos_token_id": eos_token_id, **attrs}, {}
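
sanitize_parameters implements OpenLLM's three-part contract: it returns the (optionally templated) prompt, the kwargs to forward to generation, and the kwargs for post-processing. A tiny self-contained illustration of that return shape (the template here is a stand-in, not Falcon's real DEFAULT_PROMPT_TEMPLATE):

from __future__ import annotations

def sanitize_parameters(prompt: str, max_new_tokens: int | None = None, top_k: int | None = None, **attrs):
  template = "Below is an instruction.\n{instruction}\n"  # stand-in template
  return template.format(instruction=prompt), {"max_new_tokens": max_new_tokens, "top_k": top_k, **attrs}, {}

# Callers unpack (prompt, generate_kwargs, postprocess_kwargs):
prompt, generate_kwargs, postprocess_kwargs = sanitize_parameters("What is Falcon?", max_new_tokens=64)
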
@@ -9,6 +9,9 @@ class VLLMBaichuan(metaclass=_DummyMetaclass):
 class VLLMDollyV2(metaclass=_DummyMetaclass):
   _backends=["vllm"]
   def __init__(self,*param_decls:_t.Any,**attrs: _t.Any):_require_backends(self,["vllm"])
+class VLLMFalcon(metaclass=_DummyMetaclass):
+  _backends=["vllm","einops","xformers"]
+  def __init__(self,*param_decls:_t.Any,**attrs: _t.Any):_require_backends(self,["vllm","einops","xformers"])
 class VLLMGPTNeoX(metaclass=_DummyMetaclass):
   _backends=["vllm"]
   def __init__(self,*param_decls:_t.Any,**attrs: _t.Any):_require_backends(self,["vllm"])
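
These dummy classes are the placeholders users hit when vLLM (and, for Falcon, einops/xformers) is not installed; instantiating one raises an informative error instead of an opaque import failure. A minimal sketch of the pattern with stand-in helpers (_DummyMetaclass and _require_backends here are reconstructions for illustration, not OpenLLM's definitions):

import importlib.util, typing as _t

def _require_backends(obj: _t.Any, backends: list[str]) -> None:
  # Fail with an actionable message naming every missing backend.
  name = getattr(obj, "__name__", None) or type(obj).__name__
  missing = [b for b in backends if importlib.util.find_spec(b) is None]
  if missing: raise ImportError(f"{name} requires the following backends: {', '.join(missing)}")

class _DummyMetaclass(type):
  # Guard attribute access on the class itself as well as instantiation.
  def __getattr__(cls, attr: str) -> _t.Any:
    _require_backends(cls, cls._backends)
    raise AttributeError(attr)
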
@@ -31,4 +34,4 @@ class AutoVLLM(metaclass=_DummyMetaclass):
   _backends=["vllm"]
   def __init__(self,*param_decls:_t.Any,**attrs: _t.Any):_require_backends(self,["vllm"])
 MODEL_VLLM_MAPPING_NAMES:_t.Any=None
-__all__:list[str]=["MODEL_VLLM_MAPPING_NAMES","AutoVLLM","VLLMBaichuan","VLLMDollyV2","VLLMGPTNeoX","VLLMMPT","VLLMOPT","VLLMStableLM","VLLMStarCoder","VLLMLlama"]
+__all__:list[str]=["MODEL_VLLM_MAPPING_NAMES","AutoVLLM","VLLMBaichuan","VLLMDollyV2","VLLMFalcon","VLLMGPTNeoX","VLLMMPT","VLLMOPT","VLLMStableLM","VLLMStarCoder","VLLMLlama"]