"""OpenLLM.
|
|
|
|
An open platform for operating large language models in production. Fine-tune, serve,
|
|
deploy, and monitor any LLMs with ease.
|
|
|
|
* Built-in support for StableLM, Llama 2, Dolly, Flan-T5, Vicuna
|
|
* Option to bring your own fine-tuned LLMs
|
|
* Online Serving with HTTP, gRPC, SSE(coming soon) or custom API
|
|
* Native integration with BentoML and LangChain for custom LLM apps
|
|
"""

from __future__ import annotations

import logging as _logging, os as _os, typing as _t, warnings as _warnings, openllm_core

from pathlib import Path as _Path

from . import exceptions as exceptions, utils as utils

from openllm_core._configuration import GenerationConfig as GenerationConfig, LLMConfig as LLMConfig, SamplingParams as SamplingParams
from openllm_core._strategies import CascadingResourceStrategy as CascadingResourceStrategy, get_resource as get_resource
from openllm_core._schema import EmbeddingsOutput as EmbeddingsOutput, GenerationInput as GenerationInput, GenerationOutput as GenerationOutput, HfAgentInput as HfAgentInput, MetadataOutput as MetadataOutput, unmarshal_vllm_outputs as unmarshal_vllm_outputs
from openllm_core.config import AutoConfig as AutoConfig, CONFIG_MAPPING as CONFIG_MAPPING, CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES, BaichuanConfig as BaichuanConfig, ChatGLMConfig as ChatGLMConfig, DollyV2Config as DollyV2Config, FalconConfig as FalconConfig, FlanT5Config as FlanT5Config, GPTNeoXConfig as GPTNeoXConfig, LlamaConfig as LlamaConfig, MPTConfig as MPTConfig, OPTConfig as OPTConfig, StableLMConfig as StableLMConfig, StarCoderConfig as StarCoderConfig
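
# Logging/warning behaviour: in DEBUG mode surface everything; otherwise pre-configure
# bitsandbytes and silence known-noisy third-party warnings.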
if openllm_core.utils.DEBUG:
  openllm_core.utils.set_debug_mode(True)
  openllm_core.utils.set_quiet_mode(False)
  _logging.basicConfig(level=_logging.NOTSET)
else:
  # configuration for bitsandbytes before import
  _os.environ["BITSANDBYTES_NOWELCOME"] = _os.environ.get("BITSANDBYTES_NOWELCOME", "1")
  # NOTE: the following warnings come from bitsandbytes and are not important for users to see when DEBUG is False
  _warnings.filterwarnings("ignore", message="MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization")
  _warnings.filterwarnings("ignore", message="MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization")
  _warnings.filterwarnings("ignore", message="The installed version of bitsandbytes was compiled without GPU support.")
  # NOTE: ignore the following warning from ghapi as it is not important for users
  _warnings.filterwarnings("ignore", message="Neither GITHUB_TOKEN nor GITHUB_JWT_TOKEN found: running as unauthenticated")
_import_structure: dict[str, list[str]] = {
  "exceptions": [],
  "models": [],
  "client": [],
  "bundle": [],
  "playground": [],
  "testing": [],
  "utils": ["infer_auto_class"],
  "serialisation": ["ggml", "transformers"],
  "cli._sdk": ["start", "start_grpc", "build", "import_model", "list_models"],
  "_quantisation": ["infer_quantisation_config"],
  "_embeddings": ["GenericEmbeddingRunnable"],
  "_llm": ["LLM", "Runner", "LLMRunner", "LLMRunnable", "EmbeddingsOutput"],
  "_generation": ["StopSequenceCriteria", "StopOnTokens", "LogitsProcessorList", "StoppingCriteriaList", "prepare_logits_processor"],
  "models.auto": ["MODEL_MAPPING_NAMES", "MODEL_FLAX_MAPPING_NAMES", "MODEL_TF_MAPPING_NAMES", "MODEL_VLLM_MAPPING_NAMES"],
  "models.chatglm": [],
  "models.baichuan": [],
  "models.dolly_v2": [],
  "models.falcon": [],
  "models.flan_t5": [],
  "models.gpt_neox": [],
  "models.llama": [],
  "models.mpt": [],
  "models.opt": [],
  "models.stablelm": [],
  "models.starcoder": []
}
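
# True when this file is loaded as a compiled extension module (.pyd/.so) rather than as
# pure-Python source.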
COMPILED = _Path(__file__).suffix in (".pyd", ".so")
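
# The imports below are only evaluated by type checkers; at runtime the same names are
# served lazily through LazyModule (see the bottom of this file).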
if _t.TYPE_CHECKING:
  from . import bundle as bundle, cli as cli, client as client, models as models, playground as playground, serialisation as serialisation, testing as testing
  from ._generation import LogitsProcessorList as LogitsProcessorList, StopOnTokens as StopOnTokens, StoppingCriteriaList as StoppingCriteriaList, StopSequenceCriteria as StopSequenceCriteria, prepare_logits_processor as prepare_logits_processor
  from ._llm import LLM as LLM, EmbeddingsOutput as EmbeddingsOutput, LLMRunnable as LLMRunnable, LLMRunner as LLMRunner, Runner as Runner
  from ._quantisation import infer_quantisation_config as infer_quantisation_config
  from ._embeddings import GenericEmbeddingRunnable as GenericEmbeddingRunnable
  from .cli._sdk import build as build, import_model as import_model, list_models as list_models, start as start, start_grpc as start_grpc
  from .models.auto import MODEL_FLAX_MAPPING_NAMES as MODEL_FLAX_MAPPING_NAMES, MODEL_MAPPING_NAMES as MODEL_MAPPING_NAMES, MODEL_TF_MAPPING_NAMES as MODEL_TF_MAPPING_NAMES, MODEL_VLLM_MAPPING_NAMES as MODEL_VLLM_MAPPING_NAMES
  from .serialisation import ggml as ggml, transformers as transformers
  from .utils import infer_auto_class as infer_auto_class
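
# Each try/except block below probes for an optional backend. When the dependency is missing,
# the affected model names are registered under the matching dummy module (which reports the
# missing dependency when used); otherwise the real implementations are registered for lazy import.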
try:
  if not (openllm_core.utils.is_torch_available() and openllm_core.utils.is_cpm_kernels_available()):
    raise exceptions.MissingDependencyError
except exceptions.MissingDependencyError:
  _import_structure["utils.dummy_pt_objects"] = ["ChatGLM", "Baichuan"]
else:
  _import_structure["models.chatglm"].extend(["ChatGLM"])
  _import_structure["models.baichuan"].extend(["Baichuan"])
  if _t.TYPE_CHECKING:
    from .models.baichuan import Baichuan as Baichuan
    from .models.chatglm import ChatGLM as ChatGLM

try:
  if not (openllm_core.utils.is_torch_available() and openllm_core.utils.is_triton_available()):
    raise exceptions.MissingDependencyError
except exceptions.MissingDependencyError:
  if "utils.dummy_pt_objects" in _import_structure: _import_structure["utils.dummy_pt_objects"].extend(["MPT"])
  else: _import_structure["utils.dummy_pt_objects"] = ["MPT"]
else:
  _import_structure["models.mpt"].extend(["MPT"])
  if _t.TYPE_CHECKING: from .models.mpt import MPT as MPT

try:
  if not (openllm_core.utils.is_torch_available() and openllm_core.utils.is_einops_available()):
    raise exceptions.MissingDependencyError
except exceptions.MissingDependencyError:
  if "utils.dummy_pt_objects" in _import_structure: _import_structure["utils.dummy_pt_objects"].extend(["Falcon"])
  else: _import_structure["utils.dummy_pt_objects"] = ["Falcon"]
else:
  _import_structure["models.falcon"].extend(["Falcon"])
  if _t.TYPE_CHECKING: from .models.falcon import Falcon as Falcon

try:
  if not openllm_core.utils.is_torch_available(): raise exceptions.MissingDependencyError
except exceptions.MissingDependencyError:
  # extend (rather than overwrite) so dummy names registered by the blocks above are kept
  _dummy_pt = [name for name in dir(utils.dummy_pt_objects) if not name.startswith("_") and name not in ("ChatGLM", "Baichuan", "MPT", "Falcon", "annotations")]
  if "utils.dummy_pt_objects" in _import_structure: _import_structure["utils.dummy_pt_objects"].extend(_dummy_pt)
  else: _import_structure["utils.dummy_pt_objects"] = _dummy_pt
else:
  _import_structure["models.flan_t5"].extend(["FlanT5"])
  _import_structure["models.dolly_v2"].extend(["DollyV2"])
  _import_structure["models.starcoder"].extend(["StarCoder"])
  _import_structure["models.stablelm"].extend(["StableLM"])
  _import_structure["models.opt"].extend(["OPT"])
  _import_structure["models.gpt_neox"].extend(["GPTNeoX"])
  _import_structure["models.llama"].extend(["Llama"])
  _import_structure["models.auto"].extend(["AutoLLM", "MODEL_MAPPING"])
  if _t.TYPE_CHECKING:
    from .models.auto import MODEL_MAPPING as MODEL_MAPPING, AutoLLM as AutoLLM
    from .models.dolly_v2 import DollyV2 as DollyV2
    from .models.flan_t5 import FlanT5 as FlanT5
    from .models.gpt_neox import GPTNeoX as GPTNeoX
    from .models.llama import Llama as Llama
    from .models.opt import OPT as OPT
    from .models.stablelm import StableLM as StableLM
    from .models.starcoder import StarCoder as StarCoder

try:
  if not openllm_core.utils.is_vllm_available(): raise exceptions.MissingDependencyError
except exceptions.MissingDependencyError:
  _import_structure["utils.dummy_vllm_objects"] = [name for name in dir(utils.dummy_vllm_objects) if not name.startswith("_") and name not in ("annotations",)]
else:
  _import_structure["models.baichuan"].extend(["VLLMBaichuan"])
  _import_structure["models.llama"].extend(["VLLMLlama"])
  _import_structure["models.opt"].extend(["VLLMOPT"])
  _import_structure["models.dolly_v2"].extend(["VLLMDollyV2"])
  _import_structure["models.falcon"].extend(["VLLMFalcon"])
  _import_structure["models.gpt_neox"].extend(["VLLMGPTNeoX"])
  _import_structure["models.mpt"].extend(["VLLMMPT"])
  _import_structure["models.stablelm"].extend(["VLLMStableLM"])
  _import_structure["models.starcoder"].extend(["VLLMStarCoder"])
  _import_structure["models.auto"].extend(["AutoVLLM", "MODEL_VLLM_MAPPING"])
  if _t.TYPE_CHECKING:
    from .models.auto import MODEL_VLLM_MAPPING as MODEL_VLLM_MAPPING, AutoVLLM as AutoVLLM
    from .models.baichuan import VLLMBaichuan as VLLMBaichuan
    from .models.dolly_v2 import VLLMDollyV2 as VLLMDollyV2
    from .models.gpt_neox import VLLMGPTNeoX as VLLMGPTNeoX
    from .models.falcon import VLLMFalcon as VLLMFalcon
    from .models.llama import VLLMLlama as VLLMLlama
    from .models.mpt import VLLMMPT as VLLMMPT
    from .models.opt import VLLMOPT as VLLMOPT
    from .models.stablelm import VLLMStableLM as VLLMStableLM
    from .models.starcoder import VLLMStarCoder as VLLMStarCoder

try:
  if not openllm_core.utils.is_flax_available(): raise exceptions.MissingDependencyError
except exceptions.MissingDependencyError:
  _import_structure["utils.dummy_flax_objects"] = [name for name in dir(utils.dummy_flax_objects) if not name.startswith("_") and name not in ("annotations",)]
else:
  _import_structure["models.flan_t5"].extend(["FlaxFlanT5"])
  _import_structure["models.opt"].extend(["FlaxOPT"])
  _import_structure["models.auto"].extend(["AutoFlaxLLM", "MODEL_FLAX_MAPPING"])
  if _t.TYPE_CHECKING:
    from .models.auto import MODEL_FLAX_MAPPING as MODEL_FLAX_MAPPING, AutoFlaxLLM as AutoFlaxLLM
    from .models.flan_t5 import FlaxFlanT5 as FlaxFlanT5
    from .models.opt import FlaxOPT as FlaxOPT

try:
  if not openllm_core.utils.is_tf_available(): raise exceptions.MissingDependencyError
except exceptions.MissingDependencyError:
  _import_structure["utils.dummy_tf_objects"] = [name for name in dir(utils.dummy_tf_objects) if not name.startswith("_") and name not in ("annotations",)]
else:
  _import_structure["models.flan_t5"].extend(["TFFlanT5"])
  _import_structure["models.opt"].extend(["TFOPT"])
  _import_structure["models.auto"].extend(["AutoTFLLM", "MODEL_TF_MAPPING"])
  if _t.TYPE_CHECKING:
    from .models.auto import MODEL_TF_MAPPING as MODEL_TF_MAPPING, AutoTFLLM as AutoTFLLM
    from .models.flan_t5 import TFFlanT5 as TFFlanT5
    from .models.opt import TFOPT as TFOPT

# NOTE: update this to sys.modules[__name__] once mypy_extensions can recognize __spec__
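# extra_objects exposes COMPILED as a plain attribute, and "__openllm_migration__" maps the
# legacy LLMEmbeddings name to EmbeddingsOutput (a backwards-compatible alias handled by LazyModule).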
__lazy = openllm_core.utils.LazyModule(__name__, globals()["__file__"], _import_structure, extra_objects={"COMPILED": COMPILED, "__openllm_migration__": {"LLMEmbeddings": "EmbeddingsOutput"}})
__all__ = __lazy.__all__
__dir__ = __lazy.__dir__
__getattr__ = __lazy.__getattr__