# Mirror of https://github.com/bentoml/OpenLLM.git (synced 2026-01-17 03:47:54 -05:00)
# 136 lines, 4.5 KiB, Python
from typing import Any, AsyncGenerator, Dict, Generic, Iterable, List, Literal, Optional, Tuple, TypedDict, Union
|
|
|
|
import attr
|
|
import torch
|
|
from peft.config import PeftConfig
|
|
from peft.peft_model import PeftModel, PeftModelForCausalLM, PeftModelForSeq2SeqLM
|
|
|
|
from bentoml import Model, Tag
|
|
from openllm_core import LLMConfig
|
|
from openllm_core._schemas import GenerationOutput
|
|
from openllm_core._typing_compat import AdapterMap, AdapterType, LiteralBackend, LiteralDtype, LiteralQuantise, LiteralSerialisation, M, T
|
|
|
|
from ._quantisation import QuantizationConfig
|
|
from ._runners import Runner
|
|
|
|
# Union of the three PEFT model wrapper classes imported above; it is the model
# half of the tuple returned by ``LLM.prepare``.
InjectedModel = Union[PeftModel, PeftModelForCausalLM, PeftModelForSeq2SeqLM]
|
|
|
|
class IdentifyingParams(TypedDict):
    """Dictionary shape returned by ``LLM.identifying_params``.

    All three keys are required and every value is a string.
    """

    # NOTE(review): presumably a serialized form of the LLM configuration -- confirm.
    configuration: str
    # NOTE(review): typed as a single str although the name is plural; presumably
    # a serialized list of model ids -- confirm against the producer.
    model_ids: str
    model_id: str
|
|
|
|
# Two-level mapping: adapter type -> string key -> (PeftConfig, str).
# NOTE(review): the inner key is presumably the adapter name and the tuple's
# str a resolved location for that adapter -- confirm against the code that
# builds this map.
ResolvedAdapterMap = Dict[AdapterType, Dict[str, Tuple[PeftConfig, str]]]
# The int8-prefixed dtype spellings. Going by the alias name these are
# presumably CTranslate2 compute types -- TODO confirm.
CTranslateDtype = Literal['int8_float32', 'int8_float16', 'int8_bfloat16']
# Every dtype spelling accepted by ``LLM``: the project-wide literal dtypes,
# the int8 variants above, and the shorthands 'auto', 'half' and 'float'.
Dtype = Union[LiteralDtype, CTranslateDtype, Literal['auto', 'half', 'float']]
|
|
|
|
@attr.define(slots=True, repr=False, init=False)
class LLM(Generic[M, T]):
    """Typed declaration of ``LLM``, generic over a model type ``M`` and a tokenizer type ``T``.

    Every body in this class is ``...``: the block declares attributes and
    signatures only. The decorator passes ``repr=False`` and ``init=False`` so
    that attrs does not generate ``__repr__``/``__init__``; both are declared
    explicitly below. ``slots=True`` makes the class slotted.
    """

    # --- Private per-instance state -------------------------------------
    _model_id: str
    _revision: Optional[str]
    _quantization_config: Optional[QuantizationConfig]
    _quantise: Optional[LiteralQuantise]
    _model_decls: Tuple[Any, ...]
    # The two double-underscore names below are name-mangled by Python
    # (``_LLM__model_attrs`` / ``_LLM__tokenizer_attrs``); they are exposed
    # through the ``_model_attrs`` / ``_tokenizer_attrs`` properties.
    __model_attrs: Dict[str, Any]
    __tokenizer_attrs: Dict[str, Any]
    _tag: Tag
    _adapter_map: Optional[AdapterMap]
    _serialisation: LiteralSerialisation
    _local: bool

    # --- ``__llm_*__`` attributes ---------------------------------------
    # ``= ...`` only marks "has a default"; no concrete value is visible here.
    # NOTE(review): several of these mirror a public property of the same
    # stem (config, runner, model, tokenizer, adapter_map) and are Optional
    # where the property is not -- presumably lazily-filled caches; confirm.
    __llm_dtype__: Dtype = ...
    __llm_torch_dtype__: Optional[torch.dtype] = ...
    __llm_config__: Optional[LLMConfig] = ...
    __llm_backend__: LiteralBackend = ...
    __llm_quantization_config__: Optional[QuantizationConfig] = ...
    __llm_runner__: Optional[Runner[M, T]] = ...
    __llm_model__: Optional[M] = ...
    __llm_tokenizer__: Optional[T] = ...
    __llm_adapter_map__: Optional[ResolvedAdapterMap] = ...
    __llm_trust_remote_code__: bool = ...

    def __repr__(self) -> str: ...

    def __init__(
        self,
        model_id: str,
        model_version: Optional[str] = ...,
        model_tag: Optional[Union[str, Tag]] = ...,
        llm_config: Optional[LLMConfig] = ...,
        backend: Optional[LiteralBackend] = ...,
        *args: Any,
        quantize: Optional[LiteralQuantise] = ...,
        quantization_config: Optional[QuantizationConfig] = ...,
        adapter_map: Optional[Dict[str, str]] = ...,
        serialisation: LiteralSerialisation = ...,
        trust_remote_code: bool = ...,
        embedded: bool = ...,
        dtype: Dtype = ...,
        low_cpu_mem_usage: bool = ...,
        **attrs: Any,
    ) -> None:
        """Declare the constructor signature.

        ``model_id`` is the only required argument. ``model_version``,
        ``model_tag``, ``llm_config`` and ``backend`` may be given positionally
        or by keyword; everything after ``*args`` is keyword-only. Extra
        positional and keyword arguments are collected in ``args`` / ``attrs``.
        """
        ...

    # --- Private accessors ----------------------------------------------
    @property
    def _torch_dtype(self) -> torch.dtype: ...

    @property
    def _model_attrs(self) -> Dict[str, Any]: ...

    # ``_model_attrs`` is the only property in this class that declares a setter.
    @_model_attrs.setter
    def _model_attrs(self, model_attrs: Dict[str, Any]) -> None: ...

    @property
    def _tokenizer_attrs(self) -> Dict[str, Any]: ...

    # --- Public read-only properties ------------------------------------
    @property
    def import_kwargs(self) -> Tuple[Dict[str, Any], Dict[str, Any]]: ...

    @property
    def trust_remote_code(self) -> bool: ...

    @property
    def model_id(self) -> str: ...

    # NOTE(review): declared as ``str`` while ``_revision`` is ``Optional[str]``;
    # confirm the implementation never returns None here.
    @property
    def revision(self) -> str: ...

    @property
    def tag(self) -> Tag: ...

    @property
    def bentomodel(self) -> Model: ...

    @property
    def quantization_config(self) -> QuantizationConfig: ...

    @property
    def has_adapters(self) -> bool: ...

    @property
    def local(self) -> bool: ...

    @property
    def quantise(self) -> Optional[LiteralQuantise]: ...

    @property
    def llm_type(self) -> str: ...

    @property
    def identifying_params(self) -> IdentifyingParams: ...

    @property
    def llm_parameters(self) -> Tuple[Tuple[Tuple[Any, ...], Dict[str, Any]], Dict[str, Any]]: ...

    @property
    def config(self) -> LLMConfig: ...

    @property
    def tokenizer(self) -> T: ...

    @property
    def model(self) -> M: ...

    @property
    def runner(self) -> Runner[M, T]: ...

    @property
    def adapter_map(self) -> ResolvedAdapterMap: ...

    # --- Methods ----------------------------------------------------------
    def prepare(self, adapter_type: AdapterType = ..., use_gradient_checking: bool = ..., **attrs: Any) -> Tuple[InjectedModel, T]:
        """Return a ``(PEFT-wrapped model, tokenizer)`` pair.

        Both named parameters have defaults; additional keyword arguments are
        collected in ``attrs``.
        """
        ...

    async def generate(
        self,
        prompt: Optional[str],
        prompt_token_ids: Optional[List[int]] = ...,
        stop: Optional[Union[str, Iterable[str]]] = ...,
        stop_token_ids: Optional[List[int]] = ...,
        request_id: Optional[str] = ...,
        adapter_name: Optional[str] = ...,
        **attrs: Any,
    ) -> GenerationOutput:
        """Coroutine that resolves to a single ``GenerationOutput``.

        ``prompt`` has no default but may be ``None``; ``stop`` accepts either
        one string or an iterable of strings. Remaining keyword arguments are
        collected in ``attrs``.
        """
        ...

    # NOTE(review): with a ``...`` body, type checkers read ``async def`` plus an
    # ``AsyncGenerator`` return type as a coroutine that *returns* a generator
    # (callers would need ``await`` before ``async for``). If the implementation
    # is itself an async generator, confirm this declaration matches it.
    async def generate_iterator(
        self,
        prompt: Optional[str],
        prompt_token_ids: Optional[List[int]] = ...,
        stop: Optional[Union[str, Iterable[str]]] = ...,
        stop_token_ids: Optional[List[int]] = ...,
        request_id: Optional[str] = ...,
        adapter_name: Optional[str] = ...,
        **attrs: Any,
    ) -> AsyncGenerator[GenerationOutput, None]:
        """Streaming counterpart of ``generate`` with the same parameters.

        Declared to produce an async generator yielding ``GenerationOutput``
        items.
        """
        ...
|