# OpenLLM/openllm-python/src/openllm/_llm.pyi
from typing import Any, AsyncGenerator, Dict, Generic, Iterable, List, Literal, Optional, Tuple, TypedDict, Union
import attr
import torch
from peft.config import PeftConfig
from peft.peft_model import PeftModel, PeftModelForCausalLM, PeftModelForSeq2SeqLM
from bentoml import Model, Tag
from openllm_core import LLMConfig
from openllm_core._schemas import GenerationOutput
from openllm_core._typing_compat import AdapterMap, AdapterType, LiteralBackend, LiteralDtype, LiteralQuantise, LiteralSerialisation, M, T
from ._quantisation import QuantizationConfig
from ._runners import Runner
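# Union of model types produced once PEFT adapters have been injected (see LLM.prepare).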
InjectedModel = Union[PeftModel, PeftModelForCausalLM, PeftModelForSeq2SeqLM]
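# Parameters that uniquely identify an LLM instance (returned by LLM.identifying_params).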
class IdentifyingParams(TypedDict):
  configuration: str
  model_ids: str
  model_id: str
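# Fully resolved adapter mapping: adapter type -> adapter name -> (PeftConfig, adapter id or path).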
ResolvedAdapterMap = Dict[AdapterType, Dict[str, Tuple[PeftConfig, str]]]
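# Accepted dtype specifiers: torch dtypes, CTranslate2 quantised compute types,
# and the 'auto'/'half'/'float' aliases.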
CTranslateDtype = Literal['int8_float32', 'int8_float16', 'int8_bfloat16']
Dtype = Union[LiteralDtype, CTranslateDtype, Literal['auto', 'half', 'float']]
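# Core LLM abstraction, generic over the backend-specific model (M) and tokenizer (T) types.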
@attr.define(slots=True, repr=False, init=False)
class LLM(Generic[M, T]):
  _model_id: str
  _revision: Optional[str]
  _quantization_config: Optional[QuantizationConfig]
  _quantise: Optional[LiteralQuantise]
  _model_decls: Tuple[Any, ...]
  __model_attrs: Dict[str, Any]
  __tokenizer_attrs: Dict[str, Any]
  _tag: Tag
  _adapter_map: Optional[AdapterMap]
  _serialisation: LiteralSerialisation
  _local: bool
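  # Lazily resolved runtime state, exposed through the corresponding properties below.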
  __llm_dtype__: Dtype = ...
  __llm_torch_dtype__: Optional[torch.dtype] = ...
  __llm_config__: Optional[LLMConfig] = ...
  __llm_backend__: LiteralBackend = ...
  __llm_quantization_config__: Optional[QuantizationConfig] = ...
  __llm_runner__: Optional[Runner[M, T]] = ...
  __llm_model__: Optional[M] = ...
  __llm_tokenizer__: Optional[T] = ...
  __llm_adapter_map__: Optional[ResolvedAdapterMap] = ...
  __llm_trust_remote_code__: bool = ...
  def __repr__(self) -> str: ...
  def __init__(
    self,
    model_id: str,
    model_version: Optional[str] = ...,
    model_tag: Optional[Union[str, Tag]] = ...,
    llm_config: Optional[LLMConfig] = ...,
    backend: Optional[LiteralBackend] = ...,
    *args: Any,
    quantize: Optional[LiteralQuantise] = ...,
    quantization_config: Optional[QuantizationConfig] = ...,
    adapter_map: Optional[Dict[str, str]] = ...,
    serialisation: LiteralSerialisation = ...,
    trust_remote_code: bool = ...,
    embedded: bool = ...,
    dtype: Dtype = ...,
    low_cpu_mem_usage: bool = ...,
    **attrs: Any,
  ) -> None: ...
  @property
  def _torch_dtype(self) -> torch.dtype: ...
  @property
  def _model_attrs(self) -> Dict[str, Any]: ...
  @_model_attrs.setter
  def _model_attrs(self, model_attrs: Dict[str, Any]) -> None: ...
  @property
  def _tokenizer_attrs(self) -> Dict[str, Any]: ...
  @property
  def import_kwargs(self) -> Tuple[Dict[str, Any], Dict[str, Any]]: ...
  @property
  def trust_remote_code(self) -> bool: ...
  @property
  def model_id(self) -> str: ...
  @property
  def revision(self) -> str: ...
  @property
  def tag(self) -> Tag: ...
  @property
  def bentomodel(self) -> Model: ...
  @property
  def quantization_config(self) -> QuantizationConfig: ...
  @property
  def has_adapters(self) -> bool: ...
  @property
  def local(self) -> bool: ...
  @property
  def quantise(self) -> Optional[LiteralQuantise]: ...
  @property
  def llm_type(self) -> str: ...
  @property
  def identifying_params(self) -> IdentifyingParams: ...
  @property
  def llm_parameters(self) -> Tuple[Tuple[Tuple[Any, ...], Dict[str, Any]], Dict[str, Any]]: ...
  @property
  def config(self) -> LLMConfig: ...
  @property
  def tokenizer(self) -> T: ...
  @property
  def model(self) -> M: ...
  @property
  def runner(self) -> Runner[M, T]: ...
  @property
  def adapter_map(self) -> ResolvedAdapterMap: ...
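  # Injects the configured PEFT adapters into the underlying model for fine-tuning,
  # returning the wrapped model together with its tokenizer.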
  def prepare(self, adapter_type: AdapterType = ..., use_gradient_checking: bool = ..., **attrs: Any) -> Tuple[InjectedModel, T]: ...
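  # One-shot generation: resolves with the complete GenerationOutput once decoding finishes.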
  async def generate(
    self,
    prompt: Optional[str],
    prompt_token_ids: Optional[List[int]] = ...,
    stop: Optional[Union[str, Iterable[str]]] = ...,
    stop_token_ids: Optional[List[int]] = ...,
    request_id: Optional[str] = ...,
    adapter_name: Optional[str] = ...,
    **attrs: Any,
  ) -> GenerationOutput: ...
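  # Streaming generation: yields incremental GenerationOutput chunks as tokens are produced.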
  async def generate_iterator(
    self,
    prompt: Optional[str],
    prompt_token_ids: Optional[List[int]] = ...,
    stop: Optional[Union[str, Iterable[str]]] = ...,
    stop_token_ids: Optional[List[int]] = ...,
    request_id: Optional[str] = ...,
    adapter_name: Optional[str] = ...,
    **attrs: Any,
  ) -> AsyncGenerator[GenerationOutput, None]: ...
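# A minimal usage sketch against the interface above (assumptions: OpenLLM is
# installed, 'facebook/opt-125m' is an illustrative model id, and GenerationOutput
# exposes its completions via an `outputs` list with `.text` fields):
#
#   import asyncio
#   import openllm
#
#   async def main() -> None:
#     llm = openllm.LLM('facebook/opt-125m')
#     output = await llm.generate('What is the meaning of life?')
#     print(output.outputs[0].text)
#
#   asyncio.run(main())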