from typing import Any, AsyncGenerator, Dict, Generic, Iterable, List, Literal, Optional, Tuple, TypedDict, Union

import attr
import torch
from peft.config import PeftConfig
from peft.peft_model import PeftModel, PeftModelForCausalLM, PeftModelForSeq2SeqLM

from bentoml import Model, Tag
from openllm_core import LLMConfig
from openllm_core._schemas import GenerationOutput
from openllm_core._typing_compat import (
  AdapterMap,
  AdapterType,
  LiteralBackend,
  LiteralDtype,
  LiteralQuantise,
  LiteralSerialisation,
  M,
  T,
)

from ._quantisation import QuantizationConfig
from ._runners import Runner

InjectedModel = Union[PeftModel, PeftModelForCausalLM, PeftModelForSeq2SeqLM]

class IdentifyingParams(TypedDict):
  configuration: str
  model_ids: str
  model_id: str

ResolvedAdapterMap = Dict[AdapterType, Dict[str, Tuple[PeftConfig, str]]]
CTranslateDtype = Literal['int8_float32', 'int8_float16', 'int8_bfloat16']
Dtype = Union[LiteralDtype, CTranslateDtype, Literal['auto', 'half', 'float']]

@attr.define(slots=True, repr=False, init=False)
class LLM(Generic[M, T]):
  _model_id: str
  _revision: Optional[str]
  _quantization_config: Optional[QuantizationConfig]
  _quantise: Optional[LiteralQuantise]
  _model_decls: Tuple[Any, ...]
  __model_attrs: Dict[str, Any]
  __tokenizer_attrs: Dict[str, Any]
  _tag: Tag
  _adapter_map: Optional[AdapterMap]
  _serialisation: LiteralSerialisation
  _local: bool

  __llm_dtype__: Dtype = ...
  __llm_torch_dtype__: Optional[torch.dtype] = ...
  __llm_config__: Optional[LLMConfig] = ...
  __llm_backend__: LiteralBackend = ...
  __llm_quantization_config__: Optional[QuantizationConfig] = ...
  __llm_runner__: Optional[Runner[M, T]] = ...
  __llm_model__: Optional[M] = ...
  __llm_tokenizer__: Optional[T] = ...
  __llm_adapter_map__: Optional[ResolvedAdapterMap] = ...
  __llm_trust_remote_code__: bool = ...

  def __repr__(self) -> str: ...
  def __init__(
    self,
    model_id: str,
    model_version: Optional[str] = ...,
    model_tag: Optional[Union[str, Tag]] = ...,
    llm_config: Optional[LLMConfig] = ...,
    backend: Optional[LiteralBackend] = ...,
    *args: Any,
    quantize: Optional[LiteralQuantise] = ...,
    quantization_config: Optional[QuantizationConfig] = ...,
    adapter_map: Optional[Dict[str, str]] = ...,
    serialisation: LiteralSerialisation = ...,
    trust_remote_code: bool = ...,
    embedded: bool = ...,
    dtype: Dtype = ...,
    low_cpu_mem_usage: bool = ...,
    **attrs: Any,
  ) -> None: ...
  @property
  def _torch_dtype(self) -> torch.dtype: ...
  @property
  def _model_attrs(self) -> Dict[str, Any]: ...
  @_model_attrs.setter
  def _model_attrs(self, model_attrs: Dict[str, Any]) -> None: ...
  @property
  def _tokenizer_attrs(self) -> Dict[str, Any]: ...
  @property
  def import_kwargs(self) -> Tuple[Dict[str, Any], Dict[str, Any]]: ...
  @property
  def trust_remote_code(self) -> bool: ...
  @property
  def model_id(self) -> str: ...
  @property
  def revision(self) -> str: ...
  @property
  def tag(self) -> Tag: ...
  @property
  def bentomodel(self) -> Model: ...
  @property
  def quantization_config(self) -> QuantizationConfig: ...
  @property
  def has_adapters(self) -> bool: ...
  @property
  def local(self) -> bool: ...
  @property
  def quantise(self) -> Optional[LiteralQuantise]: ...
  @property
  def llm_type(self) -> str: ...
  @property
  def identifying_params(self) -> IdentifyingParams: ...
  @property
  def llm_parameters(self) -> Tuple[Tuple[Tuple[Any, ...], Dict[str, Any]], Dict[str, Any]]: ...
  @property
  def config(self) -> LLMConfig: ...
  @property
  def tokenizer(self) -> T: ...
  @property
  def model(self) -> M: ...
  @property
  def runner(self) -> Runner[M, T]: ...
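  # The __llm_*__ slots above cache lazily-resolved state: `config`, `tokenizer`,
  # `model`, and `runner` are expected to materialise their resource on first
  # access and reuse the cached value afterwards (an inference from the
  # attribute layout; the stub itself makes no such guarantee).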
  @property
  def adapter_map(self) -> ResolvedAdapterMap: ...
  def prepare(
    self, adapter_type: AdapterType = ..., use_gradient_checking: bool = ..., **attrs: Any
  ) -> Tuple[InjectedModel, T]: ...
  async def generate(
    self,
    prompt: Optional[str],
    prompt_token_ids: Optional[List[int]] = ...,
    stop: Optional[Union[str, Iterable[str]]] = ...,
    stop_token_ids: Optional[List[int]] = ...,
    request_id: Optional[str] = ...,
    adapter_name: Optional[str] = ...,
    **attrs: Any,
  ) -> GenerationOutput: ...
  async def generate_iterator(
    self,
    prompt: Optional[str],
    prompt_token_ids: Optional[List[int]] = ...,
    stop: Optional[Union[str, Iterable[str]]] = ...,
    stop_token_ids: Optional[List[int]] = ...,
    request_id: Optional[str] = ...,
    adapter_name: Optional[str] = ...,
    **attrs: Any,
  ) -> AsyncGenerator[GenerationOutput, None]: ...
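# Usage sketch: a minimal example of how this async generation interface is
# typically driven. The model id, backend, and stop sequence below are
# illustrative placeholders, not values mandated by this stub. `generate`
# awaits a single finished GenerationOutput, while `generate_iterator` streams
# incremental GenerationOutput chunks.
#
#   import asyncio
#   import openllm
#
#   async def main() -> None:
#     llm = openllm.LLM('facebook/opt-125m', backend='pt')
#     # One-shot: await the fully decoded result.
#     result = await llm.generate('What is a type stub?', stop=['\n'])
#     print(result.outputs[0].text)
#     # Streaming: consume partial outputs as they are produced.
#     async for chunk in llm.generate_iterator('Explain PEP 561 briefly'):
#       print(chunk.outputs[0].text, end='', flush=True)
#
#   asyncio.run(main())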