"""
|
|
This type stub file was generated by pyright.
|
|
"""
|
|
|
|
from functools import partial
|
|
from pathlib import Path
|
|
|
|
from transformers import PreTrainedTokenizerFast
|
|
|
|
class StreamingDetokenizer:
    """The streaming detokenizer interface so that we can detokenize one token at a time.

    Example usage is as follows:

        detokenizer = ...

        # Reset the tokenizer state
        detokenizer.reset()

        for token in generate(...):
            detokenizer.add_token(token.item())

            # Contains the whole text so far. Some tokens may not be
            # included yet, since it usually holds only whole words.
            detokenizer.text

            # Contains the printable segment (usually a word) since the
            # last time it was accessed.
            detokenizer.last_segment

            # Contains all the tokens added so far.
            detokenizer.tokens

        # Make sure that we detokenize any remaining tokens.
        detokenizer.finalize()

        # Now detokenizer.text should match tokenizer.decode(detokenizer.tokens).
    """

    __slots__ = ...
    def reset(self): ...
    def add_token(self, token): ...
    def finalize(self): ...
    @property
    def last_segment(self):
        """Return the last segment of readable text since the last time this property was accessed."""

class NaiveStreamingDetokenizer(StreamingDetokenizer):
    """NaiveStreamingDetokenizer relies on the underlying tokenizer
    implementation and should work with every tokenizer.

    Its complexity is O(T^2), where T is the length of the longest line,
    since it repeatedly detokenizes the same tokens until a new line is
    generated.
    """

    def __init__(self, tokenizer) -> None: ...
    def reset(self):  # -> None:
        ...
    def add_token(self, token):  # -> None:
        ...
    def finalize(self):  # -> None:
        ...
    @property
    def text(self):  # -> str:
        ...

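# A minimal sketch (not part of the generated stub) of the naive strategy the
# docstring above describes. It assumes only the standard HF ``decode`` method,
# and the class name is illustrative. The pending line is re-decoded on every
# added token, which is what makes the cost O(T^2) in the line length T.
class _NaiveDetokenizerSketch:
    def __init__(self, tokenizer):
        self._tokenizer = tokenizer
        self.reset()

    def reset(self):
        self.tokens = []
        self._text = ""          # lines already committed
        self._line_tokens = []   # tokens of the line still being decoded

    def add_token(self, token):
        self.tokens.append(token)
        self._line_tokens.append(token)
        # Re-decode the whole pending line on every token: the O(T^2) part.
        if self._tokenizer.decode(self._line_tokens).endswith("\n"):
            self._commit()

    def finalize(self):
        self._commit()

    def _commit(self):
        if self._line_tokens:
            self._text += self._tokenizer.decode(self._line_tokens)
            self._line_tokens = []

    @property
    def text(self):
        # Committed lines plus the current, still-changing line.
        return self._text + self._tokenizer.decode(self._line_tokens)
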
class SPMStreamingDetokenizer(StreamingDetokenizer):
    """A streaming detokenizer for SPM models.

    It adds tokens to the text if the next token starts with the special SPM
    underscore, which results in linear complexity.
    """

    def __init__(self, tokenizer, trim_space=...) -> None: ...
    def reset(self):  # -> None:
        ...
    def add_token(self, token):  # -> None:
        ...
    def finalize(self):  # -> None:
        ...

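# A hedged sketch of the linear-time SPM strategy described above. SPM vocabs
# mark word starts with "\u2581" (the metaspace character), so each token's
# surface text can be buffered until the next word boundary and processed
# exactly once. The vocab inversion assumes an HF tokenizer's ``get_vocab()``;
# the class name is illustrative.
class _SPMDetokenizerSketch:
    def __init__(self, tokenizer):
        # Invert the vocab into an id -> surface-string map.
        self._tokenmap = {tid: text for text, tid in tokenizer.get_vocab().items()}
        self.reset()

    def reset(self):
        self.text = ""
        self._unflushed = ""

    def add_token(self, token):
        value = self._tokenmap[token]
        if value.startswith("\u2581"):
            # A new word starts: flush the buffered one, mapping the
            # metaspace marker back to an ordinary space.
            self.text += self._unflushed.replace("\u2581", " ")
            self._unflushed = value
        else:
            self._unflushed += value

    def finalize(self):
        self.text += self._unflushed.replace("\u2581", " ")
        self._unflushed = ""
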
class BPEStreamingDetokenizer(StreamingDetokenizer):
    """A streaming detokenizer for OpenAI-style BPE models.

    It adds tokens to the text if the next token starts with a space,
    similar to the SPM detokenizer.
    """

    _byte_decoder = ...
    _space_matches = ...
    def __init__(self, tokenizer) -> None: ...
    def reset(self):  # -> None:
        ...
    def add_token(self, token):  # -> None:
        ...
    def finalize(self):  # -> None:
        ...
    @classmethod
    def make_byte_decoder(cls):  # -> None:
        """See https://github.com/openai/gpt-2/blob/master/src/encoder.py for the rationale."""

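# A sketch of the byte <-> unicode mapping that ``make_byte_decoder`` refers
# to. GPT-2-style BPE vocabularies store raw bytes as printable unicode
# characters, and the decoder inverts that mapping so decoded strings can be
# turned back into UTF-8 bytes. This follows the published GPT-2
# ``bytes_to_unicode`` logic from the linked encoder.py, not necessarily the
# exact code behind this stub.
def _make_byte_decoder_sketch():
    # Bytes that are already printable keep their own codepoint...
    byte_values = (
        list(range(ord("!"), ord("~") + 1))
        + list(range(ord("\u00a1"), ord("\u00ac") + 1))
        + list(range(ord("\u00ae"), ord("\u00ff") + 1))
    )
    codepoints = byte_values[:]
    shift = 0
    for byte in range(256):
        if byte not in byte_values:
            # ...while the rest are shifted to unused codepoints above 255.
            byte_values.append(byte)
            codepoints.append(256 + shift)
            shift += 1
    # Map each stand-in character back to its original byte value.
    return {chr(cp): byte for byte, cp in zip(byte_values, codepoints)}
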
class TokenizerWrapper:
    """A wrapper that combines an HF tokenizer and a detokenizer.

    Any attribute access other than ``detokenizer`` is forwarded to the
    Hugging Face tokenizer.
    """

    def __init__(self, tokenizer, detokenizer_class=..., eos_token_ids=...) -> None: ...
    def add_eos_token(self, token: str):  # -> None:
        ...
    @property
    def has_thinking(self):  # -> bool:
        ...
    @property
    def think_start(self):  # -> str | None:
        ...
    @property
    def think_end(self):  # -> str | None:
        ...
    @property
    def has_tool_calling(self):  # -> bool:
        ...
    @property
    def tool_call_start(self):  # -> str | None:
        ...
    @property
    def tool_call_end(self):  # -> str | None:
        ...
    @property
    def detokenizer(self):  # -> NaiveStreamingDetokenizer:
        """Get a stateful streaming detokenizer."""
    def __getattr__(self, attr):  # -> set[Any] | Any:
        ...
    def __setattr__(self, attr, value):  # -> None:
        ...

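# A hedged sketch of the forwarding behavior described in the TokenizerWrapper
# docstring, assuming plain ``__getattr__`` delegation; the class and field
# names are illustrative, not the stubbed implementation.
class _WrapperSketch:
    def __init__(self, tokenizer, detokenizer_class=NaiveStreamingDetokenizer):
        self._tokenizer = tokenizer
        self._detokenizer = detokenizer_class(tokenizer)

    @property
    def detokenizer(self):
        # Served by normal attribute lookup, so it never reaches __getattr__.
        return self._detokenizer

    def __getattr__(self, attr):
        # Invoked only when normal lookup fails, so everything else
        # (encode, decode, eos_token_id, ...) falls through to the
        # underlying HF tokenizer.
        return getattr(self._tokenizer, attr)
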
class NewlineTokenizer(PreTrainedTokenizerFast):
    """A tokenizer that replaces newlines with ``<n>`` when encoding and ``<n>`` with newlines when decoding."""

    def __init__(self, *args, **kwargs) -> None: ...
    def encode(self, text, **kwargs):  # -> list[int]:
        ...
    def encode_batch(self, texts, **kwargs): ...
    def decode(self, *args, **kwargs):  # -> str:
        ...
    def batch_decode(self, *args, **kwargs):  # -> list[str]:
        ...

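# A short sketch of the ``<n>`` round trip the NewlineTokenizer docstring
# describes, assuming plain string substitution around an ordinary HF
# tokenizer; the helper names are illustrative.
def _newline_encode_sketch(tokenizer, text):
    # Map literal newlines to the <n> marker before tokenizing.
    return tokenizer.encode(text.replace("\n", "<n>"))

def _newline_decode_sketch(tokenizer, token_ids):
    # Map the <n> marker back to newlines after detokenizing.
    return tokenizer.decode(token_ids).replace("<n>", "\n")
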
def load_tokenizer(
    model_path: Path,
    tokenizer_config_extra=...,
    return_tokenizer=...,
    eos_token_ids=...,
) -> (
    TokenizerWrapper
    | type[SPMStreamingDetokenizer]
    | partial[SPMStreamingDetokenizer]
    | type[BPEStreamingDetokenizer]
    | type[NaiveStreamingDetokenizer]
):
    """Load a Hugging Face tokenizer and try to infer the type of streaming
    detokenizer to use.

    Note: to use a fast streaming detokenizer, pass a local file path rather
    than a Hugging Face repo ID.
    """

def no_bos_or_eos(sequence: list, bos: int, eos: int) -> list: ...
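
# A hedged usage sketch for ``load_tokenizer``: the model directory below is
# hypothetical, and with default arguments the return value is expected to be
# a TokenizerWrapper that can drive a streaming detokenizer as shown in the
# StreamingDetokenizer docstring above.
def _load_tokenizer_usage_sketch():
    tokenizer = load_tokenizer(Path("./my-local-model"))
    detokenizer = tokenizer.detokenizer
    detokenizer.reset()
    for token in tokenizer.encode("hello world"):
        detokenizer.add_token(token)
    detokenizer.finalize()
    return detokenizer.text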