mirror of
https://github.com/bentoml/OpenLLM.git
synced 2026-03-10 02:57:37 -04:00
revert: "ci: pre-commit autoupdate [pre-commit.ci] (#931)"
This reverts commit 7b00c84c2a.
This commit is contained in:
@@ -9,16 +9,10 @@ else:
|
||||
# configuration for bitsandbytes before import
|
||||
_os.environ['BITSANDBYTES_NOWELCOME'] = _os.environ.get('BITSANDBYTES_NOWELCOME', '1')
|
||||
# NOTE: The following warnings come from bitsandbytes and are probably not important for users to see when DEBUG is False
|
||||
_warnings.filterwarnings(
|
||||
'ignore', message='MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization'
|
||||
)
|
||||
_warnings.filterwarnings(
|
||||
'ignore', message='MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization'
|
||||
)
|
||||
_warnings.filterwarnings('ignore', message='MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization')
|
||||
_warnings.filterwarnings('ignore', message='MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization')
|
||||
_warnings.filterwarnings('ignore', message='The installed version of bitsandbytes was compiled without GPU support.')
|
||||
_warnings.filterwarnings(
|
||||
'ignore', message='Neither GITHUB_TOKEN nor GITHUB_JWT_TOKEN found: running as unauthenticated'
|
||||
)
|
||||
_warnings.filterwarnings('ignore', message='Neither GITHUB_TOKEN nor GITHUB_JWT_TOKEN found: running as unauthenticated')
|
||||
COMPILED = _pathlib.Path(__file__).suffix in ('.pyd', '.so')
|
||||
__lazy = utils.LazyModule( # NOTE: update this to sys.modules[__name__] once mypy_extensions can recognize __spec__
|
||||
__name__,
|
||||
|
||||
@@ -11,48 +11,14 @@ Fine-tune, serve, deploy, and monitor any LLMs with ease.
|
||||
"""
|
||||
|
||||
# update-config-stubs.py: import stubs start
|
||||
from openlm_core.config import (
|
||||
CONFIG_MAPPING as CONFIG_MAPPING,
|
||||
CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES,
|
||||
AutoConfig as AutoConfig,
|
||||
BaichuanConfig as BaichuanConfig,
|
||||
ChatGLMConfig as ChatGLMConfig,
|
||||
DollyV2Config as DollyV2Config,
|
||||
FalconConfig as FalconConfig,
|
||||
FlanT5Config as FlanT5Config,
|
||||
GPTNeoXConfig as GPTNeoXConfig,
|
||||
LlamaConfig as LlamaConfig,
|
||||
MistralConfig as MistralConfig,
|
||||
MixtralConfig as MixtralConfig,
|
||||
MPTConfig as MPTConfig,
|
||||
OPTConfig as OPTConfig,
|
||||
PhiConfig as PhiConfig,
|
||||
QwenConfig as QwenConfig,
|
||||
StableLMConfig as StableLMConfig,
|
||||
StarCoderConfig as StarCoderConfig,
|
||||
YiConfig as YiConfig,
|
||||
)
|
||||
from openlm_core.config import CONFIG_MAPPING as CONFIG_MAPPING, CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES, AutoConfig as AutoConfig, BaichuanConfig as BaichuanConfig, ChatGLMConfig as ChatGLMConfig, DollyV2Config as DollyV2Config, FalconConfig as FalconConfig, FlanT5Config as FlanT5Config, GPTNeoXConfig as GPTNeoXConfig, LlamaConfig as LlamaConfig, MistralConfig as MistralConfig, MixtralConfig as MixtralConfig, MPTConfig as MPTConfig, OPTConfig as OPTConfig, PhiConfig as PhiConfig, QwenConfig as QwenConfig, StableLMConfig as StableLMConfig, StarCoderConfig as StarCoderConfig, YiConfig as YiConfig
|
||||
# update-config-stubs.py: import stubs stop
|
||||
|
||||
from openllm_cli._sdk import build as build, import_model as import_model, list_models as list_models, start as start
|
||||
from openllm_core._configuration import (
|
||||
GenerationConfig as GenerationConfig,
|
||||
LLMConfig as LLMConfig,
|
||||
SamplingParams as SamplingParams,
|
||||
)
|
||||
from openllm_core._schemas import (
|
||||
GenerationInput as GenerationInput,
|
||||
GenerationOutput as GenerationOutput,
|
||||
MetadataOutput as MetadataOutput,
|
||||
)
|
||||
from openllm_core._configuration import GenerationConfig as GenerationConfig, LLMConfig as LLMConfig, SamplingParams as SamplingParams
|
||||
from openllm_core._schemas import GenerationInput as GenerationInput, GenerationOutput as GenerationOutput, MetadataOutput as MetadataOutput
|
||||
|
||||
from . import (
|
||||
bundle as bundle,
|
||||
client as client,
|
||||
exceptions as exceptions,
|
||||
serialisation as serialisation,
|
||||
utils as utils,
|
||||
)
|
||||
from . import bundle as bundle, client as client, exceptions as exceptions, serialisation as serialisation, utils as utils
|
||||
from ._deprecated import Runner as Runner
|
||||
from ._llm import LLM as LLM
|
||||
from ._quantisation import infer_quantisation_config as infer_quantisation_config
|
||||
|
||||
@@ -19,9 +19,7 @@ def Runner(
|
||||
if llm_config is None:
|
||||
llm_config = openllm.AutoConfig.for_model(model_name)
|
||||
if not ensure_available:
|
||||
logger.warning(
|
||||
"'ensure_available=False' won't have any effect as LLM will always check to download the model on initialisation."
|
||||
)
|
||||
logger.warning("'ensure_available=False' won't have any effect as LLM will always check to download the model on initialisation.")
|
||||
model_id = attrs.get('model_id', os.getenv('OPENLLM_MODEL_ID', llm_config['default_id']))
|
||||
warnings.warn(
|
||||
f"""\
|
||||
@@ -42,13 +40,8 @@ def Runner(
|
||||
attrs.update({
|
||||
'model_id': model_id,
|
||||
'quantize': getenv('QUANTIZE', var=['QUANTISE'], default=attrs.get('quantize', None)), #
|
||||
'serialisation': getenv(
|
||||
'serialization', default=attrs.get('serialisation', llm_config['serialisation']), var=['SERIALISATION']
|
||||
),
|
||||
'serialisation': getenv('serialization', default=attrs.get('serialisation', llm_config['serialisation']), var=['SERIALISATION']),
|
||||
})
|
||||
return openllm.LLM(
|
||||
backend=first_not_none(backend, default='vllm' if is_vllm_available() else 'pt'),
|
||||
llm_config=llm_config,
|
||||
embedded=init_local,
|
||||
**attrs,
|
||||
backend=first_not_none(backend, default='vllm' if is_vllm_available() else 'pt'), llm_config=llm_config, embedded=init_local, **attrs
|
||||
).runner
|
||||
|
||||
@@ -47,9 +47,7 @@ ResolvedAdapterMap = t.Dict[AdapterType, t.Dict[str, t.Tuple['PeftConfig', str]]
|
||||
|
||||
@attr.define(slots=False, repr=False, init=False)
|
||||
class LLM(t.Generic[M, T]):
|
||||
async def generate(
|
||||
self, prompt, prompt_token_ids=None, stop=None, stop_token_ids=None, request_id=None, adapter_name=None, **attrs
|
||||
):
|
||||
async def generate(self, prompt, prompt_token_ids=None, stop=None, stop_token_ids=None, request_id=None, adapter_name=None, **attrs):
|
||||
if adapter_name is not None and self.__llm_backend__ != 'pt':
|
||||
raise NotImplementedError(f'Adapter is not supported with {self.__llm_backend__}.')
|
||||
config = self.config.model_construct_env(**attrs)
|
||||
@@ -64,15 +62,10 @@ class LLM(t.Generic[M, T]):
|
||||
raise RuntimeError('No result is returned.')
|
||||
return final_result.with_options(
|
||||
prompt=prompt,
|
||||
outputs=[
|
||||
output.with_options(text=''.join(texts[output.index]), token_ids=token_ids[output.index])
|
||||
for output in final_result.outputs
|
||||
],
|
||||
outputs=[output.with_options(text=''.join(texts[output.index]), token_ids=token_ids[output.index]) for output in final_result.outputs],
|
||||
)
|
||||
|
||||
async def generate_iterator(
|
||||
self, prompt, prompt_token_ids=None, stop=None, stop_token_ids=None, request_id=None, adapter_name=None, **attrs
|
||||
):
|
||||
async def generate_iterator(self, prompt, prompt_token_ids=None, stop=None, stop_token_ids=None, request_id=None, adapter_name=None, **attrs):
|
||||
from bentoml._internal.runner.runner_handle import DummyRunnerHandle
|
||||
|
||||
if adapter_name is not None and self.__llm_backend__ != 'pt':
|
||||
@@ -137,9 +130,7 @@ class LLM(t.Generic[M, T]):
|
||||
# The below are mainly for internal implementation that you don't have to worry about.
|
||||
_model_id: str
|
||||
_revision: t.Optional[str] #
|
||||
_quantization_config: t.Optional[
|
||||
t.Union[transformers.BitsAndBytesConfig, transformers.GPTQConfig, transformers.AwqConfig]
|
||||
]
|
||||
_quantization_config: t.Optional[t.Union[transformers.BitsAndBytesConfig, transformers.GPTQConfig, transformers.AwqConfig]]
|
||||
_quantise: t.Optional[LiteralQuantise]
|
||||
_model_decls: t.Tuple[t.Any, ...]
|
||||
__model_attrs: t.Dict[str, t.Any] #
|
||||
@@ -155,9 +146,7 @@ class LLM(t.Generic[M, T]):
|
||||
__llm_torch_dtype__: 'torch.dtype' = None
|
||||
__llm_config__: t.Optional[LLMConfig] = None
|
||||
__llm_backend__: LiteralBackend = None
|
||||
__llm_quantization_config__: t.Optional[
|
||||
t.Union[transformers.BitsAndBytesConfig, transformers.GPTQConfig, transformers.AwqConfig]
|
||||
] = None
|
||||
__llm_quantization_config__: t.Optional[t.Union[transformers.BitsAndBytesConfig, transformers.GPTQConfig, transformers.AwqConfig]] = None
|
||||
__llm_runner__: t.Optional[Runner[M, T]] = None
|
||||
__llm_model__: t.Optional[M] = None
|
||||
__llm_tokenizer__: t.Optional[T] = None
|
||||
@@ -188,9 +177,7 @@ class LLM(t.Generic[M, T]):
|
||||
torch_dtype = attrs.pop('torch_dtype', None) # backward compatible
|
||||
if torch_dtype is not None:
|
||||
warnings.warn(
|
||||
'The argument "torch_dtype" is deprecated and will be removed in the future. Please use "dtype" instead.',
|
||||
DeprecationWarning,
|
||||
stacklevel=3,
|
||||
'The argument "torch_dtype" is deprecated and will be removed in the future. Please use "dtype" instead.', DeprecationWarning, stacklevel=3
|
||||
)
|
||||
dtype = torch_dtype
|
||||
_local = False
|
||||
@@ -246,19 +233,13 @@ class LLM(t.Generic[M, T]):
|
||||
|
||||
class _Quantise:
|
||||
@staticmethod
|
||||
def pt(llm: LLM, quantise=None):
|
||||
return quantise
|
||||
|
||||
def pt(llm: LLM, quantise=None): return quantise
|
||||
@staticmethod
|
||||
def vllm(llm: LLM, quantise=None):
|
||||
return quantise
|
||||
|
||||
def vllm(llm: LLM, quantise=None): return quantise
|
||||
@staticmethod
|
||||
def ctranslate(llm: LLM, quantise=None):
|
||||
if quantise in {'int4', 'awq', 'gptq', 'squeezellm'}:
|
||||
raise ValueError(f"Quantisation '{quantise}' is not supported for backend 'ctranslate'")
|
||||
if quantise == 'int8':
|
||||
quantise = 'int8_float16' if llm._has_gpus else 'int8_float32'
|
||||
if quantise in {'int4', 'awq', 'gptq', 'squeezellm'}: raise ValueError(f"Quantisation '{quantise}' is not supported for backend 'ctranslate'")
|
||||
if quantise == 'int8': quantise = 'int8_float16' if llm._has_gpus else 'int8_float32'
|
||||
return quantise
|
||||
|
||||
@apply(lambda val: tuple(str.lower(i) if i else i for i in val))
|
||||
@@ -266,15 +247,10 @@ class LLM(t.Generic[M, T]):
|
||||
model_id, *maybe_revision = model_id.rsplit(':')
|
||||
if len(maybe_revision) > 0:
|
||||
if model_version is not None:
|
||||
logger.warning(
|
||||
"revision is specified (%s). 'model_version=%s' will be ignored.", maybe_revision[0], model_version
|
||||
)
|
||||
logger.warning("revision is specified (%s). 'model_version=%s' will be ignored.", maybe_revision[0], model_version)
|
||||
model_version = maybe_revision[0]
|
||||
if validate_is_path(model_id):
|
||||
model_id, model_version = (
|
||||
resolve_filepath(model_id),
|
||||
first_not_none(model_version, default=generate_hash_from_file(model_id)),
|
||||
)
|
||||
model_id, model_version = resolve_filepath(model_id), first_not_none(model_version, default=generate_hash_from_file(model_id))
|
||||
return f'{backend}-{normalise_model_name(model_id)}', model_version
|
||||
|
||||
@functools.cached_property
|
||||
@@ -283,11 +259,9 @@ class LLM(t.Generic[M, T]):
|
||||
from cuda import cuda
|
||||
|
||||
err, *_ = cuda.cuInit(0)
|
||||
if err != cuda.CUresult.CUDA_SUCCESS:
|
||||
raise RuntimeError('Failed to initialise CUDA runtime binding.')
|
||||
if err != cuda.CUresult.CUDA_SUCCESS: raise RuntimeError('Failed to initialise CUDA runtime binding.')
|
||||
err, _ = cuda.cuDeviceGetCount()
|
||||
if err != cuda.CUresult.CUDA_SUCCESS:
|
||||
raise RuntimeError('Failed to get CUDA device count.')
|
||||
if err != cuda.CUresult.CUDA_SUCCESS: raise RuntimeError('Failed to get CUDA device count.')
|
||||
return True
|
||||
except (ImportError, RuntimeError):
|
||||
return False
|
||||
@@ -299,9 +273,7 @@ class LLM(t.Generic[M, T]):
|
||||
_map = _torch_dtype_mapping()
|
||||
if not isinstance(self.__llm_torch_dtype__, torch.dtype):
|
||||
try:
|
||||
hf_config = transformers.AutoConfig.from_pretrained(
|
||||
self.bentomodel.path, trust_remote_code=self.trust_remote_code
|
||||
)
|
||||
hf_config = transformers.AutoConfig.from_pretrained(self.bentomodel.path, trust_remote_code=self.trust_remote_code)
|
||||
except OpenLLMException:
|
||||
hf_config = transformers.AutoConfig.from_pretrained(self.model_id, trust_remote_code=self.trust_remote_code)
|
||||
config_dtype = getattr(hf_config, 'torch_dtype', None)
|
||||
@@ -332,9 +304,7 @@ class LLM(t.Generic[M, T]):
|
||||
return {**self.import_kwargs[1], **self.__tokenizer_attrs}
|
||||
|
||||
def _cascade_backend(self) -> LiteralBackend:
|
||||
logger.warning(
|
||||
'It is recommended to specify the backend explicitly. Cascading backend might lead to unexpected behaviour.'
|
||||
)
|
||||
logger.warning('It is recommended to specify the backend explicitly. Cascading backend might lead to unexpected behaviour.')
|
||||
if self._has_gpus:
|
||||
if is_vllm_available():
|
||||
return 'vllm'
|
||||
@@ -369,10 +339,7 @@ class LLM(t.Generic[M, T]):
|
||||
|
||||
@property
|
||||
def import_kwargs(self):
|
||||
return {'device_map': 'auto' if self._has_gpus else None, 'torch_dtype': self._torch_dtype}, {
|
||||
'padding_side': 'left',
|
||||
'truncation_side': 'left',
|
||||
}
|
||||
return {'device_map': 'auto' if self._has_gpus else None, 'torch_dtype': self._torch_dtype}, {'padding_side': 'left', 'truncation_side': 'left'}
|
||||
|
||||
@property
|
||||
def trust_remote_code(self):
|
||||
@@ -405,9 +372,7 @@ class LLM(t.Generic[M, T]):
|
||||
if self._quantization_config is not None:
|
||||
self.__llm_quantization_config__ = self._quantization_config
|
||||
elif self._quantise is not None:
|
||||
self.__llm_quantization_config__, self._model_attrs = infer_quantisation_config(
|
||||
self, self._quantise, **self._model_attrs
|
||||
)
|
||||
self.__llm_quantization_config__, self._model_attrs = infer_quantisation_config(self, self._quantise, **self._model_attrs)
|
||||
else:
|
||||
raise ValueError("Either 'quantization_config' or 'quantise' must be specified.")
|
||||
return self.__llm_quantization_config__
|
||||
@@ -462,11 +427,7 @@ class LLM(t.Generic[M, T]):
|
||||
|
||||
model = get_peft_model(
|
||||
prepare_model_for_kbit_training(self.model, use_gradient_checkpointing=use_gradient_checking),
|
||||
self.config['fine_tune_strategies']
|
||||
.get(adapter_type, self.config.make_fine_tune_config(adapter_type))
|
||||
.train()
|
||||
.with_config(**attrs)
|
||||
.build(),
|
||||
self.config['fine_tune_strategies'].get(adapter_type, self.config.make_fine_tune_config(adapter_type)).train().with_config(**attrs).build(),
|
||||
)
|
||||
if DEBUG:
|
||||
model.print_trainable_parameters()
|
||||
@@ -486,10 +447,7 @@ class LLM(t.Generic[M, T]):
|
||||
if self.__llm_adapter_map__ is None:
|
||||
_map: ResolvedAdapterMap = {k: {} for k in self._adapter_map}
|
||||
for adapter_type, adapter_tuple in self._adapter_map.items():
|
||||
base = first_not_none(
|
||||
self.config['fine_tune_strategies'].get(adapter_type),
|
||||
default=self.config.make_fine_tune_config(adapter_type),
|
||||
)
|
||||
base = first_not_none(self.config['fine_tune_strategies'].get(adapter_type), default=self.config.make_fine_tune_config(adapter_type))
|
||||
for adapter in adapter_tuple:
|
||||
_map[adapter_type][adapter.name] = (base.with_config(**adapter.config).build(), adapter.adapter_id)
|
||||
self.__llm_adapter_map__ = _map
|
||||
@@ -504,9 +462,7 @@ class LLM(t.Generic[M, T]):
|
||||
import torch
|
||||
|
||||
loaded_in_kbit = (
|
||||
getattr(model, 'is_loaded_in_8bit', False)
|
||||
or getattr(model, 'is_loaded_in_4bit', False)
|
||||
or getattr(model, 'is_quantized', False)
|
||||
getattr(model, 'is_loaded_in_8bit', False) or getattr(model, 'is_loaded_in_4bit', False) or getattr(model, 'is_quantized', False)
|
||||
)
|
||||
if torch.cuda.is_available() and torch.cuda.device_count() == 1 and not loaded_in_kbit:
|
||||
try:
|
||||
@@ -528,9 +484,7 @@ class LLM(t.Generic[M, T]):
|
||||
if self.__llm_config__ is None:
|
||||
if self.__llm_backend__ == 'ctranslate':
|
||||
try:
|
||||
config = transformers.AutoConfig.from_pretrained(
|
||||
self.bentomodel.path_of('/hf'), trust_remote_code=self.trust_remote_code
|
||||
)
|
||||
config = transformers.AutoConfig.from_pretrained(self.bentomodel.path_of('/hf'), trust_remote_code=self.trust_remote_code)
|
||||
except OpenLLMException:
|
||||
config = transformers.AutoConfig.from_pretrained(self.model_id, trust_remote_code=self.trust_remote_code)
|
||||
for architecture in config.architectures:
|
||||
@@ -563,18 +517,12 @@ def _torch_dtype_mapping() -> dict[str, torch.dtype]:
|
||||
|
||||
|
||||
def normalise_model_name(name: str) -> str:
|
||||
return (
|
||||
os.path.basename(resolve_filepath(name))
|
||||
if validate_is_path(name)
|
||||
else inflection.dasherize(name.replace('/', '--'))
|
||||
)
|
||||
return os.path.basename(resolve_filepath(name)) if validate_is_path(name) else inflection.dasherize(name.replace('/', '--'))
|
||||
|
||||
|
||||
def convert_peft_config_type(adapter_map: dict[str, str]) -> AdapterMap:
|
||||
if not is_peft_available():
|
||||
raise RuntimeError(
|
||||
"LoRA adapter requires 'peft' to be installed. Make sure to do 'pip install \"openllm[fine-tune]\"'"
|
||||
)
|
||||
raise RuntimeError("LoRA adapter requires 'peft' to be installed. Make sure to do 'pip install \"openllm[fine-tune]\"'")
|
||||
from huggingface_hub import hf_hub_download
|
||||
|
||||
resolved: AdapterMap = {}
|
||||
|
||||
@@ -8,16 +8,7 @@ from peft.peft_model import PeftModel, PeftModelForCausalLM, PeftModelForSeq2Seq
|
||||
from bentoml import Model, Tag
|
||||
from openllm_core import LLMConfig
|
||||
from openllm_core._schemas import GenerationOutput
|
||||
from openllm_core._typing_compat import (
|
||||
AdapterMap,
|
||||
AdapterType,
|
||||
LiteralBackend,
|
||||
LiteralDtype,
|
||||
LiteralQuantise,
|
||||
LiteralSerialisation,
|
||||
M,
|
||||
T,
|
||||
)
|
||||
from openllm_core._typing_compat import AdapterMap, AdapterType, LiteralBackend, LiteralDtype, LiteralQuantise, LiteralSerialisation, M, T
|
||||
|
||||
from ._quantisation import QuantizationConfig
|
||||
from ._runners import Runner
|
||||
@@ -121,9 +112,7 @@ class LLM(Generic[M, T]):
|
||||
def runner(self) -> Runner[M, T]: ...
|
||||
@property
|
||||
def adapter_map(self) -> ResolvedAdapterMap: ...
|
||||
def prepare(
|
||||
self, adapter_type: AdapterType = ..., use_gradient_checking: bool = ..., **attrs: Any
|
||||
) -> Tuple[InjectedModel, T]: ...
|
||||
def prepare(self, adapter_type: AdapterType = ..., use_gradient_checking: bool = ..., **attrs: Any) -> Tuple[InjectedModel, T]: ...
|
||||
async def generate(
|
||||
self,
|
||||
prompt: Optional[str],
|
||||
|
||||
@@ -83,25 +83,19 @@ def infer_quantisation_config(llm, quantise, **attrs):
|
||||
|
||||
# NOTE: Quantization setup. 'quantize' is an openllm.LLM feature that lets us quantize the model with bitsandbytes or quantization-aware training.
|
||||
if not is_bitsandbytes_available():
|
||||
raise RuntimeError(
|
||||
'Quantization requires bitsandbytes to be installed. Make sure to install OpenLLM with \'pip install "openllm[fine-tune]"\''
|
||||
)
|
||||
raise RuntimeError('Quantization requires bitsandbytes to be installed. Make sure to install OpenLLM with \'pip install "openllm[fine-tune]"\'')
|
||||
if quantise == 'int8':
|
||||
quantisation_config = create_int8_config(int8_skip_modules)
|
||||
elif quantise == 'int4':
|
||||
quantisation_config = create_int4_config()
|
||||
elif quantise == 'gptq':
|
||||
if not is_autogptq_available():
|
||||
raise MissingDependencyError(
|
||||
"GPTQ requires 'auto-gptq' and 'optimum>=0.12' to be installed. Do it with 'pip install \"openllm[gptq]\"'"
|
||||
)
|
||||
raise MissingDependencyError("GPTQ requires 'auto-gptq' and 'optimum>=0.12' to be installed. Do it with 'pip install \"openllm[gptq]\"'")
|
||||
else:
|
||||
quantisation_config = create_gptq_config()
|
||||
elif quantise == 'awq':
|
||||
if not is_autoawq_available():
|
||||
raise MissingDependencyError(
|
||||
"AWQ requires 'auto-awq' to be installed. Do it with 'pip install \"openllm[awq]\"'."
|
||||
)
|
||||
raise MissingDependencyError("AWQ requires 'auto-awq' to be installed. Do it with 'pip install \"openllm[awq]\"'.")
|
||||
else:
|
||||
quantisation_config = create_awq_config()
|
||||
else:
|
||||
|
||||
@@ -9,18 +9,10 @@ from ._llm import LLM
|
||||
QuantizationConfig = Union[BitsAndBytesConfig, GPTQConfig, AwqConfig]
|
||||
|
||||
@overload
|
||||
def infer_quantisation_config(
|
||||
self: LLM[M, T], quantise: Literal['int8', 'int4'], **attrs: Any
|
||||
) -> tuple[BitsAndBytesConfig, Dict[str, Any]]: ...
|
||||
def infer_quantisation_config(self: LLM[M, T], quantise: Literal['int8', 'int4'], **attrs: Any) -> tuple[BitsAndBytesConfig, Dict[str, Any]]: ...
|
||||
@overload
|
||||
def infer_quantisation_config(
|
||||
self: LLM[M, T], quantise: Literal['gptq'], **attrs: Any
|
||||
) -> tuple[GPTQConfig, Dict[str, Any]]: ...
|
||||
def infer_quantisation_config(self: LLM[M, T], quantise: Literal['gptq'], **attrs: Any) -> tuple[GPTQConfig, Dict[str, Any]]: ...
|
||||
@overload
|
||||
def infer_quantisation_config(
|
||||
self: LLM[M, T], quantise: Literal['awq'], **attrs: Any
|
||||
) -> tuple[AwqConfig, Dict[str, Any]]: ...
|
||||
def infer_quantisation_config(self: LLM[M, T], quantise: Literal['awq'], **attrs: Any) -> tuple[AwqConfig, Dict[str, Any]]: ...
|
||||
@overload
|
||||
def infer_quantisation_config(
|
||||
self: LLM[M, T], quantise: LiteralQuantise, **attrs: Any
|
||||
) -> tuple[QuantizationConfig, Dict[str, Any]]: ...
|
||||
def infer_quantisation_config(self: LLM[M, T], quantise: LiteralQuantise, **attrs: Any) -> tuple[QuantizationConfig, Dict[str, Any]]: ...
|
||||
|
||||
@@ -46,10 +46,7 @@ def runner(llm: openllm.LLM[M, T]) -> Runner[M, T]:
|
||||
(
|
||||
'runner_methods',
|
||||
{
|
||||
method.name: {
|
||||
'batchable': method.config.batchable,
|
||||
'batch_dim': method.config.batch_dim if method.config.batchable else None,
|
||||
}
|
||||
method.name: {'batchable': method.config.batchable, 'batch_dim': method.config.batch_dim if method.config.batchable else None}
|
||||
for method in _.runner_methods
|
||||
},
|
||||
),
|
||||
@@ -114,7 +111,6 @@ class CTranslateRunnable(bentoml.Runnable):
|
||||
).model_dump_json()
|
||||
yield bentoml.io.SSE(out).marshal()
|
||||
|
||||
|
||||
@registry
|
||||
class vLLMRunnable(bentoml.Runnable):
|
||||
SUPPORTED_RESOURCES = ('nvidia.com/gpu', 'amd.com/gpu', 'cpu')
|
||||
@@ -130,9 +126,7 @@ class vLLMRunnable(bentoml.Runnable):
|
||||
if dev >= 2:
|
||||
num_gpus = min(dev // 2 * 2, dev)
|
||||
quantise = llm.quantise if llm.quantise and llm.quantise in {'gptq', 'awq', 'squeezellm'} else None
|
||||
dtype = (
|
||||
torch.float16 if quantise == 'gptq' else llm._torch_dtype
|
||||
) # NOTE: quantise GPTQ doesn't support bfloat16 yet.
|
||||
dtype = torch.float16 if quantise == 'gptq' else llm._torch_dtype # NOTE: quantise GPTQ doesn't support bfloat16 yet.
|
||||
try:
|
||||
self.model = vllm.AsyncLLMEngine.from_engine_args(
|
||||
vllm.AsyncEngineArgs(
|
||||
@@ -151,9 +145,7 @@ class vLLMRunnable(bentoml.Runnable):
|
||||
)
|
||||
except Exception as err:
|
||||
traceback.print_exc()
|
||||
raise openllm.exceptions.OpenLLMException(
|
||||
f'Failed to initialise vLLMEngine due to the following error:\n{err}'
|
||||
) from err
|
||||
raise openllm.exceptions.OpenLLMException(f'Failed to initialise vLLMEngine due to the following error:\n{err}') from err
|
||||
|
||||
@bentoml.Runnable.method(batchable=False)
|
||||
async def generate_iterator(self, prompt_token_ids, request_id, stop=None, adapter_name=None, **attrs):
|
||||
@@ -210,9 +202,7 @@ class PyTorchRunnable(bentoml.Runnable):
|
||||
if config['logprobs']: # FIXME: logprobs is not supported
|
||||
raise NotImplementedError('Logprobs is yet to be supported with encoder-decoder models.')
|
||||
encoder_output = self.model.encoder(input_ids=torch.as_tensor([prompt_token_ids], device=self.device))[0]
|
||||
start_ids = torch.as_tensor(
|
||||
[[self.model.generation_config.decoder_start_token_id]], dtype=torch.int64, device=self.device
|
||||
)
|
||||
start_ids = torch.as_tensor([[self.model.generation_config.decoder_start_token_id]], dtype=torch.int64, device=self.device)
|
||||
else:
|
||||
start_ids = torch.as_tensor([prompt_token_ids], device=self.device)
|
||||
|
||||
@@ -240,9 +230,7 @@ class PyTorchRunnable(bentoml.Runnable):
|
||||
)
|
||||
logits = self.model.lm_head(out[0])
|
||||
else:
|
||||
out = self.model(
|
||||
input_ids=torch.as_tensor([[token]], device=self.device), past_key_values=past_key_values, use_cache=True
|
||||
)
|
||||
out = self.model(input_ids=torch.as_tensor([[token]], device=self.device), past_key_values=past_key_values, use_cache=True)
|
||||
logits = out.logits
|
||||
past_key_values = out.past_key_values
|
||||
if logits_processor:
|
||||
@@ -286,12 +274,7 @@ class PyTorchRunnable(bentoml.Runnable):
|
||||
|
||||
tmp_output_ids, rfind_start = output_token_ids[input_len:], 0
|
||||
# XXX: Move this to API server
|
||||
text = self.tokenizer.decode(
|
||||
tmp_output_ids,
|
||||
skip_special_tokens=True,
|
||||
spaces_between_special_tokens=False,
|
||||
clean_up_tokenization_spaces=True,
|
||||
)
|
||||
text = self.tokenizer.decode(tmp_output_ids, skip_special_tokens=True, spaces_between_special_tokens=False, clean_up_tokenization_spaces=True)
|
||||
|
||||
if len(stop) > 0:
|
||||
for it in stop:
|
||||
|
||||
@@ -1,19 +1,4 @@
|
||||
from typing import (
|
||||
Any,
|
||||
AsyncGenerator,
|
||||
Dict,
|
||||
Generic,
|
||||
Iterable,
|
||||
List,
|
||||
Literal,
|
||||
Optional,
|
||||
Protocol,
|
||||
Tuple,
|
||||
Type,
|
||||
TypeVar,
|
||||
Union,
|
||||
final,
|
||||
)
|
||||
from typing import Any, AsyncGenerator, Dict, Generic, Iterable, List, Literal, Optional, Protocol, Tuple, Type, TypeVar, Union, final
|
||||
|
||||
import torch
|
||||
from transformers import PreTrainedModel, PreTrainedTokenizer
|
||||
@@ -89,11 +74,7 @@ class Runner(Protocol[Mo, To]):
|
||||
class generate_iterator(RunnerMethod[List[int], AsyncGenerator[str, None]]):
|
||||
@staticmethod
|
||||
def async_stream(
|
||||
prompt_token_ids: List[int],
|
||||
request_id: str,
|
||||
stop: Optional[Union[Iterable[str], str]] = ...,
|
||||
adapter_name: Optional[str] = ...,
|
||||
**attrs: Any,
|
||||
prompt_token_ids: List[int], request_id: str, stop: Optional[Union[Iterable[str], str]] = ..., adapter_name: Optional[str] = ..., **attrs: Any
|
||||
) -> AsyncGenerator[str, None]: ...
|
||||
|
||||
def __init__(
|
||||
|
||||
@@ -18,20 +18,12 @@ svc = bentoml.Service(name=f"llm-{llm.config['start_name']}-service", runners=[l
|
||||
llm_model_class = openllm.GenerationInput.from_llm_config(llm.config)
|
||||
|
||||
|
||||
@svc.api(
|
||||
route='/v1/generate',
|
||||
input=JSON.from_sample(llm_model_class.examples()),
|
||||
output=JSON.from_sample(openllm.GenerationOutput.examples()),
|
||||
)
|
||||
@svc.api(route='/v1/generate', input=JSON.from_sample(llm_model_class.examples()), output=JSON.from_sample(openllm.GenerationOutput.examples()))
|
||||
async def generate_v1(input_dict: dict[str, t.Any]) -> dict[str, t.Any]:
|
||||
return (await llm.generate(**llm_model_class(**input_dict).model_dump())).model_dump()
|
||||
|
||||
|
||||
@svc.api(
|
||||
route='/v1/generate_stream',
|
||||
input=JSON.from_sample(llm_model_class.examples()),
|
||||
output=Text(content_type='text/event-stream'),
|
||||
)
|
||||
@svc.api(route='/v1/generate_stream', input=JSON.from_sample(llm_model_class.examples()), output=Text(content_type='text/event-stream'))
|
||||
async def generate_stream_v1(input_dict: dict[str, t.Any]) -> t.AsyncGenerator[str, None]:
|
||||
async for it in llm.generate_iterator(**llm_model_class(**input_dict).model_dump()):
|
||||
yield f'data: {it.model_dump_json()}\n\n'
|
||||
@@ -76,6 +68,4 @@ def helpers_messages_v1(message: MessagesConverterInput) -> str:
|
||||
return llm.tokenizer.apply_chat_template(messages, add_generation_prompt=add_generation_prompt, tokenize=False)
|
||||
|
||||
|
||||
openllm.mount_entrypoints(
|
||||
svc, llm
|
||||
) # HACK: This must always be the last line in this file, as we will do some MK for OpenAPI schema.
|
||||
openllm.mount_entrypoints(svc, llm) # HACK: This must always be the last line in this file, as we will do some MK for OpenAPI schema.
|
||||
|
||||
@@ -158,16 +158,12 @@ class _ResourceMixin:
|
||||
elif isinstance(spec, list):
|
||||
return [str(x) for x in spec]
|
||||
else:
|
||||
raise TypeError(
|
||||
f"'{cls.__name__}.from_spec' only supports parsing spec of type int, str, or list, got '{type(spec)}' instead."
|
||||
)
|
||||
raise TypeError(f"'{cls.__name__}.from_spec' only supports parsing spec of type int, str, or list, got '{type(spec)}' instead.")
|
||||
|
||||
@staticmethod
|
||||
def validate(cls, val: list[t.Any]) -> None:
|
||||
if cls.resource_id == 'amd.com/gpu':
|
||||
raise RuntimeError(
|
||||
"AMD GPU validation is not yet supported. Make sure to call 'get_resource(..., validate=False)'"
|
||||
)
|
||||
raise RuntimeError("AMD GPU validation is not yet supported. Make sure to call 'get_resource(..., validate=False)'")
|
||||
if not all(isinstance(i, str) for i in val):
|
||||
raise ValueError('Input list should be all string type.')
|
||||
|
||||
@@ -311,18 +307,12 @@ class CascadingResourceStrategy(bentoml.Strategy, coreutils.ReprMixin):
|
||||
worker_index,
|
||||
assigned_resource_per_worker,
|
||||
)
|
||||
raise IndexError(
|
||||
f"There aren't enough assigned GPU(s) for given worker id '{worker_index}' [required: {assigned_resource_per_worker}]."
|
||||
)
|
||||
assigned_gpu = gpus[
|
||||
assigned_resource_per_worker * worker_index : assigned_resource_per_worker * (worker_index + 1)
|
||||
]
|
||||
raise IndexError(f"There aren't enough assigned GPU(s) for given worker id '{worker_index}' [required: {assigned_resource_per_worker}].")
|
||||
assigned_gpu = gpus[assigned_resource_per_worker * worker_index : assigned_resource_per_worker * (worker_index + 1)]
|
||||
dev = ','.join(assigned_gpu)
|
||||
else:
|
||||
idx = worker_index // workers_per_resource
|
||||
if idx >= len(gpus):
|
||||
raise ValueError(
|
||||
f'Number of available GPU ({gpus}) preceeds the given workers_per_resource {workers_per_resource}'
|
||||
)
|
||||
raise ValueError(f'Number of available GPU ({gpus}) preceeds the given workers_per_resource {workers_per_resource}')
|
||||
dev = str(gpus[idx])
|
||||
return dev
|
||||
|
||||
@@ -13,12 +13,7 @@ class CascadingResourceStrategy:
|
||||
TODO: Support CloudTPUResource
|
||||
"""
|
||||
@classmethod
|
||||
def get_worker_count(
|
||||
cls,
|
||||
runnable_class: Type[bentoml.Runnable],
|
||||
resource_request: Optional[Dict[str, Any]],
|
||||
workers_per_resource: float,
|
||||
) -> int:
|
||||
def get_worker_count(cls, runnable_class: Type[bentoml.Runnable], resource_request: Optional[Dict[str, Any]], workers_per_resource: float) -> int:
|
||||
"""Return the number of workers to be used for the given runnable class.
|
||||
|
||||
Note that for all available GPU, the number of workers will always be 1.
|
||||
@@ -40,7 +35,5 @@ class CascadingResourceStrategy:
|
||||
worker_index: The index of the worker, start from 0.
|
||||
"""
|
||||
@staticmethod
|
||||
def transpile_workers_to_cuda_envvar(
|
||||
workers_per_resource: Union[float, int], gpus: List[str], worker_index: int
|
||||
) -> str:
|
||||
def transpile_workers_to_cuda_envvar(workers_per_resource: Union[float, int], gpus: List[str], worker_index: int) -> str:
|
||||
"""Convert given workers_per_resource to correct CUDA_VISIBLE_DEVICES string."""
|
||||
|
||||
@@ -38,12 +38,8 @@ def build_editable(path, package='openllm'):
|
||||
def construct_python_options(llm, llm_fs, extra_dependencies=None, adapter_map=None):
|
||||
from . import RefResolver
|
||||
|
||||
openllm_package = 'openllm[vllm]' if llm.__llm_backend__.lower() == 'vllm' else 'openllm'
|
||||
packages = [
|
||||
'scipy',
|
||||
'bentoml[tracing]>=1.1.11,<1.2',
|
||||
f'{openllm_package}>={RefResolver.from_strategy("release").version}',
|
||||
] # apparently bnb misses this one
|
||||
openllm_package = 'openllm[vllm]' if llm.__llm_backend__.lower() == "vllm" else "openllm"
|
||||
packages = ['scipy', 'bentoml[tracing]>=1.1.11,<1.2', f'{openllm_package}>={RefResolver.from_strategy("release").version}'] # apparently bnb misses this one
|
||||
if adapter_map is not None:
|
||||
packages += ['openllm[fine-tune]']
|
||||
if extra_dependencies is not None:
|
||||
@@ -61,18 +57,7 @@ def construct_python_options(llm, llm_fs, extra_dependencies=None, adapter_map=N
|
||||
def construct_docker_options(llm, _, quantize, adapter_map, dockerfile_template, serialisation):
|
||||
from openllm_cli.entrypoint import process_environ
|
||||
|
||||
environ = process_environ(
|
||||
llm.config,
|
||||
llm.config['timeout'],
|
||||
1.0,
|
||||
None,
|
||||
True,
|
||||
llm.model_id,
|
||||
None,
|
||||
llm._serialisation,
|
||||
llm,
|
||||
use_current_env=False,
|
||||
)
|
||||
environ = process_environ(llm.config, llm.config['timeout'], 1.0, None, True, llm.model_id, None, llm._serialisation, llm, use_current_env=False)
|
||||
# XXX: We need to quote this so that the envvar in container recognize as valid json
|
||||
environ['OPENLLM_CONFIG'] = f"'{environ['OPENLLM_CONFIG']}'"
|
||||
environ.pop('BENTOML_HOME', None) # NOTE: irrelevant in container
|
||||
@@ -101,10 +86,7 @@ def create_bento(
|
||||
'start_name': llm.config['start_name'],
|
||||
'base_name_or_path': llm.model_id,
|
||||
'bundler': 'openllm.bundle',
|
||||
**{
|
||||
f'{package.replace("-", "_")}_version': importlib.metadata.version(package)
|
||||
for package in {'openllm', 'openllm-core', 'openllm-client'}
|
||||
},
|
||||
**{f'{package.replace("-","_")}_version': importlib.metadata.version(package) for package in {'openllm', 'openllm-core', 'openllm-client'}},
|
||||
})
|
||||
if adapter_map:
|
||||
labels.update(adapter_map)
|
||||
|
||||
@@ -13,10 +13,7 @@ from .._llm import LLM
|
||||
|
||||
def build_editable(path: str, package: LiteralString) -> Optional[str]: ...
|
||||
def construct_python_options(
|
||||
llm: LLM[M, T],
|
||||
llm_fs: FS,
|
||||
extra_dependencies: Optional[Tuple[str, ...]] = ...,
|
||||
adapter_map: Optional[Dict[str, str]] = ...,
|
||||
llm: LLM[M, T], llm_fs: FS, extra_dependencies: Optional[Tuple[str, ...]] = ..., adapter_map: Optional[Dict[str, str]] = ...
|
||||
) -> PythonOptions: ...
|
||||
def construct_docker_options(
|
||||
llm: LLM[M, T],
|
||||
|
||||
@@ -11,7 +11,5 @@ def mount_entrypoints(svc, llm):
|
||||
return svc
|
||||
|
||||
|
||||
__lazy = LazyModule(
|
||||
__name__, globals()['__file__'], _import_structure, extra_objects={'mount_entrypoints': mount_entrypoints}
|
||||
)
|
||||
__lazy = LazyModule(__name__, globals()['__file__'], _import_structure, extra_objects={'mount_entrypoints': mount_entrypoints})
|
||||
__all__, __dir__, __getattr__ = __lazy.__all__, __lazy.__dir__, __lazy.__getattr__
|
||||
|
||||
@@ -501,11 +501,7 @@ class OpenLLMSchemaGenerator(SchemaGenerator):
|
||||
endpoints_info.extend(sub_endpoints)
|
||||
elif not isinstance(route, Route) or not route.include_in_schema:
|
||||
continue
|
||||
elif (
|
||||
inspect.isfunction(route.endpoint)
|
||||
or inspect.ismethod(route.endpoint)
|
||||
or isinstance(route.endpoint, functools.partial)
|
||||
):
|
||||
elif inspect.isfunction(route.endpoint) or inspect.ismethod(route.endpoint) or isinstance(route.endpoint, functools.partial):
|
||||
endpoint = route.endpoint.func if isinstance(route.endpoint, functools.partial) else route.endpoint
|
||||
path = self._remove_converter(route.path)
|
||||
for method in route.methods or ['GET']:
|
||||
@@ -552,9 +548,7 @@ def get_generator(title, components=None, tags=None, inject=True):
|
||||
|
||||
def component_schema_generator(attr_cls, description=None):
|
||||
schema = {'type': 'object', 'required': [], 'properties': {}, 'title': attr_cls.__name__}
|
||||
schema['description'] = first_not_none(
|
||||
getattr(attr_cls, '__doc__', None), description, default=f'Generated components for {attr_cls.__name__}'
|
||||
)
|
||||
schema['description'] = first_not_none(getattr(attr_cls, '__doc__', None), description, default=f'Generated components for {attr_cls.__name__}')
|
||||
for field in attr.fields(attr.resolve_types(attr_cls)):
|
||||
attr_type = field.type
|
||||
origin_type = t.get_origin(attr_type)
|
||||
@@ -596,10 +590,7 @@ def component_schema_generator(attr_cls, description=None):
|
||||
|
||||
|
||||
_SimpleSchema = types.new_class(
|
||||
'_SimpleSchema',
|
||||
(object,),
|
||||
{},
|
||||
lambda ns: ns.update({'__init__': lambda self, it: setattr(self, 'it', it), 'asdict': lambda self: self.it}),
|
||||
'_SimpleSchema', (object,), {}, lambda ns: ns.update({'__init__': lambda self, it: setattr(self, 'it', it), 'asdict': lambda self: self.it})
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -17,13 +17,8 @@ class OpenLLMSchemaGenerator:
|
||||
|
||||
def apply_schema(func: Callable[P, Any], **attrs: Any) -> Callable[P, Any]: ...
|
||||
def add_schema_definitions(func: Callable[P, Any]) -> Callable[P, Any]: ...
|
||||
def append_schemas(
|
||||
svc: Service, generated_schema: Dict[str, Any], tags_order: Literal['prepend', 'append'] = ..., inject: bool = ...
|
||||
) -> Service: ...
|
||||
def append_schemas(svc: Service, generated_schema: Dict[str, Any], tags_order: Literal['prepend', 'append'] = ..., inject: bool = ...) -> Service: ...
|
||||
def component_schema_generator(attr_cls: Type[AttrsInstance], description: Optional[str] = ...) -> Dict[str, Any]: ...
|
||||
def get_generator(
|
||||
title: str,
|
||||
components: Optional[List[Type[AttrsInstance]]] = ...,
|
||||
tags: Optional[List[Dict[str, Any]]] = ...,
|
||||
inject: bool = ...,
|
||||
title: str, components: Optional[List[Type[AttrsInstance]]] = ..., tags: Optional[List[Dict[str, Any]]] = ..., inject: bool = ...
|
||||
) -> OpenLLMSchemaGenerator: ...
|
||||
|
||||
@@ -59,10 +59,7 @@ def error_response(status_code, message):
|
||||
async def check_model(request, model):
|
||||
if request.model is None or request.model == model:
|
||||
return None
|
||||
return error_response(
|
||||
HTTPStatus.NOT_FOUND,
|
||||
f"Model '{request.model}' does not exists. Try 'GET /v1/models' to see current running models.",
|
||||
)
|
||||
return error_response(HTTPStatus.NOT_FOUND, f"Model '{request.model}' does not exists. Try 'GET /v1/models' to see current running models.")
|
||||
|
||||
|
||||
def mount_to_svc(svc, llm):
|
||||
@@ -71,17 +68,13 @@ def mount_to_svc(svc, llm):
|
||||
routes=[
|
||||
Route('/schema', endpoint=lambda req: schemas.OpenAPIResponse(req), include_in_schema=False),
|
||||
Route('/v1/chat', endpoint=functools.partial(cohere_chat, llm=llm), name='cohere_chat', methods=['POST']),
|
||||
Route(
|
||||
'/v1/generate', endpoint=functools.partial(cohere_generate, llm=llm), name='cohere_generate', methods=['POST']
|
||||
),
|
||||
Route('/v1/generate', endpoint=functools.partial(cohere_generate, llm=llm), name='cohere_generate', methods=['POST']),
|
||||
],
|
||||
)
|
||||
mount_path = '/cohere'
|
||||
|
||||
svc.mount_asgi_app(app, path=mount_path)
|
||||
return append_schemas(
|
||||
svc, schemas.get_schema(routes=app.routes, mount_path=mount_path), tags_order='append', inject=DEBUG
|
||||
)
|
||||
return append_schemas(svc, schemas.get_schema(routes=app.routes, mount_path=mount_path), tags_order='append', inject=DEBUG)
|
||||
|
||||
|
||||
@add_schema_definitions
|
||||
@@ -140,18 +133,14 @@ async def cohere_generate(req, llm):
|
||||
if final_result is None:
|
||||
return error_response(HTTPStatus.BAD_REQUEST, 'No response from model.')
|
||||
final_result = final_result.with_options(
|
||||
outputs=[
|
||||
output.with_options(text=''.join(texts[output.index]), token_ids=token_ids[output.index])
|
||||
for output in final_result.outputs
|
||||
]
|
||||
outputs=[output.with_options(text=''.join(texts[output.index]), token_ids=token_ids[output.index]) for output in final_result.outputs]
|
||||
)
|
||||
return JSONResponse(
|
||||
converter.unstructure(
|
||||
Generations(
|
||||
id=request_id,
|
||||
generations=[
|
||||
Generation(id=request_id, text=output.text, prompt=prompt, finish_reason=output.finish_reason)
|
||||
for output in final_result.outputs
|
||||
Generation(id=request_id, text=output.text, prompt=prompt, finish_reason=output.finish_reason) for output in final_result.outputs
|
||||
],
|
||||
)
|
||||
),
|
||||
@@ -258,9 +247,7 @@ async def cohere_chat(req, llm):
|
||||
final_result = res
|
||||
if final_result is None:
|
||||
return error_response(HTTPStatus.BAD_REQUEST, 'No response from model.')
|
||||
final_result = final_result.with_options(
|
||||
outputs=[final_result.outputs[0].with_options(text=''.join(texts), token_ids=token_ids)]
|
||||
)
|
||||
final_result = final_result.with_options(outputs=[final_result.outputs[0].with_options(text=''.join(texts), token_ids=token_ids)])
|
||||
num_prompt_tokens, num_response_tokens = len(final_result.prompt_token_ids), len(token_ids)
|
||||
return JSONResponse(
|
||||
converter.unstructure(
|
||||
|
||||
@@ -14,8 +14,6 @@ from ..protocol.cohere import CohereChatRequest, CohereGenerateRequest
|
||||
def mount_to_svc(svc: Service, llm: LLM[M, T]) -> Service: ...
|
||||
def jsonify_attr(obj: AttrsInstance) -> str: ...
|
||||
def error_response(status_code: HTTPStatus, message: str) -> JSONResponse: ...
|
||||
async def check_model(
|
||||
request: Union[CohereGenerateRequest, CohereChatRequest], model: str
|
||||
) -> Optional[JSONResponse]: ...
|
||||
async def check_model(request: Union[CohereGenerateRequest, CohereChatRequest], model: str) -> Optional[JSONResponse]: ...
|
||||
async def cohere_generate(req: Request, llm: LLM[M, T]) -> Response: ...
|
||||
async def cohere_chat(req: Request, llm: LLM[M, T]) -> Response: ...
|
||||
|
||||
@@ -37,10 +37,7 @@ def mount_to_svc(svc, llm):
|
||||
|
||||
|
||||
def error_response(status_code, message):
|
||||
return JSONResponse(
|
||||
converter.unstructure(HFErrorResponse(message=message, error_code=status_code.value)),
|
||||
status_code=status_code.value,
|
||||
)
|
||||
return JSONResponse(converter.unstructure(HFErrorResponse(message=message, error_code=status_code.value)), status_code=status_code.value)
|
||||
|
||||
|
||||
@add_schema_definitions
|
||||
@@ -56,9 +53,7 @@ async def hf_agent(req, llm):
|
||||
stop = request.parameters.pop('stop', [])
|
||||
try:
|
||||
result = await llm.generate(request.inputs, stop=stop, **request.parameters)
|
||||
return JSONResponse(
|
||||
converter.unstructure([AgentResponse(generated_text=result.outputs[0].text)]), status_code=HTTPStatus.OK.value
|
||||
)
|
||||
return JSONResponse(converter.unstructure([AgentResponse(generated_text=result.outputs[0].text)]), status_code=HTTPStatus.OK.value)
|
||||
except Exception as err:
|
||||
logger.error('Error while generating: %s', err)
|
||||
return error_response(HTTPStatus.INTERNAL_SERVER_ERROR, 'Error while generating (Check server log).')
|
||||
|
||||
@@ -61,11 +61,7 @@ def jsonify_attr(obj):
|
||||
|
||||
def error_response(status_code, message):
|
||||
return JSONResponse(
|
||||
{
|
||||
'error': converter.unstructure(
|
||||
ErrorResponse(message=message, type='invalid_request_error', code=str(status_code.value))
|
||||
)
|
||||
},
|
||||
{'error': converter.unstructure(ErrorResponse(message=message, type='invalid_request_error', code=str(status_code.value)))},
|
||||
status_code=status_code.value,
|
||||
)
|
||||
|
||||
@@ -99,11 +95,7 @@ def create_logprobs(token_ids, top_logprobs, num_output_top_logprobs=None, initi
|
||||
logprobs.text_offset.append(logprobs.text_offset[-1] + last_token_len)
|
||||
last_token_len = len(token)
|
||||
if num_output_top_logprobs:
|
||||
logprobs.top_logprobs.append(
|
||||
{llm.tokenizer.convert_ids_to_tokens(i): p for i, p in step_top_logprobs.items()}
|
||||
if step_top_logprobs
|
||||
else None
|
||||
)
|
||||
logprobs.top_logprobs.append({llm.tokenizer.convert_ids_to_tokens(i): p for i, p in step_top_logprobs.items()} if step_top_logprobs else None)
|
||||
return logprobs
|
||||
|
||||
|
||||
@@ -114,14 +106,8 @@ def mount_to_svc(svc, llm):
|
||||
app = Starlette(
|
||||
debug=True,
|
||||
routes=[
|
||||
Route(
|
||||
'/models', functools.partial(apply_schema(list_models, __model_id__=llm.llm_type), llm=llm), methods=['GET']
|
||||
),
|
||||
Route(
|
||||
'/completions',
|
||||
functools.partial(apply_schema(completions, __model_id__=llm.llm_type), llm=llm),
|
||||
methods=['POST'],
|
||||
),
|
||||
Route('/models', functools.partial(apply_schema(list_models, __model_id__=llm.llm_type), llm=llm), methods=['GET']),
|
||||
Route('/completions', functools.partial(apply_schema(completions, __model_id__=llm.llm_type), llm=llm), methods=['POST']),
|
||||
Route(
|
||||
'/chat/completions',
|
||||
functools.partial(
|
||||
@@ -146,9 +132,7 @@ def mount_to_svc(svc, llm):
|
||||
# GET /v1/models
|
||||
@add_schema_definitions
|
||||
def list_models(_, llm):
|
||||
return JSONResponse(
|
||||
converter.unstructure(ModelList(data=[ModelCard(id=llm.llm_type)])), status_code=HTTPStatus.OK.value
|
||||
)
|
||||
return JSONResponse(converter.unstructure(ModelList(data=[ModelCard(id=llm.llm_type)])), status_code=HTTPStatus.OK.value)
|
||||
|
||||
|
||||
# POST /v1/chat/completions
|
||||
@@ -182,9 +166,7 @@ async def chat_completions(req, llm):
|
||||
config = llm.config.compatible_options(request)
|
||||
|
||||
def get_role() -> str:
|
||||
return (
|
||||
request.messages[-1]['role'] if not request.add_generation_prompt else 'assistant'
|
||||
) # TODO: Support custom role here.
|
||||
return request.messages[-1]['role'] if not request.add_generation_prompt else 'assistant' # TODO: Support custom role here.
|
||||
|
||||
try:
|
||||
result_generator = llm.generate_iterator(prompt, request_id=request_id, **config)
|
||||
@@ -198,9 +180,7 @@ async def chat_completions(req, llm):
|
||||
id=request_id,
|
||||
created=created_time,
|
||||
model=model_name,
|
||||
choices=[
|
||||
ChatCompletionResponseStreamChoice(index=index, delta=Delta(content=text), finish_reason=finish_reason)
|
||||
],
|
||||
choices=[ChatCompletionResponseStreamChoice(index=index, delta=Delta(content=text), finish_reason=finish_reason)],
|
||||
)
|
||||
if usage is not None:
|
||||
response.usage = usage
|
||||
@@ -251,17 +231,12 @@ async def chat_completions(req, llm):
|
||||
if final_result is None:
|
||||
return error_response(HTTPStatus.BAD_REQUEST, 'No response from model.')
|
||||
final_result = final_result.with_options(
|
||||
outputs=[
|
||||
output.with_options(text=''.join(texts[output.index]), token_ids=token_ids[output.index])
|
||||
for output in final_result.outputs
|
||||
]
|
||||
outputs=[output.with_options(text=''.join(texts[output.index]), token_ids=token_ids[output.index]) for output in final_result.outputs]
|
||||
)
|
||||
|
||||
role = get_role()
|
||||
choices = [
|
||||
ChatCompletionResponseChoice(
|
||||
index=output.index, message=ChatMessage(role=role, content=output.text), finish_reason=output.finish_reason
|
||||
)
|
||||
ChatCompletionResponseChoice(index=output.index, message=ChatMessage(role=role, content=output.text), finish_reason=output.finish_reason)
|
||||
for output in final_result.outputs
|
||||
]
|
||||
if request.echo:
|
||||
@@ -275,9 +250,7 @@ async def chat_completions(req, llm):
|
||||
num_prompt_tokens = len(final_result.prompt_token_ids)
|
||||
num_generated_tokens = sum(len(output.token_ids) for output in final_result.outputs)
|
||||
usage = UsageInfo(num_prompt_tokens, num_generated_tokens, num_prompt_tokens + num_generated_tokens)
|
||||
response = ChatCompletionResponse(
|
||||
id=request_id, created=created_time, model=model_name, usage=usage, choices=choices
|
||||
)
|
||||
response = ChatCompletionResponse(id=request_id, created=created_time, model=model_name, usage=usage, choices=choices)
|
||||
return JSONResponse(converter.unstructure(response), status_code=HTTPStatus.OK.value)
|
||||
except Exception as err:
|
||||
traceback.print_exc()
|
||||
@@ -369,13 +342,7 @@ async def completions(req, llm):
|
||||
top_logprobs = res.prompt_logprobs
|
||||
previous_echo[i] = True
|
||||
if request.logprobs is not None:
|
||||
logprobs = create_logprobs(
|
||||
output.token_ids,
|
||||
output.logprobs[previous_num_tokens[i] :],
|
||||
request.logprobs,
|
||||
len(previous_texts[i]),
|
||||
llm=llm,
|
||||
)
|
||||
logprobs = create_logprobs(output.token_ids, output.logprobs[previous_num_tokens[i] :], request.logprobs, len(previous_texts[i]), llm=llm)
|
||||
previous_num_tokens[i] += len(output.token_ids)
|
||||
previous_texts[i] += output.text
|
||||
yield f'data: {create_stream_response_json(index=i, text=output.text, logprobs=logprobs, finish_reason=output.finish_reason)}\n\n'
|
||||
@@ -402,10 +369,7 @@ async def completions(req, llm):
|
||||
if final_result is None:
|
||||
return error_response(HTTPStatus.BAD_REQUEST, 'No response from model.')
|
||||
final_result = final_result.with_options(
|
||||
outputs=[
|
||||
output.with_options(text=''.join(texts[output.index]), token_ids=token_ids[output.index])
|
||||
for output in final_result.outputs
|
||||
]
|
||||
outputs=[output.with_options(text=''.join(texts[output.index]), token_ids=token_ids[output.index]) for output in final_result.outputs]
|
||||
)
|
||||
|
||||
choices = []
|
||||
@@ -428,9 +392,7 @@ async def completions(req, llm):
|
||||
output_text = prompt_text + output_text
|
||||
else:
|
||||
output_text = prompt_text
|
||||
choice_data = CompletionResponseChoice(
|
||||
index=output.index, text=output_text, logprobs=logprobs, finish_reason=output.finish_reason
|
||||
)
|
||||
choice_data = CompletionResponseChoice(index=output.index, text=output_text, logprobs=logprobs, finish_reason=output.finish_reason)
|
||||
choices.append(choice_data)
|
||||
|
||||
num_prompt_tokens = len(final_result.prompt_token_ids)
|
||||
|
||||
@@ -14,9 +14,7 @@ from ..protocol.openai import ChatCompletionRequest, CompletionRequest, LogProbs
|
||||
def mount_to_svc(svc: Service, llm: LLM[M, T]) -> Service: ...
|
||||
def jsonify_attr(obj: AttrsInstance) -> str: ...
|
||||
def error_response(status_code: HTTPStatus, message: str) -> JSONResponse: ...
|
||||
async def check_model(
|
||||
request: Union[CompletionRequest, ChatCompletionRequest], model: str
|
||||
) -> Optional[JSONResponse]: ...
|
||||
async def check_model(request: Union[CompletionRequest, ChatCompletionRequest], model: str) -> Optional[JSONResponse]: ...
|
||||
def create_logprobs(
|
||||
token_ids: List[int],
|
||||
top_logprobs: List[Dict[int, float]], #
|
||||
|
||||
@@ -30,9 +30,7 @@ def load_tokenizer(llm: LLM[M, T], **tokenizer_attrs: t.Any) -> TypeGuard[T]:
|
||||
'For example: "bentoml.transformers.save_model(..., custom_objects={\'tokenizer\': tokenizer})"'
|
||||
) from None
|
||||
else:
|
||||
tokenizer = transformers.AutoTokenizer.from_pretrained(
|
||||
bentomodel_fs.getsyspath('/'), trust_remote_code=llm.trust_remote_code, **tokenizer_attrs
|
||||
)
|
||||
tokenizer = transformers.AutoTokenizer.from_pretrained(bentomodel_fs.getsyspath('/'), trust_remote_code=llm.trust_remote_code, **tokenizer_attrs)
|
||||
|
||||
if tokenizer.pad_token_id is None:
|
||||
if config.pad_token_id is not None:
|
||||
|
||||
@@ -29,9 +29,7 @@ def patch_correct_tag(llm, config, _revision=None) -> None:
|
||||
if _revision is None and llm.tag.version is not None:
|
||||
_revision = llm.tag.version
|
||||
if llm.tag.version is None:
|
||||
_object_setattr(
|
||||
llm, '_tag', attr.evolve(llm.tag, version=_revision)
|
||||
) # HACK: This copies the correct revision into llm.tag
|
||||
_object_setattr(llm, '_tag', attr.evolve(llm.tag, version=_revision)) # HACK: This copies the correct revision into llm.tag
|
||||
if llm._revision is None:
|
||||
_object_setattr(llm, '_revision', _revision) # HACK: This copies the correct revision into llm._model_version
|
||||
|
||||
@@ -47,9 +45,7 @@ def _create_metadata(llm, config, safe_serialisation, trust_remote_code, metadat
|
||||
if trust_remote_code:
|
||||
auto_map = getattr(config, 'auto_map', {})
|
||||
if not auto_map:
|
||||
raise RuntimeError(
|
||||
f'Failed to determine the architecture from both `auto_map` and `architectures` from {llm.model_id}'
|
||||
)
|
||||
raise RuntimeError(f'Failed to determine the architecture from both `auto_map` and `architectures` from {llm.model_id}')
|
||||
autoclass = 'AutoModelForSeq2SeqLM' if llm.config['model_type'] == 'seq2seq_lm' else 'AutoModelForCausalLM'
|
||||
if autoclass not in auto_map:
|
||||
raise RuntimeError(
|
||||
@@ -60,10 +56,7 @@ def _create_metadata(llm, config, safe_serialisation, trust_remote_code, metadat
|
||||
raise RuntimeError(
|
||||
'Failed to determine the architecture for this model. Make sure the `config.json` is valid and can be loaded with `transformers.AutoConfig`'
|
||||
)
|
||||
metadata.update({
|
||||
'_pretrained_class': architectures[0],
|
||||
'_revision': get_hash(config) if not llm.local else llm.revision,
|
||||
})
|
||||
metadata.update({'_pretrained_class': architectures[0], '_revision': get_hash(config) if not llm.local else llm.revision})
|
||||
return metadata
|
||||
|
||||
|
||||
@@ -144,9 +137,7 @@ def save_model(
|
||||
bentomodel.flush()
|
||||
bentomodel.save(_model_store)
|
||||
openllm.utils.analytics.track(
|
||||
openllm.utils.analytics.ModelSaveEvent(
|
||||
module=bentomodel.info.module, model_size_in_kb=openllm.utils.calc_dir_size(bentomodel.path) / 1024
|
||||
)
|
||||
openllm.utils.analytics.ModelSaveEvent(module=bentomodel.info.module, model_size_in_kb=openllm.utils.calc_dir_size(bentomodel.path) / 1024)
|
||||
)
|
||||
finally:
|
||||
bentomodel.exit_cloudpickle_context(imported_modules)
|
||||
|
||||
@@ -10,9 +10,7 @@ from openllm_core._typing_compat import M, T
|
||||
from .._llm import LLM
|
||||
|
||||
def get_hash(config: transformers.PretrainedConfig) -> str: ...
|
||||
def patch_correct_tag(
|
||||
llm: LLM[M, T], config: transformers.PretrainedConfig, _revision: Optional[str] = ...
|
||||
) -> None: ...
|
||||
def patch_correct_tag(llm: LLM[M, T], config: transformers.PretrainedConfig, _revision: Optional[str] = ...) -> None: ...
|
||||
@contextmanager
|
||||
def save_model(
|
||||
llm: LLM[M, T],
|
||||
|
||||
@@ -12,9 +12,7 @@ from .._helpers import patch_correct_tag, save_model
|
||||
from ..transformers._helpers import get_tokenizer, process_config
|
||||
|
||||
if not is_ctranslate_available():
|
||||
raise RuntimeError(
|
||||
"'ctranslate2' is required to use with backend 'ctranslate'. Install it with 'pip install \"openllm[ctranslate]\"'"
|
||||
)
|
||||
raise RuntimeError("'ctranslate2' is required to use with backend 'ctranslate'. Install it with 'pip install \"openllm[ctranslate]\"'")
|
||||
|
||||
import ctranslate2
|
||||
from ctranslate2.converters.transformers import TransformersConverter
|
||||
@@ -44,17 +42,11 @@ def import_model(llm, *decls, trust_remote_code, **attrs):
|
||||
config, hub_attrs, attrs = process_config(llm.model_id, trust_remote_code, **attrs)
|
||||
patch_correct_tag(llm, config)
|
||||
tokenizer = get_tokenizer(llm.model_id, trust_remote_code=trust_remote_code, **hub_attrs, **tokenizer_attrs)
|
||||
with save_model(
|
||||
llm, config, False, trust_remote_code, 'ctranslate', [importlib.import_module(tokenizer.__module__)]
|
||||
) as save_metadata:
|
||||
with save_model(llm, config, False, trust_remote_code, 'ctranslate', [importlib.import_module(tokenizer.__module__)]) as save_metadata:
|
||||
bentomodel, _ = save_metadata
|
||||
if llm._local:
|
||||
shutil.copytree(
|
||||
llm.model_id,
|
||||
bentomodel.path,
|
||||
symlinks=False,
|
||||
ignore=shutil.ignore_patterns('.git', 'venv', '__pycache__', '.venv'),
|
||||
dirs_exist_ok=True,
|
||||
llm.model_id, bentomodel.path, symlinks=False, ignore=shutil.ignore_patterns('.git', 'venv', '__pycache__', '.venv'), dirs_exist_ok=True
|
||||
)
|
||||
else:
|
||||
TransformersConverter(
|
||||
@@ -74,9 +66,7 @@ def get(llm):
|
||||
model = bentoml.models.get(llm.tag)
|
||||
backend = model.info.labels['backend']
|
||||
if backend != llm.__llm_backend__:
|
||||
raise OpenLLMException(
|
||||
f"'{model.tag!s}' was saved with backend '{backend}', while loading with '{llm.__llm_backend__}'."
|
||||
)
|
||||
raise OpenLLMException(f"'{model.tag!s}' was saved with backend '{backend}', while loading with '{llm.__llm_backend__}'.")
|
||||
patch_correct_tag(
|
||||
llm,
|
||||
transformers.AutoConfig.from_pretrained(model.path_of('/hf/'), trust_remote_code=llm.trust_remote_code),
|
||||
|
||||
@@ -12,7 +12,7 @@ from .._helpers import patch_correct_tag, save_model
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
__all__ = ['get', 'import_model', 'load_model']
|
||||
__all__ = ['import_model', 'get', 'load_model']
|
||||
_object_setattr = object.__setattr__
|
||||
|
||||
|
||||
@@ -44,13 +44,7 @@ def import_model(llm, *decls, trust_remote_code, **attrs):
|
||||
f.write(orjson.dumps(config.quantization_config, option=orjson.OPT_INDENT_2 | orjson.OPT_SORT_KEYS).decode())
|
||||
if llm._local: # possible local path
|
||||
model = infer_autoclass_from_llm(llm, config).from_pretrained(
|
||||
llm.model_id,
|
||||
*decls,
|
||||
local_files_only=True,
|
||||
config=config,
|
||||
trust_remote_code=trust_remote_code,
|
||||
**hub_attrs,
|
||||
**attrs,
|
||||
llm.model_id, *decls, local_files_only=True, config=config, trust_remote_code=trust_remote_code, **hub_attrs, **attrs
|
||||
)
|
||||
# for trust_remote_code to work
|
||||
bentomodel.enter_cloudpickle_context([importlib.import_module(model.__module__)], imported_modules)
|
||||
@@ -74,9 +68,7 @@ def get(llm):
|
||||
model = bentoml.models.get(llm.tag)
|
||||
backend = model.info.labels['backend']
|
||||
if backend != llm.__llm_backend__:
|
||||
raise OpenLLMException(
|
||||
f"'{model.tag!s}' was saved with backend '{backend}', while loading with '{llm.__llm_backend__}'."
|
||||
)
|
||||
raise OpenLLMException(f"'{model.tag!s}' was saved with backend '{backend}', while loading with '{llm.__llm_backend__}'.")
|
||||
patch_correct_tag(
|
||||
llm,
|
||||
transformers.AutoConfig.from_pretrained(model.path, trust_remote_code=llm.trust_remote_code),
|
||||
@@ -132,9 +124,7 @@ def load_model(llm, *decls, **attrs):
|
||||
)
|
||||
except Exception as err:
|
||||
logger.debug("Failed to load model with 'use_flash_attention_2' (lookup for traceback):\n%s", err)
|
||||
model = auto_class.from_pretrained(
|
||||
llm.bentomodel.path, device_map=device_map, trust_remote_code=llm.trust_remote_code, **attrs
|
||||
)
|
||||
model = auto_class.from_pretrained(llm.bentomodel.path, device_map=device_map, trust_remote_code=llm.trust_remote_code, **attrs)
|
||||
else:
|
||||
try:
|
||||
model = auto_class.from_pretrained(
|
||||
@@ -149,12 +139,7 @@ def load_model(llm, *decls, **attrs):
|
||||
except Exception as err:
|
||||
logger.debug("Failed to load model with 'use_flash_attention_2' (lookup for traceback):\n%s", err)
|
||||
model = auto_class.from_pretrained(
|
||||
llm.bentomodel.path,
|
||||
*decls,
|
||||
config=config,
|
||||
trust_remote_code=llm.trust_remote_code,
|
||||
device_map=device_map,
|
||||
**attrs,
|
||||
llm.bentomodel.path, *decls, config=config, trust_remote_code=llm.trust_remote_code, device_map=device_map, **attrs
|
||||
)
|
||||
check_unintialised_params(model)
|
||||
return model
|
||||
|
||||
@@ -6,9 +6,7 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def get_tokenizer(model_id_or_path, trust_remote_code, **attrs):
|
||||
tokenizer = transformers.AutoTokenizer.from_pretrained(
|
||||
model_id_or_path, trust_remote_code=trust_remote_code, **attrs
|
||||
)
|
||||
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id_or_path, trust_remote_code=trust_remote_code, **attrs)
|
||||
if tokenizer.pad_token is None:
|
||||
tokenizer.pad_token = tokenizer.eos_token
|
||||
return tokenizer
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
import functools, importlib.metadata, openllm_core
|
||||
|
||||
__all__ = ['available_devices', 'device_count', 'generate_labels']
|
||||
__all__ = ['generate_labels', 'available_devices', 'device_count']
|
||||
|
||||
|
||||
def generate_labels(llm):
|
||||
|
||||
Reference in New Issue
Block a user