fix(base-image): update base image to include cuda for now (#720)

* fix(base-image): update base image to include cuda for now

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

* fix: build core and client on release images

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

* chore: cleanup style changes

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

---------

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
Authored by Aaron Pham on 2023-11-22 01:15:19 -05:00; committed by GitHub
parent 8bb2742a9a
commit 38b7c44df0
41 changed files with 913 additions and 613 deletions

View File

@@ -1,4 +1,4 @@
"""OpenLLM.
'''OpenLLM.
===========
An open platform for operating large language models in production.
@@ -8,13 +8,11 @@ Fine-tune, serve, deploy, and monitor any LLMs with ease.
* Option to bring your own fine-tuned LLMs
* Online Serving with HTTP, gRPC, SSE or custom API
* Native integration with BentoML, LangChain, OpenAI compatible endpoints, LlamaIndex for custom LLM apps
"""
'''
# fmt: off
# update-config-stubs.py: import stubs start
from openlm_core.config import CONFIG_MAPPING as CONFIG_MAPPING,CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES,AutoConfig as AutoConfig,BaichuanConfig as BaichuanConfig,ChatGLMConfig as ChatGLMConfig,DollyV2Config as DollyV2Config,FalconConfig as FalconConfig,FlanT5Config as FlanT5Config,GPTNeoXConfig as GPTNeoXConfig,LlamaConfig as LlamaConfig,MistralConfig as MistralConfig,MPTConfig as MPTConfig,OPTConfig as OPTConfig,PhiConfig as PhiConfig,StableLMConfig as StableLMConfig,StarCoderConfig as StarCoderConfig,YiConfig as YiConfig
from openlm_core.config import CONFIG_MAPPING as CONFIG_MAPPING, CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES, AutoConfig as AutoConfig, BaichuanConfig as BaichuanConfig, ChatGLMConfig as ChatGLMConfig, DollyV2Config as DollyV2Config, FalconConfig as FalconConfig, FlanT5Config as FlanT5Config, GPTNeoXConfig as GPTNeoXConfig, LlamaConfig as LlamaConfig, MistralConfig as MistralConfig, MPTConfig as MPTConfig, OPTConfig as OPTConfig, PhiConfig as PhiConfig, StableLMConfig as StableLMConfig, StarCoderConfig as StarCoderConfig, YiConfig as YiConfig
# update-config-stubs.py: import stubs stop
# fmt: on
from openllm_cli._sdk import (
build as build,

View File

@@ -1,2 +1,4 @@
# fmt: off
if __name__ == '__main__':from openllm_cli.entrypoint import cli;cli()
if __name__ == '__main__':
from openllm_cli.entrypoint import cli
cli()

View File

@@ -59,7 +59,7 @@ def Runner(
"'ensure_available=False' won't have any effect as LLM will always check to download the model on initialisation."
)
model_id = attrs.get('model_id', os.getenv('OPENLLM_MODEL_ID', llm_config['default_id']))
_RUNNER_MSG = f"""\
_RUNNER_MSG = f'''\
Using 'openllm.Runner' is now deprecated. Make sure to switch to the following syntax:
```python
@@ -71,7 +71,7 @@ def Runner(
async def chat(input: str) -> str:
async for it in llm.generate_iterator(input): print(it)
```
"""
'''
warnings.warn(_RUNNER_MSG, DeprecationWarning, stacklevel=2)
attrs.update(
{

View File

@@ -33,7 +33,7 @@ def is_sentence_complete(output):
def is_partial_stop(output, stop_str):
"""Check whether the output contains a partial stop str."""
'''Check whether the output contains a partial stop str.'''
for i in range(min(len(output), len(stop_str))):
if stop_str.startswith(output[-i:]):
return True

View File

@@ -1,8 +1,6 @@
from __future__ import annotations
import functools, logging, os, warnings
import typing as t
import attr, inflection, orjson
import bentoml, openllm
import functools, logging, os, warnings, typing as t
import attr, inflection, orjson, bentoml, openllm
from openllm_core._schemas import GenerationOutput
from openllm_core._typing_compat import (
AdapterMap,
@@ -20,9 +18,9 @@ from openllm_core._typing_compat import (
from openllm_core.exceptions import MissingDependencyError
from openllm_core.utils import (
DEBUG,
ENV_VARS_TRUE_VALUES,
ReprMixin,
apply,
check_bool_env,
codegen,
first_not_none,
flatten_attrs,
@@ -142,31 +140,33 @@ class LLM(t.Generic[M, T], ReprMixin):
# NOTE: If you are here to see how generate_iterator and generate works, see above.
# The below are mainly for internal implementation that you don't have to worry about.
# fmt: off
_model_id:str
_revision:t.Optional[str]
_quantization_config:t.Optional[t.Union[transformers.BitsAndBytesConfig,transformers.GPTQConfig,transformers.AwqConfig]]
_model_id: str
_revision: t.Optional[str]
_quantization_config: t.Optional[
t.Union[transformers.BitsAndBytesConfig, transformers.GPTQConfig, transformers.AwqConfig]
]
_quantise: t.Optional[LiteralQuantise]
_model_decls:TupleAny
__model_attrs:DictStrAny
__tokenizer_attrs:DictStrAny
_tag:bentoml.Tag
_adapter_map:t.Optional[AdapterMap]
_serialisation:LiteralSerialisation
_local:bool
_max_model_len:t.Optional[int]
_model_decls: TupleAny
__model_attrs: DictStrAny
__tokenizer_attrs: DictStrAny
_tag: bentoml.Tag
_adapter_map: t.Optional[AdapterMap]
_serialisation: LiteralSerialisation
_local: bool
_max_model_len: t.Optional[int]
__llm_dtype__: t.Union[LiteralDtype,t.Literal['auto', 'half', 'float']]='auto'
__llm_torch_dtype__:'torch.dtype'=None
__llm_config__:t.Optional[LLMConfig]=None
__llm_backend__:LiteralBackend=None
__llm_quantization_config__:t.Optional[t.Union[transformers.BitsAndBytesConfig,transformers.GPTQConfig,transformers.AwqConfig]]=None
__llm_runner__:t.Optional[Runner[M, T]]=None
__llm_model__:t.Optional[M]=None
__llm_tokenizer__:t.Optional[T]=None
__llm_adapter_map__:t.Optional[ResolvedAdapterMap]=None
__llm_trust_remote_code__:bool=False
__llm_dtype__: t.Union[LiteralDtype, t.Literal['auto', 'half', 'float']] = 'auto'
__llm_torch_dtype__: 'torch.dtype' = None
__llm_config__: t.Optional[LLMConfig] = None
__llm_backend__: LiteralBackend = None
__llm_quantization_config__: t.Optional[
t.Union[transformers.BitsAndBytesConfig, transformers.GPTQConfig, transformers.AwqConfig]
] = None
__llm_runner__: t.Optional[Runner[M, T]] = None
__llm_model__: t.Optional[M] = None
__llm_tokenizer__: t.Optional[T] = None
__llm_adapter_map__: t.Optional[ResolvedAdapterMap] = None
__llm_trust_remote_code__: bool = False
def __init__(
self,
@@ -188,26 +188,34 @@ class LLM(t.Generic[M, T], ReprMixin):
_eager=True,
**attrs,
):
torch_dtype=attrs.pop('torch_dtype',None) # backward compatible
if torch_dtype is not None:warnings.warns('The argument "torch_dtype" is deprecated and will be removed in the future. Please use "dtype" instead.',DeprecationWarning,stacklevel=3);dtype=torch_dtype
torch_dtype = attrs.pop('torch_dtype', None) # backward compatible
if torch_dtype is not None:
warnings.warn(
'The argument "torch_dtype" is deprecated and will be removed in the future. Please use "dtype" instead.',
DeprecationWarning,
stacklevel=3,
)
dtype = torch_dtype
_local = False
if validate_is_path(model_id):model_id,_local=resolve_filepath(model_id),True
backend=first_not_none(getenv('backend',default=backend),default=self._cascade_backend())
dtype=first_not_none(getenv('dtype',default=dtype,var=['TORCH_DTYPE']),default='auto')
quantize=first_not_none(getenv('quantize',default=quantize,var=['QUANITSE']),default=None)
attrs.update({'low_cpu_mem_usage':low_cpu_mem_usage})
if validate_is_path(model_id):
model_id, _local = resolve_filepath(model_id), True
backend = first_not_none(getenv('backend', default=backend), default=self._cascade_backend())
dtype = first_not_none(getenv('dtype', default=dtype, var=['TORCH_DTYPE']), default='auto')
quantize = first_not_none(getenv('quantize', default=quantize, var=['QUANITSE']), default=None)
attrs.update({'low_cpu_mem_usage': low_cpu_mem_usage})
# parsing tokenizer and model kwargs, as the hierarchy is param pass > default
model_attrs,tokenizer_attrs=flatten_attrs(**attrs)
model_attrs, tokenizer_attrs = flatten_attrs(**attrs)
if model_tag is None:
model_tag,model_version=self._make_tag_components(model_id,model_version,backend=backend)
if model_version:model_tag=f'{model_tag}:{model_version}'
model_tag, model_version = self._make_tag_components(model_id, model_version, backend=backend)
if model_version:
model_tag = f'{model_tag}:{model_version}'
self.__attrs_init__(
model_id=model_id,
revision=model_version,
tag=bentoml.Tag.from_taglike(model_tag),
quantization_config=quantization_config,
quantise=getattr(self._Quantise,backend)(self,quantize),
quantise=getattr(self._Quantise, backend)(self, quantize),
model_decls=args,
adapter_map=convert_peft_config_type(adapter_map) if adapter_map is not None else None,
serialisation=serialisation,
@@ -220,143 +228,248 @@ class LLM(t.Generic[M, T], ReprMixin):
llm_config__=llm_config,
llm_trust_remote_code__=trust_remote_code,
)
if _eager:
try:
model=bentoml.models.get(self.tag)
model = bentoml.models.get(self.tag)
except bentoml.exceptions.NotFound:
model=openllm.serialisation.import_model(self,trust_remote_code=self.trust_remote_code)
model = openllm.serialisation.import_model(self, trust_remote_code=self.trust_remote_code)
# resolve the tag
self._tag=model.tag
if not _eager and embedded:raise RuntimeError("Embedded mode is not supported when '_eager' is False.")
if embedded:logger.warning('Models will be loaded into memory. NOT RECOMMENDED in production and SHOULD ONLY used for development.');self.runner.init_local(quiet=True)
self._tag = model.tag
if not _eager and embedded:
raise RuntimeError("Embedded mode is not supported when '_eager' is False.")
if embedded:
logger.warning(
'NOT RECOMMENDED in production and SHOULD ONLY be used for development (loading into current memory).'
)
self.runner.init_local(quiet=True)
class _Quantise:
@staticmethod
def pt(llm:LLM,quantise=None):return quantise
@staticmethod
def vllm(llm:LLM,quantise=None):return quantise
@staticmethod
def ctranslate(llm:LLM,quantise=None):
if quantise in {'int4','awq','gptq','squeezellm'}:raise ValueError(f"Quantisation '{quantise}' is not supported for backend 'ctranslate'")
if quantise=='int8':quantise='int8_float16' if llm._has_gpus else 'int8_float32'
def pt(llm: LLM, quantise=None):
return quantise
@apply(lambda val:tuple(str.lower(i) if i else i for i in val))
def _make_tag_components(self,model_id:str,model_version:str|None,backend:str)->tuple[str,str|None]:
model_id,*maybe_revision=model_id.rsplit(':')
if len(maybe_revision)>0:
if model_version is not None:logger.warning("revision is specified within 'model_id' (%s), and 'model_version=%s' will be ignored.",maybe_revision[0],model_version)
@staticmethod
def vllm(llm: LLM, quantise=None):
return quantise
@staticmethod
def ctranslate(llm: LLM, quantise=None):
if quantise in {'int4', 'awq', 'gptq', 'squeezellm'}:
raise ValueError(f"Quantisation '{quantise}' is not supported for backend 'ctranslate'")
if quantise == 'int8':
quantise = 'int8_float16' if llm._has_gpus else 'int8_float32'
return quantise
@apply(lambda val: tuple(str.lower(i) if i else i for i in val))
def _make_tag_components(self, model_id: str, model_version: str | None, backend: str) -> tuple[str, str | None]:
model_id, *maybe_revision = model_id.rsplit(':')
if len(maybe_revision) > 0:
if model_version is not None:
logger.warning(
"revision is specified (%s). 'model_version=%s' will be ignored.", maybe_revision[0], model_version
)
model_version = maybe_revision[0]
if validate_is_path(model_id):model_id,model_version=resolve_filepath(model_id),first_not_none(model_version,default=generate_hash_from_file(model_id))
return f'{backend}-{normalise_model_name(model_id)}',model_version
if validate_is_path(model_id):
model_id, model_version = (
resolve_filepath(model_id),
first_not_none(model_version, default=generate_hash_from_file(model_id)),
)
return f'{backend}-{normalise_model_name(model_id)}', model_version
@functools.cached_property
def _has_gpus(self):
try:
from cuda import cuda
err,*_=cuda.cuInit(0)
if err!=cuda.CUresult.CUDA_SUCCESS:raise RuntimeError('Failed to initialise CUDA runtime binding.')
err,num_gpus=cuda.cuDeviceGetCount()
if err!=cuda.CUresult.CUDA_SUCCESS:raise RuntimeError('Failed to get CUDA device count.')
err, *_ = cuda.cuInit(0)
if err != cuda.CUresult.CUDA_SUCCESS:
raise RuntimeError('Failed to initialise CUDA runtime binding.')
err, num_gpus = cuda.cuDeviceGetCount()
if err != cuda.CUresult.CUDA_SUCCESS:
raise RuntimeError('Failed to get CUDA device count.')
return True
except (ImportError, RuntimeError):return False
except (ImportError, RuntimeError):
return False
@property
def _torch_dtype(self):
import torch, transformers
_map=_torch_dtype_mapping()
if not isinstance(self.__llm_torch_dtype__,torch.dtype):
try:hf_config=transformers.AutoConfig.from_pretrained(self.bentomodel.path,trust_remote_code=self.trust_remote_code)
except OpenLLMException:hf_config=transformers.AutoConfig.from_pretrained(self.model_id,trust_remote_code=self.trust_remote_code)
config_dtype=getattr(hf_config,'torch_dtype',None)
if config_dtype is None:config_dtype=torch.float32
if self.__llm_dtype__=='auto':
if config_dtype==torch.float32:torch_dtype=torch.float16
else:torch_dtype=config_dtype
_map = _torch_dtype_mapping()
if not isinstance(self.__llm_torch_dtype__, torch.dtype):
try:
hf_config = transformers.AutoConfig.from_pretrained(
self.bentomodel.path, trust_remote_code=self.trust_remote_code
)
except OpenLLMException:
hf_config = transformers.AutoConfig.from_pretrained(self.model_id, trust_remote_code=self.trust_remote_code)
config_dtype = getattr(hf_config, 'torch_dtype', None)
if config_dtype is None:
config_dtype = torch.float32
if self.__llm_dtype__ == 'auto':
if config_dtype == torch.float32:
torch_dtype = torch.float16
else:
torch_dtype = config_dtype
else:
if self.__llm_dtype__ not in _map:raise ValueError(f"Unknown dtype '{self.__llm_dtype__}'")
torch_dtype=_map[self.__llm_dtype__]
self.__llm_torch_dtype__=torch_dtype
if self.__llm_dtype__ not in _map:
raise ValueError(f"Unknown dtype '{self.__llm_dtype__}'")
torch_dtype = _map[self.__llm_dtype__]
self.__llm_torch_dtype__ = torch_dtype
return self.__llm_torch_dtype__
@property
def _model_attrs(self):return {**self.import_kwargs[0],**self.__model_attrs}
def _model_attrs(self):
return {**self.import_kwargs[0], **self.__model_attrs}
@_model_attrs.setter
def _model_attrs(self, value):self.__model_attrs = value
def _model_attrs(self, value):
self.__model_attrs = value
@property
def _tokenizer_attrs(self):return {**self.import_kwargs[1],**self.__tokenizer_attrs}
def _cascade_backend(self)->LiteralBackend:
def _tokenizer_attrs(self):
return {**self.import_kwargs[1], **self.__tokenizer_attrs}
def _cascade_backend(self) -> LiteralBackend:
if self._has_gpus:
if is_vllm_available():return 'vllm'
elif is_ctranslate_available():return 'ctranslate' # XXX: base OpenLLM image should always include vLLM
elif is_ctranslate_available():return 'ctranslate'
else:return 'pt'
def __setattr__(self,attr,value):
if attr in {'model', 'tokenizer', 'runner', 'import_kwargs'}:raise ForbiddenAttributeError(f'{attr} should not be set during runtime.')
if is_vllm_available():
return 'vllm'
elif is_ctranslate_available():
return 'ctranslate' # XXX: base OpenLLM image should always include vLLM
elif is_ctranslate_available():
return 'ctranslate'
else:
return 'pt'
def __setattr__(self, attr, value):
if attr in {'model', 'tokenizer', 'runner', 'import_kwargs'}:
raise ForbiddenAttributeError(f'{attr} should not be set during runtime.')
super().__setattr__(attr, value)
def __del__(self):del self.__llm_model__,self.__llm_tokenizer__,self.__llm_adapter_map__
def __del__(self):
try:
del self.__llm_model__, self.__llm_tokenizer__, self.__llm_adapter_map__
except AttributeError:
pass
@property
def __repr_keys__(self):return {'model_id','revision','backend','type'}
def __repr_keys__(self):
return {'model_id', 'revision', 'backend', 'type'}
def __repr_args__(self):
yield 'model_id',self._model_id if not self._local else self.tag.name
yield 'revision',self._revision if self._revision else self.tag.version
yield 'backend',self.__llm_backend__
yield 'type',self.llm_type
yield 'model_id', self._model_id if not self._local else self.tag.name
yield 'revision', self._revision if self._revision else self.tag.version
yield 'backend', self.__llm_backend__
yield 'type', self.llm_type
@property
def import_kwargs(self):return {'device_map':'auto' if self._has_gpus else None,'torch_dtype':self._torch_dtype},{'padding_side':'left','truncation_side':'left'}
def import_kwargs(self):
return {'device_map': 'auto' if self._has_gpus else None, 'torch_dtype': self._torch_dtype}, {
'padding_side': 'left',
'truncation_side': 'left',
}
@property
def trust_remote_code(self):
env=os.getenv('TRUST_REMOTE_CODE')
if env is not None:return str(env).upper() in ENV_VARS_TRUE_VALUES
env = os.getenv('TRUST_REMOTE_CODE')
if env is not None:
return check_bool_env('TRUST_REMOTE_CODE', env)
return self.__llm_trust_remote_code__
@property
def model_id(self):return self._model_id
def model_id(self):
return self._model_id
@property
def revision(self):return self._revision
def revision(self):
return self._revision
@property
def tag(self):return self._tag
def tag(self):
return self._tag
@property
def bentomodel(self):return openllm.serialisation.get(self)
def bentomodel(self):
return openllm.serialisation.get(self)
@property
def quantization_config(self):
if self.__llm_quantization_config__ is None:
from ._quantisation import infer_quantisation_config
if self._quantization_config is not None:self.__llm_quantization_config__ = self._quantization_config
elif self._quantise is not None:self.__llm_quantization_config__, self._model_attrs = infer_quantisation_config(self,self._quantise,**self._model_attrs)
else:raise ValueError("Either 'quantization_config' or 'quantise' must be specified.")
if self._quantization_config is not None:
self.__llm_quantization_config__ = self._quantization_config
elif self._quantise is not None:
self.__llm_quantization_config__, self._model_attrs = infer_quantisation_config(
self, self._quantise, **self._model_attrs
)
else:
raise ValueError("Either 'quantization_config' or 'quantise' must be specified.")
return self.__llm_quantization_config__
@property
def has_adapters(self):return self._adapter_map is not None
def has_adapters(self):
return self._adapter_map is not None
@property
def local(self):return self._local
def local(self):
return self._local
@property
def quantise(self):return self._quantise
def quantise(self):
return self._quantise
@property
def llm_type(self):return normalise_model_name(self._model_id)
def llm_type(self):
return normalise_model_name(self._model_id)
@property
def llm_parameters(self):return (self._model_decls,self._model_attrs),self._tokenizer_attrs
def llm_parameters(self):
return (self._model_decls, self._model_attrs), self._tokenizer_attrs
@property
def identifying_params(self):return {'configuration':self.config.model_dump_json().decode(),'model_ids':orjson.dumps(self.config['model_ids']).decode(),'model_id':self.model_id}
def identifying_params(self):
return {
'configuration': self.config.model_dump_json().decode(),
'model_ids': orjson.dumps(self.config['model_ids']).decode(),
'model_id': self.model_id,
}
@property
def tokenizer(self):
if self.__llm_tokenizer__ is None:self.__llm_tokenizer__=openllm.serialisation.load_tokenizer(self, **self.llm_parameters[-1])
if self.__llm_tokenizer__ is None:
self.__llm_tokenizer__ = openllm.serialisation.load_tokenizer(self, **self.llm_parameters[-1])
return self.__llm_tokenizer__
@property
def runner(self):
from ._runners import runner
if self.__llm_runner__ is None:self.__llm_runner__=runner(self)
if self.__llm_runner__ is None:
self.__llm_runner__ = runner(self)
return self.__llm_runner__
def prepare(self,adapter_type='lora',use_gradient_checking=True,**attrs):
if self.__llm_backend__!='pt':raise RuntimeError('Fine tuning is only supported for PyTorch backend.')
def prepare(self, adapter_type='lora', use_gradient_checking=True, **attrs):
if self.__llm_backend__ != 'pt':
raise RuntimeError('Fine tuning is only supported for PyTorch backend.')
from peft.mapping import get_peft_model
from peft.utils.other import prepare_model_for_kbit_training
model=get_peft_model(
prepare_model_for_kbit_training(self.model,use_gradient_checkpointing=use_gradient_checking),
model = get_peft_model(
prepare_model_for_kbit_training(self.model, use_gradient_checkpointing=use_gradient_checking),
self.config['fine_tune_strategies']
.get(adapter_type,self.config.make_fine_tune_config(adapter_type))
.get(adapter_type, self.config.make_fine_tune_config(adapter_type))
.train()
.with_config(**attrs)
.build(),
)
if DEBUG:model.print_trainable_parameters()
return model,self.tokenizer
def prepare_for_training(self,*args,**attrs):logger.warning('`prepare_for_training` is deprecated and will be removed in the future. Please use `prepare` instead.');return self.prepare(*args,**attrs)
if DEBUG:
model.print_trainable_parameters()
return model, self.tokenizer
def prepare_for_training(self, *args, **attrs):
logger.warning('`prepare_for_training` is deprecated and will be removed in the future. Use `prepare` instead.')
return self.prepare(*args, **attrs)
@property
def adapter_map(self):
@@ -431,33 +544,49 @@ class LLM(t.Generic[M, T], ReprMixin):
return self.__llm_config__
# fmt: off
@functools.lru_cache(maxsize=1)
def _torch_dtype_mapping()->dict[str,torch.dtype]:
import torch; return {
def _torch_dtype_mapping() -> dict[str, torch.dtype]:
import torch
return {
'half': torch.float16,
'float': torch.float32,
'float16': torch.float16,
'float32': torch.float32,
'bfloat16': torch.bfloat16,
}
def normalise_model_name(name:str)->str:return os.path.basename(resolve_filepath(name)) if validate_is_path(name) else inflection.dasherize(name.replace('/','--'))
def convert_peft_config_type(adapter_map:dict[str, str])->AdapterMap:
if not is_peft_available():raise RuntimeError("LoRA adapter requires 'peft' to be installed. Make sure to do 'pip install \"openllm[fine-tune]\"'")
def normalise_model_name(name: str) -> str:
return (
os.path.basename(resolve_filepath(name))
if validate_is_path(name)
else inflection.dasherize(name.replace('/', '--'))
)
def convert_peft_config_type(adapter_map: dict[str, str]) -> AdapterMap:
if not is_peft_available():
raise RuntimeError(
"LoRA adapter requires 'peft' to be installed. Make sure to do 'pip install \"openllm[fine-tune]\"'"
)
from huggingface_hub import hf_hub_download
resolved:AdapterMap={}
resolved: AdapterMap = {}
for path_or_adapter_id, name in adapter_map.items():
if name is None:raise ValueError('Adapter name must be specified.')
if name is None:
raise ValueError('Adapter name must be specified.')
if os.path.isfile(os.path.join(path_or_adapter_id, PEFT_CONFIG_NAME)):
config_file=os.path.join(path_or_adapter_id, PEFT_CONFIG_NAME)
config_file = os.path.join(path_or_adapter_id, PEFT_CONFIG_NAME)
else:
try:
config_file=hf_hub_download(path_or_adapter_id, PEFT_CONFIG_NAME)
config_file = hf_hub_download(path_or_adapter_id, PEFT_CONFIG_NAME)
except Exception as err:
raise ValueError(f"Can't find '{PEFT_CONFIG_NAME}' at '{path_or_adapter_id}'") from err
with open(config_file, 'r') as file:resolved_config=orjson.loads(file.read())
_peft_type=resolved_config['peft_type'].lower()
if _peft_type not in resolved:resolved[_peft_type]=()
resolved[_peft_type]+=(_AdapterTuple((path_or_adapter_id, name, resolved_config)),)
with open(config_file, 'r') as file:
resolved_config = orjson.loads(file.read())
_peft_type = resolved_config['peft_type'].lower()
if _peft_type not in resolved:
resolved[_peft_type] = ()
resolved[_peft_type] += (_AdapterTuple((path_or_adapter_id, name, resolved_config)),)
return resolved
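
For reference, `convert_peft_config_type` takes a plain mapping of adapter path or Hugging Face Hub ID to adapter name and regroups it by the adapter's `peft_type`. A minimal sketch of the expected input shape, with a hypothetical adapter ID and name:

```python
# Hypothetical adapter ID and name, shown only to illustrate the input shape.
adapter_map = {'someuser/opt-125m-lora': 'default'}
# convert_peft_config_type(adapter_map) locates each adapter_config.json (locally or via
# hf_hub_download), reads its 'peft_type', and groups the entries, returning roughly:
#   {'lora': (_AdapterTuple(('someuser/opt-125m-lora', 'default', {...})),)}
```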

View File

@@ -1,2 +1,9 @@
# fmt: off
import os,orjson,openllm_core.utils as coreutils;model_id,model_tag,adapter_map,serialization,trust_remote_code=os.environ['OPENLLM_MODEL_ID'],None,orjson.loads(os.getenv('OPENLLM_ADAPTER_MAP',orjson.dumps(None))),os.getenv('OPENLLM_SERIALIZATION',default='safetensors'),coreutils.check_bool_env('TRUST_REMOTE_CODE',False)
import os, orjson, openllm_core.utils as coreutils
model_id, model_tag, adapter_map, serialization, trust_remote_code = (
os.environ['OPENLLM_MODEL_ID'],
None,
orjson.loads(os.getenv('OPENLLM_ADAPTER_MAP', orjson.dumps(None))),
os.getenv('OPENLLM_SERIALIZATION', default='safetensors'),
coreutils.check_bool_env('TRUST_REMOTE_CODE', False),
)
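
These module-level variables form the environment contract that the generated service reads at import time. A hedged sketch of configuring that environment before the module is imported; the model ID below is hypothetical:

```python
# Hedged sketch of the environment this service module expects (hypothetical model ID).
import os, orjson
os.environ['OPENLLM_MODEL_ID'] = 'facebook/opt-125m'             # required
os.environ['OPENLLM_ADAPTER_MAP'] = orjson.dumps(None).decode()  # optional, JSON-encoded
os.environ['OPENLLM_SERIALIZATION'] = 'safetensors'              # or 'legacy'
os.environ['TRUST_REMOTE_CODE'] = 'False'                        # parsed via check_bool_env
```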

View File

@@ -215,18 +215,18 @@ def _make_resource_class(name: str, resource_kind: str, docstring: str) -> type[
NvidiaGpuResource = _make_resource_class(
'NvidiaGpuResource',
'nvidia.com/gpu',
"""NVIDIA GPU resource.
'''NVIDIA GPU resource.
This is a modified version of BentoML's internal NvidiaGpuResource
where it respects and parse CUDA_VISIBLE_DEVICES correctly.""",
where it respects and parses CUDA_VISIBLE_DEVICES correctly.''',
)
AmdGpuResource = _make_resource_class(
'AmdGpuResource',
'amd.com/gpu',
"""AMD GPU resource.
'''AMD GPU resource.
Since ROCm will respect CUDA_VISIBLE_DEVICES, the behaviour of from_spec and from_system is similar to
``NvidiaGpuResource``. Currently ``validate`` is not yet supported.""",
``NvidiaGpuResource``. Currently ``validate`` is not yet supported.''',
)

View File

@@ -19,10 +19,10 @@ class CascadingResourceStrategy:
resource_request: Optional[Dict[str, Any]],
workers_per_resource: float,
) -> int:
"""Return the number of workers to be used for the given runnable class.
'''Return the number of workers to be used for the given runnable class.
Note that for all available GPUs, the number of workers will always be 1.
"""
'''
@classmethod
def get_worker_env(
cls,
@@ -31,16 +31,16 @@ class CascadingResourceStrategy:
workers_per_resource: Union[int, float],
worker_index: int,
) -> Dict[str, Any]:
"""Get worker env for this given worker_index.
'''Get worker env for this given worker_index.
Args:
runnable_class: The runnable class to be run.
resource_request: The resource request of the runnable.
workers_per_resource: # of workers per resource.
worker_index: The index of the worker, start from 0.
"""
'''
@staticmethod
def transpile_workers_to_cuda_envvar(
workers_per_resource: Union[float, int], gpus: List[str], worker_index: int
) -> str:
"""Convert given workers_per_resource to correct CUDA_VISIBLE_DEVICES string."""
'''Convert given workers_per_resource to correct CUDA_VISIBLE_DEVICES string.'''

View File

@@ -22,7 +22,7 @@ OPENLLM_DEV_BUILD = 'OPENLLM_DEV_BUILD'
def build_editable(path, package='openllm'):
"""Build OpenLLM if the OPENLLM_DEV_BUILD environment variable is set."""
'''Build OpenLLM if the OPENLLM_DEV_BUILD environment variable is set.'''
if not check_bool_env(OPENLLM_DEV_BUILD, default=False):
return None
# We need to build the package in editable mode, so that we can import it

View File

@@ -1,12 +1,10 @@
# syntax=docker/dockerfile-upstream:master
# Adapted from: https://github.com/pytorch/pytorch/blob/master/Dockerfile
FROM python:3.9-slim-bullseye as base-container
FROM nvidia/cuda:12.1.0-base-ubuntu22.04 as base-container
# Automatically set by buildx
ARG TARGETPLATFORM
ENV PATH /opt/conda/bin:$PATH
ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
@@ -15,23 +13,32 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins
ccache \
curl \
libssl-dev ca-certificates make \
git && \
git python3-pip && \
rm -rf /var/lib/apt/lists/*
RUN mkdir -p /openllm-python
RUN mkdir -p /openllm-core
RUN mkdir -p /openllm-client
# Install required dependencies
COPY openllm-python/src src
COPY hatch.toml README.md CHANGELOG.md openllm-python/pyproject.toml ./
COPY openllm-python/src /openllm-python/src
COPY hatch.toml README.md CHANGELOG.md openllm-python/pyproject.toml /openllm-python/
# Install all required dependencies
# We have to install autoawq first to avoid conflict with torch, then reinstall torch with vllm
# below
# pip install autoawq --no-cache-dir && \
RUN --mount=type=cache,target=/root/.cache/pip \
pip install --extra-index-url "https://huggingface.github.io/autogptq-index/whl/cu118/" \
-v --no-cache-dir \
pip3 install -v --no-cache-dir \
"ray==2.6.0" "vllm==0.2.2" xformers && \
pip install --no-cache-dir -e .
pip3 install --no-cache-dir -e .
COPY openllm-core/src openllm-core/src
COPY hatch.toml README.md CHANGELOG.md openllm-core/pyproject.toml /openllm-core/
RUN --mount=type=cache,target=/root/.cache/pip pip3 install -v --no-cache-dir -e /openllm-core/
COPY openllm-client/src openllm-client/src
COPY hatch.toml README.md CHANGELOG.md openllm-client/pyproject.toml /openllm-client/
RUN --mount=type=cache,target=/root/.cache/pip pip3 install -v --no-cache-dir -e /openllm-client/
FROM base-container

View File

@@ -50,13 +50,17 @@ class RefResolver:
else:
raise ValueError(f'Unknown strategy: {strategy_or_version}')
# fmt: off
@property
def tag(self):return 'latest' if self.strategy in {'latest','nightly'} else repr(self.version)
def tag(self):
return 'latest' if self.strategy in {'latest', 'nightly'} else repr(self.version)
@staticmethod
def construct_base_image(reg,strategy=None):
if reg == 'gh': logger.warning("Setting base registry to 'gh' will affect cold start performance on GCP/AWS.")
elif reg == 'docker': logger.warning('docker is base image is yet to be supported. Falling back to "ecr".'); reg = 'ecr'
def construct_base_image(reg, strategy=None):
if reg == 'gh':
logger.warning("Setting base registry to 'gh' will affect cold start performance on GCP/AWS.")
elif reg == 'docker':
logger.warning('Docker base image is not yet supported. Falling back to "ecr".')
reg = 'ecr'
return f'{_CONTAINER_REGISTRY[reg]}:{RefResolver.from_strategy(strategy).tag}'

View File

@@ -1,3 +1,10 @@
# fmt: off
def __dir__():import openllm_client as _client;return sorted(dir(_client))
def __getattr__(it):import openllm_client as _client;return getattr(_client, it)
def __dir__():
import openllm_client as _client
return sorted(dir(_client))
def __getattr__(it):
import openllm_client as _client
return getattr(_client, it)

View File

@@ -1,9 +1,9 @@
"""OpenLLM Python client.
'''OpenLLM Python client.
```python
client = openllm.client.HTTPClient("http://localhost:8080")
client.query("What is the difference between gather and scatter?")
```
"""
'''
from openllm_client import AsyncHTTPClient as AsyncHTTPClient, HTTPClient as HTTPClient

View File

@@ -1,11 +1,11 @@
"""Entrypoint for all third-party apps.
'''Entrypoint for all third-party apps.
Currently support OpenAI, Cohere compatible API.
Each module should implement the following API:
- `mount_to_svc(svc: bentoml.Service, llm: openllm.LLM[M, T]) -> bentoml.Service: ...`
"""
'''
from bentoml import Service
from openllm_core._typing_compat import M, T

View File

@@ -11,7 +11,7 @@ from openllm_core.utils import first_not_none
OPENAPI_VERSION, API_VERSION = '3.0.2', '1.0'
# NOTE: OpenAI schema
LIST_MODELS_SCHEMA = """\
LIST_MODELS_SCHEMA = '''\
---
consumes:
- application/json
@@ -41,8 +41,8 @@ responses:
owned_by: 'na'
schema:
$ref: '#/components/schemas/ModelList'
"""
CHAT_COMPLETIONS_SCHEMA = """\
'''
CHAT_COMPLETIONS_SCHEMA = '''\
---
consumes:
- application/json
@@ -179,8 +179,8 @@ responses:
}
}
description: Bad Request
"""
COMPLETIONS_SCHEMA = """\
'''
COMPLETIONS_SCHEMA = '''\
---
consumes:
- application/json
@@ -332,8 +332,8 @@ responses:
}
}
description: Bad Request
"""
HF_AGENT_SCHEMA = """\
'''
HF_AGENT_SCHEMA = '''\
---
consumes:
- application/json
@@ -377,8 +377,8 @@ responses:
schema:
$ref: '#/components/schemas/HFErrorResponse'
description: Not Found
"""
HF_ADAPTERS_SCHEMA = """\
'''
HF_ADAPTERS_SCHEMA = '''\
---
consumes:
- application/json
@@ -408,8 +408,8 @@ responses:
schema:
$ref: '#/components/schemas/HFErrorResponse'
description: Not Found
"""
COHERE_GENERATE_SCHEMA = """\
'''
COHERE_GENERATE_SCHEMA = '''\
---
consumes:
- application/json
@@ -453,8 +453,8 @@ requestBody:
stop_sequences:
- "\\n"
- "<|endoftext|>"
"""
COHERE_CHAT_SCHEMA = """\
'''
COHERE_CHAT_SCHEMA = '''\
---
consumes:
- application/json
@@ -467,7 +467,7 @@ tags:
- Cohere
x-bentoml-name: cohere_chat
summary: Creates a model response for the given chat conversation.
"""
'''
_SCHEMAS = {k[:-7].lower(): v for k, v in locals().items() if k.endswith('_SCHEMA')}
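
The comprehension above derives each schema key by stripping the seven-character `_SCHEMA` suffix and lowercasing what remains. A small self-contained illustration:

```python
# Key derivation used for _SCHEMAS: drop the '_SCHEMA' suffix (7 characters), then lowercase.
names = ['LIST_MODELS_SCHEMA', 'CHAT_COMPLETIONS_SCHEMA', 'COHERE_CHAT_SCHEMA']
assert [k[:-7].lower() for k in names] == ['list_models', 'chat_completions', 'cohere_chat']
```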

View File

@@ -14,11 +14,11 @@ P = ParamSpec('P')
def load_tokenizer(llm: LLM[M, T], **tokenizer_attrs: t.Any) -> TypeGuard[T]:
"""Load the tokenizer from BentoML store.
'''Load the tokenizer from BentoML store.
By default, it will try to find the bentomodel in the local store.
If the model is not found, it will raise ``bentoml.exceptions.NotFound``.
"""
'''
import cloudpickle
import fs
from transformers import AutoTokenizer

View File

@@ -1,9 +1,9 @@
"""Serialisation utilities for OpenLLM.
'''Serialisation utilities for OpenLLM.
Currently supports transformers for PyTorch and vLLM.
Support for the GGML format is a work in progress.
"""
'''
from typing import Any

View File

@@ -1,7 +1,8 @@
# fmt: off
import functools, importlib.metadata, openllm_core
__all__ = ['generate_labels', 'available_devices', 'device_count']
def generate_labels(llm):
return {
'backend': llm.__llm_backend__,
@@ -11,10 +12,25 @@ def generate_labels(llm):
'serialisation': llm._serialisation,
**{package: importlib.metadata.version(package) for package in {'openllm', 'openllm-core', 'openllm-client'}},
}
def available_devices():from ._strategies import NvidiaGpuResource;return tuple(NvidiaGpuResource.from_system())
def available_devices():
from ._strategies import NvidiaGpuResource
return tuple(NvidiaGpuResource.from_system())
@functools.lru_cache(maxsize=1)
def device_count()->int:return len(available_devices())
def __dir__():coreutils=set(dir(openllm_core.utils))|set([it for it in openllm_core.utils._extras if not it.startswith('_')]);return sorted(__all__)+sorted(list(coreutils))
def device_count() -> int:
return len(available_devices())
def __dir__():
coreutils = set(dir(openllm_core.utils)) | set([it for it in openllm_core.utils._extras if not it.startswith('_')])
return sorted(__all__) + sorted(list(coreutils))
def __getattr__(it):
if hasattr(openllm_core.utils, it):return getattr(openllm_core.utils, it)
if hasattr(openllm_core.utils, it):
return getattr(openllm_core.utils, it)
raise AttributeError(f'module {__name__} has no attribute {it}')

View File

@@ -1,4 +1,4 @@
"""OpenLLM CLI.
'''OpenLLM CLI.
For more information see ``openllm -h``.
"""
'''

View File

@@ -146,7 +146,7 @@ def start_decorator(serve_grpc: bool = False) -> t.Callable[[FC], t.Callable[[FC
backend_option(factory=cog.optgroup),
cog.optgroup.group(
'LLM Optimization Options',
help="""Optimization related options.
help='''Optimization related options.
OpenLLM supports running models with k-bit quantization (8-bit, 4-bit), GPTQ quantization, and PagedAttention via vLLM.
@@ -154,7 +154,7 @@ def start_decorator(serve_grpc: bool = False) -> t.Callable[[FC], t.Callable[[FC
- DeepSpeed Inference: [link](https://www.deepspeed.ai/inference/)
- GGML: Fast inference on [bare metal](https://github.com/ggerganov/ggml)
""",
''',
),
quantize_option(factory=cog.optgroup),
serialisation_option(factory=cog.optgroup),
@@ -196,7 +196,7 @@ _IGNORED_OPTIONS = {'working_dir', 'production', 'protocol_version'}
def parse_serve_args(serve_grpc: bool) -> t.Callable[[t.Callable[..., LLMConfig]], t.Callable[[FC], FC]]:
"""Parsing `bentoml serve|serve-grpc` click.Option to be parsed via `openllm start`."""
'''Parsing `bentoml serve|serve-grpc` click.Option to be parsed via `openllm start`.'''
from bentoml_cli.cli import cli
command = 'serve' if not serve_grpc else 'serve-grpc'
@@ -233,11 +233,11 @@ _http_server_args, _grpc_server_args = parse_serve_args(False), parse_serve_args
def _click_factory_type(*param_decls: t.Any, **attrs: t.Any) -> t.Callable[[FC | None], FC]:
"""General ``@click`` decorator with some sauce.
'''General ``@click`` decorator with some sauce.
This decorator extends the default ``@click.option`` plus a factory option and factory attr to
provide type-safe click.option or click.argument wrapper for all compatible factory.
"""
'''
factory = attrs.pop('factory', click)
factory_attr = attrs.pop('attr', 'option')
if factory_attr != 'argument':
@@ -346,7 +346,7 @@ def quantize_option(f: _AnyCallable | None = None, *, build: bool = False, **att
default=None,
envvar='OPENLLM_QUANTIZE',
show_envvar=True,
help="""Dynamic quantization for running this LLM.
help='''Dynamic quantization for running this LLM.
The following quantization strategies are supported:
@@ -361,15 +361,15 @@ def quantize_option(f: _AnyCallable | None = None, *, build: bool = False, **att
- ``squeezellm``: ``SqueezeLLM`` [SqueezeLLM: Dense-and-Sparse Quantization](https://arxiv.org/abs/2306.07629)
> [!NOTE] that the model can also be served with quantized weights.
"""
'''
+ (
"""
> [!NOTE] that this will set the mode for serving within deployment."""
'''
> [!NOTE] that this will set the mode for serving within deployment.'''
if build
else ''
)
+ """
> [!NOTE] that quantization are currently only available in *PyTorch* models.""",
+ '''
> [!NOTE] that quantization are currently only available in *PyTorch* models.''',
**attrs,
)(f)
@@ -383,7 +383,7 @@ def workers_per_resource_option(
callback=workers_per_resource_callback,
type=str,
required=False,
help="""Number of workers per resource assigned.
help='''Number of workers per resource assigned.
See https://docs.bentoml.org/en/latest/guides/scheduling.html#resource-scheduling-strategy
for more information. By default, this is set to 1.
@@ -393,7 +393,7 @@ def workers_per_resource_option(
- ``round_robin``: Similar behaviour when setting ``--workers-per-resource 1``. This is useful for smaller models.
- ``conserved``: This will determine the number of available GPU resources. For example, if there are 4 GPUs available, then ``conserved`` is equivalent to ``--workers-per-resource 0.25``.
"""
'''
+ (
"""\n
> [!NOTE] The workers value passed into 'build' will determine how the LLM can
@@ -416,7 +416,7 @@ def serialisation_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Cal
show_default=True,
show_envvar=True,
envvar='OPENLLM_SERIALIZATION',
help="""Serialisation format for save/load LLM.
help='''Serialisation format for save/load LLM.
Currently the following strategies are supported:
@@ -425,7 +425,7 @@ def serialisation_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Cal
> [!NOTE] Safetensors might not work for every cases, and you can always fallback to ``legacy`` if needed.
- ``legacy``: This will use PyTorch serialisation format, often as ``.bin`` files. This should be used if the model doesn't yet support safetensors.
""",
''',
**attrs,
)(f)

View File

@@ -291,7 +291,7 @@ def _import_model(
def _list_models() -> dict[str, t.Any]:
"""List all available models within the local store."""
'''List all available models within the local store.'''
from .entrypoint import models_command
return models_command.main(args=['--quiet'], standalone_mode=False)

View File

@@ -94,14 +94,14 @@ else:
P = ParamSpec('P')
logger = logging.getLogger('openllm')
OPENLLM_FIGLET = """\
OPENLLM_FIGLET = '''\
██████╗ ██████╗ ███████╗███╗ ██╗██╗ ██╗ ███╗ ███╗
██╔═══██╗██╔══██╗██╔════╝████╗ ██║██║ ██║ ████╗ ████║
██║ ██║██████╔╝█████╗ ██╔██╗ ██║██║ ██║ ██╔████╔██║
██║ ██║██╔═══╝ ██╔══╝ ██║╚██╗██║██║ ██║ ██║╚██╔╝██║
╚██████╔╝██║ ███████╗██║ ╚████║███████╗███████╗██║ ╚═╝ ██║
╚═════╝ ╚═╝ ╚══════╝╚═╝ ╚═══╝╚══════╝╚══════╝╚═╝ ╚═╝
"""
'''
ServeCommand = t.Literal['serve', 'serve-grpc']
@@ -287,7 +287,7 @@ class OpenLLMCommandGroup(BentoMLCommandGroup):
return decorator
def format_commands(self, ctx: click.Context, formatter: click.HelpFormatter) -> None:
"""Additional format methods that include extensions as well as the default cli command."""
'''Additional format methods that include extensions as well as the default cli command.'''
from gettext import gettext as _
commands: list[tuple[str, click.Command]] = []
@@ -334,7 +334,7 @@ _PACKAGE_NAME = 'openllm'
message=f'{_PACKAGE_NAME}, %(version)s (compiled: {openllm.COMPILED})\nPython ({platform.python_implementation()}) {platform.python_version()}',
)
def cli() -> None:
"""\b
'''\b
██████╗ ██████╗ ███████╗███╗ ██╗██╗ ██╗ ███╗ ███╗
██╔═══██╗██╔══██╗██╔════╝████╗ ██║██║ ██║ ████╗ ████║
██║ ██║██████╔╝█████╗ ██╔██╗ ██║██║ ██║ ██╔████╔██║
@@ -345,7 +345,7 @@ def cli() -> None:
\b
An open platform for operating large language models in production.
Fine-tune, serve, deploy, and monitor any LLMs with ease.
"""
'''
@cli.command(
@@ -389,13 +389,13 @@ def start_command(
max_model_len: int | None,
**attrs: t.Any,
) -> LLMConfig | subprocess.Popen[bytes]:
"""Start any LLM as a REST server.
'''Start any LLM as a REST server.
\b
```bash
$ openllm <start|start-http> <model_id> --<options> ...
```
"""
'''
if model_id in openllm.CONFIG_MAPPING:
_model_name = model_id
if deprecated_model_id is not None:
@@ -519,13 +519,13 @@ def start_grpc_command(
max_model_len: int | None,
**attrs: t.Any,
) -> LLMConfig | subprocess.Popen[bytes]:
"""Start any LLM as a gRPC server.
'''Start any LLM as a gRPC server.
\b
```bash
$ openllm start-grpc <model_id> --<options> ...
```
"""
'''
termui.warning(
'Continuous batching is not yet supported with gRPC. If you want to use continuous batching with gRPC, feel free to open a GitHub issue about your use case.\n'
)
@@ -955,7 +955,7 @@ def build_command(
force_push: bool,
**_: t.Any,
) -> BuildBentoOutput:
"""Package a given models into a BentoLLM.
'''Package a given models into a BentoLLM.
\b
```bash
@@ -971,7 +971,7 @@ def build_command(
> [!IMPORTANT]
> To build the bento with compiled OpenLLM, make sure to prepend HATCH_BUILD_HOOKS_ENABLE=1. Make sure that the deployment
> target also uses the same Python version and architecture as the build machine.
"""
'''
from openllm.serialisation.transformers.weights import has_safetensors_weights
if model_id in openllm.CONFIG_MAPPING:
@@ -1167,13 +1167,13 @@ class ModelItem(t.TypedDict):
@cli.command()
@click.option('--show-available', is_flag=True, default=True, hidden=True)
def models_command(**_: t.Any) -> dict[t.LiteralString, ModelItem]:
"""List all supported models.
'''List all supported models.
\b
```bash
openllm models
```
"""
'''
result: dict[t.LiteralString, ModelItem] = {
m: ModelItem(
architecture=config.__openllm_architecture__,
@@ -1216,11 +1216,11 @@ def prune_command(
bento_store: BentoStore = Provide[BentoMLContainer.bento_store],
**_: t.Any,
) -> None:
"""Remove all saved models, and bentos built with OpenLLM locally.
'''Remove all saved models, and bentos built with OpenLLM locally.
\b
If a model type is passed, then only prune models for that given model type.
"""
'''
available: list[tuple[bentoml.Model | bentoml.Bento, ModelStore | BentoStore]] = [
(m, model_store)
for m in bentoml.models.list()
@@ -1326,13 +1326,13 @@ def query_command(
_memoized: DictStrAny,
**_: t.Any,
) -> None:
"""Query a LLM interactively, from a terminal.
'''Query a LLM interactively, from a terminal.
\b
```bash
$ openllm query --endpoint http://12.323.2.1:3000 "What is the meaning of life?"
```
"""
'''
if server_type == 'grpc':
raise click.ClickException("'grpc' is currently disabled.")
_memoized = {k: orjson.loads(v[0]) for k, v in _memoized.items() if v}
@@ -1353,7 +1353,7 @@ def query_command(
@cli.group(cls=Extensions, hidden=True, name='extension')
def extension_command() -> None:
"""Extension for OpenLLM CLI."""
'''Extension for OpenLLM CLI.'''
if __name__ == '__main__':

View File

@@ -71,7 +71,7 @@ def build_container(
@click.command(
'build_base_container',
context_settings=termui.CONTEXT_SETTINGS,
help="""Base image builder for BentoLLM.
help='''Base image builder for BentoLLM.
By default, the base image will include custom kernels (PagedAttention via vllm, FlashAttention-v2, etc.) built with CUDA 11.8, Python 3.9 on Ubuntu22.04.
Optionally, this can also be pushed directly to remote registry. Currently support ``docker.io``, ``ghcr.io`` and ``quay.io``.
@@ -81,7 +81,7 @@ def build_container(
This command is only useful for debugging and for building a custom base image to extend BentoML with custom kernels.
Note that we already release images on our CI to ECR and GHCR, so you don't need to build it yourself.
""",
''',
)
@container_registry_option
@click.option(

View File

@@ -24,7 +24,7 @@ if t.TYPE_CHECKING:
def cli(
ctx: click.Context, bento: str, machine: bool, _bento_store: BentoStore = Provide[BentoMLContainer.bento_store]
) -> str | None:
"""Dive into a BentoLLM. This is synonymous to cd $(b get <bento>:<tag> -o path)."""
'''Dive into a BentoLLM. This is synonymous with cd $(b get <bento>:<tag> -o path).'''
try:
bentomodel = _bento_store.get(bento)
except bentoml.exceptions.NotFound:

View File

@@ -13,7 +13,7 @@ from openllm_cli import termui
@click.command('list_bentos', context_settings=termui.CONTEXT_SETTINGS)
@click.pass_context
def cli(ctx: click.Context) -> None:
"""List available bentos built by OpenLLM."""
'''List available bentos built by OpenLLM.'''
mapping = {
k: [
{

View File

@@ -18,7 +18,7 @@ if t.TYPE_CHECKING:
@click.command('list_models', context_settings=termui.CONTEXT_SETTINGS)
@model_name_argument(required=False, shell_complete=model_complete_envvar)
def cli(model_name: str | None) -> DictStrAny:
"""This is equivalent to openllm models --show-available less the nice table."""
'''This is equivalent to openllm models --show-available less the nice table.'''
models = tuple(inflection.dasherize(key) for key in openllm.CONFIG_MAPPING.keys())
ids_in_local_store = {
k: [