mirror of
https://github.com/bentoml/OpenLLM.git
synced 2026-05-18 21:54:11 -04:00
fix(style): remove weird break on split item
Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
This commit is contained in:
@@ -34,7 +34,8 @@ def get_or_download(ids: str = _BENTOMODEL_ID) -> bentoml.Model:
|
||||
options=ModelOptions(),
|
||||
context=openllm.utils.generate_context(framework_name='transformers'),
|
||||
labels={
|
||||
'runtime': 'pt', 'framework': 'openllm'
|
||||
'runtime': 'pt',
|
||||
'framework': 'openllm'
|
||||
},
|
||||
signatures=model_signatures) as bentomodel:
|
||||
snapshot_download(_GENERIC_EMBEDDING_ID,
|
||||
|
||||
@@ -281,22 +281,10 @@ class LLM(LLMInterface[M, T], ReprMixin):
|
||||
if t.TYPE_CHECKING: __name__: str
|
||||
if t.TYPE_CHECKING and not MYPY:
|
||||
|
||||
def __attrs_init__(self,
|
||||
config: LLMConfig,
|
||||
quantization_config: t.Optional[t.Union[transformers.BitsAndBytesConfig, transformers.GPTQConfig]],
|
||||
quantize: t.Optional[LiteralQuantise],
|
||||
model_id: str,
|
||||
model_decls: TupleAny,
|
||||
model_attrs: DictStrAny,
|
||||
tokenizer_attrs: DictStrAny,
|
||||
tag: bentoml.Tag,
|
||||
adapters_mapping: t.Optional[AdaptersMapping],
|
||||
model_version: t.Optional[str],
|
||||
serialisation: LiteralSerialisation,
|
||||
_local: bool,
|
||||
prompt_template: PromptTemplate | None,
|
||||
system_message: str | None,
|
||||
**attrs: t.Any) -> None:
|
||||
def __attrs_init__(self, config: LLMConfig, quantization_config: t.Optional[t.Union[transformers.BitsAndBytesConfig,
|
||||
transformers.GPTQConfig]], quantize: t.Optional[LiteralQuantise], model_id: str, model_decls: TupleAny,
|
||||
model_attrs: DictStrAny, tokenizer_attrs: DictStrAny, tag: bentoml.Tag, adapters_mapping: t.Optional[AdaptersMapping], model_version: t.Optional[str],
|
||||
serialisation: LiteralSerialisation, _local: bool, prompt_template: PromptTemplate | None, system_message: str | None, **attrs: t.Any) -> None:
|
||||
'''Generated __attrs_init__ for openllm.LLM.'''
|
||||
|
||||
config: LLMConfig
|
||||
@@ -540,20 +528,9 @@ class LLM(LLMInterface[M, T], ReprMixin):
|
||||
def generate_tag(cls, *param_decls: t.Any, **attrs: t.Any) -> bentoml.Tag:
|
||||
return bentoml.Tag.from_taglike(cls._generate_tag_str(*param_decls, **attrs))
|
||||
|
||||
def __init__(self,
|
||||
*args: t.Any,
|
||||
model_id: str,
|
||||
llm_config: LLMConfig,
|
||||
quantization_config: transformers.BitsAndBytesConfig | transformers.GPTQConfig | None,
|
||||
_quantize: LiteralQuantise | None,
|
||||
_model_version: str,
|
||||
_tag: bentoml.Tag,
|
||||
_serialisation: LiteralSerialisation,
|
||||
_local: bool,
|
||||
_prompt_template: PromptTemplate | None,
|
||||
_system_message: str | None,
|
||||
_adapters_mapping: AdaptersMapping | None,
|
||||
**attrs: t.Any,
|
||||
def __init__(self, *args: t.Any, model_id: str, llm_config: LLMConfig, quantization_config: transformers.BitsAndBytesConfig | transformers.GPTQConfig | None,
|
||||
_quantize: LiteralQuantise | None, _model_version: str, _tag: bentoml.Tag, _serialisation: LiteralSerialisation, _local: bool, _prompt_template: PromptTemplate | None,
|
||||
_system_message: str | None, _adapters_mapping: AdaptersMapping | None, **attrs: t.Any,
|
||||
):
|
||||
'''Initialize the LLM with given pretrained model.
|
||||
|
||||
@@ -651,22 +628,13 @@ class LLM(LLMInterface[M, T], ReprMixin):
|
||||
# parsing tokenizer and model kwargs, as the hierachy is param pass > default
|
||||
normalized_model_kwds, normalized_tokenizer_kwds = normalize_attrs_to_model_tokenizer_pair(**attrs)
|
||||
# NOTE: Save the args and kwargs for latter load
|
||||
self.__attrs_init__(llm_config,
|
||||
quantization_config,
|
||||
_quantize,
|
||||
model_id,
|
||||
args, {
|
||||
**model_kwds, **normalized_model_kwds
|
||||
}, {
|
||||
**tokenizer_kwds, **normalized_tokenizer_kwds
|
||||
},
|
||||
_tag,
|
||||
_adapters_mapping,
|
||||
_model_version,
|
||||
_serialisation,
|
||||
_local,
|
||||
_prompt_template,
|
||||
_system_message)
|
||||
self.__attrs_init__(llm_config, quantization_config, _quantize, model_id, args, {
|
||||
**model_kwds,
|
||||
**normalized_model_kwds
|
||||
}, {
|
||||
**tokenizer_kwds,
|
||||
**normalized_tokenizer_kwds
|
||||
}, _tag, _adapters_mapping, _model_version, _serialisation, _local, _prompt_template, _system_message)
|
||||
|
||||
self.llm_post_init()
|
||||
|
||||
@@ -1306,10 +1274,11 @@ def llm_runnable_class(self: LLM[M, T], embeddings_sig: ModelSignature, generate
|
||||
pre = now
|
||||
yield ' '.join(output_text[pre:]) + ' '
|
||||
|
||||
return types.new_class(self.__class__.__name__ + 'Runnable', (_Runnable,), {},
|
||||
lambda ns: ns.update({
|
||||
'SUPPORTED_RESOURCES': ('nvidia.com/gpu', 'amd.com/gpu', 'cpu'), '__module__': self.__module__, '__doc__': self.config['env'].start_docstring
|
||||
}))
|
||||
return types.new_class(self.__class__.__name__ + 'Runnable', (_Runnable,), {}, lambda ns: ns.update({
|
||||
'SUPPORTED_RESOURCES': ('nvidia.com/gpu', 'amd.com/gpu', 'cpu'),
|
||||
'__module__': self.__module__,
|
||||
'__doc__': self.config['env'].start_docstring
|
||||
}))
|
||||
|
||||
def llm_runner_class(self: LLM[M, T]) -> type[LLMRunner[M, T]]:
|
||||
def available_adapters(_: LLMRunner[M, T]) -> PeftAdapterOutput:
|
||||
|
||||
@@ -120,8 +120,13 @@ async def completion_v1(input_dict: dict[str, t.Any], ctx: bentoml.Context) -> s
|
||||
@svc.api(route='/v1/chat/completions',
|
||||
input=bentoml.io.JSON.from_sample(
|
||||
openllm.utils.bentoml_cattr.unstructure(
|
||||
openllm.openai.ChatCompletionRequest(messages=[{'role': 'system', 'content': 'You are a helpful assistant.'}, {'role': 'user', 'content': 'Hello!'}],
|
||||
model=runner.llm_type))),
|
||||
openllm.openai.ChatCompletionRequest(messages=[{
|
||||
'role': 'system',
|
||||
'content': 'You are a helpful assistant.'
|
||||
}, {
|
||||
'role': 'user',
|
||||
'content': 'Hello!'
|
||||
}], model=runner.llm_type))),
|
||||
output=bentoml.io.Text())
|
||||
async def chat_completion_v1(input_dict: dict[str, t.Any], ctx: bentoml.Context) -> str | t.AsyncGenerator[str, None]:
|
||||
prompt = openllm.openai.messages_to_prompt(input_dict['messages'])
|
||||
@@ -194,32 +199,10 @@ def metadata_v1(_: str) -> openllm.MetadataOutput:
|
||||
input=bentoml.io.JSON.from_sample(['Hey Jude, welcome to the jungle!', 'What is the meaning of life?']),
|
||||
output=bentoml.io.JSON.from_sample({
|
||||
'embeddings': [
|
||||
0.007917795330286026,
|
||||
-0.014421648345887661,
|
||||
0.00481307040899992,
|
||||
0.007331526838243008,
|
||||
-0.0066398633643984795,
|
||||
0.00945580005645752,
|
||||
0.0087016262114048,
|
||||
-0.010709521360695362,
|
||||
0.012635177001357079,
|
||||
0.010541186667978764,
|
||||
-0.00730888033285737,
|
||||
-0.001783102168701589,
|
||||
0.02339819073677063,
|
||||
-0.010825827717781067,
|
||||
-0.015888236463069916,
|
||||
0.01876218430697918,
|
||||
0.0076906150206923485,
|
||||
0.0009032754460349679,
|
||||
-0.010024012066423893,
|
||||
0.01090280432254076,
|
||||
-0.008668390102684498,
|
||||
0.02070549875497818,
|
||||
0.0014594447566196322,
|
||||
-0.018775740638375282,
|
||||
-0.014814382418990135,
|
||||
0.01796768605709076
|
||||
0.007917795330286026, -0.014421648345887661, 0.00481307040899992, 0.007331526838243008, -0.0066398633643984795, 0.00945580005645752, 0.0087016262114048, -0.010709521360695362,
|
||||
0.012635177001357079, 0.010541186667978764, -0.00730888033285737, -0.001783102168701589, 0.02339819073677063, -0.010825827717781067, -0.015888236463069916,
|
||||
0.01876218430697918, 0.0076906150206923485, 0.0009032754460349679, -0.010024012066423893, 0.01090280432254076, -0.008668390102684498, 0.02070549875497818,
|
||||
0.0014594447566196322, -0.018775740638375282, -0.014814382418990135, 0.01796768605709076
|
||||
],
|
||||
'num_tokens': 20
|
||||
}))
|
||||
|
||||
@@ -87,15 +87,7 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any], llm_fs: FS, extra_d
|
||||
elif backend_envvar == 'tf':
|
||||
if not openllm_core.utils.is_tf_available():
|
||||
raise ValueError(f"TensorFlow is not available, while {env.backend} is set to 'tf'")
|
||||
candidates = ('tensorflow',
|
||||
'tensorflow-cpu',
|
||||
'tensorflow-gpu',
|
||||
'tf-nightly',
|
||||
'tf-nightly-cpu',
|
||||
'tf-nightly-gpu',
|
||||
'intel-tensorflow',
|
||||
'intel-tensorflow-avx512',
|
||||
'tensorflow-rocm',
|
||||
candidates = ('tensorflow', 'tensorflow-cpu', 'tensorflow-gpu', 'tf-nightly', 'tf-nightly-cpu', 'tf-nightly-gpu', 'intel-tensorflow', 'intel-tensorflow-avx512', 'tensorflow-rocm',
|
||||
'tensorflow-macos',
|
||||
)
|
||||
# For the metadata, we have to look for both tensorflow and tensorflow-cpu
|
||||
@@ -123,14 +115,8 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any], llm_fs: FS, extra_d
|
||||
lock_packages=False,
|
||||
extra_index_url=['https://download.pytorch.org/whl/cu118', 'https://huggingface.github.io/autogptq-index/whl/cu118/'])
|
||||
|
||||
def construct_docker_options(llm: openllm.LLM[t.Any, t.Any],
|
||||
_: FS,
|
||||
workers_per_resource: float,
|
||||
quantize: LiteralString | None,
|
||||
adapter_map: dict[str, str | None] | None,
|
||||
dockerfile_template: str | None,
|
||||
serialisation: LiteralSerialisation,
|
||||
container_registry: LiteralContainerRegistry,
|
||||
def construct_docker_options(llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_resource: float, quantize: LiteralString | None, adapter_map: dict[str, str | None] | None,
|
||||
dockerfile_template: str | None, serialisation: LiteralSerialisation, container_registry: LiteralContainerRegistry,
|
||||
container_version_strategy: LiteralContainerVersionStrategy) -> DockerOptions:
|
||||
from openllm.cli._factory import parse_config_options
|
||||
environ = parse_config_options(llm.config, llm.config['timeout'], workers_per_resource, None, True, os.environ.copy())
|
||||
@@ -217,7 +203,11 @@ def create_bento(bento_tag: bentoml.Tag,
|
||||
_serialisation: LiteralSerialisation = openllm_core.utils.first_not_none(serialisation, default=llm.config['serialisation'])
|
||||
labels = dict(llm.identifying_params)
|
||||
labels.update({
|
||||
'_type': llm.llm_type, '_framework': llm.config['env']['backend_value'], 'start_name': llm.config['start_name'], 'base_name_or_path': llm.model_id, 'bundler': 'openllm.bundle'
|
||||
'_type': llm.llm_type,
|
||||
'_framework': llm.config['env']['backend_value'],
|
||||
'start_name': llm.config['start_name'],
|
||||
'base_name_or_path': llm.model_id,
|
||||
'bundler': 'openllm.bundle'
|
||||
})
|
||||
if adapter_map: labels.update(adapter_map)
|
||||
if isinstance(workers_per_resource, str):
|
||||
@@ -244,14 +234,7 @@ def create_bento(bento_tag: bentoml.Tag,
|
||||
exclude=['/venv', '/.venv', '__pycache__/', '*.py[cod]', '*$py.class'],
|
||||
python=construct_python_options(llm, llm_fs, extra_dependencies, adapter_map),
|
||||
models=[llm_spec],
|
||||
docker=construct_docker_options(llm,
|
||||
llm_fs,
|
||||
workers_per_resource,
|
||||
quantize,
|
||||
adapter_map,
|
||||
dockerfile_template,
|
||||
_serialisation,
|
||||
container_registry,
|
||||
docker=construct_docker_options(llm, llm_fs, workers_per_resource, quantize, adapter_map, dockerfile_template, _serialisation, container_registry,
|
||||
container_version_strategy))
|
||||
|
||||
bento = bentoml.Bento.create(build_config=build_config, version=bento_tag.version, build_ctx=llm_fs.getsyspath('/'))
|
||||
|
||||
@@ -54,9 +54,7 @@ def parse_config_options(config: LLMConfig, server_timeout: int, workers_per_res
|
||||
# TODO: Support amd.com/gpu on k8s
|
||||
_bentoml_config_options_env = environ.pop('BENTOML_CONFIG_OPTIONS', '')
|
||||
_bentoml_config_options_opts = [
|
||||
'tracing.sample_rate=1.0',
|
||||
f'api_server.traffic.timeout={server_timeout}',
|
||||
f'runners."llm-{config["start_name"]}-runner".traffic.timeout={config["timeout"]}',
|
||||
'tracing.sample_rate=1.0', f'api_server.traffic.timeout={server_timeout}', f'runners."llm-{config["start_name"]}-runner".traffic.timeout={config["timeout"]}',
|
||||
f'runners."llm-{config["start_name"]}-runner".workers_per_resource={workers_per_resource}'
|
||||
]
|
||||
if device:
|
||||
@@ -118,22 +116,9 @@ Available official model_id(s): [default: {llm_config['default_id']}]
|
||||
@group.command(**command_attrs)
|
||||
@start_decorator(llm_config, serve_grpc=_serve_grpc)
|
||||
@click.pass_context
|
||||
def start_cmd(ctx: click.Context,
|
||||
/,
|
||||
server_timeout: int,
|
||||
model_id: str | None,
|
||||
model_version: str | None,
|
||||
system_message: str | None,
|
||||
prompt_template_file: t.IO[t.Any] | None,
|
||||
workers_per_resource: t.Literal['conserved', 'round_robin'] | LiteralString,
|
||||
device: t.Tuple[str, ...],
|
||||
quantize: LiteralQuantise | None,
|
||||
backend: LiteralBackend,
|
||||
serialisation: LiteralSerialisation | None,
|
||||
cors: bool,
|
||||
adapter_id: str | None,
|
||||
return_process: bool,
|
||||
**attrs: t.Any,
|
||||
def start_cmd(ctx: click.Context, /, server_timeout: int, model_id: str | None, model_version: str | None, system_message: str | None, prompt_template_file: t.IO[t.Any] | None,
|
||||
workers_per_resource: t.Literal['conserved', 'round_robin'] | LiteralString, device: t.Tuple[str, ...], quantize: LiteralQuantise | None, backend: LiteralBackend,
|
||||
serialisation: LiteralSerialisation | None, cors: bool, adapter_id: str | None, return_process: bool, **attrs: t.Any,
|
||||
) -> LLMConfig | subprocess.Popen[bytes]:
|
||||
_serialisation = openllm_core.utils.first_not_none(serialisation, default=llm_config['serialisation'])
|
||||
if _serialisation == 'safetensors' and quantize is not None and openllm_core.utils.check_bool_env('OPENLLM_SERIALIZATION_WARNING'):
|
||||
@@ -235,16 +220,10 @@ Available official model_id(s): [default: {llm_config['default_id']}]
|
||||
def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callable[[FC], t.Callable[[FC], FC]]:
|
||||
def wrapper(fn: FC) -> t.Callable[[FC], FC]:
|
||||
composed = openllm.utils.compose(
|
||||
llm_config.to_click_options,
|
||||
_http_server_args if not serve_grpc else _grpc_server_args,
|
||||
cog.optgroup.group('General LLM Options', help=f"The following options are related to running '{llm_config['start_name']}' LLM Server."),
|
||||
model_id_option(factory=cog.optgroup),
|
||||
model_version_option(factory=cog.optgroup),
|
||||
system_message_option(factory=cog.optgroup),
|
||||
prompt_template_file_option(factory=cog.optgroup),
|
||||
cog.optgroup.option('--server-timeout', type=int, default=None, help='Server timeout in seconds'),
|
||||
workers_per_resource_option(factory=cog.optgroup),
|
||||
cors_option(factory=cog.optgroup),
|
||||
llm_config.to_click_options, _http_server_args if not serve_grpc else _grpc_server_args,
|
||||
cog.optgroup.group('General LLM Options', help=f"The following options are related to running '{llm_config['start_name']}' LLM Server."), model_id_option(factory=cog.optgroup),
|
||||
model_version_option(factory=cog.optgroup), system_message_option(factory=cog.optgroup), prompt_template_file_option(factory=cog.optgroup),
|
||||
cog.optgroup.option('--server-timeout', type=int, default=None, help='Server timeout in seconds'), workers_per_resource_option(factory=cog.optgroup), cors_option(factory=cog.optgroup),
|
||||
backend_option(factory=cog.optgroup),
|
||||
cog.optgroup.group('LLM Optimization Options',
|
||||
help='''Optimization related options.
|
||||
@@ -255,9 +234,7 @@ def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callab
|
||||
|
||||
- DeepSpeed Inference: [link](https://www.deepspeed.ai/inference/)
|
||||
- GGML: Fast inference on [bare metal](https://github.com/ggerganov/ggml)
|
||||
'''),
|
||||
quantize_option(factory=cog.optgroup),
|
||||
serialisation_option(factory=cog.optgroup),
|
||||
'''), quantize_option(factory=cog.optgroup), serialisation_option(factory=cog.optgroup),
|
||||
cog.optgroup.option('--device',
|
||||
type=openllm.utils.dantic.CUDA,
|
||||
multiple=True,
|
||||
@@ -286,8 +263,8 @@ def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callab
|
||||
help='Optional name or path for given LoRA adapter' + f" to wrap '{llm_config['model_name']}'",
|
||||
multiple=True,
|
||||
callback=_id_callback,
|
||||
metavar='[PATH | [remote/][adapter_name:]adapter_id][, ...]'),
|
||||
click.option('--return-process', is_flag=True, default=False, help='Internal use only.', hidden=True),
|
||||
metavar='[PATH | [remote/][adapter_name:]adapter_id][, ...]'), click.option('--return-process', is_flag=True, default=False, help='Internal use only.',
|
||||
hidden=True),
|
||||
)
|
||||
return composed(fn)
|
||||
|
||||
|
||||
@@ -22,7 +22,8 @@ def cli(ctx: click.Context, output: LiteralOutput) -> None:
|
||||
'tag': str(b.tag),
|
||||
'size': human_readable_size(openllm.utils.calc_dir_size(b.path)),
|
||||
'models': [{
|
||||
'tag': str(m.tag), 'size': human_readable_size(openllm.utils.calc_dir_size(m.path))
|
||||
'tag': str(m.tag),
|
||||
'size': human_readable_size(openllm.utils.calc_dir_size(m.path))
|
||||
} for m in (bentoml.models.get(_.tag) for _ in b.info.models)]
|
||||
} for b in tuple(i for i in bentoml.list() if all(
|
||||
k in i.info.labels for k in {'start_name', 'bundler'})) if b.info.labels['start_name'] == k] for k in tuple(inflection.dasherize(key) for key in openllm.CONFIG_MAPPING.keys())
|
||||
|
||||
@@ -57,7 +57,9 @@ __lazy = LazyModule(__name__,
|
||||
os.path.abspath('__file__'),
|
||||
_import_structure,
|
||||
extra_objects={
|
||||
'CONFIG_MAPPING': CONFIG_MAPPING, 'CONFIG_MAPPING_NAMES': CONFIG_MAPPING_NAMES, 'AutoConfig': AutoConfig,
|
||||
'CONFIG_MAPPING': CONFIG_MAPPING,
|
||||
'CONFIG_MAPPING_NAMES': CONFIG_MAPPING_NAMES,
|
||||
'AutoConfig': AutoConfig,
|
||||
})
|
||||
__all__ = __lazy.__all__
|
||||
__dir__ = __lazy.__dir__
|
||||
|
||||
@@ -160,10 +160,9 @@ class _LazyAutoMapping(OrderedDict, ReprMixin):
|
||||
[self._load_attr_from_module(key, name) for key, name in self._model_mapping.items() if key in self._config_mapping.keys()] + list(self._extra_content.values()))
|
||||
|
||||
def items(self) -> ConfigModelItemsView:
|
||||
return t.cast('ConfigModelItemsView',
|
||||
[(self._load_attr_from_module(key, self._config_mapping[key]), self._load_attr_from_module(key, self._model_mapping[key]))
|
||||
for key in self._model_mapping.keys()
|
||||
if key in self._config_mapping.keys()] + list(self._extra_content.items()))
|
||||
return t.cast('ConfigModelItemsView', [(self._load_attr_from_module(key, self._config_mapping[key]), self._load_attr_from_module(key, self._model_mapping[key]))
|
||||
for key in self._model_mapping.keys()
|
||||
if key in self._config_mapping.keys()] + list(self._extra_content.items()))
|
||||
|
||||
def __iter__(self) -> t.Iterator[type[openllm.LLMConfig]]:
|
||||
return iter(t.cast('SupportsIter[t.Iterator[type[openllm.LLMConfig]]]', self.keys()))
|
||||
|
||||
@@ -23,5 +23,7 @@ sys.modules[__name__] = LazyModule(__name__,
|
||||
globals()['__file__'],
|
||||
_import_structure,
|
||||
extra_objects={
|
||||
'DEFAULT_PROMPT_TEMPLATE': DEFAULT_PROMPT_TEMPLATE, 'START_CHATGLM_COMMAND_DOCSTRING': START_CHATGLM_COMMAND_DOCSTRING, 'ChatGLMConfig': ChatGLMConfig
|
||||
'DEFAULT_PROMPT_TEMPLATE': DEFAULT_PROMPT_TEMPLATE,
|
||||
'START_CHATGLM_COMMAND_DOCSTRING': START_CHATGLM_COMMAND_DOCSTRING,
|
||||
'ChatGLMConfig': ChatGLMConfig
|
||||
})
|
||||
|
||||
@@ -30,5 +30,7 @@ sys.modules[__name__] = LazyModule(__name__,
|
||||
globals()['__file__'],
|
||||
_import_structure,
|
||||
extra_objects={
|
||||
'DEFAULT_PROMPT_TEMPLATE': DEFAULT_PROMPT_TEMPLATE, 'START_DOLLY_V2_COMMAND_DOCSTRING': START_DOLLY_V2_COMMAND_DOCSTRING, 'DollyV2Config': DollyV2Config
|
||||
'DEFAULT_PROMPT_TEMPLATE': DEFAULT_PROMPT_TEMPLATE,
|
||||
'START_DOLLY_V2_COMMAND_DOCSTRING': START_DOLLY_V2_COMMAND_DOCSTRING,
|
||||
'DollyV2Config': DollyV2Config
|
||||
})
|
||||
|
||||
@@ -30,5 +30,7 @@ sys.modules[__name__] = LazyModule(__name__,
|
||||
globals()['__file__'],
|
||||
_import_structure,
|
||||
extra_objects={
|
||||
'DEFAULT_PROMPT_TEMPLATE': DEFAULT_PROMPT_TEMPLATE, 'START_FALCON_COMMAND_DOCSTRING': START_FALCON_COMMAND_DOCSTRING, 'FalconConfig': FalconConfig
|
||||
'DEFAULT_PROMPT_TEMPLATE': DEFAULT_PROMPT_TEMPLATE,
|
||||
'START_FALCON_COMMAND_DOCSTRING': START_FALCON_COMMAND_DOCSTRING,
|
||||
'FalconConfig': FalconConfig
|
||||
})
|
||||
|
||||
@@ -30,5 +30,7 @@ sys.modules[__name__] = LazyModule(__name__,
|
||||
globals()['__file__'],
|
||||
_import_structure,
|
||||
extra_objects={
|
||||
'DEFAULT_PROMPT_TEMPLATE': DEFAULT_PROMPT_TEMPLATE, 'START_GPT_NEOX_COMMAND_DOCSTRING': START_GPT_NEOX_COMMAND_DOCSTRING, 'GPTNeoXConfig': GPTNeoXConfig
|
||||
'DEFAULT_PROMPT_TEMPLATE': DEFAULT_PROMPT_TEMPLATE,
|
||||
'START_GPT_NEOX_COMMAND_DOCSTRING': START_GPT_NEOX_COMMAND_DOCSTRING,
|
||||
'GPTNeoXConfig': GPTNeoXConfig
|
||||
})
|
||||
|
||||
@@ -46,5 +46,7 @@ sys.modules[__name__] = LazyModule(__name__,
|
||||
globals()['__file__'],
|
||||
_import_structure,
|
||||
extra_objects={
|
||||
'DEFAULT_PROMPT_TEMPLATE': DEFAULT_PROMPT_TEMPLATE, 'START_OPT_COMMAND_DOCSTRING': START_OPT_COMMAND_DOCSTRING, 'OPTConfig': OPTConfig,
|
||||
'DEFAULT_PROMPT_TEMPLATE': DEFAULT_PROMPT_TEMPLATE,
|
||||
'START_OPT_COMMAND_DOCSTRING': START_OPT_COMMAND_DOCSTRING,
|
||||
'OPTConfig': OPTConfig,
|
||||
})
|
||||
|
||||
@@ -33,7 +33,11 @@ class FlaxOPT(openllm.LLM['transformers.TFOPTForCausalLM', 'transformers.GPT2Tok
|
||||
use_default_prompt_template: bool = False,
|
||||
**attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
|
||||
return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {
|
||||
'max_new_tokens': max_new_tokens, 'temperature': temperature, 'top_k': top_k, 'num_return_sequences': num_return_sequences, 'repetition_penalty': repetition_penalty
|
||||
'max_new_tokens': max_new_tokens,
|
||||
'temperature': temperature,
|
||||
'top_k': top_k,
|
||||
'num_return_sequences': num_return_sequences,
|
||||
'repetition_penalty': repetition_penalty
|
||||
}, {}
|
||||
|
||||
def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
|
||||
|
||||
@@ -19,5 +19,8 @@ class VLLMOPT(openllm.LLM['vllm.LLMEngine', 'transformers.GPT2Tokenizer']):
|
||||
use_default_prompt_template: bool = True,
|
||||
**attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
|
||||
return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {
|
||||
'max_new_tokens': max_new_tokens, 'temperature': temperature, 'top_k': top_k, 'num_return_sequences': num_return_sequences
|
||||
'max_new_tokens': max_new_tokens,
|
||||
'temperature': temperature,
|
||||
'top_k': top_k,
|
||||
'num_return_sequences': num_return_sequences
|
||||
}, {}
|
||||
|
||||
@@ -34,11 +34,13 @@ def test_missing_default():
|
||||
make_llm_config('MissingArchitecture', {'default_id': 'huggingface/t5-tiny-testing', 'model_ids': ['huggingface/t5-tiny-testing'], 'requirements': ['bentoml'],},)
|
||||
|
||||
def test_forbidden_access():
|
||||
cl_ = make_llm_config(
|
||||
'ForbiddenAccess', {
|
||||
'default_id': 'huggingface/t5-tiny-testing', 'model_ids': ['huggingface/t5-tiny-testing', 'bentoml/t5-tiny-testing'], 'architecture': 'PreTrainedModel', 'requirements': ['bentoml'],
|
||||
},
|
||||
)
|
||||
cl_ = make_llm_config('ForbiddenAccess', {
|
||||
'default_id': 'huggingface/t5-tiny-testing',
|
||||
'model_ids': ['huggingface/t5-tiny-testing', 'bentoml/t5-tiny-testing'],
|
||||
'architecture': 'PreTrainedModel',
|
||||
'requirements': ['bentoml'],
|
||||
},
|
||||
)
|
||||
|
||||
assert pytest.raises(openllm.exceptions.ForbiddenAttributeError, cl_.__getattribute__, cl_(), '__config__',)
|
||||
assert pytest.raises(openllm.exceptions.ForbiddenAttributeError, cl_.__getattribute__, cl_(), 'GenerationConfig',)
|
||||
@@ -128,7 +130,9 @@ def test_struct_envvar_with_overwrite_provided_env(monkeypatch: pytest.MonkeyPat
|
||||
mk.setenv(field_env_key('field1'), str(4.0))
|
||||
mk.setenv(field_env_key('temperature', suffix='generation'), str(0.2))
|
||||
sent = make_llm_config('OverwriteWithEnvAvailable', {
|
||||
'default_id': 'asdfasdf', 'model_ids': ['asdf', 'asdfasdfads'], 'architecture': 'PreTrainedModel'
|
||||
'default_id': 'asdfasdf',
|
||||
'model_ids': ['asdf', 'asdfasdfads'],
|
||||
'architecture': 'PreTrainedModel'
|
||||
},
|
||||
fields=(('field1', 'float', 3.0),),
|
||||
).model_construct_env(field1=20.0, temperature=0.4)
|
||||
|
||||
@@ -196,7 +196,8 @@ def _container_handle(model: str, model_id: str, image_tag: str, deployment_mode
|
||||
detach=True,
|
||||
device_requests=devs,
|
||||
ports={
|
||||
'3000/tcp': port, '3001/tcp': prom_port
|
||||
'3000/tcp': port,
|
||||
'3001/tcp': prom_port
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user