Mirror of https://github.com/bentoml/OpenLLM.git (synced 2026-02-18 22:55:08 -05:00)
fix(sdk): remove broken sdk
Codebase is now around 2.8k lines.

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
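This commit also deprecates `openllm.Runner` in favour of constructing `openllm.LLM` directly and using its `.runner` (see the deprecation warning in the diff below). A minimal sketch of the replacement pattern, assuming a hypothetical model id (`'facebook/opt-1.3b'`) and service name; the exact arguments a real deployment needs may differ:

```python
import bentoml, openllm

# Hypothetical model id, used purely for illustration.
llm = openllm.LLM(model_id='facebook/opt-1.3b')
svc = bentoml.Service(name='llm-service', runners=[llm.runner])

@svc.api(input=bentoml.io.Text(), output=bentoml.io.Text())
async def chat(input: str) -> str:
  pieces = []
  # Collect streamed chunks, mirroring how openllm.LLM.generate gathers iterator output.
  async for it in llm.generate_iterator(input):
    pieces.append(it.outputs[0].text)
  return ''.join(pieces)
```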
@@ -1,7 +1,6 @@
import logging as _logging, os as _os, pathlib as _pathlib, warnings as _warnings
from openllm_cli import _sdk
from . import utils as utils

if utils.DEBUG:
utils.set_debug_mode(True); _logging.basicConfig(level=_logging.NOTSET)
else:
@@ -12,11 +11,8 @@ else:
_warnings.filterwarnings('ignore', message='MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization')
_warnings.filterwarnings('ignore', message='The installed version of bitsandbytes was compiled without GPU support.')
_warnings.filterwarnings('ignore', message='Neither GITHUB_TOKEN nor GITHUB_JWT_TOKEN found: running as unauthenticated')

COMPILED = _pathlib.Path(__file__).suffix in ('.pyd', '.so')

# NOTE: update this to sys.modules[__name__] once mypy_extensions can recognize __spec__
__lazy = utils.LazyModule(
__lazy = utils.LazyModule( # NOTE: update this to sys.modules[__name__] once mypy_extensions can recognize __spec__
__name__,
globals()['__file__'],
{
@@ -34,14 +30,8 @@ __lazy = utils.LazyModule(
'_llm': ['LLM'],
},
extra_objects={
'COMPILED': COMPILED,
'start': _sdk.start,
'start_grpc': _sdk.start_grpc,
'build': _sdk.build,
'import_model': _sdk.import_model,
'list_models': _sdk.list_models,
'COMPILED': COMPILED, 'start': _sdk.start, 'build': _sdk.build, #
'import_model': _sdk.import_model, 'list_models': _sdk.list_models, #
},
)
__all__ = __lazy.__all__
__dir__ = __lazy.__dir__
__getattr__ = __lazy.__getattr__
__all__, __dir__, __getattr__ = __lazy.__all__, __lazy.__dir__, __lazy.__getattr__

@@ -1,65 +1,21 @@
from __future__ import annotations
import logging
import os
import typing as t
import warnings

import logging, os, warnings, typing as t
import openllm
from openllm_core._typing_compat import LiteralBackend, ParamSpec
from openllm_core._typing_compat import LiteralBackend
from openllm_core.utils import first_not_none, getenv, is_vllm_available

if t.TYPE_CHECKING:
from ._runners import Runner as _Runner

P = ParamSpec('P')

__all__ = ['Runner']
logger = logging.getLogger(__name__)


def Runner(
model_name: str,
ensure_available: bool = True,
init_local: bool = False,
backend: LiteralBackend | None = None,
llm_config: openllm.LLMConfig | None = None,
**attrs: t.Any,
) -> _Runner[t.Any, t.Any]:
"""Create a Runner for given LLM. For a list of currently supported LLM, check out 'openllm models'.

> [!WARNING]
> This method is now deprecated and in favor of 'openllm.LLM'

```python
runner = openllm.Runner("dolly-v2")

@svc.on_startup
def download():
runner.download_model()
```

if `init_local=True` (For development workflow), it will also enable `ensure_available`.
Default value of `ensure_available` is None. If set then use that given value, otherwise fallback to the aforementioned behaviour.

Args:
model_name: Supported model name from 'openllm models'
ensure_available: If True, it will download the model if it is not available. If False, it will skip downloading the model.
If False, make sure the model is available locally. Default to True, and openllm.LLM will always check if models
are available locally. based on generated tag.
backend: The given Runner implementation one choose for this Runner. If `OPENLLM_BACKEND` is set, it will respect it.
llm_config: Optional ``openllm.LLMConfig`` to initialise this ``openllm.LLMRunner``.
init_local: If True, it will initialize the model locally. This is useful if you want to run the model locally. (Symmetrical to bentoml.Runner.init_local())
**attrs: The rest of kwargs will then be passed to the LLM. Refer to the LLM documentation for the kwargs behaviour
"""
from ._llm import LLM

if llm_config is None:
llm_config = openllm.AutoConfig.for_model(model_name)
if not ensure_available:
logger.warning(
"'ensure_available=False' won't have any effect as LLM will always check to download the model on initialisation."
)
model_name: str, ensure_available: bool = True, #
init_local: bool = False, backend: LiteralBackend | None = None, #
llm_config: openllm.LLMConfig | None = None, **attrs: t.Any,
):
if llm_config is None: llm_config = openllm.AutoConfig.for_model(model_name)
if not ensure_available: logger.warning("'ensure_available=False' won't have any effect as LLM will always check to download the model on initialisation.")
model_id = attrs.get('model_id', os.getenv('OPENLLM_MODEL_ID', llm_config['default_id']))
_RUNNER_MSG = f'''\
warnings.warn(f'''\
Using 'openllm.Runner' is now deprecated. Make sure to switch to the following syntax:

```python
@@ -70,22 +26,11 @@ def Runner(
@svc.api(...)
async def chat(input: str) -> str:
async for it in llm.generate_iterator(input): print(it)
```
'''
warnings.warn(_RUNNER_MSG, DeprecationWarning, stacklevel=2)
```''', DeprecationWarning, stacklevel=2)
attrs.update(
{
'model_id': model_id,
'quantize': getenv('QUANTIZE', var=['QUANTISE'], default=attrs.get('quantize', None)),
'serialisation': getenv(
'serialization', default=attrs.get('serialisation', llm_config['serialisation']), var=['SERIALISATION']
),
'model_id': model_id, 'quantize': getenv('QUANTIZE', var=['QUANTISE'], default=attrs.get('quantize', None)), #
'serialisation': getenv('serialization', default=attrs.get('serialisation', llm_config['serialisation']), var=['SERIALISATION']),
}
)

backend = t.cast(LiteralBackend, first_not_none(backend, default='vllm' if is_vllm_available() else 'pt'))
llm = LLM[t.Any, t.Any](backend=backend, llm_config=llm_config, embedded=init_local, **attrs)
return llm.runner


__all__ = ['Runner']
return openllm.LLM(backend=first_not_none(backend, default='vllm' if is_vllm_available() else 'pt'), llm_config=llm_config, embedded=init_local, **attrs).runner

@@ -47,23 +47,17 @@ ResolvedAdapterMap = t.Dict[AdapterType, t.Dict[str, t.Tuple['PeftConfig', str]]
|
||||
|
||||
@attr.define(slots=False, repr=False, init=False)
|
||||
class LLM(t.Generic[M, T]):
|
||||
async def generate(
|
||||
self, prompt, prompt_token_ids=None, stop=None, stop_token_ids=None, request_id=None, adapter_name=None, **attrs
|
||||
) -> GenerationOutput:
|
||||
if adapter_name is not None and self.__llm_backend__ != 'pt':
|
||||
raise NotImplementedError(f'Adapter is not supported with {self.__llm_backend__}.')
|
||||
async def generate(self, prompt, prompt_token_ids=None, stop=None, stop_token_ids=None, request_id=None, adapter_name=None, **attrs):
|
||||
if adapter_name is not None and self.__llm_backend__ != 'pt': raise NotImplementedError(f'Adapter is not supported with {self.__llm_backend__}.')
|
||||
config = self.config.model_construct_env(**attrs)
|
||||
texts, token_ids = [[]] * config['n'], [[]] * config['n']
|
||||
final_result = None
|
||||
async for result in self.generate_iterator(
|
||||
prompt, prompt_token_ids, stop, stop_token_ids, request_id, adapter_name, **config.model_dump(flatten=True)
|
||||
):
|
||||
for output in result.outputs:
|
||||
texts[output.index].append(output.text)
|
||||
token_ids[output.index].extend(output.token_ids)
|
||||
final_result = result
|
||||
if final_result is None:
|
||||
raise RuntimeError('No result is returned.')
|
||||
if (final_result := result) is None: raise RuntimeError('No result is returned.')
|
||||
return final_result.with_options(
|
||||
prompt=prompt,
|
||||
outputs=[
|
||||
@@ -72,13 +66,9 @@ class LLM(t.Generic[M, T]):
|
||||
],
|
||||
)
|
||||
|
||||
async def generate_iterator(
|
||||
self, prompt, prompt_token_ids=None, stop=None, stop_token_ids=None, request_id=None, adapter_name=None, **attrs
|
||||
) -> t.AsyncGenerator[GenerationOutput, None]:
|
||||
async def generate_iterator(self, prompt, prompt_token_ids=None, stop=None, stop_token_ids=None, request_id=None, adapter_name=None, **attrs):
|
||||
from bentoml._internal.runner.runner_handle import DummyRunnerHandle
|
||||
|
||||
if adapter_name is not None and self.__llm_backend__ != 'pt':
|
||||
raise NotImplementedError(f'Adapter is not supported with {self.__llm_backend__}.')
|
||||
if adapter_name is not None and self.__llm_backend__ != 'pt': raise NotImplementedError(f'Adapter is not supported with {self.__llm_backend__}.')
|
||||
|
||||
if isinstance(self.runner._runner_handle, DummyRunnerHandle):
|
||||
if os.getenv('BENTO_PATH') is not None:
|
||||
@@ -87,14 +77,12 @@ class LLM(t.Generic[M, T]):
|
||||
self.runner.init_local(quiet=True)
|
||||
config = self.config.model_construct_env(**attrs)
|
||||
|
||||
if stop_token_ids is None: stop_token_ids = []
|
||||
stop_token_ids = stop_token_ids or []
|
||||
eos_token_id = attrs.get('eos_token_id', config['eos_token_id'])
|
||||
if eos_token_id is not None:
|
||||
if not isinstance(eos_token_id, list): eos_token_id = [eos_token_id]
|
||||
stop_token_ids.extend(eos_token_id)
|
||||
if config['eos_token_id'] and config['eos_token_id'] not in stop_token_ids: stop_token_ids.append(config['eos_token_id'])
|
||||
if self.tokenizer.eos_token_id not in stop_token_ids:
|
||||
stop_token_ids.append(self.tokenizer.eos_token_id)
|
||||
if eos_token_id and not isinstance(eos_token_id, list): eos_token_id = [eos_token_id]
|
||||
stop_token_ids.extend(eos_token_id or [])
|
||||
if (config_eos := config['eos_token_id']) and config_eos not in stop_token_ids: stop_token_ids.append(config_eos)
|
||||
if self.tokenizer.eos_token_id not in stop_token_ids: stop_token_ids.append(self.tokenizer.eos_token_id)
|
||||
if stop is None:
|
||||
stop = set()
|
||||
elif isinstance(stop, str):
|
||||
@@ -102,20 +90,16 @@ class LLM(t.Generic[M, T]):
|
||||
else:
|
||||
stop = set(stop)
|
||||
for tid in stop_token_ids:
|
||||
if tid:
|
||||
stop.add(self.tokenizer.decode(tid))
|
||||
if tid: stop.add(self.tokenizer.decode(tid))
|
||||
|
||||
if prompt_token_ids is None:
|
||||
if prompt is None:
|
||||
raise ValueError('Either prompt or prompt_token_ids must be specified.')
|
||||
if prompt is None: raise ValueError('Either prompt or prompt_token_ids must be specified.')
|
||||
prompt_token_ids = self.tokenizer.encode(prompt)
|
||||
|
||||
request_id = gen_random_uuid() if request_id is None else request_id
|
||||
previous_texts, previous_num_tokens = [''] * config['n'], [0] * config['n']
|
||||
try:
|
||||
generator = self.runner.generate_iterator.async_stream(
|
||||
prompt_token_ids, request_id, stop=list(stop), adapter_name=adapter_name, **config.model_dump(flatten=True)
|
||||
)
|
||||
generator = self.runner.generate_iterator.async_stream(prompt_token_ids, request_id, stop=list(stop), adapter_name=adapter_name, **config.model_dump(flatten=True))
|
||||
except Exception as err:
|
||||
raise RuntimeError(f'Failed to start generation task: {err}') from err
|
||||
|
||||
@@ -134,18 +118,11 @@ class LLM(t.Generic[M, T]):
|
||||
|
||||
# NOTE: If you are here to see how generate_iterator and generate works, see above.
|
||||
# The below are mainly for internal implementation that you don't have to worry about.
|
||||
_model_id: str
|
||||
_revision: t.Optional[str]
|
||||
_model_id: str; _revision: t.Optional[str] #
|
||||
_quantization_config: t.Optional[t.Union[transformers.BitsAndBytesConfig, transformers.GPTQConfig, transformers.AwqConfig]]
|
||||
_quantise: t.Optional[LiteralQuantise]
|
||||
_model_decls: t.Tuple[t.Any, ...]
|
||||
__model_attrs: t.Dict[str, t.Any]
|
||||
__tokenizer_attrs: t.Dict[str, t.Any]
|
||||
_tag: bentoml.Tag
|
||||
_adapter_map: t.Optional[AdapterMap]
|
||||
_serialisation: LiteralSerialisation
|
||||
_local: bool
|
||||
_max_model_len: t.Optional[int]
|
||||
_quantise: t.Optional[LiteralQuantise]; _model_decls: t.Tuple[t.Any, ...]; __model_attrs: t.Dict[str, t.Any] #
|
||||
__tokenizer_attrs: t.Dict[str, t.Any]; _tag: bentoml.Tag; _adapter_map: t.Optional[AdapterMap] #
|
||||
_serialisation: LiteralSerialisation; _local: bool; _max_model_len: t.Optional[int] #
|
||||
|
||||
__llm_dtype__: t.Union[LiteralDtype, t.Literal['auto', 'half', 'float']] = 'auto'
|
||||
__llm_torch_dtype__: 'torch.dtype' = None
|
||||
@@ -180,12 +157,7 @@ class LLM(t.Generic[M, T]):
|
||||
):
|
||||
torch_dtype = attrs.pop('torch_dtype', None) # backward compatible
|
||||
if torch_dtype is not None:
|
||||
warnings.warn(
|
||||
'The argument "torch_dtype" is deprecated and will be removed in the future. Please use "dtype" instead.',
|
||||
DeprecationWarning,
|
||||
stacklevel=3,
|
||||
)
|
||||
dtype = torch_dtype
|
||||
warnings.warn('The argument "torch_dtype" is deprecated and will be removed in the future. Please use "dtype" instead.', DeprecationWarning, stacklevel=3); dtype = torch_dtype
|
||||
_local = False
|
||||
if validate_is_path(model_id): model_id, _local = resolve_filepath(model_id), True
|
||||
backend = getenv('backend', default=backend)
|
||||
@@ -291,7 +263,7 @@ class LLM(t.Generic[M, T]):
|
||||
if is_vllm_available():
|
||||
return 'vllm'
|
||||
elif is_ctranslate_available():
|
||||
return 'ctranslate' # XXX: base OpenLLM image should always include vLLM
|
||||
return 'ctranslate'
|
||||
elif is_ctranslate_available():
|
||||
return 'ctranslate'
|
||||
else:
|
||||
@@ -449,8 +421,7 @@ def convert_peft_config_type(adapter_map: dict[str, str]) -> AdapterMap:
|
||||
config_file = hf_hub_download(path_or_adapter_id, PEFT_CONFIG_NAME)
|
||||
except Exception as err:
|
||||
raise ValueError(f"Can't find '{PEFT_CONFIG_NAME}' at '{path_or_adapter_id}'") from err
|
||||
with open(config_file, 'r') as file:
|
||||
resolved_config = orjson.loads(file.read())
|
||||
with open(config_file, 'r') as file: resolved_config = orjson.loads(file.read())
|
||||
_peft_type = resolved_config['peft_type'].lower()
|
||||
if _peft_type not in resolved: resolved[_peft_type] = ()
|
||||
resolved[_peft_type] += (_AdapterTuple((path_or_adapter_id, name, resolved_config)),)
|
||||
|
||||
@@ -1,13 +1,8 @@
from __future__ import annotations

from openllm_core.exceptions import MissingDependencyError
from openllm_core.utils import is_autoawq_available, is_autogptq_available, is_bitsandbytes_available


def infer_quantisation_config(llm, quantise, **attrs):
import torch
import transformers

import torch, transformers
# 8 bit configuration
int8_threshold = attrs.pop('llm_int8_threshhold', 6.0)
int8_enable_fp32_cpu_offload = attrs.pop('llm_int8_enable_fp32_cpu_offload', False)
@@ -85,25 +80,19 @@ def infer_quantisation_config(llm, quantise, **attrs):

# NOTE: Quantization setup quantize is a openllm.LLM feature, where we can quantize the model with bitsandbytes or quantization aware training.
if not is_bitsandbytes_available():
raise RuntimeError(
'Quantization requires bitsandbytes to be installed. Make sure to install OpenLLM with \'pip install "openllm[fine-tune]"\''
)
raise RuntimeError('Quantization requires bitsandbytes to be installed. Make sure to install OpenLLM with \'pip install "openllm[fine-tune]"\'')
if quantise == 'int8':
quantisation_config = create_int8_config(int8_skip_modules)
elif quantise == 'int4':
quantisation_config = create_int4_config()
elif quantise == 'gptq':
if not is_autogptq_available():
raise MissingDependencyError(
"GPTQ requires 'auto-gptq' and 'optimum>=0.12' to be installed. Do it with 'pip install \"openllm[gptq]\"'"
)
raise MissingDependencyError("GPTQ requires 'auto-gptq' and 'optimum>=0.12' to be installed. Do it with 'pip install \"openllm[gptq]\"'")
else:
quantisation_config = create_gptq_config()
elif quantise == 'awq':
if not is_autoawq_available():
raise MissingDependencyError(
"AWQ requires 'auto-awq' to be installed. Do it with 'pip install \"openllm[awq]\"'."
)
raise MissingDependencyError("AWQ requires 'auto-awq' to be installed. Do it with 'pip install \"openllm[awq]\"'.")
else:
quantisation_config = create_awq_config()
else:

@@ -1,66 +1,43 @@
|
||||
from __future__ import annotations
|
||||
import logging
|
||||
import typing as t
|
||||
|
||||
import logging, typing as t
|
||||
import _service_vars as svars
|
||||
|
||||
import bentoml
|
||||
import openllm
|
||||
import bentoml, openllm
|
||||
from openllm_core._schemas import MessageParam
|
||||
from bentoml.io import JSON, Text
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
llm = openllm.LLM[t.Any, t.Any](
|
||||
model_id=svars.model_id,
|
||||
model_tag=svars.model_tag,
|
||||
serialisation=svars.serialization,
|
||||
adapter_map=svars.adapter_map,
|
||||
trust_remote_code=svars.trust_remote_code,
|
||||
model_id=svars.model_id, model_tag=svars.model_tag, adapter_map=svars.adapter_map, #
|
||||
serialisation=svars.serialization, trust_remote_code=svars.trust_remote_code,
|
||||
)
|
||||
svc = bentoml.Service(name=f"llm-{llm.config['start_name']}-service", runners=[llm.runner])
|
||||
|
||||
llm_model_class = openllm.GenerationInput.from_llm_config(llm.config)
|
||||
|
||||
|
||||
@svc.api(
|
||||
route='/v1/generate',
|
||||
input=JSON.from_sample(llm_model_class.examples()),
|
||||
output=JSON.from_sample(openllm.GenerationOutput.examples()),
|
||||
input=JSON.from_sample(llm_model_class.examples()), output=JSON.from_sample(openllm.GenerationOutput.examples()), #
|
||||
)
|
||||
async def generate_v1(input_dict: dict[str, t.Any]) -> dict[str, t.Any]:
|
||||
return (await llm.generate(**llm_model_class(**input_dict).model_dump())).model_dump()
|
||||
|
||||
async def generate_v1(input_dict: dict[str, t.Any]) -> dict[str, t.Any]: return (await llm.generate(**llm_model_class(**input_dict).model_dump())).model_dump()
|
||||
|
||||
@svc.api(
|
||||
route='/v1/generate_stream',
|
||||
input=JSON.from_sample(llm_model_class.examples()),
|
||||
output=Text(content_type='text/event-stream'),
|
||||
input=JSON.from_sample(llm_model_class.examples()), output=Text(content_type='text/event-stream'), #
|
||||
)
|
||||
async def generate_stream_v1(input_dict: dict[str, t.Any]) -> t.AsyncGenerator[str, None]:
|
||||
async for it in llm.generate_iterator(**llm_model_class(**input_dict).model_dump()):
|
||||
yield f'data: {it.model_dump_json()}\n\n'
|
||||
yield 'data: [DONE]\n\n'
|
||||
|
||||
|
||||
_Metadata = openllm.MetadataOutput(
|
||||
timeout=llm.config['timeout'],
|
||||
model_name=llm.config['model_name'],
|
||||
backend=llm.__llm_backend__,
|
||||
model_id=llm.model_id,
|
||||
timeout=llm.config['timeout'], model_name=llm.config['model_name'], #
|
||||
backend=llm.__llm_backend__, model_id=llm.model_id, #
|
||||
configuration=llm.config.model_dump_json().decode(),
|
||||
)
|
||||
|
||||
|
||||
@svc.api(route='/v1/metadata', input=Text(), output=JSON.from_sample(_Metadata.model_dump()))
|
||||
def metadata_v1(_: str) -> openllm.MetadataOutput:
|
||||
return _Metadata
|
||||
|
||||
|
||||
class MessagesConverterInput(t.TypedDict):
|
||||
add_generation_prompt: bool
|
||||
messages: t.List[t.Dict[str, t.Any]]
|
||||
def metadata_v1(_: str) -> openllm.MetadataOutput: return _Metadata
|
||||
|
||||
class MessagesConverterInput(t.TypedDict): add_generation_prompt: bool; messages: t.List[t.Dict[str, t.Any]]
|
||||
|
||||
@svc.api(
|
||||
route='/v1/helpers/messages',
|
||||
@@ -69,18 +46,14 @@ class MessagesConverterInput(t.TypedDict):
|
||||
add_generation_prompt=False,
|
||||
messages=[
|
||||
MessageParam(role='system', content='You are acting as Ernest Hemmingway.'),
|
||||
MessageParam(role='user', content='Hi there!'),
|
||||
MessageParam(role='assistant', content='Yes?'),
|
||||
MessageParam(role='user', content='Hi there!'), MessageParam(role='assistant', content='Yes?'), #
|
||||
],
|
||||
)
|
||||
),
|
||||
output=Text(),
|
||||
)
|
||||
def helpers_messages_v1(message: MessagesConverterInput) -> str:
|
||||
add_generation_prompt = message['add_generation_prompt']
|
||||
messages = message['messages']
|
||||
add_generation_prompt, messages = message['add_generation_prompt'], message['messages']
|
||||
return llm.tokenizer.apply_chat_template(messages, add_generation_prompt=add_generation_prompt, tokenize=False)
|
||||
|
||||
|
||||
# HACK: This must always be the last line in this file, as we will do some MK for OpenAPI schema.
|
||||
openllm.mount_entrypoints(svc, llm)
|
||||
openllm.mount_entrypoints(svc, llm) # HACK: This must always be the last line in this file, as we will do some MK for OpenAPI schema.
|
||||
|
||||
@@ -1,9 +1,2 @@
import os, orjson, openllm_core.utils as coreutils

model_id, model_tag, adapter_map, serialization, trust_remote_code = (
os.environ['OPENLLM_MODEL_ID'],
None,
orjson.loads(os.getenv('OPENLLM_ADAPTER_MAP', orjson.dumps(None))),
os.getenv('OPENLLM_SERIALIZATION', default='safetensors'),
coreutils.check_bool_env('TRUST_REMOTE_CODE', False),
)
model_id, model_tag, adapter_map, serialization, trust_remote_code = os.environ['OPENLLM_MODEL_ID'], None, orjson.loads(os.getenv('OPENLLM_ADAPTER_MAP', orjson.dumps(None))), os.getenv('OPENLLM_SERIALIZATION', default='safetensors'), coreutils.check_bool_env('TRUST_REMOTE_CODE', False)

@@ -4,44 +4,33 @@ import psutil, bentoml, openllm_core.utils as coreutils
|
||||
from bentoml._internal.resource import get_resource, system_resources
|
||||
from bentoml._internal.runner.strategy import THREAD_ENVS
|
||||
|
||||
__all__ = ['CascadingResourceStrategy', 'get_resource']
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _strtoul(s: str) -> int:
|
||||
# Return -1 or positive integer sequence string starts with.
|
||||
if not s:
|
||||
return -1
|
||||
if not s: return -1
|
||||
idx = 0
|
||||
for idx, c in enumerate(s):
|
||||
if not (c.isdigit() or (idx == 0 and c in '+-')):
|
||||
break
|
||||
if idx + 1 == len(s):
|
||||
idx += 1 # noqa: PLW2901
|
||||
if not (c.isdigit() or (idx == 0 and c in '+-')): break
|
||||
if idx + 1 == len(s): idx += 1 # noqa: PLW2901
|
||||
# NOTE: idx will be set via enumerate
|
||||
return int(s[:idx]) if idx > 0 else -1
|
||||
|
||||
|
||||
def _parse_list_with_prefix(lst: str, prefix: str) -> list[str]:
|
||||
rcs: list[str] = []
|
||||
rcs = []
|
||||
for elem in lst.split(','):
|
||||
# Repeated id results in empty set
|
||||
if elem in rcs:
|
||||
return []
|
||||
if elem in rcs: return []
|
||||
# Anything other but prefix is ignored
|
||||
if not elem.startswith(prefix):
|
||||
break
|
||||
if not elem.startswith(prefix): break
|
||||
rcs.append(elem)
|
||||
return rcs
|
||||
|
||||
|
||||
def _parse_cuda_visible_devices(default_var: str | None = None, respect_env: bool = True) -> list[str] | None:
|
||||
if respect_env:
|
||||
spec = os.environ.get('CUDA_VISIBLE_DEVICES', default_var)
|
||||
if not spec:
|
||||
return None
|
||||
if not spec: return None
|
||||
else:
|
||||
if default_var is None:
|
||||
raise ValueError('spec is required to be not None when parsing spec.')
|
||||
if default_var is None: raise ValueError('spec is required to be not None when parsing spec.')
|
||||
spec = default_var
|
||||
|
||||
if spec.startswith('GPU-'):
|
||||
@@ -55,64 +44,52 @@ def _parse_cuda_visible_devices(default_var: str | None = None, respect_env: boo
|
||||
for el in spec.split(','):
|
||||
x = _strtoul(el.strip())
|
||||
# Repeated ordinal results in empty set
|
||||
if x in rc:
|
||||
return []
|
||||
if x in rc: return []
|
||||
# Negative value aborts the sequence
|
||||
if x < 0:
|
||||
break
|
||||
if x < 0: break
|
||||
rc.append(x)
|
||||
return [str(i) for i in rc]
|
||||
|
||||
|
||||
def _raw_device_uuid_nvml() -> list[str] | None:
|
||||
from ctypes import CDLL, byref, c_int, c_void_p, create_string_buffer
|
||||
|
||||
try:
|
||||
nvml_h = CDLL('libnvidia-ml.so.1')
|
||||
except Exception:
|
||||
warnings.warn('Failed to find nvidia binding', stacklevel=3)
|
||||
return None
|
||||
warnings.warn('Failed to find nvidia binding', stacklevel=3); return None
|
||||
|
||||
rc = nvml_h.nvmlInit()
|
||||
if rc != 0:
|
||||
warnings.warn("Can't initialize NVML", stacklevel=3)
|
||||
return None
|
||||
warnings.warn("Can't initialize NVML", stacklevel=3); return None
|
||||
dev_count = c_int(-1)
|
||||
rc = nvml_h.nvmlDeviceGetCount_v2(byref(dev_count))
|
||||
if rc != 0:
|
||||
warnings.warn('Failed to get available device from system.', stacklevel=3)
|
||||
return None
|
||||
uuids: list[str] = []
|
||||
warnings.warn('Failed to get available device from system.', stacklevel=3); return None
|
||||
uuids = []
|
||||
for idx in range(dev_count.value):
|
||||
dev_id = c_void_p()
|
||||
rc = nvml_h.nvmlDeviceGetHandleByIndex_v2(idx, byref(dev_id))
|
||||
if rc != 0:
|
||||
warnings.warn(f'Failed to get device handle for {idx}', stacklevel=3)
|
||||
return None
|
||||
warnings.warn(f'Failed to get device handle for {idx}', stacklevel=3); return None
|
||||
buf_len = 96
|
||||
buf = create_string_buffer(buf_len)
|
||||
rc = nvml_h.nvmlDeviceGetUUID(dev_id, buf, buf_len)
|
||||
if rc != 0:
|
||||
warnings.warn(f'Failed to get device UUID for {idx}', stacklevel=3)
|
||||
return None
|
||||
warnings.warn(f'Failed to get device UUID for {idx}', stacklevel=3); return None
|
||||
uuids.append(buf.raw.decode('ascii').strip('\0'))
|
||||
del nvml_h
|
||||
return uuids
|
||||
|
||||
|
||||
class _ResourceMixin:
|
||||
@staticmethod
|
||||
def from_system(cls) -> list[str]:
|
||||
visible_devices = _parse_cuda_visible_devices()
|
||||
if visible_devices is None:
|
||||
if cls.resource_id == 'amd.com/gpu':
|
||||
if not psutil.LINUX:
|
||||
if coreutils.DEBUG:
|
||||
logger.debug('AMD GPUs is currently only supported on Linux.')
|
||||
return []
|
||||
if not psutil.LINUX: return []
|
||||
# ROCm does not currently have the rocm_smi wheel.
|
||||
# So we need to use the ctypes bindings directly.
|
||||
# we don't want to use CLI because parsing is a pain.
|
||||
# TODO: Use tinygrad/gpuctypes
|
||||
sys.path.append('/opt/rocm/libexec/rocm_smi')
|
||||
try:
|
||||
from ctypes import byref, c_uint32
|
||||
@@ -122,8 +99,7 @@ class _ResourceMixin:
|
||||
|
||||
device_count = c_uint32(0)
|
||||
ret = rocmsmi.rsmi_num_monitor_devices(byref(device_count))
|
||||
if ret == rsmi_status_t.RSMI_STATUS_SUCCESS:
|
||||
return [str(i) for i in range(device_count.value)]
|
||||
if ret == rsmi_status_t.RSMI_STATUS_SUCCESS: return [str(i) for i in range(device_count.value)]
|
||||
return []
|
||||
# In this case the binary is not found, returning empty list
|
||||
except (ModuleNotFoundError, ImportError):
|
||||
@@ -140,59 +116,43 @@ class _ResourceMixin:
|
||||
except (ImportError, RuntimeError, AttributeError):
|
||||
return []
|
||||
return visible_devices
|
||||
|
||||
@staticmethod
|
||||
def from_spec(cls, spec) -> list[str]:
|
||||
if isinstance(spec, int):
|
||||
if spec in (-1, 0):
|
||||
return []
|
||||
if spec < -1:
|
||||
raise ValueError('Spec cannot be < -1.')
|
||||
if spec in (-1, 0): return []
|
||||
if spec < -1: raise ValueError('Spec cannot be < -1.')
|
||||
return [str(i) for i in range(spec)]
|
||||
elif isinstance(spec, str):
|
||||
if not spec:
|
||||
return []
|
||||
if spec.isdigit():
|
||||
spec = ','.join([str(i) for i in range(_strtoul(spec))])
|
||||
if not spec: return []
|
||||
if spec.isdigit(): spec = ','.join([str(i) for i in range(_strtoul(spec))])
|
||||
return _parse_cuda_visible_devices(spec, respect_env=False)
|
||||
elif isinstance(spec, list):
|
||||
return [str(x) for x in spec]
|
||||
else:
|
||||
raise TypeError(
|
||||
f"'{cls.__name__}.from_spec' only supports parsing spec of type int, str, or list, got '{type(spec)}' instead."
|
||||
)
|
||||
|
||||
raise TypeError(f"'{cls.__name__}.from_spec' only supports parsing spec of type int, str, or list, got '{type(spec)}' instead.")
|
||||
@staticmethod
|
||||
def validate(cls, val: list[t.Any]) -> None:
|
||||
if cls.resource_id == 'amd.com/gpu':
|
||||
raise RuntimeError(
|
||||
"AMD GPU validation is not yet supported. Make sure to call 'get_resource(..., validate=False)'"
|
||||
)
|
||||
raise RuntimeError("AMD GPU validation is not yet supported. Make sure to call 'get_resource(..., validate=False)'")
|
||||
if not all(isinstance(i, str) for i in val):
|
||||
raise ValueError('Input list should be all string type.')
|
||||
|
||||
try:
|
||||
from cuda import cuda
|
||||
|
||||
err, *_ = cuda.cuInit(0)
|
||||
if err != cuda.CUresult.CUDA_SUCCESS:
|
||||
raise RuntimeError('Failed to initialise CUDA runtime binding.')
|
||||
if err != cuda.CUresult.CUDA_SUCCESS: raise RuntimeError('Failed to initialise CUDA runtime binding.')
|
||||
# correctly parse handle
|
||||
for el in val:
|
||||
if el.startswith(('GPU-', 'MIG-')):
|
||||
uuids = _raw_device_uuid_nvml()
|
||||
if uuids is None:
|
||||
raise ValueError('Failed to parse available GPUs UUID')
|
||||
if el not in uuids:
|
||||
raise ValueError(f'Given UUID {el} is not found with available UUID (available: {uuids})')
|
||||
if uuids is None: raise ValueError('Failed to parse available GPUs UUID')
|
||||
if el not in uuids: raise ValueError(f'Given UUID {el} is not found with available UUID (available: {uuids})')
|
||||
elif el.isdigit():
|
||||
err, _ = cuda.cuDeviceGet(int(el))
|
||||
if err != cuda.CUresult.CUDA_SUCCESS:
|
||||
raise ValueError(f'Failed to get device {el}')
|
||||
if err != cuda.CUresult.CUDA_SUCCESS: raise ValueError(f'Failed to get device {el}')
|
||||
except (ImportError, RuntimeError):
|
||||
pass
|
||||
|
||||
|
||||
def _make_resource_class(name: str, resource_kind: str, docstring: str) -> type[bentoml.Resource[t.List[str]]]:
|
||||
return types.new_class(
|
||||
name,
|
||||
@@ -201,22 +161,16 @@ def _make_resource_class(name: str, resource_kind: str, docstring: str) -> type[
|
||||
lambda ns: ns.update(
|
||||
{
|
||||
'resource_id': resource_kind,
|
||||
'from_spec': classmethod(_ResourceMixin.from_spec),
|
||||
'from_system': classmethod(_ResourceMixin.from_system),
|
||||
'validate': classmethod(_ResourceMixin.validate),
|
||||
'__repr_keys__': property(lambda _: {'resource_id'}),
|
||||
'__doc__': inspect.cleandoc(docstring),
|
||||
'__module__': 'openllm._strategies',
|
||||
'from_spec': classmethod(_ResourceMixin.from_spec), 'from_system': classmethod(_ResourceMixin.from_system), #
|
||||
'validate': classmethod(_ResourceMixin.validate), '__repr_keys__': property(lambda _: {'resource_id'}), #
|
||||
'__doc__': inspect.cleandoc(docstring), '__module__': 'openllm._strategies', #
|
||||
}
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
NvidiaGpuResource = _make_resource_class(
|
||||
'NvidiaGpuResource',
|
||||
'nvidia.com/gpu',
|
||||
'''NVIDIA GPU resource.
|
||||
|
||||
This is a modified version of internal's BentoML's NvidiaGpuResource
|
||||
where it respects and parse CUDA_VISIBLE_DEVICES correctly.''',
|
||||
)
|
||||
@@ -224,73 +178,53 @@ AmdGpuResource = _make_resource_class(
|
||||
'AmdGpuResource',
|
||||
'amd.com/gpu',
|
||||
'''AMD GPU resource.
|
||||
|
||||
Since ROCm will respect CUDA_VISIBLE_DEVICES, the behaviour of from_spec, from_system are similar to
|
||||
``NvidiaGpuResource``. Currently ``validate`` is not yet supported.''',
|
||||
)
|
||||
|
||||
|
||||
class CascadingResourceStrategy(bentoml.Strategy, coreutils.ReprMixin):
|
||||
@classmethod
|
||||
def get_worker_count(cls, runnable_class, resource_request, workers_per_resource):
|
||||
if resource_request is None:
|
||||
resource_request = system_resources()
|
||||
if resource_request is None: resource_request = system_resources()
|
||||
# use NVIDIA
|
||||
kind = 'nvidia.com/gpu'
|
||||
nvidia_req = get_resource(resource_request, kind)
|
||||
if nvidia_req is not None:
|
||||
return 1
|
||||
if nvidia_req is not None: return 1
|
||||
# use AMD
|
||||
kind = 'amd.com/gpu'
|
||||
amd_req = get_resource(resource_request, kind, validate=False)
|
||||
if amd_req is not None:
|
||||
return 1
|
||||
if amd_req is not None: return 1
|
||||
# use CPU
|
||||
cpus = get_resource(resource_request, 'cpu')
|
||||
if cpus is not None and cpus > 0:
|
||||
if 'cpu' not in runnable_class.SUPPORTED_RESOURCES:
|
||||
logger.warning('No known supported resource available for %s, falling back to using CPU.', runnable_class)
|
||||
|
||||
if runnable_class.SUPPORTS_CPU_MULTI_THREADING:
|
||||
if isinstance(workers_per_resource, float) and workers_per_resource < 1.0:
|
||||
raise ValueError('Fractional CPU multi threading support is not yet supported.')
|
||||
if isinstance(workers_per_resource, float) and workers_per_resource < 1.0: raise ValueError('Fractional CPU multi threading support is not yet supported.')
|
||||
return int(workers_per_resource)
|
||||
return math.ceil(cpus) * workers_per_resource
|
||||
|
||||
# this should not be reached by user since we always read system resource as default
|
||||
raise ValueError(
|
||||
f'No known supported resource available for {runnable_class}. Please check your resource request. Leaving it blank will allow BentoML to use system resources.'
|
||||
)
|
||||
|
||||
raise ValueError(f'No known supported resource available for {runnable_class}. Please check your resource request. Leaving it blank will allow BentoML to use system resources.')
|
||||
@classmethod
|
||||
def get_worker_env(cls, runnable_class, resource_request, workers_per_resource, worker_index):
|
||||
cuda_env = os.environ.get('CUDA_VISIBLE_DEVICES', None)
|
||||
disabled = cuda_env in ('', '-1')
|
||||
environ: dict[str, t.Any] = {}
|
||||
environ = {}
|
||||
|
||||
if resource_request is None:
|
||||
resource_request = system_resources()
|
||||
if resource_request is None: resource_request = system_resources()
|
||||
# use NVIDIA
|
||||
kind = 'nvidia.com/gpu'
|
||||
typ = get_resource(resource_request, kind)
|
||||
if typ is not None and len(typ) > 0 and kind in runnable_class.SUPPORTED_RESOURCES:
|
||||
if disabled:
|
||||
logger.debug('CUDA_VISIBLE_DEVICES is disabled, %s will not be using GPU.', worker_index)
|
||||
environ['CUDA_VISIBLE_DEVICES'] = cuda_env
|
||||
return environ
|
||||
environ['CUDA_VISIBLE_DEVICES'] = cuda_env; return environ
|
||||
environ['CUDA_VISIBLE_DEVICES'] = cls.transpile_workers_to_cuda_envvar(workers_per_resource, typ, worker_index)
|
||||
logger.debug('Environ for worker %s: %s', worker_index, environ)
|
||||
return environ
|
||||
# use AMD
|
||||
kind = 'amd.com/gpu'
|
||||
typ = get_resource(resource_request, kind, validate=False)
|
||||
if typ is not None and len(typ) > 0 and kind in runnable_class.SUPPORTED_RESOURCES:
|
||||
if disabled:
|
||||
logger.debug('CUDA_VISIBLE_DEVICES is disabled, %s will not be using GPU.', worker_index)
|
||||
environ['CUDA_VISIBLE_DEVICES'] = cuda_env
|
||||
return environ
|
||||
environ['CUDA_VISIBLE_DEVICES'] = cuda_env; return environ
|
||||
environ['CUDA_VISIBLE_DEVICES'] = cls.transpile_workers_to_cuda_envvar(workers_per_resource, typ, worker_index)
|
||||
logger.debug('Environ for worker %s: %s', worker_index, environ)
|
||||
return environ
|
||||
# use CPU
|
||||
cpus = get_resource(resource_request, 'cpu')
|
||||
@@ -298,25 +232,17 @@ class CascadingResourceStrategy(bentoml.Strategy, coreutils.ReprMixin):
|
||||
environ['CUDA_VISIBLE_DEVICES'] = '-1' # disable gpu
|
||||
if runnable_class.SUPPORTS_CPU_MULTI_THREADING:
|
||||
thread_count = math.ceil(cpus)
|
||||
for thread_env in THREAD_ENVS:
|
||||
environ[thread_env] = os.environ.get(thread_env, str(thread_count))
|
||||
logger.debug('Environ for worker %s: %s', worker_index, environ)
|
||||
for thread_env in THREAD_ENVS: environ[thread_env] = os.environ.get(thread_env, str(thread_count))
|
||||
return environ
|
||||
for thread_env in THREAD_ENVS:
|
||||
environ[thread_env] = os.environ.get(thread_env, '1')
|
||||
for thread_env in THREAD_ENVS: environ[thread_env] = os.environ.get(thread_env, '1')
|
||||
return environ
|
||||
return environ
|
||||
|
||||
@staticmethod
|
||||
def transpile_workers_to_cuda_envvar(workers_per_resource, gpus, worker_index):
|
||||
# Convert given workers_per_resource to correct CUDA_VISIBLE_DEVICES string.
|
||||
if isinstance(workers_per_resource, float):
|
||||
# NOTE: We hit this branch when workers_per_resource is set to
|
||||
# float, for example 0.5 or 0.25
|
||||
if workers_per_resource > 1:
|
||||
raise ValueError(
|
||||
"Currently, the default strategy doesn't support workers_per_resource > 1. It is recommended that one should implement a custom strategy in this case."
|
||||
)
|
||||
# NOTE: We hit this branch when workers_per_resource is set to float, for example 0.5 or 0.25
|
||||
if workers_per_resource > 1: raise ValueError('workers_per_resource > 1 is not supported.')
|
||||
# We are round the assigned resource here. This means if workers_per_resource=.4
|
||||
# then it will round down to 2. If workers_per_source=0.6, then it will also round up to 2.
|
||||
assigned_resource_per_worker = round(1 / workers_per_resource)
|
||||
@@ -327,21 +253,12 @@ class CascadingResourceStrategy(bentoml.Strategy, coreutils.ReprMixin):
|
||||
worker_index,
|
||||
assigned_resource_per_worker,
|
||||
)
|
||||
raise IndexError(
|
||||
f"There aren't enough assigned GPU(s) for given worker id '{worker_index}' [required: {assigned_resource_per_worker}]."
|
||||
)
|
||||
assigned_gpu = gpus[
|
||||
assigned_resource_per_worker * worker_index : assigned_resource_per_worker * (worker_index + 1)
|
||||
]
|
||||
raise IndexError(f"There aren't enough assigned GPU(s) for given worker id '{worker_index}' [required: {assigned_resource_per_worker}].")
|
||||
assigned_gpu = gpus[assigned_resource_per_worker * worker_index : assigned_resource_per_worker * (worker_index + 1)]
|
||||
dev = ','.join(assigned_gpu)
|
||||
else:
|
||||
idx = worker_index // workers_per_resource
|
||||
if idx >= len(gpus):
|
||||
raise ValueError(
|
||||
f'Number of available GPU ({gpus}) preceeds the given workers_per_resource {workers_per_resource}'
|
||||
)
|
||||
raise ValueError(f'Number of available GPU ({gpus}) preceeds the given workers_per_resource {workers_per_resource}')
|
||||
dev = str(gpus[idx])
|
||||
return dev
|
||||
|
||||
|
||||
__all__ = ['CascadingResourceStrategy', 'get_resource']
|
||||
|
||||
@@ -4,7 +4,6 @@ from openllm_core._typing_compat import LiteralVersionStrategy
|
||||
from openllm_core.exceptions import OpenLLMException
|
||||
from openllm_core.utils.lazy import VersionInfo, LazyModule
|
||||
|
||||
_OWNER, _REPO = 'bentoml', 'openllm'
|
||||
@attr.attrs(eq=False, order=False, slots=True, frozen=True)
|
||||
class RefResolver:
|
||||
git_hash: str = attr.field()
|
||||
@@ -17,7 +16,7 @@ class RefResolver:
|
||||
if strategy_or_version is None or strategy_or_version == 'release':
|
||||
try:
|
||||
from ghapi.all import GhApi
|
||||
ghapi = GhApi(owner=_OWNER, repo=_REPO, authenticate=False)
|
||||
ghapi = GhApi(owner='bentoml', repo='openllm', authenticate=False)
|
||||
meta = ghapi.repos.get_latest_release()
|
||||
git_hash = ghapi.git.get_ref(ref=f"tags/{meta['name']}")['object']['sha']
|
||||
except Exception as err:
|
||||
@@ -35,6 +34,4 @@ __lazy = LazyModule(
|
||||
{'_package': ['create_bento', 'build_editable', 'construct_python_options', 'construct_docker_options']},
|
||||
extra_objects={'RefResolver': RefResolver}
|
||||
)
|
||||
__all__ = __lazy.__all__
|
||||
__dir__ = __lazy.__dir__
|
||||
__getattr__ = __lazy.__getattr__
|
||||
__all__, __dir__, __getattr__ = __lazy.__all__, __lazy.__dir__, __lazy.__getattr__
|
||||
|
||||
@@ -1,15 +1,7 @@
|
||||
# mypy: disable-error-code="misc"
|
||||
from __future__ import annotations
|
||||
import importlib.metadata
|
||||
import logging
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import orjson
|
||||
import importlib.metadata, logging, os, pathlib
|
||||
import bentoml, orjson, openllm_core
|
||||
from simple_di import Provide, inject
|
||||
|
||||
import bentoml
|
||||
import openllm_core
|
||||
from bentoml._internal.bento.build_config import BentoBuildConfig, DockerOptions, ModelSpec, PythonOptions
|
||||
from bentoml._internal.configuration.containers import BentoMLContainer
|
||||
from openllm_core.utils import SHOW_CODEGEN, check_bool_env, pkg
|
||||
@@ -17,7 +9,7 @@ from openllm_core.utils import SHOW_CODEGEN, check_bool_env, pkg
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
OPENLLM_DEV_BUILD = 'OPENLLM_DEV_BUILD'
|
||||
_service_file = Path(os.path.abspath(__file__)).parent.parent / '_service.py'
|
||||
_service_file = pathlib.Path(os.path.abspath(__file__)).parent.parent / '_service.py'
|
||||
_SERVICE_VARS = '''import orjson;model_id,model_tag,adapter_map,serialization,trust_remote_code='{__model_id__}','{__model_tag__}',orjson.loads("""{__model_adapter_map__}"""),'{__model_serialization__}',{__model_trust_remote_code__}'''
|
||||
|
||||
def build_editable(path, package='openllm'):
|
||||
@@ -28,7 +20,7 @@ def build_editable(path, package='openllm'):
|
||||
from build.env import IsolatedEnvBuilder
|
||||
module_location = pkg.source_locations(package)
|
||||
if not module_location: raise RuntimeError('Could not find the source location of OpenLLM.')
|
||||
pyproject_path = Path(module_location).parent.parent / 'pyproject.toml'
|
||||
pyproject_path = pathlib.Path(module_location).parent.parent / 'pyproject.toml'
|
||||
if os.path.isfile(pyproject_path.__fspath__()):
|
||||
with IsolatedEnvBuilder() as env:
|
||||
builder = ProjectBuilder(pyproject_path.parent)
|
||||
@@ -70,12 +62,9 @@ def create_bento(
|
||||
labels = dict(llm.identifying_params)
|
||||
labels.update(
|
||||
{
|
||||
'_type': llm.llm_type, '_framework': llm.__llm_backend__, 'start_name': llm.config['start_name'],
|
||||
'base_name_or_path': llm.model_id, 'bundler': 'openllm.bundle',
|
||||
**{
|
||||
f'{package.replace("-","_")}_version': importlib.metadata.version(package)
|
||||
for package in {'openllm', 'openllm-core', 'openllm-client'}
|
||||
},
|
||||
'_type': llm.llm_type, '_framework': llm.__llm_backend__,
|
||||
'start_name': llm.config['start_name'], 'base_name_or_path': llm.model_id, 'bundler': 'openllm.bundle',
|
||||
**{f'{package.replace("-","_")}_version': importlib.metadata.version(package) for package in {'openllm', 'openllm-core', 'openllm-client'}},
|
||||
}
|
||||
)
|
||||
if adapter_map: labels.update(adapter_map)
|
||||
@@ -83,18 +72,15 @@ def create_bento(
|
||||
logger.debug("Building Bento '%s' with model backend '%s'", bento_tag, llm.__llm_backend__)
|
||||
logger.debug('Generating service vars %s (dir=%s)', llm.model_id, llm_fs.getsyspath('/'))
|
||||
script = f"# fmt: off\n# GENERATED BY 'openllm build {llm.model_id}'. DO NOT EDIT\n" + _SERVICE_VARS.format(
|
||||
__model_id__=llm.model_id,
|
||||
__model_tag__=str(llm.tag),
|
||||
__model_adapter_map__=orjson.dumps(adapter_map).decode(),
|
||||
__model_serialization__=llm.config['serialisation'],
|
||||
__model_id__=llm.model_id, __model_tag__=str(llm.tag), #
|
||||
__model_adapter_map__=orjson.dumps(adapter_map).decode(), __model_serialization__=llm.config['serialisation'], #
|
||||
__model_trust_remote_code__=str(llm.trust_remote_code),
|
||||
)
|
||||
if SHOW_CODEGEN: logger.info('Generated _service_vars.py:\n%s', script)
|
||||
llm_fs.writetext('_service_vars.py', script)
|
||||
with open(_service_file.__fspath__(), 'r') as f: service_src = f.read()
|
||||
llm_fs.writetext(llm.config['service_name'], service_src)
|
||||
|
||||
bento = bentoml.Bento.create(
|
||||
return bentoml.Bento.create(
|
||||
version=bento_tag.version,
|
||||
build_ctx=llm_fs.getsyspath('/'),
|
||||
build_config=BentoBuildConfig(
|
||||
@@ -108,6 +94,4 @@ def create_bento(
|
||||
python=construct_python_options(llm, llm_fs, extra_dependencies, adapter_map),
|
||||
docker=construct_docker_options(llm, llm_fs, quantize, adapter_map, dockerfile_template, _serialisation),
|
||||
),
|
||||
)
|
||||
|
||||
return bento.save(bento_store=_bento_store, model_store=_model_store)
|
||||
).save(bento_store=_bento_store, model_store=_model_store)
|
||||
|
||||
@@ -1,10 +1,2 @@
def __dir__():
import openllm_client as _client

return sorted(dir(_client))


def __getattr__(it):
import openllm_client as _client

return getattr(_client, it)
def __dir__(): import openllm_client as _client; return sorted(dir(_client))
def __getattr__(it): import openllm_client as _client; return getattr(_client, it)

@@ -1,20 +1,11 @@
import importlib

from openllm_core.utils import LazyModule

_import_structure = {'openai': [], 'hf': [], 'cohere': []}


def mount_entrypoints(svc, llm):
for module_name in _import_structure:
module = importlib.import_module(f'.{module_name}', __name__)
svc = module.mount_to_svc(svc, llm)
return svc


__lazy = LazyModule(
__name__, globals()['__file__'], _import_structure, extra_objects={'mount_entrypoints': mount_entrypoints}
)
__all__ = __lazy.__all__
__dir__ = __lazy.__dir__
__getattr__ = __lazy.__getattr__
__lazy = LazyModule(__name__, globals()['__file__'], _import_structure, extra_objects={'mount_entrypoints': mount_entrypoints})
__all__, __dir__, __getattr__ = __lazy.__all__, __lazy.__dir__, __lazy.__getattr__

@@ -1,17 +1,11 @@
|
||||
from __future__ import annotations
|
||||
import functools
|
||||
import json
|
||||
import logging
|
||||
import traceback
|
||||
import functools, json, logging, traceback
|
||||
from http import HTTPStatus
|
||||
|
||||
import orjson
|
||||
from starlette.applications import Starlette
|
||||
from starlette.responses import JSONResponse, StreamingResponse
|
||||
from starlette.routing import Route
|
||||
|
||||
from openllm_core.utils import DEBUG, converter, gen_random_uuid
|
||||
|
||||
from ._openapi import add_schema_definitions, append_schemas, get_generator
|
||||
from ..protocol.cohere import (
|
||||
Chat,
|
||||
@@ -54,41 +48,31 @@ schemas = get_generator(
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def jsonify_attr(obj):
|
||||
return json.dumps(converter.unstructure(obj))
|
||||
|
||||
def jsonify_attr(obj): return json.dumps(converter.unstructure(obj))
|
||||
|
||||
def error_response(status_code, message):
|
||||
return JSONResponse(converter.unstructure(CohereErrorResponse(text=message)), status_code=status_code.value)
|
||||
|
||||
|
||||
async def check_model(request, model):
|
||||
if request.model is None or request.model == model:
|
||||
return None
|
||||
if request.model is None or request.model == model: return None
|
||||
return error_response(
|
||||
HTTPStatus.NOT_FOUND,
|
||||
f"Model '{request.model}' does not exists. Try 'GET /v1/models' to see current running models.",
|
||||
)
|
||||
|
||||
|
||||
def mount_to_svc(svc, llm):
|
||||
app = Starlette(
|
||||
debug=True,
|
||||
routes=[
|
||||
Route(
|
||||
'/v1/generate', endpoint=functools.partial(cohere_generate, llm=llm), name='cohere_generate', methods=['POST']
|
||||
),
|
||||
Route('/v1/chat', endpoint=functools.partial(cohere_chat, llm=llm), name='cohere_chat', methods=['POST']),
|
||||
Route('/schema', endpoint=lambda req: schemas.OpenAPIResponse(req), include_in_schema=False),
|
||||
Route('/v1/chat', endpoint=functools.partial(cohere_chat, llm=llm), name='cohere_chat', methods=['POST']),
|
||||
Route('/v1/generate', endpoint=functools.partial(cohere_generate, llm=llm), name='cohere_generate', methods=['POST']),
|
||||
],
|
||||
)
|
||||
mount_path = '/cohere'
|
||||
|
||||
svc.mount_asgi_app(app, path=mount_path)
|
||||
return append_schemas(
|
||||
svc, schemas.get_schema(routes=app.routes, mount_path=mount_path), tags_order='append', inject=DEBUG
|
||||
)
|
||||
|
||||
return append_schemas(svc, schemas.get_schema(routes=app.routes, mount_path=mount_path), tags_order='append', inject=DEBUG)
|
||||
|
||||
@add_schema_definitions
|
||||
async def cohere_generate(req, llm):
|
||||
@@ -181,7 +165,6 @@ def _transpile_cohere_chat_messages(request: CohereChatRequest) -> list[dict[str
|
||||
messages.append({'role': 'user', 'content': request.message})
|
||||
return messages
|
||||
|
||||
|
||||
@add_schema_definitions
|
||||
async def cohere_chat(req, llm):
|
||||
json_str = await req.body()
|
||||
|
||||
@@ -1,14 +1,10 @@
|
||||
import functools
|
||||
import logging
|
||||
import functools, logging
|
||||
from http import HTTPStatus
|
||||
|
||||
import orjson
|
||||
from starlette.applications import Starlette
|
||||
from starlette.responses import JSONResponse
|
||||
from starlette.routing import Route
|
||||
|
||||
from openllm_core.utils import converter
|
||||
|
||||
from ._openapi import add_schema_definitions, append_schemas, get_generator
|
||||
from ..protocol.hf import AgentRequest, AgentResponse, HFErrorResponse
|
||||
|
||||
@@ -25,7 +21,6 @@ schemas = get_generator(
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def mount_to_svc(svc, llm):
|
||||
app = Starlette(
|
||||
debug=True,
|
||||
@@ -39,13 +34,8 @@ def mount_to_svc(svc, llm):
|
||||
svc.mount_asgi_app(app, path=mount_path)
|
||||
return append_schemas(svc, schemas.get_schema(routes=app.routes, mount_path=mount_path), tags_order='append')
|
||||
|
||||
|
||||
def error_response(status_code, message):
|
||||
return JSONResponse(
|
||||
converter.unstructure(HFErrorResponse(message=message, error_code=status_code.value)),
|
||||
status_code=status_code.value,
|
||||
)
|
||||
|
||||
return JSONResponse(converter.unstructure(HFErrorResponse(message=message, error_code=status_code.value)), status_code=status_code.value)
|
||||
|
||||
@add_schema_definitions
|
||||
async def hf_agent(req, llm):
|
||||
@@ -60,18 +50,14 @@ async def hf_agent(req, llm):
|
||||
stop = request.parameters.pop('stop', ['\n'])
|
||||
try:
|
||||
result = await llm.generate(request.inputs, stop=stop, **request.parameters)
|
||||
return JSONResponse(
|
||||
converter.unstructure([AgentResponse(generated_text=result.outputs[0].text)]), status_code=HTTPStatus.OK.value
|
||||
)
|
||||
return JSONResponse(converter.unstructure([AgentResponse(generated_text=result.outputs[0].text)]), status_code=HTTPStatus.OK.value)
|
||||
except Exception as err:
|
||||
logger.error('Error while generating: %s', err)
|
||||
return error_response(HTTPStatus.INTERNAL_SERVER_ERROR, 'Error while generating (Check server log).')
|
||||
|
||||
|
||||
@add_schema_definitions
|
||||
def hf_adapters(req, llm):
|
||||
if not llm.has_adapters:
|
||||
return error_response(HTTPStatus.NOT_FOUND, 'No adapters found.')
|
||||
if not llm.has_adapters: return error_response(HTTPStatus.NOT_FOUND, 'No adapters found.')
|
||||
return JSONResponse(
|
||||
{
|
||||
adapter_tuple[1]: {'adapter_name': k, 'adapter_type': adapter_tuple[0].peft_type.value}
|
||||
|
||||
@@ -1,10 +1,7 @@
from openllm_core.exceptions import (
Error as Error,
FineTuneStrategyNotSupportedError as FineTuneStrategyNotSupportedError,
ForbiddenAttributeError as ForbiddenAttributeError,
GpuNotAvailableError as GpuNotAvailableError,
Error as Error, FineTuneStrategyNotSupportedError as FineTuneStrategyNotSupportedError, #
ForbiddenAttributeError as ForbiddenAttributeError, GpuNotAvailableError as GpuNotAvailableError, #
OpenLLMException as OpenLLMException, ValidationError as ValidationError, #
MissingAnnotationAttributeError as MissingAnnotationAttributeError,
MissingDependencyError as MissingDependencyError,
OpenLLMException as OpenLLMException,
ValidationError as ValidationError,
)

@@ -5,11 +5,6 @@ import typing as t
from openllm_core.utils import LazyModule

_import_structure: dict[str, list[str]] = {'openai': [], 'cohere': [], 'hf': []}

if t.TYPE_CHECKING:
from . import cohere as cohere, hf as hf, openai as openai

if t.TYPE_CHECKING: from . import cohere as cohere, hf as hf, openai as openai
__lazy = LazyModule(__name__, os.path.abspath('__file__'), _import_structure)
__all__ = __lazy.__all__
__dir__ = __lazy.__dir__
__getattr__ = __lazy.__getattr__
__all__, __dir__, __getattr__ = __lazy.__all__, __lazy.__dir__, __lazy.__getattr__

@@ -1,36 +1,16 @@
import functools, importlib.metadata, openllm_core

__all__ = ['generate_labels', 'available_devices', 'device_count']


def generate_labels(llm):
return {
'backend': llm.__llm_backend__,
'framework': 'openllm',
'model_name': llm.config['model_name'],
'architecture': llm.config['architecture'],
'serialisation': llm._serialisation,
'backend': llm.__llm_backend__, 'framework': 'openllm', 'model_name': llm.config['model_name'], #
'architecture': llm.config['architecture'], 'serialisation': llm._serialisation, #
**{package: importlib.metadata.version(package) for package in {'openllm', 'openllm-core', 'openllm-client'}},
}


def available_devices():
from ._strategies import NvidiaGpuResource

return tuple(NvidiaGpuResource.from_system())


def available_devices(): from ._strategies import NvidiaGpuResource; return tuple(NvidiaGpuResource.from_system())
@functools.lru_cache(maxsize=1)
def device_count() -> int:
return len(available_devices())


def device_count() -> int: return len(available_devices())
def __dir__():
coreutils = set(dir(openllm_core.utils)) | set([it for it in openllm_core.utils._extras if not it.startswith('_')])
return sorted(__all__) + sorted(list(coreutils))


coreutils = set(dir(openllm_core.utils)) | set([it for it in openllm_core.utils._extras if not it.startswith('_')]); return sorted(__all__) + sorted(list(coreutils))
def __getattr__(it):
if hasattr(openllm_core.utils, it):
return getattr(openllm_core.utils, it)
if hasattr(openllm_core.utils, it): return getattr(openllm_core.utils, it)
raise AttributeError(f'module {__name__} has no attribute {it}')
