style: google

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
This commit is contained in:
Aaron
2023-08-30 13:52:00 -04:00
parent e2ba6a92a6
commit b545ad2ad1
98 changed files with 3514 additions and 2094 deletions

View File

@@ -26,11 +26,14 @@ else:
# configuration for bitsandbytes before import
_os.environ["BITSANDBYTES_NOWELCOME"] = _os.environ.get("BITSANDBYTES_NOWELCOME", "1")
# NOTE: The following warnings from bitsandbytes, and probably not that important for users to see when DEBUG is False
_warnings.filterwarnings("ignore", message="MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization")
_warnings.filterwarnings("ignore", message="MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization")
_warnings.filterwarnings(
"ignore", message="MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization")
_warnings.filterwarnings(
"ignore", message="MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization")
_warnings.filterwarnings("ignore", message="The installed version of bitsandbytes was compiled without GPU support.")
# NOTE: ignore the following warning from ghapi as it is not important for users
_warnings.filterwarnings("ignore", message="Neither GITHUB_TOKEN nor GITHUB_JWT_TOKEN found: running as unauthenticated")
_warnings.filterwarnings("ignore",
message="Neither GITHUB_TOKEN nor GITHUB_JWT_TOKEN found: running as unauthenticated")
_import_structure: dict[str, list[str]] = {
"exceptions": [],
@@ -45,8 +48,13 @@ _import_structure: dict[str, list[str]] = {
"_quantisation": ["infer_quantisation_config"],
"_embeddings": ["GenericEmbeddingRunnable"],
"_llm": ["LLM", "Runner", "LLMRunner", "LLMRunnable", "LLMEmbeddings"],
"_generation": ["StopSequenceCriteria", "StopOnTokens", "LogitsProcessorList", "StoppingCriteriaList", "prepare_logits_processor"],
"models.auto": ["MODEL_MAPPING_NAMES", "MODEL_FLAX_MAPPING_NAMES", "MODEL_TF_MAPPING_NAMES", "MODEL_VLLM_MAPPING_NAMES"],
"_generation": [
"StopSequenceCriteria", "StopOnTokens", "LogitsProcessorList", "StoppingCriteriaList",
"prepare_logits_processor"
],
"models.auto": [
"MODEL_MAPPING_NAMES", "MODEL_FLAX_MAPPING_NAMES", "MODEL_TF_MAPPING_NAMES", "MODEL_VLLM_MAPPING_NAMES"
],
"models.chatglm": [],
"models.baichuan": [],
"models.dolly_v2": [],
@@ -73,7 +81,8 @@ if _t.TYPE_CHECKING:
from .utils import infer_auto_class as infer_auto_class
try:
if not (openllm_core.utils.is_torch_available() and openllm_core.utils.is_cpm_kernels_available()): raise exceptions.MissingDependencyError
if not (openllm_core.utils.is_torch_available() and openllm_core.utils.is_cpm_kernels_available()):
raise exceptions.MissingDependencyError
except exceptions.MissingDependencyError:
_import_structure["utils.dummy_pt_objects"] = ["ChatGLM", "Baichuan"]
else:
@@ -83,7 +92,8 @@ else:
from .models.baichuan import Baichuan as Baichuan
from .models.chatglm import ChatGLM as ChatGLM
try:
if not (openllm_core.utils.is_torch_available() and openllm_core.utils.is_triton_available()): raise exceptions.MissingDependencyError
if not (openllm_core.utils.is_torch_available() and openllm_core.utils.is_triton_available()):
raise exceptions.MissingDependencyError
except exceptions.MissingDependencyError:
if "utils.dummy_pt_objects" in _import_structure: _import_structure["utils.dummy_pt_objects"].extend(["MPT"])
else: _import_structure["utils.dummy_pt_objects"] = ["MPT"]
@@ -91,7 +101,8 @@ else:
_import_structure["models.mpt"].extend(["MPT"])
if _t.TYPE_CHECKING: from .models.mpt import MPT as MPT
try:
if not (openllm_core.utils.is_torch_available() and openllm_core.utils.is_einops_available()): raise exceptions.MissingDependencyError
if not (openllm_core.utils.is_torch_available() and openllm_core.utils.is_einops_available()):
raise exceptions.MissingDependencyError
except exceptions.MissingDependencyError:
if "utils.dummy_pt_objects" in _import_structure: _import_structure["utils.dummy_pt_objects"].extend(["Falcon"])
else: _import_structure["utils.dummy_pt_objects"] = ["Falcon"]
@@ -103,7 +114,8 @@ try:
if not openllm_core.utils.is_torch_available(): raise exceptions.MissingDependencyError
except exceptions.MissingDependencyError:
_import_structure["utils.dummy_pt_objects"] = [
name for name in dir(utils.dummy_pt_objects) if not name.startswith("_") and name not in ("ChatGLM", "Baichuan", "MPT", "Falcon", "annotations")
name for name in dir(utils.dummy_pt_objects)
if not name.startswith("_") and name not in ("ChatGLM", "Baichuan", "MPT", "Falcon", "annotations")
]
else:
_import_structure["models.flan_t5"].extend(["FlanT5"])
@@ -126,7 +138,9 @@ else:
try:
if not openllm_core.utils.is_vllm_available(): raise exceptions.MissingDependencyError
except exceptions.MissingDependencyError:
_import_structure["utils.dummy_vllm_objects"] = [name for name in dir(utils.dummy_vllm_objects) if not name.startswith("_") and name not in ("annotations",)]
_import_structure["utils.dummy_vllm_objects"] = [
name for name in dir(utils.dummy_vllm_objects) if not name.startswith("_") and name not in ("annotations",)
]
else:
_import_structure["models.baichuan"].extend(["VLLMBaichuan"])
_import_structure["models.llama"].extend(["VLLMLlama"])
@@ -152,7 +166,9 @@ else:
try:
if not openllm_core.utils.is_flax_available(): raise exceptions.MissingDependencyError
except exceptions.MissingDependencyError:
_import_structure["utils.dummy_flax_objects"] = [name for name in dir(utils.dummy_flax_objects) if not name.startswith("_") and name not in ("annotations",)]
_import_structure["utils.dummy_flax_objects"] = [
name for name in dir(utils.dummy_flax_objects) if not name.startswith("_") and name not in ("annotations",)
]
else:
_import_structure["models.flan_t5"].extend(["FlaxFlanT5"])
_import_structure["models.opt"].extend(["FlaxOPT"])
@@ -164,7 +180,9 @@ else:
try:
if not openllm_core.utils.is_tf_available(): raise exceptions.MissingDependencyError
except exceptions.MissingDependencyError:
_import_structure["utils.dummy_tf_objects"] = [name for name in dir(utils.dummy_tf_objects) if not name.startswith("_") and name not in ("annotations",)]
_import_structure["utils.dummy_tf_objects"] = [
name for name in dir(utils.dummy_tf_objects) if not name.startswith("_") and name not in ("annotations",)
]
else:
_import_structure["models.flan_t5"].extend(["TFFlanT5"])
_import_structure["models.opt"].extend(["TFOPT"])
@@ -175,7 +193,10 @@ else:
from .models.opt import TFOPT as TFOPT
# NOTE: update this to sys.modules[__name__] once mypy_extensions can recognize __spec__
__lazy = openllm_core.utils.LazyModule(__name__, globals()["__file__"], _import_structure, extra_objects={"COMPILED": COMPILED})
__lazy = openllm_core.utils.LazyModule(__name__,
globals()["__file__"],
_import_structure,
extra_objects={"COMPILED": COMPILED})
__all__ = __lazy.__all__
__dir__ = __lazy.__dir__
__getattr__ = __lazy.__getattr__

View File

@@ -26,22 +26,24 @@ def get_or_download(ids: str = _BENTOMODEL_ID) -> bentoml.Model:
except bentoml.exceptions.NotFound:
model_signatures = {
k: ModelSignature(batchable=False)
for k in ('forward', 'generate', 'contrastive_search', 'greedy_search', 'sample', 'beam_search', 'beam_sample', 'group_beam_search', 'constrained_beam_search', '__call__')
for k in ('forward', 'generate', 'contrastive_search', 'greedy_search', 'sample', 'beam_search', 'beam_sample',
'group_beam_search', 'constrained_beam_search', '__call__')
}
with bentoml.models.create(
ids,
module=MODULE_NAME,
api_version=API_VERSION,
options=ModelOptions(),
context=openllm.utils.generate_context(framework_name='transformers'),
labels={
'runtime': 'pt', 'framework': 'openllm'
},
signatures=model_signatures
) as bentomodel:
with bentoml.models.create(ids,
module=MODULE_NAME,
api_version=API_VERSION,
options=ModelOptions(),
context=openllm.utils.generate_context(framework_name='transformers'),
labels={
'runtime': 'pt',
'framework': 'openllm'
},
signatures=model_signatures) as bentomodel:
snapshot_download(
_GENERIC_EMBEDDING_ID, local_dir=bentomodel.path, local_dir_use_symlinks=False, ignore_patterns=['*.safetensors', '*.h5', '*.ot', '*.pdf', '*.md', '.gitattributes', 'LICENSE.txt']
)
_GENERIC_EMBEDDING_ID,
local_dir=bentomodel.path,
local_dir_use_symlinks=False,
ignore_patterns=['*.safetensors', '*.h5', '*.ot', '*.pdf', '*.md', '.gitattributes', 'LICENSE.txt'])
return bentomodel
class GenericEmbeddingRunnable(bentoml.Runnable):
@@ -66,7 +68,10 @@ class GenericEmbeddingRunnable(bentoml.Runnable):
model_output = self.model(**encoded_input)
# Perform pooling and normalize
sentence_embeddings = F.normalize(self.mean_pooling(model_output, attention_mask), p=2, dim=1)
return [openllm.LLMEmbeddings(embeddings=sentence_embeddings.cpu().numpy(), num_tokens=int(torch.sum(attention_mask).item()))]
return [
openllm.LLMEmbeddings(embeddings=sentence_embeddings.cpu().numpy(),
num_tokens=int(torch.sum(attention_mask).item()))
]
@staticmethod
def mean_pooling(model_output: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:

View File

@@ -14,23 +14,30 @@ LogitsProcessorList = transformers.LogitsProcessorList
StoppingCriteriaList = transformers.StoppingCriteriaList
class StopSequenceCriteria(transformers.StoppingCriteria):
def __init__(self, stop_sequences: str | list[str], tokenizer: transformers.PreTrainedTokenizer | transformers.PreTrainedTokenizerBase | transformers.PreTrainedTokenizerFast):
def __init__(self, stop_sequences: str | list[str], tokenizer: transformers.PreTrainedTokenizer |
transformers.PreTrainedTokenizerBase | transformers.PreTrainedTokenizerFast):
if isinstance(stop_sequences, str): stop_sequences = [stop_sequences]
self.stop_sequences, self.tokenizer = stop_sequences, tokenizer
def __call__(self, input_ids: torch.Tensor, scores: t.Any, **_: t.Any) -> bool:
return any(self.tokenizer.decode(input_ids.tolist()[0]).endswith(stop_sequence) for stop_sequence in self.stop_sequences)
return any(
self.tokenizer.decode(input_ids.tolist()[0]).endswith(stop_sequence) for stop_sequence in self.stop_sequences)
class StopOnTokens(transformers.StoppingCriteria):
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **_: t.Any) -> bool:
return input_ids[0][-1] in {50278, 50279, 50277, 1, 0}
def prepare_logits_processor(config: openllm.LLMConfig) -> transformers.LogitsProcessorList:
generation_config = config.generation_config
logits_processor = transformers.LogitsProcessorList()
if generation_config['temperature'] >= 1e-5 and generation_config['temperature'] != 1.0: logits_processor.append(transformers.TemperatureLogitsWarper(generation_config['temperature']))
if generation_config['repetition_penalty'] > 1.0: logits_processor.append(transformers.RepetitionPenaltyLogitsProcessor(generation_config['repetition_penalty']))
if 1e-8 <= generation_config['top_p']: logits_processor.append(transformers.TopPLogitsWarper(generation_config['top_p']))
if generation_config['temperature'] >= 1e-5 and generation_config['temperature'] != 1.0:
logits_processor.append(transformers.TemperatureLogitsWarper(generation_config['temperature']))
if generation_config['repetition_penalty'] > 1.0:
logits_processor.append(transformers.RepetitionPenaltyLogitsProcessor(generation_config['repetition_penalty']))
if 1e-8 <= generation_config['top_p']:
logits_processor.append(transformers.TopPLogitsWarper(generation_config['top_p']))
if generation_config['top_k'] > 0: logits_processor.append(transformers.TopKLogitsWarper(generation_config['top_k']))
return logits_processor

View File

File diff suppressed because it is too large Load Diff

View File

@@ -15,21 +15,27 @@ if t.TYPE_CHECKING:
from ._llm import LLM
autogptq, torch, transformers = LazyLoader('autogptq', globals(), 'auto_gptq'), LazyLoader('torch', globals(), 'torch'), LazyLoader('transformers', globals(), 'transformers')
autogptq, torch, transformers = LazyLoader('autogptq', globals(),
'auto_gptq'), LazyLoader('torch', globals(), 'torch'), LazyLoader(
'transformers', globals(), 'transformers')
logger = logging.getLogger(__name__)
QuantiseMode = t.Literal['int8', 'int4', 'gptq']
@overload
def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: t.Literal['int8', 'int4'], **attrs: t.Any) -> tuple[transformers.BitsAndBytesConfig, DictStrAny]:
def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: t.Literal['int8', 'int4'],
**attrs: t.Any) -> tuple[transformers.BitsAndBytesConfig, DictStrAny]:
...
@overload
def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: t.Literal['gptq'], **attrs: t.Any) -> tuple[autogptq.BaseQuantizeConfig, DictStrAny]:
def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: t.Literal['gptq'],
**attrs: t.Any) -> tuple[autogptq.BaseQuantizeConfig, DictStrAny]:
...
def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: QuantiseMode, **attrs: t.Any) -> tuple[transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig, DictStrAny]:
def infer_quantisation_config(
cls: type[LLM[t.Any, t.Any]], quantise: QuantiseMode,
**attrs: t.Any) -> tuple[transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig, DictStrAny]:
# 8 bit configuration
int8_threshold = attrs.pop('llm_int8_threshhold', 6.0)
int8_enable_fp32_cpu_offload = attrs.pop('llm_int8_enable_fp32_cpu_offload', False)
@@ -50,13 +56,12 @@ def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: QuantiseMo
if 'lm_head' not in int8_skip_modules and cls.config_class.__openllm_model_type__ == 'causal_lm':
logger.debug("Skipping 'lm_head' for quantization for %s", cls.__name__)
int8_skip_modules.append('lm_head')
return transformers.BitsAndBytesConfig(
load_in_8bit=True,
llm_int8_enable_fp32_cpu_offload=int8_enable_fp32_cpu_offload,
llm_int8_threshhold=int8_threshold,
llm_int8_skip_modules=int8_skip_modules,
llm_int8_has_fp16_weight=int8_has_fp16_weight,
)
return transformers.BitsAndBytesConfig(load_in_8bit=True,
llm_int8_enable_fp32_cpu_offload=int8_enable_fp32_cpu_offload,
llm_int8_threshhold=int8_threshold,
llm_int8_skip_modules=int8_skip_modules,
llm_int8_has_fp16_weight=int8_has_fp16_weight,
)
# 4 bit configuration
int4_compute_dtype = attrs.pop('bnb_4bit_compute_dtype', torch.bfloat16)
@@ -66,18 +71,21 @@ def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: QuantiseMo
# NOTE: Quantization setup
# quantize is a openllm.LLM feature, where we can quantize the model
# with bitsandbytes or quantization aware training.
if not is_bitsandbytes_available(): raise RuntimeError("Quantization requires bitsandbytes to be installed. Make sure to install OpenLLM with 'pip install \"openllm[fine-tune]\"'")
if not is_bitsandbytes_available():
raise RuntimeError(
"Quantization requires bitsandbytes to be installed. Make sure to install OpenLLM with 'pip install \"openllm[fine-tune]\"'"
)
if quantise == 'int8': quantisation_config = create_int8_config(int8_skip_modules)
elif quantise == 'int4':
if is_transformers_supports_kbit():
quantisation_config = transformers.BitsAndBytesConfig(
load_in_4bit=True, bnb_4bit_compute_dtype=int4_compute_dtype, bnb_4bit_quant_type=int4_quant_type, bnb_4bit_use_double_quant=int4_use_double_quant
)
quantisation_config = transformers.BitsAndBytesConfig(load_in_4bit=True,
bnb_4bit_compute_dtype=int4_compute_dtype,
bnb_4bit_quant_type=int4_quant_type,
bnb_4bit_use_double_quant=int4_use_double_quant)
else:
logger.warning(
"'quantize' is set to int4, while the current transformers version %s does not support k-bit quantization. k-bit quantization is supported since transformers 4.30, therefore make sure to install the latest version of transformers either via PyPI or from git source: 'pip install git+https://github.com/huggingface/transformers'. Fallback to int8 quantisation.",
pkg.pkg_version_info('transformers')
)
pkg.pkg_version_info('transformers'))
quantisation_config = create_int8_config(int8_skip_modules)
elif quantise == 'gptq':
if not is_autogptq_available():

View File

@@ -21,11 +21,14 @@ if t.TYPE_CHECKING:
from bentoml._internal.runner.runner import AbstractRunner
from bentoml._internal.runner.runner import RunnerMethod
from openllm_core._typing_compat import TypeAlias
_EmbeddingMethod: TypeAlias = RunnerMethod[t.Union[bentoml.Runnable, openllm.LLMRunnable[t.Any, t.Any]], [t.List[str]], t.Sequence[openllm.EmbeddingsOutput]]
_EmbeddingMethod: TypeAlias = RunnerMethod[t.Union[bentoml.Runnable, openllm.LLMRunnable[t.Any, t.Any]],
[t.List[str]], t.Sequence[openllm.EmbeddingsOutput]]
# The following warnings from bitsandbytes, and probably not that important for users to see
warnings.filterwarnings('ignore', message='MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization')
warnings.filterwarnings('ignore', message='MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization')
warnings.filterwarnings('ignore',
message='MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization')
warnings.filterwarnings('ignore',
message='MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization')
warnings.filterwarnings('ignore', message='The installed version of bitsandbytes was compiled without GPU support.')
model = os.environ.get('OPENLLM_MODEL', '{__model_name__}') # openllm: model name
@@ -37,15 +40,23 @@ generic_embedding_runner = bentoml.Runner(
name='llm-generic-embedding',
scheduling_strategy=openllm_core.CascadingResourceStrategy,
max_batch_size=32,
max_latency_ms=300
)
max_latency_ms=300)
runners: list[AbstractRunner] = [runner]
if not runner.supports_embeddings: runners.append(generic_embedding_runner)
svc = bentoml.Service(name=f"llm-{llm_config['start_name']}-service", runners=runners)
_JsonInput = bentoml.io.JSON.from_sample({'prompt': '', 'llm_config': llm_config.model_dump(flatten=True), 'adapter_name': None})
_JsonInput = bentoml.io.JSON.from_sample({
'prompt': '',
'llm_config': llm_config.model_dump(flatten=True),
'adapter_name': None
})
@svc.api(route='/v1/generate', input=_JsonInput, output=bentoml.io.JSON.from_sample({'responses': [], 'configuration': llm_config.model_dump(flatten=True)}))
@svc.api(route='/v1/generate',
input=_JsonInput,
output=bentoml.io.JSON.from_sample({
'responses': [],
'configuration': llm_config.model_dump(flatten=True)
}))
async def generate_v1(input_dict: dict[str, t.Any]) -> openllm.GenerationOutput:
qa_inputs = openllm.GenerationInput.from_llm_config(llm_config)(**input_dict)
config = qa_inputs.llm_config.model_dump()
@@ -56,67 +67,45 @@ async def generate_v1(input_dict: dict[str, t.Any]) -> openllm.GenerationOutput:
async def generate_stream_v1(input_dict: dict[str, t.Any]) -> t.AsyncGenerator[str, None]:
echo = input_dict.pop('echo', False)
qa_inputs = openllm.GenerationInput.from_llm_config(llm_config)(**input_dict)
return runner.generate_iterator.async_stream(qa_inputs.prompt, adapter_name=qa_inputs.adapter_name, echo=echo, **qa_inputs.llm_config.model_dump())
return runner.generate_iterator.async_stream(qa_inputs.prompt,
adapter_name=qa_inputs.adapter_name,
echo=echo,
**qa_inputs.llm_config.model_dump())
@svc.api(
route='/v1/metadata',
input=bentoml.io.Text(),
output=bentoml.io.JSON.from_sample({
'model_id': runner.llm.model_id,
'timeout': 3600,
'model_name': llm_config['model_name'],
'framework': runner.llm_framework,
'configuration': '',
'supports_embeddings': runner.supports_embeddings,
'supports_hf_agent': runner.supports_hf_agent
})
)
@svc.api(route='/v1/metadata',
input=bentoml.io.Text(),
output=bentoml.io.JSON.from_sample({
'model_id': runner.llm.model_id,
'timeout': 3600,
'model_name': llm_config['model_name'],
'framework': runner.llm_framework,
'configuration': '',
'supports_embeddings': runner.supports_embeddings,
'supports_hf_agent': runner.supports_hf_agent
}))
def metadata_v1(_: str) -> openllm.MetadataOutput:
return openllm.MetadataOutput(
timeout=llm_config['timeout'],
model_name=llm_config['model_name'],
framework=llm_config['env']['framework_value'],
model_id=runner.llm.model_id,
configuration=llm_config.model_dump_json().decode(),
supports_embeddings=runner.supports_embeddings,
supports_hf_agent=runner.supports_hf_agent
)
return openllm.MetadataOutput(timeout=llm_config['timeout'],
model_name=llm_config['model_name'],
framework=llm_config['env']['framework_value'],
model_id=runner.llm.model_id,
configuration=llm_config.model_dump_json().decode(),
supports_embeddings=runner.supports_embeddings,
supports_hf_agent=runner.supports_hf_agent)
@svc.api(
route='/v1/embeddings',
input=bentoml.io.JSON.from_sample(['Hey Jude, welcome to the jungle!', 'What is the meaning of life?']),
output=bentoml.io.JSON.from_sample({
'embeddings': [
0.007917795330286026,
-0.014421648345887661,
0.00481307040899992,
0.007331526838243008,
-0.0066398633643984795,
0.00945580005645752,
0.0087016262114048,
-0.010709521360695362,
0.012635177001357079,
0.010541186667978764,
-0.00730888033285737,
-0.001783102168701589,
0.02339819073677063,
-0.010825827717781067,
-0.015888236463069916,
0.01876218430697918,
0.0076906150206923485,
0.0009032754460349679,
-0.010024012066423893,
0.01090280432254076,
-0.008668390102684498,
0.02070549875497818,
0.0014594447566196322,
-0.018775740638375282,
-0.014814382418990135,
0.01796768605709076
],
'num_tokens': 20
})
)
@svc.api(route='/v1/embeddings',
input=bentoml.io.JSON.from_sample(['Hey Jude, welcome to the jungle!', 'What is the meaning of life?']),
output=bentoml.io.JSON.from_sample({
'embeddings': [
0.007917795330286026, -0.014421648345887661, 0.00481307040899992, 0.007331526838243008,
-0.0066398633643984795, 0.00945580005645752, 0.0087016262114048, -0.010709521360695362,
0.012635177001357079, 0.010541186667978764, -0.00730888033285737, -0.001783102168701589,
0.02339819073677063, -0.010825827717781067, -0.015888236463069916, 0.01876218430697918,
0.0076906150206923485, 0.0009032754460349679, -0.010024012066423893, 0.01090280432254076,
-0.008668390102684498, 0.02070549875497818, 0.0014594447566196322, -0.018775740638375282,
-0.014814382418990135, 0.01796768605709076
],
'num_tokens': 20
}))
async def embeddings_v1(phrases: list[str]) -> openllm.EmbeddingsOutput:
embed_call: _EmbeddingMethod = runner.embeddings if runner.supports_embeddings else generic_embedding_runner.encode # type: ignore[type-arg,assignment,valid-type]
responses = (await embed_call.async_run(phrases))[0]
@@ -132,7 +121,8 @@ if runner.supports_hf_agent and openllm.utils.is_transformers_supports_agent():
raise openllm.exceptions.OpenLLMException(f'Invalid JSON input received: {err}') from None
stop = input_data.parameters.pop('stop', ['\n'])
try:
return JSONResponse(await runner.generate_one.async_run(input_data.inputs, stop, **input_data.parameters), status_code=200)
return JSONResponse(await runner.generate_one.async_run(input_data.inputs, stop, **input_data.parameters),
status_code=200)
except NotImplementedError:
return JSONResponse(f"'{model}' is currently not supported with HuggingFace agents.", status_code=500)
@@ -142,7 +132,8 @@ if runner.supports_hf_agent and openllm.utils.is_transformers_supports_agent():
# general metadata app
async def list_adapter_v1(_: Request) -> Response:
res: dict[str, t.Any] = {}
if runner.peft_adapters['success'] is True: res['result'] = {k: v.to_dict() for k, v in runner.peft_adapters['result'].items()}
if runner.peft_adapters['success'] is True:
res['result'] = {k: v.to_dict() for k, v in runner.peft_adapters['result'].items()}
res.update({'success': runner.peft_adapters['success'], 'error_msg': runner.peft_adapters['error_msg']})
return JSONResponse(res, status_code=200)

View File

@@ -10,7 +10,10 @@ from openllm_core.utils import LazyModule
_import_structure: dict[str, list[str]] = {
'_package': ['create_bento', 'build_editable', 'construct_python_options', 'construct_docker_options'],
'oci': ['CONTAINER_NAMES', 'get_base_container_tag', 'build_container', 'get_base_container_name', 'supported_registries', 'RefResolver']
'oci': [
'CONTAINER_NAMES', 'get_base_container_tag', 'build_container', 'get_base_container_name',
'supported_registries', 'RefResolver'
]
}
if t.TYPE_CHECKING:

View File

@@ -43,14 +43,18 @@ logger = logging.getLogger(__name__)
OPENLLM_DEV_BUILD = 'OPENLLM_DEV_BUILD'
def build_editable(path: str, package: t.Literal['openllm', 'openllm_core', 'openllm_client'] = 'openllm') -> str | None:
def build_editable(path: str,
package: t.Literal['openllm', 'openllm_core', 'openllm_client'] = 'openllm') -> str | None:
'''Build OpenLLM if the OPENLLM_DEV_BUILD environment variable is set.'''
if str(os.environ.get(OPENLLM_DEV_BUILD, False)).lower() != 'true': return None
# We need to build the package in editable mode, so that we can import it
from build import ProjectBuilder
from build.env import IsolatedEnvBuilder
module_location = openllm_core.utils.pkg.source_locations(package)
if not module_location: raise RuntimeError('Could not find the source location of OpenLLM. Make sure to unset OPENLLM_DEV_BUILD if you are developing OpenLLM.')
if not module_location:
raise RuntimeError(
'Could not find the source location of OpenLLM. Make sure to unset OPENLLM_DEV_BUILD if you are developing OpenLLM.'
)
pyproject_path = Path(module_location).parent.parent / 'pyproject.toml'
if os.path.isfile(pyproject_path.__fspath__()):
logger.info('Generating built wheels for package %s...', package)
@@ -60,9 +64,14 @@ def build_editable(path: str, package: t.Literal['openllm', 'openllm_core', 'ope
builder.scripts_dir = env.scripts_dir
env.install(builder.build_system_requires)
return builder.build('wheel', path, config_settings={'--global-option': '--quiet'})
raise RuntimeError('Custom OpenLLM build is currently not supported. Please install OpenLLM from PyPI or built it from Git source.')
raise RuntimeError(
'Custom OpenLLM build is currently not supported. Please install OpenLLM from PyPI or built it from Git source.')
def construct_python_options(llm: openllm.LLM[t.Any, t.Any], llm_fs: FS, extra_dependencies: tuple[str, ...] | None = None, adapter_map: dict[str, str | None] | None = None,) -> PythonOptions:
def construct_python_options(llm: openllm.LLM[t.Any, t.Any],
llm_fs: FS,
extra_dependencies: tuple[str, ...] | None = None,
adapter_map: dict[str, str | None] | None = None,
) -> PythonOptions:
packages = ['openllm', 'scipy'] # apparently bnb misses this one
if adapter_map is not None: packages += ['openllm[fine-tune]']
# NOTE: add openllm to the default dependencies
@@ -73,27 +82,24 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any], llm_fs: FS, extra_d
req = llm.config['requirements']
if req is not None: packages.extend(req)
if str(os.environ.get('BENTOML_BUNDLE_LOCAL_BUILD', False)).lower() == 'false': packages.append(f"bentoml>={'.'.join([str(i) for i in openllm_core.utils.pkg.pkg_version_info('bentoml')])}")
if str(os.environ.get('BENTOML_BUNDLE_LOCAL_BUILD', False)).lower() == 'false':
packages.append(f"bentoml>={'.'.join([str(i) for i in openllm_core.utils.pkg.pkg_version_info('bentoml')])}")
env = llm.config['env']
framework_envvar = env['framework_value']
if framework_envvar == 'flax':
if not openllm_core.utils.is_flax_available(): raise ValueError(f"Flax is not available, while {env.framework} is set to 'flax'")
packages.extend([importlib.metadata.version('flax'), importlib.metadata.version('jax'), importlib.metadata.version('jaxlib')])
if not openllm_core.utils.is_flax_available():
raise ValueError(f"Flax is not available, while {env.framework} is set to 'flax'")
packages.extend(
[importlib.metadata.version('flax'),
importlib.metadata.version('jax'),
importlib.metadata.version('jaxlib')])
elif framework_envvar == 'tf':
if not openllm_core.utils.is_tf_available(): raise ValueError(f"TensorFlow is not available, while {env.framework} is set to 'tf'")
candidates = (
'tensorflow',
'tensorflow-cpu',
'tensorflow-gpu',
'tf-nightly',
'tf-nightly-cpu',
'tf-nightly-gpu',
'intel-tensorflow',
'intel-tensorflow-avx512',
'tensorflow-rocm',
'tensorflow-macos',
)
if not openllm_core.utils.is_tf_available():
raise ValueError(f"TensorFlow is not available, while {env.framework} is set to 'tf'")
candidates = ('tensorflow', 'tensorflow-cpu', 'tensorflow-gpu', 'tf-nightly', 'tf-nightly-cpu', 'tf-nightly-gpu',
'intel-tensorflow', 'intel-tensorflow-avx512', 'tensorflow-rocm', 'tensorflow-macos',
)
# For the metadata, we have to look for both tensorflow and tensorflow-cpu
for candidate in candidates:
try:
@@ -106,28 +112,28 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any], llm_fs: FS, extra_d
except importlib.metadata.PackageNotFoundError:
pass # Ok to ignore here since we actually need to check for all possible tensorflow distribution.
else:
if not openllm_core.utils.is_torch_available(): raise ValueError('PyTorch is not available. Make sure to have it locally installed.')
if not openllm_core.utils.is_torch_available():
raise ValueError('PyTorch is not available. Make sure to have it locally installed.')
packages.extend([f'torch>={importlib.metadata.version("torch")}'])
wheels: list[str] = []
built_wheels: list[str | None] = [
build_editable(llm_fs.getsyspath('/'), t.cast(t.Literal['openllm', 'openllm_core', 'openllm_client'], p)) for p in ('openllm_core', 'openllm_client', 'openllm')
build_editable(llm_fs.getsyspath('/'), t.cast(t.Literal['openllm', 'openllm_core', 'openllm_client'], p))
for p in ('openllm_core', 'openllm_client', 'openllm')
]
if all(i for i in built_wheels): wheels.extend([llm_fs.getsyspath(f"/{i.split('/')[-1]}") for i in t.cast(t.List[str], built_wheels)])
return PythonOptions(packages=packages, wheels=wheels, lock_packages=False, extra_index_url=['https://download.pytorch.org/whl/cu118'])
if all(i for i in built_wheels):
wheels.extend([llm_fs.getsyspath(f"/{i.split('/')[-1]}") for i in t.cast(t.List[str], built_wheels)])
return PythonOptions(packages=packages,
wheels=wheels,
lock_packages=False,
extra_index_url=['https://download.pytorch.org/whl/cu118'])
def construct_docker_options(
llm: openllm.LLM[t.Any, t.Any],
_: FS,
workers_per_resource: float,
quantize: LiteralString | None,
bettertransformer: bool | None,
adapter_map: dict[str, str | None] | None,
dockerfile_template: str | None,
runtime: t.Literal['ggml', 'transformers'],
serialisation_format: t.Literal['safetensors', 'legacy'],
container_registry: LiteralContainerRegistry,
container_version_strategy: LiteralContainerVersionStrategy
) -> DockerOptions:
def construct_docker_options(llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_resource: float,
quantize: LiteralString | None, bettertransformer: bool | None,
adapter_map: dict[str, str | None] | None, dockerfile_template: str | None,
runtime: t.Literal['ggml', 'transformers'], serialisation_format: t.Literal['safetensors',
'legacy'],
container_registry: LiteralContainerRegistry,
container_version_strategy: LiteralContainerVersionStrategy) -> DockerOptions:
from openllm.cli._factory import parse_config_options
environ = parse_config_options(llm.config, llm.config['timeout'], workers_per_resource, None, True, os.environ.copy())
env: openllm_core.utils.EnvVarMixin = llm.config['env']
@@ -146,12 +152,18 @@ def construct_docker_options(
if adapter_map: env_dict['BITSANDBYTES_NOWELCOME'] = os.environ.get('BITSANDBYTES_NOWELCOME', '1')
# We need to handle None separately here, as env from subprocess doesn't accept None value.
_env = openllm_core.utils.EnvVarMixin(llm.config['model_name'], bettertransformer=bettertransformer, quantize=quantize, runtime=runtime)
_env = openllm_core.utils.EnvVarMixin(llm.config['model_name'],
bettertransformer=bettertransformer,
quantize=quantize,
runtime=runtime)
env_dict[_env.bettertransformer] = str(_env['bettertransformer_value'])
if _env['quantize_value'] is not None: env_dict[_env.quantize] = t.cast(str, _env['quantize_value'])
env_dict[_env.runtime] = _env['runtime_value']
return DockerOptions(base_image=f'{oci.CONTAINER_NAMES[container_registry]}:{oci.get_base_container_tag(container_version_strategy)}', env=env_dict, dockerfile_template=dockerfile_template)
return DockerOptions(
base_image=f'{oci.CONTAINER_NAMES[container_registry]}:{oci.get_base_container_tag(container_version_strategy)}',
env=env_dict,
dockerfile_template=dockerfile_template)
OPENLLM_MODEL_NAME = '# openllm: model name'
OPENLLM_MODEL_ADAPTER_MAP = '# openllm: model adapter map'
@@ -185,47 +197,58 @@ _service_file = Path(os.path.abspath(__file__)).parent.parent / '_service.py'
def write_service(llm: openllm.LLM[t.Any, t.Any], adapter_map: dict[str, str | None] | None, llm_fs: FS) -> None:
from openllm_core.utils import DEBUG
model_name = llm.config['model_name']
logger.debug('Generating service file for %s at %s (dir=%s)', model_name, llm.config['service_name'], llm_fs.getsyspath('/'))
logger.debug('Generating service file for %s at %s (dir=%s)', model_name, llm.config['service_name'],
llm_fs.getsyspath('/'))
with open(_service_file.__fspath__(), 'r') as f:
src_contents = f.readlines()
for it in src_contents:
if OPENLLM_MODEL_NAME in it: src_contents[src_contents.index(it)] = (ModelNameFormatter(model_name).vformat(it)[:-(len(OPENLLM_MODEL_NAME) + 3)] + '\n')
if OPENLLM_MODEL_NAME in it:
src_contents[src_contents.index(it)] = (
ModelNameFormatter(model_name).vformat(it)[:-(len(OPENLLM_MODEL_NAME) + 3)] + '\n')
elif OPENLLM_MODEL_ADAPTER_MAP in it:
src_contents[src_contents.index(it)] = (ModelAdapterMapFormatter(orjson.dumps(adapter_map).decode()).vformat(it)[:-(len(OPENLLM_MODEL_ADAPTER_MAP) + 3)] + '\n')
src_contents[src_contents.index(it)] = (ModelAdapterMapFormatter(
orjson.dumps(adapter_map).decode()).vformat(it)[:-(len(OPENLLM_MODEL_ADAPTER_MAP) + 3)] + '\n')
script = f"# GENERATED BY 'openllm build {model_name}'. DO NOT EDIT\n\n" + ''.join(src_contents)
if DEBUG: logger.info('Generated script:\n%s', script)
llm_fs.writetext(llm.config['service_name'], script)
@inject
def create_bento(
bento_tag: bentoml.Tag,
llm_fs: FS,
llm: openllm.LLM[t.Any, t.Any],
workers_per_resource: str | float,
quantize: LiteralString | None,
bettertransformer: bool | None,
dockerfile_template: str | None,
adapter_map: dict[str, str | None] | None = None,
extra_dependencies: tuple[str, ...] | None = None,
runtime: t.Literal['ggml', 'transformers'] = 'transformers',
serialisation_format: t.Literal['safetensors', 'legacy'] = 'safetensors',
container_registry: LiteralContainerRegistry = 'ecr',
container_version_strategy: LiteralContainerVersionStrategy = 'release',
_bento_store: BentoStore = Provide[BentoMLContainer.bento_store],
_model_store: ModelStore = Provide[BentoMLContainer.model_store]
) -> bentoml.Bento:
def create_bento(bento_tag: bentoml.Tag,
llm_fs: FS,
llm: openllm.LLM[t.Any, t.Any],
workers_per_resource: str | float,
quantize: LiteralString | None,
bettertransformer: bool | None,
dockerfile_template: str | None,
adapter_map: dict[str, str | None] | None = None,
extra_dependencies: tuple[str, ...] | None = None,
runtime: t.Literal['ggml', 'transformers'] = 'transformers',
serialisation_format: t.Literal['safetensors', 'legacy'] = 'safetensors',
container_registry: LiteralContainerRegistry = 'ecr',
container_version_strategy: LiteralContainerVersionStrategy = 'release',
_bento_store: BentoStore = Provide[BentoMLContainer.bento_store],
_model_store: ModelStore = Provide[BentoMLContainer.model_store]) -> bentoml.Bento:
framework_envvar = llm.config['env']['framework_value']
labels = dict(llm.identifying_params)
labels.update({'_type': llm.llm_type, '_framework': framework_envvar, 'start_name': llm.config['start_name'], 'base_name_or_path': llm.model_id, 'bundler': 'openllm.bundle'})
labels.update({
'_type': llm.llm_type,
'_framework': framework_envvar,
'start_name': llm.config['start_name'],
'base_name_or_path': llm.model_id,
'bundler': 'openllm.bundle'
})
if adapter_map: labels.update(adapter_map)
if isinstance(workers_per_resource, str):
if workers_per_resource == 'round_robin': workers_per_resource = 1.0
elif workers_per_resource == 'conserved': workers_per_resource = 1.0 if openllm_core.utils.device_count() == 0 else float(1 / openllm_core.utils.device_count())
elif workers_per_resource == 'conserved':
workers_per_resource = 1.0 if openllm_core.utils.device_count() == 0 else float(1 /
openllm_core.utils.device_count())
else:
try:
workers_per_resource = float(workers_per_resource)
except ValueError:
raise ValueError("'workers_per_resource' only accept ['round_robin', 'conserved'] as possible strategies.") from None
raise ValueError(
"'workers_per_resource' only accept ['round_robin', 'conserved'] as possible strategies.") from None
elif isinstance(workers_per_resource, int):
workers_per_resource = float(workers_per_resource)
logger.info("Building Bento for '%s'", llm.config['start_name'])
@@ -233,19 +256,18 @@ def create_bento(
write_service(llm, adapter_map, llm_fs)
llm_spec = ModelSpec.from_item({'tag': str(llm.tag), 'alias': llm.tag.name})
build_config = BentoBuildConfig(
service=f"{llm.config['service_name']}:svc",
name=bento_tag.name,
labels=labels,
description=f"OpenLLM service for {llm.config['start_name']}",
include=list(llm_fs.walk.files()),
exclude=['/venv', '/.venv', '__pycache__/', '*.py[cod]', '*$py.class'],
python=construct_python_options(llm, llm_fs, extra_dependencies, adapter_map),
models=[llm_spec],
docker=construct_docker_options(
llm, llm_fs, workers_per_resource, quantize, bettertransformer, adapter_map, dockerfile_template, runtime, serialisation_format, container_registry, container_version_strategy
)
)
build_config = BentoBuildConfig(service=f"{llm.config['service_name']}:svc",
name=bento_tag.name,
labels=labels,
description=f"OpenLLM service for {llm.config['start_name']}",
include=list(llm_fs.walk.files()),
exclude=['/venv', '/.venv', '__pycache__/', '*.py[cod]', '*$py.class'],
python=construct_python_options(llm, llm_fs, extra_dependencies, adapter_map),
models=[llm_spec],
docker=construct_docker_options(llm, llm_fs, workers_per_resource, quantize,
bettertransformer, adapter_map, dockerfile_template,
runtime, serialisation_format, container_registry,
container_version_strategy))
bento = bentoml.Bento.create(build_config=build_config, version=bento_tag.version, build_ctx=llm_fs.getsyspath('/'))
# NOTE: the model_id_path here are only used for setting this environment variable within the container built with for BentoLLM.
@@ -261,6 +283,7 @@ def create_bento(
if openllm_core.utils.DEBUG: logger.info('Generated script:\n%s', script)
bento._fs.writetext(service_fs_path, script)
if 'model_store' in inspect.signature(bento.save).parameters: return bento.save(bento_store=_bento_store, model_store=_model_store)
if 'model_store' in inspect.signature(bento.save).parameters:
return bento.save(bento_store=_bento_store, model_store=_model_store)
# backward arguments. `model_store` is added recently
return bento.save(bento_store=_bento_store)

View File

@@ -42,7 +42,11 @@ ROOT_DIR = pathlib.Path(os.path.abspath('__file__')).parent.parent.parent
# but in the future, we can infer based on git repo and everything to make it more options for users
# to build the base image. For now, all of the base image will be <registry>/bentoml/openllm:...
# NOTE: The ECR registry is the public one and currently only @bentoml team has access to push it.
_CONTAINER_REGISTRY: dict[LiteralContainerRegistry, str] = {'docker': 'docker.io/bentoml/openllm', 'gh': 'ghcr.io/bentoml/openllm', 'ecr': 'public.ecr.aws/y5w8i4y6/bentoml/openllm'}
_CONTAINER_REGISTRY: dict[LiteralContainerRegistry, str] = {
'docker': 'docker.io/bentoml/openllm',
'gh': 'ghcr.io/bentoml/openllm',
'ecr': 'public.ecr.aws/y5w8i4y6/bentoml/openllm'
}
# TODO: support custom fork. Currently it only support openllm main.
_OWNER = 'bentoml'
@@ -64,7 +68,8 @@ def _commit_time_range(r: int = 5) -> str:
class VersionNotSupported(openllm.exceptions.OpenLLMException):
"""Raised when the stable release is too low that it doesn't include OpenLLM base container."""
_RefTuple: type[RefTuple] = openllm_core.utils.codegen.make_attr_tuple_class('_RefTuple', ['git_hash', 'version', 'strategy'])
_RefTuple: type[RefTuple] = openllm_core.utils.codegen.make_attr_tuple_class('_RefTuple',
['git_hash', 'version', 'strategy'])
def nightly_resolver(cls: type[RefResolver]) -> str:
# NOTE: all openllm container will have sha-<git_hash[:7]>
@@ -78,7 +83,11 @@ def nightly_resolver(cls: type[RefResolver]) -> str:
commits = t.cast('list[dict[str, t.Any]]', cls._ghapi.repos.list_commits(since=_commit_time_range()))
return next(f'sha-{it["sha"][:7]}' for it in commits if '[skip ci]' not in it['commit']['message'])
# now is the correct behaviour
return orjson.loads(subprocess.check_output([docker_bin, 'run', '--rm', '-it', 'quay.io/skopeo/stable:latest', 'list-tags', 'docker://ghcr.io/bentoml/openllm']).decode().strip())['Tags'][-2]
return orjson.loads(
subprocess.check_output([
docker_bin, 'run', '--rm', '-it', 'quay.io/skopeo/stable:latest', 'list-tags',
'docker://ghcr.io/bentoml/openllm'
]).decode().strip())['Tags'][-2]
@attr.attrs(eq=False, order=False, slots=True, frozen=True)
class RefResolver:
@@ -98,16 +107,20 @@ class RefResolver:
# NOTE: This strategy will only support openllm>0.2.12
meta: dict[str, t.Any] = cls._ghapi.repos.get_latest_release()
version_str = meta['name'].lstrip('v')
version: tuple[str, str | None] = (cls._ghapi.git.get_ref(ref=f"tags/{meta['name']}")['object']['sha'], version_str)
version: tuple[str,
str | None] = (cls._ghapi.git.get_ref(ref=f"tags/{meta['name']}")['object']['sha'], version_str)
else:
version = ('', version_str)
if openllm_core.utils.VersionInfo.from_version_string(t.cast(str, version_str)) < (0, 2, 12):
raise VersionNotSupported(f"Version {version_str} doesn't support OpenLLM base container. Consider using 'nightly' or upgrade 'openllm>=0.2.12'")
raise VersionNotSupported(
f"Version {version_str} doesn't support OpenLLM base container. Consider using 'nightly' or upgrade 'openllm>=0.2.12'"
)
return _RefTuple((*version, 'release' if _use_base_strategy else 'custom'))
@classmethod
@functools.lru_cache(maxsize=64)
def from_strategy(cls, strategy_or_version: t.Literal['release', 'nightly'] | LiteralString | None = None) -> RefResolver:
def from_strategy(cls,
strategy_or_version: t.Literal['release', 'nightly'] | LiteralString | None = None) -> RefResolver:
# using default strategy
if strategy_or_version is None or strategy_or_version == 'release': return cls(*cls._release_ref())
elif strategy_or_version == 'latest': return cls('latest', '0.0.0', 'latest')
@@ -115,7 +128,8 @@ class RefResolver:
_ref = cls._nightly_ref()
return cls(_ref[0], '0.0.0', _ref[-1])
else:
logger.warning('Using custom %s. Make sure that it is at lease 0.2.12 for base container support.', strategy_or_version)
logger.warning('Using custom %s. Make sure that it is at lease 0.2.12 for base container support.',
strategy_or_version)
return cls(*cls._release_ref(version_str=strategy_or_version))
@property
@@ -129,21 +143,27 @@ class RefResolver:
def get_base_container_tag(strategy: LiteralContainerVersionStrategy | None = None) -> str:
return RefResolver.from_strategy(strategy).tag
def build_container(
registries: LiteralContainerRegistry | t.Sequence[LiteralContainerRegistry] | None = None,
version_strategy: LiteralContainerVersionStrategy = 'release',
push: bool = False,
machine: bool = False
) -> dict[str | LiteralContainerRegistry, str]:
def build_container(registries: LiteralContainerRegistry | t.Sequence[LiteralContainerRegistry] | None = None,
version_strategy: LiteralContainerVersionStrategy = 'release',
push: bool = False,
machine: bool = False) -> dict[str | LiteralContainerRegistry, str]:
try:
if not _BUILDER.health(): raise openllm.exceptions.Error
except (openllm.exceptions.Error, subprocess.CalledProcessError):
raise RuntimeError('Building base container requires BuildKit (via Buildx) to be installed. See https://docs.docker.com/build/buildx/install/ for instalation instruction.') from None
if openllm_core.utils.device_count() == 0: raise RuntimeError('Building base container requires GPUs (None available)')
if not shutil.which('nvidia-container-runtime'): raise RuntimeError('NVIDIA Container Toolkit is required to compile CUDA kernel in container.')
if not _module_location: raise RuntimeError("Failed to determine source location of 'openllm'. (Possible broken installation)")
raise RuntimeError(
'Building base container requires BuildKit (via Buildx) to be installed. See https://docs.docker.com/build/buildx/install/ for instalation instruction.'
) from None
if openllm_core.utils.device_count() == 0:
raise RuntimeError('Building base container requires GPUs (None available)')
if not shutil.which('nvidia-container-runtime'):
raise RuntimeError('NVIDIA Container Toolkit is required to compile CUDA kernel in container.')
if not _module_location:
raise RuntimeError("Failed to determine source location of 'openllm'. (Possible broken installation)")
pyproject_path = pathlib.Path(_module_location).parent.parent / 'pyproject.toml'
if not pyproject_path.exists(): raise ValueError("This utility can only be run within OpenLLM git repository. Clone it first with 'git clone https://github.com/bentoml/OpenLLM.git'")
if not pyproject_path.exists():
raise ValueError(
"This utility can only be run within OpenLLM git repository. Clone it first with 'git clone https://github.com/bentoml/OpenLLM.git'"
)
if not registries:
tags: dict[str | LiteralContainerRegistry, str] = {
alias: f'{value}:{get_base_container_tag(version_strategy)}' for alias, value in _CONTAINER_REGISTRY.items()
@@ -152,24 +172,27 @@ def build_container(
registries = [registries] if isinstance(registries, str) else list(registries)
tags = {name: f'{_CONTAINER_REGISTRY[name]}:{get_base_container_tag(version_strategy)}' for name in registries}
try:
outputs = _BUILDER.build(
file=pathlib.Path(__file__).parent.joinpath('Dockerfile').resolve().__fspath__(),
context_path=pyproject_path.parent.__fspath__(),
tag=tuple(tags.values()),
push=push,
progress='plain' if openllm_core.utils.get_debug_mode() else 'auto',
quiet=machine
)
outputs = _BUILDER.build(file=pathlib.Path(__file__).parent.joinpath('Dockerfile').resolve().__fspath__(),
context_path=pyproject_path.parent.__fspath__(),
tag=tuple(tags.values()),
push=push,
progress='plain' if openllm_core.utils.get_debug_mode() else 'auto',
quiet=machine)
if machine and outputs is not None: tags['image_sha'] = outputs.decode('utf-8').strip()
except Exception as err:
raise openllm.exceptions.OpenLLMException(f'Failed to containerize base container images (Scroll up to see error above, or set OPENLLMDEVDEBUG=True for more traceback):\n{err}') from err
raise openllm.exceptions.OpenLLMException(
f'Failed to containerize base container images (Scroll up to see error above, or set OPENLLMDEVDEBUG=True for more traceback):\n{err}'
) from err
return tags
if t.TYPE_CHECKING:
CONTAINER_NAMES: dict[LiteralContainerRegistry, str]
supported_registries: list[str]
__all__ = ['CONTAINER_NAMES', 'get_base_container_tag', 'build_container', 'get_base_container_name', 'supported_registries', 'RefResolver']
__all__ = [
'CONTAINER_NAMES', 'get_base_container_tag', 'build_container', 'get_base_container_name', 'supported_registries',
'RefResolver'
]
def __dir__() -> list[str]:
return sorted(__all__)

View File

@@ -40,27 +40,46 @@ _AnyCallable = t.Callable[..., t.Any]
FC = t.TypeVar('FC', bound=t.Union[_AnyCallable, click.Command])
def bento_complete_envvar(ctx: click.Context, param: click.Parameter, incomplete: str) -> list[sc.CompletionItem]:
return [sc.CompletionItem(str(it.tag), help='Bento') for it in bentoml.list() if str(it.tag).startswith(incomplete) and all(k in it.info.labels for k in {'start_name', 'bundler'})]
return [
sc.CompletionItem(str(it.tag), help='Bento')
for it in bentoml.list()
if str(it.tag).startswith(incomplete) and all(k in it.info.labels for k in {'start_name', 'bundler'})
]
def model_complete_envvar(ctx: click.Context, param: click.Parameter, incomplete: str) -> list[sc.CompletionItem]:
return [sc.CompletionItem(inflection.dasherize(it), help='Model') for it in openllm.CONFIG_MAPPING if it.startswith(incomplete)]
return [
sc.CompletionItem(inflection.dasherize(it), help='Model')
for it in openllm.CONFIG_MAPPING
if it.startswith(incomplete)
]
def parse_config_options(config: LLMConfig, server_timeout: int, workers_per_resource: float, device: t.Tuple[str, ...] | None, cors: bool, environ: DictStrAny) -> DictStrAny:
def parse_config_options(config: LLMConfig, server_timeout: int, workers_per_resource: float,
device: t.Tuple[str, ...] | None, cors: bool, environ: DictStrAny) -> DictStrAny:
# TODO: Support amd.com/gpu on k8s
_bentoml_config_options_env = environ.pop('BENTOML_CONFIG_OPTIONS', '')
_bentoml_config_options_opts = [
'tracing.sample_rate=1.0',
f'api_server.traffic.timeout={server_timeout}',
'tracing.sample_rate=1.0', f'api_server.traffic.timeout={server_timeout}',
f'runners."llm-{config["start_name"]}-runner".traffic.timeout={config["timeout"]}',
f'runners."llm-{config["start_name"]}-runner".workers_per_resource={workers_per_resource}'
]
if device:
if len(device) > 1: _bentoml_config_options_opts.extend([f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"[{idx}]={dev}' for idx, dev in enumerate(device)])
else: _bentoml_config_options_opts.append(f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"=[{device[0]}]')
_bentoml_config_options_opts.append(f'runners."llm-generic-embedding".resources.cpu={openllm.get_resource({"cpu":"system"},"cpu")}')
if len(device) > 1:
_bentoml_config_options_opts.extend([
f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"[{idx}]={dev}'
for idx, dev in enumerate(device)
])
else:
_bentoml_config_options_opts.append(
f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"=[{device[0]}]')
_bentoml_config_options_opts.append(
f'runners."llm-generic-embedding".resources.cpu={openllm.get_resource({"cpu":"system"},"cpu")}')
if cors:
_bentoml_config_options_opts.extend(['api_server.http.cors.enabled=true', 'api_server.http.cors.access_control_allow_origins="*"'])
_bentoml_config_options_opts.extend([f'api_server.http.cors.access_control_allow_methods[{idx}]="{it}"' for idx, it in enumerate(['GET', 'OPTIONS', 'POST', 'HEAD', 'PUT'])])
_bentoml_config_options_opts.extend(
['api_server.http.cors.enabled=true', 'api_server.http.cors.access_control_allow_origins="*"'])
_bentoml_config_options_opts.extend([
f'api_server.http.cors.access_control_allow_methods[{idx}]="{it}"'
for idx, it in enumerate(['GET', 'OPTIONS', 'POST', 'HEAD', 'PUT'])
])
_bentoml_config_options_env += ' ' if _bentoml_config_options_env else '' + ' '.join(_bentoml_config_options_opts)
environ['BENTOML_CONFIG_OPTIONS'] = _bentoml_config_options_env
if DEBUG: logger.debug('Setting BENTOML_CONFIG_OPTIONS=%s', _bentoml_config_options_env)
@@ -82,7 +101,10 @@ def _id_callback(ctx: click.Context, _: click.Parameter, value: t.Tuple[str, ...
ctx.params[_adapter_mapping_key][adapter_id] = adapter_name[0] if len(adapter_name) > 0 else None
return None
def start_command_factory(group: click.Group, model: str, _context_settings: DictStrAny | None = None, _serve_grpc: bool = False) -> click.Command:
def start_command_factory(group: click.Group,
model: str,
_context_settings: DictStrAny | None = None,
_serve_grpc: bool = False) -> click.Command:
llm_config = openllm.AutoConfig.for_model(model)
command_attrs: DictStrAny = dict(
name=llm_config['model_name'],
@@ -113,37 +135,29 @@ Available official model_id(s): [default: {llm_config['default_id']}]
if llm_config['requires_gpu'] and openllm.utils.device_count() < 1:
# NOTE: The model requires GPU, therefore we will return a dummy command
command_attrs.update({
'short_help': '(Disabled because there is no GPU available)', 'help': f'{model} is currently not available to run on your local machine because it requires GPU for inference.'
'short_help':
'(Disabled because there is no GPU available)',
'help':
f'{model} is currently not available to run on your local machine because it requires GPU for inference.'
})
return noop_command(group, llm_config, _serve_grpc, **command_attrs)
@group.command(**command_attrs)
@start_decorator(llm_config, serve_grpc=_serve_grpc)
@click.pass_context
def start_cmd(
ctx: click.Context,
/,
server_timeout: int,
model_id: str | None,
model_version: str | None,
workers_per_resource: t.Literal['conserved', 'round_robin'] | LiteralString,
device: t.Tuple[str, ...],
quantize: t.Literal['int8', 'int4', 'gptq'] | None,
bettertransformer: bool | None,
runtime: t.Literal['ggml', 'transformers'],
fast: bool,
serialisation_format: t.Literal['safetensors', 'legacy'],
cors: bool,
adapter_id: str | None,
return_process: bool,
**attrs: t.Any,
) -> LLMConfig | subprocess.Popen[bytes]:
def start_cmd(ctx: click.Context, /, server_timeout: int, model_id: str | None, model_version: str | None,
workers_per_resource: t.Literal['conserved', 'round_robin'] | LiteralString, device: t.Tuple[str, ...],
quantize: t.Literal['int8', 'int4', 'gptq'] | None, bettertransformer: bool | None,
runtime: t.Literal['ggml', 'transformers'], fast: bool, serialisation_format: t.Literal['safetensors',
'legacy'],
cors: bool, adapter_id: str | None, return_process: bool, **attrs: t.Any,
) -> LLMConfig | subprocess.Popen[bytes]:
fast = str(fast).upper() in openllm.utils.ENV_VARS_TRUE_VALUES
if serialisation_format == 'safetensors' and quantize is not None and os.environ.get('OPENLLM_SERIALIZATION_WARNING', str(True)).upper() in openllm.utils.ENV_VARS_TRUE_VALUES:
if serialisation_format == 'safetensors' and quantize is not None and os.environ.get(
'OPENLLM_SERIALIZATION_WARNING', str(True)).upper() in openllm.utils.ENV_VARS_TRUE_VALUES:
termui.echo(
f"'--quantize={quantize}' might not work with 'safetensors' serialisation format. Use with caution!. To silence this warning, set \"OPENLLM_SERIALIZATION_WARNING=False\"\nNote: You can always fallback to '--serialisation legacy' when running quantisation.",
fg='yellow'
)
fg='yellow')
adapter_map: dict[str, str | None] | None = attrs.pop(_adapter_mapping_key, None)
config, server_attrs = llm_config.model_validate_click(**attrs)
server_timeout = openllm.utils.first_not_none(server_timeout, default=config['timeout'])
@@ -169,16 +183,21 @@ Available official model_id(s): [default: {llm_config['default_id']}]
wpr = float(wpr)
# Create a new model env to work with the envvar during CLI invocation
env = openllm.utils.EnvVarMixin(
config['model_name'], config.default_implementation(), model_id=model_id or config['default_id'], bettertransformer=bettertransformer, quantize=quantize, runtime=runtime
)
env = openllm.utils.EnvVarMixin(config['model_name'],
config.default_implementation(),
model_id=model_id or config['default_id'],
bettertransformer=bettertransformer,
quantize=quantize,
runtime=runtime)
prerequisite_check(ctx, config, quantize, adapter_map, int(1 / wpr))
# NOTE: This is to set current configuration
start_env = os.environ.copy()
start_env = parse_config_options(config, server_timeout, wpr, device, cors, start_env)
if fast:
termui.echo(f"Fast mode is enabled. Make sure the model is available in local store before 'start': 'openllm import {model}{' --model-id ' + model_id if model_id else ''}'", fg='yellow')
termui.echo(
f"Fast mode is enabled. Make sure the model is available in local store before 'start': 'openllm import {model}{' --model-id ' + model_id if model_id else ''}'",
fg='yellow')
start_env.update({
'OPENLLM_MODEL': model,
@@ -194,18 +213,28 @@ Available official model_id(s): [default: {llm_config['default_id']}]
if bettertransformer is not None: start_env[env.bettertransformer] = str(env['bettertransformer_value'])
if quantize is not None: start_env[env.quantize] = str(t.cast(str, env['quantize_value']))
llm = openllm.utils.infer_auto_class(env['framework_value']).for_model(
model, model_id=start_env[env.model_id], model_version=model_version, llm_config=config, ensure_available=not fast, adapter_map=adapter_map, serialisation=serialisation_format
)
llm = openllm.utils.infer_auto_class(env['framework_value']).for_model(model,
model_id=start_env[env.model_id],
model_version=model_version,
llm_config=config,
ensure_available=not fast,
adapter_map=adapter_map,
serialisation=serialisation_format)
start_env.update({env.config: llm.config.model_dump_json().decode()})
server = bentoml.GrpcServer('_service:svc', **server_attrs) if _serve_grpc else bentoml.HTTPServer('_service:svc', **server_attrs)
server = bentoml.GrpcServer('_service:svc', **server_attrs) if _serve_grpc else bentoml.HTTPServer(
'_service:svc', **server_attrs)
openllm.utils.analytics.track_start_init(llm.config)
def next_step(model_name: str, adapter_map: DictStrAny | None) -> None:
cmd_name = f'openllm build {model_name}'
if adapter_map is not None: cmd_name += ' ' + ' '.join([f'--adapter-id {s}' for s in [f'{p}:{name}' if name not in (None, 'default') else p for p, name in adapter_map.items()]])
if not openllm.utils.get_quiet_mode(): termui.echo(f"\n🚀 Next step: run '{cmd_name}' to create a Bento for {model_name}", fg='blue')
if adapter_map is not None:
cmd_name += ' ' + ' '.join([
f'--adapter-id {s}'
for s in [f'{p}:{name}' if name not in (None, 'default') else p for p, name in adapter_map.items()]
])
if not openllm.utils.get_quiet_mode():
termui.echo(f"\n🚀 Next step: run '{cmd_name}' to create a Bento for {model_name}", fg='blue')
if return_process:
server.start(env=start_env, text=True)
@@ -239,30 +268,35 @@ def noop_command(group: click.Group, llm_config: LLMConfig, _serve_grpc: bool, *
return noop
def prerequisite_check(ctx: click.Context, llm_config: LLMConfig, quantize: LiteralString | None, adapter_map: dict[str, str | None] | None, num_workers: int) -> None:
if adapter_map and not openllm.utils.is_peft_available(): ctx.fail("Using adapter requires 'peft' to be available. Make sure to install with 'pip install \"openllm[fine-tune]\"'")
def prerequisite_check(ctx: click.Context, llm_config: LLMConfig, quantize: LiteralString | None,
adapter_map: dict[str, str | None] | None, num_workers: int) -> None:
if adapter_map and not openllm.utils.is_peft_available():
ctx.fail(
"Using adapter requires 'peft' to be available. Make sure to install with 'pip install \"openllm[fine-tune]\"'")
if quantize and llm_config.default_implementation() == 'vllm':
ctx.fail(f"Quantization is not yet supported with vLLM. Set '{llm_config['env']['framework']}=\"pt\"' to run with quantization.")
ctx.fail(
f"Quantization is not yet supported with vLLM. Set '{llm_config['env']['framework']}=\"pt\"' to run with quantization."
)
requirements = llm_config['requirements']
if requirements is not None and len(requirements) > 0:
missing_requirements = [i for i in requirements if importlib.util.find_spec(inflection.underscore(i)) is None]
if len(missing_requirements) > 0: termui.echo(f'Make sure to have the following dependencies available: {missing_requirements}', fg='yellow')
if len(missing_requirements) > 0:
termui.echo(f'Make sure to have the following dependencies available: {missing_requirements}', fg='yellow')
def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callable[[FC], t.Callable[[FC], FC]]:
def wrapper(fn: FC) -> t.Callable[[FC], FC]:
composed = openllm.utils.compose(
llm_config.to_click_options,
_http_server_args if not serve_grpc else _grpc_server_args,
cog.optgroup.group('General LLM Options', help=f"The following options are related to running '{llm_config['start_name']}' LLM Server."),
model_id_option(factory=cog.optgroup, model_env=llm_config['env']),
model_version_option(factory=cog.optgroup),
cog.optgroup.option('--server-timeout', type=int, default=None, help='Server timeout in seconds'),
workers_per_resource_option(factory=cog.optgroup),
cors_option(factory=cog.optgroup),
fast_option(factory=cog.optgroup),
llm_config.to_click_options, _http_server_args if not serve_grpc else _grpc_server_args,
cog.optgroup.group(
'LLM Optimization Options',
help='''Optimization related options.
'General LLM Options',
help=f"The following options are related to running '{llm_config['start_name']}' LLM Server."),
model_id_option(factory=cog.optgroup, model_env=llm_config['env']), model_version_option(factory=cog.optgroup),
cog.optgroup.option('--server-timeout', type=int, default=None, help='Server timeout in seconds'),
workers_per_resource_option(factory=cog.optgroup), cors_option(factory=cog.optgroup),
fast_option(factory=cog.optgroup),
cog.optgroup.group('LLM Optimization Options',
help='''Optimization related options.
OpenLLM supports running model with [BetterTransformer](https://pytorch.org/blog/a-better-transformer-for-fast-transformer-encoder-inference/),
k-bit quantization (8-bit, 4-bit), GPTQ quantization, PagedAttention via vLLM.
@@ -272,23 +306,23 @@ def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callab
- DeepSpeed Inference: [link](https://www.deepspeed.ai/inference/)
- GGML: Fast inference on [bare metal](https://github.com/ggerganov/ggml)
''',
),
cog.optgroup.option(
'--device',
type=openllm.utils.dantic.CUDA,
multiple=True,
envvar='CUDA_VISIBLE_DEVICES',
callback=parse_device_callback,
help=f"Assign GPU devices (if available) for {llm_config['model_name']}.",
show_envvar=True
),
cog.optgroup.option('--runtime', type=click.Choice(['ggml', 'transformers']), default='transformers', help='The runtime to use for the given model. Default is transformers.'),
),
cog.optgroup.option('--device',
type=openllm.utils.dantic.CUDA,
multiple=True,
envvar='CUDA_VISIBLE_DEVICES',
callback=parse_device_callback,
help=f"Assign GPU devices (if available) for {llm_config['model_name']}.",
show_envvar=True),
cog.optgroup.option('--runtime',
type=click.Choice(['ggml', 'transformers']),
default='transformers',
help='The runtime to use for the given model. Default is transformers.'),
quantize_option(factory=cog.optgroup, model_env=llm_config['env']),
bettertransformer_option(factory=cog.optgroup, model_env=llm_config['env']),
serialisation_option(factory=cog.optgroup),
cog.optgroup.group(
'Fine-tuning related options',
help='''\
cog.optgroup.group('Fine-tuning related options',
help='''\
Note that the argument `--adapter-id` can accept the following format:
- `--adapter-id /path/to/adapter` (local adapter)
@@ -302,23 +336,22 @@ def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callab
$ openllm start opt --adapter-id /path/to/adapter_dir --adapter-id remote/adapter:eng_lora
```
'''
),
cog.optgroup.option(
'--adapter-id',
default=None,
help='Optional name or path for given LoRA adapter' + f" to wrap '{llm_config['model_name']}'",
multiple=True,
callback=_id_callback,
metavar='[PATH | [remote/][adapter_name:]adapter_id][, ...]'
),
'''),
cog.optgroup.option('--adapter-id',
default=None,
help='Optional name or path for given LoRA adapter' +
f" to wrap '{llm_config['model_name']}'",
multiple=True,
callback=_id_callback,
metavar='[PATH | [remote/][adapter_name:]adapter_id][, ...]'),
click.option('--return-process', is_flag=True, default=False, help='Internal use only.', hidden=True),
)
return composed(fn)
return wrapper
def parse_device_callback(ctx: click.Context, param: click.Parameter, value: tuple[tuple[str], ...] | None) -> t.Tuple[str, ...] | None:
def parse_device_callback(ctx: click.Context, param: click.Parameter,
value: tuple[tuple[str], ...] | None) -> t.Tuple[str, ...] | None:
if value is None: return value
if not isinstance(value, tuple): ctx.fail(f'{param} only accept multiple values, not {type(value)} (value: {value})')
el: t.Tuple[str, ...] = tuple(i for k in value for i in k)
@@ -337,14 +370,18 @@ def parse_serve_args(serve_grpc: bool) -> t.Callable[[t.Callable[..., LLMConfig]
command = 'serve' if not serve_grpc else 'serve-grpc'
group = cog.optgroup.group(
f"Start a {'HTTP' if not serve_grpc else 'gRPC'} server options", help=f"Related to serving the model [synonymous to `bentoml {'serve-http' if not serve_grpc else command }`]",
f"Start a {'HTTP' if not serve_grpc else 'gRPC'} server options",
help=f"Related to serving the model [synonymous to `bentoml {'serve-http' if not serve_grpc else command }`]",
)
def decorator(f: t.Callable[Concatenate[int, t.Optional[str], P], LLMConfig]) -> t.Callable[[FC], FC]:
serve_command = cli.commands[command]
# The first variable is the argument bento
# The last five is from BentoMLCommandGroup.NUMBER_OF_COMMON_PARAMS
serve_options = [p for p in serve_command.params[1:-BentoMLCommandGroup.NUMBER_OF_COMMON_PARAMS] if p.name not in _IGNORED_OPTIONS]
serve_options = [
p for p in serve_command.params[1:-BentoMLCommandGroup.NUMBER_OF_COMMON_PARAMS]
if p.name not in _IGNORED_OPTIONS
]
for options in reversed(serve_options):
attrs = options.to_info_dict()
# we don't need param_type_name, since it should all be options
@@ -381,73 +418,90 @@ def _click_factory_type(*param_decls: t.Any, **attrs: t.Any) -> t.Callable[[FC |
cli_option = functools.partial(_click_factory_type, attr='option')
cli_argument = functools.partial(_click_factory_type, attr='argument')
def output_option(f: _AnyCallable | None = None, *, default_value: LiteralOutput = 'pretty', **attrs: t.Any) -> t.Callable[[FC], FC]:
def output_option(f: _AnyCallable | None = None,
*,
default_value: LiteralOutput = 'pretty',
**attrs: t.Any) -> t.Callable[[FC], FC]:
output = ['json', 'pretty', 'porcelain']
def complete_output_var(ctx: click.Context, param: click.Parameter, incomplete: str) -> list[CompletionItem]:
return [CompletionItem(it) for it in output]
return cli_option(
'-o',
'--output',
'output',
type=click.Choice(output),
default=default_value,
help='Showing output type.',
show_default=True,
envvar='OPENLLM_OUTPUT',
show_envvar=True,
shell_complete=complete_output_var,
**attrs
)(f)
return cli_option('-o',
'--output',
'output',
type=click.Choice(output),
default=default_value,
help='Showing output type.',
show_default=True,
envvar='OPENLLM_OUTPUT',
show_envvar=True,
shell_complete=complete_output_var,
**attrs)(f)
def fast_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
return cli_option(
'--fast/--no-fast',
show_default=True,
default=False,
envvar='OPENLLM_USE_LOCAL_LATEST',
show_envvar=True,
help='''Whether to skip checking if models is already in store.
return cli_option('--fast/--no-fast',
show_default=True,
default=False,
envvar='OPENLLM_USE_LOCAL_LATEST',
show_envvar=True,
help='''Whether to skip checking if models is already in store.
This is useful if you already downloaded or setup the model beforehand.
''',
**attrs
)(f)
**attrs)(f)
def cors_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
return cli_option('--cors/--no-cors', show_default=True, default=False, envvar='OPENLLM_CORS', show_envvar=True, help='Enable CORS for the server.', **attrs)(f)
return cli_option('--cors/--no-cors',
show_default=True,
default=False,
envvar='OPENLLM_CORS',
show_envvar=True,
help='Enable CORS for the server.',
**attrs)(f)
def machine_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
return cli_option('--machine', is_flag=True, default=False, hidden=True, **attrs)(f)
def model_id_option(f: _AnyCallable | None = None, *, model_env: openllm.utils.EnvVarMixin | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
return cli_option(
'--model-id',
type=click.STRING,
default=None,
envvar=model_env.model_id if model_env is not None else None,
show_envvar=model_env is not None,
help='Optional model_id name or path for (fine-tune) weight.',
**attrs
)(f)
def model_id_option(f: _AnyCallable | None = None,
*,
model_env: openllm.utils.EnvVarMixin | None = None,
**attrs: t.Any) -> t.Callable[[FC], FC]:
return cli_option('--model-id',
type=click.STRING,
default=None,
envvar=model_env.model_id if model_env is not None else None,
show_envvar=model_env is not None,
help='Optional model_id name or path for (fine-tune) weight.',
**attrs)(f)
def model_version_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
return cli_option('--model-version', type=click.STRING, default=None, help='Optional model version to save for this model. It will be inferred automatically from model-id.', **attrs)(f)
return cli_option(
'--model-version',
type=click.STRING,
default=None,
help='Optional model version to save for this model. It will be inferred automatically from model-id.',
**attrs)(f)
def model_name_argument(f: _AnyCallable | None = None, required: bool = True, **attrs: t.Any) -> t.Callable[[FC], FC]:
return cli_argument('model_name', type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING]), required=required, **attrs)(f)
return cli_argument('model_name',
type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING]),
required=required,
**attrs)(f)
def quantize_option(f: _AnyCallable | None = None, *, build: bool = False, model_env: openllm.utils.EnvVarMixin | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
return cli_option(
'--quantise',
'--quantize',
'quantize',
type=click.Choice(['int8', 'int4', 'gptq']),
default=None,
envvar=model_env.quantize if model_env is not None else None,
show_envvar=model_env is not None,
help='''Dynamic quantization for running this LLM.
def quantize_option(f: _AnyCallable | None = None,
*,
build: bool = False,
model_env: openllm.utils.EnvVarMixin | None = None,
**attrs: t.Any) -> t.Callable[[FC], FC]:
return cli_option('--quantise',
'--quantize',
'quantize',
type=click.Choice(['int8', 'int4', 'gptq']),
default=None,
envvar=model_env.quantize if model_env is not None else None,
show_envvar=model_env is not None,
help='''Dynamic quantization for running this LLM.
The following quantization strategies are supported:
@@ -461,17 +515,18 @@ def quantize_option(f: _AnyCallable | None = None, *, build: bool = False, model
''' + ('''
> [!NOTE] that this will set the mode for serving within deployment.''' if build else '') + '''
> [!NOTE] that quantization are currently only available in *PyTorch* models.''',
**attrs
)(f)
**attrs)(f)
def workers_per_resource_option(f: _AnyCallable | None = None, *, build: bool = False, **attrs: t.Any) -> t.Callable[[FC], FC]:
return cli_option(
'--workers-per-resource',
default=None,
callback=workers_per_resource_callback,
type=str,
required=False,
help='''Number of workers per resource assigned.
def workers_per_resource_option(f: _AnyCallable | None = None,
*,
build: bool = False,
**attrs: t.Any) -> t.Callable[[FC], FC]:
return cli_option('--workers-per-resource',
default=None,
callback=workers_per_resource_callback,
type=str,
required=False,
help='''Number of workers per resource assigned.
See https://docs.bentoml.org/en/latest/guides/scheduling.html#resource-scheduling-strategy
for more information. By default, this is set to 1.
@@ -481,38 +536,37 @@ def workers_per_resource_option(f: _AnyCallable | None = None, *, build: bool =
- ``round_robin``: Similar behaviour when setting ``--workers-per-resource 1``. This is useful for smaller models.
- ``conserved``: This will determine the number of available GPU resources, and only assign one worker for the LLMRunner. For example, if ther are 4 GPUs available, then ``conserved`` is equivalent to ``--workers-per-resource 0.25``.
''' + (
"""\n
''' + ("""\n
> [!NOTE] The workers value passed into 'build' will determine how the LLM can
> be provisioned in Kubernetes as well as in standalone container. This will
> ensure it has the same effect with 'openllm start --api-workers ...'""" if build else ''
),
**attrs
)(f)
> ensure it has the same effect with 'openllm start --api-workers ...'""" if build else ''),
**attrs)(f)
def bettertransformer_option(f: _AnyCallable | None = None, *, build: bool = False, model_env: openllm.utils.EnvVarMixin | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
def bettertransformer_option(f: _AnyCallable | None = None,
*,
build: bool = False,
model_env: openllm.utils.EnvVarMixin | None = None,
**attrs: t.Any) -> t.Callable[[FC], FC]:
return cli_option(
'--bettertransformer',
is_flag=True,
default=None,
envvar=model_env.bettertransformer if model_env is not None else None,
show_envvar=model_env is not None,
help='Apply FasterTransformer wrapper to serve model. This will applies during serving time.'
if not build else 'Set default environment variable whether to serve this model with FasterTransformer in build time.',
**attrs
)(f)
help='Apply FasterTransformer wrapper to serve model. This will applies during serving time.' if not build else
'Set default environment variable whether to serve this model with FasterTransformer in build time.',
**attrs)(f)
def serialisation_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
return cli_option(
'--serialisation',
'--serialization',
'serialisation_format',
type=click.Choice(['safetensors', 'legacy']),
default='safetensors',
show_default=True,
show_envvar=True,
envvar='OPENLLM_SERIALIZATION',
help='''Serialisation format for save/load LLM.
return cli_option('--serialisation',
'--serialization',
'serialisation_format',
type=click.Choice(['safetensors', 'legacy']),
default='safetensors',
show_default=True,
show_envvar=True,
envvar='OPENLLM_SERIALIZATION',
help='''Serialisation format for save/load LLM.
Currently the following strategies are supported:
@@ -529,28 +583,25 @@ def serialisation_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Cal
> [!NOTE] that GGML format is working in progress.
''',
**attrs
)(f)
**attrs)(f)
def container_registry_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
return cli_option(
'--container-registry',
'container_registry',
type=click.Choice(list(openllm.bundle.CONTAINER_NAMES)),
default='ecr',
show_default=True,
show_envvar=True,
envvar='OPENLLM_CONTAINER_REGISTRY',
callback=container_registry_callback,
help='''The default container registry to get the base image for building BentoLLM.
return cli_option('--container-registry',
'container_registry',
type=click.Choice(list(openllm.bundle.CONTAINER_NAMES)),
default='ecr',
show_default=True,
show_envvar=True,
envvar='OPENLLM_CONTAINER_REGISTRY',
callback=container_registry_callback,
help='''The default container registry to get the base image for building BentoLLM.
Currently, it supports 'ecr', 'ghcr.io', 'docker.io'
\b
> [!NOTE] that in order to build the base image, you will need a GPUs to compile custom kernel. See ``openllm ext build-base-container`` for more information.
''',
**attrs
)(f)
**attrs)(f)
_wpr_strategies = {'round_robin', 'conserved'}
@@ -562,11 +613,14 @@ def workers_per_resource_callback(ctx: click.Context, param: click.Parameter, va
try:
float(value) # type: ignore[arg-type]
except ValueError:
raise click.BadParameter(f"'workers_per_resource' only accept '{_wpr_strategies}' as possible strategies, otherwise pass in float.", ctx, param) from None
raise click.BadParameter(
f"'workers_per_resource' only accept '{_wpr_strategies}' as possible strategies, otherwise pass in float.",
ctx, param) from None
else:
return value
def container_registry_callback(ctx: click.Context, param: click.Parameter, value: str | None) -> str | None:
if value is None: return value
if value not in openllm.bundle.supported_registries: raise click.BadParameter(f'Value must be one of {openllm.bundle.supported_registries}', ctx, param)
if value not in openllm.bundle.supported_registries:
raise click.BadParameter(f'Value must be one of {openllm.bundle.supported_registries}', ctx, param)
return value

View File

@@ -30,25 +30,23 @@ if t.TYPE_CHECKING:
logger = logging.getLogger(__name__)
def _start(
model_name: str,
/,
*,
model_id: str | None = None,
timeout: int = 30,
workers_per_resource: t.Literal['conserved', 'round_robin'] | float | None = None,
device: tuple[str, ...] | t.Literal['all'] | None = None,
quantize: t.Literal['int8', 'int4', 'gptq'] | None = None,
bettertransformer: bool | None = None,
runtime: t.Literal['ggml', 'transformers'] = 'transformers',
adapter_map: dict[LiteralString, str | None] | None = None,
framework: LiteralRuntime | None = None,
additional_args: list[str] | None = None,
cors: bool = False,
_serve_grpc: bool = False,
__test__: bool = False,
**_: t.Any
) -> LLMConfig | subprocess.Popen[bytes]:
def _start(model_name: str,
/,
*,
model_id: str | None = None,
timeout: int = 30,
workers_per_resource: t.Literal['conserved', 'round_robin'] | float | None = None,
device: tuple[str, ...] | t.Literal['all'] | None = None,
quantize: t.Literal['int8', 'int4', 'gptq'] | None = None,
bettertransformer: bool | None = None,
runtime: t.Literal['ggml', 'transformers'] = 'transformers',
adapter_map: dict[LiteralString, str | None] | None = None,
framework: LiteralRuntime | None = None,
additional_args: list[str] | None = None,
cors: bool = False,
_serve_grpc: bool = False,
__test__: bool = False,
**_: t.Any) -> LLMConfig | subprocess.Popen[bytes]:
"""Python API to start a LLM server. These provides one-to-one mapping to CLI arguments.
For all additional arguments, pass it as string to ``additional_args``. For example, if you want to
@@ -91,58 +89,66 @@ def _start(
from .entrypoint import start_command
from .entrypoint import start_grpc_command
llm_config = openllm.AutoConfig.for_model(model_name)
_ModelEnv = openllm_core.utils.EnvVarMixin(
model_name,
openllm_core.utils.first_not_none(framework, default=llm_config.default_implementation()),
model_id=model_id,
bettertransformer=bettertransformer,
quantize=quantize,
runtime=runtime
)
_ModelEnv = openllm_core.utils.EnvVarMixin(model_name,
openllm_core.utils.first_not_none(
framework, default=llm_config.default_implementation()),
model_id=model_id,
bettertransformer=bettertransformer,
quantize=quantize,
runtime=runtime)
os.environ[_ModelEnv.framework] = _ModelEnv['framework_value']
args: list[str] = ['--runtime', runtime]
if model_id: args.extend(['--model-id', model_id])
if timeout: args.extend(['--server-timeout', str(timeout)])
if workers_per_resource: args.extend(['--workers-per-resource', str(workers_per_resource) if not isinstance(workers_per_resource, str) else workers_per_resource])
if workers_per_resource:
args.extend([
'--workers-per-resource',
str(workers_per_resource) if not isinstance(workers_per_resource, str) else workers_per_resource
])
if device and not os.environ.get('CUDA_VISIBLE_DEVICES'): args.extend(['--device', ','.join(device)])
if quantize and bettertransformer: raise OpenLLMException("'quantize' and 'bettertransformer' are currently mutually exclusive.")
if quantize and bettertransformer:
raise OpenLLMException("'quantize' and 'bettertransformer' are currently mutually exclusive.")
if quantize: args.extend(['--quantize', str(quantize)])
elif bettertransformer: args.append('--bettertransformer')
if cors: args.append('--cors')
if adapter_map: args.extend(list(itertools.chain.from_iterable([['--adapter-id', f"{k}{':'+v if v else ''}"] for k, v in adapter_map.items()])))
if adapter_map:
args.extend(
list(
itertools.chain.from_iterable([['--adapter-id', f"{k}{':'+v if v else ''}"] for k, v in adapter_map.items()
])))
if additional_args: args.extend(additional_args)
if __test__: args.append('--return-process')
return start_command_factory(start_command if not _serve_grpc else start_grpc_command, model_name, _context_settings=termui.CONTEXT_SETTINGS, _serve_grpc=_serve_grpc).main(
args=args if len(args) > 0 else None, standalone_mode=False
)
return start_command_factory(start_command if not _serve_grpc else start_grpc_command,
model_name,
_context_settings=termui.CONTEXT_SETTINGS,
_serve_grpc=_serve_grpc).main(args=args if len(args) > 0 else None,
standalone_mode=False)
@inject
def _build(
model_name: str,
/,
*,
model_id: str | None = None,
model_version: str | None = None,
bento_version: str | None = None,
quantize: t.Literal['int8', 'int4', 'gptq'] | None = None,
bettertransformer: bool | None = None,
adapter_map: dict[str, str | None] | None = None,
build_ctx: str | None = None,
enable_features: tuple[str, ...] | None = None,
workers_per_resource: float | None = None,
runtime: t.Literal['ggml', 'transformers'] = 'transformers',
dockerfile_template: str | None = None,
overwrite: bool = False,
container_registry: LiteralContainerRegistry | None = None,
container_version_strategy: LiteralContainerVersionStrategy | None = None,
push: bool = False,
containerize: bool = False,
serialisation_format: t.Literal['safetensors', 'legacy'] = 'safetensors',
additional_args: list[str] | None = None,
bento_store: BentoStore = Provide[BentoMLContainer.bento_store]
) -> bentoml.Bento:
def _build(model_name: str,
/,
*,
model_id: str | None = None,
model_version: str | None = None,
bento_version: str | None = None,
quantize: t.Literal['int8', 'int4', 'gptq'] | None = None,
bettertransformer: bool | None = None,
adapter_map: dict[str, str | None] | None = None,
build_ctx: str | None = None,
enable_features: tuple[str, ...] | None = None,
workers_per_resource: float | None = None,
runtime: t.Literal['ggml', 'transformers'] = 'transformers',
dockerfile_template: str | None = None,
overwrite: bool = False,
container_registry: LiteralContainerRegistry | None = None,
container_version_strategy: LiteralContainerVersionStrategy | None = None,
push: bool = False,
containerize: bool = False,
serialisation_format: t.Literal['safetensors', 'legacy'] = 'safetensors',
additional_args: list[str] | None = None,
bento_store: BentoStore = Provide[BentoMLContainer.bento_store]) -> bentoml.Bento:
"""Package a LLM into a Bento.
The LLM will be built into a BentoService with the following structure:
@@ -192,8 +198,12 @@ def _build(
Returns:
``bentoml.Bento | str``: BentoLLM instance. This can be used to serve the LLM or can be pushed to BentoCloud.
"""
args: list[str] = [sys.executable, '-m', 'openllm', 'build', model_name, '--machine', '--runtime', runtime, '--serialisation', serialisation_format]
if quantize and bettertransformer: raise OpenLLMException("'quantize' and 'bettertransformer' are currently mutually exclusive.")
args: list[str] = [
sys.executable, '-m', 'openllm', 'build', model_name, '--machine', '--runtime', runtime, '--serialisation',
serialisation_format
]
if quantize and bettertransformer:
raise OpenLLMException("'quantize' and 'bettertransformer' are currently mutually exclusive.")
if quantize: args.extend(['--quantize', quantize])
if bettertransformer: args.append('--bettertransformer')
if containerize and push: raise OpenLLMException("'containerize' and 'push' are currently mutually exclusive.")
@@ -221,21 +231,21 @@ def _build(
raise OpenLLMException(str(e)) from None
matched = re.match(r'__tag__:([^:\n]+:[^:\n]+)$', output.decode('utf-8').strip())
if matched is None:
raise ValueError(f"Failed to find tag from output: {output.decode('utf-8').strip()}\nNote: Output from 'openllm build' might not be correct. Please open an issue on GitHub.")
raise ValueError(
f"Failed to find tag from output: {output.decode('utf-8').strip()}\nNote: Output from 'openllm build' might not be correct. Please open an issue on GitHub."
)
return bentoml.get(matched.group(1), _bento_store=bento_store)
def _import_model(
model_name: str,
/,
*,
model_id: str | None = None,
model_version: str | None = None,
runtime: t.Literal['ggml', 'transformers'] = 'transformers',
implementation: LiteralRuntime = 'pt',
quantize: t.Literal['int8', 'int4', 'gptq'] | None = None,
serialisation_format: t.Literal['legacy', 'safetensors'] = 'safetensors',
additional_args: t.Sequence[str] | None = None
) -> bentoml.Model:
def _import_model(model_name: str,
/,
*,
model_id: str | None = None,
model_version: str | None = None,
runtime: t.Literal['ggml', 'transformers'] = 'transformers',
implementation: LiteralRuntime = 'pt',
quantize: t.Literal['int8', 'int4', 'gptq'] | None = None,
serialisation_format: t.Literal['legacy', 'safetensors'] = 'safetensors',
additional_args: t.Sequence[str] | None = None) -> bentoml.Model:
"""Import a LLM into local store.
> [!NOTE]
@@ -267,7 +277,10 @@ def _import_model(
``bentoml.Model``:BentoModel of the given LLM. This can be used to serve the LLM or can be pushed to BentoCloud.
"""
from .entrypoint import import_command
args = [model_name, '--runtime', runtime, '--implementation', implementation, '--machine', '--serialisation', serialisation_format,]
args = [
model_name, '--runtime', runtime, '--implementation', implementation, '--machine', '--serialisation',
serialisation_format,
]
if model_id is not None: args.append(model_id)
if model_version is not None: args.extend(['--model-version', str(model_version)])
if additional_args is not None: args.extend(additional_args)
@@ -278,5 +291,9 @@ def _list_models() -> dict[str, t.Any]:
'''List all available models within the local store.'''
from .entrypoint import models_command
return models_command.main(args=['-o', 'json', '--show-available', '--machine'], standalone_mode=False)
start, start_grpc, build, import_model, list_models = openllm_core.utils.codegen.gen_sdk(_start, _serve_grpc=False), openllm_core.utils.codegen.gen_sdk(_start, _serve_grpc=True), openllm_core.utils.codegen.gen_sdk(_build), openllm_core.utils.codegen.gen_sdk(_import_model), openllm_core.utils.codegen.gen_sdk(_list_models)
start, start_grpc, build, import_model, list_models = openllm_core.utils.codegen.gen_sdk(
_start, _serve_grpc=False), openllm_core.utils.codegen.gen_sdk(
_start, _serve_grpc=True), openllm_core.utils.codegen.gen_sdk(_build), openllm_core.utils.codegen.gen_sdk(
_import_model), openllm_core.utils.codegen.gen_sdk(_list_models)
__all__ = ['start', 'start_grpc', 'build', 'import_model', 'list_models']

View File

@@ -14,10 +14,9 @@ if t.TYPE_CHECKING:
from openllm_core._typing_compat import LiteralContainerRegistry
from openllm_core._typing_compat import LiteralContainerVersionStrategy
@click.command(
'build_base_container',
context_settings=termui.CONTEXT_SETTINGS,
help='''Base image builder for BentoLLM.
@click.command('build_base_container',
context_settings=termui.CONTEXT_SETTINGS,
help='''Base image builder for BentoLLM.
By default, the base image will include custom kernels (PagedAttention via vllm, FlashAttention-v2, etc.) built with CUDA 11.8, Python 3.9 on Ubuntu22.04.
Optionally, this can also be pushed directly to remote registry. Currently support ``docker.io``, ``ghcr.io`` and ``quay.io``.
@@ -27,13 +26,16 @@ if t.TYPE_CHECKING:
This command is only useful for debugging and for building custom base image for extending BentoML with custom base images and custom kernels.
Note that we already release images on our CI to ECR and GHCR, so you don't need to build it yourself.
'''
)
''')
@container_registry_option
@click.option('--version-strategy', type=click.Choice(['release', 'latest', 'nightly']), default='nightly', help='Version strategy to use for tagging the image.')
@click.option('--version-strategy',
type=click.Choice(['release', 'latest', 'nightly']),
default='nightly',
help='Version strategy to use for tagging the image.')
@click.option('--push/--no-push', help='Whether to push to remote repository', is_flag=True, default=False)
@machine_option
def cli(container_registry: tuple[LiteralContainerRegistry, ...] | None, version_strategy: LiteralContainerVersionStrategy, push: bool, machine: bool) -> dict[str, str]:
def cli(container_registry: tuple[LiteralContainerRegistry, ...] | None,
version_strategy: LiteralContainerVersionStrategy, push: bool, machine: bool) -> dict[str, str]:
mapping = openllm.bundle.build_container(container_registry, version_strategy, push, machine)
if machine: termui.echo(orjson.dumps(mapping, option=orjson.OPT_INDENT_2).decode(), fg='white')
return mapping

View File

@@ -24,14 +24,19 @@ if t.TYPE_CHECKING:
@machine_option
@click.pass_context
@inject
def cli(ctx: click.Context, bento: str, machine: bool, _bento_store: BentoStore = Provide[BentoMLContainer.bento_store]) -> str | None:
def cli(ctx: click.Context,
bento: str,
machine: bool,
_bento_store: BentoStore = Provide[BentoMLContainer.bento_store]) -> str | None:
'''Dive into a BentoLLM. This is synonymous to cd $(b get <bento>:<tag> -o path).'''
try:
bentomodel = _bento_store.get(bento)
except bentoml.exceptions.NotFound:
ctx.fail(f'Bento {bento} not found. Make sure to call `openllm build` first.')
if 'bundler' not in bentomodel.info.labels or bentomodel.info.labels['bundler'] != 'openllm.bundle':
ctx.fail(f"Bento is either too old or not built with OpenLLM. Make sure to use ``openllm build {bentomodel.info.labels['start_name']}`` for correctness.")
ctx.fail(
f"Bento is either too old or not built with OpenLLM. Make sure to use ``openllm build {bentomodel.info.labels['start_name']}`` for correctness."
)
if machine: return bentomodel.path
# copy and paste this into a new shell
if psutil.WINDOWS: subprocess.check_call([shutil.which('dir') or 'dir'], cwd=bentomodel.path)

View File

@@ -19,7 +19,9 @@ from openllm_core.utils import bentoml_cattr
if t.TYPE_CHECKING:
from bentoml._internal.bento import BentoStore
@click.command('get_containerfile', context_settings=termui.CONTEXT_SETTINGS, help='Return Containerfile of any given Bento.')
@click.command('get_containerfile',
context_settings=termui.CONTEXT_SETTINGS,
help='Return Containerfile of any given Bento.')
@click.argument('bento', type=str, shell_complete=bento_complete_envvar)
@click.pass_context
@inject
@@ -39,7 +41,13 @@ def cli(ctx: click.Context, bento: str, _bento_store: BentoStore = Provide[Bento
# NOTE: if users specify a dockerfile_template, we will
# save it to /env/docker/Dockerfile.template. This is necessary
# for the reconstruction of the Dockerfile.
if 'dockerfile_template' in docker_attrs and docker_attrs['dockerfile_template'] is not None: docker_attrs['dockerfile_template'] = 'env/docker/Dockerfile.template'
doc = generate_containerfile(docker=DockerOptions(**docker_attrs), build_ctx=bentomodel.path, conda=options.conda, bento_fs=bentomodel._fs, enable_buildkit=True, add_header=True)
if 'dockerfile_template' in docker_attrs and docker_attrs['dockerfile_template'] is not None:
docker_attrs['dockerfile_template'] = 'env/docker/Dockerfile.template'
doc = generate_containerfile(docker=DockerOptions(**docker_attrs),
build_ctx=bentomodel.path,
conda=options.conda,
bento_fs=bentomodel._fs,
enable_buildkit=True,
add_header=True)
termui.echo(doc, fg='white')
return bentomodel.path

View File

@@ -18,41 +18,51 @@ from openllm_core._prompt import process_prompt
LiteralOutput = t.Literal['json', 'pretty', 'porcelain']
@click.command('get_prompt', context_settings=termui.CONTEXT_SETTINGS)
@click.argument('model_name', type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING.keys()]), shell_complete=model_complete_envvar)
@click.argument('model_name',
type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING.keys()]),
shell_complete=model_complete_envvar)
@click.argument('prompt', type=click.STRING)
@output_option
@click.option('--format', type=click.STRING, default=None)
@machine_option
@click.option(
'--opt',
help="Define additional prompt variables. (format: ``--opt system_prompt='You are a useful assistant'``)",
required=False,
multiple=True,
callback=opt_callback,
metavar='ARG=VALUE[,ARG=VALUE]'
)
@click.option('--opt',
help="Define additional prompt variables. (format: ``--opt system_prompt='You are a useful assistant'``)",
required=False,
multiple=True,
callback=opt_callback,
metavar='ARG=VALUE[,ARG=VALUE]')
@click.pass_context
def cli(ctx: click.Context, /, model_name: str, prompt: str, format: str | None, output: LiteralOutput, machine: bool, _memoized: dict[str, t.Any], **_: t.Any) -> str | None:
def cli(ctx: click.Context, /, model_name: str, prompt: str, format: str | None, output: LiteralOutput, machine: bool,
_memoized: dict[str, t.Any], **_: t.Any) -> str | None:
'''Get the default prompt used by OpenLLM.'''
module = openllm.utils.EnvVarMixin(model_name).module
_memoized = {k: v[0] for k, v in _memoized.items() if v}
try:
template = getattr(module, 'DEFAULT_PROMPT_TEMPLATE', None)
prompt_mapping = getattr(module, 'PROMPT_MAPPING', None)
if template is None: raise click.BadArgumentUsage(f'model {model_name} does not have a default prompt template') from None
if template is None:
raise click.BadArgumentUsage(f'model {model_name} does not have a default prompt template') from None
if callable(template):
if format is None:
if not hasattr(module, 'PROMPT_MAPPING') or module.PROMPT_MAPPING is None: raise RuntimeError('Failed to find prompt mapping while DEFAULT_PROMPT_TEMPLATE is a function.')
raise click.BadOptionUsage('format', f"{model_name} prompt requires passing '--format' (available format: {list(module.PROMPT_MAPPING)})")
if prompt_mapping is None: raise click.BadArgumentUsage(f'Failed to fine prompt mapping while the default prompt for {model_name} is a callable.') from None
if format not in prompt_mapping: raise click.BadOptionUsage('format', f'Given format {format} is not valid for {model_name} (available format: {list(prompt_mapping)})')
if not hasattr(module, 'PROMPT_MAPPING') or module.PROMPT_MAPPING is None:
raise RuntimeError('Failed to find prompt mapping while DEFAULT_PROMPT_TEMPLATE is a function.')
raise click.BadOptionUsage(
'format',
f"{model_name} prompt requires passing '--format' (available format: {list(module.PROMPT_MAPPING)})")
if prompt_mapping is None:
raise click.BadArgumentUsage(
f'Failed to fine prompt mapping while the default prompt for {model_name} is a callable.') from None
if format not in prompt_mapping:
raise click.BadOptionUsage(
'format', f'Given format {format} is not valid for {model_name} (available format: {list(prompt_mapping)})')
_prompt_template = template(format)
else:
_prompt_template = template
fully_formatted = process_prompt(prompt, _prompt_template, True, **_memoized)
if machine: return repr(fully_formatted)
elif output == 'porcelain': termui.echo(repr(fully_formatted), fg='white')
elif output == 'json': termui.echo(orjson.dumps({'prompt': fully_formatted}, option=orjson.OPT_INDENT_2).decode(), fg='white')
elif output == 'json':
termui.echo(orjson.dumps({'prompt': fully_formatted}, option=orjson.OPT_INDENT_2).decode(), fg='white')
else:
termui.echo(f'== Prompt for {model_name} ==\n', fg='magenta')
termui.echo(fully_formatted, fg='white')

View File

@@ -19,23 +19,27 @@ def cli(ctx: click.Context, output: LiteralOutput) -> None:
'''List available bentos built by OpenLLM.'''
mapping = {
k: [{
'tag': str(b.tag),
'size': human_readable_size(openllm.utils.calc_dir_size(b.path)),
'tag':
str(b.tag),
'size':
human_readable_size(openllm.utils.calc_dir_size(b.path)),
'models': [{
'tag': str(m.tag), 'size': human_readable_size(openllm.utils.calc_dir_size(m.path))
'tag': str(m.tag),
'size': human_readable_size(openllm.utils.calc_dir_size(m.path))
} for m in (bentoml.models.get(_.tag) for _ in b.info.models)]
} for b in tuple(i for i in bentoml.list() if all(k in i.info.labels for k in {'start_name', 'bundler'})) if b.info.labels['start_name'] == k] for k in tuple(
inflection.dasherize(key) for key in openllm.CONFIG_MAPPING.keys()
)
} for b in tuple(i for i in bentoml.list() if all(
k in i.info.labels for k in {'start_name', 'bundler'})) if b.info.labels['start_name'] == k
] for k in tuple(inflection.dasherize(key) for key in openllm.CONFIG_MAPPING.keys())
}
mapping = {k: v for k, v in mapping.items() if v}
if output == 'pretty':
import tabulate
tabulate.PRESERVE_WHITESPACE = True
termui.echo(
tabulate.tabulate([(k, i['tag'], i['size'], [_['tag'] for _ in i['models']]) for k, v in mapping.items() for i in v], tablefmt='fancy_grid', headers=['LLM', 'Tag', 'Size', 'Models']),
fg='white'
)
termui.echo(tabulate.tabulate(
[(k, i['tag'], i['size'], [_['tag'] for _ in i['models']]) for k, v in mapping.items() for i in v],
tablefmt='fancy_grid',
headers=['LLM', 'Tag', 'Size', 'Models']),
fg='white')
else:
termui.echo(orjson.dumps(mapping, option=orjson.OPT_INDENT_2).decode(), fg='white')
ctx.exit(0)

View File

@@ -25,17 +25,33 @@ def cli(model_name: str | None, output: LiteralOutput) -> DictStrAny:
'''This is equivalent to openllm models --show-available less the nice table.'''
models = tuple(inflection.dasherize(key) for key in openllm.CONFIG_MAPPING.keys())
ids_in_local_store = {
k: [i for i in bentoml.models.list() if 'framework' in i.info.labels and i.info.labels['framework'] == 'openllm' and 'model_name' in i.info.labels and i.info.labels['model_name'] == k
] for k in models
k: [
i for i in bentoml.models.list() if 'framework' in i.info.labels and
i.info.labels['framework'] == 'openllm' and 'model_name' in i.info.labels and i.info.labels['model_name'] == k
] for k in models
}
if model_name is not None:
ids_in_local_store = {k: [i for i in v if 'model_name' in i.info.labels and i.info.labels['model_name'] == inflection.dasherize(model_name)] for k, v in ids_in_local_store.items()}
ids_in_local_store = {
k: [
i
for i in v
if 'model_name' in i.info.labels and i.info.labels['model_name'] == inflection.dasherize(model_name)
] for k, v in ids_in_local_store.items()
}
ids_in_local_store = {k: v for k, v in ids_in_local_store.items() if v}
local_models = {k: [{'tag': str(i.tag), 'size': human_readable_size(openllm.utils.calc_dir_size(i.path))} for i in val] for k, val in ids_in_local_store.items()}
local_models = {
k: [{
'tag': str(i.tag),
'size': human_readable_size(openllm.utils.calc_dir_size(i.path))
} for i in val] for k, val in ids_in_local_store.items()
}
if output == 'pretty':
import tabulate
tabulate.PRESERVE_WHITESPACE = True
termui.echo(tabulate.tabulate([(k, i['tag'], i['size']) for k, v in local_models.items() for i in v], tablefmt='fancy_grid', headers=['LLM', 'Tag', 'Size']), fg='white')
termui.echo(tabulate.tabulate([(k, i['tag'], i['size']) for k, v in local_models.items() for i in v],
tablefmt='fancy_grid',
headers=['LLM', 'Tag', 'Size']),
fg='white')
else:
termui.echo(orjson.dumps(local_models, option=orjson.OPT_INDENT_2).decode(), fg='white')
return local_models

View File

@@ -28,12 +28,18 @@ logger = logging.getLogger(__name__)
def load_notebook_metadata() -> DictStrAny:
with open(os.path.join(os.path.dirname(playground.__file__), '_meta.yml'), 'r') as f:
content = yaml.safe_load(f)
if not all('description' in k for k in content.values()): raise ValueError("Invalid metadata file. All entries must have a 'description' key.")
if not all('description' in k for k in content.values()):
raise ValueError("Invalid metadata file. All entries must have a 'description' key.")
return content
@click.command('playground', context_settings=termui.CONTEXT_SETTINGS)
@click.argument('output-dir', default=None, required=False)
@click.option('--port', envvar='JUPYTER_PORT', show_envvar=True, show_default=True, default=8888, help='Default port for Jupyter server')
@click.option('--port',
envvar='JUPYTER_PORT',
show_envvar=True,
show_default=True,
default=8888,
help='Default port for Jupyter server')
@click.pass_context
def cli(ctx: click.Context, output_dir: str | None, port: int) -> None:
"""OpenLLM Playground.
@@ -54,7 +60,9 @@ def cli(ctx: click.Context, output_dir: str | None, port: int) -> None:
> This command requires Jupyter to be installed. Install it with 'pip install "openllm[playground]"'
"""
if not is_jupyter_available() or not is_jupytext_available() or not is_notebook_available():
raise RuntimeError("Playground requires 'jupyter', 'jupytext', and 'notebook'. Install it with 'pip install \"openllm[playground]\"'")
raise RuntimeError(
"Playground requires 'jupyter', 'jupytext', and 'notebook'. Install it with 'pip install \"openllm[playground]\"'"
)
metadata = load_notebook_metadata()
_temp_dir = False
if output_dir is None:
@@ -66,7 +74,8 @@ def cli(ctx: click.Context, output_dir: str | None, port: int) -> None:
termui.echo('The playground notebooks will be saved to: ' + os.path.abspath(output_dir), fg='blue')
for module in pkgutil.iter_modules(playground.__path__):
if module.ispkg or os.path.exists(os.path.join(output_dir, module.name + '.ipynb')):
logger.debug('Skipping: %s (%s)', module.name, 'File already exists' if not module.ispkg else f'{module.name} is a module')
logger.debug('Skipping: %s (%s)', module.name,
'File already exists' if not module.ispkg else f'{module.name} is a module')
continue
if not isinstance(module.module_finder, importlib.machinery.FileFinder): continue
termui.echo('Generating notebook for: ' + module.name, fg='magenta')
@@ -75,7 +84,10 @@ def cli(ctx: click.Context, output_dir: str | None, port: int) -> None:
f.cells.insert(0, markdown_cell)
jupytext.write(f, os.path.join(output_dir, module.name + '.ipynb'), fmt='notebook')
try:
subprocess.check_output([sys.executable, '-m', 'jupyter', 'notebook', '--notebook-dir', output_dir, '--port', str(port), '--no-browser', '--debug'])
subprocess.check_output([
sys.executable, '-m', 'jupyter', 'notebook', '--notebook-dir', output_dir, '--port',
str(port), '--no-browser', '--debug'
])
except subprocess.CalledProcessError as e:
termui.echo(e.output, fg='red')
raise click.ClickException(f'Failed to start a jupyter server:\n{e}') from None

View File

@@ -12,8 +12,13 @@ if t.TYPE_CHECKING:
def echo(text: t.Any, fg: str = 'green', _with_style: bool = True, **attrs: t.Any) -> None:
attrs['fg'] = fg if not openllm.utils.get_debug_mode() else None
if not openllm.utils.get_quiet_mode(): t.cast(t.Callable[..., None], click.echo if not _with_style else click.secho)(text, **attrs)
if not openllm.utils.get_quiet_mode():
t.cast(t.Callable[..., None], click.echo if not _with_style else click.secho)(text, **attrs)
COLUMNS: int = int(os.environ.get('COLUMNS', str(120)))
CONTEXT_SETTINGS: DictStrAny = {'help_option_names': ['-h', '--help'], 'max_content_width': COLUMNS, 'token_normalize_func': inflection.underscore}
CONTEXT_SETTINGS: DictStrAny = {
'help_option_names': ['-h', '--help'],
'max_content_width': COLUMNS,
'token_normalize_func': inflection.underscore
}
__all__ = ['echo', 'COLUMNS', 'CONTEXT_SETTINGS']

View File

@@ -43,7 +43,8 @@ except openllm.exceptions.MissingDependencyError:
pass
else:
_import_structure['modeling_flax_auto'].extend(['AutoFlaxLLM', 'MODEL_FLAX_MAPPING'])
if t.TYPE_CHECKING: from .modeling_flax_auto import MODEL_FLAX_MAPPING as MODEL_FLAX_MAPPING, AutoFlaxLLM as AutoFlaxLLM
if t.TYPE_CHECKING:
from .modeling_flax_auto import MODEL_FLAX_MAPPING as MODEL_FLAX_MAPPING, AutoFlaxLLM as AutoFlaxLLM
try:
if not is_tf_available(): raise openllm.exceptions.MissingDependencyError
except openllm.exceptions.MissingDependencyError:

View File

@@ -30,10 +30,18 @@ class BaseAutoLLMClass:
_model_mapping: t.ClassVar[_LazyAutoMapping]
def __init__(self, *args: t.Any, **attrs: t.Any):
raise EnvironmentError(f"Cannot instantiate {self.__class__.__name__} directly. Please use '{self.__class__.__name__}.Runner(model_name)' instead.")
raise EnvironmentError(
f"Cannot instantiate {self.__class__.__name__} directly. Please use '{self.__class__.__name__}.Runner(model_name)' instead."
)
@classmethod
def for_model(cls, model: str, /, model_id: str | None = None, model_version: str | None = None, llm_config: openllm.LLMConfig | None = None, ensure_available: bool = False,
def for_model(cls,
model: str,
/,
model_id: str | None = None,
model_version: str | None = None,
llm_config: openllm.LLMConfig | None = None,
ensure_available: bool = False,
**attrs: t.Any) -> openllm.LLM[t.Any, t.Any]:
'''The lower level API for creating a LLM instance.
@@ -42,7 +50,10 @@ class BaseAutoLLMClass:
>>> llm = openllm.AutoLLM.for_model("flan-t5")
```
'''
llm = cls.infer_class_from_name(model).from_pretrained(model_id=model_id, model_version=model_version, llm_config=llm_config, **attrs)
llm = cls.infer_class_from_name(model).from_pretrained(model_id=model_id,
model_version=model_version,
llm_config=llm_config,
**attrs)
if ensure_available: llm.ensure_model_id_exists()
return llm
@@ -105,7 +116,9 @@ class _LazyAutoMapping(OrderedDict, ReprMixin):
This OrderedDict values() and keys() returns the list instead, so you don't
have to do list(mapping.values()) to get the list of values.
"""
def __init__(self, config_mapping: OrderedDict[LiteralString, LiteralString], model_mapping: OrderedDict[LiteralString, LiteralString]):
def __init__(self, config_mapping: OrderedDict[LiteralString, LiteralString],
model_mapping: OrderedDict[LiteralString, LiteralString]):
self._config_mapping = config_mapping
self._reverse_config_mapping = {v: k for k, v in config_mapping.items()}
self._model_mapping = model_mapping
@@ -115,7 +128,8 @@ class _LazyAutoMapping(OrderedDict, ReprMixin):
def __getitem__(self, key: type[openllm.LLMConfig]) -> type[openllm.LLM[t.Any, t.Any]]:
if key in self._extra_content: return self._extra_content[key]
model_type = self._reverse_config_mapping[key.__name__]
if model_type in self._model_mapping: return self._load_attr_from_module(model_type, self._model_mapping[model_type])
if model_type in self._model_mapping:
return self._load_attr_from_module(model_type, self._model_mapping[model_type])
# Maybe there was several model types associated with this config.
model_types = [k for k, v in self._config_mapping.items() if v == key.__name__]
for mtype in model_types:
@@ -124,7 +138,8 @@ class _LazyAutoMapping(OrderedDict, ReprMixin):
def _load_attr_from_module(self, model_type: str, attr: str) -> t.Any:
module_name = inflection.underscore(model_type)
if module_name not in self._modules: self._modules[module_name] = importlib.import_module(f'.{module_name}', 'openllm.models')
if module_name not in self._modules:
self._modules[module_name] = importlib.import_module(f'.{module_name}', 'openllm.models')
return getattribute_from_module(self._modules[module_name], attr)
def __len__(self) -> int:
@@ -138,29 +153,32 @@ class _LazyAutoMapping(OrderedDict, ReprMixin):
return ReprMixin.__repr__(self)
def __repr_args__(self) -> t.Generator[tuple[str, tuple[str, str]], t.Any, t.Any]:
yield from ((key, (value, self._model_mapping[key])) for key, value in self._config_mapping.items() if key in self._model_mapping)
yield from ((key, (value, self._model_mapping[key]))
for key, value in self._config_mapping.items()
if key in self._model_mapping)
def __bool__(self) -> bool:
return bool(self.keys())
def keys(self) -> ConfigModelKeysView:
return t.cast(
'ConfigModelKeysView', [self._load_attr_from_module(key, name) for key, name in self._config_mapping.items() if key in self._model_mapping.keys()] + list(self._extra_content.keys())
)
return t.cast('ConfigModelKeysView', [
self._load_attr_from_module(key, name)
for key, name in self._config_mapping.items()
if key in self._model_mapping.keys()
] + list(self._extra_content.keys()))
def values(self) -> ConfigModelValuesView:
return t.cast(
'ConfigModelValuesView', [self._load_attr_from_module(key, name) for key, name in self._model_mapping.items() if key in self._config_mapping.keys()] + list(
self._extra_content.values()
)
)
return t.cast('ConfigModelValuesView', [
self._load_attr_from_module(key, name)
for key, name in self._model_mapping.items()
if key in self._config_mapping.keys()
] + list(self._extra_content.values()))
def items(self) -> ConfigModelItemsView:
return t.cast(
'ConfigModelItemsView',
[(self._load_attr_from_module(key, self._config_mapping[key]),
self._load_attr_from_module(key, self._model_mapping[key])) for key in self._model_mapping.keys() if key in self._config_mapping.keys()] + list(self._extra_content.items())
)
return t.cast('ConfigModelItemsView', [(self._load_attr_from_module(
key, self._config_mapping[key]), self._load_attr_from_module(key, self._model_mapping[key]))
for key in self._model_mapping.keys()
if key in self._config_mapping.keys()] + list(self._extra_content.items()))
def __iter__(self) -> t.Iterator[type[openllm.LLMConfig]]:
return iter(t.cast('SupportsIter[t.Iterator[type[openllm.LLMConfig]]]', self.keys()))
@@ -172,7 +190,8 @@ class _LazyAutoMapping(OrderedDict, ReprMixin):
def register(self, key: t.Any, value: t.Any) -> None:
if hasattr(key, '__name__') and key.__name__ in self._reverse_config_mapping:
if self._reverse_config_mapping[key.__name__] in self._model_mapping.keys(): raise ValueError(f"'{key}' is already used by a OpenLLM model.")
if self._reverse_config_mapping[key.__name__] in self._model_mapping.keys():
raise ValueError(f"'{key}' is already used by a OpenLLM model.")
self._extra_content[key] = value
__all__ = ['BaseAutoLLMClass', '_LazyAutoMapping']

View File

@@ -7,9 +7,10 @@ from openllm_core.config import CONFIG_MAPPING_NAMES
from .factory import BaseAutoLLMClass
from .factory import _LazyAutoMapping
MODEL_MAPPING_NAMES = OrderedDict([('chatglm', 'ChatGLM'), ('dolly_v2', 'DollyV2'), ('falcon', 'Falcon'), ('flan_t5', 'FlanT5'), ('gpt_neox', 'GPTNeoX'), ('llama', 'Llama'), ('mpt', 'MPT'), (
'opt', 'OPT'
), ('stablelm', 'StableLM'), ('starcoder', 'StarCoder'), ('baichuan', 'Baichuan')])
MODEL_MAPPING_NAMES = OrderedDict([('chatglm', 'ChatGLM'), ('dolly_v2', 'DollyV2'), ('falcon', 'Falcon'),
('flan_t5', 'FlanT5'), ('gpt_neox', 'GPTNeoX'), ('llama', 'Llama'), ('mpt', 'MPT'),
('opt', 'OPT'), ('stablelm', 'StableLM'), ('starcoder', 'StarCoder'),
('baichuan', 'Baichuan')])
MODEL_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_MAPPING_NAMES)
class AutoLLM(BaseAutoLLMClass):

View File

@@ -7,9 +7,10 @@ from openllm_core.config import CONFIG_MAPPING_NAMES
from .factory import BaseAutoLLMClass
from .factory import _LazyAutoMapping
MODEL_VLLM_MAPPING_NAMES = OrderedDict([('baichuan', 'VLLMBaichuan'), ('dolly_v2', 'VLLMDollyV2'), ('falcon', 'VLLMFalcon'), ('gpt_neox', 'VLLMGPTNeoX'), ('mpt', 'VLLMMPT'), (
'opt', 'VLLMOPT'
), ('stablelm', 'VLLMStableLM'), ('starcoder', 'VLLMStarCoder'), ('llama', 'VLLMLlama')])
MODEL_VLLM_MAPPING_NAMES = OrderedDict([('baichuan', 'VLLMBaichuan'), ('dolly_v2', 'VLLMDollyV2'),
('falcon', 'VLLMFalcon'), ('gpt_neox', 'VLLMGPTNeoX'), ('mpt', 'VLLMMPT'),
('opt', 'VLLMOPT'), ('stablelm', 'VLLMStableLM'),
('starcoder', 'VLLMStarCoder'), ('llama', 'VLLMLlama')])
MODEL_VLLM_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_VLLM_MAPPING_NAMES)
class AutoVLLM(BaseAutoLLMClass):

View File

@@ -11,5 +11,6 @@ class Baichuan(openllm.LLM['transformers.PreTrainedModel', 'transformers.PreTrai
import torch
inputs = self.tokenizer(prompt, return_tensors='pt').to(self.device)
with torch.inference_mode(), torch.autocast('cuda', dtype=torch.float16): # type: ignore[attr-defined]
outputs = self.model.generate(**inputs, generation_config=self.config.model_construct_env(**attrs).to_generation_config())
outputs = self.model.generate(**inputs,
generation_config=self.config.model_construct_env(**attrs).to_generation_config())
return self.tokenizer.batch_decode(outputs, skip_special_tokens=True)

View File

@@ -14,7 +14,9 @@ class ChatGLM(openllm.LLM['transformers.PreTrainedModel', 'transformers.PreTrain
self.model.eval()
# Only use half precision if the model is not yet quantized
if self.config.use_half_precision: self.model.half()
return self.model.chat(self.tokenizer, prompt, generation_config=self.config.model_construct_env(**attrs).to_generation_config())
return self.model.chat(self.tokenizer,
prompt,
generation_config=self.config.model_construct_env(**attrs).to_generation_config())
def embeddings(self, prompts: list[str]) -> openllm.LLMEmbeddings:
import torch

View File

@@ -10,29 +10,57 @@ from openllm_core.config.configuration_dolly_v2 import END_KEY
from openllm_core.config.configuration_dolly_v2 import RESPONSE_KEY
from openllm_core.config.configuration_dolly_v2 import get_special_token_id
if t.TYPE_CHECKING: import torch, transformers, tensorflow as tf
else: torch, transformers, tf = openllm.utils.LazyLoader('torch', globals(), 'torch'), openllm.utils.LazyLoader('transformers', globals(), 'transformers'), openllm.utils.LazyLoader('tf', globals(), 'tensorflow')
else:
torch, transformers, tf = openllm.utils.LazyLoader('torch', globals(), 'torch'), openllm.utils.LazyLoader(
'transformers', globals(), 'transformers'), openllm.utils.LazyLoader('tf', globals(), 'tensorflow')
logger = logging.getLogger(__name__)
@overload
def get_pipeline(model: transformers.PreTrainedModel, tokenizer: transformers.PreTrainedTokenizer, _init: t.Literal[True] = True, **attrs: t.Any) -> transformers.Pipeline:
def get_pipeline(model: transformers.PreTrainedModel,
tokenizer: transformers.PreTrainedTokenizer,
_init: t.Literal[True] = True,
**attrs: t.Any) -> transformers.Pipeline:
...
@overload
def get_pipeline(model: transformers.PreTrainedModel, tokenizer: transformers.PreTrainedTokenizer, _init: t.Literal[False] = ..., **attrs: t.Any) -> type[transformers.Pipeline]:
def get_pipeline(model: transformers.PreTrainedModel,
tokenizer: transformers.PreTrainedTokenizer,
_init: t.Literal[False] = ...,
**attrs: t.Any) -> type[transformers.Pipeline]:
...
def get_pipeline(model: transformers.PreTrainedModel, tokenizer: transformers.PreTrainedTokenizer, _init: bool = False, **attrs: t.Any) -> type[transformers.Pipeline] | transformers.Pipeline:
def get_pipeline(model: transformers.PreTrainedModel,
tokenizer: transformers.PreTrainedTokenizer,
_init: bool = False,
**attrs: t.Any) -> type[transformers.Pipeline] | transformers.Pipeline:
# Lazy loading the pipeline. See databricks' implementation on HuggingFace for more information.
class InstructionTextGenerationPipeline(transformers.Pipeline):
def __init__(self, *args: t.Any, do_sample: bool = True, max_new_tokens: int = 256, top_p: float = 0.92, top_k: int = 0, **kwargs: t.Any):
super().__init__(*args, model=model, tokenizer=tokenizer, do_sample=do_sample, max_new_tokens=max_new_tokens, top_p=top_p, top_k=top_k, **kwargs)
def _sanitize_parameters(self, return_full_text: bool | None = None, **generate_kwargs: t.Any) -> tuple[dict[str, t.Any], dict[str, t.Any], dict[str, t.Any]]:
def __init__(self,
*args: t.Any,
do_sample: bool = True,
max_new_tokens: int = 256,
top_p: float = 0.92,
top_k: int = 0,
**kwargs: t.Any):
super().__init__(*args,
model=model,
tokenizer=tokenizer,
do_sample=do_sample,
max_new_tokens=max_new_tokens,
top_p=top_p,
top_k=top_k,
**kwargs)
def _sanitize_parameters(self,
return_full_text: bool | None = None,
**generate_kwargs: t.Any) -> tuple[dict[str, t.Any], dict[str, t.Any], dict[str, t.Any]]:
if t.TYPE_CHECKING: assert self.tokenizer is not None
preprocess_params: dict[str, t.Any] = {}
# newer versions of the tokenizer configure the response key as a special token. newer versions still may
# append a newline to yield a single token. find whatever token is configured for the response key.
tokenizer_response_key = next((token for token in self.tokenizer.additional_special_tokens if token.startswith(RESPONSE_KEY)), None)
tokenizer_response_key = next(
(token for token in self.tokenizer.additional_special_tokens if token.startswith(RESPONSE_KEY)), None)
response_key_token_id = None
end_key_token_id = None
if tokenizer_response_key:
@@ -56,7 +84,8 @@ def get_pipeline(model: transformers.PreTrainedModel, tokenizer: transformers.Pr
inputs['instruction_text'] = input_
return t.cast(t.Dict[str, t.Any], inputs)
def _forward(self, input_tensors: dict[str, t.Any], **generate_kwargs: t.Any) -> transformers.utils.generic.ModelOutput:
def _forward(self, input_tensors: dict[str, t.Any],
**generate_kwargs: t.Any) -> transformers.utils.generic.ModelOutput:
if t.TYPE_CHECKING: assert self.tokenizer is not None
input_ids, attention_mask = input_tensors['input_ids'], input_tensors.get('attention_mask', None)
if input_ids.shape[1] == 0: input_ids, attention_mask, in_b = None, None, 1
@@ -65,15 +94,20 @@ def get_pipeline(model: transformers.PreTrainedModel, tokenizer: transformers.Pr
input_ids=input_ids.to(self.model.device) if input_ids is not None else None,
attention_mask=attention_mask.to(self.model.device) if attention_mask is not None else None,
pad_token_id=self.tokenizer.pad_token_id,
**generate_kwargs
)
**generate_kwargs)
out_b = generated_sequence.shape[0]
if self.framework == 'pt': generated_sequence = generated_sequence.reshape(in_b, out_b // in_b, *generated_sequence.shape[1:])
elif self.framework == 'tf': generated_sequence = tf.reshape(generated_sequence, (in_b, out_b // in_b, *generated_sequence.shape[1:]))
if self.framework == 'pt':
generated_sequence = generated_sequence.reshape(in_b, out_b // in_b, *generated_sequence.shape[1:])
elif self.framework == 'tf':
generated_sequence = tf.reshape(generated_sequence, (in_b, out_b // in_b, *generated_sequence.shape[1:]))
instruction_text = input_tensors.pop('instruction_text')
return {'generated_sequence': generated_sequence, 'input_ids': input_ids, 'instruction_text': instruction_text}
def postprocess(self, model_outputs: dict[str, t.Any], response_key_token_id: int, end_key_token_id: int, return_full_text: bool = False) -> list[dict[t.Literal['generated_text'], str]]:
def postprocess(self,
model_outputs: dict[str, t.Any],
response_key_token_id: int,
end_key_token_id: int,
return_full_text: bool = False) -> list[dict[t.Literal['generated_text'], str]]:
if t.TYPE_CHECKING: assert self.tokenizer is not None
_generated_sequence, instruction_text = model_outputs['generated_sequence'][0], model_outputs['instruction_text']
generated_sequence: list[list[int]] = _generated_sequence.numpy().tolist()
@@ -89,7 +123,8 @@ def get_pipeline(model: transformers.PreTrainedModel, tokenizer: transformers.Pr
response_pos = sequence.index(response_key_token_id)
except ValueError:
response_pos = None
if response_pos is None: logger.warning('Could not find response key %s in: %s', response_key_token_id, sequence)
if response_pos is None:
logger.warning('Could not find response key %s in: %s', response_key_token_id, sequence)
if response_pos:
# Next find where "### End" is located. The model has been trained to end its responses with this
# sequence (or actually, the token ID it maps to, since it is a special token). We may not find
@@ -127,12 +162,20 @@ class DollyV2(openllm.LLM['transformers.Pipeline', 'transformers.PreTrainedToken
@property
def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]:
return {'device_map': 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None, 'torch_dtype': torch.bfloat16}, {}
return {
'device_map': 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None,
'torch_dtype': torch.bfloat16
}, {}
def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.Pipeline:
return get_pipeline(transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, *args, **attrs), self.tokenizer, _init=True, return_full_text=self.config.return_full_text)
return get_pipeline(transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, *args, **attrs),
self.tokenizer,
_init=True,
return_full_text=self.config.return_full_text)
def generate(self, prompt: str, **attrs: t.Any) -> list[dict[t.Literal['generated_text'], str]]:
llm_config = self.config.model_construct_env(**attrs)
with torch.inference_mode():
return self.model(prompt, return_full_text=llm_config.return_full_text, generation_config=llm_config.to_generation_config())
return self.model(prompt,
return_full_text=llm_config.return_full_text,
generation_config=llm_config.to_generation_config())

View File

@@ -3,32 +3,43 @@ import typing as t
import openllm
if t.TYPE_CHECKING: import torch, transformers
else: torch, transformers = openllm.utils.LazyLoader('torch', globals(), 'torch'), openllm.utils.LazyLoader('transformers', globals(), 'transformers')
else:
torch, transformers = openllm.utils.LazyLoader('torch', globals(),
'torch'), openllm.utils.LazyLoader('transformers', globals(),
'transformers')
class Falcon(openllm.LLM['transformers.PreTrainedModel', 'transformers.PreTrainedTokenizerBase']):
__openllm_internal__ = True
@property
def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]:
return {'torch_dtype': torch.bfloat16, 'device_map': 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None}, {}
return {
'torch_dtype': torch.bfloat16,
'device_map': 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None
}, {}
def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
eos_token_id, inputs = attrs.pop('eos_token_id', self.tokenizer.eos_token_id), self.tokenizer(prompt, return_tensors='pt').to(self.device)
eos_token_id, inputs = attrs.pop('eos_token_id',
self.tokenizer.eos_token_id), self.tokenizer(prompt,
return_tensors='pt').to(self.device)
with torch.inference_mode(), torch.autocast('cuda', dtype=torch.float16): # type: ignore[attr-defined]
return self.tokenizer.batch_decode(
self.model.generate(
input_ids=inputs['input_ids'],
attention_mask=inputs['attention_mask'],
generation_config=self.config.model_construct_env(eos_token_id=eos_token_id, **attrs).to_generation_config()
),
skip_special_tokens=True
)
return self.tokenizer.batch_decode(self.model.generate(
input_ids=inputs['input_ids'],
attention_mask=inputs['attention_mask'],
generation_config=self.config.model_construct_env(eos_token_id=eos_token_id, **attrs).to_generation_config()),
skip_special_tokens=True)
def generate_one(self, prompt: str, stop: list[str], **preprocess_generate_kwds: t.Any) -> list[dict[t.Literal['generated_text'], str]]:
max_new_tokens, encoded_inputs = preprocess_generate_kwds.pop('max_new_tokens', 200), self.tokenizer(prompt, return_tensors='pt').to(self.device)
src_len, stopping_criteria = encoded_inputs['input_ids'].shape[1], preprocess_generate_kwds.pop('stopping_criteria', openllm.StoppingCriteriaList([]))
def generate_one(self, prompt: str, stop: list[str],
**preprocess_generate_kwds: t.Any) -> list[dict[t.Literal['generated_text'], str]]:
max_new_tokens, encoded_inputs = preprocess_generate_kwds.pop('max_new_tokens', 200), self.tokenizer(
prompt, return_tensors='pt').to(self.device)
src_len, stopping_criteria = encoded_inputs['input_ids'].shape[1], preprocess_generate_kwds.pop(
'stopping_criteria', openllm.StoppingCriteriaList([]))
stopping_criteria.append(openllm.StopSequenceCriteria(stop, self.tokenizer))
result = self.tokenizer.decode(self.model.generate(encoded_inputs['input_ids'], max_new_tokens=max_new_tokens, stopping_criteria=stopping_criteria)[0].tolist()[src_len:])
result = self.tokenizer.decode(
self.model.generate(encoded_inputs['input_ids'],
max_new_tokens=max_new_tokens,
stopping_criteria=stopping_criteria)[0].tolist()[src_len:])
# Inference API returns the stop sequence
for stop_seq in stop:
if result.endswith(stop_seq): result = result[:-len(stop_seq)]

View File

@@ -12,9 +12,10 @@ class FlanT5(openllm.LLM['transformers.T5ForConditionalGeneration', 'transformer
import torch
with torch.inference_mode():
return self.tokenizer.batch_decode(
self.model.generate(**self.tokenizer(prompt, return_tensors='pt').to(self.device), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()),
skip_special_tokens=True
)
self.model.generate(**self.tokenizer(prompt, return_tensors='pt').to(self.device),
do_sample=True,
generation_config=self.config.model_construct_env(**attrs).to_generation_config()),
skip_special_tokens=True)
def embeddings(self, prompts: list[str]) -> openllm.LLMEmbeddings:
import torch

View File

@@ -9,18 +9,16 @@ if t.TYPE_CHECKING: import transformers
class FlaxFlanT5(openllm.LLM['transformers.FlaxT5ForConditionalGeneration', 'transformers.T5TokenizerFast']):
__openllm_internal__ = True
def sanitize_parameters(
self,
prompt: str,
max_new_tokens: int | None = None,
temperature: float | None = None,
top_k: int | None = None,
top_p: float | None = None,
repetition_penalty: float | None = None,
decoder_start_token_id: int | None = None,
use_default_prompt_template: bool = True,
**attrs: t.Any
) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
def sanitize_parameters(self,
prompt: str,
max_new_tokens: int | None = None,
temperature: float | None = None,
top_k: int | None = None,
top_p: float | None = None,
repetition_penalty: float | None = None,
decoder_start_token_id: int | None = None,
use_default_prompt_template: bool = True,
**attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
if decoder_start_token_id is None: decoder_start_token_id = 0
return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {
'max_new_tokens': max_new_tokens,
@@ -34,13 +32,10 @@ class FlaxFlanT5(openllm.LLM['transformers.FlaxT5ForConditionalGeneration', 'tra
def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
# NOTE: decoder_start_token_id is extracted from https://huggingface.co/google/flan-t5-small/tree/main as it is required for encoder-decoder generation.
decoder_start_token_id = attrs.pop('decoder_start_token_id', 0)
return self.tokenizer.batch_decode(
self.model.generate(
self.tokenizer(prompt, return_tensors='np')['input_ids'],
do_sample=True,
generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
decoder_start_token_id=decoder_start_token_id
).sequences,
skip_special_tokens=True,
clean_up_tokenization_spaces=True
)
return self.tokenizer.batch_decode(self.model.generate(
self.tokenizer(prompt, return_tensors='np')['input_ids'],
do_sample=True,
generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
decoder_start_token_id=decoder_start_token_id).sequences,
skip_special_tokens=True,
clean_up_tokenization_spaces=True)

View File

@@ -8,7 +8,8 @@ class TFFlanT5(openllm.LLM['transformers.TFT5ForConditionalGeneration', 'transfo
__openllm_internal__ = True
def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
return self.tokenizer.batch_decode(
self.model.generate(self.tokenizer(prompt, return_tensors='tf').input_ids, do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()),
skip_special_tokens=True
)
return self.tokenizer.batch_decode(self.model.generate(
self.tokenizer(prompt, return_tensors='tf').input_ids,
do_sample=True,
generation_config=self.config.model_construct_env(**attrs).to_generation_config()),
skip_special_tokens=True)

View File

@@ -25,11 +25,8 @@ class GPTNeoX(openllm.LLM['transformers.GPTNeoXForCausalLM', 'transformers.GPTNe
import torch
with torch.inference_mode():
return self.tokenizer.batch_decode(
self.model.generate(
self.tokenizer(prompt, return_tensors='pt').to(self.device).input_ids,
do_sample=True,
generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
pad_token_id=self.tokenizer.eos_token_id,
stopping_criteria=openllm.StoppingCriteriaList([openllm.StopOnTokens()])
)
)
self.model.generate(self.tokenizer(prompt, return_tensors='pt').to(self.device).input_ids,
do_sample=True,
generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
pad_token_id=self.tokenizer.eos_token_id,
stopping_criteria=openllm.StoppingCriteriaList([openllm.StopOnTokens()])))

View File

@@ -23,13 +23,20 @@ class Llama(openllm.LLM['transformers.LlamaForCausalLM', 'transformers.LlamaToke
mask = attention_mask.unsqueeze(-1).expand(data.size()).float()
masked_embeddings = data * mask
sum_embeddings, seq_length = torch.sum(masked_embeddings, dim=1), torch.sum(mask, dim=1)
return openllm.LLMEmbeddings(embeddings=F.normalize(sum_embeddings / seq_length, p=2, dim=1).tolist(), num_tokens=int(torch.sum(attention_mask).item()))
return openllm.LLMEmbeddings(embeddings=F.normalize(sum_embeddings / seq_length, p=2, dim=1).tolist(),
num_tokens=int(torch.sum(attention_mask).item()))
def generate_one(self, prompt: str, stop: list[str], **preprocess_generate_kwds: t.Any) -> list[dict[t.Literal['generated_text'], str]]:
max_new_tokens, encoded_inputs = preprocess_generate_kwds.pop('max_new_tokens', 200), self.tokenizer(prompt, return_tensors='pt').to(self.device)
src_len, stopping_criteria = encoded_inputs['input_ids'].shape[1], preprocess_generate_kwds.pop('stopping_criteria', openllm.StoppingCriteriaList([]))
def generate_one(self, prompt: str, stop: list[str],
**preprocess_generate_kwds: t.Any) -> list[dict[t.Literal['generated_text'], str]]:
max_new_tokens, encoded_inputs = preprocess_generate_kwds.pop('max_new_tokens', 200), self.tokenizer(
prompt, return_tensors='pt').to(self.device)
src_len, stopping_criteria = encoded_inputs['input_ids'].shape[1], preprocess_generate_kwds.pop(
'stopping_criteria', openllm.StoppingCriteriaList([]))
stopping_criteria.append(openllm.StopSequenceCriteria(stop, self.tokenizer))
result = self.tokenizer.decode(self.model.generate(encoded_inputs['input_ids'], max_new_tokens=max_new_tokens, stopping_criteria=stopping_criteria)[0].tolist()[src_len:])
result = self.tokenizer.decode(
self.model.generate(encoded_inputs['input_ids'],
max_new_tokens=max_new_tokens,
stopping_criteria=stopping_criteria)[0].tolist()[src_len:])
# Inference API returns the stop sequence
for stop_seq in stop:
if result.endswith(stop_seq): result = result[:-len(stop_seq)]

View File

@@ -12,12 +12,15 @@ if t.TYPE_CHECKING:
logger = logging.getLogger(__name__)
def get_mpt_config(
model_id_or_path: str, max_sequence_length: int, device: torch.device | str | int | None, device_map: str | None = None, trust_remote_code: bool = True
) -> transformers.PretrainedConfig:
def get_mpt_config(model_id_or_path: str,
max_sequence_length: int,
device: torch.device | str | int | None,
device_map: str | None = None,
trust_remote_code: bool = True) -> transformers.PretrainedConfig:
import torch
config = transformers.AutoConfig.from_pretrained(model_id_or_path, trust_remote_code=trust_remote_code)
if hasattr(config, 'init_device') and device_map is None and isinstance(device, (str, torch.device)): config.init_device = str(device)
if hasattr(config, 'init_device') and device_map is None and isinstance(device, (str, torch.device)):
config.init_device = str(device)
if hasattr(config, 'attn_config') and is_triton_available(): config.attn_config['attn_impl'] = 'triton'
else:
logger.debug(
@@ -37,7 +40,10 @@ class MPT(openllm.LLM['transformers.PreTrainedModel', 'transformers.GPTNeoXToken
@property
def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]:
import torch
return {'device_map': 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None, 'torch_dtype': torch.bfloat16 if torch.cuda.is_available() else torch.float32}, {}
return {
'device_map': 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None,
'torch_dtype': torch.bfloat16 if torch.cuda.is_available() else torch.float32
}, {}
def import_model(self, *args: t.Any, trust_remote_code: bool = True, **attrs: t.Any) -> bentoml.Model:
import torch
@@ -46,12 +52,24 @@ class MPT(openllm.LLM['transformers.PreTrainedModel', 'transformers.GPTNeoXToken
torch_dtype = attrs.pop('torch_dtype', self.dtype)
device_map = attrs.pop('device_map', None)
attrs.pop('low_cpu_mem_usage', None)
config = get_mpt_config(self.model_id, self.config.max_sequence_length, self.device, device_map=device_map, trust_remote_code=trust_remote_code)
config = get_mpt_config(self.model_id,
self.config.max_sequence_length,
self.device,
device_map=device_map,
trust_remote_code=trust_remote_code)
tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_id, **tokenizer_attrs)
if tokenizer.pad_token_id is None: tokenizer.pad_token = tokenizer.eos_token
model = transformers.AutoModelForCausalLM.from_pretrained(self.model_id, config=config, torch_dtype=torch_dtype, trust_remote_code=trust_remote_code, device_map=device_map, **attrs)
model = transformers.AutoModelForCausalLM.from_pretrained(self.model_id,
config=config,
torch_dtype=torch_dtype,
trust_remote_code=trust_remote_code,
device_map=device_map,
**attrs)
try:
return bentoml.transformers.save_model(self.tag, model, custom_objects={'tokenizer': tokenizer}, labels=generate_labels(self))
return bentoml.transformers.save_model(self.tag,
model,
custom_objects={'tokenizer': tokenizer},
labels=generate_labels(self))
finally:
torch.cuda.empty_cache()
@@ -60,10 +78,18 @@ class MPT(openllm.LLM['transformers.PreTrainedModel', 'transformers.GPTNeoXToken
torch_dtype = attrs.pop('torch_dtype', self.dtype)
device_map = attrs.pop('device_map', None)
trust_remote_code = attrs.pop('trust_remote_code', True)
config = get_mpt_config(self._bentomodel.path, self.config.max_sequence_length, self.device, device_map=device_map, trust_remote_code=trust_remote_code,)
model = transformers.AutoModelForCausalLM.from_pretrained(
self._bentomodel.path, config=config, trust_remote_code=trust_remote_code, torch_dtype=torch_dtype, device_map=device_map, **attrs
)
config = get_mpt_config(self._bentomodel.path,
self.config.max_sequence_length,
self.device,
device_map=device_map,
trust_remote_code=trust_remote_code,
)
model = transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path,
config=config,
trust_remote_code=trust_remote_code,
torch_dtype=torch_dtype,
device_map=device_map,
**attrs)
model.tie_weights()
return model

View File

@@ -16,29 +16,35 @@ class FlaxOPT(openllm.LLM['transformers.TFOPTForCausalLM', 'transformers.GPT2Tok
__openllm_internal__ = True
def import_model(self, *args: t.Any, trust_remote_code: bool = False, **attrs: t.Any) -> bentoml.Model:
config, tokenizer = transformers.AutoConfig.from_pretrained(self.model_id), transformers.AutoTokenizer.from_pretrained(self.model_id, **self.llm_parameters[-1])
config, tokenizer = transformers.AutoConfig.from_pretrained(
self.model_id), transformers.AutoTokenizer.from_pretrained(self.model_id, **self.llm_parameters[-1])
tokenizer.pad_token_id = config.pad_token_id
return bentoml.transformers.save_model(
self.tag, transformers.FlaxAutoModelForCausalLM.from_pretrained(self.model_id, **attrs), custom_objects={'tokenizer': tokenizer}, labels=generate_labels(self)
)
return bentoml.transformers.save_model(self.tag,
transformers.FlaxAutoModelForCausalLM.from_pretrained(
self.model_id, **attrs),
custom_objects={'tokenizer': tokenizer},
labels=generate_labels(self))
def sanitize_parameters(
self,
prompt: str,
max_new_tokens: int | None = None,
temperature: float | None = None,
top_k: int | None = None,
num_return_sequences: int | None = None,
repetition_penalty: float | None = None,
use_default_prompt_template: bool = False,
**attrs: t.Any
) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
def sanitize_parameters(self,
prompt: str,
max_new_tokens: int | None = None,
temperature: float | None = None,
top_k: int | None = None,
num_return_sequences: int | None = None,
repetition_penalty: float | None = None,
use_default_prompt_template: bool = False,
**attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {
'max_new_tokens': max_new_tokens, 'temperature': temperature, 'top_k': top_k, 'num_return_sequences': num_return_sequences, 'repetition_penalty': repetition_penalty
'max_new_tokens': max_new_tokens,
'temperature': temperature,
'top_k': top_k,
'num_return_sequences': num_return_sequences,
'repetition_penalty': repetition_penalty
}, {}
def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
return self.tokenizer.batch_decode(
self.model.generate(**self.tokenizer(prompt, return_tensors='np'), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()).sequences,
skip_special_tokens=True
)
return self.tokenizer.batch_decode(self.model.generate(**self.tokenizer(prompt, return_tensors='np'),
do_sample=True,
generation_config=self.config.model_construct_env(
**attrs).to_generation_config()).sequences,
skip_special_tokens=True)

View File

@@ -19,6 +19,7 @@ class OPT(openllm.LLM['transformers.OPTForCausalLM', 'transformers.GPT2Tokenizer
import torch
with torch.inference_mode():
return self.tokenizer.batch_decode(
self.model.generate(**self.tokenizer(prompt, return_tensors='pt').to(self.device), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()),
skip_special_tokens=True
)
self.model.generate(**self.tokenizer(prompt, return_tensors='pt').to(self.device),
do_sample=True,
generation_config=self.config.model_construct_env(**attrs).to_generation_config()),
skip_special_tokens=True)

View File

@@ -11,17 +11,18 @@ class TFOPT(openllm.LLM['transformers.TFOPTForCausalLM', 'transformers.GPT2Token
def import_model(self, *args: t.Any, trust_remote_code: bool = False, **attrs: t.Any) -> bentoml.Model:
import transformers
config, tokenizer = transformers.AutoConfig.from_pretrained(self.model_id), transformers.AutoTokenizer.from_pretrained(self.model_id, **self.llm_parameters[-1])
config, tokenizer = transformers.AutoConfig.from_pretrained(
self.model_id), transformers.AutoTokenizer.from_pretrained(self.model_id, **self.llm_parameters[-1])
tokenizer.pad_token_id = config.pad_token_id
return bentoml.transformers.save_model(
self.tag,
transformers.TFOPTForCausalLM.from_pretrained(self.model_id, trust_remote_code=trust_remote_code, **attrs),
custom_objects={'tokenizer': tokenizer},
labels=generate_labels(self)
)
return bentoml.transformers.save_model(self.tag,
transformers.TFOPTForCausalLM.from_pretrained(
self.model_id, trust_remote_code=trust_remote_code, **attrs),
custom_objects={'tokenizer': tokenizer},
labels=generate_labels(self))
def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
return self.tokenizer.batch_decode(
self.model.generate(**self.tokenizer(prompt, return_tensors='tf'), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()),
skip_special_tokens=True
)
self.model.generate(**self.tokenizer(prompt, return_tensors='tf'),
do_sample=True,
generation_config=self.config.model_construct_env(**attrs).to_generation_config()),
skip_special_tokens=True)

View File

@@ -10,16 +10,17 @@ class VLLMOPT(openllm.LLM['vllm.LLMEngine', 'transformers.GPT2Tokenizer']):
__openllm_internal__ = True
tokenizer_id = 'local'
def sanitize_parameters(
self,
prompt: str,
max_new_tokens: int | None = None,
temperature: float | None = None,
top_k: int | None = None,
num_return_sequences: int | None = None,
use_default_prompt_template: bool = True,
**attrs: t.Any
) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
def sanitize_parameters(self,
prompt: str,
max_new_tokens: int | None = None,
temperature: float | None = None,
top_k: int | None = None,
num_return_sequences: int | None = None,
use_default_prompt_template: bool = True,
**attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {
'max_new_tokens': max_new_tokens, 'temperature': temperature, 'top_k': top_k, 'num_return_sequences': num_return_sequences
'max_new_tokens': max_new_tokens,
'temperature': temperature,
'top_k': top_k,
'num_return_sequences': num_return_sequences
}, {}

View File

@@ -22,13 +22,10 @@ class StableLM(openllm.LLM['transformers.GPTNeoXForCausalLM', 'transformers.GPTN
with torch.inference_mode():
return [
self.tokenizer.decode(
self.model.generate(
**self.tokenizer(prompt, return_tensors='pt').to(self.device),
do_sample=True,
generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
pad_token_id=self.tokenizer.eos_token_id,
stopping_criteria=openllm.StoppingCriteriaList([openllm.StopOnTokens()])
)[0],
skip_special_tokens=True
)
self.model.generate(**self.tokenizer(prompt, return_tensors='pt').to(self.device),
do_sample=True,
generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
pad_token_id=self.tokenizer.eos_token_id,
stopping_criteria=openllm.StoppingCriteriaList([openllm.StopOnTokens()]))[0],
skip_special_tokens=True)
]

View File

@@ -18,17 +18,29 @@ class StarCoder(openllm.LLM['transformers.GPTBigCodeForCausalLM', 'transformers.
@property
def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]:
import torch
return {'device_map': 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None, 'torch_dtype': torch.float16 if torch.cuda.is_available() else torch.float32}, {}
return {
'device_map': 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None,
'torch_dtype': torch.float16 if torch.cuda.is_available() else torch.float32
}, {}
def import_model(self, *args: t.Any, trust_remote_code: bool = False, **attrs: t.Any) -> bentoml.Model:
import torch
import transformers
torch_dtype, device_map = attrs.pop('torch_dtype', torch.float16), attrs.pop('device_map', 'auto')
tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_id, **self.llm_parameters[-1])
tokenizer.add_special_tokens({'additional_special_tokens': [EOD, FIM_PREFIX, FIM_MIDDLE, FIM_SUFFIX, FIM_PAD], 'pad_token': EOD})
model = transformers.AutoModelForCausalLM.from_pretrained(self.model_id, torch_dtype=torch_dtype, device_map=device_map, **attrs)
tokenizer.add_special_tokens({
'additional_special_tokens': [EOD, FIM_PREFIX, FIM_MIDDLE, FIM_SUFFIX, FIM_PAD],
'pad_token': EOD
})
model = transformers.AutoModelForCausalLM.from_pretrained(self.model_id,
torch_dtype=torch_dtype,
device_map=device_map,
**attrs)
try:
return bentoml.transformers.save_model(self.tag, model, custom_objects={'tokenizer': tokenizer}, labels=generate_labels(self))
return bentoml.transformers.save_model(self.tag,
model,
custom_objects={'tokenizer': tokenizer},
labels=generate_labels(self))
finally:
torch.cuda.empty_cache()
@@ -41,17 +53,22 @@ class StarCoder(openllm.LLM['transformers.GPTBigCodeForCausalLM', 'transformers.
self.tokenizer.encode(prompt, return_tensors='pt').to(self.device),
do_sample=True,
pad_token_id=self.tokenizer.eos_token_id,
generation_config=self.config.model_construct_env(**attrs).to_generation_config()
)
generation_config=self.config.model_construct_env(**attrs).to_generation_config())
# TODO: We will probably want to return the tokenizer here so that we can manually process this
# return (skip_special_tokens=False, clean_up_tokenization_spaces=False))
return self.tokenizer.batch_decode(result_tensor[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
def generate_one(self, prompt: str, stop: list[str], **preprocess_generate_kwds: t.Any) -> list[dict[t.Literal['generated_text'], str]]:
max_new_tokens, encoded_inputs = preprocess_generate_kwds.pop('max_new_tokens', 200), self.tokenizer(prompt, return_tensors='pt').to(self.device)
src_len, stopping_criteria = encoded_inputs['input_ids'].shape[1], preprocess_generate_kwds.pop('stopping_criteria', openllm.StoppingCriteriaList([]))
def generate_one(self, prompt: str, stop: list[str],
**preprocess_generate_kwds: t.Any) -> list[dict[t.Literal['generated_text'], str]]:
max_new_tokens, encoded_inputs = preprocess_generate_kwds.pop('max_new_tokens', 200), self.tokenizer(
prompt, return_tensors='pt').to(self.device)
src_len, stopping_criteria = encoded_inputs['input_ids'].shape[1], preprocess_generate_kwds.pop(
'stopping_criteria', openllm.StoppingCriteriaList([]))
stopping_criteria.append(openllm.StopSequenceCriteria(stop, self.tokenizer))
result = self.tokenizer.decode(self.model.generate(encoded_inputs['input_ids'], max_new_tokens=max_new_tokens, stopping_criteria=stopping_criteria)[0].tolist()[src_len:])
result = self.tokenizer.decode(
self.model.generate(encoded_inputs['input_ids'],
max_new_tokens=max_new_tokens,
stopping_criteria=stopping_criteria)[0].tolist()[src_len:])
# Inference API returns the stop sequence
for stop_seq in stop:
if result.endswith(stop_seq): result = result[:-len(stop_seq)]

View File

@@ -56,20 +56,34 @@ if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
else:
model_args, training_args = t.cast(t.Tuple[ModelArguments, TrainingArguments], parser.parse_args_into_dataclasses())
model, tokenizer = openllm.AutoLLM.for_model("falcon", model_id=model_args.model_id, quantize="int4", bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.float16, ensure_available=True).prepare_for_training(adapter_type="lora", lora_alpha=16, lora_dropout=0.1, r=16, bias="none", target_modules=["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"])
model, tokenizer = openllm.AutoLLM.for_model("falcon",
model_id=model_args.model_id,
quantize="int4",
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.float16,
ensure_available=True).prepare_for_training(adapter_type="lora",
lora_alpha=16,
lora_dropout=0.1,
r=16,
bias="none",
target_modules=[
"query_key_value", "dense",
"dense_h_to_4h",
"dense_4h_to_h"
])
model.config.use_cache = False
tokenizer.pad_token = tokenizer.eos_token
dataset = load_dataset(DATASET_NAME, split="train")
trainer = SFTTrainer(
model=model,
train_dataset=dataset,
dataset_text_field="text",
max_seq_length=model_args.max_sequence_length,
tokenizer=tokenizer,
args=dataclasses.replace(transformers.TrainingArguments(training_args.output_dir), **dataclasses.asdict(training_args)),
)
trainer = SFTTrainer(model=model,
train_dataset=dataset,
dataset_text_field="text",
max_seq_length=model_args.max_sequence_length,
tokenizer=tokenizer,
args=dataclasses.replace(transformers.TrainingArguments(training_args.output_dir),
**dataclasses.asdict(training_args)),
)
# upcast layernorm in float32 for more stable training
for name, module in trainer.model.named_modules():

View File

@@ -75,10 +75,13 @@ def chunk(sample, chunk_length=2048):
# get max number of chunks for batch
if batch_total_length >= chunk_length:
batch_chunk_length = (batch_total_length//chunk_length) * chunk_length
batch_chunk_length = (batch_total_length // chunk_length) * chunk_length
# Split by chunks of max_len.
result = {k: [t[i:i + chunk_length] for i in range(0, batch_chunk_length, chunk_length)] for k, t in concatenated_examples.items()}
result = {
k: [t[i:i + chunk_length] for i in range(0, batch_chunk_length, chunk_length)]
for k, t in concatenated_examples.items()
}
# add remainder to global variable for next batch
remainder = {k: concatenated_examples[k][batch_chunk_length:] for k in concatenated_examples.keys()}
# prepare labels
@@ -98,33 +101,39 @@ def prepare_datasets(tokenizer, dataset_name=DATASET_NAME):
print("Sample from dolly-v2 ds:", dataset[randint(0, len(dataset))]["text"])
# tokenize and chunk dataset
lm_dataset = dataset.map(lambda sample: tokenizer(sample["text"]), batched=True, remove_columns=list(dataset.features)).map(partial(chunk, chunk_length=2048), batched=True,)
lm_dataset = dataset.map(lambda sample: tokenizer(sample["text"]),
batched=True,
remove_columns=list(dataset.features)).map(partial(chunk, chunk_length=2048), batched=True)
# Print total number of samples
print(f"Total number of samples: {len(lm_dataset)}")
return lm_dataset
def prepare_for_int4_training(model_id: str, model_version: str | None = None, gradient_checkpointing: bool = True, bf16: bool = True,
) -> tuple[peft.PeftModel, transformers.LlamaTokenizerFast]:
def prepare_for_int4_training(model_id: str,
model_version: str | None = None,
gradient_checkpointing: bool = True,
bf16: bool = True,
) -> tuple[peft.PeftModel, transformers.LlamaTokenizerFast]:
from peft.tuners.lora import LoraLayer
llm = openllm.AutoLLM.for_model(
"llama",
model_id=model_id,
model_version=model_version,
ensure_available=True,
quantize="int4",
bnb_4bit_compute_dtype=torch.bfloat16,
use_cache=not gradient_checkpointing,
device_map="auto",
)
llm = openllm.AutoLLM.for_model("llama",
model_id=model_id,
model_version=model_version,
ensure_available=True,
quantize="int4",
bnb_4bit_compute_dtype=torch.bfloat16,
use_cache=not gradient_checkpointing,
device_map="auto",
)
print("Model summary:", llm.model)
# get lora target modules
modules = find_all_linear_names(llm.model)
print(f"Found {len(modules)} modules to quantize: {modules}")
model, tokenizer = llm.prepare_for_training(adapter_type="lora", use_gradient_checkpointing=gradient_checkpointing, target_modules=modules)
model, tokenizer = llm.prepare_for_training(adapter_type="lora",
use_gradient_checkpointing=gradient_checkpointing,
target_modules=modules)
# pre-process the model by upcasting the layer norms in float 32 for
for name, module in model.named_modules():
@@ -177,15 +186,18 @@ def train_loop(model_args: ModelArguments, training_args: TrainingArguments):
transformers.set_seed(model_args.seed)
model, tokenizer = prepare_for_int4_training(model_args.model_id, gradient_checkpointing=training_args.gradient_checkpointing, bf16=training_args.bf16,)
model, tokenizer = prepare_for_int4_training(model_args.model_id,
gradient_checkpointing=training_args.gradient_checkpointing,
bf16=training_args.bf16,
)
datasets = prepare_datasets(tokenizer)
trainer = transformers.Trainer(
model=model,
args=dataclasses.replace(transformers.TrainingArguments(training_args.output_dir), **dataclasses.asdict(training_args)),
train_dataset=datasets,
data_collator=transformers.default_data_collator,
)
trainer = transformers.Trainer(model=model,
args=dataclasses.replace(transformers.TrainingArguments(training_args.output_dir),
**dataclasses.asdict(training_args)),
train_dataset=datasets,
data_collator=transformers.default_data_collator,
)
trainer.train()
@@ -200,10 +212,14 @@ def train_loop(model_args: ModelArguments, training_args: TrainingArguments):
del model, trainer
torch.cuda.empty_cache()
model = peft.AutoPeftModelForCausalLM.from_pretrained(training_args.output_dir, low_cpu_mem_usage=True, torch_dtype=torch.float16)
model = peft.AutoPeftModelForCausalLM.from_pretrained(training_args.output_dir,
low_cpu_mem_usage=True,
torch_dtype=torch.float16)
# merge lora with base weights and save
model = model.merge_and_unload()
model.save_pretrained(os.path.join(os.getcwd(), "outputs", "merged_llama_lora"), safe_serialization=True, max_shard_size="2GB")
model.save_pretrained(os.path.join(os.getcwd(), "outputs", "merged_llama_lora"),
safe_serialization=True,
max_shard_size="2GB")
else:
trainer.model.save_pretrained(os.path.join(training_args.output_dir, "lora"))

View File

@@ -26,13 +26,14 @@ if t.TYPE_CHECKING:
DEFAULT_MODEL_ID = "facebook/opt-6.7b"
def load_trainer(model: PeftModel, tokenizer: transformers.GPT2TokenizerFast, dataset_dict: t.Any, training_args: TrainingArguments):
return transformers.Trainer(
model=model,
train_dataset=dataset_dict["train"],
args=dataclasses.replace(transformers.TrainingArguments(training_args.output_dir), **dataclasses.asdict(training_args)),
data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
def load_trainer(model: PeftModel, tokenizer: transformers.GPT2TokenizerFast, dataset_dict: t.Any,
training_args: TrainingArguments):
return transformers.Trainer(model=model,
train_dataset=dataset_dict["train"],
args=dataclasses.replace(transformers.TrainingArguments(training_args.output_dir),
**dataclasses.asdict(training_args)),
data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
@dataclasses.dataclass
class TrainingArguments:
@@ -57,7 +58,16 @@ if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
else:
model_args, training_args = t.cast(t.Tuple[ModelArguments, TrainingArguments], parser.parse_args_into_dataclasses())
model, tokenizer = openllm.AutoLLM.for_model("opt", model_id=model_args.model_id, quantize="int8", ensure_available=True).prepare_for_training(adapter_type="lora", r=16, lora_alpha=32, target_modules=["q_proj", "v_proj"], lora_dropout=0.05, bias="none")
model, tokenizer = openllm.AutoLLM.for_model("opt",
model_id=model_args.model_id,
quantize="int8",
ensure_available=True).prepare_for_training(
adapter_type="lora",
r=16,
lora_alpha=32,
target_modules=["q_proj", "v_proj"],
lora_dropout=0.05,
bias="none")
# ft on english_quotes
data = load_dataset("Abirate/english_quotes")

View File

@@ -64,10 +64,11 @@ def load_tokenizer(llm: openllm.LLM[t.Any, T], **tokenizer_attrs: t.Any) -> T:
raise openllm.exceptions.OpenLLMException(
"Bento model does not have tokenizer. Make sure to save"
" the tokenizer within the model via 'custom_objects'."
" For example: \"bentoml.transformers.save_model(..., custom_objects={'tokenizer': tokenizer})\""
) from None
" For example: \"bentoml.transformers.save_model(..., custom_objects={'tokenizer': tokenizer})\"") from None
else:
tokenizer = infer_tokenizers_from_llm(llm).from_pretrained(bentomodel_fs.getsyspath('/'), trust_remote_code=llm.__llm_trust_remote_code__, **tokenizer_attrs)
tokenizer = infer_tokenizers_from_llm(llm).from_pretrained(bentomodel_fs.getsyspath('/'),
trust_remote_code=llm.__llm_trust_remote_code__,
**tokenizer_attrs)
if tokenizer.pad_token_id is None:
if config.pad_token_id is not None: tokenizer.pad_token_id = config.pad_token_id
@@ -77,12 +78,14 @@ def load_tokenizer(llm: openllm.LLM[t.Any, T], **tokenizer_attrs: t.Any) -> T:
return tokenizer
class _Caller(t.Protocol[P]):
def __call__(self, llm: openllm.LLM[M, T], *args: P.args, **kwargs: P.kwargs) -> t.Any:
...
_extras = ['get', 'import_model', 'save_pretrained', 'load_model']
def _make_dispatch_function(fn: str) -> _Caller[P]:
def caller(llm: openllm.LLM[M, T], *args: P.args, **kwargs: P.kwargs) -> t.Any:
"""Generic function dispatch to correct serialisation submodules based on LLM runtime.

View File

@@ -6,4 +6,7 @@ FRAMEWORK_TO_AUTOCLASS_MAPPING = {
'flax': ('FlaxAutoModelForCausalLM', 'FlaxAutoModelForSeq2SeqLM'),
'vllm': ('AutoModelForCausalLM', 'AutoModelForSeq2SeqLM')
}
HUB_ATTRS = ['cache_dir', 'code_revision', 'force_download', 'local_files_only', 'proxies', 'resume_download', 'revision', 'subfolder', 'use_auth_token']
HUB_ATTRS = [
'cache_dir', 'code_revision', 'force_download', 'local_files_only', 'proxies', 'resume_download', 'revision',
'subfolder', 'use_auth_token'
]

View File

@@ -13,7 +13,11 @@ if t.TYPE_CHECKING:
_conversion_strategy = {'pt': 'ggml'}
def import_model(llm: openllm.LLM[t.Any, t.Any], *decls: t.Any, trust_remote_code: bool = True, **attrs: t.Any,) -> bentoml.Model:
def import_model(llm: openllm.LLM[t.Any, t.Any],
*decls: t.Any,
trust_remote_code: bool = True,
**attrs: t.Any,
) -> bentoml.Model:
raise NotImplementedError('Currently work in progress.')
def get(llm: openllm.LLM[t.Any, t.Any], auto_import: bool = False) -> bentoml.Model:
@@ -27,9 +31,12 @@ def get(llm: openllm.LLM[t.Any, t.Any], auto_import: bool = False) -> bentoml.Mo
try:
model = bentoml.models.get(llm.tag)
if model.info.module not in ('openllm.serialisation.ggml', __name__):
raise bentoml.exceptions.NotFound(f"Model {model.tag} was saved with module {model.info.module}, not loading with 'openllm.serialisation.transformers'.")
raise bentoml.exceptions.NotFound(
f"Model {model.tag} was saved with module {model.info.module}, not loading with 'openllm.serialisation.transformers'."
)
if 'runtime' in model.info.labels and model.info.labels['runtime'] != llm.runtime:
raise openllm.exceptions.OpenLLMException(f"Model {model.tag} was saved with runtime {model.info.labels['runtime']}, not loading with {llm.runtime}.")
raise openllm.exceptions.OpenLLMException(
f"Model {model.tag} was saved with runtime {model.info.labels['runtime']}, not loading with {llm.runtime}.")
return model
except bentoml.exceptions.NotFound:
if auto_import:

View File

@@ -46,7 +46,11 @@ logger = logging.getLogger(__name__)
__all__ = ['import_model', 'get', 'load_model', 'save_pretrained']
@inject
def import_model(llm: openllm.LLM[M, T], *decls: t.Any, trust_remote_code: bool, _model_store: ModelStore = Provide[BentoMLContainer.model_store], **attrs: t.Any) -> bentoml.Model:
def import_model(llm: openllm.LLM[M, T],
*decls: t.Any,
trust_remote_code: bool,
_model_store: ModelStore = Provide[BentoMLContainer.model_store],
**attrs: t.Any) -> bentoml.Model:
"""Auto detect model type from given model_id and import it to bentoml's model store.
For all kwargs, it will be parsed into `transformers.AutoConfig.from_pretrained` first,
@@ -67,80 +71,110 @@ def import_model(llm: openllm.LLM[M, T], *decls: t.Any, trust_remote_code: bool,
config, hub_attrs, attrs = process_config(llm.model_id, trust_remote_code, **attrs)
_, tokenizer_attrs = llm.llm_parameters
quantize_method = llm._quantize_method
safe_serialisation = openllm.utils.first_not_none(attrs.get('safe_serialization'), default=llm._serialisation_format == 'safetensors')
safe_serialisation = openllm.utils.first_not_none(attrs.get('safe_serialization'),
default=llm._serialisation_format == 'safetensors')
# Disable safe serialization with vLLM
if llm.__llm_implementation__ == 'vllm': safe_serialisation = False
metadata: DictStrAny = {'safe_serialisation': safe_serialisation, '_quantize': quantize_method is not None and quantize_method}
metadata: DictStrAny = {
'safe_serialisation': safe_serialisation,
'_quantize': quantize_method is not None and quantize_method
}
signatures: DictStrAny = {}
if quantize_method == 'gptq':
if not openllm.utils.is_autogptq_available():
raise openllm.exceptions.OpenLLMException("GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'")
if llm.config['model_type'] != 'causal_lm': raise openllm.exceptions.OpenLLMException(f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})")
raise openllm.exceptions.OpenLLMException(
"GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'"
)
if llm.config['model_type'] != 'causal_lm':
raise openllm.exceptions.OpenLLMException(
f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})")
signatures['generate'] = {'batchable': False}
else:
# this model might be called with --quantize int4, therefore we need to pop this out
# since saving int4 is not yet supported
if 'quantization_config' in attrs and getattr(attrs['quantization_config'], 'load_in_4bit', False): attrs.pop('quantization_config')
if 'quantization_config' in attrs and getattr(attrs['quantization_config'], 'load_in_4bit', False):
attrs.pop('quantization_config')
if llm.__llm_implementation__ != 'flax': attrs['use_safetensors'] = safe_serialisation
metadata['_framework'] = 'pt' if llm.__llm_implementation__ == 'vllm' else llm.__llm_implementation__
tokenizer = infer_tokenizers_from_llm(llm).from_pretrained(llm.model_id, trust_remote_code=trust_remote_code, **hub_attrs, **tokenizer_attrs)
tokenizer = infer_tokenizers_from_llm(llm).from_pretrained(llm.model_id,
trust_remote_code=trust_remote_code,
**hub_attrs,
**tokenizer_attrs)
if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token
external_modules: list[types.ModuleType] = [importlib.import_module(tokenizer.__module__)]
imported_modules: list[types.ModuleType] = []
bentomodel = bentoml.Model.create(
llm.tag,
module='openllm.serialisation.transformers',
api_version='v1',
options=ModelOptions(),
context=openllm.utils.generate_context(framework_name='openllm'),
labels=openllm.utils.generate_labels(llm),
signatures=signatures if signatures else make_model_signatures(llm)
)
bentomodel = bentoml.Model.create(llm.tag,
module='openllm.serialisation.transformers',
api_version='v1',
options=ModelOptions(),
context=openllm.utils.generate_context(framework_name='openllm'),
labels=openllm.utils.generate_labels(llm),
signatures=signatures if signatures else make_model_signatures(llm))
with openllm.utils.analytics.set_bentoml_tracking():
try:
bentomodel.enter_cloudpickle_context(external_modules, imported_modules)
tokenizer.save_pretrained(bentomodel.path)
if quantize_method == 'gptq':
if not openllm.utils.is_autogptq_available():
raise openllm.exceptions.OpenLLMException("GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'")
if llm.config['model_type'] != 'causal_lm': raise openllm.exceptions.OpenLLMException(f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})")
raise openllm.exceptions.OpenLLMException(
"GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'"
)
if llm.config['model_type'] != 'causal_lm':
raise openllm.exceptions.OpenLLMException(
f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})")
logger.debug('Saving model with GPTQ quantisation will require loading model into memory.')
model = autogptq.AutoGPTQForCausalLM.from_quantized(
llm.model_id,
*decls,
quantize_config=t.cast('autogptq.BaseQuantizeConfig', llm.quantization_config),
trust_remote_code=trust_remote_code,
use_safetensors=safe_serialisation,
**hub_attrs,
**attrs,
)
update_model(bentomodel, metadata={'_pretrained_class': model.__class__.__name__, '_framework': model.model.framework})
model = autogptq.AutoGPTQForCausalLM.from_quantized(llm.model_id,
*decls,
quantize_config=t.cast('autogptq.BaseQuantizeConfig',
llm.quantization_config),
trust_remote_code=trust_remote_code,
use_safetensors=safe_serialisation,
**hub_attrs,
**attrs,
)
update_model(bentomodel,
metadata={
'_pretrained_class': model.__class__.__name__,
'_framework': model.model.framework
})
model.save_quantized(bentomodel.path, use_safetensors=safe_serialisation)
else:
architectures = getattr(config, 'architectures', [])
if not architectures:
raise RuntimeError('Failed to determine the architecture for this model. Make sure the `config.json` is valid and can be loaded with `transformers.AutoConfig`')
raise RuntimeError(
'Failed to determine the architecture for this model. Make sure the `config.json` is valid and can be loaded with `transformers.AutoConfig`'
)
architecture = architectures[0]
update_model(bentomodel, metadata={'_pretrained_class': architecture})
if llm._local:
# possible local path
logger.debug('Model will be loaded into memory to save to target store as it is from local path.')
model = infer_autoclass_from_llm(llm, config).from_pretrained(llm.model_id, *decls, config=config, trust_remote_code=trust_remote_code, **hub_attrs, **attrs)
model = infer_autoclass_from_llm(llm, config).from_pretrained(llm.model_id,
*decls,
config=config,
trust_remote_code=trust_remote_code,
**hub_attrs,
**attrs)
# for trust_remote_code to work
bentomodel.enter_cloudpickle_context([importlib.import_module(model.__module__)], imported_modules)
model.save_pretrained(bentomodel.path, max_shard_size='5GB', safe_serialization=safe_serialisation)
else:
# we will clone the all tings into the bentomodel path without loading model into memory
snapshot_download(llm.model_id, local_dir=bentomodel.path, local_dir_use_symlinks=False, ignore_patterns=HfIgnore.ignore_patterns(llm))
snapshot_download(llm.model_id,
local_dir=bentomodel.path,
local_dir_use_symlinks=False,
ignore_patterns=HfIgnore.ignore_patterns(llm))
except Exception:
raise
else:
bentomodel.flush() # type: ignore[no-untyped-call]
bentomodel.save(_model_store)
openllm.utils.analytics.track(openllm.utils.analytics.ModelSaveEvent(module=bentomodel.info.module, model_size_in_kb=openllm.utils.calc_dir_size(bentomodel.path) / 1024))
openllm.utils.analytics.track(
openllm.utils.analytics.ModelSaveEvent(module=bentomodel.info.module,
model_size_in_kb=openllm.utils.calc_dir_size(bentomodel.path) / 1024))
finally:
bentomodel.exit_cloudpickle_context(imported_modules)
# NOTE: We need to free up the cache after importing the model
@@ -158,13 +192,15 @@ def get(llm: openllm.LLM[M, T], auto_import: bool = False) -> bentoml.Model:
'''
try:
model = bentoml.models.get(llm.tag)
if model.info.module not in (
'openllm.serialisation.transformers'
'bentoml.transformers', 'bentoml._internal.frameworks.transformers', __name__
): # NOTE: backward compatible with previous version of OpenLLM.
raise bentoml.exceptions.NotFound(f"Model {model.tag} was saved with module {model.info.module}, not loading with 'openllm.serialisation.transformers'.")
if model.info.module not in ('openllm.serialisation.transformers'
'bentoml.transformers', 'bentoml._internal.frameworks.transformers',
__name__): # NOTE: backward compatible with previous version of OpenLLM.
raise bentoml.exceptions.NotFound(
f"Model {model.tag} was saved with module {model.info.module}, not loading with 'openllm.serialisation.transformers'."
)
if 'runtime' in model.info.labels and model.info.labels['runtime'] != llm.runtime:
raise openllm.exceptions.OpenLLMException(f"Model {model.tag} was saved with runtime {model.info.labels['runtime']}, not loading with {llm.runtime}.")
raise openllm.exceptions.OpenLLMException(
f"Model {model.tag} was saved with runtime {model.info.labels['runtime']}, not loading with {llm.runtime}.")
return model
except bentoml.exceptions.NotFound as err:
if auto_import: return import_model(llm, trust_remote_code=llm.__llm_trust_remote_code__)
@@ -177,44 +213,50 @@ def load_model(llm: openllm.LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M:
If model is not found, it will raises a ``bentoml.exceptions.NotFound``.
'''
config, hub_attrs, attrs = process_config(llm.model_id, llm.__llm_trust_remote_code__, **attrs)
safe_serialization = openllm.utils.first_not_none(
t.cast(t.Optional[bool], llm._bentomodel.info.metadata.get('safe_serialisation', None)), attrs.pop('safe_serialization', None), default=llm._serialisation_format == 'safetensors'
)
safe_serialization = openllm.utils.first_not_none(t.cast(
t.Optional[bool], llm._bentomodel.info.metadata.get('safe_serialisation', None)),
attrs.pop('safe_serialization', None),
default=llm._serialisation_format == 'safetensors')
if '_quantize' in llm._bentomodel.info.metadata and llm._bentomodel.info.metadata['_quantize'] == 'gptq':
if not openllm.utils.is_autogptq_available():
raise openllm.exceptions.OpenLLMException("GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'")
if llm.config['model_type'] != 'causal_lm': raise openllm.exceptions.OpenLLMException(f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})")
return autogptq.AutoGPTQForCausalLM.from_quantized(
llm._bentomodel.path,
*decls,
quantize_config=t.cast('autogptq.BaseQuantizeConfig', llm.quantization_config),
trust_remote_code=llm.__llm_trust_remote_code__,
use_safetensors=safe_serialization,
**hub_attrs,
**attrs
)
raise openllm.exceptions.OpenLLMException(
"GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'"
)
if llm.config['model_type'] != 'causal_lm':
raise openllm.exceptions.OpenLLMException(
f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})")
return autogptq.AutoGPTQForCausalLM.from_quantized(llm._bentomodel.path,
*decls,
quantize_config=t.cast('autogptq.BaseQuantizeConfig',
llm.quantization_config),
trust_remote_code=llm.__llm_trust_remote_code__,
use_safetensors=safe_serialization,
**hub_attrs,
**attrs)
device_map = attrs.pop('device_map', 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None)
model = infer_autoclass_from_llm(llm, config).from_pretrained(
llm._bentomodel.path, *decls, config=config, trust_remote_code=llm.__llm_trust_remote_code__, device_map=device_map, **hub_attrs, **attrs
).eval()
model = infer_autoclass_from_llm(llm, config).from_pretrained(llm._bentomodel.path,
*decls,
config=config,
trust_remote_code=llm.__llm_trust_remote_code__,
device_map=device_map,
**hub_attrs,
**attrs).eval()
# BetterTransformer is currently only supported on PyTorch.
if llm.bettertransformer and isinstance(model, transformers.PreTrainedModel): model = model.to_bettertransformer()
if llm.__llm_implementation__ in {'pt', 'vllm'}: check_unintialised_params(model)
return t.cast('M', model)
def save_pretrained(
llm: openllm.LLM[M, T],
save_directory: str,
is_main_process: bool = True,
state_dict: DictStrAny | None = None,
save_function: t.Any | None = None,
push_to_hub: bool = False,
max_shard_size: int | str = '10GB',
safe_serialization: bool = False,
variant: str | None = None,
**attrs: t.Any
) -> None:
def save_pretrained(llm: openllm.LLM[M, T],
save_directory: str,
is_main_process: bool = True,
state_dict: DictStrAny | None = None,
save_function: t.Any | None = None,
push_to_hub: bool = False,
max_shard_size: int | str = '10GB',
safe_serialization: bool = False,
variant: str | None = None,
**attrs: t.Any) -> None:
save_function = t.cast(t.Callable[..., None], openllm.utils.first_not_none(save_function, default=torch.save))
model_save_attrs, tokenizer_save_attrs = openllm.utils.normalize_attrs_to_model_tokenizer_pair(**attrs)
safe_serialization = safe_serialization or llm._serialisation_format == 'safetensors'
@@ -222,25 +264,31 @@ def save_pretrained(
if llm.__llm_implementation__ == 'vllm': safe_serialization = False
if llm._quantize_method == 'gptq':
if not openllm.utils.is_autogptq_available():
raise openllm.exceptions.OpenLLMException("GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'")
if llm.config['model_type'] != 'causal_lm': raise openllm.exceptions.OpenLLMException(f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})")
if not openllm.utils.lenient_issubclass(llm.model, autogptq.modeling.BaseGPTQForCausalLM): raise ValueError(f'Model is not a BaseGPTQForCausalLM (type: {type(llm.model)})')
t.cast('autogptq.modeling.BaseGPTQForCausalLM', llm.model).save_quantized(save_directory, use_safetensors=safe_serialization)
raise openllm.exceptions.OpenLLMException(
"GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'"
)
if llm.config['model_type'] != 'causal_lm':
raise openllm.exceptions.OpenLLMException(
f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})")
if not openllm.utils.lenient_issubclass(llm.model, autogptq.modeling.BaseGPTQForCausalLM):
raise ValueError(f'Model is not a BaseGPTQForCausalLM (type: {type(llm.model)})')
t.cast('autogptq.modeling.BaseGPTQForCausalLM', llm.model).save_quantized(save_directory,
use_safetensors=safe_serialization)
elif openllm.utils.LazyType['vllm.LLMEngine']('vllm.LLMEngine').isinstance(llm.model):
raise RuntimeError("vllm.LLMEngine cannot be serialisation directly. This happens when 'save_pretrained' is called directly after `openllm.AutoVLLM` is initialized.")
raise RuntimeError(
"vllm.LLMEngine cannot be serialisation directly. This happens when 'save_pretrained' is called directly after `openllm.AutoVLLM` is initialized."
)
elif isinstance(llm.model, transformers.Pipeline):
llm.model.save_pretrained(save_directory, safe_serialization=safe_serialization)
else:
# We can safely cast here since it will be the PreTrainedModel protocol.
t.cast('transformers.PreTrainedModel', llm.model).save_pretrained(
save_directory,
is_main_process=is_main_process,
state_dict=state_dict,
save_function=save_function,
push_to_hub=push_to_hub,
max_shard_size=max_shard_size,
safe_serialization=safe_serialization,
variant=variant,
**model_save_attrs
)
t.cast('transformers.PreTrainedModel', llm.model).save_pretrained(save_directory,
is_main_process=is_main_process,
state_dict=state_dict,
save_function=save_function,
push_to_hub=push_to_hub,
max_shard_size=max_shard_size,
safe_serialization=safe_serialization,
variant=variant,
**model_save_attrs)
llm.tokenizer.save_pretrained(save_directory, push_to_hub=push_to_hub, **tokenizer_save_attrs)

View File

@@ -23,11 +23,14 @@ if t.TYPE_CHECKING:
from openllm_core._typing_compat import M
from openllm_core._typing_compat import T
else:
transformers, torch = openllm_core.utils.LazyLoader('transformers', globals(), 'transformers'), openllm_core.utils.LazyLoader('torch', globals(), 'torch')
transformers, torch = openllm_core.utils.LazyLoader('transformers', globals(),
'transformers'), openllm_core.utils.LazyLoader(
'torch', globals(), 'torch')
_object_setattr = object.__setattr__
def process_config(model_id: str, trust_remote_code: bool, **attrs: t.Any) -> tuple[transformers.PretrainedConfig, DictStrAny, DictStrAny]:
def process_config(model_id: str, trust_remote_code: bool,
**attrs: t.Any) -> tuple[transformers.PretrainedConfig, DictStrAny, DictStrAny]:
'''A helper function that correctly parse config and attributes for transformers.PretrainedConfig.
Args:
@@ -44,19 +47,27 @@ def process_config(model_id: str, trust_remote_code: bool, **attrs: t.Any) -> tu
if not isinstance(config, transformers.PretrainedConfig):
copied_attrs = copy.deepcopy(attrs)
if copied_attrs.get('torch_dtype', None) == 'auto': copied_attrs.pop('torch_dtype')
config, attrs = transformers.AutoConfig.from_pretrained(model_id, return_unused_kwargs=True, trust_remote_code=trust_remote_code, **hub_attrs, **copied_attrs)
config, attrs = transformers.AutoConfig.from_pretrained(model_id,
return_unused_kwargs=True,
trust_remote_code=trust_remote_code,
**hub_attrs,
**copied_attrs)
return config, hub_attrs, attrs
def infer_tokenizers_from_llm(__llm: openllm.LLM[t.Any, T], /) -> T:
__cls = getattr(transformers, openllm_core.utils.first_not_none(__llm.config['tokenizer_class'], default='AutoTokenizer'), None)
if __cls is None: raise ValueError(f'Cannot infer correct tokenizer class for {__llm}. Make sure to unset `tokenizer_class`')
__cls = getattr(transformers,
openllm_core.utils.first_not_none(__llm.config['tokenizer_class'], default='AutoTokenizer'), None)
if __cls is None:
raise ValueError(f'Cannot infer correct tokenizer class for {__llm}. Make sure to unset `tokenizer_class`')
return __cls
def infer_autoclass_from_llm(llm: openllm.LLM[M, T], config: transformers.PretrainedConfig, /) -> _BaseAutoModelClass:
if llm.config['trust_remote_code']:
autoclass = 'AutoModelForSeq2SeqLM' if llm.config['model_type'] == 'seq2seq_lm' else 'AutoModelForCausalLM'
if not hasattr(config, 'auto_map'):
raise ValueError(f'Invalid configuraiton for {llm.model_id}. ``trust_remote_code=True`` requires `transformers.PretrainedConfig` to contain a `auto_map` mapping')
raise ValueError(
f'Invalid configuraiton for {llm.model_id}. ``trust_remote_code=True`` requires `transformers.PretrainedConfig` to contain a `auto_map` mapping'
)
# in case this model doesn't use the correct auto class for model type, for example like chatglm
# where it uses AutoModel instead of AutoModelForCausalLM. Then we fallback to AutoModel
if autoclass not in config.auto_map: autoclass = 'AutoModel'
@@ -69,14 +80,24 @@ def infer_autoclass_from_llm(llm: openllm.LLM[M, T], config: transformers.Pretra
def check_unintialised_params(model: torch.nn.Module) -> None:
unintialized = [n for n, param in model.named_parameters() if param.data.device == torch.device('meta')]
if len(unintialized) > 0: raise RuntimeError(f'Found the following unintialized parameters in {model}: {unintialized}')
if len(unintialized) > 0:
raise RuntimeError(f'Found the following unintialized parameters in {model}: {unintialized}')
def update_model(bentomodel: bentoml.Model, metadata: DictStrAny) -> bentoml.Model:
based: DictStrAny = copy.deepcopy(bentomodel.info.metadata)
based.update(metadata)
_object_setattr(bentomodel, '_info', ModelInfo( # type: ignore[call-arg] # XXX: remove me once upstream is merged
tag=bentomodel.info.tag, module=bentomodel.info.module, labels=bentomodel.info.labels, options=bentomodel.info.options.to_dict(), signatures=bentomodel.info.signatures, context=bentomodel.info.context, api_version=bentomodel.info.api_version, creation_time=bentomodel.info.creation_time, metadata=based
))
_object_setattr(
bentomodel, '_info',
ModelInfo( # type: ignore[call-arg] # XXX: remove me once upstream is merged
tag=bentomodel.info.tag,
module=bentomodel.info.module,
labels=bentomodel.info.labels,
options=bentomodel.info.options.to_dict(),
signatures=bentomodel.info.signatures,
context=bentomodel.info.context,
api_version=bentomodel.info.api_version,
creation_time=bentomodel.info.creation_time,
metadata=based))
return bentomodel
# NOTE: sync with bentoml/_internal/frameworks/transformers.py#make_default_signatures
@@ -84,9 +105,13 @@ def make_model_signatures(llm: openllm.LLM[M, T]) -> ModelSignaturesType:
infer_fn: tuple[str, ...] = ('__call__',)
default_config = ModelSignature(batchable=False)
if llm.__llm_implementation__ in {'pt', 'vllm'}:
infer_fn += ('forward', 'generate', 'contrastive_search', 'greedy_search', 'sample', 'beam_search', 'beam_sample', 'group_beam_search', 'constrained_beam_search',)
infer_fn += ('forward', 'generate', 'contrastive_search', 'greedy_search', 'sample', 'beam_search', 'beam_sample',
'group_beam_search', 'constrained_beam_search',
)
elif llm.__llm_implementation__ == 'tf':
infer_fn += ('predict', 'call', 'generate', 'compute_transition_scores', 'greedy_search', 'sample', 'beam_search', 'contrastive_search',)
infer_fn += ('predict', 'call', 'generate', 'compute_transition_scores', 'greedy_search', 'sample', 'beam_search',
'contrastive_search',
)
else:
infer_fn += ('generate',)
return {k: default_config for k in infer_fn}

View File

@@ -25,7 +25,8 @@ class HfIgnore:
def ignore_patterns(cls, llm: openllm.LLM[M, T]) -> list[str]:
if llm.__llm_implementation__ == 'vllm': base = [cls.tf, cls.flax, cls.safetensors]
elif llm.__llm_implementation__ == 'tf': base = [cls.flax, cls.pt]
elif llm.__llm_implementation__ == 'flax': base = [cls.tf, cls.pt, cls.safetensors] # as of current, safetensors is not supported with flax
elif llm.__llm_implementation__ == 'flax':
base = [cls.tf, cls.pt, cls.safetensors] # as of current, safetensors is not supported with flax
else:
base = [cls.tf, cls.flax]
if has_safetensors_weights(llm.model_id): base.append(cls.pt)

View File

@@ -15,9 +15,11 @@ if t.TYPE_CHECKING:
logger = logging.getLogger(__name__)
@contextlib.contextmanager
def build_bento(
model: str, model_id: str | None = None, quantize: t.Literal['int4', 'int8', 'gptq'] | None = None, runtime: t.Literal['ggml', 'transformers'] = 'transformers', cleanup: bool = False
) -> t.Iterator[bentoml.Bento]:
def build_bento(model: str,
model_id: str | None = None,
quantize: t.Literal['int4', 'int8', 'gptq'] | None = None,
runtime: t.Literal['ggml', 'transformers'] = 'transformers',
cleanup: bool = False) -> t.Iterator[bentoml.Bento]:
logger.info('Building BentoML for %s', model)
bento = openllm.build(model, model_id=model_id, quantize=quantize, runtime=runtime)
yield bento
@@ -26,7 +28,10 @@ def build_bento(
bentoml.bentos.delete(bento.tag)
@contextlib.contextmanager
def build_container(bento: bentoml.Bento | str | bentoml.Tag, image_tag: str | None = None, cleanup: bool = False, **attrs: t.Any) -> t.Iterator[str]:
def build_container(bento: bentoml.Bento | str | bentoml.Tag,
image_tag: str | None = None,
cleanup: bool = False,
**attrs: t.Any) -> t.Iterator[str]:
if isinstance(bento, bentoml.Bento): bento_tag = bento.tag
else: bento_tag = bentoml.Tag.from_taglike(bento)
if image_tag is None: image_tag = str(bento_tag)
@@ -42,22 +47,23 @@ def build_container(bento: bentoml.Bento | str | bentoml.Tag, image_tag: str | N
subprocess.check_output([executable, 'rmi', '-f', image_tag])
@contextlib.contextmanager
def prepare(
model: str,
model_id: str | None = None,
implementation: LiteralRuntime = 'pt',
deployment_mode: t.Literal['container', 'local'] = 'local',
clean_context: contextlib.ExitStack | None = None,
cleanup: bool = True
) -> t.Iterator[str]:
def prepare(model: str,
model_id: str | None = None,
implementation: LiteralRuntime = 'pt',
deployment_mode: t.Literal['container', 'local'] = 'local',
clean_context: contextlib.ExitStack | None = None,
cleanup: bool = True) -> t.Iterator[str]:
if clean_context is None:
clean_context = contextlib.ExitStack()
cleanup = True
llm = openllm.infer_auto_class(implementation).for_model(model, model_id=model_id, ensure_available=True)
bento_tag = bentoml.Tag.from_taglike(f'{llm.llm_type}-service:{llm.tag.version}')
if not bentoml.list(bento_tag): bento = clean_context.enter_context(build_bento(model, model_id=model_id, cleanup=cleanup))
else: bento = bentoml.get(bento_tag)
if not bentoml.list(bento_tag):
bento = clean_context.enter_context(build_bento(model, model_id=model_id, cleanup=cleanup))
else:
bento = bentoml.get(bento_tag)
container_name = f'openllm-{model}-{llm.llm_type}'.replace('-', '_')
if deployment_mode == 'container': container_name = clean_context.enter_context(build_container(bento, image_tag=container_name, cleanup=cleanup))
if deployment_mode == 'container':
container_name = clean_context.enter_context(build_container(bento, image_tag=container_name, cleanup=cleanup))
yield container_name
if cleanup: clean_context.close()

View File

@@ -19,9 +19,17 @@ if t.TYPE_CHECKING:
from openllm_core._typing_compat import LiteralRuntime
def generate_labels(llm: openllm.LLM[t.Any, t.Any]) -> dict[str, t.Any]:
return {'runtime': llm.runtime, 'framework': 'openllm', 'model_name': llm.config['model_name'], 'architecture': llm.config['architecture'], 'serialisation_format': llm._serialisation_format}
return {
'runtime': llm.runtime,
'framework': 'openllm',
'model_name': llm.config['model_name'],
'architecture': llm.config['architecture'],
'serialisation_format': llm._serialisation_format
}
def infer_auto_class(implementation: LiteralRuntime) -> type[openllm.AutoLLM | openllm.AutoTFLLM | openllm.AutoFlaxLLM | openllm.AutoVLLM]:
def infer_auto_class(
implementation: LiteralRuntime
) -> type[openllm.AutoLLM | openllm.AutoTFLLM | openllm.AutoFlaxLLM | openllm.AutoVLLM]:
import openllm
if implementation == 'tf': return openllm.AutoTFLLM
elif implementation == 'flax': return openllm.AutoFlaxLLM
@@ -29,7 +37,10 @@ def infer_auto_class(implementation: LiteralRuntime) -> type[openllm.AutoLLM | o
elif implementation == 'vllm': return openllm.AutoVLLM
else: raise RuntimeError(f"Unknown implementation: {implementation} (supported: 'pt', 'flax', 'tf', 'vllm')")
__all__ = ['generate_labels', 'infer_auto_class', 'dummy_flax_objects', 'dummy_pt_objects', 'dummy_tf_objects', 'dummy_vllm_objects']
__all__ = [
'generate_labels', 'infer_auto_class', 'dummy_flax_objects', 'dummy_pt_objects', 'dummy_tf_objects',
'dummy_vllm_objects'
]
def __dir__() -> t.Sequence[str]:
return sorted(__all__)

View File

@@ -16,19 +16,32 @@ env_strats = st.sampled_from([openllm.utils.EnvVarMixin(model_name) for model_na
def model_settings(draw: st.DrawFn):
'''Strategy for generating ModelSettings objects.'''
kwargs: dict[str, t.Any] = {
'default_id': st.text(min_size=1),
'model_ids': st.lists(st.text(), min_size=1),
'architecture': st.text(min_size=1),
'url': st.text(),
'requires_gpu': st.booleans(),
'trust_remote_code': st.booleans(),
'requirements': st.none() | st.lists(st.text(), min_size=1),
'default_implementation': st.dictionaries(st.sampled_from(['cpu', 'nvidia.com/gpu']), st.sampled_from(['vllm', 'pt', 'tf', 'flax'])),
'model_type': st.sampled_from(['causal_lm', 'seq2seq_lm']),
'runtime': st.sampled_from(['transformers', 'ggml']),
'name_type': st.sampled_from(['dasherize', 'lowercase']),
'timeout': st.integers(min_value=3600),
'workers_per_resource': st.one_of(st.integers(min_value=1), st.floats(min_value=0.1, max_value=1.0)),
'default_id':
st.text(min_size=1),
'model_ids':
st.lists(st.text(), min_size=1),
'architecture':
st.text(min_size=1),
'url':
st.text(),
'requires_gpu':
st.booleans(),
'trust_remote_code':
st.booleans(),
'requirements':
st.none() | st.lists(st.text(), min_size=1),
'default_implementation':
st.dictionaries(st.sampled_from(['cpu', 'nvidia.com/gpu']), st.sampled_from(['vllm', 'pt', 'tf', 'flax'])),
'model_type':
st.sampled_from(['causal_lm', 'seq2seq_lm']),
'runtime':
st.sampled_from(['transformers', 'ggml']),
'name_type':
st.sampled_from(['dasherize', 'lowercase']),
'timeout':
st.integers(min_value=3600),
'workers_per_resource':
st.one_of(st.integers(min_value=1), st.floats(min_value=0.1, max_value=1.0)),
}
return draw(st.builds(ModelSettings, **kwargs))

View File

@@ -24,19 +24,29 @@ from ._strategies._configuration import make_llm_config
from ._strategies._configuration import model_settings
# XXX: @aarnphm fixes TypedDict behaviour in 3.11
@pytest.mark.skipif(sys.version_info[:2] == (3, 11), reason='TypedDict in 3.11 behaves differently, so we need to fix this')
@pytest.mark.skipif(sys.version_info[:2] == (3, 11),
reason='TypedDict in 3.11 behaves differently, so we need to fix this')
def test_missing_default():
with pytest.raises(ValueError, match='Missing required fields *'):
make_llm_config('MissingDefaultId', {'name_type': 'lowercase', 'requirements': ['bentoml']})
with pytest.raises(ValueError, match='Missing required fields *'):
make_llm_config('MissingModelId', {'default_id': 'huggingface/t5-tiny-testing', 'requirements': ['bentoml']})
with pytest.raises(ValueError, match='Missing required fields *'):
make_llm_config('MissingArchitecture', {'default_id': 'huggingface/t5-tiny-testing', 'model_ids': ['huggingface/t5-tiny-testing'], 'requirements': ['bentoml'],},)
make_llm_config(
'MissingArchitecture', {
'default_id': 'huggingface/t5-tiny-testing',
'model_ids': ['huggingface/t5-tiny-testing'],
'requirements': ['bentoml'],
},
)
def test_forbidden_access():
cl_ = make_llm_config(
'ForbiddenAccess', {
'default_id': 'huggingface/t5-tiny-testing', 'model_ids': ['huggingface/t5-tiny-testing', 'bentoml/t5-tiny-testing'], 'architecture': 'PreTrainedModel', 'requirements': ['bentoml'],
'default_id': 'huggingface/t5-tiny-testing',
'model_ids': ['huggingface/t5-tiny-testing', 'bentoml/t5-tiny-testing'],
'architecture': 'PreTrainedModel',
'requirements': ['bentoml'],
},
)
@@ -69,9 +79,16 @@ def test_config_derived_follow_attrs_protocol(gen_settings: ModelSettings):
cl_ = make_llm_config('AttrsProtocolLLM', gen_settings)
assert attr.has(cl_)
@given(model_settings(), st.integers(max_value=283473), st.floats(min_value=0.0, max_value=1.0), st.integers(max_value=283473), st.floats(min_value=0.0, max_value=1.0),)
def test_complex_struct_dump(gen_settings: ModelSettings, field1: int, temperature: float, input_field1: int, input_temperature: float):
cl_ = make_llm_config('ComplexLLM', gen_settings, fields=(('field1', 'float', field1),), generation_fields=(('temperature', temperature),),)
@given(model_settings(), st.integers(max_value=283473), st.floats(min_value=0.0, max_value=1.0),
st.integers(max_value=283473), st.floats(min_value=0.0, max_value=1.0),
)
def test_complex_struct_dump(gen_settings: ModelSettings, field1: int, temperature: float, input_field1: int,
input_temperature: float):
cl_ = make_llm_config('ComplexLLM',
gen_settings,
fields=(('field1', 'float', field1),),
generation_fields=(('temperature', temperature),),
)
sent = cl_()
assert sent.model_dump()['field1'] == field1
assert sent.model_dump()['generation_config']['temperature'] == temperature
@@ -94,7 +111,10 @@ def patch_env(**attrs: t.Any):
yield
def test_struct_envvar():
with patch_env(**{field_env_key('env_llm', 'field1'): '4', field_env_key('env_llm', 'temperature', suffix='generation'): '0.2',}):
with patch_env(**{
field_env_key('env_llm', 'field1'): '4',
field_env_key('env_llm', 'temperature', suffix='generation'): '0.2',
}):
class EnvLLM(openllm.LLMConfig):
__config__ = {'default_id': 'asdfasdf', 'model_ids': ['asdf', 'asdfasdfads'], 'architecture': 'PreTrainedModel',}
@@ -112,6 +132,7 @@ def test_struct_envvar():
assert overwrite_default['temperature'] == 0.2
def test_struct_provided_fields():
class EnvLLM(openllm.LLMConfig):
__config__ = {'default_id': 'asdfasdf', 'model_ids': ['asdf', 'asdfasdfads'], 'architecture': 'PreTrainedModel',}
field1: int = 2
@@ -127,11 +148,13 @@ def test_struct_envvar_with_overwrite_provided_env(monkeypatch: pytest.MonkeyPat
with monkeypatch.context() as mk:
mk.setenv(field_env_key('overwrite_with_env_available', 'field1'), str(4.0))
mk.setenv(field_env_key('overwrite_with_env_available', 'temperature', suffix='generation'), str(0.2))
sent = make_llm_config(
'OverwriteWithEnvAvailable', {
'default_id': 'asdfasdf', 'model_ids': ['asdf', 'asdfasdfads'], 'architecture': 'PreTrainedModel'
}, fields=(('field1', 'float', 3.0),),
).model_construct_env(field1=20.0, temperature=0.4)
sent = make_llm_config('OverwriteWithEnvAvailable', {
'default_id': 'asdfasdf',
'model_ids': ['asdf', 'asdfasdfads'],
'architecture': 'PreTrainedModel'
},
fields=(('field1', 'float', 3.0),),
).model_construct_env(field1=20.0, temperature=0.4)
assert sent.generation_config.temperature == 0.4
assert sent.field1 == 20.0

View File

@@ -10,23 +10,37 @@ import openllm
if t.TYPE_CHECKING:
from openllm_core._typing_compat import LiteralRuntime
_FRAMEWORK_MAPPING = {'flan_t5': 'google/flan-t5-small', 'opt': 'facebook/opt-125m', 'baichuan': 'baichuan-inc/Baichuan-7B',}
_PROMPT_MAPPING = {'qa': 'Answer the following yes/no question by reasoning step-by-step. Can you write a whole Haiku in a single tweet?',}
_FRAMEWORK_MAPPING = {
'flan_t5': 'google/flan-t5-small',
'opt': 'facebook/opt-125m',
'baichuan': 'baichuan-inc/Baichuan-7B',
}
_PROMPT_MAPPING = {
'qa':
'Answer the following yes/no question by reasoning step-by-step. Can you write a whole Haiku in a single tweet?',
}
def parametrise_local_llm(model: str,) -> t.Generator[tuple[str, openllm.LLMRunner[t.Any, t.Any] | openllm.LLM[t.Any, t.Any]], None, None]:
def parametrise_local_llm(
model: str,) -> t.Generator[tuple[str, openllm.LLMRunner[t.Any, t.Any] | openllm.LLM[t.Any, t.Any]], None, None]:
if model not in _FRAMEWORK_MAPPING: pytest.skip(f"'{model}' is not yet supported in framework testing.")
runtime_impl: tuple[LiteralRuntime, ...] = tuple()
if model in openllm.MODEL_MAPPING_NAMES: runtime_impl += ('pt',)
if model in openllm.MODEL_FLAX_MAPPING_NAMES: runtime_impl += ('flax',)
if model in openllm.MODEL_TF_MAPPING_NAMES: runtime_impl += ('tf',)
for framework, prompt in itertools.product(runtime_impl, _PROMPT_MAPPING.keys()):
llm = openllm.Runner(model, model_id=_FRAMEWORK_MAPPING[model], ensure_available=True, implementation=framework, init_local=True,)
llm = openllm.Runner(model,
model_id=_FRAMEWORK_MAPPING[model],
ensure_available=True,
implementation=framework,
init_local=True,
)
yield prompt, llm
def pytest_generate_tests(metafunc: pytest.Metafunc) -> None:
if os.getenv('GITHUB_ACTIONS') is None:
if 'prompt' in metafunc.fixturenames and 'llm' in metafunc.fixturenames:
metafunc.parametrize('prompt,llm', [(p, llm) for p, llm in parametrise_local_llm(metafunc.function.__name__[5:-15])])
metafunc.parametrize('prompt,llm',
[(p, llm) for p, llm in parametrise_local_llm(metafunc.function.__name__[5:-15])])
def pytest_sessionfinish(session: pytest.Session, exitstatus: int):
# If no tests are collected, pytest exists with code 5, which makes the CI fail.

View File

@@ -40,7 +40,13 @@ if t.TYPE_CHECKING:
from openllm.client import BaseAsyncClient
class ResponseComparator(JSONSnapshotExtension):
def serialize(self, data: SerializableData, *, exclude: PropertyFilter | None = None, matcher: PropertyMatcher | None = None,) -> SerializedData:
def serialize(self,
data: SerializableData,
*,
exclude: PropertyFilter | None = None,
matcher: PropertyMatcher | None = None,
) -> SerializedData:
if openllm.utils.LazyType(ListAny).isinstance(data):
data = [d.unmarshaled for d in data]
else:
@@ -49,6 +55,7 @@ class ResponseComparator(JSONSnapshotExtension):
return orjson.dumps(data, option=orjson.OPT_INDENT_2 | orjson.OPT_SORT_KEYS).decode()
def matches(self, *, serialized_data: SerializableData, snapshot_data: SerializableData) -> bool:
def convert_data(data: SerializableData) -> openllm.GenerationOutput | t.Sequence[openllm.GenerationOutput]:
try:
data = orjson.loads(data)
@@ -73,9 +80,11 @@ class ResponseComparator(JSONSnapshotExtension):
return s == t
def eq_output(s: openllm.GenerationOutput, t: openllm.GenerationOutput) -> bool:
return (len(s.responses) == len(t.responses) and all([_s == _t for _s, _t in zip(s.responses, t.responses)]) and eq_config(s.marshaled_config, t.marshaled_config))
return (len(s.responses) == len(t.responses) and all([_s == _t for _s, _t in zip(s.responses, t.responses)]) and
eq_config(s.marshaled_config, t.marshaled_config))
return len(serialized_data) == len(snapshot_data) and all([eq_output(s, t) for s, t in zip(serialized_data, snapshot_data)])
return len(serialized_data) == len(snapshot_data) and all(
[eq_output(s, t) for s, t in zip(serialized_data, snapshot_data)])
@pytest.fixture()
def response_snapshot(snapshot: SnapshotAssertion):
@@ -124,8 +133,14 @@ class LocalHandle(_Handle):
return self.process.poll() is None
class HandleProtocol(t.Protocol):
@contextlib.contextmanager
def __call__(*, model: str, model_id: str, image_tag: str, quantize: t.AnyStr | None = None,) -> t.Generator[_Handle, None, None]:
def __call__(*,
model: str,
model_id: str,
image_tag: str,
quantize: t.AnyStr | None = None,
) -> t.Generator[_Handle, None, None]:
...
@attr.define(init=False)
@@ -133,7 +148,9 @@ class DockerHandle(_Handle):
container_name: str
docker_client: docker.DockerClient
def __init__(self, docker_client: docker.DockerClient, container_name: str, port: int, deployment_mode: t.Literal['container', 'local'],):
def __init__(self, docker_client: docker.DockerClient, container_name: str, port: int,
deployment_mode: t.Literal['container', 'local'],
):
self.__attrs_init__(port, deployment_mode, container_name, docker_client)
def status(self) -> bool:
@@ -141,16 +158,29 @@ class DockerHandle(_Handle):
return container.status in ['running', 'created']
@contextlib.contextmanager
def _local_handle(
model: str, model_id: str, image_tag: str, deployment_mode: t.Literal['container', 'local'], quantize: t.Literal['int8', 'int4', 'gptq'] | None = None, *, _serve_grpc: bool = False,
):
def _local_handle(model: str,
model_id: str,
image_tag: str,
deployment_mode: t.Literal['container', 'local'],
quantize: t.Literal['int8', 'int4', 'gptq'] | None = None,
*,
_serve_grpc: bool = False,
):
with openllm.utils.reserve_free_port() as port:
pass
if not _serve_grpc:
proc = openllm.start(model, model_id=model_id, quantize=quantize, additional_args=['--port', str(port)], __test__=True)
proc = openllm.start(model,
model_id=model_id,
quantize=quantize,
additional_args=['--port', str(port)],
__test__=True)
else:
proc = openllm.start_grpc(model, model_id=model_id, quantize=quantize, additional_args=['--port', str(port)], __test__=True)
proc = openllm.start_grpc(model,
model_id=model_id,
quantize=quantize,
additional_args=['--port', str(port)],
__test__=True)
yield LocalHandle(proc, port, deployment_mode)
proc.terminate()
@@ -164,9 +194,14 @@ def _local_handle(
proc.stderr.close()
@contextlib.contextmanager
def _container_handle(
model: str, model_id: str, image_tag: str, deployment_mode: t.Literal['container', 'local'], quantize: t.Literal['int8', 'int4', 'gptq'] | None = None, *, _serve_grpc: bool = False,
):
def _container_handle(model: str,
model_id: str,
image_tag: str,
deployment_mode: t.Literal['container', 'local'],
quantize: t.Literal['int8', 'int4', 'gptq'] | None = None,
*,
_serve_grpc: bool = False,
):
envvar = openllm.utils.EnvVarMixin(model)
with openllm.utils.reserve_free_port() as port, openllm.utils.reserve_free_port() as prom_port:
@@ -191,11 +226,18 @@ def _container_handle(
gpus = openllm.utils.device_count() or -1
devs = [docker.types.DeviceRequest(count=gpus, capabilities=[['gpu']])] if gpus > 0 else None
container = client.containers.run(
image_tag, command=args, name=container_name, environment=env, auto_remove=False, detach=True, device_requests=devs, ports={
'3000/tcp': port, '3001/tcp': prom_port
},
)
container = client.containers.run(image_tag,
command=args,
name=container_name,
environment=env,
auto_remove=False,
detach=True,
device_requests=devs,
ports={
'3000/tcp': port,
'3001/tcp': prom_port
},
)
yield DockerHandle(client, container.name, port, deployment_mode)

View File

@@ -16,8 +16,11 @@ model = 'flan_t5'
model_id = 'google/flan-t5-small'
@pytest.fixture(scope='module')
def flan_t5_handle(handler: HandleProtocol, deployment_mode: t.Literal['container', 'local'], clean_context: contextlib.ExitStack,):
with openllm.testing.prepare(model, model_id=model_id, deployment_mode=deployment_mode, clean_context=clean_context) as image_tag:
def flan_t5_handle(handler: HandleProtocol, deployment_mode: t.Literal['container', 'local'],
clean_context: contextlib.ExitStack,
):
with openllm.testing.prepare(model, model_id=model_id, deployment_mode=deployment_mode,
clean_context=clean_context) as image_tag:
with handler(model=model, model_id=model_id, image_tag=image_tag) as handle:
yield handle

View File

@@ -16,8 +16,11 @@ model = 'opt'
model_id = 'facebook/opt-125m'
@pytest.fixture(scope='module')
def opt_125m_handle(handler: HandleProtocol, deployment_mode: t.Literal['container', 'local'], clean_context: contextlib.ExitStack,):
with openllm.testing.prepare(model, model_id=model_id, deployment_mode=deployment_mode, clean_context=clean_context) as image_tag:
def opt_125m_handle(handler: HandleProtocol, deployment_mode: t.Literal['container', 'local'],
clean_context: contextlib.ExitStack,
):
with openllm.testing.prepare(model, model_id=model_id, deployment_mode=deployment_mode,
clean_context=clean_context) as image_tag:
with handler(model=model, model_id=model_id, image_tag=image_tag) as handle:
yield handle

View File

@@ -15,7 +15,9 @@ if t.TYPE_CHECKING:
HF_INTERNAL_T5_TESTING = 'hf-internal-testing/tiny-random-t5'
actions_xfail = functools.partial(
pytest.mark.xfail, condition=os.getenv('GITHUB_ACTIONS') is not None, reason='Marking GitHub Actions to xfail due to flakiness and building environment not isolated.',
pytest.mark.xfail,
condition=os.getenv('GITHUB_ACTIONS') is not None,
reason='Marking GitHub Actions to xfail due to flakiness and building environment not isolated.',
)
@actions_xfail
@@ -46,7 +48,9 @@ def test_general_build_from_local(tmp_path_factory: pytest.TempPathFactory):
@pytest.fixture()
def dockerfile_template(tmp_path_factory: pytest.TempPathFactory):
file = tmp_path_factory.mktemp('dockerfiles') / 'Dockerfile.template'
file.write_text("{% extends bento_base_template %}\n{% block SETUP_BENTO_ENTRYPOINT %}\n{{ super() }}\nRUN echo 'sanity from custom dockerfile'\n{% endblock %}")
file.write_text(
"{% extends bento_base_template %}\n{% block SETUP_BENTO_ENTRYPOINT %}\n{{ super() }}\nRUN echo 'sanity from custom dockerfile'\n{% endblock %}"
)
return file
@pytest.mark.usefixtures('dockerfile_template')

View File

@@ -71,9 +71,11 @@ def test_nvidia_gpu_validate(monkeypatch: pytest.MonkeyPatch):
mcls.setenv('CUDA_VISIBLE_DEVICES', '')
assert len(NvidiaGpuResource.from_system()) >= 0 # TODO: real from_system tests
assert pytest.raises(ValueError, NvidiaGpuResource.validate, [*NvidiaGpuResource.from_system(), 1],).match('Input list should be all string type.')
assert pytest.raises(ValueError, NvidiaGpuResource.validate, [*NvidiaGpuResource.from_system(), 1],
).match('Input list should be all string type.')
assert pytest.raises(ValueError, NvidiaGpuResource.validate, [-2]).match('Input list should be all string type.')
assert pytest.raises(ValueError, NvidiaGpuResource.validate, ['GPU-5ebe9f43', 'GPU-ac33420d4628']).match('Failed to parse available GPUs UUID')
assert pytest.raises(ValueError, NvidiaGpuResource.validate,
['GPU-5ebe9f43', 'GPU-ac33420d4628']).match('Failed to parse available GPUs UUID')
def test_nvidia_gpu_from_spec(monkeypatch: pytest.MonkeyPatch):
with monkeypatch.context() as mcls: