mirror of
https://github.com/bentoml/OpenLLM.git
synced 2026-03-02 21:56:10 -05:00
feat: continuous batching with vLLM (#349)
Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com> * feat: continuous batching Signed-off-by: paperspace <29749331+aarnphm@users.noreply.github.com> * chore: add changeloe Signed-off-by: paperspace <29749331+aarnphm@users.noreply.github.com> * chore: add one shot generation Signed-off-by: paperspace <29749331+aarnphm@users.noreply.github.com> --------- Signed-off-by: paperspace <29749331+aarnphm@users.noreply.github.com> Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
This commit is contained in:
@@ -115,7 +115,7 @@ openai = ["openai", "tiktoken"]
|
||||
opt = ["flax>=0.7", "jax", "jaxlib", "tensorflow", "keras"]
|
||||
playground = ["jupyter", "notebook", "ipython", "jupytext", "nbformat"]
|
||||
starcoder = ["bitsandbytes"]
|
||||
vllm = ["vllm>=0.1.6", "ray"]
|
||||
vllm = ["vllm>=0.1.7", "ray"]
|
||||
|
||||
[tool.hatch.version]
|
||||
fallback-version = "0.0.0"
|
||||
|
||||
@@ -9,7 +9,6 @@ import openllm
|
||||
from openllm.exceptions import OpenLLMException
|
||||
from openllm_core._configuration import _object_getattribute
|
||||
from openllm_core._configuration import _setattr_class
|
||||
from openllm_core._schema import unmarshal_vllm_outputs
|
||||
from openllm_core._typing_compat import DictStrAny
|
||||
from openllm_core._typing_compat import ListStr
|
||||
from openllm_core._typing_compat import M
|
||||
@@ -22,6 +21,7 @@ from openllm_core.utils import LazyLoader
|
||||
from openllm_core.utils import codegen
|
||||
from openllm_core.utils import device_count
|
||||
from openllm_core.utils import first_not_none
|
||||
from openllm_core.utils import get_debug_mode
|
||||
from openllm_core.utils import is_torch_available
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
@@ -38,35 +38,37 @@ else:
|
||||
def import_model(fn: import_model_protocol[bentoml.Model, M, T]) -> t.Callable[[LLM[M, T]], bentoml.Model]:
|
||||
@functools.wraps(fn)
|
||||
def inner(self: LLM[M, T], *decls: t.Any, trust_remote_code: bool | None = None, **attrs: t.Any) -> bentoml.Model:
|
||||
trust_remote_code = first_not_none(trust_remote_code, default=self.trust_remote_code)
|
||||
(model_decls, model_attrs), _ = self.llm_parameters
|
||||
decls = (*model_decls, *decls)
|
||||
attrs = {**model_attrs, **attrs}
|
||||
return fn(self, *decls, trust_remote_code=trust_remote_code, **attrs)
|
||||
return fn(self, *decls, trust_remote_code=first_not_none(trust_remote_code, default=self.trust_remote_code), **attrs)
|
||||
|
||||
return inner
|
||||
|
||||
def load_model(fn: load_model_protocol[M, T]) -> t.Callable[[LLM[M, T]], M | vllm.LLMEngine]:
|
||||
def load_model(fn: load_model_protocol[M, T]) -> t.Callable[[LLM[M, T]], M | vllm.AsyncLLMEngine]:
|
||||
@functools.wraps(fn)
|
||||
def inner(self: LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M | vllm.LLMEngine:
|
||||
def inner(self: LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M | vllm.AsyncLLMEngine:
|
||||
if self.__llm_backend__ == 'vllm':
|
||||
num_gpus, dev = 1, device_count()
|
||||
if dev >= 2: num_gpus = min(dev // 2 * 2, dev)
|
||||
# TODO: Do some more processing with token_id once we support token streaming
|
||||
try:
|
||||
return vllm.LLMEngine.from_engine_args(
|
||||
vllm.EngineArgs(model=self._bentomodel.path,
|
||||
tokenizer=self._bentomodel.path if self.tokenizer_id == 'local' else self.tokenizer_id,
|
||||
tokenizer_mode='auto',
|
||||
tensor_parallel_size=num_gpus,
|
||||
dtype='auto',
|
||||
worker_use_ray=False))
|
||||
return vllm.AsyncLLMEngine.from_engine_args(
|
||||
vllm.AsyncEngineArgs(model=self._bentomodel.path,
|
||||
tokenizer=self._bentomodel.path if self.tokenizer_id == 'local' else self.tokenizer_id,
|
||||
tokenizer_mode='auto',
|
||||
tensor_parallel_size=num_gpus,
|
||||
dtype='auto',
|
||||
disable_log_requests=not get_debug_mode(),
|
||||
worker_use_ray=False,
|
||||
engine_use_ray=False))
|
||||
except Exception as err:
|
||||
traceback.print_exc()
|
||||
raise OpenLLMException(f'Failed to initialise vLLMEngine due to the following error:\n{err}') from None
|
||||
else:
|
||||
(model_decls, model_attrs), _ = self.llm_parameters
|
||||
return fn(self, *(*model_decls, *decls), **{**model_attrs, **attrs})
|
||||
decls = (*model_decls, *decls)
|
||||
attrs = {**model_attrs, **attrs}
|
||||
return fn(self, *decls, **attrs)
|
||||
|
||||
return inner
|
||||
|
||||
@@ -108,12 +110,6 @@ def make_llm_attributes(cls: type[LLM[M, T]]) -> t.Callable[[type[LLM[M, T]]], N
|
||||
func_call = f"_impl_{cls.__name__}_{func}={cached_func_name} if {cached_func_name} is not _cached_LLMSerialisation_get('{func}') else __serialisation_{func}"
|
||||
lines.extend([f'{cached_func_name}=cls.{func}', func_call, _setattr_class(func, f'{impl_name}(_impl_{cls.__name__}_{func})')])
|
||||
|
||||
# assign vLLM implementation
|
||||
if cls.__llm_backend__ == 'vllm':
|
||||
vllm_func = {f'_vllm_{it}': fn for it, fn in zip(('generate', 'generate_iterator', 'postprocess_generate'), (vllm_generate, vllm_generate_iterator, vllm_postprocess_generate))}
|
||||
globs.update(vllm_func)
|
||||
lines.extend([_setattr_class(it[6:], it) for it in vllm_func])
|
||||
|
||||
interface_anns = codegen.get_annotations(LLMInterface)
|
||||
|
||||
# cached attribute initialisation
|
||||
@@ -131,46 +127,3 @@ def make_llm_attributes(cls: type[LLM[M, T]]) -> t.Callable[[type[LLM[M, T]]], N
|
||||
lines.extend([_setattr_class(dunder_support(fn), f"cls.{fn} is not _cached_LLMFunction_get('{fn}')") for fn in bool_attr])
|
||||
|
||||
return codegen.generate_function(cls, '__assign_llm_attr', lines, args=('cls', *args), globs=globs, annotations={'cls': 't.Type[LLM]', 'return': None})
|
||||
|
||||
def vllm_postprocess_generate(self: LLM['vllm.LLMEngine', T], prompt: str, generation_result: list[dict[str, t.Any]], **_: t.Any) -> str:
|
||||
return generation_result[0]['outputs'][0]['text']
|
||||
|
||||
def vllm_generate_iterator(self: LLM['vllm.LLMEngine', T],
|
||||
prompt: str,
|
||||
/,
|
||||
*,
|
||||
echo: bool = False,
|
||||
stop: str | t.Iterable[str] | None = None,
|
||||
stop_token_ids: list[int] | None = None,
|
||||
**attrs: t.Any) -> t.Iterator[dict[str, t.Any]]:
|
||||
request_id: str | None = attrs.pop('request_id', None)
|
||||
if request_id is None: raise ValueError('request_id must not be None.')
|
||||
if stop_token_ids is None: stop_token_ids = []
|
||||
stop_token_ids.append(self.tokenizer.eos_token_id)
|
||||
stop_: set[str] = set()
|
||||
if isinstance(stop, str) and stop != '': stop_.add(stop)
|
||||
elif isinstance(stop, list) and stop != []: stop_.update(stop)
|
||||
for tid in stop_token_ids:
|
||||
if tid: stop_.add(self.tokenizer.decode(tid))
|
||||
|
||||
if self.config['temperature'] <= 1e-5: top_p = 1.0
|
||||
else: top_p = self.config['top_p']
|
||||
config = self.config.model_construct_env(stop=list(stop_), top_p=top_p, **attrs)
|
||||
self.model.add_request(request_id=request_id, prompt=prompt, sampling_params=config.to_sampling_config())
|
||||
while self.model.has_unfinished_requests():
|
||||
for request_output in self.model.step():
|
||||
prompt = request_output.prompt
|
||||
if echo: text_outputs = [prompt + output.text for output in request_output.outputs]
|
||||
else: text_outputs = [output.text for output in request_output.outputs]
|
||||
yield {'text': text_outputs, 'error_code': 0}
|
||||
if request_output.finished: break
|
||||
|
||||
def vllm_generate(self: LLM['vllm.LLMEngine', T], prompt: str, **attrs: t.Any) -> list[dict[str, t.Any]]:
|
||||
request_id: str | None = attrs.pop('request_id', None)
|
||||
if request_id is None: raise ValueError('request_id must not be None.')
|
||||
outputs: list[vllm.RequestOutput] = []
|
||||
# TODO: support prompt_token_ids
|
||||
self.model.add_request(request_id=request_id, prompt=prompt, sampling_params=self.config.model_construct_env(**attrs).to_sampling_config())
|
||||
while self.model.has_unfinished_requests():
|
||||
outputs.extend([r for r in self.model.step() if r.finished])
|
||||
return [unmarshal_vllm_outputs(i) for i in outputs]
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
# mypy: disable-error-code="name-defined,attr-defined"
|
||||
from __future__ import annotations
|
||||
import abc
|
||||
import asyncio
|
||||
import gc
|
||||
import inspect
|
||||
import logging
|
||||
@@ -22,6 +23,7 @@ import openllm_core
|
||||
from bentoml._internal.models.model import ModelSignature
|
||||
from openllm_core._configuration import FineTuneConfig
|
||||
from openllm_core._configuration import LLMConfig
|
||||
from openllm_core._prompt import process_prompt
|
||||
from openllm_core._schema import EmbeddingsOutput
|
||||
from openllm_core._typing_compat import AdaptersMapping
|
||||
from openllm_core._typing_compat import AdaptersTuple
|
||||
@@ -46,7 +48,7 @@ from openllm_core.utils import ReprMixin
|
||||
from openllm_core.utils import apply
|
||||
from openllm_core.utils import bentoml_cattr
|
||||
from openllm_core.utils import codegen
|
||||
from openllm_core.utils import device_count
|
||||
from openllm_core.utils import ensure_exec_coro
|
||||
from openllm_core.utils import first_not_none
|
||||
from openllm_core.utils import generate_hash_from_file
|
||||
from openllm_core.utils import is_peft_available
|
||||
@@ -58,7 +60,6 @@ from openllm_core.utils import validate_is_path
|
||||
from ._assign import make_llm_attributes
|
||||
from ._quantisation import infer_quantisation_config
|
||||
from .exceptions import ForbiddenAttributeError
|
||||
from .exceptions import GpuNotAvailableError
|
||||
from .exceptions import OpenLLMException
|
||||
from .utils import infer_auto_class
|
||||
|
||||
@@ -67,6 +68,7 @@ if t.TYPE_CHECKING:
|
||||
import peft
|
||||
import torch
|
||||
import transformers
|
||||
import vllm
|
||||
|
||||
from openllm_core._configuration import PeftType
|
||||
from openllm_core.utils.representation import ReprArgs
|
||||
@@ -742,16 +744,13 @@ class LLM(LLMInterface[M, T], ReprMixin):
|
||||
|
||||
@property
|
||||
def model(self) -> M:
|
||||
# Run check for GPU
|
||||
if self.config['requires_gpu'] and device_count() < 1:
|
||||
raise GpuNotAvailableError(f'{self} only supports running with GPU (None available).') from None
|
||||
# NOTE: the signature of load_model here is the wrapper under _wrapped_load_model
|
||||
if self.__llm_model__ is None:
|
||||
model = self.load_model(*self._model_decls, **self._model_attrs)
|
||||
# If OOM, then it is probably you don't have enough VRAM to run this model.
|
||||
if self.__llm_backend__ == 'pt' and is_torch_available():
|
||||
loaded_in_kbit = getattr(model, 'is_loaded_in_8bit', False) or getattr(model, 'is_loaded_in_4bit', False) or getattr(model, 'is_quantized', False)
|
||||
if torch.cuda.is_available() and torch.cuda.device_count() == 1 and not loaded_in_kbit:
|
||||
if torch.cuda.is_available() and torch.cuda.device_count() == 1 and not loaded_in_kbit and not isinstance(model, transformers.Pipeline):
|
||||
try:
|
||||
model = model.to('cuda')
|
||||
except Exception as err:
|
||||
@@ -971,92 +970,90 @@ class LLM(LLMInterface[M, T], ReprMixin):
|
||||
from ._generation import prepare_logits_processor
|
||||
|
||||
len_prompt = len(prompt)
|
||||
config = self.config.model_construct_env(**attrs)
|
||||
if stop_token_ids is None: stop_token_ids = []
|
||||
stop_token_ids.append(self.tokenizer.eos_token_id)
|
||||
|
||||
logits_processor = prepare_logits_processor(self.config)
|
||||
logits_processor = prepare_logits_processor(config)
|
||||
|
||||
input_ids = self.tokenizer(prompt).input_ids
|
||||
with torch.inference_mode():
|
||||
input_ids = self.tokenizer(prompt).input_ids
|
||||
|
||||
if context_length is None: context_length = get_context_length(self.model.config)
|
||||
max_src_len = context_length - self.config['max_new_tokens'] - 1
|
||||
if context_length is None: context_length = get_context_length(self.model.config)
|
||||
max_src_len = context_length - config['max_new_tokens'] - 1
|
||||
|
||||
input_ids = input_ids[-max_src_len:]
|
||||
output_ids = list(input_ids)
|
||||
input_echo_len = len(input_ids)
|
||||
input_ids = input_ids[-max_src_len:]
|
||||
output_ids = list(input_ids)
|
||||
input_echo_len = len(input_ids)
|
||||
|
||||
past_key_values = out = token = None
|
||||
for i in range(self.config['max_new_tokens']):
|
||||
torch.cuda.synchronize()
|
||||
if i == 0: # prefill
|
||||
out = self.model(torch.as_tensor([input_ids], device=self.device), use_cache=True)
|
||||
logits = out.logits
|
||||
past_key_values = out.past_key_values
|
||||
else: # decoding
|
||||
out = self.model(input_ids=torch.as_tensor([[token]], device=self.device), use_cache=True, past_key_values=past_key_values)
|
||||
past_key_values = out = token = None
|
||||
finish_reason = None
|
||||
for i in range(config['max_new_tokens']):
|
||||
torch.cuda.synchronize()
|
||||
if i == 0: # prefill
|
||||
out = self.model(torch.as_tensor([input_ids], device=self.device), use_cache=True)
|
||||
else: # decoding
|
||||
out = self.model(torch.as_tensor([[token]], device=self.device), use_cache=True, past_key_values=past_key_values)
|
||||
logits = out.logits
|
||||
past_key_values = out.past_key_values
|
||||
torch.cuda.synchronize()
|
||||
|
||||
if logits_processor:
|
||||
if self.config['repetition_penalty'] > 1.0:
|
||||
tmp_output_ids: t.Any = torch.as_tensor([output_ids], device=self.device)
|
||||
if logits_processor:
|
||||
if config['repetition_penalty'] > 1.0:
|
||||
tmp_output_ids: t.Any = torch.as_tensor([output_ids], device=self.device)
|
||||
else:
|
||||
tmp_output_ids = None
|
||||
last_token_logits = logits_processor(tmp_output_ids, logits[:, -1, :])[0]
|
||||
else:
|
||||
tmp_output_ids = None
|
||||
last_token_logits = logits_processor(tmp_output_ids, logits[:, -1, :])[0]
|
||||
else:
|
||||
last_token_logits = logits[0, -1, :]
|
||||
last_token_logits = logits[0, -1, :]
|
||||
|
||||
# Switch to CPU by avoiding some bugs in mps backend.
|
||||
if self.device.type == 'mps': last_token_logits = last_token_logits.float().to('cpu')
|
||||
# Switch to CPU by avoiding some bugs in mps backend.
|
||||
if self.device.type == 'mps': last_token_logits = last_token_logits.float().to('cpu')
|
||||
|
||||
if self.config['temperature'] < 1e-5 or self.config['top_p'] < 1e-8:
|
||||
token = int(torch.argmax(last_token_logits)) # greedy
|
||||
else:
|
||||
probs = torch.softmax(last_token_logits, dim=-1)
|
||||
token = int(torch.multinomial(probs, num_samples=1))
|
||||
output_ids.append(token)
|
||||
torch.cuda.synchronize()
|
||||
|
||||
if token in stop_token_ids: stopped = True
|
||||
else: stopped = False
|
||||
|
||||
# Yield the output tokens
|
||||
if i % stream_interval == 0 or i == self.config['max_new_tokens'] - 1 or stopped:
|
||||
if echo:
|
||||
tmp_output_ids = output_ids
|
||||
rfind_start = len_prompt
|
||||
if config['temperature'] < 1e-5 or config['top_p'] < 1e-8:
|
||||
token = int(torch.argmax(last_token_logits)) # greedy
|
||||
else:
|
||||
tmp_output_ids = output_ids[input_echo_len:]
|
||||
rfind_start = 0
|
||||
output = self.tokenizer.decode(tmp_output_ids, skip_special_tokens=True, spaces_between_special_tokens=False, clean_up_tokenization_spaces=True)
|
||||
probs = torch.softmax(last_token_logits, dim=-1)
|
||||
indices = torch.multinomial(probs, num_samples=2)
|
||||
token = int(indices.tolist()[0])
|
||||
output_ids.append(token)
|
||||
|
||||
partially_stopped = False
|
||||
if stop:
|
||||
if isinstance(stop, str):
|
||||
pos = output.rfind(stop, rfind_start)
|
||||
if pos != -1: output, stopped = output[:pos], True
|
||||
else: partially_stopped = is_partial_stop(output, stop)
|
||||
elif isinstance(stop, t.Iterable):
|
||||
for each_stop in stop:
|
||||
pos = output.rfind(each_stop, rfind_start)
|
||||
if pos != -1:
|
||||
output, stopped = output[:pos], True
|
||||
break
|
||||
else:
|
||||
partially_stopped = is_partial_stop(output, each_stop)
|
||||
if partially_stopped: break
|
||||
else: raise ValueError('Invalid stop field type.')
|
||||
stopped = token in stop_token_ids
|
||||
|
||||
# Prevent yielding partial stop sequence
|
||||
if not partially_stopped:
|
||||
yield {'text': output, 'usage': {'prompt_tokens': input_echo_len, 'completion_tokens': i, 'total_tokens': input_echo_len + i}, 'finish_reason': None}
|
||||
if stopped: break
|
||||
# Yield the output tokens
|
||||
if i % stream_interval == 0 or i == config['max_new_tokens'] - 1 or stopped:
|
||||
if echo:
|
||||
tmp_output_ids = output_ids
|
||||
rfind_start = len_prompt
|
||||
else:
|
||||
tmp_output_ids = output_ids[input_echo_len:]
|
||||
rfind_start = 0
|
||||
output = self.tokenizer.decode(tmp_output_ids, skip_special_tokens=True, spaces_between_special_tokens=False, clean_up_tokenization_spaces=True)
|
||||
|
||||
# Finish stream event, which contains finish reason
|
||||
if i == self.config['max_new_tokens'] - 1: finish_reason = 'length'
|
||||
elif stopped: finish_reason = 'stop'
|
||||
else: finish_reason = None
|
||||
yield {'text': output, 'usage': {'prompt_tokens': input_echo_len, 'completion_tokens': i, 'total_tokens': input_echo_len + i}, 'finish_reason': finish_reason}
|
||||
partially_stopped = False
|
||||
if stop:
|
||||
if isinstance(stop, str):
|
||||
pos = output.rfind(stop, rfind_start)
|
||||
if pos != -1: output, stopped = output[:pos], True
|
||||
else: partially_stopped = is_partial_stop(output, stop)
|
||||
elif isinstance(stop, t.Iterable):
|
||||
for each_stop in stop:
|
||||
pos = output.rfind(each_stop, rfind_start)
|
||||
if pos != -1:
|
||||
output, stopped = output[:pos], True
|
||||
break
|
||||
else:
|
||||
partially_stopped = is_partial_stop(output, each_stop)
|
||||
if partially_stopped: break
|
||||
else: raise ValueError('Invalid stop field type.')
|
||||
|
||||
# Prevent yielding partial stop sequence
|
||||
if not partially_stopped:
|
||||
yield {'text': output, 'usage': {'prompt_tokens': input_echo_len, 'completion_tokens': i, 'total_tokens': input_echo_len + i}, 'finish_reason': None}
|
||||
if stopped: break
|
||||
else: finish_reason = 'length' # finish stream events
|
||||
if stopped: finish_reason = 'stop'
|
||||
yield {'text': output, 'usage': {'prompt_tokens': input_echo_len, 'completion_tokens': i, 'total_tokens': input_echo_len + i}, 'finish_reason': finish_reason}
|
||||
# Clean
|
||||
del past_key_values, out
|
||||
gc.collect()
|
||||
@@ -1192,7 +1189,6 @@ def llm_runnable_class(self: LLM[M, T], embeddings_sig: ModelSignature, generate
|
||||
def generate(__self: _Runnable, prompt: str, **attrs: t.Any) -> list[t.Any]:
|
||||
adapter_name = attrs.pop('adapter_name', None)
|
||||
if adapter_name is not None: __self.set_adapter(adapter_name)
|
||||
if __self.backend == 'vllm': attrs.setdefault('request_id', openllm_core.utils.gen_random_uuid())
|
||||
return self.generate(prompt, **attrs)
|
||||
|
||||
@bentoml.Runnable.method(**method_signature(generate_sig)) # type: ignore
|
||||
@@ -1207,8 +1203,7 @@ def llm_runnable_class(self: LLM[M, T], embeddings_sig: ModelSignature, generate
|
||||
if adapter_name is not None: __self.set_adapter(adapter_name)
|
||||
pre = 0
|
||||
for outputs in self.generate_iterator(prompt, request_id=openllm_core.utils.gen_random_uuid(), **attrs):
|
||||
output_text = outputs['text'][0] if __self.backend == 'vllm' else outputs['text']
|
||||
output_text = output_text.strip().split(' ')
|
||||
output_text = outputs['text'].strip().split(' ')
|
||||
now = len(output_text) - 1
|
||||
if now > pre:
|
||||
yield ' '.join(output_text[pre:now]) + ' '
|
||||
@@ -1216,13 +1211,81 @@ def llm_runnable_class(self: LLM[M, T], embeddings_sig: ModelSignature, generate
|
||||
yield ' '.join(output_text[pre:]) + ' '
|
||||
return ' '.join(output_text) + ' '
|
||||
|
||||
return types.new_class(
|
||||
self.__class__.__name__ + 'Runnable', (_Runnable,), {},
|
||||
lambda ns: ns.update({
|
||||
'SUPPORTED_RESOURCES': ('nvidia.com/gpu', 'amd.com/gpu') if self.config['requires_gpu'] else ('nvidia.com/gpu', 'amd.com/gpu', 'cpu'),
|
||||
'__module__': self.__module__,
|
||||
'__doc__': self.config['env'].start_docstring
|
||||
}))
|
||||
@bentoml.Runnable.method(**method_signature(generate_sig)) # type: ignore
|
||||
def vllm_generate(__self: _Runnable, prompt: str, **attrs: t.Any) -> list[t.Any]:
|
||||
stop: str | t.Iterable[str] | None = attrs.pop('stop', None)
|
||||
stop_token_ids: list[int] | None = attrs.pop('stop_token_ids', None)
|
||||
adapter_name = attrs.pop('adapter_name', None)
|
||||
if adapter_name is not None: __self.set_adapter(adapter_name)
|
||||
request_id: str | None = attrs.pop('request_id', None)
|
||||
if request_id is None: raise ValueError('request_id must not be None.')
|
||||
|
||||
if stop_token_ids is None: stop_token_ids = []
|
||||
stop_token_ids.append(self.tokenizer.eos_token_id)
|
||||
stop_: set[str] = set()
|
||||
if isinstance(stop, str) and stop != '': stop_.add(stop)
|
||||
elif isinstance(stop, list) and stop != []: stop_.update(stop)
|
||||
for tid in stop_token_ids:
|
||||
if tid: stop_.add(self.tokenizer.decode(tid))
|
||||
|
||||
if self.config['temperature'] <= 1e-5: top_p = 1.0
|
||||
else: top_p = self.config['top_p']
|
||||
config = self.config.model_construct_env(stop=list(stop_), top_p=top_p, **attrs)
|
||||
sampling_params = config.to_sampling_config()
|
||||
|
||||
async def loop() -> list[str]:
|
||||
async for request_output in t.cast('vllm.AsyncLLMEngine', self.model).generate(prompt=prompt, sampling_params=sampling_params, request_id=request_id):
|
||||
pass
|
||||
return [output.text for output in request_output.outputs]
|
||||
|
||||
try:
|
||||
return asyncio.run(loop())
|
||||
except RuntimeError:
|
||||
try:
|
||||
return ensure_exec_coro(loop())
|
||||
except Exception:
|
||||
raise
|
||||
|
||||
@bentoml.Runnable.method(**method_signature(generate_iterator_sig)) # type: ignore
|
||||
async def vllm_generate_iterator(__self: _Runnable, prompt: str, **attrs: t.Any) -> t.AsyncGenerator[str, None]:
|
||||
# TODO: System prompt support
|
||||
pre = 0
|
||||
prompt = process_prompt(prompt, None, False)
|
||||
echo = attrs.pop('echo', False)
|
||||
stop: str | t.Iterable[str] | None = attrs.pop('stop', None)
|
||||
stop_token_ids: list[int] | None = attrs.pop('stop_token_ids', None)
|
||||
adapter_name = attrs.pop('adapter_name', None)
|
||||
if adapter_name is not None: __self.set_adapter(adapter_name)
|
||||
request_id: str | None = attrs.pop('request_id', None)
|
||||
if request_id is None: raise ValueError('request_id must not be None.')
|
||||
|
||||
if stop_token_ids is None: stop_token_ids = []
|
||||
stop_token_ids.append(self.tokenizer.eos_token_id)
|
||||
stop_: set[str] = set()
|
||||
if isinstance(stop, str) and stop != '': stop_.add(stop)
|
||||
elif isinstance(stop, list) and stop != []: stop_.update(stop)
|
||||
for tid in stop_token_ids:
|
||||
if tid: stop_.add(self.tokenizer.decode(tid))
|
||||
|
||||
if self.config['temperature'] <= 1e-5: top_p = 1.0
|
||||
else: top_p = self.config['top_p']
|
||||
config = self.config.model_construct_env(stop=list(stop_), top_p=top_p, **attrs)
|
||||
sampling_params = config.to_sampling_config()
|
||||
async for request_output in t.cast('vllm.AsyncLLMEngine', self.model).generate(prompt=prompt, sampling_params=sampling_params, request_id=request_id):
|
||||
if echo: text_outputs = [prompt + output.text for output in request_output.outputs]
|
||||
else: text_outputs = [output.text for output in request_output.outputs]
|
||||
output_text = text_outputs[0]
|
||||
output_text = output_text.strip().split(' ')
|
||||
now = len(output_text) - 1
|
||||
if now > pre:
|
||||
yield ' '.join(output_text[pre:now]) + ' '
|
||||
pre = now
|
||||
yield ' '.join(output_text[pre:]) + ' '
|
||||
|
||||
return types.new_class(self.__class__.__name__ + 'Runnable', (_Runnable,), {},
|
||||
lambda ns: ns.update({
|
||||
'SUPPORTED_RESOURCES': ('nvidia.com/gpu', 'amd.com/gpu', 'cpu'), '__module__': self.__module__, '__doc__': self.config['env'].start_docstring
|
||||
}))
|
||||
|
||||
def llm_runner_class(self: LLM[M, T]) -> type[LLMRunner[M, T]]:
|
||||
def available_adapters(_: LLMRunner[M, T]) -> PeftAdapterOutput:
|
||||
|
||||
@@ -47,14 +47,24 @@ _JsonInput = bentoml.io.JSON.from_sample({'prompt': '', 'llm_config': llm_config
|
||||
async def generate_v1(input_dict: dict[str, t.Any]) -> openllm.GenerationOutput:
|
||||
qa_inputs = openllm.GenerationInput.from_llm_config(llm_config)(**input_dict)
|
||||
config = qa_inputs.llm_config.model_dump()
|
||||
responses = await runner.generate.async_run(qa_inputs.prompt, **{'adapter_name': qa_inputs.adapter_name, **config})
|
||||
if runner.backend == 'vllm':
|
||||
responses = await runner.vllm_generate.async_run(qa_inputs.prompt, adapter_name=qa_inputs.adapter_name, request_id=openllm_core.utils.gen_random_uuid(), **config)
|
||||
else:
|
||||
responses = await runner.generate.async_run(qa_inputs.prompt, adapter_name=qa_inputs.adapter_name, **config)
|
||||
return openllm.GenerationOutput(responses=responses, configuration=config)
|
||||
|
||||
@svc.api(route='/v1/generate_stream', input=_JsonInput, output=bentoml.io.Text(content_type='text/event-stream'))
|
||||
async def generate_stream_v1(input_dict: dict[str, t.Any]) -> t.AsyncGenerator[str, None]:
|
||||
echo = input_dict.pop('echo', False)
|
||||
qa_inputs = openllm.GenerationInput.from_llm_config(llm_config)(**input_dict)
|
||||
return runner.generate_iterator.async_stream(qa_inputs.prompt, adapter_name=qa_inputs.adapter_name, echo=echo, **qa_inputs.llm_config.model_dump())
|
||||
if runner.backend == 'vllm':
|
||||
return runner.vllm_generate_iterator.async_stream(qa_inputs.prompt,
|
||||
adapter_name=qa_inputs.adapter_name,
|
||||
echo=echo,
|
||||
request_id=openllm_core.utils.gen_random_uuid(),
|
||||
**qa_inputs.llm_config.model_dump())
|
||||
else:
|
||||
return runner.generate_iterator.async_stream(qa_inputs.prompt, adapter_name=qa_inputs.adapter_name, echo=echo, **qa_inputs.llm_config.model_dump())
|
||||
|
||||
@svc.api(route='/v1/metadata',
|
||||
input=bentoml.io.Text(),
|
||||
|
||||
@@ -112,15 +112,7 @@ Available official model_id(s): [default: {llm_config['default_id']}]
|
||||
|
||||
\b
|
||||
{orjson.dumps(llm_config['model_ids'], option=orjson.OPT_INDENT_2).decode()}
|
||||
''',
|
||||
)
|
||||
|
||||
if llm_config['requires_gpu'] and openllm.utils.device_count() < 1:
|
||||
# NOTE: The model requires GPU, therefore we will return a dummy command
|
||||
command_attrs.update({
|
||||
'short_help': '(Disabled because there is no GPU available)', 'help': f'{model} is currently not available to run on your local machine because it requires GPU for inference.'
|
||||
})
|
||||
return noop_command(group, llm_config, _serve_grpc, **command_attrs)
|
||||
''')
|
||||
|
||||
@group.command(**command_attrs)
|
||||
@start_decorator(llm_config, serve_grpc=_serve_grpc)
|
||||
@@ -230,19 +222,6 @@ Available official model_id(s): [default: {llm_config['default_id']}]
|
||||
|
||||
return start_cmd
|
||||
|
||||
def noop_command(group: click.Group, llm_config: LLMConfig, _serve_grpc: bool, **command_attrs: t.Any) -> click.Command:
|
||||
context_settings = command_attrs.pop('context_settings', {})
|
||||
context_settings.update({'ignore_unknown_options': True, 'allow_extra_args': True})
|
||||
command_attrs['context_settings'] = context_settings
|
||||
# NOTE: The model requires GPU, therefore we will return a dummy command
|
||||
@group.command(**command_attrs)
|
||||
def noop(**_: t.Any) -> LLMConfig:
|
||||
termui.echo('No GPU available, therefore this command is disabled', fg='red')
|
||||
openllm.utils.analytics.track_start_init(llm_config)
|
||||
return llm_config
|
||||
|
||||
return noop
|
||||
|
||||
def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callable[[FC], t.Callable[[FC], FC]]:
|
||||
def wrapper(fn: FC) -> t.Callable[[FC], FC]:
|
||||
composed = openllm.utils.compose(
|
||||
|
||||
@@ -651,8 +651,6 @@ def models_command(ctx: click.Context, output: LiteralOutput, show_available: bo
|
||||
json_data[m] = {
|
||||
'architecture': config['architecture'],
|
||||
'model_id': config['model_ids'],
|
||||
'cpu': not config['requires_gpu'],
|
||||
'gpu': True,
|
||||
'backend': backend,
|
||||
'installation': f'"openllm[{m}]"' if m in OPTIONAL_DEPENDENCIES or config['requirements'] else 'openllm',
|
||||
}
|
||||
@@ -680,13 +678,11 @@ def models_command(ctx: click.Context, output: LiteralOutput, show_available: bo
|
||||
import tabulate
|
||||
|
||||
tabulate.PRESERVE_WHITESPACE = True
|
||||
# llm, architecture, url, model_id, installation, cpu, gpu, backend
|
||||
data: list[str | tuple[str, str, list[str], str, LiteralString, LiteralString, tuple[LiteralBackend, ...]]] = []
|
||||
# llm, architecture, url, model_id, installation, backend
|
||||
data: list[str | tuple[str, str, list[str], str, tuple[LiteralBackend, ...]]] = []
|
||||
for m, v in json_data.items():
|
||||
data.extend([(m, v['architecture'], v['model_id'], v['installation'], '❌' if not v['cpu'] else '✅', '✅', v['backend'],)])
|
||||
column_widths = [
|
||||
int(termui.COLUMNS / 12), int(termui.COLUMNS / 6), int(termui.COLUMNS / 4), int(termui.COLUMNS / 12), int(termui.COLUMNS / 12), int(termui.COLUMNS / 12), int(termui.COLUMNS / 4),
|
||||
]
|
||||
data.extend([(m, v['architecture'], v['model_id'], v['installation'], v['backend'])])
|
||||
column_widths = [int(termui.COLUMNS / 12), int(termui.COLUMNS / 6), int(termui.COLUMNS / 4), int(termui.COLUMNS / 6), int(termui.COLUMNS / 4)]
|
||||
|
||||
if len(data) == 0 and len(failed_initialized) > 0:
|
||||
termui.echo('Exception found while parsing models:\n', fg='yellow')
|
||||
@@ -868,6 +864,7 @@ def query_command(
|
||||
res = client.query(prompt, return_response='raw', **{**client.configuration, **_memoized})
|
||||
if output == 'pretty':
|
||||
response = client.config.postprocess_generate(prompt, res['responses'])
|
||||
if isinstance(response, dict) and 'text' in response: response = response['text']
|
||||
termui.echo('\n\n==Responses==\n', fg='white')
|
||||
termui.echo(response, fg=generated_fg)
|
||||
elif output == 'json':
|
||||
|
||||
@@ -14,19 +14,3 @@ class GPTNeoX(openllm.LLM['transformers.GPTNeoXForCausalLM', 'transformers.GPTNe
|
||||
def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]:
|
||||
import torch
|
||||
return {'device_map': 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None}, {}
|
||||
|
||||
def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.GPTNeoXForCausalLM:
|
||||
import transformers
|
||||
model = transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, *args, **attrs)
|
||||
if self.config.use_half_precision: model.half()
|
||||
return model
|
||||
|
||||
def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
|
||||
import torch
|
||||
with torch.inference_mode():
|
||||
return self.tokenizer.batch_decode(
|
||||
self.model.generate(self.tokenizer(prompt, return_tensors='pt').to(self.device).input_ids,
|
||||
do_sample=True,
|
||||
generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
|
||||
pad_token_id=self.tokenizer.eos_token_id,
|
||||
stopping_criteria=openllm.StoppingCriteriaList([openllm.StopOnTokens()])))
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
from __future__ import annotations
|
||||
import logging
|
||||
import typing as t
|
||||
|
||||
import bentoml
|
||||
@@ -31,16 +30,3 @@ class StarCoder(openllm.LLM['transformers.GPTBigCodeForCausalLM', 'transformers.
|
||||
return bentoml.transformers.save_model(self.tag, model, custom_objects={'tokenizer': tokenizer}, labels=generate_labels(self))
|
||||
finally:
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
|
||||
import torch
|
||||
with torch.inference_mode():
|
||||
# eos_token_id=self.tokenizer.convert_tokens_to_ids("<|end|>"), # NOTE: this is for finetuning starcoder
|
||||
# NOTE: support fine-tuning starcoder
|
||||
result_tensor = self.model.generate(self.tokenizer.encode(prompt, return_tensors='pt').to(self.device),
|
||||
do_sample=True,
|
||||
pad_token_id=self.tokenizer.eos_token_id,
|
||||
generation_config=self.config.model_construct_env(**attrs).to_generation_config())
|
||||
# TODO: We will probably want to return the tokenizer here so that we can manually process this
|
||||
# return (skip_special_tokens=False, clean_up_tokenization_spaces=False))
|
||||
return self.tokenizer.batch_decode(result_tensor[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
|
||||
|
||||
@@ -20,7 +20,6 @@ def model_settings(draw: st.DrawFn):
|
||||
'model_ids': st.lists(st.text(), min_size=1),
|
||||
'architecture': st.text(min_size=1),
|
||||
'url': st.text(),
|
||||
'requires_gpu': st.booleans(),
|
||||
'trust_remote_code': st.booleans(),
|
||||
'requirements': st.none() | st.lists(st.text(), min_size=1),
|
||||
'default_backend': st.dictionaries(st.sampled_from(['cpu', 'nvidia.com/gpu']), st.sampled_from(['vllm', 'pt', 'tf', 'flax'])),
|
||||
|
||||
Reference in New Issue
Block a user