feat: continuous batching with vLLM (#349)

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

* feat: continuous batching

Signed-off-by: paperspace <29749331+aarnphm@users.noreply.github.com>

* chore: add changelog

Signed-off-by: paperspace <29749331+aarnphm@users.noreply.github.com>

* chore: add one-shot generation

Signed-off-by: paperspace <29749331+aarnphm@users.noreply.github.com>

---------

Signed-off-by: paperspace <29749331+aarnphm@users.noreply.github.com>
Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
Author: Aaron Pham
Committed: 2023-09-14 03:09:36 -04:00 (via GitHub)
Parent: e35e143093
Commit: ad9107958d
22 changed files with 336 additions and 232 deletions


@@ -60,12 +60,14 @@ repos:
openllm-client/src/openllm_client/pb.*|
.github/.*|
cz.py |
bench.py |
hatch_build.py
)$
additional_dependencies:
- click==8.1.3
- peft
- bentoml==1.1.1
- build==0.10.0
- transformers>=4.31.0
- pandas-stubs
- types-psutil

bench.py (new executable file, +139 lines)

@@ -0,0 +1,139 @@
from __future__ import annotations
import asyncio
import json
import aiohttp
import openllm
async def send_request(url, prompt, session, model, **attrs):
headers = {'accept': 'application/json', 'Content-Type': 'application/json'}
config = openllm.AutoConfig.for_model(model).model_construct_env(**attrs).model_dump()
data = {'prompt': prompt, 'llm_config': config, 'adapter_name': None}
async with session.post(url, headers=headers, data=json.dumps(data)) as response:
result = await response.text()
print('-' * 10 + '\n\n prompt:', prompt, '\nGeneration:', result, '\n\n' + '-' * 10)
async def main():
url = 'http://localhost:3000/v1/generate_stream'
# 114 benchmark prompts
prompts = [
'What is the meaning of life?',
'Explain the concept of quantum entanglement.',
'Describe the process of photosynthesis.',
'What are the benefits of regular exercise?',
'How does the internet work?',
'Discuss the impact of climate change on ecosystems.',
'Explain the principles of supply and demand in economics.',
'What is the history of the Roman Empire?',
'Describe the structure of a cell.',
'Discuss the pros and cons of renewable energy sources.',
'Explain the theory of relativity.',
'What is the role of DNA in genetics?',
'Describe the art movement of the Renaissance.',
'Discuss the causes of World War I.',
'What are the major functions of the human brain?',
'Explain the process of evolution by natural selection.',
'Describe the cultural significance of the Great Wall of China.',
'What is the impact of social media on society?',
'Discuss the life and works of Shakespeare.',
'Explain the concept of artificial intelligence.',
'What are the different types of chemical reactions?',
"Describe the structure of the Earth's atmosphere.",
'Discuss the history of the civil rights movement.',
'What are the economic implications of globalization?',
'Explain the principles of good nutrition.',
'Describe the major functions of the immune system.',
'Discuss the impact of colonialism on Africa.',
'What is the process of cellular respiration?',
'Explain the importance of biodiversity.',
'Discuss the causes and consequences of the Industrial Revolution.',
'What are the fundamental principles of democracy?',
'Describe the major components of a computer.',
'Explain the concept of human rights.',
'What is the role of enzymes in biological reactions?',
'Discuss the history of space exploration.',
'What are the ethical considerations in medical research?',
'Describe the cultural significance of the Pyramids of Egypt.',
'Explain the principles of classical physics.',
'What is the impact of climate change on weather patterns?',
'Discuss the major events of the American Revolution.',
'What are the effects of pollution on the environment?',
'Describe the process of protein synthesis.',
'Explain the concept of sustainable agriculture.',
'What is the history of the European Union?',
'Discuss the impact of the Renaissance on art and culture.',
'What are the key principles of marketing?',
'Explain the structure of the periodic table.',
'Describe the major types of renewable energy.',
'Discuss the causes and consequences of the French Revolution.',
'What is the role of the United Nations in international relations?',
'Explain the principles of game theory in economics.',
'What are the stages of human development?',
'Describe the cultural significance of the Taj Mahal.',
'Discuss the major themes in the works of Ernest Hemingway.',
'What is the impact of automation on the workforce?',
'Explain the concept of genetic engineering.',
'What are the different types of chemical bonds?',
"Describe the layers of the Earth's atmosphere.",
"Discuss the history of the women's suffrage movement.",
'What are the economic factors influencing consumer behavior?',
'Explain the principles of conflict resolution.',
'What is the role of neurotransmitters in the nervous system?',
'Discuss the impact of colonialism on India.',
'What is the process of mitosis?',
'Explain the importance of water conservation.',
'Describe the cultural significance of the Acropolis in Athens.',
'Discuss the major philosophical ideas of Plato.',
'What are the principles of investment in finance?',
'Explain the structure of a virus.',
'What is the history of the United Nations?',
'Discuss the impact of technology on modern art.',
'What are the key concepts in cognitive psychology?',
'Describe the major types of non-renewable energy sources.',
'Explain the causes and consequences of the Russian Revolution.',
'What is the role of the World Health Organization in global health?',
'Discuss the principles of ethics in business.',
'What are the stages of the water cycle?',
'Explain the concept of social justice.',
'What is the impact of deforestation on climate change?',
'Describe the process of meiosis.',
'Discuss the cultural significance of the Sistine Chapel ceiling.',
'What are the major themes in the novels of Jane Austen?',
'Explain the role of branding in marketing.',
'What is the history of the Internet?',
'Discuss the impact of artificial intelligence on society.',
'What are the principles of statistical analysis in research?',
'Explain the structure of an atom.',
'What is the significance of the Theory of Evolution by Charles Darwin?',
'Describe the major types of renewable energy.',
'Discuss the causes and consequences of the American Civil War.',
'What is the role of the International Monetary Fund in global economics?',
'Explain the principles of environmental conservation.',
'What are the stages of the rock cycle?',
'Describe the concept of cultural relativism.',
'Discuss the major contributions of Leonardo da Vinci to art and science.',
'What is the impact of globalization on cultural diversity?',
'Explain the process of genetic inheritance.',
'What are the different forms of government in the world?',
'Describe the major types of pollution.',
'Discuss the history of the labor movement.',
'What are the principles of sustainable urban planning?',
'Explain the role of hormones in the endocrine system.',
'What is the cultural significance of the Great Barrier Reef?',
'Discuss the major ideas of Friedrich Nietzsche.',
'What is the impact of social media on political movements?',
'Explain the concept of quantum computing.',
'What are the principles of international diplomacy?',
'Describe the major types of ocean ecosystems.',
'Discuss the causes and consequences of the Cold War.',
'What is the role of the World Trade Organization in global trade?',
'Explain the principles of behavioral psychology.',
'What are the stages of the nitrogen cycle?',
'Describe the concept of cultural appropriation.',
'Discuss the major works of Vincent van Gogh.',
]
async with aiohttp.ClientSession() as session:
await asyncio.gather(*[send_request(url, prompt, session, 'llama', max_new_tokens=4096, top_p=0.21) for prompt in prompts])
if __name__ == '__main__': asyncio.run(main())
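
bench.py above awaits each full response body. As a complementary illustration, here is a hedged sketch that consumes the stream incrementally instead. It reuses the request shape from bench.py; the server address, model name, and generation settings are illustrative, and incremental delivery depends on the server actually streaming the response.

# Sketch: consume /v1/generate_stream chunk by chunk (assumes a local OpenLLM server on port 3000).
from __future__ import annotations
import asyncio
import json

import aiohttp
import openllm

async def stream_one(prompt: str) -> None:
    url = 'http://localhost:3000/v1/generate_stream'  # illustrative address
    headers = {'accept': 'application/json', 'Content-Type': 'application/json'}
    # Same payload shape as bench.py: prompt, full llm_config dump, optional adapter name.
    config = openllm.AutoConfig.for_model('llama').model_construct_env(max_new_tokens=256).model_dump()
    data = {'prompt': prompt, 'llm_config': config, 'adapter_name': None}
    async with aiohttp.ClientSession() as session:
        async with session.post(url, headers=headers, data=json.dumps(data)) as resp:
            # The runner yields space-delimited text chunks; print them as they arrive.
            async for chunk in resp.content.iter_any():
                print(chunk.decode('utf-8'), end='', flush=True)

if __name__ == '__main__':
    asyncio.run(stream_one('Describe the process of photosynthesis.'))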

changelog.d/349.feat.md (new file, +3 lines)

@@ -0,0 +1,3 @@
Added support for continuous batching via vLLM.
Current benchmarks show roughly 1218 TPS with 100 concurrent requests on a single A100 running meta-llama/Llama-2-13b-chat-hf.
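
For context, a minimal engine-side sketch of the continuous-batching pattern this change adopts. It is a sketch under assumptions, not part of the commit: it assumes vllm>=0.1.7 as pinned below, the model id, sampling values, and prompts are illustrative, and only the AsyncEngineArgs/AsyncLLMEngine.generate signatures mirror the diff further down.

# Sketch: concurrent requests against one vLLM AsyncLLMEngine (assumes vllm>=0.1.7).
# The engine schedules all in-flight requests together (continuous batching) and
# streams partial RequestOutput objects per request id.
import asyncio
import uuid

import vllm

async def main() -> None:
    engine = vllm.AsyncLLMEngine.from_engine_args(
        vllm.AsyncEngineArgs(model='meta-llama/Llama-2-13b-chat-hf',  # illustrative model id
                             tokenizer_mode='auto',
                             dtype='auto',
                             worker_use_ray=False,
                             engine_use_ray=False))
    params = vllm.SamplingParams(temperature=0.75, top_p=0.9, max_tokens=256)  # illustrative values

    async def run_one(prompt: str) -> str:
        text = ''
        # Each request needs a unique id; its outputs are streamed incrementally.
        async for out in engine.generate(prompt=prompt, sampling_params=params, request_id=uuid.uuid4().hex):
            text = out.outputs[0].text
        return text

    results = await asyncio.gather(run_one('What is the meaning of life?'),
                                   run_one('Explain the concept of quantum entanglement.'))
    for r in results:
        print(r)

if __name__ == '__main__':
    asyncio.run(main())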


@@ -465,7 +465,6 @@ class ModelSettings(t.TypedDict, total=False):
# meta
url: str
requires_gpu: bool
trust_remote_code: bool
service_name: NotRequired[str]
requirements: t.Optional[ListStr]
@@ -523,7 +522,6 @@ class _ModelSettingsAttr:
'cpu': 'pt', 'nvidia.com/gpu': 'pt'
},
name_type='dasherize',
requires_gpu=False,
url='',
model_type='causal_lm',
trust_remote_code=False,
@@ -541,7 +539,6 @@ class _ModelSettingsAttr:
architecture: str
default_backend: t.Dict[LiteralResourceSpec, LiteralBackend]
url: str
requires_gpu: bool
trust_remote_code: bool
service_name: str
requirements: t.Optional[ListStr]
@@ -737,8 +734,6 @@ class _ConfigAttr:
'''The default backend to run LLM based on available accelerator. By default, it will be PyTorch (pt) for most models. For some models, such as Llama, it will use `vllm` or `flax`. It is a dictionary of key as the accelerator spec in k8s ('cpu', 'nvidia.com/gpu', 'amd.com/gpu', 'cloud-tpus.google.com/v2', ...) and the values as supported OpenLLM backend ('flax', 'tf', 'pt', 'vllm', 'ggml', 'mlc')'''
__openllm_url__: str = Field(None)
'''The resolved url for this LLMConfig.'''
__openllm_requires_gpu__: bool = Field(None)
'''Determines if this model is only available on GPU. By default it supports GPU and fallback to CPU.'''
__openllm_trust_remote_code__: bool = Field(None)
'''Whether to always trust remote code'''
__openllm_service_name__: str = Field(None)
@@ -932,7 +927,6 @@ class LLMConfig(_ConfigAttr):
__config__ = {
"name_type": "lowercase",
"trust_remote_code": True,
"requires_gpu": True,
"timeout": 3600000,
"url": "https://falconllm.tii.ae/",
"requirements": ["einops", "xformers", "safetensors"],
@@ -1108,8 +1102,6 @@ class LLMConfig(_ConfigAttr):
@overload
def __getitem__(self, item: t.Literal['url']) -> str: ...
@overload
def __getitem__(self, item: t.Literal['requires_gpu']) -> bool: ...
@overload
def __getitem__(self, item: t.Literal['trust_remote_code']) -> bool: ...
@overload
def __getitem__(self, item: t.Literal['service_name']) -> str: ...


@@ -25,7 +25,7 @@ if t.TYPE_CHECKING:
from .utils.lazy import VersionInfo
M = t.TypeVar('M', bound='t.Union[transformers.PreTrainedModel, transformers.Pipeline, transformers.TFPreTrainedModel, transformers.FlaxPreTrainedModel, vllm.LLMEngine, peft.PeftModel]')
M = t.TypeVar('M', bound='t.Union[transformers.PreTrainedModel, transformers.Pipeline, transformers.TFPreTrainedModel, transformers.FlaxPreTrainedModel, vllm.AsyncLLMEngine, peft.PeftModel]')
T = t.TypeVar('T', bound='t.Union[transformers.PreTrainedTokenizerFast, transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerBase]')
def get_literal_args(typ: t.Any) -> tuple[str, ...]:
@@ -94,7 +94,9 @@ class LLMRunnable(bentoml.Runnable, t.Generic[M, T]):
embeddings: RunnableMethod[LLMRunnable[M, T], [list[str]], EmbeddingsOutput]
generate: RunnableMethod[LLMRunnable[M, T], [str], list[t.Any]]
generate_one: RunnableMethod[LLMRunnable[M, T], [str, list[str]], t.Sequence[dict[t.Literal['generated_text'], str]]]
generate_iterator: RunnableMethod[LLMRunnable[M, T], [str], t.Generator[str, None, str]]
generate_iterator: RunnableMethod[LLMRunnable[M, T], [str], t.Iterator[t.Any]]
vllm_generate: RunnableMethod[LLMRunnable[M, T], [str], list[t.Any]]
vllm_generate_iterator: RunnableMethod[LLMRunnable[M, T], [str], t.AsyncGenerator[str, None]]
class LLMRunner(bentoml.Runner, t.Generic[M, T]):
__doc__: str
@@ -111,7 +113,9 @@ class LLMRunner(bentoml.Runner, t.Generic[M, T]):
embeddings: RunnerMethod[LLMRunnable[M, T], [list[str]], t.Sequence[EmbeddingsOutput]]
generate: RunnerMethod[LLMRunnable[M, T], [str], list[t.Any]]
generate_one: RunnerMethod[LLMRunnable[M, T], [str, list[str]], t.Sequence[dict[t.Literal['generated_text'], str]]]
generate_iterator: RunnerMethod[LLMRunnable[M, T], [str], t.Generator[str, None, str]]
generate_iterator: RunnerMethod[LLMRunnable[M, T], [str], t.Iterator[t.Any]]
vllm_generate: RunnerMethod[LLMRunnable[M, T], [str], list[t.Any]]
vllm_generate_iterator: RunnerMethod[LLMRunnable[M, T], [str], t.AsyncGenerator[str, None]]
def __init__(self,
runnable_class: type[LLMRunnable[M, T]],


@@ -40,7 +40,6 @@ class BaichuanConfig(openllm_core.LLMConfig):
'name_type': 'lowercase',
'trust_remote_code': True,
'timeout': 3600000,
'requires_gpu': True,
'url': 'https://github.com/baichuan-inc/Baichuan-7B',
'requirements': ['cpm-kernels', 'sentencepiece'],
'architecture': 'BaiChuanForCausalLM',


@@ -44,7 +44,6 @@ class ChatGLMConfig(openllm_core.LLMConfig):
'name_type': 'lowercase',
'trust_remote_code': True,
'timeout': 3600000,
'requires_gpu': True,
'url': 'https://github.com/THUDM/ChatGLM-6B',
'requirements': ['cpm-kernels', 'sentencepiece'],
'architecture': 'ChatGLMForConditionalGeneration',


@@ -41,7 +41,6 @@ class FalconConfig(openllm_core.LLMConfig):
__config__ = {
'name_type': 'lowercase',
'trust_remote_code': True,
'requires_gpu': True,
'timeout': int(36e6),
'url': 'https://falconllm.tii.ae/',
'requirements': ['einops', 'xformers'],


@@ -45,7 +45,6 @@ class GPTNeoXConfig(openllm_core.LLMConfig):
__config__ = {
'model_name': 'gpt_neox',
'start_name': 'gpt-neox',
'requires_gpu': True,
'architecture': 'GPTNeoXForCausalLM',
'url': 'https://github.com/EleutherAI/gpt-neox',
'default_id': 'eleutherai/gpt-neox-20b',


@@ -10,7 +10,7 @@ START_OPT_COMMAND_DOCSTRING = '''\
Run a LLMServer for OPT model.
\b
> See more information about falcon at [facebook/opt-66b](https://huggingface.co/facebook/opt-66b)
> See more information about OPT at [facebook/opt-66b](https://huggingface.co/facebook/opt-66b)
\b
## Usage


@@ -36,11 +36,9 @@ class StarCoderConfig(openllm_core.LLMConfig):
"""
__config__ = {
'name_type': 'lowercase',
'requires_gpu': True,
'url': 'https://github.com/bigcode-project/starcoder',
'architecture': 'GPTBigCodeForCausalLM',
'requirements': ['bitsandbytes'],
'workers_per_resource': 0.5,
'default_id': 'bigcode/starcoder',
'model_ids': ['bigcode/starcoder', 'bigcode/starcoderbase']
}


@@ -115,7 +115,7 @@ openai = ["openai", "tiktoken"]
opt = ["flax>=0.7", "jax", "jaxlib", "tensorflow", "keras"]
playground = ["jupyter", "notebook", "ipython", "jupytext", "nbformat"]
starcoder = ["bitsandbytes"]
vllm = ["vllm>=0.1.6", "ray"]
vllm = ["vllm>=0.1.7", "ray"]
[tool.hatch.version]
fallback-version = "0.0.0"


@@ -9,7 +9,6 @@ import openllm
from openllm.exceptions import OpenLLMException
from openllm_core._configuration import _object_getattribute
from openllm_core._configuration import _setattr_class
from openllm_core._schema import unmarshal_vllm_outputs
from openllm_core._typing_compat import DictStrAny
from openllm_core._typing_compat import ListStr
from openllm_core._typing_compat import M
@@ -22,6 +21,7 @@ from openllm_core.utils import LazyLoader
from openllm_core.utils import codegen
from openllm_core.utils import device_count
from openllm_core.utils import first_not_none
from openllm_core.utils import get_debug_mode
from openllm_core.utils import is_torch_available
if t.TYPE_CHECKING:
@@ -38,35 +38,37 @@ else:
def import_model(fn: import_model_protocol[bentoml.Model, M, T]) -> t.Callable[[LLM[M, T]], bentoml.Model]:
@functools.wraps(fn)
def inner(self: LLM[M, T], *decls: t.Any, trust_remote_code: bool | None = None, **attrs: t.Any) -> bentoml.Model:
trust_remote_code = first_not_none(trust_remote_code, default=self.trust_remote_code)
(model_decls, model_attrs), _ = self.llm_parameters
decls = (*model_decls, *decls)
attrs = {**model_attrs, **attrs}
return fn(self, *decls, trust_remote_code=trust_remote_code, **attrs)
return fn(self, *decls, trust_remote_code=first_not_none(trust_remote_code, default=self.trust_remote_code), **attrs)
return inner
def load_model(fn: load_model_protocol[M, T]) -> t.Callable[[LLM[M, T]], M | vllm.LLMEngine]:
def load_model(fn: load_model_protocol[M, T]) -> t.Callable[[LLM[M, T]], M | vllm.AsyncLLMEngine]:
@functools.wraps(fn)
def inner(self: LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M | vllm.LLMEngine:
def inner(self: LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M | vllm.AsyncLLMEngine:
if self.__llm_backend__ == 'vllm':
num_gpus, dev = 1, device_count()
if dev >= 2: num_gpus = min(dev // 2 * 2, dev)
# TODO: Do some more processing with token_id once we support token streaming
try:
return vllm.LLMEngine.from_engine_args(
vllm.EngineArgs(model=self._bentomodel.path,
tokenizer=self._bentomodel.path if self.tokenizer_id == 'local' else self.tokenizer_id,
tokenizer_mode='auto',
tensor_parallel_size=num_gpus,
dtype='auto',
worker_use_ray=False))
return vllm.AsyncLLMEngine.from_engine_args(
vllm.AsyncEngineArgs(model=self._bentomodel.path,
tokenizer=self._bentomodel.path if self.tokenizer_id == 'local' else self.tokenizer_id,
tokenizer_mode='auto',
tensor_parallel_size=num_gpus,
dtype='auto',
disable_log_requests=not get_debug_mode(),
worker_use_ray=False,
engine_use_ray=False))
except Exception as err:
traceback.print_exc()
raise OpenLLMException(f'Failed to initialise vLLMEngine due to the following error:\n{err}') from None
else:
(model_decls, model_attrs), _ = self.llm_parameters
return fn(self, *(*model_decls, *decls), **{**model_attrs, **attrs})
decls = (*model_decls, *decls)
attrs = {**model_attrs, **attrs}
return fn(self, *decls, **attrs)
return inner
@@ -108,12 +110,6 @@ def make_llm_attributes(cls: type[LLM[M, T]]) -> t.Callable[[type[LLM[M, T]]], N
func_call = f"_impl_{cls.__name__}_{func}={cached_func_name} if {cached_func_name} is not _cached_LLMSerialisation_get('{func}') else __serialisation_{func}"
lines.extend([f'{cached_func_name}=cls.{func}', func_call, _setattr_class(func, f'{impl_name}(_impl_{cls.__name__}_{func})')])
# assign vLLM implementation
if cls.__llm_backend__ == 'vllm':
vllm_func = {f'_vllm_{it}': fn for it, fn in zip(('generate', 'generate_iterator', 'postprocess_generate'), (vllm_generate, vllm_generate_iterator, vllm_postprocess_generate))}
globs.update(vllm_func)
lines.extend([_setattr_class(it[6:], it) for it in vllm_func])
interface_anns = codegen.get_annotations(LLMInterface)
# cached attribute initialisation
@@ -131,46 +127,3 @@ def make_llm_attributes(cls: type[LLM[M, T]]) -> t.Callable[[type[LLM[M, T]]], N
lines.extend([_setattr_class(dunder_support(fn), f"cls.{fn} is not _cached_LLMFunction_get('{fn}')") for fn in bool_attr])
return codegen.generate_function(cls, '__assign_llm_attr', lines, args=('cls', *args), globs=globs, annotations={'cls': 't.Type[LLM]', 'return': None})
def vllm_postprocess_generate(self: LLM['vllm.LLMEngine', T], prompt: str, generation_result: list[dict[str, t.Any]], **_: t.Any) -> str:
return generation_result[0]['outputs'][0]['text']
def vllm_generate_iterator(self: LLM['vllm.LLMEngine', T],
prompt: str,
/,
*,
echo: bool = False,
stop: str | t.Iterable[str] | None = None,
stop_token_ids: list[int] | None = None,
**attrs: t.Any) -> t.Iterator[dict[str, t.Any]]:
request_id: str | None = attrs.pop('request_id', None)
if request_id is None: raise ValueError('request_id must not be None.')
if stop_token_ids is None: stop_token_ids = []
stop_token_ids.append(self.tokenizer.eos_token_id)
stop_: set[str] = set()
if isinstance(stop, str) and stop != '': stop_.add(stop)
elif isinstance(stop, list) and stop != []: stop_.update(stop)
for tid in stop_token_ids:
if tid: stop_.add(self.tokenizer.decode(tid))
if self.config['temperature'] <= 1e-5: top_p = 1.0
else: top_p = self.config['top_p']
config = self.config.model_construct_env(stop=list(stop_), top_p=top_p, **attrs)
self.model.add_request(request_id=request_id, prompt=prompt, sampling_params=config.to_sampling_config())
while self.model.has_unfinished_requests():
for request_output in self.model.step():
prompt = request_output.prompt
if echo: text_outputs = [prompt + output.text for output in request_output.outputs]
else: text_outputs = [output.text for output in request_output.outputs]
yield {'text': text_outputs, 'error_code': 0}
if request_output.finished: break
def vllm_generate(self: LLM['vllm.LLMEngine', T], prompt: str, **attrs: t.Any) -> list[dict[str, t.Any]]:
request_id: str | None = attrs.pop('request_id', None)
if request_id is None: raise ValueError('request_id must not be None.')
outputs: list[vllm.RequestOutput] = []
# TODO: support prompt_token_ids
self.model.add_request(request_id=request_id, prompt=prompt, sampling_params=self.config.model_construct_env(**attrs).to_sampling_config())
while self.model.has_unfinished_requests():
outputs.extend([r for r in self.model.step() if r.finished])
return [unmarshal_vllm_outputs(i) for i in outputs]


@@ -1,6 +1,7 @@
# mypy: disable-error-code="name-defined,attr-defined"
from __future__ import annotations
import abc
import asyncio
import gc
import inspect
import logging
@@ -22,6 +23,7 @@ import openllm_core
from bentoml._internal.models.model import ModelSignature
from openllm_core._configuration import FineTuneConfig
from openllm_core._configuration import LLMConfig
from openllm_core._prompt import process_prompt
from openllm_core._schema import EmbeddingsOutput
from openllm_core._typing_compat import AdaptersMapping
from openllm_core._typing_compat import AdaptersTuple
@@ -46,7 +48,7 @@ from openllm_core.utils import ReprMixin
from openllm_core.utils import apply
from openllm_core.utils import bentoml_cattr
from openllm_core.utils import codegen
from openllm_core.utils import device_count
from openllm_core.utils import ensure_exec_coro
from openllm_core.utils import first_not_none
from openllm_core.utils import generate_hash_from_file
from openllm_core.utils import is_peft_available
@@ -58,7 +60,6 @@ from openllm_core.utils import validate_is_path
from ._assign import make_llm_attributes
from ._quantisation import infer_quantisation_config
from .exceptions import ForbiddenAttributeError
from .exceptions import GpuNotAvailableError
from .exceptions import OpenLLMException
from .utils import infer_auto_class
@@ -67,6 +68,7 @@ if t.TYPE_CHECKING:
import peft
import torch
import transformers
import vllm
from openllm_core._configuration import PeftType
from openllm_core.utils.representation import ReprArgs
@@ -742,16 +744,13 @@ class LLM(LLMInterface[M, T], ReprMixin):
@property
def model(self) -> M:
# Run check for GPU
if self.config['requires_gpu'] and device_count() < 1:
raise GpuNotAvailableError(f'{self} only supports running with GPU (None available).') from None
# NOTE: the signature of load_model here is the wrapper under _wrapped_load_model
if self.__llm_model__ is None:
model = self.load_model(*self._model_decls, **self._model_attrs)
# If OOM, then it is probably you don't have enough VRAM to run this model.
if self.__llm_backend__ == 'pt' and is_torch_available():
loaded_in_kbit = getattr(model, 'is_loaded_in_8bit', False) or getattr(model, 'is_loaded_in_4bit', False) or getattr(model, 'is_quantized', False)
if torch.cuda.is_available() and torch.cuda.device_count() == 1 and not loaded_in_kbit:
if torch.cuda.is_available() and torch.cuda.device_count() == 1 and not loaded_in_kbit and not isinstance(model, transformers.Pipeline):
try:
model = model.to('cuda')
except Exception as err:
@@ -971,92 +970,90 @@ class LLM(LLMInterface[M, T], ReprMixin):
from ._generation import prepare_logits_processor
len_prompt = len(prompt)
config = self.config.model_construct_env(**attrs)
if stop_token_ids is None: stop_token_ids = []
stop_token_ids.append(self.tokenizer.eos_token_id)
logits_processor = prepare_logits_processor(self.config)
logits_processor = prepare_logits_processor(config)
input_ids = self.tokenizer(prompt).input_ids
with torch.inference_mode():
input_ids = self.tokenizer(prompt).input_ids
if context_length is None: context_length = get_context_length(self.model.config)
max_src_len = context_length - self.config['max_new_tokens'] - 1
if context_length is None: context_length = get_context_length(self.model.config)
max_src_len = context_length - config['max_new_tokens'] - 1
input_ids = input_ids[-max_src_len:]
output_ids = list(input_ids)
input_echo_len = len(input_ids)
input_ids = input_ids[-max_src_len:]
output_ids = list(input_ids)
input_echo_len = len(input_ids)
past_key_values = out = token = None
for i in range(self.config['max_new_tokens']):
torch.cuda.synchronize()
if i == 0: # prefill
out = self.model(torch.as_tensor([input_ids], device=self.device), use_cache=True)
logits = out.logits
past_key_values = out.past_key_values
else: # decoding
out = self.model(input_ids=torch.as_tensor([[token]], device=self.device), use_cache=True, past_key_values=past_key_values)
past_key_values = out = token = None
finish_reason = None
for i in range(config['max_new_tokens']):
torch.cuda.synchronize()
if i == 0: # prefill
out = self.model(torch.as_tensor([input_ids], device=self.device), use_cache=True)
else: # decoding
out = self.model(torch.as_tensor([[token]], device=self.device), use_cache=True, past_key_values=past_key_values)
logits = out.logits
past_key_values = out.past_key_values
torch.cuda.synchronize()
if logits_processor:
if self.config['repetition_penalty'] > 1.0:
tmp_output_ids: t.Any = torch.as_tensor([output_ids], device=self.device)
if logits_processor:
if config['repetition_penalty'] > 1.0:
tmp_output_ids: t.Any = torch.as_tensor([output_ids], device=self.device)
else:
tmp_output_ids = None
last_token_logits = logits_processor(tmp_output_ids, logits[:, -1, :])[0]
else:
tmp_output_ids = None
last_token_logits = logits_processor(tmp_output_ids, logits[:, -1, :])[0]
else:
last_token_logits = logits[0, -1, :]
last_token_logits = logits[0, -1, :]
# Switch to CPU by avoiding some bugs in mps backend.
if self.device.type == 'mps': last_token_logits = last_token_logits.float().to('cpu')
# Switch to CPU by avoiding some bugs in mps backend.
if self.device.type == 'mps': last_token_logits = last_token_logits.float().to('cpu')
if self.config['temperature'] < 1e-5 or self.config['top_p'] < 1e-8:
token = int(torch.argmax(last_token_logits)) # greedy
else:
probs = torch.softmax(last_token_logits, dim=-1)
token = int(torch.multinomial(probs, num_samples=1))
output_ids.append(token)
torch.cuda.synchronize()
if token in stop_token_ids: stopped = True
else: stopped = False
# Yield the output tokens
if i % stream_interval == 0 or i == self.config['max_new_tokens'] - 1 or stopped:
if echo:
tmp_output_ids = output_ids
rfind_start = len_prompt
if config['temperature'] < 1e-5 or config['top_p'] < 1e-8:
token = int(torch.argmax(last_token_logits)) # greedy
else:
tmp_output_ids = output_ids[input_echo_len:]
rfind_start = 0
output = self.tokenizer.decode(tmp_output_ids, skip_special_tokens=True, spaces_between_special_tokens=False, clean_up_tokenization_spaces=True)
probs = torch.softmax(last_token_logits, dim=-1)
indices = torch.multinomial(probs, num_samples=2)
token = int(indices.tolist()[0])
output_ids.append(token)
partially_stopped = False
if stop:
if isinstance(stop, str):
pos = output.rfind(stop, rfind_start)
if pos != -1: output, stopped = output[:pos], True
else: partially_stopped = is_partial_stop(output, stop)
elif isinstance(stop, t.Iterable):
for each_stop in stop:
pos = output.rfind(each_stop, rfind_start)
if pos != -1:
output, stopped = output[:pos], True
break
else:
partially_stopped = is_partial_stop(output, each_stop)
if partially_stopped: break
else: raise ValueError('Invalid stop field type.')
stopped = token in stop_token_ids
# Prevent yielding partial stop sequence
if not partially_stopped:
yield {'text': output, 'usage': {'prompt_tokens': input_echo_len, 'completion_tokens': i, 'total_tokens': input_echo_len + i}, 'finish_reason': None}
if stopped: break
# Yield the output tokens
if i % stream_interval == 0 or i == config['max_new_tokens'] - 1 or stopped:
if echo:
tmp_output_ids = output_ids
rfind_start = len_prompt
else:
tmp_output_ids = output_ids[input_echo_len:]
rfind_start = 0
output = self.tokenizer.decode(tmp_output_ids, skip_special_tokens=True, spaces_between_special_tokens=False, clean_up_tokenization_spaces=True)
# Finish stream event, which contains finish reason
if i == self.config['max_new_tokens'] - 1: finish_reason = 'length'
elif stopped: finish_reason = 'stop'
else: finish_reason = None
yield {'text': output, 'usage': {'prompt_tokens': input_echo_len, 'completion_tokens': i, 'total_tokens': input_echo_len + i}, 'finish_reason': finish_reason}
partially_stopped = False
if stop:
if isinstance(stop, str):
pos = output.rfind(stop, rfind_start)
if pos != -1: output, stopped = output[:pos], True
else: partially_stopped = is_partial_stop(output, stop)
elif isinstance(stop, t.Iterable):
for each_stop in stop:
pos = output.rfind(each_stop, rfind_start)
if pos != -1:
output, stopped = output[:pos], True
break
else:
partially_stopped = is_partial_stop(output, each_stop)
if partially_stopped: break
else: raise ValueError('Invalid stop field type.')
# Prevent yielding partial stop sequence
if not partially_stopped:
yield {'text': output, 'usage': {'prompt_tokens': input_echo_len, 'completion_tokens': i, 'total_tokens': input_echo_len + i}, 'finish_reason': None}
if stopped: break
else: finish_reason = 'length' # finish stream events
if stopped: finish_reason = 'stop'
yield {'text': output, 'usage': {'prompt_tokens': input_echo_len, 'completion_tokens': i, 'total_tokens': input_echo_len + i}, 'finish_reason': finish_reason}
# Clean
del past_key_values, out
gc.collect()
@@ -1192,7 +1189,6 @@ def llm_runnable_class(self: LLM[M, T], embeddings_sig: ModelSignature, generate
def generate(__self: _Runnable, prompt: str, **attrs: t.Any) -> list[t.Any]:
adapter_name = attrs.pop('adapter_name', None)
if adapter_name is not None: __self.set_adapter(adapter_name)
if __self.backend == 'vllm': attrs.setdefault('request_id', openllm_core.utils.gen_random_uuid())
return self.generate(prompt, **attrs)
@bentoml.Runnable.method(**method_signature(generate_sig)) # type: ignore
@@ -1207,8 +1203,7 @@ def llm_runnable_class(self: LLM[M, T], embeddings_sig: ModelSignature, generate
if adapter_name is not None: __self.set_adapter(adapter_name)
pre = 0
for outputs in self.generate_iterator(prompt, request_id=openllm_core.utils.gen_random_uuid(), **attrs):
output_text = outputs['text'][0] if __self.backend == 'vllm' else outputs['text']
output_text = output_text.strip().split(' ')
output_text = outputs['text'].strip().split(' ')
now = len(output_text) - 1
if now > pre:
yield ' '.join(output_text[pre:now]) + ' '
@@ -1216,13 +1211,81 @@ def llm_runnable_class(self: LLM[M, T], embeddings_sig: ModelSignature, generate
yield ' '.join(output_text[pre:]) + ' '
return ' '.join(output_text) + ' '
return types.new_class(
self.__class__.__name__ + 'Runnable', (_Runnable,), {},
lambda ns: ns.update({
'SUPPORTED_RESOURCES': ('nvidia.com/gpu', 'amd.com/gpu') if self.config['requires_gpu'] else ('nvidia.com/gpu', 'amd.com/gpu', 'cpu'),
'__module__': self.__module__,
'__doc__': self.config['env'].start_docstring
}))
@bentoml.Runnable.method(**method_signature(generate_sig)) # type: ignore
def vllm_generate(__self: _Runnable, prompt: str, **attrs: t.Any) -> list[t.Any]:
stop: str | t.Iterable[str] | None = attrs.pop('stop', None)
stop_token_ids: list[int] | None = attrs.pop('stop_token_ids', None)
adapter_name = attrs.pop('adapter_name', None)
if adapter_name is not None: __self.set_adapter(adapter_name)
request_id: str | None = attrs.pop('request_id', None)
if request_id is None: raise ValueError('request_id must not be None.')
if stop_token_ids is None: stop_token_ids = []
stop_token_ids.append(self.tokenizer.eos_token_id)
stop_: set[str] = set()
if isinstance(stop, str) and stop != '': stop_.add(stop)
elif isinstance(stop, list) and stop != []: stop_.update(stop)
for tid in stop_token_ids:
if tid: stop_.add(self.tokenizer.decode(tid))
if self.config['temperature'] <= 1e-5: top_p = 1.0
else: top_p = self.config['top_p']
config = self.config.model_construct_env(stop=list(stop_), top_p=top_p, **attrs)
sampling_params = config.to_sampling_config()
async def loop() -> list[str]:
async for request_output in t.cast('vllm.AsyncLLMEngine', self.model).generate(prompt=prompt, sampling_params=sampling_params, request_id=request_id):
pass
return [output.text for output in request_output.outputs]
try:
return asyncio.run(loop())
except RuntimeError:
try:
return ensure_exec_coro(loop())
except Exception:
raise
@bentoml.Runnable.method(**method_signature(generate_iterator_sig)) # type: ignore
async def vllm_generate_iterator(__self: _Runnable, prompt: str, **attrs: t.Any) -> t.AsyncGenerator[str, None]:
# TODO: System prompt support
pre = 0
prompt = process_prompt(prompt, None, False)
echo = attrs.pop('echo', False)
stop: str | t.Iterable[str] | None = attrs.pop('stop', None)
stop_token_ids: list[int] | None = attrs.pop('stop_token_ids', None)
adapter_name = attrs.pop('adapter_name', None)
if adapter_name is not None: __self.set_adapter(adapter_name)
request_id: str | None = attrs.pop('request_id', None)
if request_id is None: raise ValueError('request_id must not be None.')
if stop_token_ids is None: stop_token_ids = []
stop_token_ids.append(self.tokenizer.eos_token_id)
stop_: set[str] = set()
if isinstance(stop, str) and stop != '': stop_.add(stop)
elif isinstance(stop, list) and stop != []: stop_.update(stop)
for tid in stop_token_ids:
if tid: stop_.add(self.tokenizer.decode(tid))
if self.config['temperature'] <= 1e-5: top_p = 1.0
else: top_p = self.config['top_p']
config = self.config.model_construct_env(stop=list(stop_), top_p=top_p, **attrs)
sampling_params = config.to_sampling_config()
async for request_output in t.cast('vllm.AsyncLLMEngine', self.model).generate(prompt=prompt, sampling_params=sampling_params, request_id=request_id):
if echo: text_outputs = [prompt + output.text for output in request_output.outputs]
else: text_outputs = [output.text for output in request_output.outputs]
output_text = text_outputs[0]
output_text = output_text.strip().split(' ')
now = len(output_text) - 1
if now > pre:
yield ' '.join(output_text[pre:now]) + ' '
pre = now
yield ' '.join(output_text[pre:]) + ' '
return types.new_class(self.__class__.__name__ + 'Runnable', (_Runnable,), {},
lambda ns: ns.update({
'SUPPORTED_RESOURCES': ('nvidia.com/gpu', 'amd.com/gpu', 'cpu'), '__module__': self.__module__, '__doc__': self.config['env'].start_docstring
}))
def llm_runner_class(self: LLM[M, T]) -> type[LLMRunner[M, T]]:
def available_adapters(_: LLMRunner[M, T]) -> PeftAdapterOutput:


@@ -47,14 +47,24 @@ _JsonInput = bentoml.io.JSON.from_sample({'prompt': '', 'llm_config': llm_config
async def generate_v1(input_dict: dict[str, t.Any]) -> openllm.GenerationOutput:
qa_inputs = openllm.GenerationInput.from_llm_config(llm_config)(**input_dict)
config = qa_inputs.llm_config.model_dump()
responses = await runner.generate.async_run(qa_inputs.prompt, **{'adapter_name': qa_inputs.adapter_name, **config})
if runner.backend == 'vllm':
responses = await runner.vllm_generate.async_run(qa_inputs.prompt, adapter_name=qa_inputs.adapter_name, request_id=openllm_core.utils.gen_random_uuid(), **config)
else:
responses = await runner.generate.async_run(qa_inputs.prompt, adapter_name=qa_inputs.adapter_name, **config)
return openllm.GenerationOutput(responses=responses, configuration=config)
@svc.api(route='/v1/generate_stream', input=_JsonInput, output=bentoml.io.Text(content_type='text/event-stream'))
async def generate_stream_v1(input_dict: dict[str, t.Any]) -> t.AsyncGenerator[str, None]:
echo = input_dict.pop('echo', False)
qa_inputs = openllm.GenerationInput.from_llm_config(llm_config)(**input_dict)
return runner.generate_iterator.async_stream(qa_inputs.prompt, adapter_name=qa_inputs.adapter_name, echo=echo, **qa_inputs.llm_config.model_dump())
if runner.backend == 'vllm':
return runner.vllm_generate_iterator.async_stream(qa_inputs.prompt,
adapter_name=qa_inputs.adapter_name,
echo=echo,
request_id=openllm_core.utils.gen_random_uuid(),
**qa_inputs.llm_config.model_dump())
else:
return runner.generate_iterator.async_stream(qa_inputs.prompt, adapter_name=qa_inputs.adapter_name, echo=echo, **qa_inputs.llm_config.model_dump())
@svc.api(route='/v1/metadata',
input=bentoml.io.Text(),


@@ -112,15 +112,7 @@ Available official model_id(s): [default: {llm_config['default_id']}]
\b
{orjson.dumps(llm_config['model_ids'], option=orjson.OPT_INDENT_2).decode()}
''',
)
if llm_config['requires_gpu'] and openllm.utils.device_count() < 1:
# NOTE: The model requires GPU, therefore we will return a dummy command
command_attrs.update({
'short_help': '(Disabled because there is no GPU available)', 'help': f'{model} is currently not available to run on your local machine because it requires GPU for inference.'
})
return noop_command(group, llm_config, _serve_grpc, **command_attrs)
''')
@group.command(**command_attrs)
@start_decorator(llm_config, serve_grpc=_serve_grpc)
@@ -230,19 +222,6 @@ Available official model_id(s): [default: {llm_config['default_id']}]
return start_cmd
def noop_command(group: click.Group, llm_config: LLMConfig, _serve_grpc: bool, **command_attrs: t.Any) -> click.Command:
context_settings = command_attrs.pop('context_settings', {})
context_settings.update({'ignore_unknown_options': True, 'allow_extra_args': True})
command_attrs['context_settings'] = context_settings
# NOTE: The model requires GPU, therefore we will return a dummy command
@group.command(**command_attrs)
def noop(**_: t.Any) -> LLMConfig:
termui.echo('No GPU available, therefore this command is disabled', fg='red')
openllm.utils.analytics.track_start_init(llm_config)
return llm_config
return noop
def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callable[[FC], t.Callable[[FC], FC]]:
def wrapper(fn: FC) -> t.Callable[[FC], FC]:
composed = openllm.utils.compose(


@@ -651,8 +651,6 @@ def models_command(ctx: click.Context, output: LiteralOutput, show_available: bo
json_data[m] = {
'architecture': config['architecture'],
'model_id': config['model_ids'],
'cpu': not config['requires_gpu'],
'gpu': True,
'backend': backend,
'installation': f'"openllm[{m}]"' if m in OPTIONAL_DEPENDENCIES or config['requirements'] else 'openllm',
}
@@ -680,13 +678,11 @@ def models_command(ctx: click.Context, output: LiteralOutput, show_available: bo
import tabulate
tabulate.PRESERVE_WHITESPACE = True
# llm, architecture, url, model_id, installation, cpu, gpu, backend
data: list[str | tuple[str, str, list[str], str, LiteralString, LiteralString, tuple[LiteralBackend, ...]]] = []
# llm, architecture, url, model_id, installation, backend
data: list[str | tuple[str, str, list[str], str, tuple[LiteralBackend, ...]]] = []
for m, v in json_data.items():
data.extend([(m, v['architecture'], v['model_id'], v['installation'], '' if not v['cpu'] else '', '', v['backend'],)])
column_widths = [
int(termui.COLUMNS / 12), int(termui.COLUMNS / 6), int(termui.COLUMNS / 4), int(termui.COLUMNS / 12), int(termui.COLUMNS / 12), int(termui.COLUMNS / 12), int(termui.COLUMNS / 4),
]
data.extend([(m, v['architecture'], v['model_id'], v['installation'], v['backend'])])
column_widths = [int(termui.COLUMNS / 12), int(termui.COLUMNS / 6), int(termui.COLUMNS / 4), int(termui.COLUMNS / 6), int(termui.COLUMNS / 4)]
if len(data) == 0 and len(failed_initialized) > 0:
termui.echo('Exception found while parsing models:\n', fg='yellow')
@@ -868,6 +864,7 @@ def query_command(
res = client.query(prompt, return_response='raw', **{**client.configuration, **_memoized})
if output == 'pretty':
response = client.config.postprocess_generate(prompt, res['responses'])
if isinstance(response, dict) and 'text' in response: response = response['text']
termui.echo('\n\n==Responses==\n', fg='white')
termui.echo(response, fg=generated_fg)
elif output == 'json':


@@ -14,19 +14,3 @@ class GPTNeoX(openllm.LLM['transformers.GPTNeoXForCausalLM', 'transformers.GPTNe
def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]:
import torch
return {'device_map': 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None}, {}
def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.GPTNeoXForCausalLM:
import transformers
model = transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, *args, **attrs)
if self.config.use_half_precision: model.half()
return model
def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
import torch
with torch.inference_mode():
return self.tokenizer.batch_decode(
self.model.generate(self.tokenizer(prompt, return_tensors='pt').to(self.device).input_ids,
do_sample=True,
generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
pad_token_id=self.tokenizer.eos_token_id,
stopping_criteria=openllm.StoppingCriteriaList([openllm.StopOnTokens()])))


@@ -1,5 +1,4 @@
from __future__ import annotations
import logging
import typing as t
import bentoml
@@ -31,16 +30,3 @@ class StarCoder(openllm.LLM['transformers.GPTBigCodeForCausalLM', 'transformers.
return bentoml.transformers.save_model(self.tag, model, custom_objects={'tokenizer': tokenizer}, labels=generate_labels(self))
finally:
torch.cuda.empty_cache()
def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
import torch
with torch.inference_mode():
# eos_token_id=self.tokenizer.convert_tokens_to_ids("<|end|>"), # NOTE: this is for finetuning starcoder
# NOTE: support fine-tuning starcoder
result_tensor = self.model.generate(self.tokenizer.encode(prompt, return_tensors='pt').to(self.device),
do_sample=True,
pad_token_id=self.tokenizer.eos_token_id,
generation_config=self.config.model_construct_env(**attrs).to_generation_config())
# TODO: We will probably want to return the tokenizer here so that we can manually process this
# return (skip_special_tokens=False, clean_up_tokenization_spaces=False))
return self.tokenizer.batch_decode(result_tensor[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)


@@ -20,7 +20,6 @@ def model_settings(draw: st.DrawFn):
'model_ids': st.lists(st.text(), min_size=1),
'architecture': st.text(min_size=1),
'url': st.text(),
'requires_gpu': st.booleans(),
'trust_remote_code': st.booleans(),
'requirements': st.none() | st.lists(st.text(), min_size=1),
'default_backend': st.dictionaries(st.sampled_from(['cpu', 'nvidia.com/gpu']), st.sampled_from(['vllm', 'pt', 'tf', 'flax'])),


@@ -133,7 +133,7 @@ AGENTS_DEPS = ['transformers[agents]>=4.30', 'diffusers', 'soundfile']
PLAYGROUND_DEPS = ['jupyter', 'notebook', 'ipython', 'jupytext', 'nbformat']
GGML_DEPS = ['ctransformers']
GPTQ_DEPS = ['auto-gptq[triton]>=0.4.2', 'optimum>=1.12.0']
VLLM_DEPS = ['vllm>=0.1.6', 'ray']
VLLM_DEPS = ['vllm>=0.1.7', 'ray']
_base_requirements: dict[str, t.Any] = {
inflection.dasherize(name): config_cls.__openllm_requirements__ for name, config_cls in openllm.CONFIG_MAPPING.items() if config_cls.__openllm_requirements__


@@ -49,7 +49,6 @@ _value_docstring = {
```''',
'default_backend': '''The default backend to run LLM based on available accelerator. By default, it will be PyTorch (pt) for most models. For some models, such as Llama, it will use `vllm` or `flax`. It is a dictionary of key as the accelerator spec in k8s ('cpu', 'nvidia.com/gpu', 'amd.com/gpu', 'cloud-tpus.google.com/v2', ...) and the values as supported OpenLLM backend ('flax', 'tf', 'pt', 'vllm', 'ggml', 'mlc')''',
'url': 'The resolved url for this LLMConfig.',
'requires_gpu': 'Determines if this model is only available on GPU. By default it supports GPU and fallback to CPU.',
'trust_remote_code': 'Whether to always trust remote code',
'service_name': "Generated service name for this LLMConfig. By default, it is \"generated_{model_name}_service.py\"",
'requirements': 'The default PyPI requirements needed to run this given LLM. By default, we will depend on bentoml, torch, transformers.',