fix(serving): vllm distributed size (#285)

* chore(weights): ignore gguf pattern for non GGML backend

Signed-off-by: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com>

* chore: correct fix num_gpus to be divisible by 2

This depends on the attention_heads from given models

Signed-off-by: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com>

---------

Signed-off-by: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com>
This commit is contained in:
Aaron Pham
2023-09-01 12:37:10 -04:00
committed by GitHub
parent b7af7765d4
commit 608de0b667
2 changed files with 14 additions and 6 deletions

View File

@@ -1,6 +1,7 @@
'''LLM assignment magik.'''
from __future__ import annotations
import functools
import math
import traceback
import typing as t
@@ -50,13 +51,15 @@ def load_model(fn: load_model_protocol[M, T]) -> t.Callable[[LLM[M, T]], M | vll
@functools.wraps(fn)
def inner(self: LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M | vllm.LLMEngine:
if self.__llm_backend__ == 'vllm':
num_gpus, dev = 1, device_count()
if dev >= 2: num_gpus = dev if dev // 2 == 0 else math.ceil(dev / 2)
# TODO: Do some more processing with token_id once we support token streaming
try:
return vllm.LLMEngine.from_engine_args(
vllm.EngineArgs(model=self._bentomodel.path,
tokenizer=self._bentomodel.path if self.tokenizer_id == 'local' else self.tokenizer_id,
tokenizer_mode='auto',
tensor_parallel_size=1 if device_count() < 2 else device_count(),
tensor_parallel_size=num_gpus,
dtype='auto',
worker_use_ray=False))
except Exception as err:

View File

@@ -20,16 +20,21 @@ class HfIgnore:
pt = '*.bin'
tf = '*.h5'
flax = '*.msgpack'
gguf = '*.gguf'
@classmethod
def ignore_patterns(cls, llm: openllm.LLM[M, T]) -> list[str]:
if llm.__llm_backend__ == 'vllm': base = [cls.tf, cls.flax, cls.safetensors]
elif llm.__llm_backend__ == 'tf': base = [cls.flax, cls.pt]
if llm.__llm_backend__ == 'vllm': base = [cls.tf, cls.flax, cls.safetensors, cls.gguf]
elif llm.__llm_backend__ == 'tf': base = [cls.flax, cls.pt, cls.gguf]
elif llm.__llm_backend__ == 'flax':
base = [cls.tf, cls.pt, cls.safetensors] # as of current, safetensors is not supported with flax
else:
base = [cls.tf, cls.flax]
base = [cls.tf, cls.pt, cls.safetensors, cls.gguf] # as of current, safetensors is not supported with flax
elif llm.__llm_backend__ == 'pt':
base = [cls.tf, cls.flax, cls.gguf]
if has_safetensors_weights(llm.model_id): base.append(cls.pt)
elif llm.__llm_backend__ == 'ggml':
base = [cls.tf, cls.flax, cls.pt, cls.safetensors]
else:
raise ValueError('Unknown backend (should never happen at all.)')
# filter out these files, since we probably don't need them for now.
base.extend(['*.pdf', '*.md', '.gitattributes', 'LICENSE.txt'])
return base