diff --git a/openllm-core/src/openllm_core/config/configuration_phi.py b/openllm-core/src/openllm_core/config/configuration_phi.py
index 7ce8e41e..3bfb0221 100644
--- a/openllm-core/src/openllm_core/config/configuration_phi.py
+++ b/openllm-core/src/openllm_core/config/configuration_phi.py
@@ -22,11 +22,18 @@ class PhiConfig(openllm_core.LLMConfig):
   metadata_config: ModelSettings = pydantic.Field(
     default={
       'url': 'https://arxiv.org/abs/2309.05463',
-      'architecture': 'PhiForCausalLM',
+      'architecture': 'Phi3ForCausalLM',
       'trust_remote_code': True,
-      'default_id': 'microsoft/phi-2',
+      'default_id': 'microsoft/Phi-3-mini-4k-instruct',
       'serialisation': 'safetensors',
-      'model_ids': ['microsoft/phi-2', 'microsoft/phi-1_5'],
+      'model_ids': [
+        'microsoft/Phi-3-mini-4k-instruct',
+        'microsoft/Phi-3-mini-128k-instruct',
+        'microsoft/Phi-3-small-8k-instruct',
+        'microsoft/Phi-3-small-128k-instruct',
+        'microsoft/Phi-3-medium-4k-instruct',
+        'microsoft/Phi-3-medium-128k-instruct',
+      ],
       'fine_tune_strategies': (
         {'adapter_type': 'lora', 'r': 64, 'lora_alpha': 16, 'lora_dropout': 0.1, 'bias': 'none'},
       ),
diff --git a/openllm-python/src/_openllm_tiny/_entrypoint.py b/openllm-python/src/_openllm_tiny/_entrypoint.py
index fe379f46..4b3faaa3 100644
--- a/openllm-python/src/_openllm_tiny/_entrypoint.py
+++ b/openllm-python/src/_openllm_tiny/_entrypoint.py
@@ -436,7 +436,7 @@ def build_command(
   service_config = dict(
     resources={
       'gpu' if device else 'cpu': len(device) if device else '1',
-      'gpu_type': recommended_instance_type(model_id, bentomodel),
+      'gpu_type': recommended_instance_type(model_id, bentomodel, serialisation),
     },
     traffic=dict(timeout=timeout),
   )
diff --git a/openllm-python/src/_openllm_tiny/_helpers.py b/openllm-python/src/_openllm_tiny/_helpers.py
index ae60fa63..5d968fcb 100644
--- a/openllm-python/src/_openllm_tiny/_helpers.py
+++ b/openllm-python/src/_openllm_tiny/_helpers.py
@@ -18,8 +18,9 @@ from openllm_core.protocol.openai import (
   LogProbs,
   UsageInfo,
 )
+from openllm_core._typing_compat import LiteralSerialisation
 from starlette.requests import Request
-from huggingface_hub import scan_cache_dir
+from huggingface_hub import HfApi
 
 if t.TYPE_CHECKING:
   import bentoml
@@ -328,12 +329,21 @@ class OpenAI:
 RECOMMENDED_MAPPING = {'nvidia-l4': 24e9, 'nvidia-a10g': 24e9, 'nvidia-tesla-a100': 40e9, 'nvidia-a100-80gb': 80e9}
 
 
-def recommended_instance_type(model_id: str, bentomodel: bentoml.Model | None = None):
+def recommended_instance_type(
+  model_id: str, bentomodel: bentoml.Model | None = None, serialisation: LiteralSerialisation = 'safetensors'
+):
+  extensions = 'safetensors' if serialisation == 'safetensors' else 'pt'
+  api = HfApi()
+
   if bentomodel is not None:
     size = sum(f.stat().st_size for f in pathlib.Path(resolve_filepath(model_id)).glob('**/*') if f.is_file())
   else:
-    info = next(filter(lambda repo: repo.repo_id == model_id, scan_cache_dir().repos))
-    size = info.size_on_disk
+    size = sum(
+      i.size
+      for i in api.get_paths_info(
+        model_id, list(filter(lambda x: x.endswith(f'.{extensions}'), api.list_repo_files(model_id)))
+      )
+    )
   # find the first occurence of the gpu_type in the recommended mapping such that "size" should be less than or equal to 70% of the recommended size
   for gpu, max_size in RECOMMENDED_MAPPING.items():
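
For context, a minimal standalone sketch of the sizing heuristic this diff moves to: instead of scanning the local Hugging Face cache with scan_cache_dir, the remote repo is queried with HfApi so only the weight files matching the chosen serialisation are counted. The remote_weight_size helper name is made up for illustration, and the `size <= max_size * 0.7` comparison is inferred from the comment in the diff (the loop body is cut off in the hunk above); HfApi.list_repo_files and HfApi.get_paths_info are the real huggingface_hub calls the diff uses.

# Sketch only; assumes a public Hugging Face repo and huggingface_hub installed.
from huggingface_hub import HfApi

RECOMMENDED_MAPPING = {'nvidia-l4': 24e9, 'nvidia-a10g': 24e9, 'nvidia-tesla-a100': 40e9, 'nvidia-a100-80gb': 80e9}


def remote_weight_size(model_id: str, serialisation: str = 'safetensors') -> int:
  # Hypothetical helper mirroring the else-branch of recommended_instance_type.
  ext = 'safetensors' if serialisation == 'safetensors' else 'pt'
  api = HfApi()
  # Sum only the weight shards for the chosen format, rather than the whole
  # on-disk cache entry as the old scan_cache_dir path did.
  weight_files = [f for f in api.list_repo_files(model_id) if f.endswith(f'.{ext}')]
  return sum(info.size for info in api.get_paths_info(model_id, weight_files))


size = remote_weight_size('microsoft/Phi-3-mini-4k-instruct')
# Dict order matters: smaller GPUs are listed first, so the first fit wins.
for gpu, max_size in RECOMMENDED_MAPPING.items():
  if size <= max_size * 0.7:  # inferred from the diff's comment: leave ~30% headroom
    print(gpu)
    break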