diff --git a/.python-version-default b/.python-version-default index 2c073331..c8cfe395 100644 --- a/.python-version-default +++ b/.python-version-default @@ -1 +1 @@ -3.11 +3.10 diff --git a/README.md b/README.md index c75696ee..35dcc008 100644 --- a/README.md +++ b/README.md @@ -290,6 +290,48 @@ You can specify any of the following ChatGLM models via `openllm start`:
+Dbrx + + +### Quickstart + + + +> **Note:** Dbrx requires installing with: +> ```bash +> pip install "openllm[dbrx]" +> ``` + + +Run the following command to quickly spin up a Dbrx server: + +```bash +TRUST_REMOTE_CODE=True openllm start databricks/dbrx-instruct +``` +In a different terminal, run the following command to interact with the server: + +```bash +export OPENLLM_ENDPOINT=http://localhost:3000 +openllm query 'What are large language models?' +``` + + +> **Note:** Any Dbrx variant can be deployed with OpenLLM. Visit the [HuggingFace Model Hub](https://huggingface.co/models?sort=trending&search=dbrx) to see more Dbrx-compatible models. + + + +### Supported models + +You can specify any of the following Dbrx models via `openllm start`: + + +- [databricks/dbrx-instruct](https://huggingface.co/databricks/dbrx-instruct) +- [databricks/dbrx-base](https://huggingface.co/databricks/dbrx-base) + +
+ +
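The quickstart above only shows the CLI flow. The sketch below (not part of the diff) queries the same server from Python; it assumes the server started in the quickstart is reachable at http://localhost:3000 and that the `HTTPClient` re-exported by `openllm` (see the `__init__.pyi` stub later in this diff) exposes a `generate()` method — method names and accepted keyword arguments may differ across client versions.

```python
# Hedged sketch: assumes an OpenLLM server started with
#   TRUST_REMOTE_CODE=True openllm start databricks/dbrx-instruct
# is listening on localhost:3000, and that the installed openllm-client
# provides HTTPClient.generate(); adjust names to your client version.
from openllm import HTTPClient

client = HTTPClient('http://localhost:3000')
# Generation kwargs are assumed to be forwarded to the server's generation config.
result = client.generate('What are large language models?', max_new_tokens=256)
print(result)
```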
+ DollyV2 diff --git a/openllm-core/src/openllm_core/config/__init__.py b/openllm-core/src/openllm_core/config/__init__.py index c5426989..8a27ad71 100644 --- a/openllm-core/src/openllm_core/config/__init__.py +++ b/openllm-core/src/openllm_core/config/__init__.py @@ -3,6 +3,7 @@ from .configuration_auto import CONFIG_MAPPING as CONFIG_MAPPING, CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES, AutoConfig as AutoConfig from .configuration_baichuan import BaichuanConfig as BaichuanConfig from .configuration_chatglm import ChatGLMConfig as ChatGLMConfig +from .configuration_dbrx import DbrxConfig as DbrxConfig from .configuration_dolly_v2 import DollyV2Config as DollyV2Config from .configuration_falcon import FalconConfig as FalconConfig from .configuration_flan_t5 import FlanT5Config as FlanT5Config diff --git a/openllm-core/src/openllm_core/config/configuration_auto.py b/openllm-core/src/openllm_core/config/configuration_auto.py index 14af8565..e9aa7f27 100644 --- a/openllm-core/src/openllm_core/config/configuration_auto.py +++ b/openllm-core/src/openllm_core/config/configuration_auto.py @@ -28,6 +28,7 @@ ModelType: t.TypeAlias = t.Literal[ 'gemma', 'gpt_neox', 'dolly_v2', + 'dbrx', 'stablelm', 'llama', 'mpt', @@ -49,6 +50,7 @@ CONFIG_MAPPING_NAMES: OrderedDict[ModelType, str] = OrderedDict( ('falcon', 'FalconConfig'), ('gpt_neox', 'GPTNeoXConfig'), ('gemma', 'GemmaConfig'), + ('dbrx', 'DbrxConfig'), ('dolly_v2', 'DollyV2Config'), ('stablelm', 'StableLMConfig'), ('llama', 'LlamaConfig'), @@ -134,6 +136,9 @@ class AutoConfig: def for_model(cls, model_name: t.Literal['chatglm'], **attrs: t.Any) -> openllm_core.config.ChatGLMConfig: ... @t.overload @classmethod + def for_model(cls, model_name: t.Literal['dbrx'], **attrs: t.Any) -> openllm_core.config.DbrxConfig: ... + @t.overload + @classmethod def for_model(cls, model_name: t.Literal['dolly_v2'], **attrs: t.Any) -> openllm_core.config.DollyV2Config: ... @t.overload @classmethod diff --git a/openllm-core/src/openllm_core/config/configuration_dbrx.py b/openllm-core/src/openllm_core/config/configuration_dbrx.py new file mode 100644 index 00000000..dcafc2f0 --- /dev/null +++ b/openllm-core/src/openllm_core/config/configuration_dbrx.py @@ -0,0 +1,31 @@ +from __future__ import annotations + +import openllm_core, pydantic +from openllm_core._configuration import ModelSettings + + +class DbrxConfig(openllm_core.LLMConfig): + """DBRX is a mixture-of-experts (MoE) large language model trained from scratch by Databricks. + + Refer to [Databricks's DBRX page](https://huggingface.co/collections/databricks/dbrx-6601c0852a0cdd3c59f71962) for more information. 
+ """ + + model_config = pydantic.ConfigDict(extra='forbid', protected_namespaces=()) + + metadata_config: ModelSettings = pydantic.Field( + default={ + 'timeout': 3600000, + 'trust_remote_code': True, + 'url': 'https://huggingface.co/collections/databricks/dbrx-6601c0852a0cdd3c59f71962', + 'architecture': 'DbrxForCausalLM', + 'default_id': 'databricks/dbrx-instruct', + 'model_ids': ['databricks/dbrx-instruct', 'databricks/dbrx-base'], + }, + repr=False, + exclude=True, + ) + + # NOTE: from get_special_token_id(self.tokenizer, END_KEY) + generation_config: openllm_core.GenerationConfig = pydantic.Field( + default=openllm_core.GenerationConfig.model_construct(temperature=0.9, top_p=0.92, top_k=5, max_new_tokens=256) + ) diff --git a/openllm-python/README.md b/openllm-python/README.md index c75696ee..35dcc008 100644 --- a/openllm-python/README.md +++ b/openllm-python/README.md @@ -290,6 +290,48 @@ You can specify any of the following ChatGLM models via `openllm start`:
+Dbrx + + +### Quickstart + + + +> **Note:** Dbrx requires installing with: +> ```bash +> pip install "openllm[dbrx]" +> ``` + + +Run the following command to quickly spin up a Dbrx server: + +```bash +TRUST_REMOTE_CODE=True openllm start databricks/dbrx-instruct +``` +In a different terminal, run the following command to interact with the server: + +```bash +export OPENLLM_ENDPOINT=http://localhost:3000 +openllm query 'What are large language models?' +``` + + +> **Note:** Any Dbrx variant can be deployed with OpenLLM. Visit the [HuggingFace Model Hub](https://huggingface.co/models?sort=trending&search=dbrx) to see more Dbrx-compatible models. + + + +### Supported models + +You can specify any of the following Dbrx models via `openllm start`: + + +- [databricks/dbrx-instruct](https://huggingface.co/databricks/dbrx-instruct) +- [databricks/dbrx-base](https://huggingface.co/databricks/dbrx-base) + +
+ +
+ DollyV2 diff --git a/openllm-python/pyproject.toml b/openllm-python/pyproject.toml index 1a3b041d..5b92306d 100644 --- a/openllm-python/pyproject.toml +++ b/openllm-python/pyproject.toml @@ -43,6 +43,7 @@ dependencies = [ "openllm-client>=0.5.0-alpha.2", "openllm-core>=0.5.0-alpha.2", "safetensors", + "vllm>=0.4.0", "optimum>=1.12.0", "accelerate", "ghapi", @@ -99,12 +100,13 @@ all = ["openllm[full]"] awq = ["autoawq"] baichuan = ["cpm-kernels"] chatglm = ["cpm-kernels"] +dbrx = ["cpm-kernels"] dolly-v2 = ["cpm-kernels"] falcon = ["xformers"] fine-tune = ["peft>=0.6.0", "datasets", "trl", "huggingface-hub"] flan-t5 = ["xformers"] full = [ - "openllm[agents,awq,baichuan,chatglm,dolly-v2,falcon,fine-tune,flan-t5,gemma,ggml,gpt-neox,gptq,grpc,llama,mistral,mixtral,mpt,openai,opt,phi,playground,qwen,stablelm,starcoder,vllm,yi]", + "openllm[agents,awq,baichuan,chatglm,dbrx,dolly-v2,falcon,fine-tune,flan-t5,gemma,ggml,gpt-neox,gptq,grpc,llama,mistral,mixtral,mpt,openai,opt,phi,playground,qwen,stablelm,starcoder,vllm,yi]", ] gemma = ["xformers"] ggml = ["ctransformers"] @@ -122,7 +124,7 @@ playground = ["jupyter", "notebook", "ipython", "jupytext", "nbformat"] qwen = ["cpm-kernels", "tiktoken"] stablelm = ["cpm-kernels", "tiktoken"] starcoder = ["bitsandbytes"] -vllm = ["vllm==0.3.2"] +vllm = ["vllm==0.4.0"] yi = ["bitsandbytes"] [tool.hatch.version] diff --git a/openllm-python/src/_openllm_tiny/_entrypoint.py b/openllm-python/src/_openllm_tiny/_entrypoint.py index f4b57101..887c685f 100644 --- a/openllm-python/src/_openllm_tiny/_entrypoint.py +++ b/openllm-python/src/_openllm_tiny/_entrypoint.py @@ -78,7 +78,7 @@ def parse_device_callback( '--version', '-v', package_name=_PACKAGE_NAME, - message=f'{_PACKAGE_NAME}, %(version)s (compiled: {openllm.COMPILED})\nPython ({platform.python_implementation()}) {platform.python_version()}', + message=f'{_PACKAGE_NAME}, %(version)s\nPython ({platform.python_implementation()}) {platform.python_version()}', ) def cli() -> None: """\b diff --git a/openllm-python/src/openllm/__init__.pyi b/openllm-python/src/openllm/__init__.pyi index 59e8990c..ac0d00c9 100644 --- a/openllm-python/src/openllm/__init__.pyi +++ b/openllm-python/src/openllm/__init__.pyi @@ -13,7 +13,7 @@ Fine-tune, serve, deploy, and monitor any LLMs with ease. 
# fmt: off # update-config-stubs.py: import stubs start from openllm_client import AsyncHTTPClient as AsyncHTTPClient, HTTPClient as HTTPClient -from openllm_core.config import CONFIG_MAPPING as CONFIG_MAPPING, CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES, AutoConfig as AutoConfig, BaichuanConfig as BaichuanConfig, ChatGLMConfig as ChatGLMConfig, DollyV2Config as DollyV2Config, FalconConfig as FalconConfig, FlanT5Config as FlanT5Config, GemmaConfig as GemmaConfig, GPTNeoXConfig as GPTNeoXConfig, LlamaConfig as LlamaConfig, MistralConfig as MistralConfig, MixtralConfig as MixtralConfig, MPTConfig as MPTConfig, OPTConfig as OPTConfig, PhiConfig as PhiConfig, QwenConfig as QwenConfig, StableLMConfig as StableLMConfig, StarCoderConfig as StarCoderConfig, YiConfig as YiConfig +from openllm_core.config import CONFIG_MAPPING as CONFIG_MAPPING, CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES, AutoConfig as AutoConfig, BaichuanConfig as BaichuanConfig, ChatGLMConfig as ChatGLMConfig, DbrxConfig as DbrxConfig, DollyV2Config as DollyV2Config, FalconConfig as FalconConfig, FlanT5Config as FlanT5Config, GemmaConfig as GemmaConfig, GPTNeoXConfig as GPTNeoXConfig, LlamaConfig as LlamaConfig, MistralConfig as MistralConfig, MixtralConfig as MixtralConfig, MPTConfig as MPTConfig, OPTConfig as OPTConfig, PhiConfig as PhiConfig, QwenConfig as QwenConfig, StableLMConfig as StableLMConfig, StarCoderConfig as StarCoderConfig, YiConfig as YiConfig from openllm_core._configuration import GenerationConfig as GenerationConfig, LLMConfig as LLMConfig from openllm_core._schemas import GenerationInput as GenerationInput, GenerationOutput as GenerationOutput, MetadataOutput as MetadataOutput, MessageParam as MessageParam from openllm_core.utils import api as api diff --git a/tools/dependencies.py b/tools/dependencies.py index fed93fa1..f80aec3b 100755 --- a/tools/dependencies.py +++ b/tools/dependencies.py @@ -80,9 +80,9 @@ class Classifier: ] base.append(Classifier.create_classifier('language', 'Python', '3', 'Only')) base.extend([Classifier.create_classifier('language', 'Python', version) for version in supported_version]) - base.extend( - [Classifier.create_classifier('language', 'Python', 'Implementation', impl) for impl in implementation] - ) + base.extend([ + Classifier.create_classifier('language', 'Python', 'Implementation', impl) for impl in implementation + ]) return base @staticmethod @@ -154,7 +154,7 @@ PLAYGROUND_DEPS = ['jupyter', 'notebook', 'ipython', 'jupytext', 'nbformat'] GGML_DEPS = ['ctransformers'] AWQ_DEPS = ['autoawq'] GPTQ_DEPS = ['auto-gptq[triton]>=0.4.2'] -VLLM_DEPS = ['vllm==0.3.2'] +VLLM_DEPS = ['vllm==0.4.0'] _base_requirements: dict[str, t.Any] = { inflection.dasherize(name): config_cls()['requirements'] @@ -167,9 +167,9 @@ _locals = locals().copy() # NOTE: update this table when adding new external dependencies # sync with openllm.utils.OPTIONAL_DEPENDENCIES -_base_requirements.update( - {v: _locals.get(f'{inflection.underscore(v).upper()}_DEPS') for v in openllm.utils.OPTIONAL_DEPENDENCIES} -) +_base_requirements.update({ + v: _locals.get(f'{inflection.underscore(v).upper()}_DEPS') for v in openllm.utils.OPTIONAL_DEPENDENCIES +}) _base_requirements = {k: v for k, v in sorted(_base_requirements.items())} @@ -182,35 +182,34 @@ def correct_style(it: t.Any) -> t.Any: def create_classifiers() -> Array: arr = correct_style(tomlkit.array()) - arr.extend( - [ - Classifier.create_status_classifier(5), - Classifier.create_classifier('environment', 'GPU', 'NVIDIA CUDA'), -
Classifier.create_classifier('environment', 'GPU', 'NVIDIA CUDA', '12'), - Classifier.create_classifier('environment', 'GPU', 'NVIDIA CUDA', '11.8'), - Classifier.create_classifier('environment', 'GPU', 'NVIDIA CUDA', '11.7'), - Classifier.apache(), - Classifier.create_classifier('topic', 'Scientific/Engineering', 'Artificial Intelligence'), - Classifier.create_classifier('topic', 'Software Development', 'Libraries'), - Classifier.create_classifier('os', 'OS Independent'), - Classifier.create_classifier('audience', 'Developers'), - Classifier.create_classifier('audience', 'Science/Research'), - Classifier.create_classifier('audience', 'System Administrators'), - Classifier.create_classifier('typing', 'Typed'), - *Classifier.create_python_classifier(), - ] - ) + arr.extend([ + Classifier.create_status_classifier(5), + Classifier.create_classifier('environment', 'GPU', 'NVIDIA CUDA'), + Classifier.create_classifier('environment', 'GPU', 'NVIDIA CUDA', '12'), + Classifier.create_classifier('environment', 'GPU', 'NVIDIA CUDA', '11.8'), + Classifier.create_classifier('environment', 'GPU', 'NVIDIA CUDA', '11.7'), + Classifier.apache(), + Classifier.create_classifier('topic', 'Scientific/Engineering', 'Artificial Intelligence'), + Classifier.create_classifier('topic', 'Software Development', 'Libraries'), + Classifier.create_classifier('os', 'OS Independent'), + Classifier.create_classifier('audience', 'Developers'), + Classifier.create_classifier('audience', 'Science/Research'), + Classifier.create_classifier('audience', 'System Administrators'), + Classifier.create_classifier('typing', 'Typed'), + *Classifier.create_python_classifier(), + ]) return arr.multiline(True) def create_optional_table() -> Table: all_array = tomlkit.array() - all_array.append(f"openllm[{','.join([k for k,v in _base_requirements.items() if v])}]") + all_array.append(f"openllm[{','.join([k for k, v in _base_requirements.items() if v])}]") table = tomlkit.table(is_super_table=True) - _base_requirements.update( - {'full': correct_style(all_array.multiline(True)), 'all': tomlkit.array('["openllm[full]"]')} - ) + _base_requirements.update({ + 'full': correct_style(all_array.multiline(True)), + 'all': tomlkit.array('["openllm[full]"]'), + }) table.update({k: v for k, v in sorted(_base_requirements.items()) if v}) table.add(tomlkit.nl()) @@ -244,29 +243,27 @@ def build_system() -> Table: def keywords() -> Array: arr = correct_style(tomlkit.array()) - arr.extend( - [ - 'MLOps', - 'AI', - 'BentoML', - 'Model Serving', - 'Model Deployment', - 'LLMOps', - 'Falcon', - 'Vicuna', - 'Llama 2', - 'Fine tuning', - 'Serverless', - 'Large Language Model', - 'Generative AI', - 'StableLM', - 'Alpaca', - 'PyTorch', - 'Mistral', - 'vLLM', - 'Transformers', - ] - ) + arr.extend([ + 'MLOps', + 'AI', + 'BentoML', + 'Model Serving', + 'Model Deployment', + 'LLMOps', + 'Falcon', + 'Vicuna', + 'Llama 2', + 'Fine tuning', + 'Serverless', + 'Large Language Model', + 'Generative AI', + 'StableLM', + 'Alpaca', + 'PyTorch', + 'Mistral', + 'vLLM', + 'Transformers', + ]) return arr.multiline(True) @@ -293,6 +290,7 @@ def main(args) -> int: Dependencies(name='openllm-client', lower_constraint=release_version), Dependencies(name='openllm-core', lower_constraint=release_version), Dependencies(name='safetensors'), + Dependencies(name='vllm', lower_constraint='0.4.0'), Dependencies(name='optimum', lower_constraint='1.12.0'), Dependencies(name='accelerate'), Dependencies(name='ghapi'),
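Taken together, the config changes above register DBRX with OpenLLM's auto-config machinery ('dbrx' in `CONFIG_MAPPING_NAMES`, a typed `for_model` overload, and the new `DbrxConfig`). The sketch below (not part of the diff) illustrates how that registration is expected to resolve; the `metadata_config`/`generation_config` accessors are assumptions based on the pydantic field names in `configuration_dbrx.py` rather than a documented `LLMConfig` API.

```python
# Hedged sketch: exercises the new 'dbrx' registration added in this diff.
from openllm_core.config import AutoConfig, DbrxConfig

config = AutoConfig.for_model('dbrx')  # typed as DbrxConfig via the new overload
assert isinstance(config, DbrxConfig)

# Defaults defined in configuration_dbrx.py; the accessors below are assumptions
# based on the field definitions, not a documented public API.
print(config.metadata_config['default_id'])      # databricks/dbrx-instruct
print(config.metadata_config['architecture'])    # DbrxForCausalLM
print(config.generation_config.temperature)      # 0.9
print(config.generation_config.max_new_tokens)   # 256
```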