diff --git a/README.md b/README.md index 6af2e6aa..dcd0b5c0 100644 --- a/README.md +++ b/README.md @@ -414,25 +414,6 @@ openllm start databricks/dolly-v2-3b --backend vllm openllm start databricks/dolly-v2-3b --backend pt ``` -- CTranslate2 (*experimental*): - - -```bash -openllm start databricks/dolly-v2-3b --backend ctranslate -``` - - -> **Note:** Currently, All quantization methods from ctranslate2 are supported. This includes int8, int8_float16, int8_bfloat16 - - - -> **Note:** We recommend users to convert the models beforehand, and then provide the given directory of the converted models to `openllm start`. See [CTranslate2](https://opennmt.net/CTranslate2/conversion.html) for more information. - - - -> **Important:** CTranslate2 is an experimental backend and yet to be fully supported. It is recommended to use vLLM for all production use-case. - -
@@ -511,25 +492,6 @@ openllm start tiiuae/falcon-7b --backend vllm openllm start tiiuae/falcon-7b --backend pt ``` -- CTranslate2 (*experimental*): - - -```bash -openllm start tiiuae/falcon-7b --backend ctranslate -``` - - -> **Note:** Currently, All quantization methods from ctranslate2 are supported. This includes int8, int8_float16, int8_bfloat16 - - - -> **Note:** We recommend users to convert the models beforehand, and then provide the given directory of the converted models to `openllm start`. See [CTranslate2](https://opennmt.net/CTranslate2/conversion.html) for more information. - - - -> **Important:** CTranslate2 is an experimental backend and yet to be fully supported. It is recommended to use vLLM for all production use-case. - -
@@ -651,25 +613,6 @@ openllm start eleutherai/gpt-neox-20b --backend vllm openllm start eleutherai/gpt-neox-20b --backend pt ``` -- CTranslate2 (*experimental*): - - -```bash -openllm start eleutherai/gpt-neox-20b --backend ctranslate -``` - - -> **Note:** Currently, All quantization methods from ctranslate2 are supported. This includes int8, int8_float16, int8_bfloat16 - - - -> **Note:** We recommend users to convert the models beforehand, and then provide the given directory of the converted models to `openllm start`. See [CTranslate2](https://opennmt.net/CTranslate2/conversion.html) for more information. - - - -> **Important:** CTranslate2 is an experimental backend and yet to be fully supported. It is recommended to use vLLM for all production use-case. - -
@@ -748,25 +691,6 @@ openllm start meta-llama/Llama-2-70b-chat-hf --backend vllm openllm start meta-llama/Llama-2-70b-chat-hf --backend pt ``` -- CTranslate2 (*experimental*): - - -```bash -openllm start meta-llama/Llama-2-70b-chat-hf --backend ctranslate -``` - - -> **Note:** Currently, All quantization methods from ctranslate2 are supported. This includes int8, int8_float16, int8_bfloat16 - - - -> **Note:** We recommend users to convert the models beforehand, and then provide the given directory of the converted models to `openllm start`. See [CTranslate2](https://opennmt.net/CTranslate2/conversion.html) for more information. - - - -> **Important:** CTranslate2 is an experimental backend and yet to be fully supported. It is recommended to use vLLM for all production use-case. - -
@@ -837,25 +761,6 @@ openllm start HuggingFaceH4/zephyr-7b-alpha --backend vllm openllm start HuggingFaceH4/zephyr-7b-alpha --backend pt ``` -- CTranslate2 (*experimental*): - - -```bash -openllm start HuggingFaceH4/zephyr-7b-alpha --backend ctranslate -``` - - -> **Note:** Currently, All quantization methods from ctranslate2 are supported. This includes int8, int8_float16, int8_bfloat16 - - - -> **Note:** We recommend users to convert the models beforehand, and then provide the given directory of the converted models to `openllm start`. See [CTranslate2](https://opennmt.net/CTranslate2/conversion.html) for more information. - - - -> **Important:** CTranslate2 is an experimental backend and yet to be fully supported. It is recommended to use vLLM for all production use-case. - -
@@ -937,25 +842,6 @@ TRUST_REMOTE_CODE=True openllm start mosaicml/mpt-7b --backend vllm TRUST_REMOTE_CODE=True openllm start mosaicml/mpt-7b --backend pt ``` -- CTranslate2 (*experimental*): - - -```bash -TRUST_REMOTE_CODE=True openllm start mosaicml/mpt-7b --backend ctranslate -``` - - -> **Note:** Currently, All quantization methods from ctranslate2 are supported. This includes int8, int8_float16, int8_bfloat16 - - - -> **Note:** We recommend users to convert the models beforehand, and then provide the given directory of the converted models to `openllm start`. See [CTranslate2](https://opennmt.net/CTranslate2/conversion.html) for more information. - - - -> **Important:** CTranslate2 is an experimental backend and yet to be fully supported. It is recommended to use vLLM for all production use-case. - -
@@ -1028,25 +914,6 @@ openllm start facebook/opt-125m --backend vllm openllm start facebook/opt-125m --backend pt ``` -- CTranslate2 (*experimental*): - - -```bash -openllm start facebook/opt-125m --backend ctranslate -``` - - -> **Note:** Currently, All quantization methods from ctranslate2 are supported. This includes int8, int8_float16, int8_bfloat16 - - - -> **Note:** We recommend users to convert the models beforehand, and then provide the given directory of the converted models to `openllm start`. See [CTranslate2](https://opennmt.net/CTranslate2/conversion.html) for more information. - - - -> **Important:** CTranslate2 is an experimental backend and yet to be fully supported. It is recommended to use vLLM for all production use-case. - -
@@ -1118,6 +985,86 @@ TRUST_REMOTE_CODE=True openllm start microsoft/phi-1_5 --backend pt
+Qwen
+
+
+### Quickstart
+
+
+
+> **Note:** Qwen requires installing OpenLLM with:
+> ```bash
+> pip install "openllm[qwen]"
+> ```
+
+
+Run the following command to quickly spin up a Qwen server:
+
+```bash
+TRUST_REMOTE_CODE=True openllm start qwen/Qwen-7B-Chat
+```
+In a different terminal, run the following command to interact with the server:
+
+```bash
+export OPENLLM_ENDPOINT=http://localhost:3000
+openllm query 'What are large language models?'
+```
+
+
+> **Note:** Any Qwen variant can be deployed with OpenLLM. Visit the [HuggingFace Model Hub](https://huggingface.co/models?sort=trending&search=qwen) to see more Qwen-compatible models.
+
+
+
+### Supported models
+
+You can specify any of the following Qwen models via `openllm start`:
+
+
+- [qwen/Qwen-7B-Chat](https://huggingface.co/qwen/Qwen-7B-Chat)
+- [qwen/Qwen-7B-Chat-Int8](https://huggingface.co/qwen/Qwen-7B-Chat-Int8)
+- [qwen/Qwen-7B-Chat-Int4](https://huggingface.co/qwen/Qwen-7B-Chat-Int4)
+- [qwen/Qwen-14B-Chat](https://huggingface.co/qwen/Qwen-14B-Chat)
+- [qwen/Qwen-14B-Chat-Int8](https://huggingface.co/qwen/Qwen-14B-Chat-Int8)
+- [qwen/Qwen-14B-Chat-Int4](https://huggingface.co/qwen/Qwen-14B-Chat-Int4)
+
+### Supported backends
+
+OpenLLM supports both vLLM and PyTorch as backends for this model. By default, vLLM is used if it is available; otherwise OpenLLM falls back to PyTorch.
+
+
+
+> **Important:** We recommend users explicitly specify `--backend` to choose the desired backend to run the model. If you have access to a GPU, always use `--backend vllm`.
+
+
+
+- vLLM (Recommended):
+
+
+To install vLLM, run `pip install "openllm[vllm]"`.
+
+```bash
+TRUST_REMOTE_CODE=True openllm start qwen/Qwen-7B-Chat --backend vllm
+```
+
+
+> **Important:** vLLM requires a GPU with CUDA compute capability 8.0 or newer to get the best serving performance. vLLM is the recommended backend for all production serving use cases.
+
+
+
+> **Note:** Adapters are not yet supported with vLLM.
+
+
+- PyTorch:
+
+
+```bash
+TRUST_REMOTE_CODE=True openllm start qwen/Qwen-7B-Chat --backend pt
+```
+
+
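For readers who prefer Python over the `openllm query` CLI, the same server can be queried programmatically. The sketch below assumes the `openllm_client.HTTPClient` class that ships alongside OpenLLM and a vLLM-style result object with an `outputs` list; treat both as assumptions to verify against your installed version.

```python
# Hedged sketch: query the server started by `openllm start qwen/Qwen-7B-Chat`.
# Assumes openllm-client is installed (it is pulled in with openllm) and that the
# quickstart server above is listening on http://localhost:3000.
from openllm_client import HTTPClient

client = HTTPClient('http://localhost:3000')
result = client.generate('What are large language models?', max_new_tokens=256)
print(result.outputs[0].text)  # response shape assumed to mirror vLLM-style generation output
```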
+ +
+ StableLM @@ -1184,25 +1131,6 @@ openllm start stabilityai/stablelm-tuned-alpha-3b --backend vllm openllm start stabilityai/stablelm-tuned-alpha-3b --backend pt ``` -- CTranslate2 (*experimental*): - - -```bash -openllm start stabilityai/stablelm-tuned-alpha-3b --backend ctranslate -``` - - -> **Note:** Currently, All quantization methods from ctranslate2 are supported. This includes int8, int8_float16, int8_bfloat16 - - - -> **Note:** We recommend users to convert the models beforehand, and then provide the given directory of the converted models to `openllm start`. See [CTranslate2](https://opennmt.net/CTranslate2/conversion.html) for more information. - - - -> **Important:** CTranslate2 is an experimental backend and yet to be fully supported. It is recommended to use vLLM for all production use-case. - -
@@ -1279,25 +1207,6 @@ openllm start bigcode/starcoder --backend vllm openllm start bigcode/starcoder --backend pt ``` -- CTranslate2 (*experimental*): - - -```bash -openllm start bigcode/starcoder --backend ctranslate -``` - - -> **Note:** Currently, All quantization methods from ctranslate2 are supported. This includes int8, int8_float16, int8_bfloat16 - - - -> **Note:** We recommend users to convert the models beforehand, and then provide the given directory of the converted models to `openllm start`. See [CTranslate2](https://opennmt.net/CTranslate2/conversion.html) for more information. - - - -> **Important:** CTranslate2 is an experimental backend and yet to be fully supported. It is recommended to use vLLM for all production use-case. - -
diff --git a/openllm-core/src/openllm_core/config/__init__.py b/openllm-core/src/openllm_core/config/__init__.py
index 68dad614..2c37949e 100644
--- a/openllm-core/src/openllm_core/config/__init__.py
+++ b/openllm-core/src/openllm_core/config/__init__.py
@@ -16,6 +16,7 @@ from .configuration_mistral import MistralConfig as MistralConfig
 from .configuration_mpt import MPTConfig as MPTConfig
 from .configuration_opt import OPTConfig as OPTConfig
 from .configuration_phi import PhiConfig as PhiConfig
+from .configuration_qwen import QwenConfig as QwenConfig
 from .configuration_stablelm import StableLMConfig as StableLMConfig
 from .configuration_starcoder import StarCoderConfig as StarCoderConfig
 from .configuration_yi import YiConfig as YiConfig
diff --git a/openllm-core/src/openllm_core/config/configuration_auto.py b/openllm-core/src/openllm_core/config/configuration_auto.py
index 431d6691..6b9259b4 100644
--- a/openllm-core/src/openllm_core/config/configuration_auto.py
+++ b/openllm-core/src/openllm_core/config/configuration_auto.py
@@ -35,6 +35,7 @@ CONFIG_MAPPING_NAMES = OrderedDict(
   ('mpt', 'MPTConfig'),
   ('opt', 'OPTConfig'),
   ('phi', 'PhiConfig'),
+  ('qwen', 'QwenConfig'),
   ('starcoder', 'StarCoderConfig'),
   ('mistral', 'MistralConfig'),
   ('yi', 'YiConfig'),
@@ -145,6 +146,9 @@ class AutoConfig:
   def for_model(cls, model_name: t.Literal['phi'], **attrs: t.Any) -> openllm_core.config.PhiConfig: ...
   @t.overload
   @classmethod
+  def for_model(cls, model_name: t.Literal['qwen'], **attrs: t.Any) -> openllm_core.config.QwenConfig: ...
+  @t.overload
+  @classmethod
   def for_model(cls, model_name: t.Literal['stablelm'], **attrs: t.Any) -> openllm_core.config.StableLMConfig: ...
   @t.overload
   @classmethod
diff --git a/openllm-core/src/openllm_core/config/configuration_qwen.py b/openllm-core/src/openllm_core/config/configuration_qwen.py
new file mode 100644
index 00000000..a8929ff3
--- /dev/null
+++ b/openllm-core/src/openllm_core/config/configuration_qwen.py
@@ -0,0 +1,36 @@
+from __future__ import annotations
+
+import openllm_core
+
+class QwenConfig(openllm_core.LLMConfig):
+  """Qwen (abbr. Tongyi Qianwen) is a series of large language models proposed by Alibaba Cloud,
+  currently available in 7B- and 14B-parameter sizes. The base models are Transformer-based
+  language models pretrained on a large volume of data, including web texts, books, and code.
+  Qwen-7B-Chat and Qwen-14B-Chat are AI-assistant variants built on the pretrained models
+  and trained with alignment techniques.
+  Refer to [Qwen's GitHub page](https://github.com/QwenLM/Qwen) for more information.
+  """
+
+  __config__ = {
+    'name_type': 'lowercase',
+    'trust_remote_code': True,
+    'timeout': 3600000,
+    'url': 'https://github.com/QwenLM/Qwen',
+    'requirements': ['cpm-kernels', 'tiktoken'],
+    'backend': ('pt', 'vllm'),
+    'architecture': 'QWenLMHeadModel',
+    'default_id': 'qwen/Qwen-7B-Chat',
+    'model_ids': [
+      'qwen/Qwen-7B-Chat',
+      'qwen/Qwen-7B-Chat-Int8',
+      'qwen/Qwen-7B-Chat-Int4',
+      'qwen/Qwen-14B-Chat',
+      'qwen/Qwen-14B-Chat-Int8',
+      'qwen/Qwen-14B-Chat-Int4',
+    ],
+  }
+
+  class GenerationConfig:
+    max_new_tokens: int = 2048
+    top_p: float = 0.7
+    temperature: float = 0.95
diff --git a/openllm-python/README.md b/openllm-python/README.md
index 6af2e6aa..dcd0b5c0 100644
--- a/openllm-python/README.md
+++ b/openllm-python/README.md
@@ -414,25 +414,6 @@ openllm start databricks/dolly-v2-3b --backend vllm
 openllm start databricks/dolly-v2-3b --backend pt
 ```

-- CTranslate2 (*experimental*):
-
-
-```bash
-openllm start databricks/dolly-v2-3b --backend ctranslate
-```
-
-
-> **Note:** Currently, All quantization methods from ctranslate2 are supported. This includes int8, int8_float16, int8_bfloat16
-
-
-
-> **Note:** We recommend users to convert the models beforehand, and then provide the given directory of the converted models to `openllm start`. See [CTranslate2](https://opennmt.net/CTranslate2/conversion.html) for more information.
-
-
-
-> **Important:** CTranslate2 is an experimental backend and yet to be fully supported. It is recommended to use vLLM for all production use-case.
-
-
@@ -511,25 +492,6 @@ openllm start tiiuae/falcon-7b --backend vllm openllm start tiiuae/falcon-7b --backend pt ``` -- CTranslate2 (*experimental*): - - -```bash -openllm start tiiuae/falcon-7b --backend ctranslate -``` - - -> **Note:** Currently, All quantization methods from ctranslate2 are supported. This includes int8, int8_float16, int8_bfloat16 - - - -> **Note:** We recommend users to convert the models beforehand, and then provide the given directory of the converted models to `openllm start`. See [CTranslate2](https://opennmt.net/CTranslate2/conversion.html) for more information. - - - -> **Important:** CTranslate2 is an experimental backend and yet to be fully supported. It is recommended to use vLLM for all production use-case. - -
@@ -651,25 +613,6 @@ openllm start eleutherai/gpt-neox-20b --backend vllm openllm start eleutherai/gpt-neox-20b --backend pt ``` -- CTranslate2 (*experimental*): - - -```bash -openllm start eleutherai/gpt-neox-20b --backend ctranslate -``` - - -> **Note:** Currently, All quantization methods from ctranslate2 are supported. This includes int8, int8_float16, int8_bfloat16 - - - -> **Note:** We recommend users to convert the models beforehand, and then provide the given directory of the converted models to `openllm start`. See [CTranslate2](https://opennmt.net/CTranslate2/conversion.html) for more information. - - - -> **Important:** CTranslate2 is an experimental backend and yet to be fully supported. It is recommended to use vLLM for all production use-case. - -
@@ -748,25 +691,6 @@ openllm start meta-llama/Llama-2-70b-chat-hf --backend vllm openllm start meta-llama/Llama-2-70b-chat-hf --backend pt ``` -- CTranslate2 (*experimental*): - - -```bash -openllm start meta-llama/Llama-2-70b-chat-hf --backend ctranslate -``` - - -> **Note:** Currently, All quantization methods from ctranslate2 are supported. This includes int8, int8_float16, int8_bfloat16 - - - -> **Note:** We recommend users to convert the models beforehand, and then provide the given directory of the converted models to `openllm start`. See [CTranslate2](https://opennmt.net/CTranslate2/conversion.html) for more information. - - - -> **Important:** CTranslate2 is an experimental backend and yet to be fully supported. It is recommended to use vLLM for all production use-case. - -
@@ -837,25 +761,6 @@ openllm start HuggingFaceH4/zephyr-7b-alpha --backend vllm openllm start HuggingFaceH4/zephyr-7b-alpha --backend pt ``` -- CTranslate2 (*experimental*): - - -```bash -openllm start HuggingFaceH4/zephyr-7b-alpha --backend ctranslate -``` - - -> **Note:** Currently, All quantization methods from ctranslate2 are supported. This includes int8, int8_float16, int8_bfloat16 - - - -> **Note:** We recommend users to convert the models beforehand, and then provide the given directory of the converted models to `openllm start`. See [CTranslate2](https://opennmt.net/CTranslate2/conversion.html) for more information. - - - -> **Important:** CTranslate2 is an experimental backend and yet to be fully supported. It is recommended to use vLLM for all production use-case. - -
@@ -937,25 +842,6 @@ TRUST_REMOTE_CODE=True openllm start mosaicml/mpt-7b --backend vllm TRUST_REMOTE_CODE=True openllm start mosaicml/mpt-7b --backend pt ``` -- CTranslate2 (*experimental*): - - -```bash -TRUST_REMOTE_CODE=True openllm start mosaicml/mpt-7b --backend ctranslate -``` - - -> **Note:** Currently, All quantization methods from ctranslate2 are supported. This includes int8, int8_float16, int8_bfloat16 - - - -> **Note:** We recommend users to convert the models beforehand, and then provide the given directory of the converted models to `openllm start`. See [CTranslate2](https://opennmt.net/CTranslate2/conversion.html) for more information. - - - -> **Important:** CTranslate2 is an experimental backend and yet to be fully supported. It is recommended to use vLLM for all production use-case. - -
@@ -1028,25 +914,6 @@ openllm start facebook/opt-125m --backend vllm openllm start facebook/opt-125m --backend pt ``` -- CTranslate2 (*experimental*): - - -```bash -openllm start facebook/opt-125m --backend ctranslate -``` - - -> **Note:** Currently, All quantization methods from ctranslate2 are supported. This includes int8, int8_float16, int8_bfloat16 - - - -> **Note:** We recommend users to convert the models beforehand, and then provide the given directory of the converted models to `openllm start`. See [CTranslate2](https://opennmt.net/CTranslate2/conversion.html) for more information. - - - -> **Important:** CTranslate2 is an experimental backend and yet to be fully supported. It is recommended to use vLLM for all production use-case. - -
@@ -1118,6 +985,86 @@ TRUST_REMOTE_CODE=True openllm start microsoft/phi-1_5 --backend pt
+Qwen
+
+
+### Quickstart
+
+
+
+> **Note:** Qwen requires installing OpenLLM with:
+> ```bash
+> pip install "openllm[qwen]"
+> ```
+
+
+Run the following command to quickly spin up a Qwen server:
+
+```bash
+TRUST_REMOTE_CODE=True openllm start qwen/Qwen-7B-Chat
+```
+In a different terminal, run the following command to interact with the server:
+
+```bash
+export OPENLLM_ENDPOINT=http://localhost:3000
+openllm query 'What are large language models?'
+```
+
+
+> **Note:** Any Qwen variant can be deployed with OpenLLM. Visit the [HuggingFace Model Hub](https://huggingface.co/models?sort=trending&search=qwen) to see more Qwen-compatible models.
+
+
+
+### Supported models
+
+You can specify any of the following Qwen models via `openllm start`:
+
+
+- [qwen/Qwen-7B-Chat](https://huggingface.co/qwen/Qwen-7B-Chat)
+- [qwen/Qwen-7B-Chat-Int8](https://huggingface.co/qwen/Qwen-7B-Chat-Int8)
+- [qwen/Qwen-7B-Chat-Int4](https://huggingface.co/qwen/Qwen-7B-Chat-Int4)
+- [qwen/Qwen-14B-Chat](https://huggingface.co/qwen/Qwen-14B-Chat)
+- [qwen/Qwen-14B-Chat-Int8](https://huggingface.co/qwen/Qwen-14B-Chat-Int8)
+- [qwen/Qwen-14B-Chat-Int4](https://huggingface.co/qwen/Qwen-14B-Chat-Int4)
+
+### Supported backends
+
+OpenLLM supports both vLLM and PyTorch as backends for this model. By default, vLLM is used if it is available; otherwise OpenLLM falls back to PyTorch.
+
+
+
+> **Important:** We recommend users explicitly specify `--backend` to choose the desired backend to run the model. If you have access to a GPU, always use `--backend vllm`.
+
+
+
+- vLLM (Recommended):
+
+
+To install vLLM, run `pip install "openllm[vllm]"`.
+
+```bash
+TRUST_REMOTE_CODE=True openllm start qwen/Qwen-7B-Chat --backend vllm
+```
+
+
+> **Important:** vLLM requires a GPU with CUDA compute capability 8.0 or newer to get the best serving performance. vLLM is the recommended backend for all production serving use cases.
+
+
+
+> **Note:** Adapters are not yet supported with vLLM.
+
+
+- PyTorch:
+
+
+```bash
+TRUST_REMOTE_CODE=True openllm start qwen/Qwen-7B-Chat --backend pt
+```
+
+
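The same checkpoint can also be loaded in-process instead of behind an HTTP server. The sketch below assumes the `openllm.LLM` constructor and an awaitable `generate()` in this release of the Python API; treat the exact signatures and the result shape as assumptions to check against the installed version.

```python
# Hedged sketch: run Qwen in-process rather than via `openllm start`.
# openllm.LLM(model_id, backend=...) and the awaitable generate() used here are
# assumptions about this release's Python API; verify before relying on them.
import asyncio
import os

import openllm

async def main() -> None:
  os.environ['TRUST_REMOTE_CODE'] = 'True'  # Qwen ships custom modeling code
  llm = openllm.LLM('qwen/Qwen-7B-Chat', backend='pt')
  result = await llm.generate('What are large language models?')
  print(result.outputs[0].text)  # result shape assumed to mirror vLLM-style outputs

asyncio.run(main())
```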
+ +
+ StableLM @@ -1184,25 +1131,6 @@ openllm start stabilityai/stablelm-tuned-alpha-3b --backend vllm openllm start stabilityai/stablelm-tuned-alpha-3b --backend pt ``` -- CTranslate2 (*experimental*): - - -```bash -openllm start stabilityai/stablelm-tuned-alpha-3b --backend ctranslate -``` - - -> **Note:** Currently, All quantization methods from ctranslate2 are supported. This includes int8, int8_float16, int8_bfloat16 - - - -> **Note:** We recommend users to convert the models beforehand, and then provide the given directory of the converted models to `openllm start`. See [CTranslate2](https://opennmt.net/CTranslate2/conversion.html) for more information. - - - -> **Important:** CTranslate2 is an experimental backend and yet to be fully supported. It is recommended to use vLLM for all production use-case. - -
@@ -1279,25 +1207,6 @@ openllm start bigcode/starcoder --backend vllm openllm start bigcode/starcoder --backend pt ``` -- CTranslate2 (*experimental*): - - -```bash -openllm start bigcode/starcoder --backend ctranslate -``` - - -> **Note:** Currently, All quantization methods from ctranslate2 are supported. This includes int8, int8_float16, int8_bfloat16 - - - -> **Note:** We recommend users to convert the models beforehand, and then provide the given directory of the converted models to `openllm start`. See [CTranslate2](https://opennmt.net/CTranslate2/conversion.html) for more information. - - - -> **Important:** CTranslate2 is an experimental backend and yet to be fully supported. It is recommended to use vLLM for all production use-case. - -
diff --git a/openllm-python/pyproject.toml b/openllm-python/pyproject.toml
index 976df512..6543b19f 100644
--- a/openllm-python/pyproject.toml
+++ b/openllm-python/pyproject.toml
@@ -109,7 +109,7 @@ ctranslate = ["ctranslate2>=3.22.0"]
 falcon = ["xformers"]
 fine-tune = ["peft>=0.6.0", "datasets", "trl", "huggingface-hub"]
 full = [
-  "openllm[agents,awq,baichuan,chatglm,ctranslate,falcon,fine-tune,ggml,gptq,grpc,mpt,openai,playground,starcoder,vllm]",
+  "openllm[agents,awq,baichuan,chatglm,ctranslate,falcon,fine-tune,ggml,gptq,grpc,mpt,openai,playground,qwen,starcoder,vllm]",
 ]
 ggml = ["ctransformers"]
 gptq = ["auto-gptq[triton]>=0.4.2"]
@@ -117,6 +117,7 @@ grpc = ["bentoml[grpc]>=1.1.10"]
 mpt = ["triton"]
 openai = ["openai[datalib]>=1", "tiktoken"]
 playground = ["jupyter", "notebook", "ipython", "jupytext", "nbformat"]
+qwen = ["cpm-kernels", "tiktoken"]
 starcoder = ["bitsandbytes"]
 vllm = ["vllm>=0.2.2"]

diff --git a/openllm-python/src/openllm/__init__.pyi b/openllm-python/src/openllm/__init__.pyi
index 6b2cfc18..3cf1d2fb 100644
--- a/openllm-python/src/openllm/__init__.pyi
+++ b/openllm-python/src/openllm/__init__.pyi
@@ -11,7 +11,7 @@ Fine-tune, serve, deploy, and monitor any LLMs with ease.
 '''
 # update-config-stubs.py: import stubs start
-from openllm_core.config import CONFIG_MAPPING as CONFIG_MAPPING, CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES, AutoConfig as AutoConfig, BaichuanConfig as BaichuanConfig, ChatGLMConfig as ChatGLMConfig, DollyV2Config as DollyV2Config, FalconConfig as FalconConfig, FlanT5Config as FlanT5Config, GPTNeoXConfig as GPTNeoXConfig, LlamaConfig as LlamaConfig, MistralConfig as MistralConfig, MPTConfig as MPTConfig, OPTConfig as OPTConfig, PhiConfig as PhiConfig, StableLMConfig as StableLMConfig, StarCoderConfig as StarCoderConfig, YiConfig as YiConfig
+from openllm_core.config import CONFIG_MAPPING as CONFIG_MAPPING, CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES, AutoConfig as AutoConfig, BaichuanConfig as BaichuanConfig, ChatGLMConfig as ChatGLMConfig, DollyV2Config as DollyV2Config, FalconConfig as FalconConfig, FlanT5Config as FlanT5Config, GPTNeoXConfig as GPTNeoXConfig, LlamaConfig as LlamaConfig, MistralConfig as MistralConfig, MPTConfig as MPTConfig, OPTConfig as OPTConfig, PhiConfig as PhiConfig, QwenConfig as QwenConfig, StableLMConfig as StableLMConfig, StarCoderConfig as StarCoderConfig, YiConfig as YiConfig
 # update-config-stubs.py: import stubs stop
 from openllm_cli._sdk import (
diff --git a/openllm-python/src/openllm/serialisation/__init__.py b/openllm-python/src/openllm/serialisation/__init__.py
index ff6a57e2..0960b6a6 100644
--- a/openllm-python/src/openllm/serialisation/__init__.py
+++ b/openllm-python/src/openllm/serialisation/__init__.py
@@ -36,8 +36,6 @@ def load_tokenizer(llm: LLM[M, T], **tokenizer_attrs: t.Any) -> TypeGuard[T]:
       tokenizer.pad_token_id = config.eos_token_id
     elif tokenizer.eos_token_id is not None:
       tokenizer.pad_token_id = tokenizer.eos_token_id
-    else:
-      tokenizer.add_special_tokens({'pad_token': '[PAD]'})
   return tokenizer

 def _make_dispatch_function(fn: str) -> t.Callable[Concatenate[LLM[M, T], P], TypeGuard[M | T | Model]]:
diff --git a/tools/update-readme.py b/tools/update-readme.py
index c3ed2423..e8cb8ae6 100755
--- a/tools/update-readme.py
+++ b/tools/update-readme.py
@@ -94,25 +94,6 @@ openllm query 'What are large language models?'
```""", ] ) - if 'ctranslate' in it['backend']: - details_block.extend( - [ - '\n- CTranslate2 (*experimental*):\n\n', - f"""\ -```bash -{'' if not it['trust_remote_code'] else 'TRUST_REMOTE_CODE=True '}openllm start {it['model_ids'][0]} --backend ctranslate -```""", - *markdown_noteblock( - 'Currently, All quantization methods from ctranslate2 are supported. This includes int8, int8_float16, int8_bfloat16' - ), - *markdown_noteblock( - 'We recommend users to convert the models beforehand, and then provide the given directory of the converted models to `openllm start`. See [CTranslate2](https://opennmt.net/CTranslate2/conversion.html) for more information.' - ), - *markdown_importantblock( - 'CTranslate2 is an experimental backend and yet to be fully supported. It is recommended to use vLLM for all production use-case.' - ), - ] - ) details_block.append('\n
</details>\n\n') content.append('\n'.join(details_block))
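The `load_tokenizer` change above drops the branch that registered a brand-new `[PAD]` token when neither the model config nor the tokenizer provides an EOS id. A simplified, self-contained restatement of the remaining fallback follows; the rationale in the comment (keeping the vocabulary in sync with the model's embedding matrix) is an inference, not something stated in the diff.

```python
# Simplified restatement of the pad-token fallback in load_tokenizer after this change:
# reuse an existing EOS id when a tokenizer ships without a pad token, instead of adding
# a new '[PAD]' special token (adding one would grow the vocabulary out of sync with the
# model's embedding matrix; that rationale is an inference, not taken from the diff).
def ensure_pad_token(tokenizer, model_config):
  if tokenizer.pad_token_id is not None:
    return tokenizer
  if getattr(model_config, 'eos_token_id', None) is not None:
    tokenizer.pad_token_id = model_config.eos_token_id
  elif tokenizer.eos_token_id is not None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
  # previously: tokenizer.add_special_tokens({'pad_token': '[PAD]'})
  return tokenizer
```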