feat(models): Support qwen (#742)

* support qwen

* support qwen

* ci: auto fixes from pre-commit.ci

For more information, see https://pre-commit.ci

* Update openllm-core/src/openllm_core/config/configuration_qwen.py

Signed-off-by: Aaron Pham <29749331+aarnphm@users.noreply.github.com>

* chore: update correct readme and supports qwen models

Signed-off-by: Aaron Pham <29749331+aarnphm@users.noreply.github.com>

---------

Signed-off-by: Aaron Pham <29749331+aarnphm@users.noreply.github.com>
Co-authored-by: root <yansheng105@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Aaron Pham <29749331+aarnphm@users.noreply.github.com>
This commit is contained in:
yansheng
2023-11-30 19:54:17 +08:00
committed by GitHub
parent c63bb82000
commit 3cb7f14fc1
9 changed files with 204 additions and 365 deletions

openllm-python/README.md generated

@@ -414,25 +414,6 @@ openllm start databricks/dolly-v2-3b --backend vllm
openllm start databricks/dolly-v2-3b --backend pt
```
- CTranslate2 (*experimental*):
```bash
openllm start databricks/dolly-v2-3b --backend ctranslate
```
> **Note:** Currently, all quantization methods from CTranslate2 are supported, including int8, int8_float16, and int8_bfloat16.
> **Note:** We recommend converting models beforehand and then passing the directory of the converted model to `openllm start`. See [CTranslate2](https://opennmt.net/CTranslate2/conversion.html) for more information.
> **Important:** CTranslate2 is an experimental backend and is not yet fully supported. We recommend vLLM for all production use cases.
</details>
<details>
@@ -511,25 +492,6 @@ openllm start tiiuae/falcon-7b --backend vllm
openllm start tiiuae/falcon-7b --backend pt
```
- CTranslate2 (*experimental*):
```bash
openllm start tiiuae/falcon-7b --backend ctranslate
```
> **Note:** Currently, all quantization methods from CTranslate2 are supported, including int8, int8_float16, and int8_bfloat16.
> **Note:** We recommend converting models beforehand and then passing the directory of the converted model to `openllm start`. See [CTranslate2](https://opennmt.net/CTranslate2/conversion.html) for more information.
> **Important:** CTranslate2 is an experimental backend and is not yet fully supported. We recommend vLLM for all production use cases.
</details>
<details>
@@ -651,25 +613,6 @@ openllm start eleutherai/gpt-neox-20b --backend vllm
openllm start eleutherai/gpt-neox-20b --backend pt
```
- CTranslate2 (*experimental*):
```bash
openllm start eleutherai/gpt-neox-20b --backend ctranslate
```
> **Note:** Currently, all quantization methods from CTranslate2 are supported, including int8, int8_float16, and int8_bfloat16.
> **Note:** We recommend converting models beforehand and then passing the directory of the converted model to `openllm start`. See [CTranslate2](https://opennmt.net/CTranslate2/conversion.html) for more information.
> **Important:** CTranslate2 is an experimental backend and is not yet fully supported. We recommend vLLM for all production use cases.
</details>
<details>
@@ -748,25 +691,6 @@ openllm start meta-llama/Llama-2-70b-chat-hf --backend vllm
openllm start meta-llama/Llama-2-70b-chat-hf --backend pt
```
- CTranslate2 (*experimental*):
```bash
openllm start meta-llama/Llama-2-70b-chat-hf --backend ctranslate
```
> **Note:** Currently, all quantization methods from CTranslate2 are supported, including int8, int8_float16, and int8_bfloat16.
> **Note:** We recommend converting models beforehand and then passing the directory of the converted model to `openllm start`. See [CTranslate2](https://opennmt.net/CTranslate2/conversion.html) for more information.
> **Important:** CTranslate2 is an experimental backend and is not yet fully supported. We recommend vLLM for all production use cases.
</details>
<details>
@@ -837,25 +761,6 @@ openllm start HuggingFaceH4/zephyr-7b-alpha --backend vllm
openllm start HuggingFaceH4/zephyr-7b-alpha --backend pt
```
- CTranslate2 (*experimental*):
```bash
openllm start HuggingFaceH4/zephyr-7b-alpha --backend ctranslate
```
> **Note:** Currently, all quantization methods from CTranslate2 are supported, including int8, int8_float16, and int8_bfloat16.
> **Note:** We recommend converting models beforehand and then passing the directory of the converted model to `openllm start`. See [CTranslate2](https://opennmt.net/CTranslate2/conversion.html) for more information.
> **Important:** CTranslate2 is an experimental backend and is not yet fully supported. We recommend vLLM for all production use cases.
</details>
<details>
@@ -937,25 +842,6 @@ TRUST_REMOTE_CODE=True openllm start mosaicml/mpt-7b --backend vllm
TRUST_REMOTE_CODE=True openllm start mosaicml/mpt-7b --backend pt
```
- CTranslate2 (*experimental*):
```bash
TRUST_REMOTE_CODE=True openllm start mosaicml/mpt-7b --backend ctranslate
```
> **Note:** Currently, all quantization methods from CTranslate2 are supported, including int8, int8_float16, and int8_bfloat16.
> **Note:** We recommend converting models beforehand and then passing the directory of the converted model to `openllm start`. See [CTranslate2](https://opennmt.net/CTranslate2/conversion.html) for more information.
> **Important:** CTranslate2 is an experimental backend and is not yet fully supported. We recommend vLLM for all production use cases.
</details>
<details>
@@ -1028,25 +914,6 @@ openllm start facebook/opt-125m --backend vllm
openllm start facebook/opt-125m --backend pt
```
- CTranslate2 (*experimental*):
```bash
openllm start facebook/opt-125m --backend ctranslate
```
> **Note:** Currently, all quantization methods from CTranslate2 are supported, including int8, int8_float16, and int8_bfloat16.
> **Note:** We recommend converting models beforehand and then passing the directory of the converted model to `openllm start`. See [CTranslate2](https://opennmt.net/CTranslate2/conversion.html) for more information.
> **Important:** CTranslate2 is an experimental backend and is not yet fully supported. We recommend vLLM for all production use cases.
</details>
<details>
@@ -1118,6 +985,86 @@ TRUST_REMOTE_CODE=True openllm start microsoft/phi-1_5 --backend pt
<details>
<summary>Qwen</summary>
### Quickstart
> **Note:** Qwen requires installing OpenLLM with:
> ```bash
> pip install "openllm[qwen]"
> ```
Run the following command to quickly spin up a Qwen server:
```bash
TRUST_REMOTE_CODE=True openllm start qwen/Qwen-7B-Chat
```
In a different terminal, run the following command to interact with the server:
```bash
export OPENLLM_ENDPOINT=http://localhost:3000
openllm query 'What are large language models?'
```
> **Note:** Any Qwen variant can be deployed with OpenLLM. Visit the [HuggingFace Model Hub](https://huggingface.co/models?sort=trending&search=qwen) to see more Qwen-compatible models.
### Supported models
You can specify any of the following Qwen models via `openllm start`:
- [qwen/Qwen-7B-Chat](https://huggingface.co/qwen/Qwen-7B-Chat)
- [qwen/Qwen-7B-Chat-Int8](https://huggingface.co/qwen/Qwen-7B-Chat-Int8)
- [qwen/Qwen-7B-Chat-Int4](https://huggingface.co/qwen/Qwen-7B-Chat-Int4)
- [qwen/Qwen-14B-Chat](https://huggingface.co/qwen/Qwen-14B-Chat)
- [qwen/Qwen-14B-Chat-Int8](https://huggingface.co/qwen/Qwen-14B-Chat-Int8)
- [qwen/Qwen-14B-Chat-Int4](https://huggingface.co/qwen/Qwen-14B-Chat-Int4)
### Supported backends
OpenLLM supports both vLLM and PyTorch backends for Qwen. By default, it uses vLLM when vLLM is available and otherwise falls back to PyTorch.
> **Important:** We recommend explicitly passing `--backend` to choose the desired backend for running the model. If you have access to a GPU, always use `--backend vllm`.
- vLLM (Recommended):
To install vLLM, run `pip install "openllm[vllm]"`
```bash
TRUST_REMOTE_CODE=True openllm start qwen/Qwen-7B-Chat --backend vllm
```
> **Important:** Using vLLM requires a GPU with compute capability 8.0 or newer to get the best serving performance. For all production serving use cases, choose vLLM.
> **Note:** Adapters are not yet supported with vLLM.
- PyTorch:
```bash
TRUST_REMOTE_CODE=True openllm start qwen/Qwen-7B-Chat --backend pt
```
</details>
<details>
<summary>StableLM</summary>
@@ -1184,25 +1131,6 @@ openllm start stabilityai/stablelm-tuned-alpha-3b --backend vllm
openllm start stabilityai/stablelm-tuned-alpha-3b --backend pt
```
- CTranslate2 (*experimental*):
```bash
openllm start stabilityai/stablelm-tuned-alpha-3b --backend ctranslate
```
> **Note:** Currently, all quantization methods from CTranslate2 are supported, including int8, int8_float16, and int8_bfloat16.
> **Note:** We recommend converting models beforehand and then passing the directory of the converted model to `openllm start`. See [CTranslate2](https://opennmt.net/CTranslate2/conversion.html) for more information.
> **Important:** CTranslate2 is an experimental backend and is not yet fully supported. We recommend vLLM for all production use cases.
</details>
<details>
@@ -1279,25 +1207,6 @@ openllm start bigcode/starcoder --backend vllm
openllm start bigcode/starcoder --backend pt
```
- CTranslate2 (*experimental*):
```bash
openllm start bigcode/starcoder --backend ctranslate
```
> **Note:** Currently, all quantization methods from CTranslate2 are supported, including int8, int8_float16, and int8_bfloat16.
> **Note:** We recommend converting models beforehand and then passing the directory of the converted model to `openllm start`. See [CTranslate2](https://opennmt.net/CTranslate2/conversion.html) for more information.
> **Important:** CTranslate2 is an experimental backend and is not yet fully supported. We recommend vLLM for all production use cases.
</details>
<details>


@@ -109,7 +109,7 @@ ctranslate = ["ctranslate2>=3.22.0"]
falcon = ["xformers"]
fine-tune = ["peft>=0.6.0", "datasets", "trl", "huggingface-hub"]
full = [
"openllm[agents,awq,baichuan,chatglm,ctranslate,falcon,fine-tune,ggml,gptq,grpc,mpt,openai,playground,starcoder,vllm]",
"openllm[agents,awq,baichuan,chatglm,ctranslate,falcon,fine-tune,ggml,gptq,grpc,mpt,openai,playground,qwen,starcoder,vllm]",
]
ggml = ["ctransformers"]
gptq = ["auto-gptq[triton]>=0.4.2"]
@@ -117,6 +117,7 @@ grpc = ["bentoml[grpc]>=1.1.10"]
mpt = ["triton"]
openai = ["openai[datalib]>=1", "tiktoken"]
playground = ["jupyter", "notebook", "ipython", "jupytext", "nbformat"]
qwen = ["cpm-kernels", "tiktoken"]
starcoder = ["bitsandbytes"]
vllm = ["vllm>=0.2.2"]


@@ -11,7 +11,7 @@ Fine-tune, serve, deploy, and monitor any LLMs with ease.
'''
# update-config-stubs.py: import stubs start
from openllm_core.config import CONFIG_MAPPING as CONFIG_MAPPING, CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES, AutoConfig as AutoConfig, BaichuanConfig as BaichuanConfig, ChatGLMConfig as ChatGLMConfig, DollyV2Config as DollyV2Config, FalconConfig as FalconConfig, FlanT5Config as FlanT5Config, GPTNeoXConfig as GPTNeoXConfig, LlamaConfig as LlamaConfig, MistralConfig as MistralConfig, MPTConfig as MPTConfig, OPTConfig as OPTConfig, PhiConfig as PhiConfig, StableLMConfig as StableLMConfig, StarCoderConfig as StarCoderConfig, YiConfig as YiConfig
from openllm_core.config import CONFIG_MAPPING as CONFIG_MAPPING, CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES, AutoConfig as AutoConfig, BaichuanConfig as BaichuanConfig, ChatGLMConfig as ChatGLMConfig, DollyV2Config as DollyV2Config, FalconConfig as FalconConfig, FlanT5Config as FlanT5Config, GPTNeoXConfig as GPTNeoXConfig, LlamaConfig as LlamaConfig, MistralConfig as MistralConfig, MPTConfig as MPTConfig, OPTConfig as OPTConfig, PhiConfig as PhiConfig, QwenConfig as QwenConfig, StableLMConfig as StableLMConfig, StarCoderConfig as StarCoderConfig, YiConfig as YiConfig
# update-config-stubs.py: import stubs stop
from openllm_cli._sdk import (


@@ -36,8 +36,6 @@ def load_tokenizer(llm: LLM[M, T], **tokenizer_attrs: t.Any) -> TypeGuard[T]:
tokenizer.pad_token_id = config.eos_token_id
elif tokenizer.eos_token_id is not None:
tokenizer.pad_token_id = tokenizer.eos_token_id
else:
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
return tokenizer
def _make_dispatch_function(fn: str) -> t.Callable[Concatenate[LLM[M, T], P], TypeGuard[M | T | Model]]: