mirror of
https://github.com/bentoml/OpenLLM.git
synced 2026-05-02 21:02:43 -04:00
feat(models): Support qwen (#742)
* support qwen * support qwen * ci: auto fixes from pre-commit.ci For more information, see https://pre-commit.ci * Update openllm-core/src/openllm_core/config/configuration_qwen.py Signed-off-by: Aaron Pham <29749331+aarnphm@users.noreply.github.com> * chore: update correct readme and supports qwen models Signed-off-by: Aaron Pham <29749331+aarnphm@users.noreply.github.com> --------- Signed-off-by: Aaron Pham <29749331+aarnphm@users.noreply.github.com> Co-authored-by: root <yansheng105@gmail.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Aaron Pham <29749331+aarnphm@users.noreply.github.com>
This commit is contained in:
251
README.md
251
README.md
@@ -414,25 +414,6 @@ openllm start databricks/dolly-v2-3b --backend vllm
|
||||
openllm start databricks/dolly-v2-3b --backend pt
|
||||
```
|
||||
|
||||
- CTranslate2 (*experimental*):
|
||||
|
||||
|
||||
```bash
|
||||
openllm start databricks/dolly-v2-3b --backend ctranslate
|
||||
```
|
||||
|
||||
|
||||
> **Note:** Currently, all quantization methods from CTranslate2 are supported, including int8, int8_float16, and int8_bfloat16
|
||||
|
||||
|
||||
|
||||
> **Note:** We recommend users to convert the models beforehand, and then provide the given directory of the converted models to `openllm start`. See [CTranslate2](https://opennmt.net/CTranslate2/conversion.html) for more information.
|
||||
|
||||
|
||||
|
||||
> **Important:** CTranslate2 is an experimental backend and yet to be fully supported. It is recommended to use vLLM for all production use-case.
|
||||
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
@@ -511,25 +492,6 @@ openllm start tiiuae/falcon-7b --backend vllm
|
||||
openllm start tiiuae/falcon-7b --backend pt
|
||||
```
|
||||
|
||||
- CTranslate2 (*experimental*):
|
||||
|
||||
|
||||
```bash
|
||||
openllm start tiiuae/falcon-7b --backend ctranslate
|
||||
```
|
||||
|
||||
|
||||
> **Note:** Currently, All quantization methods from ctranslate2 are supported. This includes int8, int8_float16, int8_bfloat16
|
||||
|
||||
|
||||
|
||||
> **Note:** We recommend users to convert the models beforehand, and then provide the given directory of the converted models to `openllm start`. See [CTranslate2](https://opennmt.net/CTranslate2/conversion.html) for more information.
|
||||
|
||||
|
||||
|
||||
> **Important:** CTranslate2 is an experimental backend and yet to be fully supported. It is recommended to use vLLM for all production use-case.
|
||||
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
@@ -651,25 +613,6 @@ openllm start eleutherai/gpt-neox-20b --backend vllm
|
||||
openllm start eleutherai/gpt-neox-20b --backend pt
|
||||
```
|
||||
|
||||
- CTranslate2 (*experimental*):
|
||||
|
||||
|
||||
```bash
|
||||
openllm start eleutherai/gpt-neox-20b --backend ctranslate
|
||||
```
|
||||
|
||||
|
||||
> **Note:** Currently, All quantization methods from ctranslate2 are supported. This includes int8, int8_float16, int8_bfloat16
|
||||
|
||||
|
||||
|
||||
> **Note:** We recommend users to convert the models beforehand, and then provide the given directory of the converted models to `openllm start`. See [CTranslate2](https://opennmt.net/CTranslate2/conversion.html) for more information.
|
||||
|
||||
|
||||
|
||||
> **Important:** CTranslate2 is an experimental backend and yet to be fully supported. It is recommended to use vLLM for all production use-case.
|
||||
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
@@ -748,25 +691,6 @@ openllm start meta-llama/Llama-2-70b-chat-hf --backend vllm
|
||||
openllm start meta-llama/Llama-2-70b-chat-hf --backend pt
|
||||
```
|
||||
|
||||
- CTranslate2 (*experimental*):
|
||||
|
||||
|
||||
```bash
|
||||
openllm start meta-llama/Llama-2-70b-chat-hf --backend ctranslate
|
||||
```
|
||||
|
||||
|
||||
> **Note:** Currently, All quantization methods from ctranslate2 are supported. This includes int8, int8_float16, int8_bfloat16
|
||||
|
||||
|
||||
|
||||
> **Note:** We recommend users to convert the models beforehand, and then provide the given directory of the converted models to `openllm start`. See [CTranslate2](https://opennmt.net/CTranslate2/conversion.html) for more information.
|
||||
|
||||
|
||||
|
||||
> **Important:** CTranslate2 is an experimental backend and yet to be fully supported. It is recommended to use vLLM for all production use-case.
|
||||
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
@@ -837,25 +761,6 @@ openllm start HuggingFaceH4/zephyr-7b-alpha --backend vllm
|
||||
openllm start HuggingFaceH4/zephyr-7b-alpha --backend pt
|
||||
```
|
||||
|
||||
- CTranslate2 (*experimental*):
|
||||
|
||||
|
||||
```bash
|
||||
openllm start HuggingFaceH4/zephyr-7b-alpha --backend ctranslate
|
||||
```
|
||||
|
||||
|
||||
> **Note:** Currently, All quantization methods from ctranslate2 are supported. This includes int8, int8_float16, int8_bfloat16
|
||||
|
||||
|
||||
|
||||
> **Note:** We recommend users to convert the models beforehand, and then provide the given directory of the converted models to `openllm start`. See [CTranslate2](https://opennmt.net/CTranslate2/conversion.html) for more information.
|
||||
|
||||
|
||||
|
||||
> **Important:** CTranslate2 is an experimental backend and yet to be fully supported. It is recommended to use vLLM for all production use-case.
|
||||
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
@@ -937,25 +842,6 @@ TRUST_REMOTE_CODE=True openllm start mosaicml/mpt-7b --backend vllm
|
||||
TRUST_REMOTE_CODE=True openllm start mosaicml/mpt-7b --backend pt
|
||||
```
|
||||
|
||||
- CTranslate2 (*experimental*):
|
||||
|
||||
|
||||
```bash
|
||||
TRUST_REMOTE_CODE=True openllm start mosaicml/mpt-7b --backend ctranslate
|
||||
```
|
||||
|
||||
|
||||
> **Note:** Currently, All quantization methods from ctranslate2 are supported. This includes int8, int8_float16, int8_bfloat16
|
||||
|
||||
|
||||
|
||||
> **Note:** We recommend users to convert the models beforehand, and then provide the given directory of the converted models to `openllm start`. See [CTranslate2](https://opennmt.net/CTranslate2/conversion.html) for more information.
|
||||
|
||||
|
||||
|
||||
> **Important:** CTranslate2 is an experimental backend and yet to be fully supported. It is recommended to use vLLM for all production use-case.
|
||||
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
@@ -1028,25 +914,6 @@ openllm start facebook/opt-125m --backend vllm
|
||||
openllm start facebook/opt-125m --backend pt
|
||||
```
|
||||
|
||||
- CTranslate2 (*experimental*):
|
||||
|
||||
|
||||
```bash
|
||||
openllm start facebook/opt-125m --backend ctranslate
|
||||
```
|
||||
|
||||
|
||||
> **Note:** Currently, All quantization methods from ctranslate2 are supported. This includes int8, int8_float16, int8_bfloat16
|
||||
|
||||
|
||||
|
||||
> **Note:** We recommend users to convert the models beforehand, and then provide the given directory of the converted models to `openllm start`. See [CTranslate2](https://opennmt.net/CTranslate2/conversion.html) for more information.
|
||||
|
||||
|
||||
|
||||
> **Important:** CTranslate2 is an experimental backend and yet to be fully supported. It is recommended to use vLLM for all production use-case.
|
||||
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
@@ -1118,6 +985,86 @@ TRUST_REMOTE_CODE=True openllm start microsoft/phi-1_5 --backend pt
|
||||
|
||||
<details>
|
||||
|
||||
<summary>Qwen</summary>
|
||||
|
||||
|
||||
### Quickstart
|
||||
|
||||
|
||||
|
||||
> **Note:** Qwen requires installing additional dependencies with:
|
||||
> ```bash
|
||||
> pip install "openllm[qwen]"
|
||||
> ```
|
||||
|
||||
|
||||
Run the following command to quickly spin up a Qwen server:
|
||||
|
||||
```bash
|
||||
TRUST_REMOTE_CODE=True openllm start qwen/Qwen-7B-Chat
|
||||
```
|
||||
In a different terminal, run the following command to interact with the server:
|
||||
|
||||
```bash
|
||||
export OPENLLM_ENDPOINT=http://localhost:3000
|
||||
openllm query 'What are large language models?'
|
||||
```
|
||||
|
||||
|
||||
> **Note:** Any Qwen variants can be deployed with OpenLLM. Visit the [HuggingFace Model Hub](https://huggingface.co/models?sort=trending&search=qwen) to see more Qwen-compatible models.
|
||||
|
||||
|
||||
|
||||
### Supported models
|
||||
|
||||
You can specify any of the following Qwen models via `openllm start`:
|
||||
|
||||
|
||||
- [qwen/Qwen-7B-Chat](https://huggingface.co/qwen/Qwen-7B-Chat)
|
||||
- [qwen/Qwen-7B-Chat-Int8](https://huggingface.co/qwen/Qwen-7B-Chat-Int8)
|
||||
- [qwen/Qwen-7B-Chat-Int4](https://huggingface.co/qwen/Qwen-7B-Chat-Int4)
|
||||
- [qwen/Qwen-14B-Chat](https://huggingface.co/qwen/Qwen-14B-Chat)
|
||||
- [qwen/Qwen-14B-Chat-Int8](https://huggingface.co/qwen/Qwen-14B-Chat-Int8)
|
||||
- [qwen/Qwen-14B-Chat-Int4](https://huggingface.co/qwen/Qwen-14B-Chat-Int4)
|
||||
|
||||
### Supported backends
|
||||
|
||||
OpenLLM supports both vLLM and PyTorch as backends. By default, it uses vLLM if vLLM is available, and otherwise falls back to PyTorch.
|
||||
|
||||
|
||||
|
||||
> **Important:** We recommend users to explicitly specify `--backend` to choose the desired backend to run the model. If you have access to a GPU, always use `--backend vllm`.
|
||||
|
||||
|
||||
|
||||
- vLLM (Recommended):
|
||||
|
||||
|
||||
To install vLLM, run `pip install "openllm[vllm]"`
|
||||
|
||||
```bash
|
||||
TRUST_REMOTE_CODE=True openllm start qwen/Qwen-7B-Chat --backend vllm
|
||||
```
|
||||
|
||||
|
||||
> **Important:** Using vLLM requires a GPU that has architecture newer than 8.0 to get the best performance for serving. It is recommended that for all serving usecase in production, you should choose vLLM for serving.
|
||||
|
||||
|
||||
|
||||
> **Note:** Currently, adapters are yet to be supported with vLLM.
|
||||
|
||||
|
||||
- PyTorch:
|
||||
|
||||
|
||||
```bash
|
||||
TRUST_REMOTE_CODE=True openllm start qwen/Qwen-7B-Chat --backend pt
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
|
||||
<summary>StableLM</summary>
|
||||
|
||||
|
||||
@@ -1184,25 +1131,6 @@ openllm start stabilityai/stablelm-tuned-alpha-3b --backend vllm
|
||||
openllm start stabilityai/stablelm-tuned-alpha-3b --backend pt
|
||||
```
|
||||
|
||||
- CTranslate2 (*experimental*):
|
||||
|
||||
|
||||
```bash
|
||||
openllm start stabilityai/stablelm-tuned-alpha-3b --backend ctranslate
|
||||
```
|
||||
|
||||
|
||||
> **Note:** Currently, All quantization methods from ctranslate2 are supported. This includes int8, int8_float16, int8_bfloat16
|
||||
|
||||
|
||||
|
||||
> **Note:** We recommend users to convert the models beforehand, and then provide the given directory of the converted models to `openllm start`. See [CTranslate2](https://opennmt.net/CTranslate2/conversion.html) for more information.
|
||||
|
||||
|
||||
|
||||
> **Important:** CTranslate2 is an experimental backend and yet to be fully supported. It is recommended to use vLLM for all production use-case.
|
||||
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
@@ -1279,25 +1207,6 @@ openllm start bigcode/starcoder --backend vllm
|
||||
openllm start bigcode/starcoder --backend pt
|
||||
```
|
||||
|
||||
- CTranslate2 (*experimental*):
|
||||
|
||||
|
||||
```bash
|
||||
openllm start bigcode/starcoder --backend ctranslate
|
||||
```
|
||||
|
||||
|
||||
> **Note:** Currently, All quantization methods from ctranslate2 are supported. This includes int8, int8_float16, int8_bfloat16
|
||||
|
||||
|
||||
|
||||
> **Note:** We recommend users to convert the models beforehand, and then provide the given directory of the converted models to `openllm start`. See [CTranslate2](https://opennmt.net/CTranslate2/conversion.html) for more information.
|
||||
|
||||
|
||||
|
||||
> **Important:** CTranslate2 is an experimental backend and yet to be fully supported. It is recommended to use vLLM for all production use-case.
|
||||
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
|
||||
@@ -16,6 +16,7 @@ from .configuration_mistral import MistralConfig as MistralConfig
|
||||
from .configuration_mpt import MPTConfig as MPTConfig
|
||||
from .configuration_opt import OPTConfig as OPTConfig
|
||||
from .configuration_phi import PhiConfig as PhiConfig
|
||||
from .configuration_qwen import QwenConfig as QwenConfig
|
||||
from .configuration_stablelm import StableLMConfig as StableLMConfig
|
||||
from .configuration_starcoder import StarCoderConfig as StarCoderConfig
|
||||
from .configuration_yi import YiConfig as YiConfig
|
||||
|
||||
@@ -35,6 +35,7 @@ CONFIG_MAPPING_NAMES = OrderedDict(
|
||||
('mpt', 'MPTConfig'),
|
||||
('opt', 'OPTConfig'),
|
||||
('phi', 'PhiConfig'),
|
||||
('qwen', 'QwenConfig'),
|
||||
('starcoder', 'StarCoderConfig'),
|
||||
('mistral', 'MistralConfig'),
|
||||
('yi', 'YiConfig'),
|
||||
@@ -145,6 +146,9 @@ class AutoConfig:
|
||||
def for_model(cls, model_name: t.Literal['phi'], **attrs: t.Any) -> openllm_core.config.PhiConfig: ...
|
||||
@t.overload
|
||||
@classmethod
|
||||
def for_model(cls, model_name: t.Literal['qwen'], **attrs: t.Any) -> openllm_core.config.QwenConfig: ...
|
||||
@t.overload
|
||||
@classmethod
|
||||
def for_model(cls, model_name: t.Literal['stablelm'], **attrs: t.Any) -> openllm_core.config.StableLMConfig: ...
|
||||
@t.overload
|
||||
@classmethod
|
||||
|
||||
36
openllm-core/src/openllm_core/config/configuration_qwen.py
Normal file
36
openllm-core/src/openllm_core/config/configuration_qwen.py
Normal file
@@ -0,0 +1,36 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import openllm_core
|
||||
|
||||
class QwenConfig(openllm_core.LLMConfig):
  """Qwen-7B is the 7B-parameter version of the large language model series, Qwen (abbr. Tongyi Qianwen),
  proposed by Alibaba Cloud. Qwen-14B is a Transformer-based large language model,
  which is pretrained on a large volume of data, including web texts, books, codes, etc.
  Additionally, based on the pretrained Qwen-14B, we release Qwen-14B-Chat, a large-model-based AI assistant,
  which is trained with alignment techniques.
  Refer to [Qwen's GitHub page](https://github.com/QwenLM/Qwen) for more information.
  """

  # Declarative metadata consumed by openllm_core.LLMConfig.
  __config__ = {
    'name_type': 'lowercase',  # normalize the model name to lowercase (e.g. 'qwen')
    'trust_remote_code': True,  # Qwen checkpoints ship custom modeling code on the HF Hub
    'timeout': 3600000,  # NOTE(review): presumably milliseconds (1 hour) — confirm unit upstream
    'url': 'https://github.com/QwenLM/Qwen',
    'requirements': ['cpm-kernels', 'tiktoken'],  # extra pip deps; mirrors the 'qwen' extra in pyproject
    'backend': ('pt', 'vllm'),  # supported backends: PyTorch and vLLM
    'architecture': 'QWenLMHeadModel',  # HF `architectures` entry used to match checkpoints
    'default_id': 'qwen/Qwen-7B-Chat',  # model used when no explicit id is given
    'model_ids': [
      'qwen/Qwen-7B-Chat',
      'qwen/Qwen-7B-Chat-Int8',
      'qwen/Qwen-7B-Chat-Int4',
      'qwen/Qwen-14B-Chat',
      'qwen/Qwen-14B-Chat-Int8',
      'qwen/Qwen-14B-Chat-Int4',
    ],
  }

  # Default sampling parameters for generation.
  # NOTE(review): presumably picked up by LLMConfig's GenerationConfig machinery — confirm.
  class GenerationConfig:
    max_new_tokens: int = 2048
    top_p: float = 0.7
    temperature: float = 0.95
|
||||
251
openllm-python/README.md
generated
251
openllm-python/README.md
generated
@@ -414,25 +414,6 @@ openllm start databricks/dolly-v2-3b --backend vllm
|
||||
openllm start databricks/dolly-v2-3b --backend pt
|
||||
```
|
||||
|
||||
- CTranslate2 (*experimental*):
|
||||
|
||||
|
||||
```bash
|
||||
openllm start databricks/dolly-v2-3b --backend ctranslate
|
||||
```
|
||||
|
||||
|
||||
> **Note:** Currently, All quantization methods from ctranslate2 are supported. This includes int8, int8_float16, int8_bfloat16
|
||||
|
||||
|
||||
|
||||
> **Note:** We recommend users to convert the models beforehand, and then provide the given directory of the converted models to `openllm start`. See [CTranslate2](https://opennmt.net/CTranslate2/conversion.html) for more information.
|
||||
|
||||
|
||||
|
||||
> **Important:** CTranslate2 is an experimental backend and yet to be fully supported. It is recommended to use vLLM for all production use-case.
|
||||
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
@@ -511,25 +492,6 @@ openllm start tiiuae/falcon-7b --backend vllm
|
||||
openllm start tiiuae/falcon-7b --backend pt
|
||||
```
|
||||
|
||||
- CTranslate2 (*experimental*):
|
||||
|
||||
|
||||
```bash
|
||||
openllm start tiiuae/falcon-7b --backend ctranslate
|
||||
```
|
||||
|
||||
|
||||
> **Note:** Currently, All quantization methods from ctranslate2 are supported. This includes int8, int8_float16, int8_bfloat16
|
||||
|
||||
|
||||
|
||||
> **Note:** We recommend users to convert the models beforehand, and then provide the given directory of the converted models to `openllm start`. See [CTranslate2](https://opennmt.net/CTranslate2/conversion.html) for more information.
|
||||
|
||||
|
||||
|
||||
> **Important:** CTranslate2 is an experimental backend and yet to be fully supported. It is recommended to use vLLM for all production use-case.
|
||||
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
@@ -651,25 +613,6 @@ openllm start eleutherai/gpt-neox-20b --backend vllm
|
||||
openllm start eleutherai/gpt-neox-20b --backend pt
|
||||
```
|
||||
|
||||
- CTranslate2 (*experimental*):
|
||||
|
||||
|
||||
```bash
|
||||
openllm start eleutherai/gpt-neox-20b --backend ctranslate
|
||||
```
|
||||
|
||||
|
||||
> **Note:** Currently, All quantization methods from ctranslate2 are supported. This includes int8, int8_float16, int8_bfloat16
|
||||
|
||||
|
||||
|
||||
> **Note:** We recommend users to convert the models beforehand, and then provide the given directory of the converted models to `openllm start`. See [CTranslate2](https://opennmt.net/CTranslate2/conversion.html) for more information.
|
||||
|
||||
|
||||
|
||||
> **Important:** CTranslate2 is an experimental backend and yet to be fully supported. It is recommended to use vLLM for all production use-case.
|
||||
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
@@ -748,25 +691,6 @@ openllm start meta-llama/Llama-2-70b-chat-hf --backend vllm
|
||||
openllm start meta-llama/Llama-2-70b-chat-hf --backend pt
|
||||
```
|
||||
|
||||
- CTranslate2 (*experimental*):
|
||||
|
||||
|
||||
```bash
|
||||
openllm start meta-llama/Llama-2-70b-chat-hf --backend ctranslate
|
||||
```
|
||||
|
||||
|
||||
> **Note:** Currently, All quantization methods from ctranslate2 are supported. This includes int8, int8_float16, int8_bfloat16
|
||||
|
||||
|
||||
|
||||
> **Note:** We recommend users to convert the models beforehand, and then provide the given directory of the converted models to `openllm start`. See [CTranslate2](https://opennmt.net/CTranslate2/conversion.html) for more information.
|
||||
|
||||
|
||||
|
||||
> **Important:** CTranslate2 is an experimental backend and yet to be fully supported. It is recommended to use vLLM for all production use-case.
|
||||
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
@@ -837,25 +761,6 @@ openllm start HuggingFaceH4/zephyr-7b-alpha --backend vllm
|
||||
openllm start HuggingFaceH4/zephyr-7b-alpha --backend pt
|
||||
```
|
||||
|
||||
- CTranslate2 (*experimental*):
|
||||
|
||||
|
||||
```bash
|
||||
openllm start HuggingFaceH4/zephyr-7b-alpha --backend ctranslate
|
||||
```
|
||||
|
||||
|
||||
> **Note:** Currently, All quantization methods from ctranslate2 are supported. This includes int8, int8_float16, int8_bfloat16
|
||||
|
||||
|
||||
|
||||
> **Note:** We recommend users to convert the models beforehand, and then provide the given directory of the converted models to `openllm start`. See [CTranslate2](https://opennmt.net/CTranslate2/conversion.html) for more information.
|
||||
|
||||
|
||||
|
||||
> **Important:** CTranslate2 is an experimental backend and yet to be fully supported. It is recommended to use vLLM for all production use-case.
|
||||
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
@@ -937,25 +842,6 @@ TRUST_REMOTE_CODE=True openllm start mosaicml/mpt-7b --backend vllm
|
||||
TRUST_REMOTE_CODE=True openllm start mosaicml/mpt-7b --backend pt
|
||||
```
|
||||
|
||||
- CTranslate2 (*experimental*):
|
||||
|
||||
|
||||
```bash
|
||||
TRUST_REMOTE_CODE=True openllm start mosaicml/mpt-7b --backend ctranslate
|
||||
```
|
||||
|
||||
|
||||
> **Note:** Currently, All quantization methods from ctranslate2 are supported. This includes int8, int8_float16, int8_bfloat16
|
||||
|
||||
|
||||
|
||||
> **Note:** We recommend users to convert the models beforehand, and then provide the given directory of the converted models to `openllm start`. See [CTranslate2](https://opennmt.net/CTranslate2/conversion.html) for more information.
|
||||
|
||||
|
||||
|
||||
> **Important:** CTranslate2 is an experimental backend and yet to be fully supported. It is recommended to use vLLM for all production use-case.
|
||||
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
@@ -1028,25 +914,6 @@ openllm start facebook/opt-125m --backend vllm
|
||||
openllm start facebook/opt-125m --backend pt
|
||||
```
|
||||
|
||||
- CTranslate2 (*experimental*):
|
||||
|
||||
|
||||
```bash
|
||||
openllm start facebook/opt-125m --backend ctranslate
|
||||
```
|
||||
|
||||
|
||||
> **Note:** Currently, All quantization methods from ctranslate2 are supported. This includes int8, int8_float16, int8_bfloat16
|
||||
|
||||
|
||||
|
||||
> **Note:** We recommend users to convert the models beforehand, and then provide the given directory of the converted models to `openllm start`. See [CTranslate2](https://opennmt.net/CTranslate2/conversion.html) for more information.
|
||||
|
||||
|
||||
|
||||
> **Important:** CTranslate2 is an experimental backend and yet to be fully supported. It is recommended to use vLLM for all production use-case.
|
||||
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
@@ -1118,6 +985,86 @@ TRUST_REMOTE_CODE=True openllm start microsoft/phi-1_5 --backend pt
|
||||
|
||||
<details>
|
||||
|
||||
<summary>Qwen</summary>
|
||||
|
||||
|
||||
### Quickstart
|
||||
|
||||
|
||||
|
||||
> **Note:** Qwen requires installing additional dependencies with:
|
||||
> ```bash
|
||||
> pip install "openllm[qwen]"
|
||||
> ```
|
||||
|
||||
|
||||
Run the following command to quickly spin up a Qwen server:
|
||||
|
||||
```bash
|
||||
TRUST_REMOTE_CODE=True openllm start qwen/Qwen-7B-Chat
|
||||
```
|
||||
In a different terminal, run the following command to interact with the server:
|
||||
|
||||
```bash
|
||||
export OPENLLM_ENDPOINT=http://localhost:3000
|
||||
openllm query 'What are large language models?'
|
||||
```
|
||||
|
||||
|
||||
> **Note:** Any Qwen variants can be deployed with OpenLLM. Visit the [HuggingFace Model Hub](https://huggingface.co/models?sort=trending&search=qwen) to see more Qwen-compatible models.
|
||||
|
||||
|
||||
|
||||
### Supported models
|
||||
|
||||
You can specify any of the following Qwen models via `openllm start`:
|
||||
|
||||
|
||||
- [qwen/Qwen-7B-Chat](https://huggingface.co/qwen/Qwen-7B-Chat)
|
||||
- [qwen/Qwen-7B-Chat-Int8](https://huggingface.co/qwen/Qwen-7B-Chat-Int8)
|
||||
- [qwen/Qwen-7B-Chat-Int4](https://huggingface.co/qwen/Qwen-7B-Chat-Int4)
|
||||
- [qwen/Qwen-14B-Chat](https://huggingface.co/qwen/Qwen-14B-Chat)
|
||||
- [qwen/Qwen-14B-Chat-Int8](https://huggingface.co/qwen/Qwen-14B-Chat-Int8)
|
||||
- [qwen/Qwen-14B-Chat-Int4](https://huggingface.co/qwen/Qwen-14B-Chat-Int4)
|
||||
|
||||
### Supported backends
|
||||
|
||||
OpenLLM supports both vLLM and PyTorch as backends. By default, it uses vLLM if vLLM is available, and otherwise falls back to PyTorch.
|
||||
|
||||
|
||||
|
||||
> **Important:** We recommend users to explicitly specify `--backend` to choose the desired backend to run the model. If you have access to a GPU, always use `--backend vllm`.
|
||||
|
||||
|
||||
|
||||
- vLLM (Recommended):
|
||||
|
||||
|
||||
To install vLLM, run `pip install "openllm[vllm]"`
|
||||
|
||||
```bash
|
||||
TRUST_REMOTE_CODE=True openllm start qwen/Qwen-7B-Chat --backend vllm
|
||||
```
|
||||
|
||||
|
||||
> **Important:** Using vLLM requires a GPU that has architecture newer than 8.0 to get the best performance for serving. It is recommended that for all serving usecase in production, you should choose vLLM for serving.
|
||||
|
||||
|
||||
|
||||
> **Note:** Currently, adapters are yet to be supported with vLLM.
|
||||
|
||||
|
||||
- PyTorch:
|
||||
|
||||
|
||||
```bash
|
||||
TRUST_REMOTE_CODE=True openllm start qwen/Qwen-7B-Chat --backend pt
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
|
||||
<summary>StableLM</summary>
|
||||
|
||||
|
||||
@@ -1184,25 +1131,6 @@ openllm start stabilityai/stablelm-tuned-alpha-3b --backend vllm
|
||||
openllm start stabilityai/stablelm-tuned-alpha-3b --backend pt
|
||||
```
|
||||
|
||||
- CTranslate2 (*experimental*):
|
||||
|
||||
|
||||
```bash
|
||||
openllm start stabilityai/stablelm-tuned-alpha-3b --backend ctranslate
|
||||
```
|
||||
|
||||
|
||||
> **Note:** Currently, All quantization methods from ctranslate2 are supported. This includes int8, int8_float16, int8_bfloat16
|
||||
|
||||
|
||||
|
||||
> **Note:** We recommend users to convert the models beforehand, and then provide the given directory of the converted models to `openllm start`. See [CTranslate2](https://opennmt.net/CTranslate2/conversion.html) for more information.
|
||||
|
||||
|
||||
|
||||
> **Important:** CTranslate2 is an experimental backend and yet to be fully supported. It is recommended to use vLLM for all production use-case.
|
||||
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
@@ -1279,25 +1207,6 @@ openllm start bigcode/starcoder --backend vllm
|
||||
openllm start bigcode/starcoder --backend pt
|
||||
```
|
||||
|
||||
- CTranslate2 (*experimental*):
|
||||
|
||||
|
||||
```bash
|
||||
openllm start bigcode/starcoder --backend ctranslate
|
||||
```
|
||||
|
||||
|
||||
> **Note:** Currently, All quantization methods from ctranslate2 are supported. This includes int8, int8_float16, int8_bfloat16
|
||||
|
||||
|
||||
|
||||
> **Note:** We recommend users to convert the models beforehand, and then provide the given directory of the converted models to `openllm start`. See [CTranslate2](https://opennmt.net/CTranslate2/conversion.html) for more information.
|
||||
|
||||
|
||||
|
||||
> **Important:** CTranslate2 is an experimental backend and yet to be fully supported. It is recommended to use vLLM for all production use-case.
|
||||
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
|
||||
@@ -109,7 +109,7 @@ ctranslate = ["ctranslate2>=3.22.0"]
|
||||
falcon = ["xformers"]
|
||||
fine-tune = ["peft>=0.6.0", "datasets", "trl", "huggingface-hub"]
|
||||
full = [
|
||||
"openllm[agents,awq,baichuan,chatglm,ctranslate,falcon,fine-tune,ggml,gptq,grpc,mpt,openai,playground,starcoder,vllm]",
|
||||
"openllm[agents,awq,baichuan,chatglm,ctranslate,falcon,fine-tune,ggml,gptq,grpc,mpt,openai,playground,qwen,starcoder,vllm]",
|
||||
]
|
||||
ggml = ["ctransformers"]
|
||||
gptq = ["auto-gptq[triton]>=0.4.2"]
|
||||
@@ -117,6 +117,7 @@ grpc = ["bentoml[grpc]>=1.1.10"]
|
||||
mpt = ["triton"]
|
||||
openai = ["openai[datalib]>=1", "tiktoken"]
|
||||
playground = ["jupyter", "notebook", "ipython", "jupytext", "nbformat"]
|
||||
qwen = ["cpm-kernels", "tiktoken"]
|
||||
starcoder = ["bitsandbytes"]
|
||||
vllm = ["vllm>=0.2.2"]
|
||||
|
||||
|
||||
@@ -11,7 +11,7 @@ Fine-tune, serve, deploy, and monitor any LLMs with ease.
|
||||
'''
|
||||
|
||||
# update-config-stubs.py: import stubs start
|
||||
from openlm_core.config import CONFIG_MAPPING as CONFIG_MAPPING, CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES, AutoConfig as AutoConfig, BaichuanConfig as BaichuanConfig, ChatGLMConfig as ChatGLMConfig, DollyV2Config as DollyV2Config, FalconConfig as FalconConfig, FlanT5Config as FlanT5Config, GPTNeoXConfig as GPTNeoXConfig, LlamaConfig as LlamaConfig, MistralConfig as MistralConfig, MPTConfig as MPTConfig, OPTConfig as OPTConfig, PhiConfig as PhiConfig, StableLMConfig as StableLMConfig, StarCoderConfig as StarCoderConfig, YiConfig as YiConfig
|
||||
from openlm_core.config import CONFIG_MAPPING as CONFIG_MAPPING, CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES, AutoConfig as AutoConfig, BaichuanConfig as BaichuanConfig, ChatGLMConfig as ChatGLMConfig, DollyV2Config as DollyV2Config, FalconConfig as FalconConfig, FlanT5Config as FlanT5Config, GPTNeoXConfig as GPTNeoXConfig, LlamaConfig as LlamaConfig, MistralConfig as MistralConfig, MPTConfig as MPTConfig, OPTConfig as OPTConfig, PhiConfig as PhiConfig, QwenConfig as QwenConfig, StableLMConfig as StableLMConfig, StarCoderConfig as StarCoderConfig, YiConfig as YiConfig
|
||||
# update-config-stubs.py: import stubs stop
|
||||
|
||||
from openllm_cli._sdk import (
|
||||
|
||||
@@ -36,8 +36,6 @@ def load_tokenizer(llm: LLM[M, T], **tokenizer_attrs: t.Any) -> TypeGuard[T]:
|
||||
tokenizer.pad_token_id = config.eos_token_id
|
||||
elif tokenizer.eos_token_id is not None:
|
||||
tokenizer.pad_token_id = tokenizer.eos_token_id
|
||||
else:
|
||||
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
|
||||
return tokenizer
|
||||
|
||||
def _make_dispatch_function(fn: str) -> t.Callable[Concatenate[LLM[M, T], P], TypeGuard[M | T | Model]]:
|
||||
|
||||
@@ -94,25 +94,6 @@ openllm query 'What are large language models?'
|
||||
```""",
|
||||
]
|
||||
)
|
||||
if 'ctranslate' in it['backend']:
|
||||
details_block.extend(
|
||||
[
|
||||
'\n- CTranslate2 (*experimental*):\n\n',
|
||||
f"""\
|
||||
```bash
|
||||
{'' if not it['trust_remote_code'] else 'TRUST_REMOTE_CODE=True '}openllm start {it['model_ids'][0]} --backend ctranslate
|
||||
```""",
|
||||
*markdown_noteblock(
|
||||
'Currently, All quantization methods from ctranslate2 are supported. This includes int8, int8_float16, int8_bfloat16'
|
||||
),
|
||||
*markdown_noteblock(
|
||||
'We recommend users to convert the models beforehand, and then provide the given directory of the converted models to `openllm start`. See [CTranslate2](https://opennmt.net/CTranslate2/conversion.html) for more information.'
|
||||
),
|
||||
*markdown_importantblock(
|
||||
'CTranslate2 is an experimental backend and yet to be fully supported. It is recommended to use vLLM for all production use-case.'
|
||||
),
|
||||
]
|
||||
)
|
||||
|
||||
details_block.append('\n</details>\n\n')
|
||||
content.append('\n'.join(details_block))
|
||||
|
||||
Reference in New Issue
Block a user