mirror of
https://github.com/bentoml/OpenLLM.git
synced 2025-12-23 23:57:46 -05:00
refactor(cli): cleanup API (#592)
* chore: remove unused imports Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com> * refactor(cli): update to only need model_id Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com> * feat: `openllm start model-id` Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com> * chore: add changelog Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com> * chore: update changelog notice Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com> * chore: update correct config and running tools Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com> * chore: update backward compat options and treat JSON outputs corespondingly Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com> --------- Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
This commit is contained in:
@@ -9,7 +9,5 @@ charset = utf-8
|
||||
indent_style = space
|
||||
indent_size = 2
|
||||
|
||||
[openllm-python/src/openllm/cli/entrypoint.py]
|
||||
indent_size = unset
|
||||
[openllm-client/src/openllm_client/pb/v1/*]
|
||||
indent_size = unset
|
||||
|
||||
@@ -25,7 +25,6 @@ repos:
|
||||
- id: ruff
|
||||
verbose: true
|
||||
args: [--exit-non-zero-on-fix, --show-fixes]
|
||||
types_or: [pyi, python3, jupyter]
|
||||
- id: ruff-format
|
||||
verbose: true
|
||||
types: [pyi]
|
||||
@@ -80,4 +79,4 @@ repos:
|
||||
language: system
|
||||
always_run: true
|
||||
pass_filenames: false
|
||||
entry: mypy -m openllm_client
|
||||
entry: mypy --strict openllm-client/src/openllm_client/__init__.pyi openllm-core/src/openllm_core/_typing_compat.py
|
||||
|
||||
16
.style.yapf
Normal file
16
.style.yapf
Normal file
@@ -0,0 +1,16 @@
|
||||
[style]
|
||||
BASED_ON_STYLE = google
|
||||
INDENT_WIDTH = 2
|
||||
JOIN_MULTIPLE_LINES = true
|
||||
COLUMN_LIMIT = 192
|
||||
USE_TABS = false
|
||||
BLANK_LINES_AROUND_TOP_LEVEL_DEFINITION = 1
|
||||
BLANK_LINES_BETWEEN_TOP_LEVEL_IMPORTS_AND_VARIABLES = 1
|
||||
DISABLE_ENDING_COMMA_HEURISTIC = true
|
||||
BLANK_LINE_BEFORE_CLASS_DOCSTRING = false
|
||||
BLANK_LINE_BEFORE_MODULE_DOCSTRING = false
|
||||
BLANK_LINE_BEFORE_NESTED_CLASS_OR_DEF = false
|
||||
ALIGN_CLOSING_BRACKET_WITH_VISUAL_INDENT = true
|
||||
ALLOW_MULTILINE_DICTIONARY_KEYS = false
|
||||
ALLOW_SPLIT_BEFORE_DICT_VALUE = false
|
||||
COALESCE_BRACKETS = true
|
||||
4
.yapfignore
Normal file
4
.yapfignore
Normal file
@@ -0,0 +1,4 @@
|
||||
openllm-python/src/openllm/playground/
|
||||
openllm-python/src/openllm/models/__init__.py
|
||||
openllm-client/src/openllm_client/pb/**
|
||||
examples/
|
||||
@@ -28,6 +28,9 @@ Before you can start developing, you'll need to set up your environment:
|
||||
> Note that `hatch run setup` will symlink the python version from `.python-version-default` to `.python-version` in the project root.
|
||||
> Therefore any tools that understand `.python-version` will use the correct Python version.
|
||||
|
||||
> [!NOTE]
|
||||
> When in doubt, set `OPENLLMDEVDEBUG=5` to see all generation debug logs and outputs
|
||||
|
||||
1. Ensure you have [Git](https://git-scm.com/), and
|
||||
[Python3.8+](https://www.python.org/downloads/) installed.
|
||||
2. Fork the OpenLLM repository from GitHub.
|
||||
|
||||
140
README.md
140
README.md
@@ -106,14 +106,13 @@ Options:
|
||||
-h, --help Show this message and exit.
|
||||
|
||||
Commands:
|
||||
build Package a given models into a Bento.
|
||||
build Package a given models into a BentoLLM.
|
||||
import Setup LLM interactively.
|
||||
instruct Instruct agents interactively for given tasks, from a...
|
||||
models List all supported models.
|
||||
prune Remove all saved models, (and optionally bentos) built with...
|
||||
query Ask a LLM interactively, from a terminal.
|
||||
start Start any LLM as a REST server.
|
||||
start-grpc Start any LLM as a gRPC server.
|
||||
prune Remove all saved models, (and optionally bentos) built with OpenLLM locally.
|
||||
query Query a LLM interactively, from a terminal.
|
||||
start Start a LLMServer for any supported LLM.
|
||||
start-grpc Start a gRPC LLMServer for any supported LLM.
|
||||
|
||||
Extensions:
|
||||
build-base-container Base image builder for BentoLLM.
|
||||
@@ -130,7 +129,7 @@ Extensions:
|
||||
OpenLLM allows you to quickly spin up an LLM server using `openllm start`. For example, to start an [OPT](https://huggingface.co/docs/transformers/model_doc/opt) server, run the following:
|
||||
|
||||
```bash
|
||||
openllm start opt
|
||||
openllm start facebook/opt-1.3b
|
||||
```
|
||||
|
||||
This starts the server at [http://0.0.0.0:3000/](http://0.0.0.0:3000/). OpenLLM downloads the model to the BentoML local Model Store if they have not been registered before. To view your local models, run `bentoml models list`.
|
||||
@@ -153,7 +152,7 @@ openllm query 'Explain to me the difference between "further" and "farther"'
|
||||
OpenLLM seamlessly supports many models and their variants. You can specify different variants of the model to be served by providing the `--model-id` option. For example:
|
||||
|
||||
```bash
|
||||
openllm start opt --model-id facebook/opt-2.7b
|
||||
openllm start facebook/opt-2.7b
|
||||
```
|
||||
|
||||
> [!NOTE]
|
||||
@@ -174,7 +173,7 @@ OpenLLM currently supports the following models. By default, OpenLLM doesn't inc
|
||||
Run the following commands to quickly spin up a Llama 2 server and send a request to it.
|
||||
|
||||
```bash
|
||||
openllm start mistral --model-id HuggingFaceH4/zephyr-7b-beta
|
||||
openllm start HuggingFaceH4/zephyr-7b-beta
|
||||
export OPENLLM_ENDPOINT=http://localhost:3000
|
||||
openllm query 'What are large language models?'
|
||||
```
|
||||
@@ -199,14 +198,14 @@ You can specify any of the following Mistral models by using `--model-id`.
|
||||
- PyTorch (Default):
|
||||
|
||||
```bash
|
||||
openllm start mistral --model-id HuggingFaceH4/zephyr-7b-beta --backend pt
|
||||
openllm start HuggingFaceH4/zephyr-7b-beta --backend pt
|
||||
```
|
||||
|
||||
- vLLM (Recommended):
|
||||
|
||||
```bash
|
||||
pip install "openllm[vllm]"
|
||||
openllm start mistral --model-id HuggingFaceH4/zephyr-7b-beta --backend vllm
|
||||
openllm start HuggingFaceH4/zephyr-7b-beta --backend vllm
|
||||
```
|
||||
|
||||
> [!NOTE]
|
||||
@@ -230,7 +229,7 @@ pip install "openllm[llama]"
|
||||
Run the following commands to quickly spin up a Llama 2 server and send a request to it.
|
||||
|
||||
```bash
|
||||
openllm start llama --model-id meta-llama/Llama-2-7b-chat-hf
|
||||
openllm start meta-llama/Llama-2-7b-chat-hf
|
||||
export OPENLLM_ENDPOINT=http://localhost:3000
|
||||
openllm query 'What are large language models?'
|
||||
```
|
||||
@@ -273,14 +272,14 @@ You can specify any of the following Llama models by using `--model-id`.
|
||||
- PyTorch (Default):
|
||||
|
||||
```bash
|
||||
openllm start llama --model-id meta-llama/Llama-2-7b-chat-hf --backend pt
|
||||
openllm start meta-llama/Llama-2-7b-chat-hf --backend pt
|
||||
```
|
||||
|
||||
- vLLM (Recommended):
|
||||
|
||||
```bash
|
||||
pip install "openllm[llama, vllm]"
|
||||
openllm start llama --model-id meta-llama/Llama-2-7b-chat-hf --backend vllm
|
||||
openllm start meta-llama/Llama-2-7b-chat-hf --backend vllm
|
||||
```
|
||||
|
||||
> [!NOTE]
|
||||
@@ -304,7 +303,7 @@ pip install "openllm[chatglm]"
|
||||
Run the following commands to quickly spin up a ChatGLM server and send a request to it.
|
||||
|
||||
```bash
|
||||
openllm start chatglm --model-id thudm/chatglm-6b
|
||||
openllm start thudm/chatglm2-6b
|
||||
export OPENLLM_ENDPOINT=http://localhost:3000
|
||||
openllm query 'What are large language models?'
|
||||
```
|
||||
@@ -325,7 +324,7 @@ You can specify any of the following ChatGLM models by using `--model-id`.
|
||||
- PyTorch (Default):
|
||||
|
||||
```bash
|
||||
openllm start chatglm --model-id thudm/chatglm-6b --backend pt
|
||||
openllm start thudm/chatglm2-6b --backend pt
|
||||
```
|
||||
|
||||
</details>
|
||||
@@ -346,7 +345,7 @@ pip install openllm
|
||||
Run the following commands to quickly spin up a Dolly-v2 server and send a request to it.
|
||||
|
||||
```bash
|
||||
openllm start dolly-v2 --model-id databricks/dolly-v2-3b
|
||||
openllm start databricks/dolly-v2-3b
|
||||
export OPENLLM_ENDPOINT=http://localhost:3000
|
||||
openllm query 'What are large language models?'
|
||||
```
|
||||
@@ -365,13 +364,13 @@ You can specify any of the following Dolly-v2 models by using `--model-id`.
|
||||
- PyTorch (Default):
|
||||
|
||||
```bash
|
||||
openllm start dolly-v2 --model-id databricks/dolly-v2-3b --backend pt
|
||||
openllm start databricks/dolly-v2-3b --backend pt
|
||||
```
|
||||
|
||||
- vLLM:
|
||||
|
||||
```bash
|
||||
openllm start dolly-v2 --model-id databricks/dolly-v2-3b --backend vllm
|
||||
openllm start databricks/dolly-v2-3b --backend vllm
|
||||
```
|
||||
|
||||
> [!NOTE]
|
||||
@@ -395,7 +394,7 @@ pip install "openllm[falcon]"
|
||||
Run the following commands to quickly spin up a Falcon server and send a request to it.
|
||||
|
||||
```bash
|
||||
openllm start falcon --model-id tiiuae/falcon-7b
|
||||
openllm start tiiuae/falcon-7b
|
||||
export OPENLLM_ENDPOINT=http://localhost:3000
|
||||
openllm query 'What are large language models?'
|
||||
```
|
||||
@@ -415,14 +414,14 @@ You can specify any of the following Falcon models by using `--model-id`.
|
||||
- PyTorch (Default):
|
||||
|
||||
```bash
|
||||
openllm start falcon --model-id tiiuae/falcon-7b --backend pt
|
||||
openllm start tiiuae/falcon-7b --backend pt
|
||||
```
|
||||
|
||||
- vLLM:
|
||||
|
||||
```bash
|
||||
pip install "openllm[falcon, vllm]"
|
||||
openllm start falcon --model-id tiiuae/falcon-7b --backend vllm
|
||||
openllm start tiiuae/falcon-7b --backend vllm
|
||||
```
|
||||
|
||||
> [!NOTE]
|
||||
@@ -446,7 +445,7 @@ pip install "openllm[flan-t5]"
|
||||
Run the following commands to quickly spin up a Flan-T5 server and send a request to it.
|
||||
|
||||
```bash
|
||||
openllm start flan-t5 --model-id google/flan-t5-large
|
||||
openllm start google/flan-t5-large
|
||||
export OPENLLM_ENDPOINT=http://localhost:3000
|
||||
openllm query 'What are large language models?'
|
||||
```
|
||||
@@ -467,7 +466,7 @@ You can specify any of the following Flan-T5 models by using `--model-id`.
|
||||
- PyTorch (Default):
|
||||
|
||||
```bash
|
||||
openllm start flan-t5 --model-id google/flan-t5-large --backend pt
|
||||
openllm start google/flan-t5-large --backend pt
|
||||
```
|
||||
|
||||
> [!NOTE]
|
||||
@@ -491,7 +490,7 @@ pip install openllm
|
||||
Run the following commands to quickly spin up a GPT-NeoX server and send a request to it.
|
||||
|
||||
```bash
|
||||
openllm start gpt-neox --model-id eleutherai/gpt-neox-20b
|
||||
openllm start eleutherai/gpt-neox-20b
|
||||
export OPENLLM_ENDPOINT=http://localhost:3000
|
||||
openllm query 'What are large language models?'
|
||||
```
|
||||
@@ -508,13 +507,13 @@ You can specify any of the following GPT-NeoX models by using `--model-id`.
|
||||
- PyTorch (Default):
|
||||
|
||||
```bash
|
||||
openllm start gpt-neox --model-id eleutherai/gpt-neox-20b --backend pt
|
||||
openllm start eleutherai/gpt-neox-20b --backend pt
|
||||
```
|
||||
|
||||
- vLLM:
|
||||
|
||||
```bash
|
||||
openllm start gpt-neox --model-id eleutherai/gpt-neox-20b --backend vllm
|
||||
openllm start eleutherai/gpt-neox-20b --backend vllm
|
||||
```
|
||||
|
||||
> [!NOTE]
|
||||
@@ -538,7 +537,7 @@ pip install "openllm[mpt]"
|
||||
Run the following commands to quickly spin up a MPT server and send a request to it.
|
||||
|
||||
```bash
|
||||
openllm start mpt --model-id mosaicml/mpt-7b-chat
|
||||
openllm start mosaicml/mpt-7b-chat
|
||||
export OPENLLM_ENDPOINT=http://localhost:3000
|
||||
openllm query 'What are large language models?'
|
||||
```
|
||||
@@ -561,14 +560,14 @@ You can specify any of the following MPT models by using `--model-id`.
|
||||
- PyTorch (Default):
|
||||
|
||||
```bash
|
||||
openllm start mpt --model-id mosaicml/mpt-7b-chat --backend pt
|
||||
openllm start mosaicml/mpt-7b-chat --backend pt
|
||||
```
|
||||
|
||||
- vLLM (Recommended):
|
||||
|
||||
```bash
|
||||
pip install "openllm[mpt, vllm]"
|
||||
openllm start mpt --model-id mosaicml/mpt-7b-chat --backend vllm
|
||||
openllm start mosaicml/mpt-7b-chat --backend vllm
|
||||
```
|
||||
|
||||
> [!NOTE]
|
||||
@@ -592,7 +591,7 @@ pip install "openllm[opt]"
|
||||
Run the following commands to quickly spin up an OPT server and send a request to it.
|
||||
|
||||
```bash
|
||||
openllm start opt --model-id facebook/opt-2.7b
|
||||
openllm start facebook/opt-2.7b
|
||||
export OPENLLM_ENDPOINT=http://localhost:3000
|
||||
openllm query 'What are large language models?'
|
||||
```
|
||||
@@ -614,14 +613,14 @@ You can specify any of the following OPT models by using `--model-id`.
|
||||
- PyTorch (Default):
|
||||
|
||||
```bash
|
||||
openllm start opt --model-id facebook/opt-2.7b --backend pt
|
||||
openllm start facebook/opt-2.7b --backend pt
|
||||
```
|
||||
|
||||
- vLLM:
|
||||
|
||||
```bash
|
||||
pip install "openllm[opt, vllm]"
|
||||
openllm start opt --model-id facebook/opt-2.7b --backend vllm
|
||||
openllm start facebook/opt-2.7b --backend vllm
|
||||
```
|
||||
|
||||
> [!NOTE]
|
||||
@@ -645,7 +644,7 @@ pip install openllm
|
||||
Run the following commands to quickly spin up a StableLM server and send a request to it.
|
||||
|
||||
```bash
|
||||
openllm start stablelm --model-id stabilityai/stablelm-tuned-alpha-7b
|
||||
openllm start stabilityai/stablelm-tuned-alpha-7b
|
||||
export OPENLLM_ENDPOINT=http://localhost:3000
|
||||
openllm query 'What are large language models?'
|
||||
```
|
||||
@@ -665,13 +664,13 @@ You can specify any of the following StableLM models by using `--model-id`.
|
||||
- PyTorch (Default):
|
||||
|
||||
```bash
|
||||
openllm start stablelm --model-id stabilityai/stablelm-tuned-alpha-7b --backend pt
|
||||
openllm start stabilityai/stablelm-tuned-alpha-7b --backend pt
|
||||
```
|
||||
|
||||
- vLLM:
|
||||
|
||||
```bash
|
||||
openllm start stablelm --model-id stabilityai/stablelm-tuned-alpha-7b --backend vllm
|
||||
openllm start stabilityai/stablelm-tuned-alpha-7b --backend vllm
|
||||
```
|
||||
|
||||
> [!NOTE]
|
||||
@@ -695,7 +694,7 @@ pip install "openllm[starcoder]"
|
||||
Run the following commands to quickly spin up a StarCoder server and send a request to it.
|
||||
|
||||
```bash
|
||||
openllm start startcoder --model-id [bigcode/starcoder](https://huggingface.co/bigcode/starcoder)
|
||||
openllm start bigcode/starcoder
|
||||
export OPENLLM_ENDPOINT=http://localhost:3000
|
||||
openllm query 'What are large language models?'
|
||||
```
|
||||
@@ -713,14 +712,14 @@ You can specify any of the following StarCoder models by using `--model-id`.
|
||||
- PyTorch (Default):
|
||||
|
||||
```bash
|
||||
openllm start startcoder --model-id bigcode/starcoder --backend pt
|
||||
openllm start bigcode/starcoder --backend pt
|
||||
```
|
||||
|
||||
- vLLM:
|
||||
|
||||
```bash
|
||||
pip install "openllm[startcoder, vllm]"
|
||||
openllm start startcoder --model-id bigcode/starcoder --backend vllm
|
||||
openllm start bigcode/starcoder --backend vllm
|
||||
```
|
||||
|
||||
> [!NOTE]
|
||||
@@ -744,7 +743,7 @@ pip install "openllm[baichuan]"
|
||||
Run the following commands to quickly spin up a Baichuan server and send a request to it.
|
||||
|
||||
```bash
|
||||
openllm start baichuan --model-id baichuan-inc/baichuan-13b-base
|
||||
openllm start baichuan-inc/baichuan-13b-base
|
||||
export OPENLLM_ENDPOINT=http://localhost:3000
|
||||
openllm query 'What are large language models?'
|
||||
```
|
||||
@@ -766,14 +765,14 @@ You can specify any of the following Baichuan models by using `--model-id`.
|
||||
- PyTorch (Default):
|
||||
|
||||
```bash
|
||||
openllm start baichuan --model-id baichuan-inc/baichuan-13b-base --backend pt
|
||||
openllm start baichuan-inc/baichuan-13b-base --backend pt
|
||||
```
|
||||
|
||||
- vLLM:
|
||||
|
||||
```bash
|
||||
pip install "openllm[baichuan, vllm]"
|
||||
openllm start baichuan --model-id baichuan-inc/baichuan-13b-base --backend vllm
|
||||
openllm start baichuan-inc/baichuan-13b-base --backend vllm
|
||||
```
|
||||
|
||||
> [!NOTE]
|
||||
@@ -788,7 +787,7 @@ More models will be integrated with OpenLLM and we welcome your contributions if
|
||||
OpenLLM allows you to start your model server on multiple GPUs and specify the number of workers per resource assigned using the `--workers-per-resource` option. For example, if you have 4 available GPUs, you set the value as one divided by the number as only one instance of the Runner server will be spawned.
|
||||
|
||||
```bash
|
||||
openllm start opt --workers-per-resource 0.25
|
||||
openllm start facebook/opt-2.7b --workers-per-resource 0.25
|
||||
```
|
||||
|
||||
> [!NOTE]
|
||||
@@ -808,7 +807,7 @@ Different LLMs may support multiple runtime implementations. Models that have `v
|
||||
To specify a specific runtime for your chosen model, use the `--backend` option. For example:
|
||||
|
||||
```bash
|
||||
openllm start llama --model-id meta-llama/Llama-2-7b-chat-hf --backend vllm
|
||||
openllm start meta-llama/Llama-2-7b-chat-hf --backend vllm
|
||||
```
|
||||
|
||||
Note:
|
||||
@@ -842,7 +841,7 @@ openllm start opt --quantize int8
|
||||
To run inference with `gptq`, simply pass `--quantize gptq`:
|
||||
|
||||
```bash
|
||||
openllm start llama --model-id TheBloke/Llama-2-7B-Chat-GPTQ --quantize gptq
|
||||
openllm start TheBloke/Llama-2-7B-Chat-GPTQ --quantize gptq
|
||||
```
|
||||
|
||||
> [!NOTE]
|
||||
@@ -863,7 +862,7 @@ openllm start mistral --model-id TheBloke/zephyr-7B-alpha-AWQ --quantize awq
|
||||
To run inference with `squeezellm`, simply pass `--quantize squeezellm`:
|
||||
|
||||
```bash
|
||||
openllm start llama --model-id squeeze-ai-lab/sq-llama-2-7b-w4-s0 --quantize squeezellm --serialization legacy
|
||||
openllm start squeeze-ai-lab/sq-llama-2-7b-w4-s0 --quantize squeezellm --serialization legacy
|
||||
```
|
||||
|
||||
> [!IMPORTANT]
|
||||
@@ -916,13 +915,13 @@ curl -X 'POST' \
|
||||
To include this into the Bento, you can specify the `--adapter-id` option when using the `openllm build` command:
|
||||
|
||||
```bash
|
||||
openllm build opt --model-id facebook/opt-6.7b --adapter-id ...
|
||||
openllm build facebook/opt-6.7b --adapter-id ...
|
||||
```
|
||||
|
||||
If you use a relative path for `--adapter-id`, you need to add `--build-ctx`.
|
||||
|
||||
```bash
|
||||
openllm build opt --adapter-id ./path/to/adapter_id --build-ctx .
|
||||
openllm build facebook/opt-6.7b --adapter-id ./path/to/adapter_id --build-ctx .
|
||||
```
|
||||
|
||||
> [!IMPORTANT]
|
||||
@@ -938,6 +937,43 @@ The following UIs are currently available for OpenLLM:
|
||||
| [Clojure](https://github.com/bentoml/OpenLLM/blob/main/openllm-contrib/clojure/README.md) | [@GutZuFusss](https://github.com/GutZuFusss) | Community-maintained | 🔧 |
|
||||
| TS | BentoML Team | | 🚧 |
|
||||
|
||||
## 🐍 Python SDK
|
||||
|
||||
Each LLM can be instantiated with `openllm.LLM`:
|
||||
|
||||
```python
|
||||
import openllm
|
||||
|
||||
llm = openllm.LLM('facebook/opt-2.7b')
|
||||
```
|
||||
|
||||
The main inference API is the streaming `generate_iterator` method:
|
||||
|
||||
```python
|
||||
async for generation in llm.generate_iterator('What is the meaning of life?'): print(generation.outputs[0].text)
|
||||
```
|
||||
|
||||
> [!NOTE]
|
||||
> The motivation behind making `llm.generate_iterator` an async generator is to provide support for Continuous batching with vLLM backend. By having the async endpoints, each prompt
|
||||
> will be added correctly to the request queue to process with vLLM backend.
|
||||
|
||||
There is also a _one-shot_ `generate` method:
|
||||
|
||||
```python
|
||||
await llm.generate('What is the meaning of life?')
|
||||
```
|
||||
|
||||
This method is easy to use for one-shot generation use case, but merely served as an example how to use `llm.generate_iterator` as it uses `generate_iterator` under the hood.
|
||||
|
||||
> [!IMPORTANT]
|
||||
> If you need to call your code in a synchronous context, you can use `asyncio.run` that wraps an async function:
|
||||
> ```python
|
||||
> import asyncio
|
||||
> async def generate(prompt, **attrs): return await llm.generate(prompt, **attrs)
|
||||
> asyncio.run(generate("The meaning of life is", temperature=0.23))
|
||||
> ```
|
||||
|
||||
|
||||
## ⚙️ Integrations
|
||||
|
||||
OpenLLM is not just a standalone product; it's a building block designed to
|
||||
@@ -1051,10 +1087,10 @@ There are several ways to deploy your LLMs:
|
||||
### 🐳 Docker container
|
||||
|
||||
1. **Building a Bento**: With OpenLLM, you can easily build a Bento for a
|
||||
specific model, like `dolly-v2`, using the `build` command.:
|
||||
specific model, like `mistralai/Mistral-7B-Instruct-v0.1`, using the `build` command.:
|
||||
|
||||
```bash
|
||||
openllm build dolly-v2
|
||||
openllm build mistralai/Mistral-7B-Instruct-v0.1
|
||||
```
|
||||
|
||||
A
|
||||
@@ -1091,10 +1127,10 @@ serverless cloud for shipping and scaling AI applications.
|
||||
> specific API token and the BentoCloud endpoint respectively.
|
||||
|
||||
3. **Bulding a Bento**: With OpenLLM, you can easily build a Bento for a
|
||||
specific model, such as `dolly-v2`:
|
||||
specific model, such as `mistralai/Mistral-7B-Instruct-v0.1`:
|
||||
|
||||
```bash
|
||||
openllm build dolly-v2
|
||||
openllm build mistralai/Mistral-7B-Instruct-v0.1
|
||||
```
|
||||
|
||||
4. **Pushing a Bento**: Push your freshly-built Bento service to BentoCloud via
|
||||
|
||||
173
changelog.d/592.refactor.md
Normal file
173
changelog.d/592.refactor.md
Normal file
@@ -0,0 +1,173 @@
|
||||
## Auto backend detection
|
||||
|
||||
By default, OpenLLM will use vLLM (if available) to run the server. We recommend users to always explicitly set backend to `--backend vllm` for the best performance.
|
||||
|
||||
if vLLM is not available, OpenLLM will fall back to PyTorch backend. Note that the PyTorch backend won't be as performant
|
||||
|
||||
## Revamped CLI interface
|
||||
|
||||
This is a part of the recent restructure of `openllm.LLM`
|
||||
|
||||
For all CLI, there is no need to pass in the architecture anymore. One can directly pass in the model and save a few characters
|
||||
|
||||
Start:
|
||||
|
||||
```bash
|
||||
|
||||
openllm start meta-llama/Llama-2-13b-chat-hf --device 0
|
||||
|
||||
```
|
||||
|
||||
Build:
|
||||
|
||||
```bash
|
||||
|
||||
openllm build meta-llama/Llama-2-13b-chat-hf --serialisation safetensors
|
||||
|
||||
```
|
||||
|
||||
Import:
|
||||
|
||||
```bash
|
||||
|
||||
openllm build mistralai/Mistral-7B-v0.1 --serialisation legacy
|
||||
|
||||
```
|
||||
|
||||
All CLI outputs will now dump JSON objects to stdout. This will ensure easier programmatic access to the CLI.
|
||||
This means `--output/-o` is removed from all CLI commands, as all of them will output JSON.
|
||||
|
||||
Passing in `model_name` will now be deprecated and will be removed from the future. If you try `openllm start opt`, you will see the following
|
||||
|
||||
```bash
|
||||
$ openllm start opt
|
||||
|
||||
Passing 'openllm start opt' is deprecated and will be remove in a future version. Use 'openllm start facebook/opt-1.3b' instead.
|
||||
```
|
||||
|
||||
Example outputs of `openllm models`:
|
||||
|
||||
```bash
|
||||
$ openllm models
|
||||
|
||||
{
|
||||
"chatglm": {
|
||||
"architecture": "ChatGLMModel",
|
||||
"example_id": "thudm/chatglm2-6b",
|
||||
"supported_backends": [
|
||||
"pt"
|
||||
],
|
||||
"installation": "pip install \"openllm[chatglm]\"",
|
||||
"items": []
|
||||
},
|
||||
"dolly_v2": {
|
||||
"architecture": "GPTNeoXForCausalLM",
|
||||
"example_id": "databricks/dolly-v2-3b",
|
||||
"supported_backends": [
|
||||
"pt",
|
||||
"vllm"
|
||||
],
|
||||
"installation": "pip install openllm",
|
||||
"items": []
|
||||
},
|
||||
"falcon": {
|
||||
"architecture": "FalconForCausalLM",
|
||||
"example_id": "tiiuae/falcon-40b-instruct",
|
||||
"supported_backends": [
|
||||
"pt",
|
||||
"vllm"
|
||||
],
|
||||
"installation": "pip install \"openllm[falcon]\"",
|
||||
"items": []
|
||||
},
|
||||
"flan_t5": {
|
||||
"architecture": "T5ForConditionalGeneration",
|
||||
"example_id": "google/flan-t5-small",
|
||||
"supported_backends": [
|
||||
"pt"
|
||||
],
|
||||
"installation": "pip install openllm",
|
||||
"items": []
|
||||
},
|
||||
"gpt_neox": {
|
||||
"architecture": "GPTNeoXForCausalLM",
|
||||
"example_id": "eleutherai/gpt-neox-20b",
|
||||
"supported_backends": [
|
||||
"pt",
|
||||
"vllm"
|
||||
],
|
||||
"installation": "pip install openllm",
|
||||
"items": []
|
||||
},
|
||||
"llama": {
|
||||
"architecture": "LlamaForCausalLM",
|
||||
"example_id": "NousResearch/llama-2-70b-hf",
|
||||
"supported_backends": [
|
||||
"pt",
|
||||
"vllm"
|
||||
],
|
||||
"installation": "pip install \"openllm[llama]\"",
|
||||
"items": []
|
||||
},
|
||||
"mpt": {
|
||||
"architecture": "MPTForCausalLM",
|
||||
"example_id": "mosaicml/mpt-7b-chat",
|
||||
"supported_backends": [
|
||||
"pt",
|
||||
"vllm"
|
||||
],
|
||||
"installation": "pip install \"openllm[mpt]\"",
|
||||
"items": []
|
||||
},
|
||||
"opt": {
|
||||
"architecture": "OPTForCausalLM",
|
||||
"example_id": "facebook/opt-2.7b",
|
||||
"supported_backends": [
|
||||
"pt",
|
||||
"vllm"
|
||||
],
|
||||
"installation": "pip install \"openllm[opt]\"",
|
||||
"items": []
|
||||
},
|
||||
"stablelm": {
|
||||
"architecture": "GPTNeoXForCausalLM",
|
||||
"example_id": "stabilityai/stablelm-base-alpha-3b",
|
||||
"supported_backends": [
|
||||
"pt",
|
||||
"vllm"
|
||||
],
|
||||
"installation": "pip install openllm",
|
||||
"items": []
|
||||
},
|
||||
"starcoder": {
|
||||
"architecture": "GPTBigCodeForCausalLM",
|
||||
"example_id": "bigcode/starcoder",
|
||||
"supported_backends": [
|
||||
"pt",
|
||||
"vllm"
|
||||
],
|
||||
"installation": "pip install \"openllm[starcoder]\"",
|
||||
"items": []
|
||||
},
|
||||
"mistral": {
|
||||
"architecture": "MistralForCausalLM",
|
||||
"example_id": "amazon/MistralLite",
|
||||
"supported_backends": [
|
||||
"pt",
|
||||
"vllm"
|
||||
],
|
||||
"installation": "pip install openllm",
|
||||
"items": []
|
||||
},
|
||||
"baichuan": {
|
||||
"architecture": "BaiChuanForCausalLM",
|
||||
"example_id": "fireballoon/baichuan-vicuna-chinese-7b",
|
||||
"supported_backends": [
|
||||
"pt",
|
||||
"vllm"
|
||||
],
|
||||
"installation": "pip install \"openllm[baichuan]\"",
|
||||
"items": []
|
||||
}
|
||||
}
|
||||
```
|
||||
2
mypy.ini
2
mypy.ini
@@ -8,4 +8,4 @@ ignore_missing_imports = true
|
||||
check_untyped_defs = true
|
||||
warn_unreachable = true
|
||||
modules = openllm_client
|
||||
files = openllm-client/src/openllm_client/__init__.pyi
|
||||
files = openllm-client/src/openllm_client/__init__.pyi|openllm-core/src/openllm_core/_typing_compat.py
|
||||
|
||||
@@ -46,7 +46,6 @@ from .utils import dantic
|
||||
from .utils import field_env_key
|
||||
from .utils import first_not_none
|
||||
from .utils import lenient_issubclass
|
||||
from .utils.import_utils import is_vllm_available
|
||||
from .utils.peft import PEFT_TASK_TYPE_TARGET_MAPPING
|
||||
from .utils.peft import FineTuneConfig
|
||||
from .utils.peft import PeftType
|
||||
@@ -198,6 +197,8 @@ converter.register_unstructure_hook_factory(
|
||||
k: override(omit=True) for k, v in attr.fields_dict(cls).items() if v.default in (None, attr.NOTHING)
|
||||
}))
|
||||
|
||||
_GenerationConfigT = t.TypeVar('_GenerationConfig', bound=GenerationConfig)
|
||||
|
||||
@attr.frozen(slots=True, repr=False, init=False)
|
||||
class SamplingParams(ReprMixin):
|
||||
"""SamplingParams is the attr-compatible version of ``vllm.SamplingParams``. It provides some utilities to also respect shared variables from ``openllm.LLMConfig``.
|
||||
@@ -303,6 +304,8 @@ converter.register_unstructure_hook_factory(
|
||||
converter.register_structure_hook_factory(lambda cls: attr.has(cls) and lenient_issubclass(cls, SamplingParams),
|
||||
lambda cls: make_dict_structure_fn(cls, converter, _cattrs_forbid_extra_keys=True, max_new_tokens=override(rename='max_tokens')))
|
||||
|
||||
_SamplingParamsT = t.TypeVar('_SamplingParams', bound=SamplingParams)
|
||||
|
||||
# cached it here to save one lookup per assignment
|
||||
_object_getattribute = object.__getattribute__
|
||||
|
||||
@@ -336,7 +339,6 @@ class ModelSettings(t.TypedDict, total=False):
|
||||
backend: t.Tuple[LiteralBackend, ...]
|
||||
model_name: NotRequired[str]
|
||||
start_name: NotRequired[str]
|
||||
env: NotRequired[openllm_core.utils.EnvVarMixin]
|
||||
# serving configuration
|
||||
timeout: int
|
||||
workers_per_resource: t.Union[int, float]
|
||||
@@ -404,7 +406,6 @@ class _ModelSettingsAttr:
|
||||
backend: t.Tuple[LiteralBackend, ...]
|
||||
model_name: str
|
||||
start_name: str
|
||||
env: openllm_core.utils.EnvVarMixin
|
||||
timeout: int
|
||||
workers_per_resource: t.Union[int, float]
|
||||
fine_tune_strategies: t.Dict[AdapterType, FineTuneConfig]
|
||||
@@ -431,7 +432,6 @@ def structure_settings(cl_: type[LLMConfig], cls: type[_ModelSettingsAttr]) -> _
|
||||
|
||||
model_name = _final_value_dct['model_name'] if 'model_name' in _final_value_dct else _settings_attr.model_name
|
||||
|
||||
_final_value_dct['env'] = openllm_core.utils.EnvVarMixin(model_name, backend='vllm' if is_vllm_available() else 'pt', model_id=_settings_attr.default_id)
|
||||
_final_value_dct['service_name'] = f'generated_{model_name}_service.py'
|
||||
|
||||
# NOTE: default conversation templates
|
||||
@@ -476,7 +476,7 @@ def _make_assignment_script(cls: type[LLMConfig], attributes: attr.AttrsInstance
|
||||
_reserved_namespace = {'__config__', 'GenerationConfig', 'SamplingParams'}
|
||||
|
||||
@attr.define(slots=True)
|
||||
class _ConfigAttr:
|
||||
class _ConfigAttr(t.Generic[_GenerationConfigT, _SamplingParamsT]):
|
||||
@staticmethod
|
||||
def Field(default: t.Any = None, **attrs: t.Any) -> t.Any:
|
||||
"""Field is a alias to the internal dantic utilities to easily create
|
||||
@@ -495,7 +495,7 @@ class _ConfigAttr:
|
||||
__config__: ModelSettings = Field(None)
|
||||
'''Internal configuration for this LLM model. Each of the field in here will be populated
|
||||
and prefixed with __openllm_<value>__'''
|
||||
GenerationConfig: GenerationConfig = Field(None)
|
||||
GenerationConfig: _GenerationConfigT = Field(None)
|
||||
'''Users can override this subclass of any given LLMConfig to provide GenerationConfig
|
||||
default value. For example:
|
||||
|
||||
@@ -508,7 +508,7 @@ class _ConfigAttr:
|
||||
eos_token_id: int = 11
|
||||
```
|
||||
'''
|
||||
SamplingParams: SamplingParams = Field(None)
|
||||
SamplingParams: _SamplingParamsT = Field(None)
|
||||
'''Users can override this subclass of any given LLMConfig to provide SamplingParams
|
||||
default value. For example:
|
||||
|
||||
@@ -534,11 +534,13 @@ class _ConfigAttr:
|
||||
'''The accepted keys for this LLMConfig.'''
|
||||
__openllm_extras__: DictStrAny = Field(None, init=False)
|
||||
'''Extra metadata for this LLMConfig.'''
|
||||
__openllm_generation_class__: type[openllm_core._configuration.GenerationConfig] = Field(None)
|
||||
__openllm_config_override__: DictStrAny = Field(None, init=False)
|
||||
'''Additional override for some variables in LLMConfig.__config__'''
|
||||
__openllm_generation_class__: type[_GenerationConfigT] = Field(None)
|
||||
'''The result generated GenerationConfig class for this LLMConfig. This will be used
|
||||
to create the generation_config argument that can be used throughout the lifecycle.
|
||||
This class will also be managed internally by OpenLLM.'''
|
||||
__openllm_sampling_class__: type[openllm_core._configuration.SamplingParams] = Field(None)
|
||||
__openllm_sampling_class__: type[_SamplingParamsT] = Field(None)
|
||||
'''The result generated SamplingParams class for this LLMConfig. This will be used
|
||||
to create arguments for vLLM LLMEngine that can be used throughout the lifecycle.
|
||||
This class will also be managed internally by OpenLLM.'''
|
||||
@@ -598,8 +600,6 @@ class _ConfigAttr:
|
||||
'''The normalized version of __openllm_start_name__, determined by __openllm_name_type__'''
|
||||
__openllm_start_name__: str = Field(None)
|
||||
'''Default name to be used with `openllm start`'''
|
||||
__openllm_env__: openllm_core.utils.EnvVarMixin = Field(None)
|
||||
'''A EnvVarMixin instance for this LLMConfig.'''
|
||||
__openllm_timeout__: int = Field(None)
|
||||
'''The default timeout to be set for this given LLM.'''
|
||||
__openllm_workers_per_resource__: t.Union[int, float] = Field(None)
|
||||
@@ -724,7 +724,7 @@ class _ConfigBuilder:
|
||||
return self
|
||||
|
||||
@attr.define(slots=True, init=False)
|
||||
class LLMConfig(_ConfigAttr):
|
||||
class LLMConfig(_ConfigAttr[t.Any, t.Any]):
|
||||
"""``openllm.LLMConfig`` is a pydantic-like ``attrs`` interface that offers fast and easy-to-use APIs.
|
||||
|
||||
It lives in between the nice UX of `pydantic` and fast performance of `attrs` where it allows users to quickly formulate
|
||||
@@ -906,7 +906,13 @@ class LLMConfig(_ConfigAttr):
|
||||
f'{attr} should not be set during runtime as these value will be reflected during runtime. Instead, you can create a custom LLM subclass {self.__class__.__name__}.')
|
||||
super().__setattr__(attr, value)
|
||||
|
||||
def __init__(self, *, generation_config: DictStrAny | None = None, sampling_config: DictStrAny | None = None, __openllm_extras__: DictStrAny | None = None, **attrs: t.Any):
|
||||
def __init__(self,
|
||||
*,
|
||||
generation_config: DictStrAny | None = None,
|
||||
sampling_config: DictStrAny | None = None,
|
||||
__openllm_extras__: DictStrAny | None = None,
|
||||
__openllm_config_override__: DictStrAny | None = None,
|
||||
**attrs: t.Any):
|
||||
# create a copy of the keys as cache
|
||||
_cached_keys = tuple(attrs.keys())
|
||||
_generation_cl_dict = attr.fields_dict(self.__openllm_generation_class__)
|
||||
@@ -920,6 +926,7 @@ class LLMConfig(_ConfigAttr):
|
||||
for k in _cached_keys:
|
||||
if k in generation_config or k in sampling_config or attrs[k] is None: del attrs[k]
|
||||
|
||||
self.__openllm_config_override__ = __openllm_config_override__ or {}
|
||||
self.__openllm_extras__ = config_merger.merge(first_not_none(__openllm_extras__, default={}), {k: v for k, v in attrs.items() if k not in self.__openllm_accepted_keys__})
|
||||
self.generation_config = self['generation_class'](_internal=True, **generation_config)
|
||||
self.sampling_config = self['sampling_class'].from_generation_config(self.generation_config, **sampling_config)
|
||||
@@ -957,8 +964,6 @@ class LLMConfig(_ConfigAttr):
|
||||
@overload
|
||||
def __getitem__(self, item: t.Literal['start_name']) -> str: ...
|
||||
@overload
|
||||
def __getitem__(self, item: t.Literal['env']) -> openllm_core.utils.EnvVarMixin: ...
|
||||
@overload
|
||||
def __getitem__(self, item: t.Literal['timeout']) -> int: ...
|
||||
@overload
|
||||
def __getitem__(self, item: t.Literal['workers_per_resource']) -> t.Union[int, float]: ...
|
||||
@@ -1109,10 +1114,15 @@ class LLMConfig(_ConfigAttr):
|
||||
if item in _reserved_namespace:
|
||||
raise ForbiddenAttributeError(f"'{item}' is a reserved namespace for {self.__class__} and should not be access nor modified.")
|
||||
internal_attributes = f'__openllm_{item}__'
|
||||
if hasattr(self, internal_attributes): return getattr(self, internal_attributes)
|
||||
elif hasattr(self, item): return getattr(self, item)
|
||||
elif hasattr(self.__openllm_generation_class__, item): return getattr(self.generation_config, item)
|
||||
elif hasattr(self.__openllm_sampling_class__, item): return getattr(self.sampling_config, item)
|
||||
if hasattr(self, internal_attributes):
|
||||
if item in self.__openllm_config_override__: return self.__openllm_config_override__[item]
|
||||
return getattr(self, internal_attributes)
|
||||
elif hasattr(self, item):
|
||||
return getattr(self, item)
|
||||
elif hasattr(self.__openllm_generation_class__, item):
|
||||
return getattr(self.generation_config, item)
|
||||
elif hasattr(self.__openllm_sampling_class__, item):
|
||||
return getattr(self.sampling_config, item)
|
||||
elif item in self.__class__.__openllm_fine_tune_strategies__:
|
||||
return self.__class__.__openllm_fine_tune_strategies__[t.cast(AdapterType, item)]
|
||||
elif item in self.__openllm_extras__:
|
||||
@@ -1209,15 +1219,14 @@ class LLMConfig(_ConfigAttr):
|
||||
def model_construct_env(cls, **attrs: t.Any) -> Self:
|
||||
"""A helpers that respect configuration values environment variables."""
|
||||
attrs = {k: v for k, v in attrs.items() if v is not None}
|
||||
model_config = cls.__openllm_env__.config
|
||||
env_json_string = os.environ.get(model_config, None)
|
||||
env_json_string = os.environ.get('OPENLLM_CONFIG', None)
|
||||
|
||||
config_from_env: DictStrAny = {}
|
||||
if env_json_string is not None:
|
||||
try:
|
||||
config_from_env = orjson.loads(env_json_string)
|
||||
except orjson.JSONDecodeError as e:
|
||||
raise RuntimeError(f"Failed to parse '{model_config}' as valid JSON string.") from e
|
||||
raise RuntimeError("Failed to parse 'OPENLLM_CONFIG' as valid JSON string.") from e
|
||||
|
||||
if 'generation_config' in attrs and 'sampling_config' in attrs: # backward compatibility
|
||||
generation_config = attrs.pop('generation_config')
|
||||
@@ -1243,12 +1252,18 @@ class LLMConfig(_ConfigAttr):
|
||||
llm_config_attrs: DictStrAny = {'generation_config': {}, 'sampling_config': {}}
|
||||
key_to_remove: ListStr = []
|
||||
for k, v in attrs.items():
|
||||
if k.startswith(f"{self['model_name']}_generation_"):
|
||||
if k.startswith(f"{self['model_name']}_generation_"): # NOTE: This is an internal state for openllm cli.
|
||||
llm_config_attrs['generation_config'][k[len(self['model_name'] + '_generation_'):]] = v
|
||||
key_to_remove.append(k)
|
||||
elif k.startswith('_openllm_genericinternal_generation_'):
|
||||
llm_config_attrs['generation_config'][k[len('_openllm_genericinternal_generation_'):]] = v
|
||||
key_to_remove.append(k)
|
||||
elif k.startswith(f"{self['model_name']}_sampling_"):
|
||||
llm_config_attrs['sampling_config'][k[len(self['model_name'] + '_sampling_'):]] = v
|
||||
key_to_remove.append(k)
|
||||
elif k.startswith('_openllm_genericinternal_sampling_'):
|
||||
llm_config_attrs['sampling_config'][k[len('_openllm_genericinternal_sampling_'):]] = v
|
||||
key_to_remove.append(k)
|
||||
elif k.startswith(f"{self['model_name']}_"):
|
||||
llm_config_attrs[k[len(self['model_name'] + '_'):]] = v
|
||||
key_to_remove.append(k)
|
||||
@@ -1304,14 +1319,14 @@ class LLMConfig(_ConfigAttr):
|
||||
# NOTE: Union type is currently not yet supported, we probably just need to use environment instead.
|
||||
if t.get_origin(ty) is t.Union: continue
|
||||
f = dantic.attrs_to_options(name, field, cls.__openllm_model_name__, typ=ty, suffix_generation=True)(f)
|
||||
f = cog.optgroup.group(f'{cls.__openllm_generation_class__.__name__} generation options')(f)
|
||||
f = cog.optgroup.group('GenerationConfig generation options')(f)
|
||||
|
||||
for name, field in attr.fields_dict(cls.__openllm_sampling_class__).items():
|
||||
ty = cls.__openllm_hints__.get(name)
|
||||
# NOTE: Union type is currently not yet supported, we probably just need to use environment instead.
|
||||
if t.get_origin(ty) is t.Union: continue
|
||||
f = dantic.attrs_to_options(name, field, cls.__openllm_model_name__, typ=ty, suffix_sampling=True)(f)
|
||||
f = cog.optgroup.group(f'{cls.__openllm_sampling_class__.__name__} sampling options')(f)
|
||||
f = cog.optgroup.group('SamplingParams sampling options')(f)
|
||||
|
||||
total_keys = set(attr.fields_dict(cls.__openllm_generation_class__)) | set(attr.fields_dict(cls.__openllm_sampling_class__))
|
||||
|
||||
@@ -1335,6 +1350,7 @@ converter.register_unstructure_hook_factory(lambda cls: lenient_issubclass(cls,
|
||||
|
||||
def structure_llm_config(data: t.Any, cls: type[LLMConfig]) -> LLMConfig:
|
||||
if not isinstance(data, dict): raise RuntimeError(f'Expected a dictionary, but got {type(data)}')
|
||||
_config_override = {k: v for k, v in data.items() if k in cls.__config__}
|
||||
cls_attrs = {k: v for k, v in data.items() if k in cls.__openllm_accepted_keys__}
|
||||
generation_cls_fields = attr.fields_dict(cls.__openllm_generation_class__)
|
||||
sampling_cls_fields = attr.fields_dict(cls.__openllm_sampling_class__)
|
||||
@@ -1353,8 +1369,8 @@ def structure_llm_config(data: t.Any, cls: type[LLMConfig]) -> LLMConfig:
|
||||
else:
|
||||
sampling_config = {k: v for k, v in data.items() if k in sampling_cls_fields}
|
||||
# The rest should be passed to extras
|
||||
data = {k: v for k, v in data.items() if k not in cls.__openllm_accepted_keys__}
|
||||
return cls(generation_config=generation_config, sampling_config=sampling_config, __openllm_extras__=data, **cls_attrs)
|
||||
data = {k: v for k, v in data.items() if k not in cls.__openllm_accepted_keys__ and k not in _config_override}
|
||||
return cls(generation_config=generation_config, sampling_config=sampling_config, __openllm_extras__=data, __openllm_config_override__=_config_override, **cls_attrs)
|
||||
|
||||
converter.register_structure_hook_func(lambda cls: lenient_issubclass(cls, LLMConfig), structure_llm_config)
|
||||
openllm_home = os.path.expanduser(os.environ.get('OPENLLM_HOME', os.path.join(os.environ.get('XDG_CACHE_HOME', os.path.join(os.path.expanduser('~'), '.cache')), 'openllm')))
|
||||
|
||||
@@ -1,45 +1,23 @@
|
||||
# mypy: disable-error-code="type-arg,valid-type"
|
||||
from __future__ import annotations
|
||||
import abc
|
||||
import sys
|
||||
import typing as t
|
||||
|
||||
import attr
|
||||
|
||||
_is_bentoml_installed = False
|
||||
try:
|
||||
import bentoml
|
||||
_is_bentoml_installed = True
|
||||
except ImportError:
|
||||
bentoml = None
|
||||
_is_bentoml_installed = False
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
import peft
|
||||
import transformers
|
||||
|
||||
import openllm
|
||||
|
||||
from bentoml._internal.runner.runnable import RunnableMethod
|
||||
from bentoml._internal.runner.runner import RunnerMethod
|
||||
from bentoml._internal.runner.strategy import Strategy
|
||||
from peft.peft_model import PeftModel
|
||||
from transformers import PreTrainedModel
|
||||
from transformers import PreTrainedTokenizer
|
||||
from transformers import PreTrainedTokenizerBase
|
||||
from transformers import PreTrainedTokenizerFast
|
||||
|
||||
from .utils.lazy import VersionInfo
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
from types import UnionType
|
||||
|
||||
from bentoml._internal.types import LazyType
|
||||
|
||||
AnyType: t.TypeAlias = t.Type[t.Any] | UnionType | LazyType[t.Any]
|
||||
else:
|
||||
AnyType = t.Any
|
||||
|
||||
M = t.TypeVar('M', bound='t.Union[transformers.PreTrainedModel, peft.PeftModel]')
|
||||
T = t.TypeVar('T', bound='t.Union[transformers.PreTrainedTokenizerFast, transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerBase]')
|
||||
M = t.TypeVar('M', bound='t.Union[PreTrainedModel, PeftModel]')
|
||||
T = t.TypeVar('T', bound='t.Union[PreTrainedTokenizerFast, PreTrainedTokenizer, PreTrainedTokenizerBase]')
|
||||
|
||||
def get_literal_args(typ: t.Any) -> tuple[str, ...]:
|
||||
return getattr(typ, '__args__')
|
||||
return getattr(typ, '__args__', tuple())
|
||||
|
||||
AnyCallable = t.Callable[..., t.Any]
|
||||
DictStrAny = t.Dict[str, t.Any]
|
||||
@@ -87,17 +65,6 @@ else:
|
||||
from typing_extensions import TypeAlias as TypeAlias
|
||||
from typing_extensions import TypeGuard as TypeGuard
|
||||
|
||||
class ModelSignatureDict(t.TypedDict, total=False):
|
||||
batchable: bool
|
||||
batch_dim: t.Union[t.Tuple[int, int], int]
|
||||
input_spec: t.Optional[t.Union[t.Tuple[AnyType], AnyType]]
|
||||
output_spec: t.Optional[AnyType]
|
||||
|
||||
class PeftAdapterOutput(t.TypedDict):
|
||||
success: bool
|
||||
result: t.Dict[str, peft.PeftConfig]
|
||||
error_msg: str
|
||||
|
||||
class AdapterTuple(TupleAny):
|
||||
adapter_id: str
|
||||
name: str
|
||||
@@ -109,55 +76,3 @@ class RefTuple(TupleAny):
|
||||
git_hash: str
|
||||
version: VersionInfo
|
||||
strategy: LiteralContainerVersionStrategy
|
||||
|
||||
if _is_bentoml_installed:
|
||||
|
||||
class LLMRunnable(bentoml.Runnable, t.Generic[M, T]):
|
||||
SUPPORTED_RESOURCES = ('amd.com/gpu', 'nvidia.com/gpu', 'cpu')
|
||||
SUPPORTS_CPU_MULTI_THREADING = True
|
||||
generate_iterator: RunnableMethod[LLMRunnable[M, T], [list[int], str, str | t.Iterable[str] | None, str | None], str]
|
||||
|
||||
class LLMRunner(bentoml.Runner, t.Generic[M, T]):
|
||||
__doc__: str
|
||||
__module__: str
|
||||
llm_type: str
|
||||
llm_tag: bentoml.Tag
|
||||
identifying_params: dict[str, t.Any]
|
||||
llm: openllm.LLM[M, T]
|
||||
config: openllm.LLMConfig
|
||||
backend: LiteralBackend
|
||||
has_adapters: bool
|
||||
system_message: str | None
|
||||
prompt_template: str | None
|
||||
generate_iterator: RunnerMethod[LLMRunnable[M, T], [list[int], str, str | t.Iterable[str] | None, str | None], str]
|
||||
|
||||
def __init__(self,
|
||||
runnable_class: type[LLMRunnable[M, T]],
|
||||
*,
|
||||
runnable_init_params: dict[str, t.Any] | None = ...,
|
||||
name: str | None = ...,
|
||||
scheduling_strategy: type[Strategy] = ...,
|
||||
models: list[bentoml.Model] | None = ...,
|
||||
max_batch_size: int | None = ...,
|
||||
max_latency_ms: int | None = ...,
|
||||
method_configs: dict[str, dict[str, int]] | None = ...,
|
||||
embedded: bool = False,
|
||||
) -> None:
|
||||
...
|
||||
|
||||
@abc.abstractmethod
|
||||
def download_model(self) -> bentoml.Model:
|
||||
...
|
||||
|
||||
@property
|
||||
@abc.abstractmethod
|
||||
def peft_adapters(self) -> PeftAdapterOutput:
|
||||
...
|
||||
|
||||
@property
|
||||
@abc.abstractmethod
|
||||
def __repr_keys__(self) -> set[str]:
|
||||
...
|
||||
else:
|
||||
# NOTE: t.Any is also a type
|
||||
LLMRunnable = LLMRunner = t.Any
|
||||
|
||||
@@ -5,7 +5,6 @@ import openllm_core
|
||||
|
||||
from openllm_core._conversation import SeparatorStyle
|
||||
from openllm_core.prompts import PromptTemplate
|
||||
from openllm_core.utils import dantic
|
||||
|
||||
START_LLAMA_COMMAND_DOCSTRING = '''\
|
||||
Run a LLMServer for Llama model.
|
||||
@@ -70,7 +69,6 @@ class LlamaConfig(openllm_core.LLMConfig):
|
||||
Refer to [Llama's model card](https://huggingface.co/docs/transformers/main/model_doc/llama)
|
||||
for more information.
|
||||
"""
|
||||
use_llama2_prompt: bool = dantic.Field(False, description='Whether to use the prompt format for Llama 2. Disable this when working with Llama 1.')
|
||||
__config__ = {
|
||||
'name_type': 'lowercase',
|
||||
'url': 'https://github.com/facebookresearch/llama',
|
||||
@@ -106,7 +104,7 @@ class LlamaConfig(openllm_core.LLMConfig):
|
||||
|
||||
@property
|
||||
def default_prompt_template(self) -> str:
|
||||
return DEFAULT_PROMPT_TEMPLATE('v2' if self.use_llama2_prompt else 'v1').to_string()
|
||||
return DEFAULT_PROMPT_TEMPLATE('v2').to_string()
|
||||
|
||||
@property
|
||||
def default_system_message(self) -> str:
|
||||
@@ -120,10 +118,9 @@ class LlamaConfig(openllm_core.LLMConfig):
|
||||
top_p: float | None = None,
|
||||
temperature: float | None = None,
|
||||
max_new_tokens: int | None = None,
|
||||
use_llama2_prompt: bool = True,
|
||||
**attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
|
||||
system_message = DEFAULT_SYSTEM_MESSAGE if system_message is None else system_message
|
||||
if prompt_template is None: prompt_template = DEFAULT_PROMPT_TEMPLATE('v2' if use_llama2_prompt else 'v1')
|
||||
if prompt_template is None: prompt_template = DEFAULT_PROMPT_TEMPLATE('v2')
|
||||
elif isinstance(prompt_template, str): prompt_template = PromptTemplate(template=prompt_template)
|
||||
return prompt_template.with_options(system_message=system_message).format(instruction=prompt), {
|
||||
'max_new_tokens': max_new_tokens,
|
||||
|
||||
@@ -5,7 +5,6 @@ import openllm_core
|
||||
|
||||
from openllm_core._conversation import SeparatorStyle
|
||||
from openllm_core.prompts import process_prompt
|
||||
from openllm_core.utils import dantic
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
from openllm_core.prompts.prompt_template import PromptTemplate
|
||||
@@ -61,7 +60,6 @@ class OPTConfig(openllm_core.LLMConfig):
|
||||
'bias': 'none'
|
||||
},)
|
||||
}
|
||||
format_outputs: bool = dantic.Field(False, description='''Whether to format the outputs. This can be used when num_return_sequences > 1.''')
|
||||
|
||||
class GenerationConfig:
|
||||
top_k: int = 15
|
||||
@@ -88,5 +86,4 @@ class OPTConfig(openllm_core.LLMConfig):
|
||||
|
||||
def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **attrs: t.Any) -> str:
|
||||
if len(generation_result) == 1: return generation_result[0]
|
||||
if self.config.format_outputs: return 'Generated result:\n' + '\n -'.join(generation_result)
|
||||
else: return '\n'.join(generation_result)
|
||||
return '\n'.join(generation_result)
|
||||
|
||||
@@ -1,9 +1,4 @@
|
||||
"""Utilities function for OpenLLM.
|
||||
|
||||
User can import these function for convenience, but we won't ensure backward compatibility for these functions. So use with caution.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import asyncio
|
||||
import contextlib
|
||||
import functools
|
||||
import hashlib
|
||||
@@ -24,7 +19,6 @@ from .import_utils import ENV_VARS_TRUE_VALUES as ENV_VARS_TRUE_VALUES
|
||||
from .lazy import LazyLoader as LazyLoader
|
||||
from .lazy import LazyModule as LazyModule
|
||||
from .lazy import VersionInfo as VersionInfo
|
||||
from .._typing_compat import TypeGuard
|
||||
from .._typing_compat import overload
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
@@ -51,18 +45,7 @@ else:
|
||||
|
||||
DEV_DEBUG_VAR = 'OPENLLMDEVDEBUG'
|
||||
|
||||
def is_async_callable(obj: t.Any) -> TypeGuard[t.Callable[..., t.Awaitable[t.Any]]]:
|
||||
# Borrowed from starlette._utils
|
||||
while isinstance(obj, functools.partial):
|
||||
obj = obj.func
|
||||
return asyncio.iscoroutinefunction(obj) or (callable(obj) and asyncio.iscoroutinefunction(obj.__call__))
|
||||
|
||||
def resolve_user_filepath(filepath: str, ctx: str | None) -> str:
|
||||
'''Resolve the abspath of a filepath provided by user. User provided file path can:
|
||||
* be a relative path base on ctx dir
|
||||
* contain leading "~" for HOME directory
|
||||
* contain environment variables such as "$HOME/workspace"
|
||||
'''
|
||||
# Return if filepath exist after expanduser
|
||||
|
||||
_path = os.path.expanduser(os.path.expandvars(filepath))
|
||||
@@ -132,12 +115,6 @@ def lenient_issubclass(cls: t.Any, class_or_tuple: type[t.Any] | tuple[type[t.An
|
||||
if isinstance(cls, _WithArgsTypes): return False
|
||||
raise
|
||||
|
||||
def ensure_exec_coro(coro: t.Coroutine[t.Any, t.Any, t.Any]) -> t.Any:
|
||||
if in_notebook(): return asyncio.run(coro) # For running coroutine in notebooks see https://github.com/jupyter/notebook/issues/5261
|
||||
loop = asyncio.get_event_loop()
|
||||
if loop.is_running(): return asyncio.run_coroutine_threadsafe(coro, loop).result()
|
||||
else: return loop.run_until_complete(coro)
|
||||
|
||||
@functools.lru_cache(maxsize=128)
|
||||
def generate_hash_from_file(f: str, algorithm: t.Literal['md5', 'sha1'] = 'sha1') -> str:
|
||||
"""Generate a hash from given file's modification time.
|
||||
@@ -159,11 +136,6 @@ def check_bool_env(env: str, default: bool = True) -> bool:
|
||||
# equivocal setattr to save one lookup per assignment
|
||||
_object_setattr = object.__setattr__
|
||||
|
||||
def non_intrusive_setattr(obj: t.Any, name: str, value: t.Any) -> None:
|
||||
"""This makes sure that we don't overwrite any existing attributes on the object."""
|
||||
_setattr = functools.partial(setattr, obj) if isinstance(obj, type) else _object_setattr.__get__(obj)
|
||||
if not hasattr(obj, name): _setattr(name, value)
|
||||
|
||||
def field_env_key(key: str, suffix: str | None = None) -> str:
|
||||
return '_'.join(filter(None, map(str.upper, ['OPENLLM', suffix.strip('_') if suffix else '', key])))
|
||||
|
||||
@@ -175,11 +147,11 @@ SHOW_CODEGEN: bool = DEBUG and (os.environ.get(DEV_DEBUG_VAR, str(0)).isdigit()
|
||||
MYPY = False
|
||||
|
||||
def get_debug_mode() -> bool:
|
||||
if not DEBUG and DEBUG_ENV_VAR in os.environ: return check_bool_env(DEBUG_ENV_VAR)
|
||||
if not DEBUG and DEBUG_ENV_VAR in os.environ: return check_bool_env(DEBUG_ENV_VAR, False)
|
||||
return DEBUG
|
||||
|
||||
def get_quiet_mode() -> bool:
|
||||
if QUIET_ENV_VAR in os.environ: return check_bool_env(QUIET_ENV_VAR)
|
||||
if QUIET_ENV_VAR in os.environ: return check_bool_env(QUIET_ENV_VAR, False)
|
||||
if DEBUG: return False
|
||||
return False
|
||||
|
||||
@@ -284,8 +256,6 @@ def in_notebook() -> bool:
|
||||
except (ImportError, AttributeError):
|
||||
return False
|
||||
|
||||
_dockerenv, _cgroup = Path('/.dockerenv'), Path('/proc/self/cgroup')
|
||||
|
||||
class suppress(contextlib.suppress, contextlib.ContextDecorator):
|
||||
"""A version of contextlib.suppress with decorator support.
|
||||
|
||||
@@ -334,20 +304,6 @@ def apply(transform: AnyCallable) -> t.Callable[[AnyCallable], AnyCallable]:
|
||||
"""
|
||||
return lambda func: functools.wraps(func)(compose(transform, func))
|
||||
|
||||
@apply(bool)
|
||||
@suppress(FileNotFoundError)
|
||||
def _text_in_file(text: str, filename: Path) -> bool:
|
||||
return any(text in line for line in filename.open())
|
||||
|
||||
def in_docker() -> bool:
|
||||
"""Is this current environment running in docker?
|
||||
|
||||
```python
|
||||
type(in_docker())
|
||||
```
|
||||
"""
|
||||
return _dockerenv.exists() or _text_in_file('docker', _cgroup)
|
||||
|
||||
T = t.TypeVar('T')
|
||||
K = t.TypeVar('K')
|
||||
|
||||
@@ -394,7 +350,7 @@ _whitelist_modules = {'pkg'}
|
||||
# XXX: define all classes, functions import above this line
|
||||
# since _extras will be the locals() import from this file.
|
||||
_extras: dict[str, t.Any] = {k: v for k, v in locals().items() if k in _whitelist_modules or (not isinstance(v, types.ModuleType) and not k.startswith('_'))}
|
||||
_extras['__openllm_migration__'] = {'ModelEnv': 'EnvVarMixin', 'bentoml_cattr': 'converter'}
|
||||
_extras['__openllm_migration__'] = {'bentoml_cattr': 'converter'}
|
||||
_import_structure: dict[str, list[str]] = {
|
||||
'analytics': [],
|
||||
'codegen': [],
|
||||
@@ -404,10 +360,8 @@ _import_structure: dict[str, list[str]] = {
|
||||
'representation': ['ReprMixin'],
|
||||
'serde': ['converter'],
|
||||
'import_utils': [
|
||||
'OPTIONAL_DEPENDENCIES', 'EnvVarMixin', 'is_cpm_kernels_available', 'is_einops_available', 'is_vllm_available', 'is_torch_available', 'is_bitsandbytes_available', 'is_peft_available',
|
||||
'is_datasets_available', 'is_jupyter_available', 'is_jupytext_available', 'is_notebook_available', 'is_triton_available', 'is_autogptq_available', 'is_sentencepiece_available',
|
||||
'is_xformers_available', 'is_fairscale_available', 'is_grpc_available', 'is_grpc_health_available', 'is_transformers_available', 'is_optimum_supports_gptq', 'is_autoawq_available',
|
||||
'is_bentoml_available'
|
||||
'OPTIONAL_DEPENDENCIES', 'is_vllm_available', 'is_torch_available', 'is_bitsandbytes_available', 'is_peft_available', 'is_jupyter_available', 'is_jupytext_available',
|
||||
'is_notebook_available', 'is_autogptq_available', 'is_grpc_available', 'is_transformers_available', 'is_optimum_supports_gptq', 'is_autoawq_available', 'is_bentoml_available'
|
||||
]
|
||||
}
|
||||
|
||||
@@ -418,28 +372,19 @@ if t.TYPE_CHECKING:
|
||||
from . import dantic as dantic
|
||||
from . import serde as serde
|
||||
from .import_utils import OPTIONAL_DEPENDENCIES as OPTIONAL_DEPENDENCIES
|
||||
from .import_utils import EnvVarMixin as EnvVarMixin
|
||||
from .import_utils import is_autoawq_available as is_autoawq_available
|
||||
from .import_utils import is_autogptq_available as is_autogptq_available
|
||||
from .import_utils import is_bentoml_available as is_bentoml_available
|
||||
from .import_utils import is_bitsandbytes_available as is_bitsandbytes_available
|
||||
from .import_utils import is_cpm_kernels_available as is_cpm_kernels_available
|
||||
from .import_utils import is_datasets_available as is_datasets_available
|
||||
from .import_utils import is_einops_available as is_einops_available
|
||||
from .import_utils import is_fairscale_available as is_fairscale_available
|
||||
from .import_utils import is_grpc_available as is_grpc_available
|
||||
from .import_utils import is_grpc_health_available as is_grpc_health_available
|
||||
from .import_utils import is_jupyter_available as is_jupyter_available
|
||||
from .import_utils import is_jupytext_available as is_jupytext_available
|
||||
from .import_utils import is_notebook_available as is_notebook_available
|
||||
from .import_utils import is_optimum_supports_gptq as is_optimum_supports_gptq
|
||||
from .import_utils import is_peft_available as is_peft_available
|
||||
from .import_utils import is_sentencepiece_available as is_sentencepiece_available
|
||||
from .import_utils import is_torch_available as is_torch_available
|
||||
from .import_utils import is_transformers_available as is_transformers_available
|
||||
from .import_utils import is_triton_available as is_triton_available
|
||||
from .import_utils import is_vllm_available as is_vllm_available
|
||||
from .import_utils import is_xformers_available as is_xformers_available
|
||||
from .representation import ReprMixin as ReprMixin
|
||||
from .serde import converter as converter
|
||||
|
||||
|
||||
@@ -1,7 +1,3 @@
|
||||
"""Telemetry related for OpenLLM tracking.
|
||||
|
||||
Users can disable this with OPENLLM_DO_NOT_TRACK envvar.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import contextlib
|
||||
import functools
|
||||
|
||||
@@ -139,8 +139,7 @@ def make_env_transformer(cls: type[openllm_core.LLMConfig],
|
||||
return generate_function(cls, '__auto_env', lines, args=('_', 'fields'), globs=globs, annotations={'_': 'type[LLMConfig]', 'fields': fields_ann, 'return': fields_ann})
|
||||
|
||||
def gen_sdk(func: _T, name: str | None = None, **attrs: t.Any) -> _T:
|
||||
"""Enhance sdk with nice repr that plays well with your brain."""
|
||||
from openllm_core.utils import ReprMixin
|
||||
from .representation import ReprMixin
|
||||
if name is None: name = func.__name__.strip('_')
|
||||
_signatures = inspect.signature(func).parameters
|
||||
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
"""Some imports utils are vendorred from transformers/utils/import_utils.py for performance reasons."""
|
||||
from __future__ import annotations
|
||||
import importlib
|
||||
import importlib.metadata
|
||||
@@ -7,17 +6,6 @@ import logging
|
||||
import os
|
||||
import typing as t
|
||||
|
||||
import inflection
|
||||
|
||||
import openllm_core
|
||||
|
||||
from openllm_core._typing_compat import LiteralBackend
|
||||
from openllm_core._typing_compat import LiteralString
|
||||
from openllm_core._typing_compat import overload
|
||||
|
||||
from .lazy import LazyLoader
|
||||
from .representation import ReprMixin
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
from collections import OrderedDict
|
||||
BackendOrderedDict = OrderedDict[str, t.Tuple[t.Callable[[], bool], str]]
|
||||
@@ -42,22 +30,14 @@ _torch_available = importlib.util.find_spec('torch') is not None
|
||||
_vllm_available = importlib.util.find_spec('vllm') is not None
|
||||
_transformers_available = _is_package_available('transformers')
|
||||
_grpc_available = importlib.util.find_spec('grpc') is not None
|
||||
_grpc_health_available = importlib.util.find_spec('grpc_health') is not None
|
||||
_bentoml_available = _is_package_available('bentoml')
|
||||
_peft_available = _is_package_available('peft')
|
||||
_einops_available = _is_package_available('einops')
|
||||
_cpm_kernel_available = _is_package_available('cpm_kernels')
|
||||
_bitsandbytes_available = _is_package_available('bitsandbytes')
|
||||
_datasets_available = _is_package_available('datasets')
|
||||
_triton_available = _is_package_available('triton')
|
||||
_jupyter_available = _is_package_available('jupyter')
|
||||
_jupytext_available = _is_package_available('jupytext')
|
||||
_notebook_available = _is_package_available('notebook')
|
||||
_autogptq_available = _is_package_available('auto_gptq')
|
||||
_autoawq_available = importlib.util.find_spec('awq') is not None
|
||||
_sentencepiece_available = _is_package_available('sentencepiece')
|
||||
_xformers_available = _is_package_available('xformers')
|
||||
_fairscale_available = _is_package_available('fairscale')
|
||||
|
||||
def is_bentoml_available() -> bool:
|
||||
return _bentoml_available
|
||||
@@ -68,9 +48,6 @@ def is_transformers_available() -> bool:
|
||||
def is_grpc_available() -> bool:
|
||||
return _grpc_available
|
||||
|
||||
def is_grpc_health_available() -> bool:
|
||||
return _grpc_health_available
|
||||
|
||||
def is_optimum_supports_gptq() -> bool:
|
||||
from . import pkg
|
||||
return pkg.pkg_version_info('optimum')[:2] >= (0, 12)
|
||||
@@ -84,36 +61,15 @@ def is_jupytext_available() -> bool:
|
||||
def is_notebook_available() -> bool:
|
||||
return _notebook_available
|
||||
|
||||
def is_triton_available() -> bool:
|
||||
return _triton_available
|
||||
|
||||
def is_datasets_available() -> bool:
|
||||
return _datasets_available
|
||||
|
||||
def is_peft_available() -> bool:
|
||||
return _peft_available
|
||||
|
||||
def is_einops_available() -> bool:
|
||||
return _einops_available
|
||||
|
||||
def is_cpm_kernels_available() -> bool:
|
||||
return _cpm_kernel_available
|
||||
|
||||
def is_bitsandbytes_available() -> bool:
|
||||
return _bitsandbytes_available
|
||||
|
||||
def is_autogptq_available() -> bool:
|
||||
return _autogptq_available
|
||||
|
||||
def is_sentencepiece_available() -> bool:
|
||||
return _sentencepiece_available
|
||||
|
||||
def is_xformers_available() -> bool:
|
||||
return _xformers_available
|
||||
|
||||
def is_fairscale_available() -> bool:
|
||||
return _fairscale_available
|
||||
|
||||
def is_torch_available() -> bool:
|
||||
global _torch_available
|
||||
if USE_TORCH in ENV_VARS_TRUE_AND_AUTO_VALUES and _torch_available:
|
||||
@@ -139,77 +95,3 @@ def is_vllm_available() -> bool:
|
||||
except importlib.metadata.PackageNotFoundError:
|
||||
_vllm_available = False
|
||||
return _vllm_available
|
||||
|
||||
class EnvVarMixin(ReprMixin):
|
||||
model_name: str
|
||||
config: str
|
||||
model_id: str
|
||||
quantize: str
|
||||
backend: str
|
||||
|
||||
@overload
|
||||
def __getitem__(self, item: t.Literal['config']) -> str:
|
||||
...
|
||||
|
||||
@overload
|
||||
def __getitem__(self, item: t.Literal['model_id']) -> str:
|
||||
...
|
||||
|
||||
@overload
|
||||
def __getitem__(self, item: t.Literal['quantize']) -> str:
|
||||
...
|
||||
|
||||
@overload
|
||||
def __getitem__(self, item: t.Literal['backend']) -> str:
|
||||
...
|
||||
|
||||
@overload
|
||||
def __getitem__(self, item: t.Literal['backend_value']) -> LiteralBackend:
|
||||
...
|
||||
|
||||
@overload
|
||||
def __getitem__(self, item: t.Literal['quantize_value']) -> t.Literal['int8', 'int4', 'gptq'] | None:
|
||||
...
|
||||
|
||||
@overload
|
||||
def __getitem__(self, item: t.Literal['model_id_value']) -> str | None:
|
||||
...
|
||||
|
||||
def __getitem__(self, item: str | t.Any) -> t.Any:
|
||||
if item.endswith('_value') and hasattr(self, f'_{item}'): return object.__getattribute__(self, f'_{item}')()
|
||||
elif hasattr(self, item): return getattr(self, item)
|
||||
raise KeyError(f'Key {item} not found in {self}')
|
||||
|
||||
def __init__(self, model_name: str, backend: LiteralBackend = 'pt', model_id: str | None = None, quantize: LiteralString | None = None) -> None:
|
||||
"""EnvVarMixin is a mixin class that returns the value extracted from environment variables."""
|
||||
from openllm_core.utils import field_env_key
|
||||
self.model_name = inflection.underscore(model_name)
|
||||
self._backend = backend
|
||||
self._model_id = model_id
|
||||
self._quantize = quantize
|
||||
for att in {'config', 'model_id', 'quantize', 'backend'}:
|
||||
setattr(self, att, field_env_key(att.upper()))
|
||||
|
||||
def _quantize_value(self) -> t.Literal['int8', 'int4', 'gptq'] | None:
|
||||
from . import first_not_none
|
||||
return t.cast(t.Optional[t.Literal['int8', 'int4', 'gptq']], first_not_none(os.environ.get(self['quantize']), default=self._quantize))
|
||||
|
||||
def _backend_value(self) -> LiteralBackend:
|
||||
from . import first_not_none
|
||||
return t.cast(LiteralBackend, first_not_none(os.environ.get(self['backend']), default=self._backend))
|
||||
|
||||
def _model_id_value(self) -> str | None:
|
||||
from . import first_not_none
|
||||
return first_not_none(os.environ.get(self['model_id']), default=self._model_id)
|
||||
|
||||
@property
|
||||
def __repr_keys__(self) -> set[str]:
|
||||
return {'config', 'model_id', 'quantize', 'backend'}
|
||||
|
||||
@property
|
||||
def start_docstring(self) -> str:
|
||||
return getattr(openllm_core.config, f'START_{self.model_name.upper()}_COMMAND_DOCSTRING')
|
||||
|
||||
@property
|
||||
def module(self) -> LazyLoader:
|
||||
return LazyLoader(f'configuration_{self.model_name}', globals(), f'openllm_core.config.configuration_{self.model_name}')
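The `EnvVarMixin` removed above resolved each field from an environment variable with a fallback to the constructor argument. After this refactor the same lookups happen inline against flat `OPENLLM_*` variables (as seen in the `_deprecated` and CLI hunks further down); a rough sketch of that resolution pattern (the `first_not_none` body and the example model id are illustrative):

```python
import os
import typing as t

def first_not_none(*args: t.Any, default: t.Any = None) -> t.Any:
  # return the first non-None argument, otherwise the default
  return next((arg for arg in args if arg is not None), default)

# after this refactor, values are resolved directly from flat OPENLLM_* variables:
backend = first_not_none(os.getenv('OPENLLM_BACKEND'), default='pt')
quantize = first_not_none(os.getenv('OPENLLM_QUANTIZE'), default=None)
model_id = first_not_none(os.getenv('OPENLLM_MODEL_ID'), default='meta-llama/Llama-2-7b-hf')
print(backend, quantize, model_id)
```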
@@ -79,8 +79,8 @@ class FineTuneConfig:
|
||||
|
||||
def build(self) -> PeftConfig:
|
||||
try:
|
||||
from peft.utils.peft_types import TaskType
|
||||
from peft.mapping import get_peft_config
|
||||
from peft.utils.peft_types import TaskType
|
||||
except ImportError:
|
||||
raise ImportError('PEFT is not installed. Please install it via `pip install "openllm[fine-tune]"`.')
|
||||
adapter_config = self.adapter_config.copy()
|
||||
|
||||
@@ -19,38 +19,17 @@ class ReprMixin:
|
||||
def __repr_keys__(self) -> set[str]:
|
||||
raise NotImplementedError
|
||||
|
||||
'''This can be overridden by the base class using this mixin.'''
|
||||
|
||||
def __repr__(self) -> str:
|
||||
def __repr__(self):
|
||||
return f'{self.__class__.__name__} {orjson.dumps({k: utils.converter.unstructure(v) if attr.has(v) else v for k, v in self.__repr_args__()}, option=orjson.OPT_INDENT_2).decode()}'
|
||||
|
||||
'''The `__repr__` for any subclass of Mixin.
|
||||
|
||||
It will nicely print the class name with each of the fields under '__repr_keys__' as a kv JSON dict.
|
||||
'''
|
||||
|
||||
def __str__(self) -> str:
|
||||
def __str__(self):
|
||||
return self.__repr_str__(' ')
|
||||
|
||||
'''The string representation of the given Mixin subclass.
|
||||
|
||||
It will contain all of the attributes from __repr_keys__
|
||||
'''
|
||||
|
||||
def __repr_name__(self) -> str:
|
||||
return self.__class__.__name__
|
||||
|
||||
'''Name of the instance's class, used in __repr__.'''
|
||||
|
||||
def __repr_str__(self, join_str: str) -> str:
|
||||
def __repr_str__(self, join_str: str):
|
||||
return join_str.join(repr(v) if a is None else f'{a}={v!r}' for a, v in self.__repr_args__())
|
||||
|
||||
'''To be used with __str__.'''
|
||||
|
||||
def __repr_args__(self) -> ReprArgs:
|
||||
return ((k, getattr(self, k)) for k in self.__repr_keys__)
|
||||
|
||||
'''This can also be overridden by the base class using this mixin.
|
||||
|
||||
By default it does a getattr of the current object from __repr_keys__.
|
||||
'''
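To make the hunk above easier to follow: a subclass only has to supply `__repr_keys__`, and everything else falls out of the mixin. A simplified sketch (the real `__repr__` serialises values to JSON with orjson, and the `Foo` class is made up for illustration):

```python
import typing as t

class ReprMixin:
  @property
  def __repr_keys__(self) -> set[str]:
    raise NotImplementedError

  def __repr__(self) -> str:
    return f'{self.__class__.__name__} {dict(self.__repr_args__())}'

  def __str__(self) -> str:
    return self.__repr_str__(' ')

  def __repr_str__(self, join_str: str) -> str:
    return join_str.join(f'{k}={v!r}' for k, v in self.__repr_args__())

  def __repr_args__(self) -> t.Iterator[t.Tuple[str, t.Any]]:
    return ((k, getattr(self, k)) for k in self.__repr_keys__)

class Foo(ReprMixin):
  def __init__(self, model_id: str, backend: str) -> None:
    self.model_id, self.backend = model_id, backend

  @property
  def __repr_keys__(self) -> set[str]:
    return {'model_id', 'backend'}

foo = Foo('facebook/opt-125m', 'pt')
print(str(foo))   # e.g. model_id='facebook/opt-125m' backend='pt' (set order may vary)
print(repr(foo))  # e.g. Foo {'model_id': 'facebook/opt-125m', 'backend': 'pt'}
```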
@@ -46,7 +46,6 @@ dependencies = [
|
||||
"optimum>=1.12.0",
|
||||
"accelerate",
|
||||
"ghapi",
|
||||
"tabulate[widechars]>=0.9.0",
|
||||
"click>=8.1.3",
|
||||
"cuda-python;platform_system!=\"Darwin\"",
|
||||
"bitsandbytes<0.42",
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
'''OpenLLM.
|
||||
"""OpenLLM.
|
||||
|
||||
An open platform for operating large language models in production. Fine-tune, serve,
|
||||
deploy, and monitor any LLMs with ease.
|
||||
@@ -7,7 +7,7 @@ deploy, and monitor any LLMs with ease.
|
||||
* Option to bring your own fine-tuned LLMs
|
||||
* Online Serving with HTTP, gRPC, SSE(coming soon) or custom API
|
||||
* Native integration with BentoML and LangChain for custom LLM apps
|
||||
'''
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import logging as _logging
|
||||
import os as _os
|
||||
@@ -34,11 +34,11 @@ from openllm_core.config import FalconConfig as FalconConfig
|
||||
from openllm_core.config import FlanT5Config as FlanT5Config
|
||||
from openllm_core.config import GPTNeoXConfig as GPTNeoXConfig
|
||||
from openllm_core.config import LlamaConfig as LlamaConfig
|
||||
from openllm_core.config import MistralConfig as MistralConfig
|
||||
from openllm_core.config import MPTConfig as MPTConfig
|
||||
from openllm_core.config import OPTConfig as OPTConfig
|
||||
from openllm_core.config import StableLMConfig as StableLMConfig
|
||||
from openllm_core.config import StarCoderConfig as StarCoderConfig
|
||||
from openllm_core.config import MistralConfig as MistralConfig
|
||||
|
||||
from . import exceptions as exceptions
|
||||
from . import utils as utils
|
||||
@@ -85,8 +85,7 @@ if _t.TYPE_CHECKING:
|
||||
from . import serialisation as serialisation
|
||||
from . import testing as testing
|
||||
from . import utils as utils
|
||||
from ._strategies import CascadingResourceStrategy as CascadingResourceStrategy
|
||||
from ._strategies import get_resource as get_resource
|
||||
from ._deprecated import Runner as Runner
|
||||
from ._generation import LogitsProcessorList as LogitsProcessorList
|
||||
from ._generation import StopOnTokens as StopOnTokens
|
||||
from ._generation import StoppingCriteriaList as StoppingCriteriaList
|
||||
@@ -96,17 +95,18 @@ if _t.TYPE_CHECKING:
|
||||
from ._llm import LLMRunnable as LLMRunnable
|
||||
from ._llm import LLMRunner as LLMRunner
|
||||
from ._quantisation import infer_quantisation_config as infer_quantisation_config
|
||||
from ._deprecated import Runner as Runner
|
||||
from ._strategies import CascadingResourceStrategy as CascadingResourceStrategy
|
||||
from ._strategies import get_resource as get_resource
|
||||
from .cli._sdk import build as build
|
||||
from .cli._sdk import import_model as import_model
|
||||
from .cli._sdk import list_models as list_models
|
||||
from .cli._sdk import start as start
|
||||
from .cli._sdk import start_grpc as start_grpc
|
||||
from .entrypoints import mount_entrypoints as mount_entrypoints
|
||||
from .prompts import PromptTemplate as PromptTemplate
|
||||
from .protocol import openai as openai
|
||||
from .serialisation import ggml as ggml
|
||||
from .serialisation import transformers as transformers
|
||||
from .entrypoints import mount_entrypoints as mount_entrypoints
|
||||
|
||||
# NOTE: update this to sys.modules[__name__] once mypy_extensions can recognize __spec__
|
||||
__lazy = openllm_core.utils.LazyModule(__name__, globals()['__file__'], _import_structure, extra_objects={'COMPILED': COMPILED})
|
||||
|
||||
@@ -24,7 +24,7 @@ def _mark_deprecated(fn: t.Callable[P, t.Any]) -> t.Callable[P, t.Any]:
|
||||
|
||||
@_mark_deprecated
|
||||
def Runner(model_name: str,
|
||||
ensure_available: bool = False,
|
||||
ensure_available: bool = True,
|
||||
init_local: bool = False,
|
||||
backend: LiteralBackend | None = None,
|
||||
llm_config: LLMConfig | None = None,
|
||||
@@ -48,7 +48,8 @@ def Runner(model_name: str,
|
||||
Args:
|
||||
model_name: Supported model name from 'openllm models'
|
||||
ensure_available: If True, it will download the model if it is not available. If False, it will skip downloading the model.
|
||||
If False, make sure the model is available locally.
|
||||
If False, make sure the model is available locally. Defaults to True, and openllm.LLM will always check if models
are available locally based on the generated tag.
|
||||
backend: The Runner implementation to use for this Runner. If `OPENLLM_BACKEND` is set, it will respect it.
|
||||
llm_config: Optional ``openllm.LLMConfig`` to initialise this ``openllm.LLMRunner``.
|
||||
init_local: If True, it will initialize the model locally. This is useful if you want to run the model locally. (Symmetrical to bentoml.Runner.init_local())
|
||||
@@ -56,7 +57,7 @@ def Runner(model_name: str,
|
||||
'''
|
||||
from ._llm import LLM
|
||||
if llm_config is None: llm_config = openllm.AutoConfig.for_model(model_name)
|
||||
model_id = attrs.get('model_id') or llm_config['env']['model_id_value']
|
||||
model_id = attrs.get('model_id', os.getenv('OPENLLM_MODEL_ID', llm_config['default_id']))
|
||||
_RUNNER_MSG = f'''\
|
||||
Using 'openllm.Runner' is now deprecated. Make sure to switch to the following syntax:
|
||||
|
||||
@@ -73,20 +74,20 @@ def Runner(model_name: str,
|
||||
warnings.warn(_RUNNER_MSG, DeprecationWarning, stacklevel=2)
|
||||
attrs.update({
|
||||
'model_id': model_id,
|
||||
'quantize': llm_config['env']['quantize_value'],
|
||||
'quantize': os.getenv('OPENLLM_QUANTIZE', attrs.get('quantize', None)),
|
||||
'serialisation': first_not_none(attrs.get('serialisation'), os.environ.get('OPENLLM_SERIALIZATION'), default=llm_config['serialisation']),
|
||||
'system_message': first_not_none(os.environ.get('OPENLLM_SYSTEM_MESSAGE'), attrs.get('system_message'), None),
|
||||
'prompt_template': first_not_none(os.environ.get('OPENLLM_PROMPT_TEMPLATE'), attrs.get('prompt_template'), None),
|
||||
})
|
||||
|
||||
backend = t.cast(LiteralBackend, first_not_none(backend, default='vllm' if is_vllm_available() else 'pt'))
|
||||
if init_local: ensure_available = True
|
||||
llm = LLM[t.Any, t.Any](backend=backend, llm_config=llm_config, **attrs)
|
||||
if ensure_available: llm.save_pretrained()
|
||||
if init_local: llm.runner.init_local(quiet=True)
|
||||
return llm.runner
|
||||
|
||||
_DEPRECATED = {k: v for k, v in locals().items() if getattr(v, '__deprecated__', False)}
|
||||
|
||||
__all__ = list(_DEPRECATED)
|
||||
|
||||
def __dir__() -> list[str]:
|
||||
return sorted(_DEPRECATED.keys())
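For callers hitting the deprecation warning above, the shim still returns a runner but now resolves its options from plain `OPENLLM_*` environment variables and keyword arguments; a hedged sketch of the old call next to roughly what it does internally after this change (model id and kwargs are illustrative):

```python
import typing as t
import openllm

# Deprecated path -- still returns a bentoml runner, but emits a DeprecationWarning:
runner = openllm.Runner('opt', model_id='facebook/opt-125m', init_local=True)

# Roughly what the shim does under the hood after this refactor:
llm = openllm.LLM[t.Any, t.Any](model_id='facebook/opt-125m', backend='pt')
llm.save_pretrained()          # make sure the weights are in the local store
runner = llm.runner            # the runner object the old helper used to return
runner.init_local(quiet=True)  # only when init_local=True was requested
```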
@@ -1,5 +1,6 @@
|
||||
# mypy: disable-error-code="name-defined,attr-defined"
|
||||
from __future__ import annotations
|
||||
import abc
|
||||
import logging
|
||||
import os
|
||||
import types
|
||||
@@ -26,8 +27,6 @@ from openllm_core._typing_compat import DictStrAny
|
||||
from openllm_core._typing_compat import LiteralBackend
|
||||
from openllm_core._typing_compat import LiteralQuantise
|
||||
from openllm_core._typing_compat import LiteralSerialisation
|
||||
from openllm_core._typing_compat import LLMRunnable
|
||||
from openllm_core._typing_compat import LLMRunner
|
||||
from openllm_core._typing_compat import M
|
||||
from openllm_core._typing_compat import ParamSpec
|
||||
from openllm_core._typing_compat import T
|
||||
@@ -60,6 +59,10 @@ if t.TYPE_CHECKING:
|
||||
import torch
|
||||
import transformers
|
||||
|
||||
from bentoml._internal.runner.runnable import RunnableMethod
|
||||
from bentoml._internal.runner.runner import RunnerMethod
|
||||
from bentoml._internal.runner.runner_handle import RunnerHandle
|
||||
from bentoml._internal.runner.strategy import Strategy
|
||||
from openllm_core._configuration import LLMConfig
|
||||
from openllm_core.utils.representation import ReprArgs
|
||||
|
||||
@@ -191,6 +194,13 @@ class LLM(t.Generic[M, T]):
|
||||
llm_config__=llm_config,
|
||||
llm_trust_remote_code__=trust_remote_code)
|
||||
|
||||
try:
|
||||
model = bentoml.models.get(self.tag)
|
||||
except bentoml.exceptions.NotFound:
|
||||
model = openllm.serialisation.import_model(self, trust_remote_code=self.trust_remote_code)
|
||||
# resolve the tag
|
||||
self._tag = model.tag
|
||||
|
||||
@apply(lambda val: tuple(str.lower(i) if i else i for i in val))
|
||||
def _make_tag_components(self, model_id: str, model_version: str | None, backend: LiteralBackend) -> tuple[str, str | None]:
|
||||
"""Return a valid tag name (<backend>-<repo>--<model_id>) and its tag version."""
|
||||
@@ -230,7 +240,6 @@ class LLM(t.Generic[M, T]):
|
||||
elif self._quantise is not None:self.__llm_quantization_config__,self._model_attrs=infer_quantisation_config(self, self._quantise, **self._model_attrs)
|
||||
else:raise ValueError("Either 'quantization_config' or 'quantise' must be specified.")
|
||||
return self.__llm_quantization_config__
|
||||
def save_pretrained(self)->bentoml.Model:return openllm.import_model(self.config['start_name'], model_id=self.model_id, model_version=self._revision, backend=self.__llm_backend__, quantize=self._quantise)
|
||||
@property
|
||||
def has_adapters(self)->bool:return self._adapter_map is not None
|
||||
# NOTE: The section below defines a loose contract with langchain's LLM interface.
|
||||
@@ -397,7 +406,6 @@ def _RunnerFactory(self: openllm.LLM[M, T],
|
||||
yield 'llm_type', self.llm_type
|
||||
yield 'backend', backend
|
||||
yield 'llm_tag', self.tag
|
||||
def _get_adapter_map(_: LLMRunner[M, T]) -> ResolvedAdapterMap: return converter.unstructure(self.adapter_map)
|
||||
# yapf: enable
|
||||
|
||||
return types.new_class(self.__class__.__name__ + 'Runner', (bentoml.Runner,),
|
||||
@@ -408,9 +416,8 @@ def _RunnerFactory(self: openllm.LLM[M, T],
|
||||
'llm': self,
|
||||
'config': self.config,
|
||||
'backend': backend,
|
||||
'download_model': self.save_pretrained,
|
||||
'__module__': self.__module__,
|
||||
'__doc__': self.config['env'].start_docstring,
|
||||
'__doc__': getattr(openllm_core.config, f'START_{self.config["model_name"].upper()}_COMMAND_DOCSTRING'),
|
||||
'__repr__': ReprMixin.__repr__,
|
||||
'__repr_keys__': property(_wrapped_repr_keys),
|
||||
'__repr_args__': _wrapped_repr_args,
|
||||
@@ -427,4 +434,49 @@ def _RunnerFactory(self: openllm.LLM[M, T],
|
||||
runnable_init_params=dict(llm=self),
|
||||
method_configs=converter.unstructure({'generate_iterator': ModelSignature(batchable=False)}))
|
||||
|
||||
@t.final
|
||||
class LLMRunnable(bentoml.Runnable, t.Generic[M, T]):
|
||||
SUPPORTED_RESOURCES = ('amd.com/gpu', 'nvidia.com/gpu', 'cpu')
|
||||
SUPPORTS_CPU_MULTI_THREADING = True
|
||||
generate_iterator: RunnableMethod[LLMRunnable[M, T], [list[int], str, str | t.Iterable[str] | None, str | None], str]
|
||||
|
||||
@t.final
|
||||
class LLMRunner(t.Protocol[M, T]):
|
||||
__doc__: str
|
||||
__module__: str
|
||||
llm_type: str
|
||||
llm_tag: bentoml.Tag
|
||||
identifying_params: dict[str, t.Any]
|
||||
llm: openllm.LLM[M, T]
|
||||
config: openllm.LLMConfig
|
||||
backend: LiteralBackend
|
||||
has_adapters: bool
|
||||
system_message: str | None
|
||||
prompt_template: str | None
|
||||
generate_iterator: RunnerMethod[LLMRunnable[M, T], [list[int], str, str | t.Iterable[str] | None, str | None], str]
|
||||
|
||||
runner_methods: list[RunnerMethod[t.Any, t.Any, t.Any]]
|
||||
scheduling_strategy: type[Strategy]
|
||||
workers_per_resource: int | float
|
||||
runnable_init_params: dict[str, t.Any]
|
||||
_runner_handle: RunnerHandle
|
||||
|
||||
def __init__(self,
|
||||
runnable_class: type[LLMRunnable[M, T]],
|
||||
*,
|
||||
runnable_init_params: dict[str, t.Any] | None = ...,
|
||||
name: str | None = ...,
|
||||
scheduling_strategy: type[Strategy] = ...,
|
||||
models: list[bentoml.Model] | None = ...,
|
||||
max_batch_size: int | None = ...,
|
||||
max_latency_ms: int | None = ...,
|
||||
method_configs: dict[str, dict[str, int]] | None = ...,
|
||||
embedded: bool = False) -> None:
|
||||
...
|
||||
|
||||
@property
|
||||
@abc.abstractmethod
|
||||
def __repr_keys__(self) -> set[str]:
|
||||
...
|
||||
|
||||
__all__ = ['LLMRunner', 'LLMRunnable', 'LLM']
|
||||
|
||||
@@ -15,14 +15,14 @@ from bentoml.io import Text
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
llm_config = openllm.AutoConfig.for_model(svars.model)
|
||||
llm = openllm.LLM[t.Any, t.Any](svars.model_id,
|
||||
llm_config=llm_config,
|
||||
model_tag=svars.model_tag,
|
||||
prompt_template=openllm.utils.first_not_none(os.getenv('OPENLLM_PROMPT_TEMPLATE'), getattr(llm_config, 'default_prompt_template', None)),
|
||||
system_message=openllm.utils.first_not_none(os.getenv('OPENLLM_SYSTEM_MESSAGE'), getattr(llm_config, 'default_system_message', None)),
|
||||
serialisation=openllm.utils.first_not_none(os.getenv('OPENLLM_SERIALIZATION'), default=llm_config['serialisation']),
|
||||
adapter_map=orjson.loads(svars.adapter_map))
|
||||
prompt_template=openllm.utils.first_not_none(os.getenv('OPENLLM_PROMPT_TEMPLATE'), None),
|
||||
system_message=openllm.utils.first_not_none(os.getenv('OPENLLM_SYSTEM_MESSAGE'), None),
|
||||
serialisation=openllm.utils.first_not_none(os.getenv('OPENLLM_SERIALIZATION'), 'safetensors'),
|
||||
adapter_map=orjson.loads(svars.adapter_map),
|
||||
trust_remote_code=openllm.utils.check_bool_env('TRUST_REMOTE_CODE', default=False))
|
||||
llm_config = llm.config
|
||||
svc = bentoml.Service(name=f"llm-{llm_config['start_name']}-service", runners=[llm.runner])
|
||||
|
||||
llm_model_class = openllm.GenerationInput.from_llm_config(llm_config)
|
||||
@@ -41,7 +41,7 @@ _Metadata = openllm.MetadataOutput(timeout=llm_config['timeout'],
|
||||
model_name=llm_config['model_name'],
|
||||
backend=llm.__llm_backend__,
|
||||
model_id=llm.model_id,
|
||||
configuration=llm_config.model_dump_json().decode(),
|
||||
configuration=llm_config.model_dump_json(flatten=True).decode(),
|
||||
prompt_template=llm.runner.prompt_template,
|
||||
system_message=llm.runner.system_message)
|
||||
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
from __future__ import annotations
|
||||
import os
|
||||
|
||||
model = os.environ['OPENLLM_MODEL'] # openllm: model name
|
||||
model_id = os.environ['OPENLLM_MODEL_ID'] # openllm: model name
|
||||
model_tag = None # openllm: model tag
|
||||
adapter_map = os.environ['OPENLLM_ADAPTER_MAP'] # openllm: model adapter map
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
from __future__ import annotations
|
||||
|
||||
model = '{__model_name__}' # openllm: model name
|
||||
model_id = '{__model_id__}' # openllm: model id
|
||||
model_tag = '{__model_tag__}' # openllm: model tag
|
||||
adapter_map = '''{__model_adapter_map__}''' # openllm: model adapter map
|
||||
|
||||
@@ -45,7 +45,7 @@ OPENLLM_DEV_BUILD = 'OPENLLM_DEV_BUILD'
|
||||
|
||||
def build_editable(path: str, package: t.Literal['openllm', 'openllm_core', 'openllm_client'] = 'openllm') -> str | None:
|
||||
"""Build OpenLLM if the OPENLLM_DEV_BUILD environment variable is set."""
|
||||
if str(os.environ.get(OPENLLM_DEV_BUILD, False)).lower() != 'true': return None
|
||||
if not openllm_core.utils.check_bool_env(OPENLLM_DEV_BUILD, default=False): return None
|
||||
# We need to build the package in editable mode, so that we can import it
|
||||
from build import ProjectBuilder
|
||||
from build.env import IsolatedEnvBuilder
|
||||
@@ -77,8 +77,6 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any], llm_fs: FS, extra_d
|
||||
if str(os.environ.get('BENTOML_BUNDLE_LOCAL_BUILD', False)).lower() == 'false':
|
||||
packages.append(f"bentoml>={'.'.join([str(i) for i in openllm_core.utils.pkg.pkg_version_info('bentoml')])}")
|
||||
|
||||
env = llm.config['env']
|
||||
env['backend_value']
|
||||
if not openllm_core.utils.is_torch_available():
|
||||
raise ValueError('PyTorch is not available. Make sure to have it locally installed.')
|
||||
packages.extend(['torch==2.0.1+cu118', 'vllm==0.2.1.post1', 'xformers==0.0.22', 'bentoml[tracing]==1.1.9']) # XXX: Currently locking this for correctness
|
||||
@@ -95,10 +93,9 @@ def construct_docker_options(llm: openllm.LLM[t.Any, t.Any], _: FS, quantize: Li
|
||||
serialisation: LiteralSerialisation, container_registry: LiteralContainerRegistry, container_version_strategy: LiteralContainerVersionStrategy) -> DockerOptions:
|
||||
from openllm.cli._factory import parse_config_options
|
||||
environ = parse_config_options(llm.config, llm.config['timeout'], 1.0, None, True, os.environ.copy())
|
||||
env: openllm_core.utils.EnvVarMixin = llm.config['env']
|
||||
env_dict = {
|
||||
env.backend: env['backend_value'],
|
||||
env.config: f"'{llm.config.model_dump_json().decode()}'",
|
||||
'OPENLLM_BACKEND': llm.__llm_backend__,
|
||||
'OPENLLM_CONFIG': f"'{llm.config.model_dump_json(flatten=True).decode()}'",
|
||||
'OPENLLM_SERIALIZATION': serialisation,
|
||||
'BENTOML_DEBUG': str(True),
|
||||
'BENTOML_QUIET': str(False),
|
||||
@@ -107,11 +104,7 @@ def construct_docker_options(llm: openllm.LLM[t.Any, t.Any], _: FS, quantize: Li
|
||||
if adapter_map: env_dict['BITSANDBYTES_NOWELCOME'] = os.environ.get('BITSANDBYTES_NOWELCOME', '1')
|
||||
if llm._system_message: env_dict['OPENLLM_SYSTEM_MESSAGE'] = repr(llm._system_message)
|
||||
if llm._prompt_template: env_dict['OPENLLM_PROMPT_TEMPLATE'] = repr(llm._prompt_template.to_string())
|
||||
|
||||
# We need to handle None separately here, as env from subprocess doesn't accept None value.
|
||||
_env = openllm_core.utils.EnvVarMixin(llm.config['model_name'], quantize=quantize)
|
||||
|
||||
if _env['quantize_value'] is not None: env_dict[_env.quantize] = t.cast(str, _env['quantize_value'])
|
||||
if quantize: env_dict['OPENLLM_QUANTISE'] = str(quantize)
|
||||
return DockerOptions(base_image=f'{oci.CONTAINER_NAMES[container_registry]}:{oci.get_base_container_tag(container_version_strategy)}', env=env_dict, dockerfile_template=dockerfile_template)
|
||||
|
||||
OPENLLM_MODEL_NAME = '# openllm: model name'
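The `# openllm: ...` trailing comments in the two service-variable templates above appear to act as line markers that the bundler rewrites when it generates the final service files. A purely hypothetical sketch of that kind of marker substitution (the real `write_service` logic may differ):

```python
# Hypothetical marker-based substitution -- illustrative only.
OPENLLM_MODEL_ID = '# openllm: model id'

def render_line(line: str, marker: str, value: str) -> str:
  # swap the assignment value on any line tagged with the given marker comment
  if line.rstrip().endswith(marker):
    name = line.split('=', 1)[0].strip()
    return f"{name} = '{value}'  {marker}"
  return line

print(render_line("model_id = '{__model_id__}' # openllm: model id", OPENLLM_MODEL_ID, 'facebook/opt-125m'))
# -> model_id = 'facebook/opt-125m'  # openllm: model id
```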
@@ -179,7 +172,6 @@ def write_service(llm: openllm.LLM[t.Any, t.Any], adapter_map: dict[str, str] |
|
||||
def create_bento(bento_tag: bentoml.Tag,
|
||||
llm_fs: FS,
|
||||
llm: openllm.LLM[t.Any, t.Any],
|
||||
workers_per_resource: str | float,
|
||||
quantize: LiteralString | None,
|
||||
dockerfile_template: str | None,
|
||||
adapter_map: dict[str, str] | None = None,
|
||||
@@ -191,26 +183,9 @@ def create_bento(bento_tag: bentoml.Tag,
|
||||
_model_store: ModelStore = Provide[BentoMLContainer.model_store]) -> bentoml.Bento:
|
||||
_serialisation: LiteralSerialisation = openllm_core.utils.first_not_none(serialisation, default=llm.config['serialisation'])
|
||||
labels = dict(llm.identifying_params)
|
||||
labels.update({
|
||||
'_type': llm.llm_type,
|
||||
'_framework': llm.config['env']['backend_value'],
|
||||
'start_name': llm.config['start_name'],
|
||||
'base_name_or_path': llm.model_id,
|
||||
'bundler': 'openllm.bundle'
|
||||
})
|
||||
labels.update({'_type': llm.llm_type, '_framework': llm.__llm_backend__, 'start_name': llm.config['start_name'], 'base_name_or_path': llm.model_id, 'bundler': 'openllm.bundle'})
|
||||
if adapter_map: labels.update(adapter_map)
|
||||
if isinstance(workers_per_resource, str):
|
||||
if workers_per_resource == 'round_robin': workers_per_resource = 1.0
|
||||
elif workers_per_resource == 'conserved':
|
||||
workers_per_resource = 1.0 if openllm.utils.device_count() == 0 else float(1 / openllm.utils.device_count())
|
||||
else:
|
||||
try:
|
||||
workers_per_resource = float(workers_per_resource)
|
||||
except ValueError:
|
||||
raise ValueError("'workers_per_resource' only accept ['round_robin', 'conserved'] as possible strategies.") from None
|
||||
elif isinstance(workers_per_resource, int):
|
||||
workers_per_resource = float(workers_per_resource)
|
||||
logger.info("Building Bento for '%s'", llm.config['start_name'])
|
||||
logger.debug("Building Bento '%s' with model backend '%s'", bento_tag, llm.__llm_backend__)
|
||||
# add service.py definition to this temporary folder
|
||||
write_service(llm, adapter_map, llm_fs)
|
||||
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
from __future__ import annotations
|
||||
import functools
|
||||
import importlib.util
|
||||
import logging
|
||||
import os
|
||||
import typing as t
|
||||
@@ -8,37 +7,32 @@ import typing as t
|
||||
import click
|
||||
import click_option_group as cog
|
||||
import inflection
|
||||
import orjson
|
||||
|
||||
from bentoml_cli.utils import BentoMLCommandGroup
|
||||
from click import ClickException
|
||||
from click import shell_completion as sc
|
||||
from click.shell_completion import CompletionItem
|
||||
|
||||
import bentoml
|
||||
import openllm
|
||||
import openllm_core
|
||||
|
||||
from bentoml._internal.configuration.containers import BentoMLContainer
|
||||
from openllm_core._configuration import LLMConfig
|
||||
from openllm_core._typing_compat import Concatenate
|
||||
from openllm_core._typing_compat import DictStrAny
|
||||
from openllm_core._typing_compat import LiteralBackend
|
||||
from openllm_core._typing_compat import LiteralQuantise
|
||||
from openllm_core._typing_compat import LiteralSerialisation
|
||||
from openllm_core._typing_compat import LiteralString
|
||||
from openllm_core._typing_compat import ParamSpec
|
||||
from openllm_core._typing_compat import get_literal_args
|
||||
from openllm_core.utils import DEBUG
|
||||
from openllm_core.utils import check_bool_env
|
||||
from openllm_core.utils import first_not_none
|
||||
from openllm_core.utils import is_vllm_available
|
||||
|
||||
from . import termui
|
||||
class _OpenLLM_GenericInternalConfig(LLMConfig):
|
||||
__config__ = {'name_type': 'lowercase', 'default_id': 'openllm/generic', 'model_ids': ['openllm/generic'], 'architecture': 'PreTrainedModel'}
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
import subprocess
|
||||
|
||||
from openllm_core._configuration import LLMConfig
|
||||
class GenerationConfig:
|
||||
top_k: int = 15
|
||||
top_p: float = 0.9
|
||||
temperature: float = 0.75
|
||||
max_new_tokens: int = 128
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -91,146 +85,12 @@ def _id_callback(ctx: click.Context, _: click.Parameter, value: t.Tuple[str, ...
|
||||
ctx.params[_adapter_mapping_key][adapter_id] = adapter_name[0]
|
||||
return None
|
||||
|
||||
def start_command_factory(group: click.Group, model: str, _context_settings: DictStrAny | None = None, _serve_grpc: bool = False) -> click.Command:
|
||||
llm_config = openllm.AutoConfig.for_model(model)
|
||||
command_attrs: DictStrAny = dict(name=llm_config['model_name'],
|
||||
context_settings=_context_settings or termui.CONTEXT_SETTINGS,
|
||||
short_help=f"Start a LLMServer for '{model}'",
|
||||
aliases=[llm_config['start_name']] if llm_config['name_type'] == 'dasherize' else None,
|
||||
help=f'''\
|
||||
{llm_config['env'].start_docstring}
|
||||
|
||||
\b
|
||||
Note: ``{llm_config['start_name']}`` can also be run with any other model available on HuggingFace
|
||||
or fine-tuned variants as long as it belongs to the architecture generation ``{llm_config['architecture']}`` (trust_remote_code={llm_config['trust_remote_code']}).
|
||||
|
||||
\b
|
||||
For example: One can start [Fastchat-T5](https://huggingface.co/lmsys/fastchat-t5-3b-v1.0) with ``openllm start flan-t5``:
|
||||
|
||||
\b
|
||||
$ openllm start flan-t5 --model-id lmsys/fastchat-t5-3b-v1.0
|
||||
|
||||
\b
|
||||
Available official model_id(s): [default: {llm_config['default_id']}]
|
||||
|
||||
\b
|
||||
{orjson.dumps(llm_config['model_ids'], option=orjson.OPT_INDENT_2).decode()}
|
||||
''')
|
||||
|
||||
@group.command(**command_attrs)
|
||||
@start_decorator(llm_config, serve_grpc=_serve_grpc)
|
||||
@click.pass_context
|
||||
def start_cmd(ctx: click.Context, /, server_timeout: int, model_id: str | None, model_version: str | None, system_message: str | None, prompt_template_file: t.IO[t.Any] | None,
|
||||
workers_per_resource: t.Literal['conserved', 'round_robin'] | LiteralString, device: t.Tuple[str, ...], quantize: LiteralQuantise | None, backend: LiteralBackend | None,
|
||||
serialisation: LiteralSerialisation | None, cors: bool, adapter_id: str | None, return_process: bool, **attrs: t.Any) -> LLMConfig | subprocess.Popen[bytes]:
|
||||
_serialisation = t.cast(LiteralSerialisation, first_not_none(serialisation, default=llm_config['serialisation']))
|
||||
if _serialisation == 'safetensors' and quantize is not None and check_bool_env('OPENLLM_SERIALIZATION_WARNING'):
|
||||
termui.echo(
|
||||
f"'--quantize={quantize}' might not work with 'safetensors' serialisation format. To silence this warning, set \"OPENLLM_SERIALIZATION_WARNING=False\"\nNote: You can always fallback to '--serialisation legacy' when running quantisation.",
|
||||
fg='yellow')
|
||||
termui.echo(f"Make sure to check out '{model_id}' repository to see if the weights is in '{_serialisation}' format if unsure.")
|
||||
adapter_map: dict[str, str] | None = attrs.pop(_adapter_mapping_key, None)
|
||||
config, server_attrs = llm_config.model_validate_click(**attrs)
|
||||
server_timeout = first_not_none(server_timeout, default=config['timeout'])
|
||||
server_attrs.update({'working_dir': os.path.dirname(os.path.dirname(__file__)), 'timeout': server_timeout})
|
||||
if _serve_grpc: server_attrs['grpc_protocol_version'] = 'v1'
|
||||
# NOTE: currently, there are no development args in bentoml.Server. To be fixed upstream.
|
||||
development = server_attrs.pop('development')
|
||||
server_attrs.setdefault('production', not development)
|
||||
wpr = first_not_none(workers_per_resource, default=config['workers_per_resource'])
|
||||
|
||||
if isinstance(wpr, str):
|
||||
if wpr == 'round_robin': wpr = 1.0
|
||||
elif wpr == 'conserved':
|
||||
if device and openllm.utils.device_count() == 0:
|
||||
termui.echo('--device will have no effect as there is no GPUs available', fg='yellow')
|
||||
wpr = 1.0
|
||||
else:
|
||||
available_gpu = len(device) if device else openllm.utils.device_count()
|
||||
wpr = 1.0 if available_gpu == 0 else float(1 / available_gpu)
|
||||
else:
|
||||
wpr = float(wpr)
|
||||
elif isinstance(wpr, int):
|
||||
wpr = float(wpr)
|
||||
|
||||
# Create a new model env to work with the envvar during CLI invocation
|
||||
env = openllm.utils.EnvVarMixin(config['model_name'],
|
||||
backend=openllm_core.utils.first_not_none(backend, default='vllm' if is_vllm_available() else 'pt'),
|
||||
model_id=model_id or config['default_id'],
|
||||
quantize=quantize)
|
||||
requirements = llm_config['requirements']
|
||||
if requirements is not None and len(requirements) > 0:
|
||||
missing_requirements = [i for i in requirements if importlib.util.find_spec(inflection.underscore(i)) is None]
|
||||
if len(missing_requirements) > 0:
|
||||
termui.echo(f'Make sure to have the following dependencies available: {missing_requirements}', fg='yellow')
|
||||
|
||||
# NOTE: This is to set current configuration
|
||||
start_env = os.environ.copy()
|
||||
start_env = parse_config_options(config, server_timeout, wpr, device, cors, start_env)
|
||||
|
||||
prompt_template: str | None = prompt_template_file.read() if prompt_template_file is not None else None
|
||||
start_env.update({
|
||||
'OPENLLM_MODEL': model,
|
||||
'BENTOML_DEBUG': str(openllm.utils.get_debug_mode()),
|
||||
'BENTOML_HOME': os.environ.get('BENTOML_HOME', BentoMLContainer.bentoml_home.get()),
|
||||
'OPENLLM_ADAPTER_MAP': orjson.dumps(adapter_map).decode(),
|
||||
'OPENLLM_SERIALIZATION': _serialisation,
|
||||
env.backend: env['backend_value'],
|
||||
})
|
||||
if env['model_id_value']: start_env[env.model_id] = str(env['model_id_value'])
|
||||
if env['quantize_value']: start_env[env.quantize] = str(env['quantize_value'])
|
||||
if system_message: start_env['OPENLLM_SYSTEM_MESSAGE'] = system_message
|
||||
if prompt_template: start_env['OPENLLM_PROMPT_TEMPLATE'] = prompt_template
|
||||
|
||||
llm = openllm.LLM[t.Any, t.Any](model_id=start_env[env.model_id],
|
||||
revision=model_version,
|
||||
prompt_template=prompt_template,
|
||||
system_message=system_message,
|
||||
llm_config=config,
|
||||
backend=env['backend_value'],
|
||||
adapter_map=adapter_map,
|
||||
quantize=env['quantize_value'],
|
||||
serialisation=_serialisation)
|
||||
llm.save_pretrained() # ensure_available = True
|
||||
start_env.update({env.config: llm.config.model_dump_json().decode()})
|
||||
|
||||
server = bentoml.GrpcServer('_service:svc', **server_attrs) if _serve_grpc else bentoml.HTTPServer('_service:svc', **server_attrs)
|
||||
openllm.utils.analytics.track_start_init(llm.config)
|
||||
|
||||
def next_step(model_name: str, adapter_map: DictStrAny | None) -> None:
|
||||
cmd_name = f'openllm build {model_name}'
|
||||
if not llm._local: cmd_name += f' --model-id {llm.model_id}'
|
||||
if llm._quantise: cmd_name += f' --quantize {llm._quantise}'
|
||||
cmd_name += f' --serialization {_serialisation}'
|
||||
if adapter_map is not None:
|
||||
cmd_name += ' ' + ' '.join([f'--adapter-id {s}' for s in [f'{p}:{name}' if name not in (None, 'default') else p for p, name in adapter_map.items()]])
|
||||
if not openllm.utils.get_quiet_mode():
|
||||
termui.echo(f"\n🚀 Next step: run '{cmd_name}' to create a Bento for {model_name}", fg='blue')
|
||||
|
||||
if return_process:
|
||||
server.start(env=start_env, text=True)
|
||||
if server.process is None: raise click.ClickException('Failed to start the server.')
|
||||
return server.process
|
||||
else:
|
||||
try:
|
||||
server.start(env=start_env, text=True, blocking=True)
|
||||
except Exception as err:
|
||||
termui.echo(f'Error caught while running LLM Server:\n{err}', fg='red')
|
||||
raise
|
||||
else:
|
||||
next_step(model, adapter_map)
|
||||
|
||||
# NOTE: Return the configuration for telemetry purposes.
|
||||
return config
|
||||
|
||||
return start_cmd
|
||||
|
||||
def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callable[[FC], t.Callable[[FC], FC]]:
|
||||
def start_decorator(serve_grpc: bool = False) -> t.Callable[[FC], t.Callable[[FC], FC]]:
|
||||
def wrapper(fn: FC) -> t.Callable[[FC], FC]:
|
||||
composed = openllm.utils.compose(
|
||||
llm_config.to_click_options, _http_server_args if not serve_grpc else _grpc_server_args,
|
||||
cog.optgroup.group('General LLM Options', help=f"The following options are related to running '{llm_config['start_name']}' LLM Server."), model_id_option(factory=cog.optgroup),
|
||||
model_version_option(factory=cog.optgroup), system_message_option(factory=cog.optgroup), prompt_template_file_option(factory=cog.optgroup),
|
||||
_OpenLLM_GenericInternalConfig().to_click_options, _http_server_args if not serve_grpc else _grpc_server_args,
|
||||
cog.optgroup.group('General LLM Options', help='The following options are related to running LLM Server.'), model_version_option(factory=cog.optgroup),
|
||||
system_message_option(factory=cog.optgroup), prompt_template_file_option(factory=cog.optgroup),
|
||||
cog.optgroup.option('--server-timeout', type=int, default=None, help='Server timeout in seconds'), workers_per_resource_option(factory=cog.optgroup), cors_option(factory=cog.optgroup),
|
||||
backend_option(factory=cog.optgroup),
|
||||
cog.optgroup.group('LLM Optimization Options',
|
||||
@@ -248,7 +108,7 @@ def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callab
|
||||
multiple=True,
|
||||
envvar='CUDA_VISIBLE_DEVICES',
|
||||
callback=parse_device_callback,
|
||||
help=f"Assign GPU devices (if available) for {llm_config['model_name']}.",
|
||||
help='Assign GPU devices (if available)',
|
||||
show_envvar=True),
|
||||
cog.optgroup.group('Fine-tuning related options',
|
||||
help='''\
|
||||
@@ -268,7 +128,7 @@ def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callab
|
||||
'''),
|
||||
cog.optgroup.option('--adapter-id',
|
||||
default=None,
|
||||
help='Optional name or path for given LoRA adapter' + f" to wrap '{llm_config['model_name']}'",
|
||||
help='Optional name or path for given LoRA adapter',
|
||||
multiple=True,
|
||||
callback=_id_callback,
|
||||
metavar='[PATH | [remote/][adapter_name:]adapter_id][, ...]'), click.option('--return-process', is_flag=True, default=False, help='Internal use only.',
|
||||
@@ -341,24 +201,6 @@ def _click_factory_type(*param_decls: t.Any, **attrs: t.Any) -> t.Callable[[FC |
|
||||
cli_option = functools.partial(_click_factory_type, attr='option')
|
||||
cli_argument = functools.partial(_click_factory_type, attr='argument')
|
||||
|
||||
def output_option(f: _AnyCallable | None = None, *, default_value: LiteralOutput = 'pretty', **attrs: t.Any) -> t.Callable[[FC], FC]:
|
||||
output = ['json', 'pretty', 'porcelain']
|
||||
|
||||
def complete_output_var(ctx: click.Context, param: click.Parameter, incomplete: str) -> list[CompletionItem]:
|
||||
return [CompletionItem(it) for it in output]
|
||||
|
||||
return cli_option('-o',
|
||||
'--output',
|
||||
'output',
|
||||
type=click.Choice(output),
|
||||
default=default_value,
|
||||
help='Showing output type.',
|
||||
show_default=True,
|
||||
envvar='OPENLLM_OUTPUT',
|
||||
show_envvar=True,
|
||||
shell_complete=complete_output_var,
|
||||
**attrs)(f)
|
||||
|
||||
def cors_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
|
||||
return cli_option('--cors/--no-cors', show_default=True, default=False, envvar='OPENLLM_CORS', show_envvar=True, help='Enable CORS for the server.', **attrs)(f)
|
||||
|
||||
@@ -450,37 +292,23 @@ def serialisation_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Cal
|
||||
return cli_option('--serialisation',
|
||||
'--serialization',
|
||||
'serialisation',
|
||||
type=str,
|
||||
type=click.Choice(get_literal_args(LiteralSerialisation)),
|
||||
default=None,
|
||||
show_default=True,
|
||||
show_envvar=True,
|
||||
envvar='OPENLLM_SERIALIZATION',
|
||||
callback=serialisation_callback,
|
||||
help='''Serialisation format for save/load LLM.
|
||||
|
||||
Currently the following strategies are supported:
|
||||
|
||||
- ``safetensors``: This will use safetensors format, which is synonymous to
|
||||
- ``safetensors``: This will use safetensors format, which is synonymous to ``safe_serialization=True``.
|
||||
|
||||
\b
|
||||
``safe_serialization=True``.
|
||||
|
||||
\b
|
||||
> [!NOTE] that this format might not work for every cases, and
|
||||
you can always fallback to ``legacy`` if needed.
|
||||
> [!NOTE] Safetensors might not work in every case, and you can always fall back to ``legacy`` if needed.
|
||||
|
||||
- ``legacy``: This will use PyTorch serialisation format, often as ``.bin`` files. This should be used if the model doesn't yet support safetensors.
|
||||
|
||||
> [!NOTE] GGML format support is a work in progress.
|
||||
''',
|
||||
**attrs)(f)
|
||||
|
||||
def serialisation_callback(ctx: click.Context, param: click.Parameter, value: LiteralSerialisation | None) -> LiteralSerialisation | None:
|
||||
if value is None: return value
|
||||
if value not in {'safetensors', 'legacy'}:
|
||||
raise click.BadParameter(f"'serialisation' only accept 'safetensors', 'legacy' as serialisation format. got {value} instead.", ctx, param) from None
|
||||
return value
|
||||
|
||||
def container_registry_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
|
||||
return cli_option('--container-registry',
|
||||
'container_registry',
|
||||
|
||||
@@ -7,20 +7,21 @@ import subprocess
|
||||
import sys
|
||||
import typing as t
|
||||
|
||||
import orjson
|
||||
|
||||
from simple_di import Provide
|
||||
from simple_di import inject
|
||||
|
||||
import bentoml
|
||||
import openllm
|
||||
import openllm_core
|
||||
|
||||
from bentoml._internal.configuration.containers import BentoMLContainer
|
||||
from openllm.exceptions import OpenLLMException
|
||||
from openllm_core._typing_compat import LiteralSerialisation
|
||||
from openllm_core.exceptions import OpenLLMException
|
||||
from openllm_core.utils import codegen
|
||||
from openllm_core.utils import first_not_none
|
||||
from openllm_core.utils import is_vllm_available
|
||||
|
||||
from . import termui
|
||||
from ._factory import start_command_factory
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
from bentoml._internal.bento import BentoStore
|
||||
from openllm_core._configuration import LLMConfig
|
||||
@@ -28,15 +29,11 @@ if t.TYPE_CHECKING:
|
||||
from openllm_core._typing_compat import LiteralContainerRegistry
|
||||
from openllm_core._typing_compat import LiteralContainerVersionStrategy
|
||||
from openllm_core._typing_compat import LiteralQuantise
|
||||
from openllm_core._typing_compat import LiteralSerialisation
|
||||
from openllm_core._typing_compat import LiteralString
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def _start(model_name: str,
|
||||
/,
|
||||
*,
|
||||
model_id: str | None = None,
|
||||
def _start(model_id: str,
|
||||
timeout: int = 30,
|
||||
workers_per_resource: t.Literal['conserved', 'round_robin'] | float | None = None,
|
||||
device: tuple[str, ...] | t.Literal['all'] | None = None,
|
||||
@@ -61,8 +58,7 @@ def _start(model_name: str,
|
||||
``openllm.start`` will invoke ``click.Command`` under the hood, so it behaves exactly the same as the CLI interaction.
|
||||
|
||||
Args:
|
||||
model_name: The model name to start this LLM
|
||||
model_id: Optional model id for this given LLM
|
||||
model_id: The model id to start this LLMServer
|
||||
timeout: The server timeout
|
||||
system_message: Optional system message for supported LLMs. If given LLM supports system message, OpenLLM will provide a default system message.
|
||||
prompt_template_file: Optional file path containing a user-defined custom prompt template. By default, the prompt template for the specified LLM will be used.
|
||||
@@ -91,8 +87,7 @@ def _start(model_name: str,
|
||||
from .entrypoint import start_grpc_command
|
||||
os.environ['OPENLLM_BACKEND'] = openllm_core.utils.first_not_none(backend, default='vllm' if is_vllm_available() else 'pt')
|
||||
|
||||
args: list[str] = []
|
||||
if model_id: args.extend(['--model-id', model_id])
|
||||
args: list[str] = [model_id]
|
||||
if system_message: args.extend(['--system-message', system_message])
|
||||
if prompt_template_file: args.extend(['--prompt-template-file', openllm_core.utils.resolve_filepath(prompt_template_file)])
|
||||
if timeout: args.extend(['--server-timeout', str(timeout)])
|
||||
@@ -106,14 +101,11 @@ def _start(model_name: str,
|
||||
if additional_args: args.extend(additional_args)
|
||||
if __test__: args.append('--return-process')
|
||||
|
||||
return start_command_factory(start_command if not _serve_grpc else start_grpc_command, model_name, _context_settings=termui.CONTEXT_SETTINGS,
|
||||
_serve_grpc=_serve_grpc).main(args=args if len(args) > 0 else None, standalone_mode=False)
|
||||
cmd = start_command if not _serve_grpc else start_grpc_command
|
||||
return cmd.main(args=args, standalone_mode=False)
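Since `_start` now takes the model id positionally, the Python SDK mirrors the new `openllm start <model-id>` CLI shape; a hedged usage sketch (model id and timeout are illustrative):

```python
import openllm

# Equivalent to `openllm start facebook/opt-125m --server-timeout 3600` on the CLI.
openllm.start('facebook/opt-125m', timeout=3600)

# gRPC flavour of the same entrypoint.
openllm.start_grpc('facebook/opt-125m')
```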
@inject
|
||||
def _build(model_name: str,
|
||||
/,
|
||||
*,
|
||||
model_id: str | None = None,
|
||||
def _build(model_id: str,
|
||||
model_version: str | None = None,
|
||||
bento_version: str | None = None,
|
||||
quantize: LiteralQuantise | None = None,
|
||||
@@ -122,17 +114,17 @@ def _build(model_name: str,
|
||||
prompt_template_file: str | None = None,
|
||||
build_ctx: str | None = None,
|
||||
enable_features: tuple[str, ...] | None = None,
|
||||
workers_per_resource: float | None = None,
|
||||
dockerfile_template: str | None = None,
|
||||
overwrite: bool = False,
|
||||
container_registry: LiteralContainerRegistry | None = None,
|
||||
container_version_strategy: LiteralContainerVersionStrategy | None = None,
|
||||
push: bool = False,
|
||||
force_push: bool = False,
|
||||
containerize: bool = False,
|
||||
serialisation: LiteralSerialisation | None = None,
|
||||
additional_args: list[str] | None = None,
|
||||
bento_store: BentoStore = Provide[BentoMLContainer.bento_store]) -> bentoml.Bento:
|
||||
"""Package a LLM into a Bento.
|
||||
"""Package a LLM into a BentoLLM.
|
||||
|
||||
The LLM will be built into a BentoService with the following structure:
|
||||
if ``quantize`` is passed, it will instruct the model to be quantized dynamically during serving time.
|
||||
@@ -140,8 +132,7 @@ def _build(model_name: str,
|
||||
``openllm.build`` will invoke ``click.Command`` under the hood, so it behaves exactly the same as ``openllm build`` CLI.
|
||||
|
||||
Args:
|
||||
model_name: The model name to start this LLM
|
||||
model_id: Optional model id for this given LLM
|
||||
model_id: The model id to build this BentoLLM
|
||||
model_version: Optional model version for this given LLM
|
||||
bento_version: Optional bento version for this given BentoLLM
|
||||
system_message: Optional system message for supported LLMs. If given LLM supports system message, OpenLLM will provide a default system message.
|
||||
@@ -154,15 +145,6 @@ def _build(model_name: str,
|
||||
adapter_map: The adapter mapping of LoRA to use for this LLM. It accepts a dictionary of ``{adapter_id: adapter_name}``.
|
||||
build_ctx: The build context to use for building BentoLLM. By default, it sets to current directory.
|
||||
enable_features: Additional OpenLLM features to be included with this BentoLLM.
|
||||
workers_per_resource: Number of workers per resource assigned.
|
||||
See [resource scheduling](https://docs.bentoml.org/en/latest/guides/scheduling.html#resource-scheduling-strategy)
|
||||
for more information. By default, this is set to 1.
|
||||
|
||||
> [!NOTE] ``--workers-per-resource`` will also accept the following strategies:
|
||||
> - ``round_robin``: Similar behaviour when setting ``--workers-per-resource 1``. This is useful for smaller models.
|
||||
> - ``conserved``: This will determine the number of available GPU resources, and only assign
|
||||
> one worker for the LLMRunner. For example, if there are 4 GPUs available, then ``conserved`` is
|
||||
> equivalent to ``--workers-per-resource 0.25``.
|
||||
dockerfile_template: The dockerfile template to use for building BentoLLM. See https://docs.bentoml.com/en/latest/guides/containerization.html#dockerfile-template.
|
||||
overwrite: Whether to overwrite the existing BentoLLM. By default, this is set to ``False``.
|
||||
push: Whether to push the result bento to BentoCloud. Make sure to login with 'bentoml cloud login' first.
|
||||
@@ -178,17 +160,17 @@ def _build(model_name: str,
|
||||
Returns:
|
||||
``bentoml.Bento | str``: BentoLLM instance. This can be used to serve the LLM or can be pushed to BentoCloud.
|
||||
"""
|
||||
config = openllm.AutoConfig.for_model(model_name)
|
||||
_serialisation = openllm_core.utils.first_not_none(serialisation, default=config['serialisation'])
|
||||
args: list[str] = [sys.executable, '-m', 'openllm', 'build', model_name, '--machine', '--serialisation', _serialisation]
|
||||
from ..serialisation.transformers.weights import has_safetensors_weights
|
||||
args: list[str] = [
|
||||
sys.executable, '-m', 'openllm', 'build', model_id, '--machine', '--serialisation',
|
||||
t.cast(LiteralSerialisation, first_not_none(serialisation, default='safetensors' if has_safetensors_weights(model_id) else 'legacy'))
|
||||
]
|
||||
if quantize: args.extend(['--quantize', quantize])
|
||||
if containerize and push: raise OpenLLMException("'containerize' and 'push' are currently mutually exclusive.")
|
||||
if push: args.extend(['--push'])
|
||||
if containerize: args.extend(['--containerize'])
|
||||
if model_id: args.extend(['--model-id', model_id])
|
||||
if build_ctx: args.extend(['--build-ctx', build_ctx])
|
||||
if enable_features: args.extend([f'--enable-features={f}' for f in enable_features])
|
||||
if workers_per_resource: args.extend(['--workers-per-resource', str(workers_per_resource)])
|
||||
if overwrite: args.append('--overwrite')
|
||||
if system_message: args.extend(['--system-message', system_message])
|
||||
if prompt_template_file: args.extend(['--prompt-template-file', openllm_core.utils.resolve_filepath(prompt_template_file)])
|
||||
@@ -204,23 +186,24 @@ def _build(model_name: str,
|
||||
try:
|
||||
output = subprocess.check_output(args, env=os.environ.copy(), cwd=build_ctx or os.getcwd())
|
||||
except subprocess.CalledProcessError as e:
|
||||
logger.error('Exception caught while building %s', model_name, exc_info=e)
|
||||
logger.error("Exception caught while building Bento for '%s'", model_id, exc_info=e)
|
||||
if e.stderr: raise OpenLLMException(e.stderr.decode('utf-8')) from None
|
||||
raise OpenLLMException(str(e)) from None
|
||||
matched = re.match(r'__tag__:([^:\n]+:[^:\n]+)$', output.decode('utf-8').strip())
|
||||
matched = re.match(r'__object__:(\{.*\})$', output.decode('utf-8').strip())
|
||||
if matched is None:
|
||||
raise ValueError(f"Failed to find tag from output: {output.decode('utf-8').strip()}\nNote: Output from 'openllm build' might not be correct. Please open an issue on GitHub.")
|
||||
return bentoml.get(matched.group(1), _bento_store=bento_store)
|
||||
try:
|
||||
result = orjson.loads(matched.group(1))
|
||||
except orjson.JSONDecodeError as e:
|
||||
raise ValueError(f"Failed to decode JSON from output: {output.decode('utf-8').strip()}\nNote: Output from 'openllm build' might not be correct. Please open an issue on GitHub.") from e
|
||||
return bentoml.get(result['tag'], _bento_store=bento_store)
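With the machine-readable build output switching from a bare `__tag__:` line to an `__object__:` JSON payload, the SDK wrapper above now returns the built Bento by looking up the decoded tag; a hedged usage sketch (model id and options are illustrative):

```python
import openllm

bento = openllm.build('facebook/opt-125m', quantize='int4', overwrite=True)
print(bento.tag)  # the tag of the freshly built BentoLLM
```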
def _import_model(model_name: str,
|
||||
/,
|
||||
*,
|
||||
model_id: str | None = None,
|
||||
def _import_model(model_id: str,
|
||||
model_version: str | None = None,
|
||||
backend: LiteralBackend | None = None,
|
||||
quantize: LiteralQuantise | None = None,
|
||||
serialisation: t.Literal['legacy', 'safetensors'] | None = None,
|
||||
additional_args: t.Sequence[str] | None = None) -> bentoml.Model:
|
||||
serialisation: LiteralSerialisation | None = None,
|
||||
additional_args: t.Sequence[str] | None = None) -> dict[str, t.Any]:
|
||||
"""Import a LLM into local store.
|
||||
|
||||
> [!NOTE]
|
||||
@@ -228,14 +211,13 @@ def _import_model(model_name: str,
|
||||
> only use this option if you want the weights to be quantized by default. Note that OpenLLM also
|
||||
> supports on-demand quantisation during initial startup.
|
||||
|
||||
``openllm.download`` will invoke ``click.Command`` under the hood, so it behaves exactly the same as the CLI ``openllm import``.
|
||||
``openllm.import_model`` will invoke ``click.Command`` under the hood, so it behaves exactly the same as the CLI ``openllm import``.
|
||||
|
||||
> [!NOTE]
|
||||
> ``openllm.start`` will automatically invoke ``openllm.download`` under the hood.
|
||||
> ``openllm.start`` will automatically invoke ``openllm.import_model`` under the hood.
|
||||
|
||||
Args:
|
||||
model_name: The model name to start this LLM
|
||||
model_id: Optional model id for this given LLM
|
||||
model_id: Required model id for this given LLM
|
||||
model_version: Optional model version for this given LLM
|
||||
backend: The backend to use for this LLM. By default, this is set to ``pt``.
|
||||
quantize: Quantize the model weights. This is only applicable for PyTorch models.
|
||||
@@ -243,29 +225,26 @@ def _import_model(model_name: str,
|
||||
- int8: Quantize the model with 8bit (bitsandbytes required)
|
||||
- int4: Quantize the model with 4bit (bitsandbytes required)
|
||||
- gptq: Quantize the model with GPTQ (auto-gptq required)
|
||||
serialisation: Type of model format to save to local store. If set to 'safetensors', then OpenLLM will save model using safetensors.
|
||||
Default behaviour is similar to ``safe_serialization=False``.
|
||||
serialisation: Type of model format to save to local store. If set to 'safetensors', then OpenLLM will save model using safetensors. Default behaviour is similar to ``safe_serialization=False``.
|
||||
additional_args: Additional arguments to pass to ``openllm import``.
|
||||
|
||||
Returns:
|
||||
``bentoml.Model``: BentoModel of the given LLM. This can be used to serve the LLM or can be pushed to BentoCloud.
|
||||
"""
|
||||
from .entrypoint import import_command
|
||||
config = openllm.AutoConfig.for_model(model_name)
|
||||
_serialisation = openllm_core.utils.first_not_none(serialisation, default=config['serialisation'])
|
||||
args = [model_name, '--machine', '--serialisation', _serialisation]
|
||||
args = [model_id, '--quiet']
|
||||
if backend is not None: args.extend(['--backend', backend])
|
||||
if model_id is not None: args.append(model_id)
|
||||
if model_version is not None: args.extend(['--model-version', str(model_version)])
|
||||
if additional_args is not None: args.extend(additional_args)
|
||||
if quantize is not None: args.extend(['--quantize', quantize])
|
||||
if serialisation is not None: args.extend(['--serialisation', serialisation])
|
||||
if additional_args is not None: args.extend(additional_args)
|
||||
return import_command.main(args=args, standalone_mode=False)
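A hedged sketch of the refactored SDK call; the model id and the explicit serialisation value are illustrative assumptions:

```python
# Hedged usage sketch (model id is illustrative, not taken from this diff).
import openllm

result = openllm.import_model('mistralai/Mistral-7B-v0.1', serialisation='safetensors')
# The command now returns a JSON-friendly mapping roughly shaped like
# ImportModelOutput: {'state': ..., 'backend': ..., 'tag': ...}
print(result)
```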
|
||||
|
||||
def _list_models() -> dict[str, t.Any]:
|
||||
"""List all available models within the local store."""
|
||||
from .entrypoint import models_command
|
||||
return models_command.main(args=['-o', 'json', '--show-available', '--machine'], standalone_mode=False)
|
||||
return models_command.main(args=['--show-available', '--quiet'], standalone_mode=False)
|
||||
|
||||
start, start_grpc, build, import_model, list_models = openllm_core.utils.codegen.gen_sdk(_start, _serve_grpc=False), openllm_core.utils.codegen.gen_sdk(
|
||||
_start, _serve_grpc=True), openllm_core.utils.codegen.gen_sdk(_build), openllm_core.utils.codegen.gen_sdk(_import_model), openllm_core.utils.codegen.gen_sdk(_list_models)
|
||||
start, start_grpc = codegen.gen_sdk(_start, _serve_grpc=False), codegen.gen_sdk(_start, _serve_grpc=True)
|
||||
build, import_model, list_models = codegen.gen_sdk(_build), codegen.gen_sdk(_import_model), codegen.gen_sdk(_list_models)
|
||||
__all__ = ['start', 'start_grpc', 'build', 'import_model', 'list_models']
|
||||
|
||||
@@ -5,29 +5,33 @@ This module also contains the SDK to call ``start`` and ``build`` from SDK
|
||||
Start any LLM:

```python
openllm.start("falcon", model_id='tiiuae/falcon-7b-instruct')
openllm.start('mistral', model_id='mistralai/Mistral-7B-v0.1')
```

Build a BentoLLM

```python
bento = openllm.build("falcon")
bento = openllm.build('mistralai/Mistral-7B-v0.1')
```

Import any LLM into local store
```python
bentomodel = openllm.import_model("falcon", model_id='tiiuae/falcon-7b-instruct')
bentomodel = openllm.import_model('mistralai/Mistral-7B-v0.1')
```
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import enum
|
||||
import functools
|
||||
import importlib.util
|
||||
import inspect
|
||||
import itertools
|
||||
import logging
|
||||
import os
|
||||
import platform
|
||||
import random
|
||||
import subprocess
|
||||
import time
|
||||
import traceback
|
||||
import typing as t
|
||||
|
||||
import attr
|
||||
@@ -57,23 +61,22 @@ from openllm_core._typing_compat import LiteralBackend
|
||||
from openllm_core._typing_compat import LiteralQuantise
|
||||
from openllm_core._typing_compat import LiteralSerialisation
|
||||
from openllm_core._typing_compat import LiteralString
|
||||
from openllm_core._typing_compat import NotRequired
|
||||
from openllm_core._typing_compat import ParamSpec
|
||||
from openllm_core._typing_compat import Self
|
||||
from openllm_core.config import CONFIG_MAPPING
|
||||
from openllm_core.utils import DEBUG_ENV_VAR
|
||||
from openllm_core.utils import OPTIONAL_DEPENDENCIES
|
||||
from openllm_core.utils import QUIET_ENV_VAR
|
||||
from openllm_core.utils import EnvVarMixin
|
||||
from openllm_core.utils import LazyLoader
|
||||
from openllm_core.utils import analytics
|
||||
from openllm_core.utils import check_bool_env
|
||||
from openllm_core.utils import compose
|
||||
from openllm_core.utils import configure_logging
|
||||
from openllm_core.utils import converter
|
||||
from openllm_core.utils import first_not_none
|
||||
from openllm_core.utils import get_debug_mode
|
||||
from openllm_core.utils import get_quiet_mode
|
||||
from openllm_core.utils import is_torch_available
|
||||
from openllm_core.utils import is_vllm_available
|
||||
from openllm_core.utils import resolve_user_filepath
|
||||
from openllm_core.utils import set_debug_mode
|
||||
from openllm_core.utils import set_quiet_mode
|
||||
@@ -85,24 +88,22 @@ from ._factory import _AnyCallable
|
||||
from ._factory import backend_option
|
||||
from ._factory import container_registry_option
|
||||
from ._factory import machine_option
|
||||
from ._factory import model_id_option
|
||||
from ._factory import model_name_argument
|
||||
from ._factory import model_version_option
|
||||
from ._factory import output_option
|
||||
from ._factory import parse_config_options
|
||||
from ._factory import prompt_template_file_option
|
||||
from ._factory import quantize_option
|
||||
from ._factory import serialisation_option
|
||||
from ._factory import start_command_factory
|
||||
from ._factory import start_decorator
|
||||
from ._factory import system_message_option
|
||||
from ._factory import workers_per_resource_option
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
import torch
|
||||
|
||||
from bentoml._internal.bento import BentoStore
|
||||
from bentoml._internal.container import DefaultBuilder
|
||||
from openllm_client._schemas import Response
|
||||
from openllm_client._schemas import StreamingResponse
|
||||
from openllm_core._configuration import LLMConfig
|
||||
from openllm_core._typing_compat import LiteralContainerRegistry
|
||||
from openllm_core._typing_compat import LiteralContainerVersionStrategy
|
||||
else:
|
||||
@@ -134,6 +135,16 @@ _object_setattr = object.__setattr__
|
||||
|
||||
_EXT_FOLDER = os.path.abspath(os.path.join(os.path.dirname(__file__), 'extension'))
|
||||
|
||||
def backend_warning(backend: LiteralBackend):
|
||||
if backend == 'pt' and check_bool_env('OPENLLM_BACKEND_WARNING') and not get_quiet_mode():
|
||||
if openllm.utils.is_vllm_available():
|
||||
termui.warning(
|
||||
'\nvLLM is available, but using PyTorch backend instead. Note that vLLM is a lot more performant and should always be used in production (by explicitly setting --backend vllm).')
|
||||
else:
|
||||
termui.warning('\nvLLM is not available. Note that the PyTorch backend is not as performant as vLLM; you should always consider using vLLM for production.')
|
||||
termui.debug(
|
||||
content="\nTip: if you are running 'openllm build' you can set '--backend vllm' to package your Bento with vLLM backend. To hide these messages, set 'OPENLLM_BACKEND_WARNING=False'\n")
|
||||
|
||||
class Extensions(click.MultiCommand):
|
||||
def list_commands(self, ctx: click.Context) -> list[str]:
|
||||
return sorted([filename[:-3] for filename in os.listdir(_EXT_FOLDER) if filename.endswith('.py') and not filename.startswith('__')])
|
||||
@@ -162,7 +173,7 @@ class OpenLLMCommandGroup(BentoMLCommandGroup):
|
||||
ctx.obj = GlobalOptions(cloud_context=cloud_context)
|
||||
if quiet:
|
||||
set_quiet_mode(True)
|
||||
if debug: logger.warning("'--quiet' passed; ignoring '--verbose/--debug'")
|
||||
if debug: termui.warning("'--quiet' passed; ignoring '--verbose/--debug'")
|
||||
elif debug: set_debug_mode(True)
|
||||
configure_logging()
|
||||
return f(*args, **attrs)
|
||||
@@ -202,21 +213,9 @@ class OpenLLMCommandGroup(BentoMLCommandGroup):
|
||||
if cmd_name in t.cast('Extensions', extension_command).list_commands(ctx):
|
||||
return t.cast('Extensions', extension_command).get_command(ctx, cmd_name)
|
||||
cmd_name = self.resolve_alias(cmd_name)
|
||||
if ctx.command.name in _start_mapping:
|
||||
try:
|
||||
return _start_mapping[ctx.command.name][cmd_name]
|
||||
except KeyError:
|
||||
# TODO: support start from a bento
|
||||
try:
|
||||
bentoml.get(cmd_name)
|
||||
raise click.ClickException(f"'openllm start {cmd_name}' is currently disabled for the time being. Please let us know if you need this feature by opening an issue on GitHub.")
|
||||
except bentoml.exceptions.NotFound:
|
||||
pass
|
||||
raise click.BadArgumentUsage(f'{cmd_name} is not a valid model identifier supported by OpenLLM.') from None
|
||||
return super().get_command(ctx, cmd_name)
|
||||
|
||||
def list_commands(self, ctx: click.Context) -> list[str]:
|
||||
if ctx.command.name in {'start', 'start-grpc'}: return list(CONFIG_MAPPING.keys())
|
||||
return super().list_commands(ctx) + t.cast('Extensions', extension_command).list_commands(ctx)
|
||||
|
||||
def command(self, *args: t.Any, **kwargs: t.Any) -> t.Callable[[t.Callable[..., t.Any]], click.Command]: # type: ignore[override] # XXX: fix decorator on BentoMLCommandGroup
|
||||
@@ -280,10 +279,7 @@ class OpenLLMCommandGroup(BentoMLCommandGroup):
|
||||
formatter.write_dl(rows)
|
||||
|
||||
@click.group(cls=OpenLLMCommandGroup, context_settings=termui.CONTEXT_SETTINGS, name='openllm')
|
||||
@click.version_option(None,
|
||||
'--version',
|
||||
'-v',
|
||||
message=f"%(prog)s, %(version)s (compiled: {'yes' if openllm.COMPILED else 'no'})\nPython ({platform.python_implementation()}) {platform.python_version()}")
|
||||
@click.version_option(None, '--version', '-v', message=f'%(prog)s, %(version)s (compiled: {openllm.COMPILED})\nPython ({platform.python_implementation()}) {platform.python_version()}')
|
||||
def cli() -> None:
|
||||
"""\b
|
||||
██████╗ ██████╗ ███████╗███╗ ██╗██╗ ██╗ ███╗ ███╗
|
||||
@@ -298,52 +294,285 @@ def cli() -> None:
|
||||
Fine-tune, serve, deploy, and monitor any LLMs with ease.
|
||||
"""
|
||||
|
||||
@cli.group(cls=OpenLLMCommandGroup, context_settings=termui.CONTEXT_SETTINGS, name='start', aliases=['start-http'])
|
||||
def start_command() -> None:
|
||||
@cli.command(context_settings=termui.CONTEXT_SETTINGS, name='start', aliases=['start-http'], short_help='Start a LLMServer for any supported LLM.')
|
||||
@click.argument('model_id', type=click.STRING, metavar='[REMOTE_REPO/MODEL_ID | /path/to/local/model]', required=True)
|
||||
@click.option('--model-id',
|
||||
'deprecated_model_id',
|
||||
type=click.STRING,
|
||||
default=None,
|
||||
hidden=True,
|
||||
metavar='[REMOTE_REPO/MODEL_ID | /path/to/local/model]',
|
||||
help='Deprecated. Use positional argument instead.')
|
||||
@start_decorator(serve_grpc=False)
|
||||
def start_command(model_id: str, server_timeout: int, model_version: str | None, system_message: str | None, prompt_template_file: t.IO[t.Any] | None,
|
||||
workers_per_resource: t.Literal['conserved', 'round_robin'] | LiteralString, device: t.Tuple[str, ...], quantize: LiteralQuantise | None, backend: LiteralBackend | None,
|
||||
serialisation: LiteralSerialisation | None, cors: bool, adapter_id: str | None, return_process: bool, deprecated_model_id: str | None,
|
||||
**attrs: t.Any) -> LLMConfig | subprocess.Popen[bytes]:
|
||||
"""Start any LLM as a REST server.
|
||||
|
||||
\b
|
||||
```bash
|
||||
$ openllm <start|start-http> <model_name> --<options> ...
|
||||
$ openllm <start|start-http> <model_id> --<options> ...
|
||||
```
|
||||
"""
|
||||
if model_id in openllm.CONFIG_MAPPING:
|
||||
_model_name = model_id
|
||||
if deprecated_model_id is not None: model_id = deprecated_model_id
|
||||
else: model_id = openllm.AutoConfig.for_model(_model_name)['default_id']
|
||||
termui.warning(
|
||||
f"Passing 'openllm start {_model_name}{'' if deprecated_model_id is None else ' --model-id ' + deprecated_model_id}' is deprecated and will be remove in a future version. Use 'openllm start {model_id}' instead."
|
||||
)
|
||||
|
||||
@cli.group(cls=OpenLLMCommandGroup, context_settings=termui.CONTEXT_SETTINGS, name='start-grpc')
|
||||
def start_grpc_command() -> None:
|
||||
adapter_map: dict[str, str] | None = attrs.pop('adapter_map', None)
|
||||
prompt_template = prompt_template_file.read() if prompt_template_file is not None else None
|
||||
|
||||
from ..serialisation.transformers.weights import has_safetensors_weights
|
||||
serialisation = t.cast(LiteralSerialisation, first_not_none(serialisation, default='safetensors' if has_safetensors_weights(model_id) else 'legacy'))
|
||||
if serialisation == 'safetensors' and quantize is not None and check_bool_env('OPENLLM_SERIALIZATION_WARNING'):
|
||||
termui.warning(
|
||||
f"'--quantize={quantize}' might not work with 'safetensors' serialisation format. To silence this warning, set \"OPENLLM_SERIALIZATION_WARNING=False\"\nNote: You can always fallback to '--serialisation legacy' when running quantisation."
|
||||
)
|
||||
termui.warning(f"Make sure to check out '{model_id}' repository to see if the weights is in '{serialisation}' format if unsure.")
|
||||
|
||||
llm = openllm.LLM[t.Any, t.Any](model_id=model_id,
|
||||
model_version=model_version,
|
||||
prompt_template=prompt_template,
|
||||
system_message=system_message,
|
||||
backend=backend,
|
||||
adapter_map=adapter_map,
|
||||
quantize=quantize,
|
||||
serialisation=serialisation,
|
||||
trust_remote_code=check_bool_env('TRUST_REMOTE_CODE'))
|
||||
backend_warning(llm.__llm_backend__)
|
||||
|
||||
config, server_attrs = llm.config.model_validate_click(**attrs)
|
||||
server_timeout = first_not_none(server_timeout, default=config['timeout'])
|
||||
server_attrs.update({'working_dir': os.path.dirname(os.path.dirname(__file__)), 'timeout': server_timeout})
|
||||
# XXX: currently, there are no development args in bentoml.Server. To be fixed upstream.
|
||||
development = server_attrs.pop('development')
|
||||
server_attrs.setdefault('production', not development)
|
||||
wpr = first_not_none(workers_per_resource, default=config['workers_per_resource'])
|
||||
if isinstance(wpr, str):
|
||||
if wpr == 'round_robin': wpr = 1.0
|
||||
elif wpr == 'conserved':
|
||||
if device and openllm.utils.device_count() == 0:
|
||||
termui.echo('--device will have no effect as no GPUs are available', fg='yellow')
|
||||
wpr = 1.0
|
||||
else:
|
||||
available_gpu = len(device) if device else openllm.utils.device_count()
|
||||
wpr = 1.0 if available_gpu == 0 else float(1 / available_gpu)
|
||||
else:
|
||||
wpr = float(wpr)
|
||||
elif isinstance(wpr, int):
|
||||
wpr = float(wpr)
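The block above maps the `--workers-per-resource` presets onto floats; a hedged standalone sketch of the same rule, with the GPU counting simplified into a parameter (the command derives it from `--device` or `device_count()`):

```python
# Standalone sketch of the normalisation rule used by start/start-grpc.
def resolve_workers_per_resource(value: str | int | float, available_gpus: int) -> float:
  if value == 'round_robin': return 1.0
  if value == 'conserved': return 1.0 if available_gpus == 0 else 1.0 / available_gpus
  return float(value)

assert resolve_workers_per_resource('conserved', 4) == 0.25
```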
|
||||
|
||||
requirements = llm.config['requirements']
|
||||
if requirements is not None and len(requirements) > 0:
|
||||
missing_requirements = [i for i in requirements if importlib.util.find_spec(inflection.underscore(i)) is None]
|
||||
if len(missing_requirements) > 0:
|
||||
termui.echo(f'Make sure to have the following dependencies available: {missing_requirements}', fg='yellow')
|
||||
|
||||
start_env = parse_config_options(config, server_timeout, wpr, device, cors, os.environ.copy())
|
||||
start_env.update({
|
||||
'OPENLLM_MODEL_ID': model_id,
|
||||
'BENTOML_DEBUG': str(openllm.utils.get_debug_mode()),
|
||||
'BENTOML_HOME': os.environ.get('BENTOML_HOME', BentoMLContainer.bentoml_home.get()),
|
||||
'OPENLLM_ADAPTER_MAP': orjson.dumps(adapter_map).decode(),
|
||||
'OPENLLM_SERIALIZATION': serialisation,
|
||||
'OPENLLM_BACKEND': llm.__llm_backend__,
|
||||
'OPENLLM_CONFIG': llm.config.model_dump_json(flatten=True).decode(),
|
||||
})
|
||||
if llm._quantise: start_env['OPENLLM_QUANTIZE'] = str(llm._quantise)
|
||||
if system_message: start_env['OPENLLM_SYSTEM_MESSAGE'] = system_message
|
||||
if prompt_template: start_env['OPENLLM_PROMPT_TEMPLATE'] = prompt_template
|
||||
|
||||
server = bentoml.HTTPServer('_service:svc', **server_attrs)
|
||||
openllm.utils.analytics.track_start_init(llm.config)
|
||||
|
||||
def next_step(adapter_map: DictStrAny | None, caught_exception: bool = False) -> None:
|
||||
if caught_exception: return
|
||||
cmd_name = f'openllm build {model_id}'
|
||||
if llm._quantise: cmd_name += f' --quantize {llm._quantise}'
|
||||
cmd_name += f' --serialization {serialisation}'
|
||||
if adapter_map is not None:
|
||||
cmd_name += ' ' + ' '.join([f'--adapter-id {s}' for s in [f'{p}:{name}' if name not in (None, 'default') else p for p, name in adapter_map.items()]])
|
||||
if not openllm.utils.get_quiet_mode():
|
||||
termui.info(f"\n\n🚀 Next step: run '{cmd_name}' to create a BentoLLM for '{model_id}'")
|
||||
|
||||
_exception = False
|
||||
if return_process:
|
||||
server.start(env=start_env, text=True)
|
||||
if server.process is None: raise click.ClickException('Failed to start the server.')
|
||||
return server.process
|
||||
else:
|
||||
try:
|
||||
server.start(env=start_env, text=True, blocking=True)
|
||||
except KeyboardInterrupt:
|
||||
_exception = True
|
||||
except Exception as err:
|
||||
termui.error(f'Error caught while running LLM Server:\n{err}')
|
||||
_exception = True
|
||||
raise
|
||||
else:
|
||||
next_step(adapter_map, _exception)
|
||||
|
||||
# NOTE: Return the configuration for telemetry purposes.
|
||||
return config
|
||||
|
||||
@cli.command(context_settings=termui.CONTEXT_SETTINGS, name='start-grpc', short_help='Start a gRPC LLMServer for any supported LLM.')
|
||||
@click.argument('model_id', type=click.STRING, metavar='[REMOTE_REPO/MODEL_ID | /path/to/local/model]', required=True)
|
||||
@click.option('--model-id',
|
||||
'deprecated_model_id',
|
||||
type=click.STRING,
|
||||
default=None,
|
||||
hidden=True,
|
||||
metavar='[REMOTE_REPO/MODEL_ID | /path/to/local/model]',
|
||||
help='Deprecated. Use positional argument instead.')
|
||||
@start_decorator(serve_grpc=True)
|
||||
def start_grpc_command(model_id: str, server_timeout: int, model_version: str | None, system_message: str | None, prompt_template_file: t.IO[t.Any] | None,
|
||||
workers_per_resource: t.Literal['conserved', 'round_robin'] | LiteralString, device: t.Tuple[str, ...], quantize: LiteralQuantise | None, backend: LiteralBackend | None,
|
||||
serialisation: LiteralSerialisation | None, cors: bool, adapter_id: str | None, return_process: bool, deprecated_model_id: str | None,
|
||||
**attrs: t.Any) -> LLMConfig | subprocess.Popen[bytes]:
|
||||
"""Start any LLM as a gRPC server.
|
||||
|
||||
\b
|
||||
```bash
|
||||
$ openllm start-grpc <model_name> --<options> ...
|
||||
$ openllm start-grpc <model_id> --<options> ...
|
||||
```
|
||||
"""
|
||||
termui.warning('Continuous batching is not yet supported with gRPC. If you want to use continuous batching with gRPC, feel free to open a GitHub issue about your use case.\n')
|
||||
if model_id in openllm.CONFIG_MAPPING:
|
||||
_model_name = model_id
|
||||
if deprecated_model_id is not None: model_id = deprecated_model_id
|
||||
else: model_id = openllm.AutoConfig.for_model(_model_name)['default_id']
|
||||
termui.warning(
|
||||
f"Passing 'openllm start-grpc {_model_name}{'' if deprecated_model_id is None else ' --model-id ' + deprecated_model_id}' is deprecated and will be remove in a future version. Use 'openllm start-grpc {model_id}' instead."
|
||||
)
|
||||
|
||||
_start_mapping = {
|
||||
'start': {
|
||||
key: start_command_factory(start_command, key, _context_settings=termui.CONTEXT_SETTINGS) for key in CONFIG_MAPPING
|
||||
},
|
||||
'start-grpc': {
|
||||
key: start_command_factory(start_grpc_command, key, _context_settings=termui.CONTEXT_SETTINGS, _serve_grpc=True) for key in CONFIG_MAPPING
|
||||
}
|
||||
}
|
||||
adapter_map: dict[str, str] | None = attrs.pop('adapter_map', None)
|
||||
prompt_template = prompt_template_file.read() if prompt_template_file is not None else None
|
||||
|
||||
from ..serialisation.transformers.weights import has_safetensors_weights
|
||||
serialisation = t.cast(LiteralSerialisation, first_not_none(serialisation, default='safetensors' if has_safetensors_weights(model_id) else 'legacy'))
|
||||
if serialisation == 'safetensors' and quantize is not None and check_bool_env('OPENLLM_SERIALIZATION_WARNING'):
|
||||
termui.warning(
|
||||
f"'--quantize={quantize}' might not work with 'safetensors' serialisation format. To silence this warning, set \"OPENLLM_SERIALIZATION_WARNING=False\"\nNote: You can always fallback to '--serialisation legacy' when running quantisation."
|
||||
)
|
||||
termui.warning(f"Make sure to check out '{model_id}' repository to see if the weights is in '{serialisation}' format if unsure.")
|
||||
|
||||
llm = openllm.LLM[t.Any, t.Any](model_id=model_id,
|
||||
model_version=model_version,
|
||||
prompt_template=prompt_template,
|
||||
system_message=system_message,
|
||||
backend=backend,
|
||||
adapter_map=adapter_map,
|
||||
quantize=quantize,
|
||||
serialisation=serialisation,
|
||||
trust_remote_code=check_bool_env('TRUST_REMOTE_CODE'))
|
||||
backend_warning(llm.__llm_backend__)
|
||||
|
||||
config, server_attrs = llm.config.model_validate_click(**attrs)
|
||||
server_timeout = first_not_none(server_timeout, default=config['timeout'])
|
||||
server_attrs.update({'working_dir': os.path.dirname(os.path.dirname(__file__)), 'timeout': server_timeout})
|
||||
server_attrs['grpc_protocol_version'] = 'v1'
|
||||
# XXX: currently, there are no development args in bentoml.Server. To be fixed upstream.
|
||||
development = server_attrs.pop('development')
|
||||
server_attrs.setdefault('production', not development)
|
||||
wpr = first_not_none(workers_per_resource, default=config['workers_per_resource'])
|
||||
if isinstance(wpr, str):
|
||||
if wpr == 'round_robin': wpr = 1.0
|
||||
elif wpr == 'conserved':
|
||||
if device and openllm.utils.device_count() == 0:
|
||||
termui.echo('--device will have no effect as no GPUs are available', fg='yellow')
|
||||
wpr = 1.0
|
||||
else:
|
||||
available_gpu = len(device) if device else openllm.utils.device_count()
|
||||
wpr = 1.0 if available_gpu == 0 else float(1 / available_gpu)
|
||||
else:
|
||||
wpr = float(wpr)
|
||||
elif isinstance(wpr, int):
|
||||
wpr = float(wpr)
|
||||
|
||||
requirements = llm.config['requirements']
|
||||
if requirements is not None and len(requirements) > 0:
|
||||
missing_requirements = [i for i in requirements if importlib.util.find_spec(inflection.underscore(i)) is None]
|
||||
if len(missing_requirements) > 0:
|
||||
termui.warning(f'Make sure to have the following dependencies available: {missing_requirements}')
|
||||
|
||||
start_env = parse_config_options(config, server_timeout, wpr, device, cors, os.environ.copy())
|
||||
start_env.update({
|
||||
'OPENLLM_MODEL_ID': model_id,
|
||||
'BENTOML_DEBUG': str(openllm.utils.get_debug_mode()),
|
||||
'BENTOML_HOME': os.environ.get('BENTOML_HOME', BentoMLContainer.bentoml_home.get()),
|
||||
'OPENLLM_ADAPTER_MAP': orjson.dumps(adapter_map).decode(),
|
||||
'OPENLLM_SERIALIZATION': serialisation,
|
||||
'OPENLLM_BACKEND': llm.__llm_backend__,
|
||||
'OPENLLM_CONFIG': llm.config.model_dump_json().decode(),
|
||||
})
|
||||
if llm._quantise: start_env['OPENLLM_QUANTIZE'] = str(llm._quantise)
|
||||
if system_message: start_env['OPENLLM_SYSTEM_MESSAGE'] = system_message
|
||||
if prompt_template: start_env['OPENLLM_PROMPT_TEMPLATE'] = prompt_template
|
||||
|
||||
server = bentoml.GrpcServer('_service:svc', **server_attrs)
|
||||
openllm.utils.analytics.track_start_init(llm.config)
|
||||
|
||||
def next_step(adapter_map: DictStrAny | None, caught_exception: bool = False) -> None:
|
||||
if caught_exception: return
|
||||
cmd_name = f'openllm build {model_id}'
|
||||
if llm._quantise: cmd_name += f' --quantize {llm._quantise}'
|
||||
cmd_name += f' --serialization {serialisation}'
|
||||
if adapter_map is not None:
|
||||
cmd_name += ' ' + ' '.join([f'--adapter-id {s}' for s in [f'{p}:{name}' if name not in (None, 'default') else p for p, name in adapter_map.items()]])
|
||||
if not openllm.utils.get_quiet_mode():
|
||||
termui.info(f"\n🚀 Next step: run '{cmd_name}' to create a BentoLLM for '{model_id}'")
|
||||
|
||||
_exception = False
|
||||
if return_process:
|
||||
server.start(env=start_env, text=True)
|
||||
if server.process is None: raise click.ClickException('Failed to start the server.')
|
||||
return server.process
|
||||
else:
|
||||
try:
|
||||
server.start(env=start_env, text=True, blocking=True)
|
||||
except KeyboardInterrupt:
|
||||
_exception = True
|
||||
except Exception as err:
|
||||
termui.error(f'Error caught while running LLM Server:\n{err}')
|
||||
_exception = True
|
||||
raise
|
||||
else:
|
||||
next_step(adapter_map, _exception)
|
||||
|
||||
# NOTE: Return the configuration for telemetry purposes.
|
||||
return config
|
||||
|
||||
class ItemState(enum.Enum):
|
||||
NOT_FOUND = 'NOT_FOUND'
|
||||
EXISTS = 'EXISTS'
|
||||
OVERWRITE = 'OVERWRITE'
|
||||
|
||||
class ImportModelOutput(t.TypedDict):
|
||||
state: ItemState
|
||||
backend: LiteralBackend
|
||||
tag: str
|
||||
|
||||
@cli.command(name='import', aliases=['download'])
|
||||
@model_name_argument
|
||||
@click.argument('model_id', type=click.STRING, default=None, metavar='Optional[REMOTE_REPO/MODEL_ID | /path/to/local/model]', required=False)
|
||||
@click.argument('converter', envvar='CONVERTER', type=click.STRING, default=None, required=False, metavar=None)
|
||||
@click.argument('model_id', type=click.STRING, metavar='[REMOTE_REPO/MODEL_ID | /path/to/local/model]', required=True)
|
||||
@click.option('--model-id',
|
||||
'deprecated_model_id',
|
||||
type=click.STRING,
|
||||
default=None,
|
||||
hidden=True,
|
||||
metavar='[REMOTE_REPO/MODEL_ID | /path/to/local/model]',
|
||||
help='Deprecated. Use positional argument instead.')
|
||||
@model_version_option
|
||||
@output_option
|
||||
@quantize_option
|
||||
@machine_option
|
||||
@backend_option
|
||||
@quantize_option
|
||||
@serialisation_option
|
||||
def import_command(model_name: str, model_id: str | None, converter: str | None, model_version: str | None, output: LiteralOutput, machine: bool, backend: LiteralBackend | None,
|
||||
quantize: LiteralQuantise | None, serialisation: LiteralSerialisation | None) -> bentoml.Model:
|
||||
def import_command(model_id: str, deprecated_model_id: str | None, model_version: str | None, backend: LiteralBackend | None, quantize: LiteralQuantise | None,
|
||||
serialisation: LiteralSerialisation | None) -> ImportModelOutput:
|
||||
"""Setup LLM interactively.
|
||||
|
||||
It accepts two positional arguments: `model_name` and `model_id`. The first argument determines
|
||||
the model type to download, and the second one is the optional model id to download.
|
||||
|
||||
\b
|
||||
This `model_id` can be either pretrained model id that you can get from HuggingFace Hub, or
|
||||
a custom model path from your custom pretrained model. Note that the custom model path should
|
||||
@@ -351,8 +580,9 @@ def import_command(model_name: str, model_id: str | None, converter: str | None,
|
||||
and `transformers.PreTrainedTokenizer` objects.
|
||||
|
||||
\b
|
||||
Note: This is useful for development and setting up fine-tuning.
|
||||
This will be automatically called when `ensure_available=True` in `openllm.LLM.for_model`
|
||||
Note that if `--serialisation` is not defined, then we will try to infer serialisation from HuggingFace Hub.
|
||||
If the model id contains safetensors weights, then we will use `safetensors` serialisation. Otherwise, we will
|
||||
fallback to `legacy` '.bin' (otherwise known as pickle) serialisation.
|
||||
|
||||
\b
|
||||
``--model-version`` is an optional option to save the model. Note that
|
||||
@@ -362,7 +592,7 @@ def import_command(model_name: str, model_id: str | None, converter: str | None,
|
||||
|
||||
\b
|
||||
```bash
|
||||
$ openllm import opt facebook/opt-2.7b
|
||||
$ openllm import mistralai/Mistral-7B-v0.1
|
||||
```
|
||||
|
||||
\b
|
||||
@@ -370,41 +600,58 @@ def import_command(model_name: str, model_id: str | None, converter: str | None,
|
||||
> only use this option if you want the weights to be quantized by default. Note that OpenLLM also
|
||||
> supports on-demand quantisation during initial startup.
|
||||
"""
|
||||
llm_config = openllm.AutoConfig.for_model(model_name)
|
||||
_serialisation = t.cast(LiteralSerialisation, first_not_none(serialisation, default=llm_config['serialisation']))
|
||||
env = EnvVarMixin(model_name, model_id=model_id, quantize=quantize)
|
||||
model_id = first_not_none(model_id, env['model_id_value'], default=llm_config['default_id'])
|
||||
backend = first_not_none(backend, env['backend_value'], default='vllm' if is_vllm_available() else 'pt')
|
||||
llm = openllm.LLM[t.Any, t.Any](model_id=model_id, llm_config=llm_config, revision=model_version, quantize=env['quantize_value'], serialisation=_serialisation, backend=backend)
|
||||
_previously_saved = False
|
||||
from ..serialisation.transformers.weights import has_safetensors_weights
|
||||
|
||||
if model_id in openllm.CONFIG_MAPPING:
|
||||
_model_name = model_id
|
||||
if deprecated_model_id is not None: model_id = deprecated_model_id
|
||||
else: model_id = openllm.AutoConfig.for_model(_model_name)['default_id']
|
||||
termui.echo(
|
||||
f"Passing 'openllm import {_model_name}{'' if deprecated_model_id is None else ' --model-id ' + deprecated_model_id}' is deprecated and will be remove in a future version. Use 'openllm import {model_id}' instead.",
|
||||
fg='yellow')
|
||||
|
||||
llm = openllm.LLM[t.Any, t.Any](model_id=model_id,
|
||||
model_version=model_version,
|
||||
quantize=quantize,
|
||||
backend=backend,
|
||||
serialisation=t.cast(LiteralSerialisation, first_not_none(serialisation, default='safetensors' if has_safetensors_weights(model_id) else 'legacy')))
|
||||
backend_warning(llm.__llm_backend__)
|
||||
|
||||
state = ItemState.NOT_FOUND
|
||||
try:
|
||||
_ref = openllm.serialisation.get(llm)
|
||||
_previously_saved = True
|
||||
except openllm.exceptions.OpenLLMException:
|
||||
if not machine and output == 'pretty':
|
||||
msg = f"'{model_name}' with model_id='{model_id}' does not exists in local store for backend {llm.__llm_backend__}. Saving to BENTOML_HOME{' (path=' + os.environ.get('BENTOML_HOME', BentoMLContainer.bentoml_home.get()) + ')' if get_debug_mode() else ''}..."
|
||||
termui.echo(msg, fg='yellow', nl=True)
|
||||
_ref = openllm.serialisation.get(llm, auto_import=True)
|
||||
if backend == 'pt' and is_torch_available() and torch.cuda.is_available(): torch.cuda.empty_cache()
|
||||
if machine: return _ref
|
||||
elif output == 'pretty':
|
||||
if _previously_saved: termui.echo(f"{model_name} with 'model_id={model_id}' is already setup for backend '{backend}': {_ref.tag!s}", nl=True, fg='yellow')
|
||||
else: termui.echo(f'Saved model: {_ref.tag}')
|
||||
elif output == 'json': termui.echo(orjson.dumps({'previously_setup': _previously_saved, 'backend': backend, 'tag': str(_ref.tag)}, option=orjson.OPT_INDENT_2).decode())
|
||||
else: termui.echo(_ref.tag)
|
||||
return _ref
|
||||
model = bentoml.models.get(llm.tag)
|
||||
state = ItemState.EXISTS
|
||||
except bentoml.exceptions.NotFound:
|
||||
model = openllm.serialisation.import_model(llm, trust_remote_code=llm.trust_remote_code)
|
||||
if llm.__llm_backend__ == 'pt' and is_torch_available() and torch.cuda.is_available(): torch.cuda.empty_cache()
|
||||
response = ImportModelOutput(state=state, backend=llm.__llm_backend__, tag=str(model.tag))
|
||||
termui.echo(orjson.dumps(response).decode(), fg='white')
|
||||
return response
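With the new machine-readable output, scripting around the CLI reduces to parsing one JSON document; a hedged sketch (the model id and the assumption that the JSON is the final stdout line are illustrative):

```python
# Hedged sketch: drive 'openllm import' from another process and read the JSON
# result it prints.
import json, subprocess, sys

out = subprocess.check_output([sys.executable, '-m', 'openllm', 'import', 'mistralai/Mistral-7B-v0.1'])
info = json.loads(out.decode().strip().splitlines()[-1])
print(info['state'], info['backend'], info['tag'])
```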
|
||||
|
||||
class DeploymentInstruction(t.TypedDict):
|
||||
type: t.Literal['container', 'bentocloud']
|
||||
content: str
|
||||
|
||||
class BuildBentoOutput(t.TypedDict):
|
||||
state: ItemState
|
||||
tag: str
|
||||
backend: LiteralBackend
|
||||
instructions: t.List[DeploymentInstruction]
|
||||
|
||||
@cli.command(context_settings={'token_normalize_func': inflection.underscore})
|
||||
@model_name_argument
|
||||
@model_id_option
|
||||
@output_option
|
||||
@machine_option
|
||||
@click.argument('model_id', type=click.STRING, metavar='[REMOTE_REPO/MODEL_ID | /path/to/local/model]', required=True)
|
||||
@click.option('--model-id',
|
||||
'deprecated_model_id',
|
||||
type=click.STRING,
|
||||
default=None,
|
||||
hidden=True,
|
||||
metavar='[REMOTE_REPO/MODEL_ID | /path/to/local/model]',
|
||||
help='Deprecated. Use positional argument instead.')
|
||||
@backend_option
|
||||
@system_message_option
|
||||
@prompt_template_file_option
|
||||
@click.option('--bento-version', type=str, default=None, help='Optional bento version for this BentoLLM. Default is the the model revision.')
|
||||
@click.option('--overwrite', is_flag=True, help='Overwrite existing Bento for given LLM if it already exists.')
|
||||
@workers_per_resource_option(factory=click, build=True)
|
||||
@cog.optgroup.group(cls=cog.MutuallyExclusiveOptionGroup, name='Optimisation options') # type: ignore[misc]
|
||||
@quantize_option(factory=cog.optgroup, build=True)
|
||||
@click.option('--enable-features',
|
||||
@@ -434,17 +681,18 @@ def import_command(model_name: str, model_id: str | None, converter: str | None,
|
||||
help="Whether to containerize the Bento after building. '--containerize' is the shortcut of 'openllm build && bentoml containerize'.")
|
||||
@cog.optgroup.option('--push', default=False, is_flag=True, type=click.BOOL, help="Whether to push the result bento to BentoCloud. Make sure to login with 'bentoml cloud login' first.")
|
||||
@click.option('--force-push', default=False, is_flag=True, type=click.BOOL, help='Whether to force push.')
|
||||
@machine_option
|
||||
@click.pass_context
|
||||
def build_command(ctx: click.Context, /, model_name: str, model_id: str | None, bento_version: str | None, overwrite: bool, output: LiteralOutput, quantize: LiteralQuantise | None,
|
||||
enable_features: tuple[str, ...] | None, workers_per_resource: float | None, adapter_id: tuple[str, ...], build_ctx: str | None, backend: LiteralBackend | None,
|
||||
system_message: str | None, prompt_template_file: t.IO[t.Any] | None, machine: bool, model_version: str | None, dockerfile_template: t.TextIO | None, containerize: bool,
|
||||
push: bool, serialisation: LiteralSerialisation | None, container_registry: LiteralContainerRegistry, container_version_strategy: LiteralContainerVersionStrategy,
|
||||
force_push: bool, **attrs: t.Any) -> bentoml.Bento:
|
||||
"""Package a given models into a Bento.
|
||||
def build_command(ctx: click.Context, /, model_id: str, deprecated_model_id: str | None, bento_version: str | None, overwrite: bool, quantize: LiteralQuantise | None, machine: bool,
|
||||
enable_features: tuple[str, ...] | None, adapter_id: tuple[str, ...], build_ctx: str | None, backend: LiteralBackend | None, system_message: str | None,
|
||||
prompt_template_file: t.IO[t.Any] | None, model_version: str | None, dockerfile_template: t.TextIO | None, containerize: bool, push: bool,
|
||||
serialisation: LiteralSerialisation | None, container_registry: LiteralContainerRegistry, container_version_strategy: LiteralContainerVersionStrategy, force_push: bool,
|
||||
**_: t.Any) -> BuildBentoOutput:
|
||||
"""Package a given models into a BentoLLM.
|
||||
|
||||
\b
|
||||
```bash
|
||||
$ openllm build flan-t5 --model-id google/flan-t5-large
|
||||
$ openllm build google/flan-t5-large
|
||||
```
|
||||
|
||||
\b
|
||||
@@ -457,47 +705,47 @@ def build_command(ctx: click.Context, /, model_name: str, model_id: str | None,
|
||||
> To build the bento with compiled OpenLLM, make sure to prepend HATCH_BUILD_HOOKS_ENABLE=1. Make sure that the deployment
|
||||
> target also use the same Python version and architecture as build machine.
|
||||
"""
|
||||
if machine: output = 'porcelain'
|
||||
from .._llm import normalise_model_name
|
||||
from ..serialisation.transformers.weights import has_safetensors_weights
|
||||
|
||||
if model_id in openllm.CONFIG_MAPPING:
|
||||
_model_name = model_id
|
||||
if deprecated_model_id is not None: model_id = deprecated_model_id
|
||||
else: model_id = openllm.AutoConfig.for_model(_model_name)['default_id']
|
||||
termui.echo(
|
||||
f"Passing 'openllm build {_model_name}{'' if deprecated_model_id is None else ' --model-id ' + deprecated_model_id}' is deprecated and will be remove in a future version. Use 'openllm build {model_id}' instead.",
|
||||
fg='yellow')
|
||||
|
||||
if enable_features: enable_features = tuple(itertools.chain.from_iterable((s.split(',') for s in enable_features)))
|
||||
|
||||
_previously_built = False
|
||||
state = ItemState.NOT_FOUND
|
||||
|
||||
llm_config = openllm.AutoConfig.for_model(model_name)
|
||||
_serialisation = t.cast(LiteralSerialisation, first_not_none(serialisation, default=llm_config['serialisation']))
|
||||
env = EnvVarMixin(model_name, backend=first_not_none(backend, default='vllm' if is_vllm_available() else 'pt'), model_id=model_id or llm_config['default_id'], quantize=quantize)
|
||||
prompt_template: str | None = prompt_template_file.read() if prompt_template_file is not None else None
|
||||
prompt_template = prompt_template_file.read() if prompt_template_file is not None else None
|
||||
llm = openllm.LLM[t.Any, t.Any](model_id=model_id,
|
||||
model_version=model_version,
|
||||
prompt_template=prompt_template,
|
||||
system_message=system_message,
|
||||
backend=backend,
|
||||
quantize=quantize,
|
||||
serialisation=t.cast(LiteralSerialisation, first_not_none(serialisation, default='safetensors' if has_safetensors_weights(model_id) else 'legacy')))
|
||||
backend_warning(llm.__llm_backend__)
|
||||
|
||||
os.environ.update({'OPENLLM_BACKEND': llm.__llm_backend__, 'OPENLLM_SERIALIZATION': llm._serialisation, 'OPENLLM_MODEL_ID': llm.model_id})
|
||||
if llm._quantise: os.environ['OPENLLM_QUANTIZE'] = str(llm._quantise)
|
||||
if system_message: os.environ['OPENLLM_SYSTEM_MESSAGE'] = system_message
|
||||
if prompt_template: os.environ['OPENLLM_PROMPT_TEMPLATE'] = prompt_template
|
||||
|
||||
# NOTE: We set this environment variable so that our service.py logic won't raise RuntimeError
|
||||
# during build. This is a current limitation of bentoml build where we actually import the service.py into sys.path
|
||||
try:
|
||||
os.environ.update({'OPENLLM_MODEL': inflection.underscore(model_name), 'OPENLLM_SERIALIZATION': _serialisation, env.backend: env['backend_value']})
|
||||
if env['model_id_value']: os.environ[env.model_id] = str(env['model_id_value'])
|
||||
if env['quantize_value']: os.environ[env.quantize] = str(env['quantize_value'])
|
||||
if env['backend_value']: os.environ[env.backend] = str(env['backend_value'])
|
||||
if system_message: os.environ['OPENLLM_SYSTEM_MESSAGE'] = system_message
|
||||
if prompt_template: os.environ['OPENLLM_PROMPT_TEMPLATE'] = prompt_template
|
||||
|
||||
llm = openllm.LLM[t.Any, t.Any](model_id=env['model_id_value'] or llm_config['default_id'],
|
||||
revision=model_version,
|
||||
prompt_template=prompt_template,
|
||||
system_message=system_message,
|
||||
llm_config=llm_config,
|
||||
backend=env['backend_value'],
|
||||
quantize=env['quantize_value'],
|
||||
serialisation=_serialisation,
|
||||
**attrs)
|
||||
llm.save_pretrained() # ensure_available = True
|
||||
|
||||
assert llm.bentomodel # HACK: call it here to patch correct tag with revision and everything
|
||||
# FIX: This is a patch for _service_vars injection
|
||||
if 'OPENLLM_MODEL_ID' not in os.environ: os.environ['OPENLLM_MODEL_ID'] = llm.model_id
|
||||
if 'OPENLLM_ADAPTER_MAP' not in os.environ: os.environ['OPENLLM_ADAPTER_MAP'] = orjson.dumps(None).decode()
|
||||
|
||||
labels = dict(llm.identifying_params)
|
||||
labels.update({'_type': llm.llm_type, '_framework': env['backend_value']})
|
||||
workers_per_resource = first_not_none(workers_per_resource, default=llm_config['workers_per_resource'])
|
||||
labels.update({'_type': llm.llm_type, '_framework': llm.__llm_backend__})
|
||||
|
||||
with fs.open_fs(f"temp://llm_{llm_config['model_name']}") as llm_fs:
|
||||
with fs.open_fs(f'temp://llm_{normalise_model_name(model_id)}') as llm_fs:
|
||||
dockerfile_template_path = None
|
||||
if dockerfile_template:
|
||||
with dockerfile_template:
|
||||
@@ -505,8 +753,8 @@ def build_command(ctx: click.Context, /, model_name: str, model_id: str | None,
|
||||
dockerfile_template_path = llm_fs.getsyspath('/Dockerfile.template')
|
||||
|
||||
adapter_map: dict[str, str] | None = None
|
||||
if adapter_id and not build_ctx: ctx.fail("'build_ctx' is required when '--adapter-id' is passed.")
|
||||
if adapter_id:
|
||||
if not build_ctx: ctx.fail("'build_ctx' is required when '--adapter-id' is passed.")
|
||||
adapter_map = {}
|
||||
for v in adapter_id:
|
||||
_adapter_id, *adapter_name = v.rsplit(':', maxsplit=1)
|
||||
@@ -531,15 +779,14 @@ def build_command(ctx: click.Context, /, model_name: str, model_id: str | None,
|
||||
try:
|
||||
bento = bentoml.get(bento_tag)
|
||||
if overwrite:
|
||||
if output == 'pretty': termui.echo(f'Overwriting existing Bento {bento_tag}', fg='yellow')
|
||||
bentoml.delete(bento_tag)
|
||||
state = ItemState.OVERWRITE
|
||||
raise bentoml.exceptions.NotFound(f'Rebuilding existing Bento {bento_tag}') from None
|
||||
_previously_built = True
|
||||
state = ItemState.EXISTS
|
||||
except bentoml.exceptions.NotFound:
|
||||
bento = bundle.create_bento(bento_tag,
|
||||
llm_fs,
|
||||
llm,
|
||||
workers_per_resource=workers_per_resource,
|
||||
adapter_map=adapter_map,
|
||||
quantize=quantize,
|
||||
extra_dependencies=enable_features,
|
||||
@@ -547,23 +794,26 @@ def build_command(ctx: click.Context, /, model_name: str, model_id: str | None,
|
||||
container_registry=container_registry,
|
||||
container_version_strategy=container_version_strategy)
|
||||
except Exception as err:
|
||||
raise err from None
|
||||
traceback.print_exc()
|
||||
raise click.ClickException('Exception caught while building BentoLLM:\n' + str(err)) from err
|
||||
|
||||
if machine: termui.echo(f'__tag__:{bento.tag}', fg='white')
|
||||
elif output == 'pretty':
|
||||
if not get_quiet_mode() and (not push or not containerize):
|
||||
termui.echo('\n' + OPENLLM_FIGLET, fg='white')
|
||||
if not _previously_built: termui.echo(f'Successfully built {bento}.', fg='green')
|
||||
elif not overwrite: termui.echo(f"'{model_name}' already has a Bento built [{bento}]. To overwrite it pass '--overwrite'.", fg='yellow')
|
||||
termui.echo('📖 Next steps:\n\n' + f"* Push to BentoCloud with 'bentoml push':\n\t$ bentoml push {bento.tag}\n\n" +
|
||||
f"* Containerize your Bento with 'bentoml containerize':\n\t$ bentoml containerize {bento.tag} --opt progress=plain\n\n" +
|
||||
"\tTip: To enable additional BentoML features for 'containerize', use '--enable-features=FEATURE[,FEATURE]' [see 'bentoml containerize -h' for more advanced usage]\n",
|
||||
fg='blue',
|
||||
)
|
||||
elif output == 'json':
|
||||
termui.echo(orjson.dumps(bento.info.to_dict(), option=orjson.OPT_INDENT_2).decode())
|
||||
else:
|
||||
termui.echo(bento.tag)
|
||||
response = BuildBentoOutput(state=state,
|
||||
tag=str(bento_tag),
|
||||
backend=llm.__llm_backend__,
|
||||
instructions=[
|
||||
DeploymentInstruction(type='bentocloud', content=f"Push to BentoCloud with 'bentoml push': `bentoml push {bento_tag}`"),
|
||||
DeploymentInstruction(type='container', content=f"Container BentoLLM with 'bentoml containerize': `bentoml containerize {bento_tag} --opt progress=plain`")
|
||||
])
|
||||
|
||||
if machine: termui.echo(f'__object__:{orjson.dumps(response).decode()}\n\n', fg='white')
|
||||
elif not get_quiet_mode() and (not push or not containerize):
|
||||
if not overwrite: termui.warning(f"Bento for '{model_id}' already exists [{bento}]. To overwrite it pass '--overwrite'.\n")
|
||||
elif state != ItemState.EXISTS: termui.info(f"Successfully built Bento '{bento.tag}'.\n")
|
||||
if not get_debug_mode():
|
||||
termui.echo(OPENLLM_FIGLET)
|
||||
termui.echo('\n📖 Next steps:\n\n', nl=False)
|
||||
for instruction in response['instructions']:
|
||||
termui.echo(f"* {instruction['content']}\n", nl=False)
|
||||
|
||||
if push: BentoMLContainer.bentocloud_client.get().push_bento(bento, context=t.cast(GlobalOptions, ctx.obj).cloud_context, force=force_push)
|
||||
elif containerize:
|
||||
@@ -576,86 +826,50 @@ def build_command(ctx: click.Context, /, model_name: str, model_id: str | None,
|
||||
bentoml.container.build(bento.tag, backend=container_backend, features=('grpc', 'io'))
|
||||
except Exception as err:
|
||||
raise OpenLLMException(f"Exception caught while containerizing '{bento.tag!s}':\n{err}") from err
|
||||
return bento
|
||||
|
||||
response.pop('instructions')
|
||||
if get_debug_mode(): termui.echo('\n' + orjson.dumps(response).decode(), fg=None)
|
||||
return response
|
||||
|
||||
class ModelItem(t.TypedDict):
|
||||
architecture: str
|
||||
example_id: str
|
||||
supported_backends: t.Tuple[LiteralBackend, ...]
|
||||
installation: str
|
||||
items: NotRequired[t.List[str]]
|
||||
|
||||
@cli.command()
|
||||
@output_option
|
||||
@click.option('--show-available', is_flag=True, default=False, help="Show available models in local store (mutually exclusive with '-o porcelain').")
|
||||
@machine_option
|
||||
@click.pass_context
|
||||
def models_command(ctx: click.Context, output: LiteralOutput, show_available: bool, machine: bool) -> DictStrAny | None:
|
||||
def models_command(show_available: bool) -> dict[t.LiteralString, ModelItem]:
|
||||
"""List all supported models.
|
||||
|
||||
\b
|
||||
> NOTE: '--show-available' and '-o porcelain' are mutually exclusive.
|
||||
|
||||
\b
|
||||
```bash
|
||||
openllm models --show-available
|
||||
```
|
||||
"""
|
||||
from .._llm import normalise_model_name
|
||||
|
||||
models = tuple(inflection.dasherize(key) for key in CONFIG_MAPPING.keys())
|
||||
if output == 'porcelain':
|
||||
if show_available: raise click.BadOptionUsage('--show-available', "Cannot use '--show-available' with '-o porcelain' (mutually exclusive).")
|
||||
termui.echo('\n'.join(models), fg='white')
|
||||
else:
|
||||
json_data: dict[str, dict[t.Literal['architecture', 'model_id', 'url', 'installation', 'cpu', 'gpu', 'backend'], t.Any] | t.Any] = {}
|
||||
converted: list[str] = []
|
||||
for m in models:
|
||||
config = openllm.AutoConfig.for_model(m)
|
||||
json_data[m] = {
|
||||
'architecture': config['architecture'],
|
||||
'model_id': config['model_ids'],
|
||||
'backend': config['backend'],
|
||||
'installation': f'"openllm[{m}]"' if m in OPTIONAL_DEPENDENCIES or config['requirements'] else 'openllm',
|
||||
}
|
||||
converted.extend([normalise_model_name(i) for i in config['model_ids']])
|
||||
|
||||
ids_in_local_store = {
|
||||
k: [
|
||||
i for i in bentoml.models.list() if 'framework' in i.info.labels and i.info.labels['framework'] == 'openllm' and 'model_name' in i.info.labels and i.info.labels['model_name'] == k
|
||||
] for k in json_data.keys()
|
||||
}
|
||||
ids_in_local_store = {k: v for k, v in ids_in_local_store.items() if v}
|
||||
local_models: DictStrAny | None = None
|
||||
if show_available:
|
||||
local_models = {k: [str(i.tag) for i in val] for k, val in ids_in_local_store.items()}
|
||||
|
||||
if machine:
|
||||
if show_available: json_data['local'] = local_models
|
||||
return json_data
|
||||
elif output == 'pretty':
|
||||
import tabulate
|
||||
|
||||
tabulate.PRESERVE_WHITESPACE = True
|
||||
# llm, architecture, url, model_id, installation, backend
|
||||
data: list[str | tuple[str, str, list[str], str, tuple[LiteralBackend, ...]]] = []
|
||||
for m, v in json_data.items():
|
||||
data.extend([(m, v['architecture'], v['model_id'], v['installation'], v['backend'])])
|
||||
column_widths = [int(termui.COLUMNS / 12), int(termui.COLUMNS / 6), int(termui.COLUMNS / 4), int(termui.COLUMNS / 6), int(termui.COLUMNS / 4)]
|
||||
|
||||
table = tabulate.tabulate(data, tablefmt='fancy_grid', headers=['LLM', 'Architecture', 'Models Id', 'Installation', 'Runtime'], maxcolwidths=column_widths)
|
||||
termui.echo(table, fg='white')
|
||||
|
||||
if show_available:
|
||||
if len(ids_in_local_store) == 0:
|
||||
termui.echo('No models available locally.')
|
||||
ctx.exit(0)
|
||||
termui.echo('The following are available in local store:', fg='magenta')
|
||||
termui.echo(orjson.dumps(local_models, option=orjson.OPT_INDENT_2).decode(), fg='white')
|
||||
else:
|
||||
if show_available: json_data['local'] = local_models
|
||||
termui.echo(orjson.dumps(json_data, option=orjson.OPT_INDENT_2,).decode(), fg='white')
|
||||
ctx.exit(0)
|
||||
result: dict[t.LiteralString, ModelItem] = {
|
||||
m: ModelItem(architecture=config.__openllm_architecture__,
|
||||
example_id=random.choice(config.__openllm_model_ids__),
|
||||
supported_backends=config.__openllm_backend__,
|
||||
installation='pip install ' + (f'"openllm[{m}]"' if m in OPTIONAL_DEPENDENCIES or config.__openllm_requirements__ else 'openllm'),
|
||||
items=[] if not show_available else [
|
||||
str(md.tag)
|
||||
for md in bentoml.models.list()
|
||||
if 'framework' in md.info.labels and md.info.labels['framework'] == 'openllm' and 'model_name' in md.info.labels and md.info.labels['model_name'] == m
|
||||
]) for m, config in CONFIG_MAPPING.items()
|
||||
}
|
||||
termui.echo(orjson.dumps(result, option=orjson.OPT_INDENT_2).decode(), fg=None)
|
||||
return result
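`models_command` now prints the same mapping it returns, so scripts can consume the catalogue directly; a hedged sketch (assumes stdout carries only the JSON document):

```python
# Hedged sketch: list models that have at least one copy in the local store.
import json, subprocess, sys

raw = subprocess.check_output([sys.executable, '-m', 'openllm', 'models', '--show-available', '--quiet'])
catalogue = json.loads(raw)
locally_available = {name: item['items'] for name, item in catalogue.items() if item['items']}
print(locally_available)
```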
|
||||
|
||||
@cli.command()
|
||||
@model_name_argument(required=False)
|
||||
@click.option('-y', '--yes', '--assume-yes', is_flag=True, help='Skip confirmation when deleting a specific model')
|
||||
@click.option('--include-bentos/--no-include-bentos', is_flag=True, default=False, help='Whether to also include pruning bentos.')
|
||||
@inject
|
||||
def prune_command(model_name: str | None,
|
||||
@click.pass_context
|
||||
def prune_command(ctx: click.Context,
|
||||
model_name: str | None,
|
||||
yes: bool,
|
||||
include_bentos: bool,
|
||||
model_store: ModelStore = Provide[BentoMLContainer.model_store],
|
||||
@@ -679,7 +893,8 @@ def prune_command(model_name: str | None,
|
||||
else: delete_confirmed = click.confirm(f"delete {'model' if isinstance(store, ModelStore) else 'bento'} {store_item.tag}?")
|
||||
if delete_confirmed:
|
||||
store.delete(store_item.tag)
|
||||
termui.echo(f"{store_item} deleted from {'model' if isinstance(store, ModelStore) else 'bento'} store.", fg='yellow')
|
||||
termui.warning(f"{store_item} deleted from {'model' if isinstance(store, ModelStore) else 'bento'} store.")
|
||||
ctx.exit(0)
|
||||
|
||||
def parsing_instruction_callback(ctx: click.Context, param: click.Parameter, value: list[str] | str | None) -> tuple[str, bool | str] | list[str] | str | None:
|
||||
if value is None:
|
||||
@@ -700,23 +915,26 @@ def parsing_instruction_callback(ctx: click.Context, param: click.Parameter, val
|
||||
else:
|
||||
raise click.BadParameter(f'Invalid option format: {value}')
|
||||
|
||||
def shared_client_options(f: _AnyCallable | None = None, output_value: t.Literal['json', 'porcelain', 'pretty'] = 'pretty') -> t.Callable[[FC], FC]:
|
||||
def shared_client_options(f: _AnyCallable | None = None) -> t.Callable[[FC], FC]:
|
||||
options = [
|
||||
click.option('--endpoint', type=click.STRING, help='OpenLLM Server endpoint, i.e: http://localhost:3000', envvar='OPENLLM_ENDPOINT', default='http://localhost:3000',
|
||||
),
|
||||
click.option('--endpoint',
|
||||
type=click.STRING,
|
||||
help='OpenLLM Server endpoint, i.e: http://localhost:3000',
|
||||
envvar='OPENLLM_ENDPOINT',
|
||||
show_envvar=True,
|
||||
show_default=True,
|
||||
default='http://localhost:3000'),
|
||||
click.option('--timeout', type=click.INT, default=30, help='Default server timeout', show_default=True),
|
||||
output_option(default_value=output_value),
|
||||
]
|
||||
return compose(*options)(f) if f is not None else compose(*options)
|
||||
|
||||
@cli.command()
|
||||
@cli.command(hidden=True)
|
||||
@click.argument('task', type=click.STRING, metavar='TASK')
|
||||
@shared_client_options
|
||||
@click.option('--agent', type=click.Choice(['hf']), default='hf', help='Whether to interact with Agents from given Server endpoint.', show_default=True)
|
||||
@click.option('--remote', is_flag=True, default=False, help='Whether or not to use remote tools (inference endpoints) instead of local ones.', show_default=True)
|
||||
@click.option('--opt',
|
||||
help="Define prompt options. "
|
||||
"(format: ``--opt text='I love this' --opt audio:./path/to/audio --opt image:/path/to/file``)",
|
||||
help="Define prompt options. (format: ``--opt text='I love this' --opt audio:./path/to/audio --opt image:/path/to/file``)",
|
||||
required=False,
|
||||
multiple=True,
|
||||
callback=opt_callback,
|
||||
@@ -750,8 +968,8 @@ def instruct_command(endpoint: str, timeout: int, agent: LiteralString, output:
|
||||
# raise click.BadOptionUsage('agent', f'Unknown agent type {agent}')
|
||||
|
||||
@cli.command()
|
||||
@shared_client_options(output_value='porcelain')
|
||||
@click.option('--server-type', type=click.Choice(['grpc', 'http']), help='Server type', default='http', show_default=True)
|
||||
@shared_client_options
|
||||
@click.option('--server-type', type=click.Choice(['grpc', 'http']), help='Server type', default='http', show_default=True, hidden=True)
|
||||
@click.option('--stream/--no-stream', type=click.BOOL, is_flag=True, default=True, help='Whether to stream the response.')
|
||||
@click.argument('prompt', type=click.STRING)
|
||||
@click.option('--sampling-params',
|
||||
@@ -761,45 +979,28 @@ def instruct_command(endpoint: str, timeout: int, agent: LiteralString, output:
|
||||
callback=opt_callback,
|
||||
metavar='ARG=VALUE[,ARG=VALUE]')
|
||||
@click.pass_context
|
||||
def query_command(ctx: click.Context, /, prompt: str, endpoint: str, timeout: int, stream: bool, server_type: t.Literal['http', 'grpc'], output: LiteralOutput, _memoized: DictStrAny,
|
||||
**attrs: t.Any) -> None:
|
||||
'''Ask a LLM interactively, from a terminal.
|
||||
def query_command(ctx: click.Context, /, prompt: str, endpoint: str, timeout: int, stream: bool, server_type: t.Literal['http', 'grpc'], _memoized: DictStrAny, **_: t.Any) -> None:
|
||||
'''Query an LLM interactively from a terminal.
|
||||
|
||||
\b
|
||||
```bash
|
||||
$ openllm query --endpoint http://12.323.2.1:3000 "What is the meaning of life?"
|
||||
```
|
||||
'''
|
||||
_memoized = {k: orjson.loads(v[0]) for k, v in _memoized.items() if v}
|
||||
if server_type == 'grpc': raise click.ClickException("'grpc' is currently disabled.")
|
||||
_memoized = {k: orjson.loads(v[0]) for k, v in _memoized.items() if v}
|
||||
# TODO: grpc support
|
||||
client = openllm.client.HTTPClient(address=endpoint, timeout=timeout)
|
||||
input_fg, generated_fg = 'magenta', 'cyan'
|
||||
if output != 'porcelain':
|
||||
termui.echo('==Input==\n', fg='white')
|
||||
termui.echo(f'{prompt}', fg=input_fg)
|
||||
|
||||
if stream:
|
||||
stream_res: t.Iterator[StreamingResponse] = client.generate_stream(prompt, **{**client._config(), **_memoized})
|
||||
if output == 'pretty':
|
||||
termui.echo('\n\n==Responses==\n', fg='white')
|
||||
for it in stream_res:
|
||||
termui.echo(it.text, fg=generated_fg, nl=False)
|
||||
elif output == 'json':
|
||||
for it in stream_res:
|
||||
termui.echo(orjson.dumps(converter.unstructure(it), option=orjson.OPT_INDENT_2).decode(), fg='white')
|
||||
else:
|
||||
for it in stream_res:
|
||||
termui.echo(it.text, fg=generated_fg, nl=False)
|
||||
stream_res: t.Iterator[StreamingResponse] = client.generate_stream(prompt, **_memoized)
|
||||
termui.echo(prompt, fg=input_fg, nl=False)
|
||||
for it in stream_res:
|
||||
termui.echo(it.text, fg=generated_fg, nl=False)
|
||||
else:
|
||||
res: Response = client.generate(prompt, **{**client._config(), **_memoized})
|
||||
if output == 'pretty':
|
||||
termui.echo('\n\n==Responses==\n', fg='white')
|
||||
termui.echo(res.outputs[0].text, fg=generated_fg)
|
||||
elif output == 'json':
|
||||
termui.echo(orjson.dumps(converter.unstructure(res), option=orjson.OPT_INDENT_2).decode(), fg='white')
|
||||
else:
|
||||
termui.echo(res.outputs[0].text, fg='white')
|
||||
termui.echo(prompt, fg=input_fg, nl=False)
|
||||
termui.echo(client.generate(prompt, **_memoized).outputs[0].text, fg=generated_fg, nl=False)
|
||||
ctx.exit(0)
|
||||
|
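For reference, the reworked `query` path reduces to the `HTTPClient` calls shown above; a minimal sketch of driving the same client programmatically, assuming a server already running at an illustrative address (the prompt and `max_new_tokens` value are also illustrative, not taken from this commit):

```python
import openllm

# Assumes an OpenLLM server is reachable at this address.
client = openllm.client.HTTPClient(address='http://localhost:3000', timeout=30)

# Streaming: generate_stream yields StreamingResponse chunks whose .text holds each delta.
for chunk in client.generate_stream('What is the meaning of life?', max_new_tokens=128):
  print(chunk.text, end='', flush=True)

# Non-streaming: generate returns a Response; the first candidate lives in outputs[0].text.
print(client.generate('What is the meaning of life?', max_new_tokens=128).outputs[0].text)
```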
||||
@cli.group(cls=Extensions, hidden=True, name='extension')
|
||||
|
||||
@@ -1,4 +1,6 @@
|
||||
from __future__ import annotations
|
||||
import logging
|
||||
import traceback
|
||||
import typing as t
|
||||
|
||||
import click
|
||||
@@ -8,21 +10,18 @@ import orjson
|
||||
from bentoml_cli.utils import opt_callback
|
||||
|
||||
import openllm
|
||||
import openllm_core
|
||||
|
||||
from openllm.cli import termui
|
||||
from openllm.cli._factory import machine_option
|
||||
from openllm.cli._factory import model_complete_envvar
|
||||
from openllm.cli._factory import output_option
|
||||
from openllm_core.prompts import process_prompt
|
||||
|
||||
LiteralOutput = t.Literal['json', 'pretty', 'porcelain']
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@click.command('get_prompt', context_settings=termui.CONTEXT_SETTINGS)
|
||||
@click.argument('model_name', type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING.keys()]), shell_complete=model_complete_envvar)
|
||||
@click.argument('prompt', type=click.STRING)
|
||||
@output_option
|
||||
@click.option('--format', type=click.STRING, default=None)
|
||||
@machine_option
|
||||
@click.option('--opt',
|
||||
help="Define additional prompt variables. (format: ``--opt system_prompt='You are a useful assistant'``)",
|
||||
required=False,
|
||||
@@ -30,9 +29,9 @@ LiteralOutput = t.Literal['json', 'pretty', 'porcelain']
|
||||
callback=opt_callback,
|
||||
metavar='ARG=VALUE[,ARG=VALUE]')
|
||||
@click.pass_context
|
||||
def cli(ctx: click.Context, /, model_name: str, prompt: str, format: str | None, output: LiteralOutput, machine: bool, _memoized: dict[str, t.Any], **_: t.Any) -> str | None:
|
||||
def cli(ctx: click.Context, /, model_name: str, prompt: str, format: str | None, _memoized: dict[str, t.Any], **_: t.Any) -> str | None:
|
||||
"""Get the default prompt used by OpenLLM."""
|
||||
module = openllm.utils.EnvVarMixin(model_name).module
|
||||
module = getattr(openllm_core.config, f'configuration_{model_name}')
|
||||
_memoized = {k: v[0] for k, v in _memoized.items() if v}
|
||||
try:
|
||||
template = getattr(module, 'DEFAULT_PROMPT_TEMPLATE', None)
|
||||
@@ -54,15 +53,11 @@ def cli(ctx: click.Context, /, model_name: str, prompt: str, format: str | None,
|
||||
try:
|
||||
# backward-compatible. TO BE REMOVED once every model has default system message and prompt template.
|
||||
fully_formatted = process_prompt(prompt, _prompt_template, True, **_memoized)
|
||||
except RuntimeError:
|
||||
except RuntimeError as err:
|
||||
logger.debug('Exception caught while formatting prompt: %s', err)
|
||||
fully_formatted = openllm.AutoConfig.for_model(model_name).sanitize_parameters(prompt, prompt_template=_prompt_template)[0]
|
||||
if machine: return repr(fully_formatted)
|
||||
elif output == 'porcelain': termui.echo(repr(fully_formatted), fg='white')
|
||||
elif output == 'json':
|
||||
termui.echo(orjson.dumps({'prompt': fully_formatted}, option=orjson.OPT_INDENT_2).decode(), fg='white')
|
||||
else:
|
||||
termui.echo(f'== Prompt for {model_name} ==\n', fg='magenta')
|
||||
termui.echo(fully_formatted, fg='white')
|
||||
except AttributeError:
|
||||
raise click.ClickException(f'Failed to determine a default prompt template for {model_name}.') from None
|
||||
termui.echo(orjson.dumps({'prompt': fully_formatted}, option=orjson.OPT_INDENT_2).decode(), fg='white')
|
||||
except Exception as err:
|
||||
traceback.print_exc()
|
||||
raise click.ClickException(f'Failed to determine a default prompt template for {model_name}.') from err
|
||||
ctx.exit(0)
|
||||
|
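The fallback above shows how the extension resolves a template; a small hedged sketch of the same `process_prompt` call outside the CLI, assuming the function substitutes the prompt into an `{instruction}` placeholder (the template string here is made up for illustration):

```python
from openllm_core.prompts import process_prompt

# Toy template; mirrors the process_prompt(prompt, template, True, **variables) call in the extension above.
template = '### Instruction:\n{instruction}\n\n### Response:\n'
print(process_prompt('Summarise this repository', template, True))
```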
||||
@@ -9,13 +9,10 @@ import openllm
|
||||
|
||||
from bentoml._internal.utils import human_readable_size
|
||||
from openllm.cli import termui
|
||||
from openllm.cli._factory import LiteralOutput
|
||||
from openllm.cli._factory import output_option
|
||||
|
||||
@click.command('list_bentos', context_settings=termui.CONTEXT_SETTINGS)
|
||||
@output_option(default_value='json')
|
||||
@click.pass_context
|
||||
def cli(ctx: click.Context, output: LiteralOutput) -> None:
|
||||
def cli(ctx: click.Context) -> None:
|
||||
"""List available bentos built by OpenLLM."""
|
||||
mapping = {
|
||||
k: [{
|
||||
@@ -29,13 +26,5 @@ def cli(ctx: click.Context, output: LiteralOutput) -> None:
|
||||
k in i.info.labels for k in {'start_name', 'bundler'})) if b.info.labels['start_name'] == k] for k in tuple(inflection.dasherize(key) for key in openllm.CONFIG_MAPPING.keys())
|
||||
}
|
||||
mapping = {k: v for k, v in mapping.items() if v}
|
||||
if output == 'pretty':
|
||||
import tabulate
|
||||
tabulate.PRESERVE_WHITESPACE = True
|
||||
termui.echo(tabulate.tabulate([(k, i['tag'], i['size'], [_['tag'] for _ in i['models']]) for k, v in mapping.items() for i in v],
|
||||
tablefmt='fancy_grid',
|
||||
headers=['LLM', 'Tag', 'Size', 'Models']),
|
||||
fg='white')
|
||||
else:
|
||||
termui.echo(orjson.dumps(mapping, option=orjson.OPT_INDENT_2).decode(), fg='white')
|
||||
termui.echo(orjson.dumps(mapping, option=orjson.OPT_INDENT_2).decode(), fg='white')
|
||||
ctx.exit(0)
|
||||
|
||||
@@ -10,18 +10,15 @@ import openllm
|
||||
|
||||
from bentoml._internal.utils import human_readable_size
|
||||
from openllm.cli import termui
|
||||
from openllm.cli._factory import LiteralOutput
|
||||
from openllm.cli._factory import model_complete_envvar
|
||||
from openllm.cli._factory import model_name_argument
|
||||
from openllm.cli._factory import output_option
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
from openllm_core._typing_compat import DictStrAny
|
||||
|
||||
@click.command('list_models', context_settings=termui.CONTEXT_SETTINGS)
|
||||
@model_name_argument(required=False, shell_complete=model_complete_envvar)
|
||||
@output_option(default_value='json')
|
||||
def cli(model_name: str | None, output: LiteralOutput) -> DictStrAny:
|
||||
def cli(model_name: str | None) -> DictStrAny:
|
||||
"""This is equivalent to openllm models --show-available less the nice table."""
|
||||
models = tuple(inflection.dasherize(key) for key in openllm.CONFIG_MAPPING.keys())
|
||||
ids_in_local_store = {
|
||||
@@ -32,10 +29,5 @@ def cli(model_name: str | None, output: LiteralOutput) -> DictStrAny:
|
||||
ids_in_local_store = {k: [i for i in v if 'model_name' in i.info.labels and i.info.labels['model_name'] == inflection.dasherize(model_name)] for k, v in ids_in_local_store.items()}
|
||||
ids_in_local_store = {k: v for k, v in ids_in_local_store.items() if v}
|
||||
local_models = {k: [{'tag': str(i.tag), 'size': human_readable_size(openllm.utils.calc_dir_size(i.path))} for i in val] for k, val in ids_in_local_store.items()}
|
||||
if output == 'pretty':
|
||||
import tabulate
|
||||
tabulate.PRESERVE_WHITESPACE = True
|
||||
termui.echo(tabulate.tabulate([(k, i['tag'], i['size']) for k, v in local_models.items() for i in v], tablefmt='fancy_grid', headers=['LLM', 'Tag', 'Size']), fg='white')
|
||||
else:
|
||||
termui.echo(orjson.dumps(local_models, option=orjson.OPT_INDENT_2).decode(), fg='white')
|
||||
termui.echo(orjson.dumps(local_models, option=orjson.OPT_INDENT_2).decode(), fg='white')
|
||||
return local_models
|
||||
|
||||
@@ -1,20 +1,65 @@
|
||||
from __future__ import annotations
|
||||
import enum
|
||||
import functools
|
||||
import logging
|
||||
import os
|
||||
import typing as t
|
||||
|
||||
import click
|
||||
import inflection
|
||||
import orjson
|
||||
|
||||
import openllm
|
||||
from openllm_core._typing_compat import DictStrAny
|
||||
from openllm_core.utils import get_debug_mode
|
||||
from openllm_core.utils import get_quiet_mode
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
from openllm_core._typing_compat import DictStrAny
|
||||
logger = logging.getLogger('openllm')
|
||||
|
||||
def echo(text: t.Any, fg: str = 'green', _with_style: bool = True, **attrs: t.Any) -> None:
|
||||
attrs['fg'] = fg if not openllm.utils.get_debug_mode() else None
|
||||
if not openllm.utils.get_quiet_mode():
|
||||
t.cast(t.Callable[..., None], click.echo if not _with_style else click.secho)(text, **attrs)
|
||||
class Level(enum.IntEnum):
|
||||
NOTSET = logging.DEBUG
|
||||
DEBUG = logging.DEBUG
|
||||
INFO = logging.INFO
|
||||
WARNING = logging.WARNING
|
||||
ERROR = logging.ERROR
|
||||
CRITICAL = logging.CRITICAL
|
||||
|
||||
@property
|
||||
def color(self) -> str | None:
|
||||
return {Level.NOTSET: None, Level.DEBUG: 'cyan', Level.INFO: 'green', Level.WARNING: 'yellow', Level.ERROR: 'red', Level.CRITICAL: 'red'}[self]
|
||||
|
||||
class JsonLog(t.TypedDict):
|
||||
log_level: Level
|
||||
content: str
|
||||
|
||||
def log(content: str, level: Level = Level.INFO, fg: str | None = None) -> None:
|
||||
def caller(text: str) -> None:
|
||||
if get_debug_mode(): logger.log(level.value, text)
|
||||
else: echo(JsonLog(log_level=level, content=content), json=True, fg=fg)
|
||||
|
||||
caller(orjson.dumps(JsonLog(log_level=level, content=content)).decode())
|
||||
|
||||
warning = functools.partial(log, level=Level.WARNING)
|
||||
error = functools.partial(log, level=Level.ERROR)
|
||||
critical = functools.partial(log, level=Level.CRITICAL)
|
||||
debug = functools.partial(log, level=Level.DEBUG)
|
||||
info = functools.partial(log, level=Level.INFO)
|
||||
notset = functools.partial(log, level=Level.NOTSET)
|
||||
|
||||
def echo(text: t.Any, fg: str | None = None, _with_style: bool = True, json: bool = False, **attrs: t.Any) -> None:
|
||||
if json and not isinstance(text, dict): raise TypeError('text must be a dict')
|
||||
if json:
|
||||
if 'content' in text and 'log_level' in text:
|
||||
content = t.cast(DictStrAny, text)['content']
|
||||
fg = t.cast(Level, text['log_level']).color
|
||||
else:
|
||||
content = orjson.dumps(text).decode()
|
||||
fg = Level.INFO.color if not get_debug_mode() else Level.DEBUG.color
|
||||
else:
|
||||
content = t.cast(str, text)
|
||||
attrs['fg'] = fg if not get_debug_mode() else None
|
||||
|
||||
if not get_quiet_mode(): t.cast(t.Callable[..., None], click.echo if not _with_style else click.secho)(content, **attrs)
|
||||
|
||||
COLUMNS: int = int(os.environ.get('COLUMNS', str(120)))
|
||||
CONTEXT_SETTINGS: DictStrAny = {'help_option_names': ['-h', '--help'], 'max_content_width': COLUMNS, 'token_normalize_func': inflection.underscore}
|
||||
__all__ = ['echo', 'COLUMNS', 'CONTEXT_SETTINGS']
|
||||
__all__ = ['echo', 'COLUMNS', 'CONTEXT_SETTINGS', 'log', 'warning', 'error', 'critical', 'debug', 'info', 'Level']
|
||||
|
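The new `termui` surface replaces ad-hoc `click.secho` calls with levelled helpers; a short sketch of how callers can use it, based on the signatures above (the messages themselves are illustrative):

```python
from openllm.cli import termui

# Plain, styled output (colour is dropped automatically when debug mode is enabled).
termui.echo('building bento...', fg='green')

# Structured output: dicts are serialised with orjson and coloured by log level.
termui.echo({'tag': 'llama--7b', 'backend': 'pt'}, json=True)

# Levelled helpers wrap log(); with debug mode on they route through the 'openllm' logger instead.
termui.warning('low disk space detected while exporting the model')
termui.info('model imported successfully')
```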
||||
@@ -47,7 +47,7 @@ responses:
|
||||
example:
|
||||
object: 'list'
|
||||
data:
|
||||
- id: meta-llama--Llama-2-13-chat-hf
|
||||
- id: meta-llama--Llama-2-13b-chat-hf
|
||||
object: model
|
||||
created: 1686935002
|
||||
owned_by: 'na'
|
||||
@@ -81,7 +81,7 @@ requestBody:
|
||||
content: You are a helpful assistant.
|
||||
- role: user
|
||||
content: Hello, I'm looking for a chatbot that can help me with my work.
|
||||
model: meta-llama--Llama-2-13-chat-hf
|
||||
model: meta-llama--Llama-2-13b-chat-hf
|
||||
max_tokens: 256
|
||||
temperature: 0.7
|
||||
top_p: 0.43
|
||||
@@ -95,7 +95,7 @@ requestBody:
|
||||
content: You are a helpful assistant.
|
||||
- role: user
|
||||
content: Hello, I'm looking for a chatbot that can help me with my work.
|
||||
model: meta-llama--Llama-2-13-chat-hf
|
||||
model: meta-llama--Llama-2-13b-chat-hf
|
||||
max_tokens: 256
|
||||
temperature: 0.7
|
||||
top_p: 0.43
|
||||
@@ -133,7 +133,7 @@ responses:
|
||||
value: >
|
||||
{
|
||||
"error": {
|
||||
"message": "Model 'meta-llama--Llama-2-13-chat-hf' does not exists. Try 'GET /v1/models' to see available models.\\nTip: If you are migrating from OpenAI, make sure to update your 'model' parameters in the request.",
|
||||
"message": "Model 'meta-llama--Llama-2-13b-chat-hf' does not exists. Try 'GET /v1/models' to see available models.\\nTip: If you are migrating from OpenAI, make sure to update your 'model' parameters in the request.",
|
||||
"type": "invalid_request_error",
|
||||
"object": "error",
|
||||
"param": null,
|
||||
@@ -218,7 +218,7 @@ requestBody:
|
||||
summary: One-shot input example
|
||||
value:
|
||||
prompt: This is a test
|
||||
model: meta-llama--Llama-2-13-chat-hf
|
||||
model: meta-llama--Llama-2-13b-chat-hf
|
||||
max_tokens: 256
|
||||
temperature: 0.7
|
||||
logprobs: 1
|
||||
@@ -229,7 +229,7 @@ requestBody:
|
||||
summary: Streaming input example
|
||||
value:
|
||||
prompt: This is a test
|
||||
model: meta-llama--Llama-2-13-chat-hf
|
||||
model: meta-llama--Llama-2-13b-chat-hf
|
||||
max_tokens: 256
|
||||
temperature: 0.7
|
||||
top_p: 0.43
|
||||
@@ -286,7 +286,7 @@ responses:
|
||||
value: >
|
||||
{
|
||||
"error": {
|
||||
"message": "Model 'meta-llama--Llama-2-13-chat-hf' does not exists. Try 'GET /v1/models' to see available models.\\nTip: If you are migrating from OpenAI, make sure to update your 'model' parameters in the request.",
|
||||
"message": "Model 'meta-llama--Llama-2-13b-chat-hf' does not exists. Try 'GET /v1/models' to see available models.\\nTip: If you are migrating from OpenAI, make sure to update your 'model' parameters in the request.",
|
||||
"type": "invalid_request_error",
|
||||
"object": "error",
|
||||
"param": null,
|
||||
|
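These OpenAPI examples document the OpenAI-compatible surface; a hedged sketch of exercising it over plain HTTP with `requests`, where the base URL is illustrative and the `/v1/chat/completions` path is assumed from the OpenAI-style schema above:

```python
import requests

payload = {
  'model': 'meta-llama--Llama-2-13b-chat-hf',
  'messages': [
    {'role': 'system', 'content': 'You are a helpful assistant.'},
    {'role': 'user', 'content': "Hello, I'm looking for a chatbot that can help me with my work."},
  ],
  'max_tokens': 256,
  'temperature': 0.7,
  'top_p': 0.43,
}
# Assumes an OpenLLM server with the OpenAI-compatible app mounted at this address.
resp = requests.post('http://localhost:3000/v1/chat/completions', json=payload, timeout=30)
resp.raise_for_status()
print(resp.json())
```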
||||
@@ -57,7 +57,6 @@ else:
|
||||
model_args, training_args = t.cast(t.Tuple[ModelArguments, TrainingArguments], parser.parse_args_into_dataclasses())
|
||||
|
||||
llm = openllm.LLM(model_args.model_id, quantize="int4", bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.float16)
|
||||
llm.save_pretrained()
|
||||
model, tokenizer = llm.prepare_for_training(adapter_type="lora",
|
||||
lora_alpha=16,
|
||||
lora_dropout=0.1,
|
||||
|
||||
@@ -164,7 +164,7 @@ else:
|
||||
model_args, training_args = t.cast(t.Tuple[ModelArguments, TrainingArguments], parser.parse_args_into_dataclasses())
|
||||
|
||||
# import the model first hand
|
||||
openllm.import_model("llama", model_id=model_args.model_id, model_version=model_args.model_version)
|
||||
openllm.import_model(model_id=model_args.model_id, model_version=model_args.model_version)
|
||||
|
||||
def train_loop(model_args: ModelArguments, training_args: TrainingArguments):
|
||||
import peft
|
||||
|
||||
@@ -56,7 +56,6 @@ else:
|
||||
model_args, training_args = t.cast(t.Tuple[ModelArguments, TrainingArguments], parser.parse_args_into_dataclasses())
|
||||
|
||||
llm = openllm.LLM(model_args.model_id, quantize="int8")
|
||||
llm.save_pretrained()
|
||||
model, tokenizer = llm.prepare_for_training(adapter_type="lora", r=16, lora_alpha=32, target_modules=["q_proj", "v_proj"], lora_dropout=0.05, bias="none")
|
||||
|
||||
# ft on english_quotes
|
||||
|
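The playground snippet stops right before the training loop; one way the int8 LoRA example could continue, sketched with stock `transformers`/`datasets` APIs and reusing the `model`/`tokenizer` returned above (the dataset id, column name, and hyper-parameters are illustrative, not part of this commit):

```python
import datasets
import transformers

# Illustrative dataset; the comment above only says the example fine-tunes on "english_quotes".
data = datasets.load_dataset('Abirate/english_quotes')
data = data.map(lambda samples: tokenizer(samples['quote']), batched=True)

trainer = transformers.Trainer(
  model=model,  # the PEFT-wrapped model returned by llm.prepare_for_training(...)
  train_dataset=data['train'],
  args=transformers.TrainingArguments(output_dir='outputs', per_device_train_batch_size=4, max_steps=200, learning_rate=2e-4, fp16=True, logging_steps=10),
  data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
model.config.use_cache = False  # avoid cache warnings during training; re-enable for inference
trainer.train()
```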
||||
@@ -16,7 +16,7 @@ _conversion_strategy = {'pt': 'ggml'}
|
||||
def import_model(llm: openllm.LLM[t.Any, t.Any], *decls: t.Any, trust_remote_code: bool = True, **attrs: t.Any,) -> bentoml.Model:
|
||||
raise NotImplementedError('Currently work in progress.')
|
||||
|
||||
def get(llm: openllm.LLM[t.Any, t.Any], auto_import: bool = False) -> bentoml.Model:
|
||||
def get(llm: openllm.LLM[t.Any, t.Any]) -> bentoml.Model:
|
||||
raise NotImplementedError('Currently work in progress.')
|
||||
|
||||
def load_model(llm: openllm.LLM[M, t.Any], *decls: t.Any, **attrs: t.Any) -> M:
|
||||
|
||||
@@ -6,6 +6,8 @@ import typing as t
|
||||
|
||||
import attr
|
||||
import orjson
|
||||
import torch
|
||||
import transformers
|
||||
|
||||
from huggingface_hub import snapshot_download
|
||||
from simple_di import Provide
|
||||
@@ -13,8 +15,6 @@ from simple_di import inject
|
||||
|
||||
import bentoml
|
||||
import openllm
|
||||
import torch
|
||||
import transformers
|
||||
|
||||
from bentoml._internal.configuration.containers import BentoMLContainer
|
||||
from bentoml._internal.models.model import ModelOptions
|
||||
@@ -29,6 +29,7 @@ from .weights import HfIgnore
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
import types
|
||||
|
||||
from bentoml._internal.models import ModelStore
|
||||
from openllm_core._typing_compat import DictStrAny
|
||||
|
||||
@@ -124,7 +125,7 @@ def import_model(llm: openllm.LLM[M, T], *decls: t.Any, trust_remote_code: bool,
|
||||
del model
|
||||
return bentomodel
|
||||
|
||||
def get(llm: openllm.LLM[M, T], auto_import: bool = False) -> bentoml.Model:
|
||||
def get(llm: openllm.LLM[M, T]) -> bentoml.Model:
|
||||
try:
|
||||
model = bentoml.models.get(llm.tag)
|
||||
backend = model.info.labels['backend']
|
||||
@@ -132,7 +133,6 @@ def get(llm: openllm.LLM[M, T], auto_import: bool = False) -> bentoml.Model:
|
||||
_patch_correct_tag(llm, transformers.AutoConfig.from_pretrained(model.path, trust_remote_code=llm.trust_remote_code), _revision=model.info.metadata.get('_revision'))
|
||||
return model
|
||||
except Exception as err:
|
||||
if auto_import: return import_model(llm, trust_remote_code=llm.trust_remote_code)
|
||||
raise openllm.exceptions.OpenLLMException(f'Failed while getting stored artefact (lookup for traceback):\n{err}') from err
|
||||
|
||||
def load_model(llm: openllm.LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M:
|
||||
|
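With `auto_import` removed from `get()`, the get-or-import decision moves to the caller; a minimal sketch of that pattern, assuming this module is importable as `openllm.serialisation.transformers` (path inferred from the package layout) and using an illustrative model id:

```python
import openllm
from openllm.serialisation import transformers as serde  # import path assumed

llm = openllm.LLM('facebook/opt-125m')  # illustrative model id
try:
  bentomodel = serde.get(llm)  # raises OpenLLMException if nothing matching llm.tag is in the local store
except openllm.exceptions.OpenLLMException:
  bentomodel = serde.import_model(llm, trust_remote_code=llm.trust_remote_code)
print(bentomodel.tag)
```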
||||
@@ -1,11 +1,11 @@
|
||||
from __future__ import annotations
|
||||
import copy, re
|
||||
from pathlib import Path
|
||||
import copy
|
||||
import typing as t
|
||||
|
||||
import openllm
|
||||
import transformers
|
||||
import torch
|
||||
import transformers
|
||||
|
||||
import openllm
|
||||
|
||||
from openllm.serialisation.constants import FRAMEWORK_TO_AUTOCLASS_MAPPING
|
||||
from openllm.serialisation.constants import HUB_ATTRS
|
||||
|
||||
@@ -1,18 +1,40 @@
|
||||
from __future__ import annotations
|
||||
import traceback
|
||||
import typing as t
|
||||
|
||||
import attr
|
||||
|
||||
from huggingface_hub import HfApi
|
||||
|
||||
from openllm_core.exceptions import Error
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
from huggingface_hub.hf_api import ModelInfo as HfModelInfo
|
||||
|
||||
import openllm
|
||||
|
||||
from openllm_core._typing_compat import M
|
||||
from openllm_core._typing_compat import T
|
||||
|
||||
__global_inst__ = None
|
||||
__cached_id__: dict[str, HfModelInfo] = dict()
|
||||
|
||||
def Client() -> HfApi:
|
||||
global __global_inst__ # noqa: PLW0603
|
||||
if __global_inst__ is None: __global_inst__ = HfApi()
|
||||
return __global_inst__
|
||||
|
||||
def ModelInfo(model_id: str, revision: str | None = None) -> HfModelInfo:
|
||||
if model_id in __cached_id__: return __cached_id__[model_id]
|
||||
try:
|
||||
__cached_id__[model_id] = Client().model_info(model_id, revision=revision)
|
||||
return __cached_id__[model_id]
|
||||
except Exception as err:
|
||||
traceback.print_exc()
|
||||
raise Error(f'Failed to fetch {model_id} from huggingface.co') from err
|
||||
|
||||
def has_safetensors_weights(model_id: str, revision: str | None = None) -> bool:
|
||||
return any(s.rfilename.endswith('.safetensors') for s in HfApi().model_info(model_id, revision=revision).siblings)
|
||||
return any(s.rfilename.endswith('.safetensors') for s in ModelInfo(model_id, revision=revision).siblings)
|
||||
|
||||
@attr.define(slots=True)
|
||||
class HfIgnore:
|
||||
|
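The hub helpers above memoise `HfApi.model_info` per model id; a brief sketch of how they can be used, assuming the module path is the `weights` module referenced by the serialisation code (the model id is illustrative):

```python
# Assumed import path for the helpers defined above.
from openllm.serialisation.transformers.weights import ModelInfo, has_safetensors_weights

# First call hits huggingface.co; the result is cached in __cached_id__ keyed by model id.
info = ModelInfo('facebook/opt-125m')
print(info.sha, [s.rfilename for s in info.siblings][:5])

# Subsequent lookups (including this safetensors check) reuse the cached ModelInfo.
print(has_safetensors_weights('facebook/opt-125m'))
```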
||||
@@ -4,14 +4,64 @@ User can import these function for convenience, but
|
||||
we won't ensure backward compatibility for these functions. So use with caution.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import functools
|
||||
import typing as t
|
||||
|
||||
import functools
|
||||
import openllm_core
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
import openllm
|
||||
|
||||
from openllm_core.utils import DEBUG as DEBUG
|
||||
from openllm_core.utils import DEBUG_ENV_VAR as DEBUG_ENV_VAR
|
||||
from openllm_core.utils import DEV_DEBUG_VAR as DEV_DEBUG_VAR
|
||||
from openllm_core.utils import ENV_VARS_TRUE_VALUES as ENV_VARS_TRUE_VALUES
|
||||
from openllm_core.utils import MYPY as MYPY
|
||||
from openllm_core.utils import OPTIONAL_DEPENDENCIES as OPTIONAL_DEPENDENCIES
|
||||
from openllm_core.utils import QUIET_ENV_VAR as QUIET_ENV_VAR
|
||||
from openllm_core.utils import SHOW_CODEGEN as SHOW_CODEGEN
|
||||
from openllm_core.utils import LazyLoader as LazyLoader
|
||||
from openllm_core.utils import LazyModule as LazyModule
|
||||
from openllm_core.utils import ReprMixin as ReprMixin
|
||||
from openllm_core.utils import VersionInfo as VersionInfo
|
||||
from openllm_core.utils import analytics as analytics
|
||||
from openllm_core.utils import calc_dir_size as calc_dir_size
|
||||
from openllm_core.utils import check_bool_env as check_bool_env
|
||||
from openllm_core.utils import codegen as codegen
|
||||
from openllm_core.utils import configure_logging as configure_logging
|
||||
from openllm_core.utils import dantic as dantic
|
||||
from openllm_core.utils import field_env_key as field_env_key
|
||||
from openllm_core.utils import first_not_none as first_not_none
|
||||
from openllm_core.utils import flatten_attrs as flatten_attrs
|
||||
from openllm_core.utils import gen_random_uuid as gen_random_uuid
|
||||
from openllm_core.utils import generate_context as generate_context
|
||||
from openllm_core.utils import generate_hash_from_file as generate_hash_from_file
|
||||
from openllm_core.utils import get_debug_mode as get_debug_mode
|
||||
from openllm_core.utils import get_quiet_mode as get_quiet_mode
|
||||
from openllm_core.utils import in_notebook as in_notebook
|
||||
from openllm_core.utils import is_autoawq_available as is_autoawq_available
|
||||
from openllm_core.utils import is_autogptq_available as is_autogptq_available
|
||||
from openllm_core.utils import is_bentoml_available as is_bentoml_available
|
||||
from openllm_core.utils import is_bitsandbytes_available as is_bitsandbytes_available
|
||||
from openllm_core.utils import is_grpc_available as is_grpc_available
|
||||
from openllm_core.utils import is_jupyter_available as is_jupyter_available
|
||||
from openllm_core.utils import is_jupytext_available as is_jupytext_available
|
||||
from openllm_core.utils import is_notebook_available as is_notebook_available
|
||||
from openllm_core.utils import is_optimum_supports_gptq as is_optimum_supports_gptq
|
||||
from openllm_core.utils import is_peft_available as is_peft_available
|
||||
from openllm_core.utils import is_torch_available as is_torch_available
|
||||
from openllm_core.utils import is_transformers_available as is_transformers_available
|
||||
from openllm_core.utils import is_vllm_available as is_vllm_available
|
||||
from openllm_core.utils import lenient_issubclass as lenient_issubclass
|
||||
from openllm_core.utils import reserve_free_port as reserve_free_port
|
||||
from openllm_core.utils import resolve_filepath as resolve_filepath
|
||||
from openllm_core.utils import resolve_user_filepath as resolve_user_filepath
|
||||
from openllm_core.utils import serde as serde
|
||||
from openllm_core.utils import set_debug_mode as set_debug_mode
|
||||
from openllm_core.utils import set_quiet_mode as set_quiet_mode
|
||||
from openllm_core.utils import validate_is_path as validate_is_path
|
||||
from openllm_core.utils.serde import converter as converter
|
||||
|
||||
def generate_labels(llm: openllm.LLM[t.Any, t.Any]) -> dict[str, t.Any]:
|
||||
return {'backend': llm.__llm_backend__, 'framework': 'openllm', 'model_name': llm.config['model_name'], 'architecture': llm.config['architecture'], 'serialisation': llm._serialisation}
|
||||
|
||||
|
||||
@@ -10,8 +10,6 @@ from openllm_core._configuration import ModelSettings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
env_strats = st.sampled_from([openllm.utils.EnvVarMixin(model_name) for model_name in openllm.CONFIG_MAPPING.keys()])
|
||||
|
||||
@st.composite
|
||||
def model_settings(draw: st.DrawFn):
|
||||
"""Strategy for generating ModelSettings objects."""
|
||||
|
||||
@@ -161,8 +161,6 @@ def _local_handle(model: str, model_id: str, image_tag: str, deployment_mode: t.
|
||||
|
||||
@contextlib.contextmanager
|
||||
def _container_handle(model: str, model_id: str, image_tag: str, deployment_mode: t.Literal['container', 'local'], quantize: LiteralQuantise | None = None, *, _serve_grpc: bool = False):
|
||||
envvar = openllm.utils.EnvVarMixin(model)
|
||||
|
||||
with openllm.utils.reserve_free_port() as port, openllm.utils.reserve_free_port() as prom_port:
|
||||
pass
|
||||
container_name = f'openllm-{model}-{self(model_id)}'.replace('-', '_')
|
||||
@@ -179,8 +177,7 @@ def _container_handle(model: str, model_id: str, image_tag: str, deployment_mode
|
||||
|
||||
env: DictStrAny = {}
|
||||
|
||||
if quantize is not None:
|
||||
env[envvar.quantize] = quantize
|
||||
if quantize is not None: env['OPENLLM_QUANTIZE'] = quantize
|
||||
|
||||
gpus = openllm.utils.device_count() or -1
|
||||
devs = [docker.types.DeviceRequest(count=gpus, capabilities=[['gpu']])] if gpus > 0 else None
|
||||
@@ -195,8 +192,7 @@ def _container_handle(model: str, model_id: str, image_tag: str, deployment_mode
|
||||
ports={
|
||||
'3000/tcp': port,
|
||||
'3001/tcp': prom_port
|
||||
},
|
||||
)
|
||||
})
|
||||
|
||||
yield DockerHandle(client, container.name, port, deployment_mode)
|
||||
|
||||
|
||||
@@ -4,7 +4,6 @@ import os
|
||||
import typing as t
|
||||
|
||||
import pytest
|
||||
import transformers
|
||||
|
||||
import openllm
|
||||
|
||||
@@ -28,7 +27,7 @@ def test_general_build_with_internal_testing():
|
||||
bento = openllm.build('flan-t5', model_id=HF_INTERNAL_T5_TESTING)
|
||||
|
||||
assert llm.llm_type == bento.info.labels['_type']
|
||||
assert llm.config['env']['backend_value'] == bento.info.labels['_framework']
|
||||
assert llm.__llm_backend__ == bento.info.labels['_framework']
|
||||
|
||||
bento = openllm.build('flan-t5', model_id=HF_INTERNAL_T5_TESTING)
|
||||
assert len(bento_store.list(bento.tag)) == 1
|
||||
@@ -37,13 +36,9 @@ def test_general_build_with_internal_testing():
|
||||
def test_general_build_from_local(tmp_path_factory: pytest.TempPathFactory):
|
||||
local_path = tmp_path_factory.mktemp('local_t5')
|
||||
llm = openllm.LLM(model_id=HF_INTERNAL_T5_TESTING, serialisation='legacy')
|
||||
llm.save_pretrained()
|
||||
|
||||
if isinstance(llm.model, transformers.Pipeline):
|
||||
llm.model.save_pretrained(str(local_path))
|
||||
else:
|
||||
llm.model.save_pretrained(str(local_path))
|
||||
llm.tokenizer.save_pretrained(str(local_path))
|
||||
llm.model.save_pretrained(str(local_path))
|
||||
llm.tokenizer.save_pretrained(str(local_path))
|
||||
|
||||
assert openllm.build('flan-t5', model_id=local_path.resolve().__fspath__(), model_version='local')
|
||||
|
||||
|
||||
@@ -160,25 +160,8 @@ ignore_patterns = [
|
||||
"openllm-python/src/openllm/playground",
|
||||
"openllm-python/src/openllm/models/__init__.py",
|
||||
"openllm-client/src/openllm_client/pb/**",
|
||||
"examples/openai_client.py"
|
||||
]
|
||||
|
||||
[tool.yapf]
|
||||
BASED_ON_STYLE = "google"
|
||||
INDENT_WIDTH = 2
|
||||
JOIN_MULTIPLE_LINES = true
|
||||
COLUMN_LIMIT = 192
|
||||
USE_TABS = false
|
||||
BLANK_LINES_AROUND_TOP_LEVEL_DEFINITION = 1
|
||||
BLANK_LINES_BETWEEN_TOP_LEVEL_IMPORTS_AND_VARIABLES = 1
|
||||
DISABLE_ENDING_COMMA_HEURISTIC = true
|
||||
BLANK_LINE_BEFORE_CLASS_DOCSTRING = false
|
||||
BLANK_LINE_BEFORE_MODULE_DOCSTRING = false
|
||||
BLANK_LINE_BEFORE_NESTED_CLASS_OR_DEF = false
|
||||
ALIGN_CLOSING_BRACKET_WITH_VISUAL_INDENT = true
|
||||
ALLOW_MULTILINE_DICTIONARY_KEYS = false
|
||||
ALLOW_SPLIT_BEFORE_DICT_VALUE = false
|
||||
COALESCE_BRACKETS = true
|
||||
|
||||
[tool.pytest.ini_options]
|
||||
addopts = ["-rfEX", "-pno:warnings", "--snapshot-warn-unused"]
|
||||
|
||||
@@ -125,7 +125,6 @@ _BASE_DEPENDENCIES = [
|
||||
Dependencies(name='optimum', lower_constraint='1.12.0'),
|
||||
Dependencies(name='accelerate'),
|
||||
Dependencies(name='ghapi'),
|
||||
Dependencies(name='tabulate', extensions=['widechars'], lower_constraint='0.9.0'),
|
||||
Dependencies(name='click', lower_constraint='8.1.3'),
|
||||
Dependencies(name='cuda-python', platform=('Darwin', 'ne')),
|
||||
Dependencies(name='bitsandbytes', upper_constraint='0.42'), # 0.41 works with CUDA 11.8
|
||||
|
||||
@@ -66,7 +66,6 @@ _value_docstring = {
|
||||
`model_name` and `start_name` must be specified.''',
|
||||
'model_name': 'The normalized version of __openllm_start_name__, determined by __openllm_name_type__',
|
||||
'start_name': 'Default name to be used with `openllm start`',
|
||||
'env': 'A EnvVarMixin instance for this LLMConfig.',
|
||||
'timeout': 'The default timeout to be set for this given LLM.',
|
||||
'workers_per_resource': '''The number of workers per resource. This is used to determine the number of workers to use for this model.
|
||||
For example, if this is set to 0.5, then OpenLLM will use 1 worker per 2 resources. If this is set to 1, then
|
||||
|
||||