From ef11e54a6df46c2d2b149409c97509bf1e429ce4 Mon Sep 17 00:00:00 2001 From: paperspace <29749331+aarnphm@users.noreply.github.com> Date: Wed, 29 May 2024 03:19:47 +0000 Subject: [PATCH] chore: update docs and base instruction [skip ci] Signed-off-by: paperspace <29749331+aarnphm@users.noreply.github.com> --- README.md | 203 +----------------- examples/openai_chat_completion_client.py | 34 +-- openllm-core/src/openllm_core/__init__.py | 1 - .../src/openllm_core/config/__init__.py | 1 - .../openllm_core/config/configuration_auto.py | 5 - .../config/configuration_flan_t5.py | 45 ---- openllm-python/README.md | 203 +----------------- openllm-python/pyproject.toml | 3 +- openllm-python/src/openllm/__init__.pyi | 2 +- 9 files changed, 28 insertions(+), 469 deletions(-) delete mode 100644 openllm-core/src/openllm_core/config/configuration_flan_t5.py diff --git a/README.md b/README.md index 040f9ea2..c1949c1e 100644 --- a/README.md +++ b/README.md @@ -68,10 +68,10 @@ $ openllm -h ### Start a LLM server -OpenLLM allows you to quickly spin up an LLM server using `openllm start`. For example, to start a [Llama 3 8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) server, run the following: +OpenLLM allows you to quickly spin up an LLM server using `openllm start`. For example, to start a [Phi-3](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct) server, run the following: ```bash -openllm start meta-llama/Meta-Llama-3-8B +openllm start microsoft/Phi-3-mini-4k-instruct --trust-remote-code ``` To interact with the server, you can visit the web UI at [http://0.0.0.0:3000/](http://0.0.0.0:3000/) or send a request using `curl`. You can also use OpenLLM’s built-in Python client to interact with the server: @@ -89,12 +89,6 @@ OpenLLM seamlessly supports many models and their variants. You can specify diff openllm start -- ``` -> [!NOTE] -> OpenLLM supports specifying fine-tuning weights and quantized weights -> for any of the supported models as long as they can be loaded with the model -> architecture. Use the `openllm models` command to see the complete list of supported -> models, their architectures, and their variants. - ## 🧩 Supported models OpenLLM currently supports the following models. By default, OpenLLM doesn't include dependencies to run all models. The extra model-specific dependencies can be installed with the instructions below. @@ -313,43 +307,6 @@ You can specify any of the following Falcon models via `openllm start`:
-FlanT5 - - -### Quickstart - -Run the following command to quickly spin up a FlanT5 server: - -```bash -TRUST_REMOTE_CODE=True openllm start google/flan-t5-large -``` -In a different terminal, run the following command to interact with the server: - -```bash -export OPENLLM_ENDPOINT=http://localhost:3000 -openllm query 'What are large language models?' -``` - - -> **Note:** Any FlanT5 variants can be deployed with OpenLLM. Visit the [HuggingFace Model Hub](https://huggingface.co/models?sort=trending&search=flan_t5) to see more FlanT5-compatible models. - - - -### Supported models - -You can specify any of the following FlanT5 models via `openllm start`: - - -- [google/flan-t5-small](https://huggingface.co/google/flan-t5-small) -- [google/flan-t5-base](https://huggingface.co/google/flan-t5-base) -- [google/flan-t5-large](https://huggingface.co/google/flan-t5-large) -- [google/flan-t5-xl](https://huggingface.co/google/flan-t5-xl) -- [google/flan-t5-xxl](https://huggingface.co/google/flan-t5-xxl) - -
- -
- Gemma @@ -883,39 +840,6 @@ You can specify any of the following Yi models via `openllm start`: More models will be integrated with OpenLLM and we welcome your contributions if you want to incorporate your custom LLMs into the ecosystem. Check out [Adding a New Model Guide](https://github.com/bentoml/OpenLLM/blob/main/ADDING_NEW_MODEL.md) to learn more. -## 💻 Run your model on multiple GPUs - -OpenLLM allows you to start your model server on multiple GPUs and specify the number of workers per resource assigned using the `--workers-per-resource` option. For example, if you have 4 available GPUs, you set the value as one divided by the number as only one instance of the Runner server will be spawned. - -```bash -TRUST_REMOTE_CODE=True openllm start microsoft/phi-2 --workers-per-resource 0.25 -``` - -> [!NOTE] -> The amount of GPUs required depends on the model size itself. -> You can use [the Model Memory Calculator from Hugging Face](https://huggingface.co/spaces/hf-accelerate/model-memory-usage) to -> calculate how much vRAM is needed to train and perform big model -> inference on a model and then plan your GPU strategy based on it. - -When using the `--workers-per-resource` option with the `openllm build` command, the environment variable is saved into the resulting Bento. - -For more information, see [Resource scheduling strategy](https://docs.bentoml.org/en/latest/guides/scheduling.html#). - -## 🛞 Runtime implementations - -Different LLMs may support multiple runtime implementations. Models that have `vLLM` (`vllm`) supports will use vLLM by default, otherwise it fallback to use `PyTorch` (`pt`). - -To specify a specific runtime for your chosen model, use the `--backend` option. For example: - -```bash -openllm start meta-llama/Llama-2-7b-chat-hf --backend vllm -``` - -Note: - -1. To use the vLLM backend, you need a GPU with at least the Ampere architecture or newer and CUDA version 11.8. -2. To see the backend options of each model supported by OpenLLM, see the Supported models section or run `openllm models`. - ## 📐 Quantization Quantization is a technique to reduce the storage and computation requirements for machine learning models, particularly during inference. By approximating floating-point numbers as integers (quantized values), quantization allows for faster computations, reduced memory footprint, and can make it feasible to deploy large models on resource-constrained devices. @@ -929,104 +853,8 @@ OpenLLM supports the following quantization techniques - [GPTQ: Accurate Post-Training Quantization](https://arxiv.org/abs/2210.17323) - [SqueezeLLM: Dense-and-Sparse Quantization](https://arxiv.org/abs/2306.07629). -### PyTorch backend - -With PyTorch backend, OpenLLM supports `int8`, `int4`, and `gptq`. - -For using int8 and int4 quantization through `bitsandbytes`, you can use the following command: - -```bash -TRUST_REMOTE_CODE=True openllm start microsoft/phi-2 --quantize int8 -``` - -To run inference with `gptq`, simply pass `--quantize gptq`: - -```bash -openllm start TheBloke/Llama-2-7B-Chat-GPTQ --quantize gptq -``` - > [!NOTE] -> In order to run GPTQ, make sure you run `pip install "openllm[gptq]"` -> first to install the dependency. From the GPTQ paper, it is recommended to quantized the weights before serving. -> See [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) for more information on GPTQ quantization. 
-
-### vLLM backend
-
-With vLLM backend, OpenLLM supports `awq`, `squeezellm`
-
-To run inference with `awq`, simply pass `--quantize awq`:
-
-```bash
-openllm start TheBloke/zephyr-7B-alpha-AWQ --quantize awq
-```
-
-To run inference with `squeezellm`, simply pass `--quantize squeezellm`:
-
-```bash
-openllm start squeeze-ai-lab/sq-llama-2-7b-w4-s0 --quantize squeezellm --serialization legacy
-```
-
-> [!IMPORTANT]
-> Since both `squeezellm` and `awq` are weight-aware quantization methods, meaning the quantization is done during training, all pre-trained weights needs to get quantized before inference time. Make sure to find compatible weights on HuggingFace Hub for your model of choice.
-
-## 🛠️ Serving fine-tuning layers
-
-[PEFT](https://huggingface.co/docs/peft/index), or Parameter-Efficient Fine-Tuning, is a methodology designed to fine-tune pre-trained models more efficiently. Instead of adjusting all model parameters, PEFT focuses on tuning only a subset, reducing computational and storage costs. [LoRA](https://huggingface.co/docs/peft/conceptual_guides/lora) (Low-Rank Adaptation) is one of the techniques supported by PEFT. It streamlines fine-tuning by using low-rank decomposition to represent weight updates, thereby drastically reducing the number of trainable parameters.
-
-With OpenLLM, you can take advantage of the fine-tuning feature by serving models with any PEFT-compatible layers using the `--adapter-id` option. For example:
-
-```bash
-openllm start facebook/opt-6.7b --adapter-id aarnphm/opt-6-7b-quotes:default
-```
-
-OpenLLM also provides flexibility by supporting adapters from custom file paths:
-
-```bash
-openllm start facebook/opt-6.7b --adapter-id /path/to/adapters:local_adapter
-```
-
-To use multiple adapters, use the following format:
-
-```bash
-openllm start facebook/opt-6.7b --adapter-id aarnphm/opt-6.7b-lora:default --adapter-id aarnphm/opt-6.7b-french:french_lora
-```
-
-By default, all adapters will be injected into the models during startup. Adapters can be specified per request via `adapter_name`:
-
-```bash
-curl -X 'POST' \
-  'http://localhost:3000/v1/generate' \
-  -H 'accept: application/json' \
-  -H 'Content-Type: application/json' \
-  -d '{
-  "prompt": "What is the meaning of life?",
-  "stop": [
-    "philosopher"
-  ],
-  "llm_config": {
-    "max_new_tokens": 256,
-    "temperature": 0.75,
-    "top_k": 15,
-    "top_p": 1
-  },
-  "adapter_name": "default"
-}'
-```
-
-To include this into the Bento, you can specify the `--adapter-id` option when using the `openllm build` command:
-
-```bash
-openllm build facebook/opt-6.7b --adapter-id ...
-```
-
-If you use a relative path for `--adapter-id`, you need to add `--build-ctx`.
-
-```bash
-openllm build facebook/opt-6.7b --adapter-id ./path/to/adapter_id --build-ctx .
-```
-
-> [!IMPORTANT]
-> Fine-tuning support is still experimental and currently only works with PyTorch backend. vLLM support is coming soon.
+> Make sure to use pre-quantized model weights when serving with `openllm start`.
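
A minimal sketch of the note above, reusing the GPTQ and AWQ repository IDs from the section this patch removes; whether an explicit `--quantize` flag is still needed depends on the OpenLLM version installed:

```bash
# Sketch: serve pre-quantized checkpoints from the Hugging Face Hub.
# Repository IDs are reused from the documentation removed above and are illustrative only.
openllm start TheBloke/Llama-2-7B-Chat-GPTQ   # GPTQ-quantized weights
openllm start TheBloke/zephyr-7B-alpha-AWQ    # AWQ-quantized weights
```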

## ⚙️ Integrations

@@ -1060,26 +888,7 @@ The compatible endpoints supports `/completions`, `/chat/completions`, and `/mod

### [LlamaIndex](https://docs.llamaindex.ai/en/stable/examples/llm/openllm/)

-To start a local LLM with `llama_index`, simply use `llama_index.llms.openllm.OpenLLM`:
-
-```python
-import asyncio
-from llama_index.llms.openllm import OpenLLM
-
-llm = OpenLLM('HuggingFaceH4/zephyr-7b-alpha')
-
-llm.complete('The meaning of life is')
-
-
-async def main(prompt, **kwargs):
-    async for it in llm.astream_chat(prompt, **kwargs):
-        print(it)
-
-
-asyncio.run(main('The time at San Francisco is'))
-```
-
-If there is a remote LLM Server running elsewhere, then you can use `llama_index.llms.openllm.OpenLLMAPI`:
+You can use `llama_index.llms.openllm.OpenLLMAPI` to interact with a running LLM server:

```python
from llama_index.llms.openllm import OpenLLMAPI

@@ -1101,10 +910,6 @@ llm('What is the difference between a duck and a goose? And why there are so man


-![Gif showing Agent integration](/.github/assets/agent.gif)
-
-
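
The OpenAI-compatible endpoints mentioned at the start of this section can also be exercised without any client library. A minimal sketch, assuming a server started as in the quickstart above; the payload shape mirrors `examples/openai_chat_completion_client.py` from this patch, and the model ID is a placeholder:

```bash
# Sketch: call the OpenAI-compatible chat endpoint of a running OpenLLM server.
# Replace the model ID with one returned by `curl http://localhost:3000/v1/models`.
curl -s http://localhost:3000/v1/chat/completions \
  -H 'Content-Type: application/json' \
  -d '{
    "model": "microsoft/Phi-3-mini-4k-instruct",
    "messages": [{"role": "user", "content": "What are large language models?"}],
    "max_tokens": 256
  }'
```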
- ## 🚀 Deploying models to production diff --git a/examples/openai_chat_completion_client.py b/examples/openai_chat_completion_client.py index e19d4224..a3a05eda 100644 --- a/examples/openai_chat_completion_client.py +++ b/examples/openai_chat_completion_client.py @@ -1,28 +1,30 @@ -# NOTE: Make sure to install openai>1 +# NOTE: pip install openai import os, openai from openai.types.chat import ( ChatCompletionSystemMessageParam, ChatCompletionUserMessageParam, ) -client = openai.OpenAI(base_url=os.getenv('OPENLLM_ENDPOINT', 'http://localhost:3000') + '/v1', api_key='na') +STREAM = str(os.getenv('STREAM', False)).upper() in ['TRUE', '1', 'YES', 'Y', 'ON'] -models = client.models.list() -print('Models:', models.model_dump_json(indent=2)) -model = models.data[0].id - -# Chat completion API -stream = str(os.getenv('STREAM', False)).upper() in ['TRUE', '1', 'YES', 'Y', 'ON'] -completions = client.chat.completions.create(messages=[ +messages = [ ChatCompletionSystemMessageParam(role='system', content='You will be the writing assistant that assume the tone of Ernest Hemmingway.'), ChatCompletionUserMessageParam(role='user', content='Write an essay on Nietzsche and absurdism.'), -], model=model, max_tokens=1024, stream=stream) +] -print(f'Chat completion result (stream={stream}):') -if stream: - for chunk in completions: +def chat_stream(client: openai.OpenAI, model_id: str): + for chunk in client.chat.completions.create(messages=messages, model=model_id, max_tokens=1024, stream=True): text = chunk.choices[0].delta.content - if text: - print(text, flush=True, end='') -else: + if text: print(text, flush=True, end='') + +def chat_one_shot(client: openai.OpenAI, model_id: str): + completions = client.chat.completions.create(messages=messages, model=model_id, max_tokens=1024, stream=False) print(completions) + + +if __name__ == "__main__": + client = openai.OpenAI(base_url=os.getenv('OPENLLM_ENDPOINT', 'http://localhost:3000') + '/v1', api_key='na') + models = client.models.list() + print('Models:', models.model_dump_json(indent=2)) + model_id = models.data[0].id + chat_stream(client, model_id) if STREAM else chat_one_shot(client, model_id) diff --git a/openllm-core/src/openllm_core/__init__.py b/openllm-core/src/openllm_core/__init__.py index 98e54d37..a9dfd3c8 100644 --- a/openllm-core/src/openllm_core/__init__.py +++ b/openllm-core/src/openllm_core/__init__.py @@ -15,7 +15,6 @@ from .config import ( ChatGLMConfig as ChatGLMConfig, DollyV2Config as DollyV2Config, FalconConfig as FalconConfig, - FlanT5Config as FlanT5Config, GPTNeoXConfig as GPTNeoXConfig, LlamaConfig as LlamaConfig, MistralConfig as MistralConfig, diff --git a/openllm-core/src/openllm_core/config/__init__.py b/openllm-core/src/openllm_core/config/__init__.py index 8a27ad71..50bd7f76 100644 --- a/openllm-core/src/openllm_core/config/__init__.py +++ b/openllm-core/src/openllm_core/config/__init__.py @@ -6,7 +6,6 @@ from .configuration_chatglm import ChatGLMConfig as ChatGLMConfig from .configuration_dbrx import DbrxConfig as DbrxConfig from .configuration_dolly_v2 import DollyV2Config as DollyV2Config from .configuration_falcon import FalconConfig as FalconConfig -from .configuration_flan_t5 import FlanT5Config as FlanT5Config from .configuration_gemma import GemmaConfig as GemmaConfig from .configuration_gpt_neox import GPTNeoXConfig as GPTNeoXConfig from .configuration_llama import LlamaConfig as LlamaConfig diff --git a/openllm-core/src/openllm_core/config/configuration_auto.py 
b/openllm-core/src/openllm_core/config/configuration_auto.py index 69b69c9b..86468f1c 100644 --- a/openllm-core/src/openllm_core/config/configuration_auto.py +++ b/openllm-core/src/openllm_core/config/configuration_auto.py @@ -21,7 +21,6 @@ else: OrderedDictType = OrderedDict ModelType: t.TypeAlias = t.Literal[ - 'flan_t5', 'baichuan', 'chatglm', 'falcon', @@ -44,7 +43,6 @@ ModelType: t.TypeAlias = t.Literal[ # NOTE: This is the entrypoint when adding new model config CONFIG_MAPPING_NAMES: OrderedDict[ModelType, str] = OrderedDict( sorted([ - ('flan_t5', 'FlanT5Config'), ('baichuan', 'BaichuanConfig'), ('chatglm', 'ChatGLMConfig'), ('falcon', 'FalconConfig'), @@ -145,9 +143,6 @@ class AutoConfig: def for_model(cls, model_name: t.Literal['falcon'], **attrs: t.Any) -> openllm_core.config.FalconConfig: ... @t.overload @classmethod - def for_model(cls, model_name: t.Literal['flan_t5'], **attrs: t.Any) -> openllm_core.config.FlanT5Config: ... - @t.overload - @classmethod def for_model(cls, model_name: t.Literal['gemma'], **attrs: t.Any) -> openllm_core.config.GemmaConfig: ... @t.overload @classmethod diff --git a/openllm-core/src/openllm_core/config/configuration_flan_t5.py b/openllm-core/src/openllm_core/config/configuration_flan_t5.py deleted file mode 100644 index 24d8ba0d..00000000 --- a/openllm-core/src/openllm_core/config/configuration_flan_t5.py +++ /dev/null @@ -1,45 +0,0 @@ -from __future__ import annotations - -import openllm_core, pydantic -from openllm_core._configuration import ModelSettings - - -class FlanT5Config(openllm_core.LLMConfig): - """FLAN-T5 was released in the paper [Scaling Instruction-Finetuned Language Models](https://arxiv.org/pdf/2210.11416.pdf). - - It is an enhanced version of T5 that has been finetuned in a mixture of tasks. - - Refer to [FLAN-T5's page](https://huggingface.co/docs/transformers/model_doc/flan-t5) for more information. - """ - - model_config = pydantic.ConfigDict(extra='forbid', protected_namespaces=()) - - metadata_config: ModelSettings = pydantic.Field( - default={ - 'url': 'https://huggingface.co/docs/transformers/model_doc/flan-t5', - 'architecture': 'T5ForConditionalGeneration', - 'model_type': 'seq2seq_lm', - 'backend': ('pt',), - # NOTE: See https://www.philschmid.de/fine-tune-flan-t5. No specific template found, but seems to have the same dialogue style - 'default_id': 'google/flan-t5-large', - 'model_ids': [ - 'google/flan-t5-small', - 'google/flan-t5-base', - 'google/flan-t5-large', - 'google/flan-t5-xl', - 'google/flan-t5-xxl', - ], - }, - repr=False, - exclude=True, - ) - - generation_config: openllm_core.GenerationConfig = pydantic.Field( - default=openllm_core.GenerationConfig.model_construct( - temperature=0.9, max_new_tokens=2048, top_k=50, top_p=0.4, repetition_penalty=1.0 - ) - ) - - @property - def template(self) -> str: - return 'Answer the following question:\nQuestion: {instruction}\nAnswer:' diff --git a/openllm-python/README.md b/openllm-python/README.md index 040f9ea2..c1949c1e 100644 --- a/openllm-python/README.md +++ b/openllm-python/README.md @@ -68,10 +68,10 @@ $ openllm -h ### Start a LLM server -OpenLLM allows you to quickly spin up an LLM server using `openllm start`. For example, to start a [Llama 3 8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) server, run the following: +OpenLLM allows you to quickly spin up an LLM server using `openllm start`. 
For example, to start a [Phi-3](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct) server, run the following: ```bash -openllm start meta-llama/Meta-Llama-3-8B +openllm start microsoft/Phi-3-mini-4k-instruct --trust-remote-code ``` To interact with the server, you can visit the web UI at [http://0.0.0.0:3000/](http://0.0.0.0:3000/) or send a request using `curl`. You can also use OpenLLM’s built-in Python client to interact with the server: @@ -89,12 +89,6 @@ OpenLLM seamlessly supports many models and their variants. You can specify diff openllm start -- ``` -> [!NOTE] -> OpenLLM supports specifying fine-tuning weights and quantized weights -> for any of the supported models as long as they can be loaded with the model -> architecture. Use the `openllm models` command to see the complete list of supported -> models, their architectures, and their variants. - ## 🧩 Supported models OpenLLM currently supports the following models. By default, OpenLLM doesn't include dependencies to run all models. The extra model-specific dependencies can be installed with the instructions below. @@ -313,43 +307,6 @@ You can specify any of the following Falcon models via `openllm start`:
-FlanT5 - - -### Quickstart - -Run the following command to quickly spin up a FlanT5 server: - -```bash -TRUST_REMOTE_CODE=True openllm start google/flan-t5-large -``` -In a different terminal, run the following command to interact with the server: - -```bash -export OPENLLM_ENDPOINT=http://localhost:3000 -openllm query 'What are large language models?' -``` - - -> **Note:** Any FlanT5 variants can be deployed with OpenLLM. Visit the [HuggingFace Model Hub](https://huggingface.co/models?sort=trending&search=flan_t5) to see more FlanT5-compatible models. - - - -### Supported models - -You can specify any of the following FlanT5 models via `openllm start`: - - -- [google/flan-t5-small](https://huggingface.co/google/flan-t5-small) -- [google/flan-t5-base](https://huggingface.co/google/flan-t5-base) -- [google/flan-t5-large](https://huggingface.co/google/flan-t5-large) -- [google/flan-t5-xl](https://huggingface.co/google/flan-t5-xl) -- [google/flan-t5-xxl](https://huggingface.co/google/flan-t5-xxl) - -
- -
- Gemma @@ -883,39 +840,6 @@ You can specify any of the following Yi models via `openllm start`: More models will be integrated with OpenLLM and we welcome your contributions if you want to incorporate your custom LLMs into the ecosystem. Check out [Adding a New Model Guide](https://github.com/bentoml/OpenLLM/blob/main/ADDING_NEW_MODEL.md) to learn more. -## 💻 Run your model on multiple GPUs - -OpenLLM allows you to start your model server on multiple GPUs and specify the number of workers per resource assigned using the `--workers-per-resource` option. For example, if you have 4 available GPUs, you set the value as one divided by the number as only one instance of the Runner server will be spawned. - -```bash -TRUST_REMOTE_CODE=True openllm start microsoft/phi-2 --workers-per-resource 0.25 -``` - -> [!NOTE] -> The amount of GPUs required depends on the model size itself. -> You can use [the Model Memory Calculator from Hugging Face](https://huggingface.co/spaces/hf-accelerate/model-memory-usage) to -> calculate how much vRAM is needed to train and perform big model -> inference on a model and then plan your GPU strategy based on it. - -When using the `--workers-per-resource` option with the `openllm build` command, the environment variable is saved into the resulting Bento. - -For more information, see [Resource scheduling strategy](https://docs.bentoml.org/en/latest/guides/scheduling.html#). - -## 🛞 Runtime implementations - -Different LLMs may support multiple runtime implementations. Models that have `vLLM` (`vllm`) supports will use vLLM by default, otherwise it fallback to use `PyTorch` (`pt`). - -To specify a specific runtime for your chosen model, use the `--backend` option. For example: - -```bash -openllm start meta-llama/Llama-2-7b-chat-hf --backend vllm -``` - -Note: - -1. To use the vLLM backend, you need a GPU with at least the Ampere architecture or newer and CUDA version 11.8. -2. To see the backend options of each model supported by OpenLLM, see the Supported models section or run `openllm models`. - ## 📐 Quantization Quantization is a technique to reduce the storage and computation requirements for machine learning models, particularly during inference. By approximating floating-point numbers as integers (quantized values), quantization allows for faster computations, reduced memory footprint, and can make it feasible to deploy large models on resource-constrained devices. @@ -929,104 +853,8 @@ OpenLLM supports the following quantization techniques - [GPTQ: Accurate Post-Training Quantization](https://arxiv.org/abs/2210.17323) - [SqueezeLLM: Dense-and-Sparse Quantization](https://arxiv.org/abs/2306.07629). -### PyTorch backend - -With PyTorch backend, OpenLLM supports `int8`, `int4`, and `gptq`. - -For using int8 and int4 quantization through `bitsandbytes`, you can use the following command: - -```bash -TRUST_REMOTE_CODE=True openllm start microsoft/phi-2 --quantize int8 -``` - -To run inference with `gptq`, simply pass `--quantize gptq`: - -```bash -openllm start TheBloke/Llama-2-7B-Chat-GPTQ --quantize gptq -``` - > [!NOTE] -> In order to run GPTQ, make sure you run `pip install "openllm[gptq]"` -> first to install the dependency. From the GPTQ paper, it is recommended to quantized the weights before serving. -> See [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) for more information on GPTQ quantization. 
-
-### vLLM backend
-
-With vLLM backend, OpenLLM supports `awq`, `squeezellm`
-
-To run inference with `awq`, simply pass `--quantize awq`:
-
-```bash
-openllm start TheBloke/zephyr-7B-alpha-AWQ --quantize awq
-```
-
-To run inference with `squeezellm`, simply pass `--quantize squeezellm`:
-
-```bash
-openllm start squeeze-ai-lab/sq-llama-2-7b-w4-s0 --quantize squeezellm --serialization legacy
-```
-
-> [!IMPORTANT]
-> Since both `squeezellm` and `awq` are weight-aware quantization methods, meaning the quantization is done during training, all pre-trained weights needs to get quantized before inference time. Make sure to find compatible weights on HuggingFace Hub for your model of choice.
-
-## 🛠️ Serving fine-tuning layers
-
-[PEFT](https://huggingface.co/docs/peft/index), or Parameter-Efficient Fine-Tuning, is a methodology designed to fine-tune pre-trained models more efficiently. Instead of adjusting all model parameters, PEFT focuses on tuning only a subset, reducing computational and storage costs. [LoRA](https://huggingface.co/docs/peft/conceptual_guides/lora) (Low-Rank Adaptation) is one of the techniques supported by PEFT. It streamlines fine-tuning by using low-rank decomposition to represent weight updates, thereby drastically reducing the number of trainable parameters.
-
-With OpenLLM, you can take advantage of the fine-tuning feature by serving models with any PEFT-compatible layers using the `--adapter-id` option. For example:
-
-```bash
-openllm start facebook/opt-6.7b --adapter-id aarnphm/opt-6-7b-quotes:default
-```
-
-OpenLLM also provides flexibility by supporting adapters from custom file paths:
-
-```bash
-openllm start facebook/opt-6.7b --adapter-id /path/to/adapters:local_adapter
-```
-
-To use multiple adapters, use the following format:
-
-```bash
-openllm start facebook/opt-6.7b --adapter-id aarnphm/opt-6.7b-lora:default --adapter-id aarnphm/opt-6.7b-french:french_lora
-```
-
-By default, all adapters will be injected into the models during startup. Adapters can be specified per request via `adapter_name`:
-
-```bash
-curl -X 'POST' \
-  'http://localhost:3000/v1/generate' \
-  -H 'accept: application/json' \
-  -H 'Content-Type: application/json' \
-  -d '{
-  "prompt": "What is the meaning of life?",
-  "stop": [
-    "philosopher"
-  ],
-  "llm_config": {
-    "max_new_tokens": 256,
-    "temperature": 0.75,
-    "top_k": 15,
-    "top_p": 1
-  },
-  "adapter_name": "default"
-}'
-```
-
-To include this into the Bento, you can specify the `--adapter-id` option when using the `openllm build` command:
-
-```bash
-openllm build facebook/opt-6.7b --adapter-id ...
-```
-
-If you use a relative path for `--adapter-id`, you need to add `--build-ctx`.
-
-```bash
-openllm build facebook/opt-6.7b --adapter-id ./path/to/adapter_id --build-ctx .
-```
-
-> [!IMPORTANT]
-> Fine-tuning support is still experimental and currently only works with PyTorch backend. vLLM support is coming soon.
+> Make sure to use pre-quantized model weights when serving with `openllm start`.
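
A minimal sketch of the note above, reusing the GPTQ and AWQ repository IDs from the section this patch removes; whether an explicit `--quantize` flag is still needed depends on the OpenLLM version installed:

```bash
# Sketch: serve pre-quantized checkpoints from the Hugging Face Hub.
# Repository IDs are reused from the documentation removed above and are illustrative only.
openllm start TheBloke/Llama-2-7B-Chat-GPTQ   # GPTQ-quantized weights
openllm start TheBloke/zephyr-7B-alpha-AWQ    # AWQ-quantized weights
```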

## ⚙️ Integrations

@@ -1060,26 +888,7 @@ The compatible endpoints supports `/completions`, `/chat/completions`, and `/mod

### [LlamaIndex](https://docs.llamaindex.ai/en/stable/examples/llm/openllm/)

-To start a local LLM with `llama_index`, simply use `llama_index.llms.openllm.OpenLLM`:
-
-```python
-import asyncio
-from llama_index.llms.openllm import OpenLLM
-
-llm = OpenLLM('HuggingFaceH4/zephyr-7b-alpha')
-
-llm.complete('The meaning of life is')
-
-
-async def main(prompt, **kwargs):
-    async for it in llm.astream_chat(prompt, **kwargs):
-        print(it)
-
-
-asyncio.run(main('The time at San Francisco is'))
-```
-
-If there is a remote LLM Server running elsewhere, then you can use `llama_index.llms.openllm.OpenLLMAPI`:
+You can use `llama_index.llms.openllm.OpenLLMAPI` to interact with a running LLM server:

```python
from llama_index.llms.openllm import OpenLLMAPI

@@ -1101,10 +910,6 @@ llm('What is the difference between a duck and a goose? And why there are so man


-![Gif showing Agent integration](/.github/assets/agent.gif)
-
-
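
The OpenAI-compatible endpoints mentioned at the start of this section can also be exercised without any client library. A minimal sketch, assuming a server started as in the quickstart above; the payload shape mirrors `examples/openai_chat_completion_client.py` from this patch, and the model ID is a placeholder:

```bash
# Sketch: call the OpenAI-compatible chat endpoint of a running OpenLLM server.
# Replace the model ID with one returned by `curl http://localhost:3000/v1/models`.
curl -s http://localhost:3000/v1/chat/completions \
  -H 'Content-Type: application/json' \
  -d '{
    "model": "microsoft/Phi-3-mini-4k-instruct",
    "messages": [{"role": "user", "content": "What are large language models?"}],
    "max_tokens": 256
  }'
```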
-

## 🚀 Deploying models to production

diff --git a/openllm-python/pyproject.toml b/openllm-python/pyproject.toml
index 2cb9a385..ed26697e 100644
--- a/openllm-python/pyproject.toml
+++ b/openllm-python/pyproject.toml
@@ -104,9 +104,8 @@ dbrx = ["cpm-kernels"]
dolly-v2 = ["cpm-kernels"]
falcon = ["xformers"]
fine-tune = ["peft>=0.6.0", "datasets", "trl", "huggingface-hub"]
-flan-t5 = ["xformers"]
full = [
-  "openllm[agents,awq,baichuan,chatglm,dbrx,dolly-v2,falcon,fine-tune,flan-t5,gemma,ggml,gpt-neox,gptq,grpc,llama,mistral,mixtral,mpt,openai,opt,phi,playground,qwen,stablelm,starcoder,vllm,yi]",
+  "openllm[agents,awq,baichuan,chatglm,dbrx,dolly-v2,falcon,fine-tune,gemma,ggml,gpt-neox,gptq,grpc,llama,mistral,mixtral,mpt,openai,opt,phi,playground,qwen,stablelm,starcoder,vllm,yi]",
]
gemma = ["xformers"]
ggml = ["ctransformers"]
diff --git a/openllm-python/src/openllm/__init__.pyi b/openllm-python/src/openllm/__init__.pyi
index 0a3d4977..2138637c 100644
--- a/openllm-python/src/openllm/__init__.pyi
+++ b/openllm-python/src/openllm/__init__.pyi
@@ -13,7 +13,7 @@ Fine-tune, serve, deploy, and monitor any LLMs with ease.
# fmt: off
# update-config-stubs.py: import stubs start
from openllm_client import AsyncHTTPClient as AsyncHTTPClient, HTTPClient as HTTPClient
-from openllm_core.config import CONFIG_MAPPING as CONFIG_MAPPING, CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES, AutoConfig as AutoConfig, BaichuanConfig as BaichuanConfig, ChatGLMConfig as ChatGLMConfig, DbrxConfig as DbrxConfig, DollyV2Config as DollyV2Config, FalconConfig as FalconConfig, FlanT5Config as FlanT5Config, GemmaConfig as GemmaConfig, GPTNeoXConfig as GPTNeoXConfig, LlamaConfig as LlamaConfig, MistralConfig as MistralConfig, MixtralConfig as MixtralConfig, MPTConfig as MPTConfig, OPTConfig as OPTConfig, PhiConfig as PhiConfig, QwenConfig as QwenConfig, StableLMConfig as StableLMConfig, StarCoderConfig as StarCoderConfig, YiConfig as YiConfig
+from openllm_core.config import CONFIG_MAPPING as CONFIG_MAPPING, CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES, AutoConfig as AutoConfig, BaichuanConfig as BaichuanConfig, ChatGLMConfig as ChatGLMConfig, DbrxConfig as DbrxConfig, DollyV2Config as DollyV2Config, FalconConfig as FalconConfig, GemmaConfig as GemmaConfig, GPTNeoXConfig as GPTNeoXConfig, LlamaConfig as LlamaConfig, MistralConfig as MistralConfig, MixtralConfig as MixtralConfig, MPTConfig as MPTConfig, OPTConfig as OPTConfig, PhiConfig as PhiConfig, QwenConfig as QwenConfig, StableLMConfig as StableLMConfig, StarCoderConfig as StarCoderConfig, YiConfig as YiConfig
from openllm_core._configuration import GenerationConfig as GenerationConfig, LLMConfig as LLMConfig
from openllm_core._schemas import GenerationInput as GenerationInput, GenerationOutput as GenerationOutput, MetadataOutput as MetadataOutput, MessageParam as MessageParam
from openllm_core.utils import api as api