From 887ffa9aa0e8c1adea86517a6ebdc841ba9ec0ec Mon Sep 17 00:00:00 2001 From: Aaron <29749331+aarnphm@users.noreply.github.com> Date: Tue, 5 Sep 2023 10:06:36 -0400 Subject: [PATCH] chore: cleanup pre-commit jobs and update usage Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com> --- .pre-commit-config.yaml | 74 +-- clean.sh | 4 +- hatch.toml | 104 ++- openllm-python/README.md | 984 ++++++++++++++++++++--------- openllm-python/pyproject.toml | 112 ++-- tools/assert-model-table-latest.py | 26 - tools/mirror.sh | 14 + tools/sync-readme.sh | 9 - tools/update-readme.py | 62 -- wheels.sh | 1 - 10 files changed, 827 insertions(+), 563 deletions(-) delete mode 100755 tools/assert-model-table-latest.py create mode 100755 tools/mirror.sh delete mode 100755 tools/sync-readme.sh delete mode 100755 tools/update-readme.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index fe72803b..8c7d448f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ ci: autoupdate_schedule: weekly - skip: [check-models-table-update, changelog-dry-run, mypy, yapf, sync-readme, clj-kondo] + skip: [changelog-dry-run, mypy, yapf, clj-kondo] autofix_commit_msg: "ci: auto fixes from pre-commit.ci\n\nFor more information, see https://pre-commit.ci" autoupdate_commit_msg: 'ci: pre-commit autoupdate [pre-commit.ci]' default_language_version: @@ -81,54 +81,42 @@ repos: - id: check-added-large-files - id: debug-statements - id: check-merge-conflict - # - repo: https://github.com/RobertCraigie/pyright-python - # rev: v1.1.324 - # hooks: - # - id: pyright - # verbose: true - # args: [--level, error] - # exclude: | - # (?x)^( - # examples/.*| - # tools/.*| - # tests/.*| - # openllm-python/src/openllm/playground/.*| - # openllm-python/tests/.*| - # openllm-client/src/openllm_client/pb.*| - # .github/.*| - # cz.py | - # hatch_build.py - # )$ - # additional_dependencies: - # - openllm-client[grpc] - # - bentoml[io]>=1.1.2 - # - transformers[agents,torch,tokenizers,accelerate]>=4.29.0 - # - peft - # - safetensors - # - optimum - # - ghapi - # - click==8.1.3 - # - bitsandbytes - # - diffusers - # - soundfile + - repo: https://github.com/RobertCraigie/pyright-python + rev: v1.1.324 + hooks: + - id: pyright + verbose: true + args: [--level, error] + exclude: | + (?x)^( + examples/.*| + tools/.*| + tests/.*| + openllm-python/src/openllm/playground/.*| + openllm-python/tests/.*| + openllm-client/src/openllm_client/pb.*| + .github/.*| + cz.py | + hatch_build.py + )$ + additional_dependencies: + - openllm-client[grpc] + - bentoml[io]>=1.1.2 + - transformers[agents,torch,tokenizers,accelerate]>=4.29.0 + - peft + - safetensors + - optimum + - ghapi + - click==8.1.3 + - bitsandbytes + - diffusers + - soundfile - repo: meta hooks: - id: check-hooks-apply - id: check-useless-excludes - repo: local hooks: - - id: check-models-table-update - name: check if table in README.md is up-to-date - entry: ./tools/assert-model-table-latest.py - language: script - verbose: true - files: README.md - - id: sync-readme - name: sync readme with python core library - entry: ./tools/sync-readme.sh - language: script - verbose: true - files: README.md - id: changelog-dry-run name: Running changelog dry-run entry: hatch run changelog diff --git a/clean.sh b/clean.sh index 87e2039a..8f2f8a81 100644 --- a/clean.sh +++ b/clean.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash GIT_ROOT="$(git rev-parse --show-toplevel)" cd "$GIT_ROOT" || exit 1 -find . -type f -iname "*.so" -exec rm -f {} \; -find . 
-type d -name "node_modules" -exec rm -rf "{}" \; +find . -type f -iname "*.so" -exec \rm -f {} \; +find . -type d -name "node_modules" -exec \rm -rf "{}" \; diff --git a/hatch.toml b/hatch.toml index 0a294b57..084ec330 100644 --- a/hatch.toml +++ b/hatch.toml @@ -1,72 +1,70 @@ [envs.default] dependencies = [ - "openllm-core @ {root:uri}/openllm-core", - "openllm-client @ {root:uri}/openllm-client", - "openllm[opt,chatglm,fine-tune] @ {root:uri}/openllm-python", - # NOTE: To run all hooks - "pre-commit", - # NOTE: towncrier for changelog - "towncrier", - # NOTE: Using under ./tools/update-optional-dependencies.py - "tomlkit", - # NOTE: Using under ./tools/update-readme.py - "markdown-it-py", - # NOTE: For fancy PyPI readme - "hatch-fancy-pypi-readme", - # NOTE: For working with shell pipe - "plumbum", - # The below sync with mypyc deps and pre-commit mypy - "types-psutil", - "types-tabulate", - "types-PyYAML", - "types-protobuf", + "openllm-core @ {root:uri}/openllm-core", + "openllm-client @ {root:uri}/openllm-client", + "openllm[opt,chatglm,fine-tune] @ {root:uri}/openllm-python", + # NOTE: To run all hooks + "pre-commit", + # NOTE: towncrier for changelog + "towncrier", + # NOTE: Using under ./tools/update-optional-dependencies.py + "tomlkit", + # NOTE: For fancy PyPI readme + "hatch-fancy-pypi-readme", + # NOTE: For working with shell pipe + "plumbum", + # The below sync with mypyc deps and pre-commit mypy + "types-psutil", + "types-tabulate", + "types-PyYAML", + "types-protobuf", ] [envs.default.scripts] changelog = "towncrier build --version main --draft" check-stubs = [ - "./tools/update-config-stubs.py", - "./tools/update-models-import.py", - "update-dummy", + "./tools/update-config-stubs.py", + "./tools/update-models-import.py", + "update-dummy", ] inplace-changelog = "towncrier build --version main --keep" quality = [ - "./tools/dependencies.py", - "- ./tools/update-brew-tap.py", - "bash ./tools/sync-readme.sh", - "check-stubs", - "- pre-commit run --all-files", + "./tools/dependencies.py", + "- ./tools/update-brew-tap.py", + "check-stubs", + "bash ./tools/mirror.sh", + "- pre-commit run --all-files", ] setup = [ - "pre-commit install", - "- ln -s .python-version-default .python-version", - "curl -fsSL https://raw.githubusercontent.com/clj-kondo/clj-kondo/master/script/install-clj-kondo | bash -", + "pre-commit install", + "- ln -s .python-version-default .python-version", + "curl -fsSL https://raw.githubusercontent.com/clj-kondo/clj-kondo/master/script/install-clj-kondo | bash -", ] tool = ["quality", "bash ./clean.sh", "bash ./compile.sh {args}"] typing = [ - "- pre-commit run mypy {args:-a}", - "- pre-commit run pyright {args:-a}", + "- pre-commit run mypy {args:-a}", + "- pre-commit run pyright {args:-a}", ] update-dummy = ["- ./tools/update-dummy.py", "./tools/update-dummy.py"] [envs.tests] dependencies = [ - "openllm-core @ {root:uri}/openllm-core", - "openllm-client @ {root:uri}/openllm-client", - "openllm[opt,chatglm,fine-tune] @ {root:uri}/openllm-python", - # NOTE: interact with docker for container tests. 
- "docker", - # NOTE: Tests strategies with Hypothesis and pytest, and snapshot testing with syrupy - "coverage[toml]>=6.5", - "filelock>=3.7.1", - "pytest", - "pytest-cov", - "pytest-mock", - "pytest-randomly", - "pytest-rerunfailures", - "pytest-asyncio>=0.21.0", - "pytest-xdist[psutil]", - "trustme", - "hypothesis", - "syrupy", + "openllm-core @ {root:uri}/openllm-core", + "openllm-client @ {root:uri}/openllm-client", + "openllm[opt,chatglm,fine-tune] @ {root:uri}/openllm-python", + # NOTE: interact with docker for container tests. + "docker", + # NOTE: Tests strategies with Hypothesis and pytest, and snapshot testing with syrupy + "coverage[toml]>=6.5", + "filelock>=3.7.1", + "pytest", + "pytest-cov", + "pytest-mock", + "pytest-randomly", + "pytest-rerunfailures", + "pytest-asyncio>=0.21.0", + "pytest-xdist[psutil]", + "trustme", + "hypothesis", + "syrupy", ] skip-install = false template = "tests" @@ -99,6 +97,6 @@ compile = "bash ./compile.sh {args}" recompile = ["bash ./clean.sh", "compile"] edi = "bash local.sh" lock = [ - "bash tools/lock-actions.sh", - "pushd openllm-contrib/clojure && pnpm i --frozen-lockfile", + "bash tools/lock-actions.sh", + "pushd openllm-contrib/clojure && pnpm i --frozen-lockfile", ] diff --git a/openllm-python/README.md b/openllm-python/README.md index 8ba82b92..6325d120 100644 --- a/openllm-python/README.md +++ b/openllm-python/README.md @@ -37,26 +37,28 @@ ## ๐Ÿ“– Introduction -With OpenLLM, you can run inference with any open-source large-language models, -deploy to the cloud or on-premises, and build powerful AI apps. +OpenLLM is an open-source platform designed to facilitate the deployment and operation of large language models (LLMs) in real-world applications. With OpenLLM, you can run inference on any open-source LLM, deploy them on the cloud or on-premises, and build powerful AI applications. -๐Ÿš‚ **State-of-the-art LLMs**: built-in supports a wide range of open-source LLMs -and model runtime, including Llama 2๏ผŒStableLM, Falcon, Dolly, Flan-T5, ChatGLM, -StarCoder and more. +Key features include: -๐Ÿ”ฅ **Flexible APIs**: serve LLMs over RESTful API or gRPC with one command, -query via WebUI, CLI, our Python/Javascript client, or any HTTP client. +๐Ÿš‚ **State-of-the-art LLMs**: Integrated support for a wide range of open-source LLMs and model runtimes, including but not limited to Llama 2, StableLM, Falcon, Dolly, Flan-T5, ChatGLM, and StarCoder. -โ›“๏ธ **Freedom To Build**: First-class support for LangChain, BentoML and Hugging -Face that allows you to easily create your own AI apps by composing LLMs with -other models and services. +๐Ÿ”ฅ **Flexible APIs**: Serve LLMs over a RESTful API or gRPC with a single command. You can interact with the mode using a Web UI, CLI, Python/JavaScript clients, or any HTTP client of your choice. -๐ŸŽฏ **Streamline Deployment**: Automatically generate your LLM server Docker -Images or deploy as serverless endpoint via -[โ˜๏ธ BentoCloud](https://l.bentoml.com/bento-cloud). +โ›“๏ธ **Freedom to build**: First-class support for LangChain, BentoML and Hugging Face, allowing you to easily create your own AI applications by composing LLMs with other models and services. -๐Ÿค–๏ธ **Bring your own LLM**: Fine-tune any LLM to suit your needs with -`LLM.tuning()`. 
(Coming soon) +๐ŸŽฏ **Streamline deployment**: Automatically generate your LLM server Docker images or deploy as serverless endpoints via +[โ˜๏ธ BentoCloud](https://l.bentoml.com/bento-cloud), which effortlessly manages GPU resources, scales according to traffic, and ensures cost-effectiveness. + +๐Ÿค–๏ธ **Bring your own LLM**: Fine-tune any LLM to suit your needs. You can load LoRA layers to fine-tune models for higher accuracy and performance for specific tasks. A unified fine-tuning API for models (`LLM.tuning()`) is coming soon. + +โšกย **Quantization**: Run inference with less computational and memory costs though quantization techniques like [bitsandbytes](https://github.com/TimDettmers/bitsandbytes)ย andย [GPTQ](https://arxiv.org/abs/2210.17323). + +๐Ÿ“กย **Streaming**: Support token streaming through server-sent events (SSE). You can use the `/v1/generate_stream`ย endpoint for streaming responses from LLMs. + +๐Ÿ”„ย **Continuous batching**: Support continuous batching via [vLLM](https://github.com/vllm-project/vllm) for increased total throughput. + +OpenLLM is designed for AI application developers working to build production-ready applications based on LLMs. It delivers a comprehensive suite of tools and features for fine-tuning, serving, deploying, and monitoring these models, simplifying the end-to-end deployment workflow for LLMs. @@ -66,21 +68,23 @@ Images or deploy as serverless endpoint via -## ๐Ÿƒ Getting Started +## ๐Ÿƒ Get started -To use OpenLLM, you need to have Python 3.8 (or newer) and `pip` installed on -your system. We highly recommend using a Virtual Environment to prevent package -conflicts. +### Prerequisites -You can install OpenLLM using pip as follows: +You have installed Python 3.8 (or later) andย `pip`. We highly recommend using a [Virtual Environment](https://docs.python.org/3/library/venv.html) to prevent package conflicts. + +### Install OpenLLM + +Install OpenLLM by using `pip` as follows: ```bash pip install openllm ``` -To verify if it's installed correctly, run: +To verify the installation, run: -``` +```bash $ openllm -h Usage: openllm [OPTIONS] COMMAND [ARGS]... @@ -90,28 +94,47 @@ Usage: openllm [OPTIONS] COMMAND [ARGS]... โ–ˆโ–ˆโ•‘ โ–ˆโ–ˆโ•‘โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ•”โ•โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ•— โ–ˆโ–ˆโ•”โ–ˆโ–ˆโ•— โ–ˆโ–ˆโ•‘โ–ˆโ–ˆโ•‘ โ–ˆโ–ˆโ•‘ โ–ˆโ–ˆโ•”โ–ˆโ–ˆโ–ˆโ–ˆโ•”โ–ˆโ–ˆโ•‘ โ–ˆโ–ˆโ•‘ โ–ˆโ–ˆโ•‘โ–ˆโ–ˆโ•”โ•โ•โ•โ• โ–ˆโ–ˆโ•”โ•โ•โ• โ–ˆโ–ˆโ•‘โ•šโ–ˆโ–ˆโ•—โ–ˆโ–ˆโ•‘โ–ˆโ–ˆโ•‘ โ–ˆโ–ˆโ•‘ โ–ˆโ–ˆโ•‘โ•šโ–ˆโ–ˆโ•”โ•โ–ˆโ–ˆโ•‘ โ•šโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ•”โ•โ–ˆโ–ˆโ•‘ โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ•—โ–ˆโ–ˆโ•‘ โ•šโ–ˆโ–ˆโ–ˆโ–ˆโ•‘โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ•—โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ•—โ–ˆโ–ˆโ•‘ โ•šโ•โ• โ–ˆโ–ˆโ•‘ - โ•šโ•โ•โ•โ•โ•โ• โ•šโ•โ• โ•šโ•โ•โ•โ•โ•โ•โ•โ•šโ•โ• โ•šโ•โ•โ•โ•โ•šโ•โ•โ•โ•โ•โ•โ•โ•šโ•โ•โ•โ•โ•โ•โ•โ•šโ•โ• โ•šโ•โ• + โ•šโ•โ•โ•โ•โ•โ• โ•šโ•โ• โ•šโ•โ•โ•โ•โ•โ•โ•โ•šโ•โ• โ•šโ•โ•โ•โ•โ•šโ•โ•โ•โ•โ•โ•โ•โ•šโ•โ•โ•โ•โ•โ•โ•โ•šโ•โ• โ•šโ•โ•. An open platform for operating large language models in production. Fine-tune, serve, deploy, and monitor any LLMs with ease. + +Options: + -v, --version Show the version and exit. + -h, --help Show this message and exit. + +Commands: + build Package a given models into a Bento. + embed Get embeddings interactively, from a terminal. + import Setup LLM interactively. + instruct Instruct agents interactively for given tasks, from a... + models List all supported models. + prune Remove all saved models, (and optionally bentos) built with... 
+ query Ask a LLM interactively, from a terminal. + start Start any LLM as a REST server. + start-grpc Start any LLM as a gRPC server. + +Extensions: + build-base-container Base image builder for BentoLLM. + dive-bentos Dive into a BentoLLM. + get-containerfile Return Containerfile of any given Bento. + get-prompt Get the default prompt used by OpenLLM. + list-bentos List available bentos built by OpenLLM. + list-models This is equivalent to openllm models... + playground OpenLLM Playground. ``` -### Starting an LLM Server +### Start an LLM server -To start an LLM server, use `openllm start`. For example, to start a -[`OPT`](https://huggingface.co/docs/transformers/model_doc/opt) server, do the -following: +OpenLLM allows you to quickly spin up an LLM server using `openllm start`. For example, to start anย [OPT](https://huggingface.co/docs/transformers/model_doc/opt)ย server, run the following: ```bash openllm start opt ``` -Following this, a Web UI will be accessible at http://localhost:3000 where you -can experiment with the endpoints and sample input prompts. +This starts the server atย [http://0.0.0.0:3000/](http://0.0.0.0:3000/). OpenLLM downloads the model to the BentoML local Model Store if they have not been registered before. To view your local models, run `bentoml models list`. -OpenLLM provides a built-in Python client, allowing you to interact with the -model. In a different terminal window or a Jupyter Notebook, create a client to -start interacting with the model: +To interact with the server, you can visit the web UI atย [http://0.0.0.0:3000/](http://0.0.0.0:3000/) or send a request usingย `curl`. You can also use OpenLLMโ€™s built-in Python client to interact with the server: ```python import openllm @@ -119,350 +142,694 @@ client = openllm.client.HTTPClient('http://localhost:3000') client.query('Explain to me the difference between "further" and "farther"') ``` -You can also use the `openllm query` command to query the model from the -terminal: +Alternatively, use theย `openllm query`ย command to query the model: ```bash export OPENLLM_ENDPOINT=http://localhost:3000 openllm query 'Explain to me the difference between "further" and "farther"' ``` -Visit `http://localhost:3000/docs.json` for OpenLLM's API specification. - -OpenLLM seamlessly supports many models and their variants. Users can also -specify different variants of the model to be served, by providing the -`--model-id` argument, e.g.: +OpenLLM seamlessly supports many models and their variants. You can specify different variants of the model to be served by providing theย `--model-id` option. For example: ```bash -openllm start flan-t5 --model-id google/flan-t5-large +openllm start opt --model-id facebook/opt-2.7b ``` > [!NOTE] -> `openllm` also supports all variants of fine-tuning weights, custom -> model path as well as quantized weights for any of the supported models as -> long as it can be loaded with the model architecture. Refer to -> [supported models](https://github.com/bentoml/OpenLLM/tree/main#-supported-models) -> section for models' architecture. +> OpenLLM supports specifying fine-tuning weights and quantized weights +> for any of the supported models as long as they can be loaded with the model +> architecture. Use theย `openllm models`ย command to see the complete list of supported +> models, their architectures, and their variants. -Use the `openllm models` command to see the list of models and their variants -supported in OpenLLM. 
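The text above notes that you can also send requests to the running server with `curl`, and the feature list mentions token streaming over server-sent events via `/v1/generate_stream`, but neither is shown. The sketch below is a rough illustration only: the `/v1/generate` path and the `llm_config` payload field are assumptions about this OpenLLM version's HTTP API, so confirm the exact schema against the OpenAPI spec the running server exposes before relying on it.

```bash
# One-shot generation (assumed /v1/generate endpoint with an optional llm_config override)
curl -X POST http://localhost:3000/v1/generate \
  -H 'Content-Type: application/json' \
  -d '{"prompt": "What are large language models?", "llm_config": {"max_new_tokens": 128}}'

# Token streaming over server-sent events, using the /v1/generate_stream endpoint
# mentioned in the feature list; -N disables curl buffering so tokens print as they arrive
curl -N -X POST http://localhost:3000/v1/generate_stream \
  -H 'Content-Type: application/json' \
  -d '{"prompt": "What are large language models?"}'
```
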
+## ๐Ÿงฉ Supported models -## ๐Ÿงฉ Supported Models +OpenLLM currently supports the following models. By default, OpenLLM doesn't include dependencies to run all models. The extra model-specific dependencies can be installed with the instructions below. -The following models are currently supported in OpenLLM. By default, OpenLLM -doesn't include dependencies to run all models. The extra model-specific -dependencies can be installed with the instructions below: +
+Llama - +### Installation - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +### Quickstart - - - - - - +> [!NOTE] +> To use the official Llama 2 models, you must gain access by visiting +> the [Meta AI website](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) and +> accepting its license terms and acceptable use policy. You also need to obtain access to these +> models on [Hugging Face](https://huggingface.co/meta-llama). Note that any Llama 2 variants can +> be deployed with OpenLLM if you donโ€™t have access to the official Llama 2 model. +> Visit the [Hugging Face Model Hub](https://huggingface.co/models?sort=trending&search=llama2) to see more Llama 2 compatible models. - - - - - - +### Quickstart - - - - - - +### Quickstart - - - - - - +### Quickstart - - - - - -
ModelArchitectureModel IdsInstallation
chatglmChatGLMForConditionalGeneration - - - - - -```bash -pip install "openllm[chatglm]" -``` - -
dolly-v2GPTNeoXForCausalLM - - - - - -```bash -pip install openllm -``` - -
falconFalconForCausalLM - - - - - -```bash -pip install "openllm[falcon]" -``` - -
flan-t5T5ForConditionalGeneration - - - - - -```bash -pip install "openllm[flan-t5]" -``` - -
gpt-neoxGPTNeoXForCausalLM - - - - - -```bash -pip install openllm -``` - -
llamaLlamaForCausalLM - - - - +To run Llama models with OpenLLM, you need to install the `llama` dependency as it is not installed by default. ```bash pip install "openllm[llama]" ``` -
mptMPTForCausalLM - - - - +Run the following commands to quickly spin up a Llama 2 server and send a request to it. ```bash -pip install "openllm[mpt]" +openllm start llama --model-id meta-llama/Llama-2-7b-chat-hf +export OPENLLM_ENDPOINT=http://localhost:3000 +openllm query 'What are large language models?' ``` -
optOPTForCausalLM +### Supported models - +You can specify any of the following Llama models by using `--model-id`. - +- [meta-llama/Llama-2-70b-chat-hf](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) +- [meta-llama/Llama-2-13b-chat-hf](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf) +- [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) +- [meta-llama/Llama-2-70b-hf](https://huggingface.co/meta-llama/Llama-2-70b-hf) +- [meta-llama/Llama-2-13b-hf](https://huggingface.co/meta-llama/Llama-2-13b-hf) +- [meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf) +- [NousResearch/llama-2-70b-chat-hf](https://huggingface.co/NousResearch/llama-2-70b-chat-hf) +- [NousResearch/llama-2-13b-chat-hf](https://huggingface.co/NousResearch/llama-2-13b-chat-hf) +- [NousResearch/llama-2-7b-chat-hf](https://huggingface.co/NousResearch/llama-2-7b-chat-hf) +- [NousResearch/llama-2-70b-hf](https://huggingface.co/NousResearch/llama-2-70b-hf) +- [NousResearch/llama-2-13b-hf](https://huggingface.co/NousResearch/llama-2-13b-hf) +- [NousResearch/llama-2-7b-hf](https://huggingface.co/NousResearch/llama-2-7b-hf) +- [openlm-research/open_llama_7b_v2](https://huggingface.co/openlm-research/open_llama_7b_v2) +- [openlm-research/open_llama_3b_v2](https://huggingface.co/openlm-research/open_llama_3b_v2) +- [openlm-research/open_llama_13b](https://huggingface.co/openlm-research/open_llama_13b) +- [huggyllama/llama-65b](https://huggingface.co/huggyllama/llama-65b) +- [huggyllama/llama-30b](https://huggingface.co/huggyllama/llama-30b) +- [huggyllama/llama-13b](https://huggingface.co/huggyllama/llama-13b) +- [huggyllama/llama-7b](https://huggingface.co/huggyllama/llama-7b) +- Any other models that strictly follows the [LlamaForCausalLM](https://huggingface.co/docs/transformers/main/model_doc/llama#transformers.LlamaForCausalLM) architecture + +### Supported backends + +- PyTorch (Default): + + ```bash + openllm start llama --model-id meta-llama/Llama-2-7b-chat-hf --backend pt + ``` + +- vLLM (Recommended): + + ```bash + pip install "openllm[llama, vllm]" + openllm start llama --model-id meta-llama/Llama-2-7b-chat-hf --backend vllm + ``` + +> [!NOTE] +> Currently when using the vLLM backend, quantization and adapters are not supported. + + + +
+ChatGLM + +### Installation + +To run ChatGLM models with OpenLLM, you need to install the `chatglm` dependency as it is not installed by default. ```bash -pip install "openllm[opt]" +pip install "openllm[chatglm]" ``` -
stablelmGPTNeoXForCausalLM +Run the following commands to quickly spin up a ChatGLM server and send a request to it. - +```bash +openllm start chatglm --model-id thudm/chatglm-6b +export OPENLLM_ENDPOINT=http://localhost:3000 +openllm query 'What are large language models?' +``` - +### Supported models + +You can specify any of the following ChatGLM models by using `--model-id`. + +- [thudm/chatglm-6b](https://huggingface.co/thudm/chatglm-6b) +- [thudm/chatglm-6b-int8](https://huggingface.co/thudm/chatglm-6b-int8) +- [thudm/chatglm-6b-int4](https://huggingface.co/thudm/chatglm-6b-int4) +- [thudm/chatglm2-6b](https://huggingface.co/thudm/chatglm2-6b) +- [thudm/chatglm2-6b-int4](https://huggingface.co/thudm/chatglm2-6b-int4) +- Any other models that strictly follows the [ChatGLMForConditionalGeneration](https://github.com/THUDM/ChatGLM-6B) architecture + +### Supported backends + +- PyTorch (Default): + + ```bash + openllm start chatglm --model-id thudm/chatglm-6b --backend pt + ``` + + + +
+Dolly-v2 + +### Installation + +Dolly-v2 models do not require you to install any model-specific dependencies once you have `openllm` installed. ```bash pip install openllm ``` -
starcoderGPTBigCodeForCausalLM +Run the following commands to quickly spin up a Dolly-v2 server and send a request to it. - +```bash +openllm start dolly-v2 --model-id databricks/dolly-v2-3b +export OPENLLM_ENDPOINT=http://localhost:3000 +openllm query 'What are large language models?' +``` - +### Supported models + +You can specify any of the following Dolly-v2 models by using `--model-id`. + +- [databricks/dolly-v2-3b](https://huggingface.co/databricks/dolly-v2-3b) +- [databricks/dolly-v2-7b](https://huggingface.co/databricks/dolly-v2-7b) +- [databricks/dolly-v2-12b](https://huggingface.co/databricks/dolly-v2-12b) +- Any other models that strictly follows the [GPTNeoXForCausalLM](https://huggingface.co/docs/transformers/main/model_doc/gpt_neox#transformers.GPTNeoXForCausalLM) architecture + +### Supported backends + +- PyTorch (Default): + + ```bash + openllm start dolly-v2 --model-id databricks/dolly-v2-3b --backend pt + ``` + +- vLLM: + + ```bash + openllm start dolly-v2 --model-id databricks/dolly-v2-3b --backend vllm + ``` + +> [!NOTE] +> Currently when using the vLLM backend, quantization and adapters are not supported. + + + +
+Falcon + +### Installation + +To run Falcon models with OpenLLM, you need to install the `falcon` dependency as it is not installed by default. + +```bash +pip install "openllm[falcon]" +``` + +### Quickstart + +Run the following commands to quickly spin up a Falcon server and send a request to it. + +```bash +openllm start falcon --model-id tiiuae/falcon-7b +export OPENLLM_ENDPOINT=http://localhost:3000 +openllm query 'What are large language models?' +``` + +### Supported models + +You can specify any of the following Falcon models by using `--model-id`. + +- [tiiuae/falcon-7b](https://huggingface.co/tiiuae/falcon-7b) +- [tiiuae/falcon-40b](https://huggingface.co/tiiuae/falcon-40b) +- [tiiuae/falcon-7b-instruct](https://huggingface.co/tiiuae/falcon-7b-instruct) +- [tiiuae/falcon-40b-instruct](https://huggingface.co/tiiuae/falcon-40b-instruct) +- Any other models that strictly follows the [FalconForCausalLM](https://falconllm.tii.ae/) architecture + +### Supported backends + +- PyTorch (Default): + + ```bash + openllm start falcon --model-id tiiuae/falcon-7b --backend pt + ``` + +- vLLM: + + ```bash + pip install "openllm[falcon, vllm]" + openllm start falcon --model-id tiiuae/falcon-7b --backend vllm + ``` + +> [!NOTE] +> Currently when using the vLLM backend, quantization and adapters are not supported. + +
+ +
+Flan-T5 + +### Installation + +To run Flan-T5 models with OpenLLM, you need to install the `flan-t5` dependency as it is not installed by default. + +```bash +pip install "openllm[flan-t5]" +``` + +### Quickstart + +Run the following commands to quickly spin up a Flan-T5 server and send a request to it. + +```bash +openllm start flan-t5 --model-id google/flan-t5-large +export OPENLLM_ENDPOINT=http://localhost:3000 +openllm query 'What are large language models?' +``` + +### Supported models + +You can specify any of the following Flan-T5 models by using `--model-id`. + +- [google/flan-t5-small](https://huggingface.co/google/flan-t5-small) +- [google/flan-t5-base](https://huggingface.co/google/flan-t5-base) +- [google/flan-t5-large](https://huggingface.co/google/flan-t5-large) +- [google/flan-t5-xl](https://huggingface.co/google/flan-t5-xl) +- [google/flan-t5-xxl](https://huggingface.co/google/flan-t5-xxl) +- Any other models that strictly follows the [T5ForConditionalGeneration](https://huggingface.co/docs/transformers/main/model_doc/t5#transformers.T5ForConditionalGeneration) architecture + +### Supported backends + +- PyTorch (Default): + + ```bash + openllm start flan-t5 --model-id google/flan-t5-large --backend pt + ``` + +- Flax: + + ```bash + pip install "openllm[flan-t5, flax]" + openllm start flan-t5 --model-id google/flan-t5-large --backend flax + ``` + +- TensorFlow: + + ```bash + pip install "openllm[flan-t5, tf]" + openllm start flan-t5 --model-id google/flan-t5-large --backend tf + ``` + +> [!NOTE] +> Currently when using the vLLM backend, quantization and adapters are not supported. + +
+ +
+GPT-NeoX + +### Installation + +GPT-NeoX models do not require you to install any model-specific dependencies once you have `openllm` installed. + +```bash +pip install openllm +``` + +### Quickstart + +Run the following commands to quickly spin up a GPT-NeoX server and send a request to it. + +```bash +openllm start gpt-neox --model-id eleutherai/gpt-neox-20b +export OPENLLM_ENDPOINT=http://localhost:3000 +openllm query 'What are large language models?' +``` + +### Supported models + +You can specify any of the following GPT-NeoX models by using `--model-id`. + +- [eleutherai/gpt-neox-20b](https://huggingface.co/eleutherai/gpt-neox-20b) +- Any other models that strictly follows the [GPTNeoXForCausalLM](https://huggingface.co/docs/transformers/main/model_doc/gpt_neox#transformers.GPTNeoXForCausalLM) architecture + +### Supported backends + +- PyTorch (Default): + + ```bash + openllm start gpt-neox --model-id eleutherai/gpt-neox-20b --backend pt + ``` + +- vLLM: + + ```bash + openllm start gpt-neox --model-id eleutherai/gpt-neox-20b --backend vllm + ``` + +> [!NOTE] +> Currently when using the vLLM backend, quantization and adapters are not supported. + +
+ +
+MPT + +### Installation + +To run MPT models with OpenLLM, you need to install the `mpt` dependency as it is not installed by default. + +```bash +pip install "openllm[mpt]" +``` + +### Quickstart + +Run the following commands to quickly spin up a MPT server and send a request to it. + +```bash +openllm start mpt --model-id mosaicml/mpt-7b-chat +export OPENLLM_ENDPOINT=http://localhost:3000 +openllm query 'What are large language models?' +``` + +### Supported models + +You can specify any of the following MPT models by using `--model-id`. + +- [mosaicml/mpt-7b](https://huggingface.co/mosaicml/mpt-7b) +- [mosaicml/mpt-7b-instruct](https://huggingface.co/mosaicml/mpt-7b-instruct) +- [mosaicml/mpt-7b-chat](https://huggingface.co/mosaicml/mpt-7b-chat) +- [mosaicml/mpt-7b-storywriter](https://huggingface.co/mosaicml/mpt-7b-storywriter) +- [mosaicml/mpt-30b](https://huggingface.co/mosaicml/mpt-30b) +- [mosaicml/mpt-30b-instruct](https://huggingface.co/mosaicml/mpt-30b-instruct) +- [mosaicml/mpt-30b-chat](https://huggingface.co/mosaicml/mpt-30b-chat) +- Any other models that strictly follows the [MPTForCausalLM](https://huggingface.co/mosaicml) architecture + +### Supported backends + +- PyTorch (Default): + + ```bash + openllm start mpt --model-id mosaicml/mpt-7b-chat --backend pt + ``` + +- vLLM (Recommended): + + ```bash + pip install "openllm[mpt, vllm]" + openllm start mpt --model-id mosaicml/mpt-7b-chat --backend vllm + ``` + +> [!NOTE] +> Currently when using the vLLM backend, quantization and adapters are not supported. + +
+ +
+OPT + +### Installation + +To run OPT models with OpenLLM, you need to install the `opt` dependency as it is not installed by default. + +```bash +pip install "openllm[opt]" +``` + +### Quickstart + +Run the following commands to quickly spin up an OPT server and send a request to it. + +```bash +openllm start opt --model-id facebook/opt-2.7b +export OPENLLM_ENDPOINT=http://localhost:3000 +openllm query 'What are large language models?' +``` + +### Supported models + +You can specify any of the following OPT models by using `--model-id`. + +- [facebook/opt-125m](https://huggingface.co/facebook/opt-125m) +- [facebook/opt-350m](https://huggingface.co/facebook/opt-350m) +- [facebook/opt-1.3b](https://huggingface.co/facebook/opt-1.3b) +- [facebook/opt-2.7b](https://huggingface.co/facebook/opt-2.7b) +- [facebook/opt-6.7b](https://huggingface.co/facebook/opt-6.7b) +- [facebook/opt-66b](https://huggingface.co/facebook/opt-66b) +- Any other models that strictly follows the [OPTForCausalLM](https://huggingface.co/docs/transformers/main/model_doc/opt#transformers.OPTForCausalLM) architecture + +### Supported backends + +- PyTorch (Default): + + ```bash + openllm start opt --model-id facebook/opt-2.7b --backend pt + ``` + +- vLLM: + + ```bash + pip install "openllm[opt, vllm]" + openllm start opt --model-id facebook/opt-2.7b --backend vllm + ``` + +- TensorFlow: + + ```bash + pip install "openllm[opt, tf]" + openllm start opt --model-id facebook/opt-2.7b --backend tf + ``` + +- Flax: + + ```bash + pip install "openllm[opt, flax]" + openllm start opt --model-id facebook/opt-2.7b --backend flax + ``` + +> [!NOTE] +> Currently when using the vLLM backend, quantization and adapters are not supported. + +
+ +
+StableLM + +### Installation + +StableLM models do not require you to install any model-specific dependencies once you have `openllm` installed. + +```bash +pip install openllm +``` + +### Quickstart + +Run the following commands to quickly spin up a StableLM server and send a request to it. + +```bash +openllm start stablelm --model-id stabilityai/stablelm-tuned-alpha-7b +export OPENLLM_ENDPOINT=http://localhost:3000 +openllm query 'What are large language models?' +``` + +### Supported models + +You can specify any of the following StableLM models by using `--model-id`. + +- [stabilityai/stablelm-tuned-alpha-3b](https://huggingface.co/stabilityai/stablelm-tuned-alpha-3b) +- [stabilityai/stablelm-tuned-alpha-7b](https://huggingface.co/stabilityai/stablelm-tuned-alpha-7b) +- [stabilityai/stablelm-base-alpha-3b](https://huggingface.co/stabilityai/stablelm-base-alpha-3b) +- [stabilityai/stablelm-base-alpha-7b](https://huggingface.co/stabilityai/stablelm-base-alpha-7b) +- Any other models that strictly follows the [GPTNeoXForCausalLM](https://huggingface.co/docs/transformers/main/model_doc/gpt_neox#transformers.GPTNeoXForCausalLM) architecture + +### Supported backends + +- PyTorch (Default): + + ```bash + openllm start stablelm --model-id stabilityai/stablelm-tuned-alpha-7b --backend pt + ``` + +- vLLM: + + ```bash + openllm start stablelm --model-id stabilityai/stablelm-tuned-alpha-7b --backend vllm + ``` + +> [!NOTE] +> Currently when using the vLLM backend, quantization and adapters are not supported. + +
+ +
+StarCoder + +### Installation + +To run StarCoder models with OpenLLM, you need to install the `starcoder` dependency as it is not installed by default. ```bash pip install "openllm[starcoder]" ``` -
baichuanBaiChuanForCausalLM

+Run the following commands to quickly spin up a StarCoder server and send a request to it.

- 

+```bash
+openllm start starcoder --model-id bigcode/starcoder
+export OPENLLM_ENDPOINT=http://localhost:3000
+openllm query 'What are large language models?'
+```

- 

+### Supported models
+
+You can specify any of the following StarCoder models by using `--model-id`.
+
+- [bigcode/starcoder](https://huggingface.co/bigcode/starcoder)
+- [bigcode/starcoderbase](https://huggingface.co/bigcode/starcoderbase)
+- Any other models that strictly follows the [GPTBigCodeForCausalLM](https://huggingface.co/docs/transformers/main/model_doc/gpt_bigcode#transformers.GPTBigCodeForCausalLM) architecture
+
+### Supported backends
+
+- PyTorch (Default):
+
+  ```bash
+  openllm start starcoder --model-id bigcode/starcoder --backend pt
+  ```
+
+- vLLM:
+
+  ```bash
+  pip install "openllm[starcoder, vllm]"
+  openllm start starcoder --model-id bigcode/starcoder --backend vllm
+  ```
+
+> [!NOTE]
+> Currently when using the vLLM backend, quantization and adapters are not supported.
+
+

+
+Baichuan + +### Installation + +To run Baichuan models with OpenLLM, you need to install the `baichuan` dependency as it is not installed by default. ```bash pip install "openllm[baichuan]" ``` -
+### Quickstart - - -### Runtime Implementations (Experimental) - -Different LLMs may have multiple runtime implementations. For instance, they -might use Pytorch (`pt`), Tensorflow (`tf`), Flax (`flax`) or vLLM (`vllm`). - -If you wish to specify a particular runtime for a model, you can do so by -setting the `OPENLLM_BACKEND={runtime}` environment variable -before running `openllm start`. - -For example, if you want to use the Tensorflow (`tf`) implementation for the -`flan-t5` model, you can use the following command: +Run the following commands to quickly spin up a Baichuan server and send a request to it. ```bash -OPENLLM_BACKEND=tf openllm start flan-t5 +openllm start baichuan --model-id baichuan-inc/baichuan-13b-base +export OPENLLM_ENDPOINT=http://localhost:3000 +openllm query 'What are large language models?' +``` -openllm start flan-t5 --backend tf +### Supported models + +You can specify any of the following Baichuan models by using `--model-id`. + +- [baichuan-inc/baichuan-7b](https://huggingface.co/baichuan-inc/baichuan-7b) +- [baichuan-inc/baichuan-13b-base](https://huggingface.co/baichuan-inc/baichuan-13b-base) +- [baichuan-inc/baichuan-13b-chat](https://huggingface.co/baichuan-inc/baichuan-13b-chat) +- [fireballoon/baichuan-vicuna-chinese-7b](https://huggingface.co/fireballoon/baichuan-vicuna-chinese-7b) +- [fireballoon/baichuan-vicuna-7b](https://huggingface.co/fireballoon/baichuan-vicuna-7b) +- [hiyouga/baichuan-7b-sft](https://huggingface.co/hiyouga/baichuan-7b-sft) +- Any other models that strictly follows the [BaiChuanForCausalLM](https://github.com/baichuan-inc/Baichuan-7B) architecture + +### Supported backends + +- PyTorch (Default): + + ```bash + openllm start baichuan --model-id baichuan-inc/baichuan-13b-base --backend pt + ``` + +- vLLM: + + ```bash + pip install "openllm[baichuan, vllm]" + openllm start baichuan --model-id baichuan-inc/baichuan-13b-base --backend vllm + ``` + +> [!NOTE] +> Currently when using the vLLM backend, quantization and adapters are not supported. + +
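Once a model from the list above is running, the introduction also promises automatically generated Docker images for your LLM server. The commands below are a hedged sketch of that packaging path: `openllm build` appears elsewhere in this README, while the `bentoml containerize` step and the `<bento_tag>` placeholder (the tag printed by `openllm build`) assume the standard BentoML CLI and a local Docker daemon.

```bash
# Package the chosen model into a Bento (a deployable archive)
openllm build opt --model-id facebook/opt-2.7b

# Build an OCI image from it; replace <bento_tag> with the tag printed by the build step
bentoml containerize <bento_tag>
```
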
+

+More models will be integrated with OpenLLM and we welcome your contributions if you want to incorporate your custom LLMs into the ecosystem. Check out [Adding a New Model Guide](https://github.com/bentoml/OpenLLM/blob/main/openllm-python/ADDING_NEW_MODEL.md) to learn more.
+
+## ๐Ÿ’ป Run your model on multiple GPUs
+
+OpenLLM allows you to start your model server on multiple GPUs and specify the number of workers per resource assigned using the `--workers-per-resource` option. For example, if you have 4 available GPUs, set the value to 0.25 (one divided by the number of GPUs) so that only a single instance of the Runner server is spawned across all four of them.
+
+```bash
+openllm start opt --workers-per-resource 0.25
```

> [!NOTE]
-> For GPU support on Flax, refers to
-> [Jax's installation](https://github.com/google/jax#pip-installation-gpu-cuda-installed-via-pip-easier)
-> to make sure that you have Jax support for the corresponding CUDA version.
+> The number of GPUs required depends on the model size itself.
+> You can use [the Model Memory Calculator from Hugging Face](https://huggingface.co/spaces/hf-accelerate/model-memory-usage) to
+> calculate how much vRAM is needed to train a model or run inference with it,
+> and then plan your GPU strategy based on that.

-> [!IMPORTANT]
-> To use vLLM backend, at least a GPU with Ampere or newer architecture and CUDA 11.8 is required.
+When using the `--workers-per-resource` option with the `openllm build` command, the configured value is saved into the resulting Bento.

-### Quantisation
+For more information, see [Resource scheduling strategy](https://docs.bentoml.org/en/latest/guides/scheduling.html#).

-OpenLLM supports quantisation with
-[bitsandbytes](https://github.com/TimDettmers/bitsandbytes) and
-[GPTQ](https://arxiv.org/abs/2210.17323)
+## ๐Ÿ›ž Runtime implementations (Experimental)
+
+Different LLMs may support multiple runtime implementations. For instance, they might use frameworks and libraries such as PyTorch (`pt`), TensorFlow (`tf`), Flax (`flax`), and vLLM (`vllm`).
+
+To specify a runtime for your chosen model, use the `--backend` option. For example:

```bash
-openllm start mpt --quantize int8
+openllm start llama --model-id meta-llama/Llama-2-7b-chat-hf --backend vllm
```

-To run inference with `gptq`, simply pass `--quantize gptq`:
+Note:
+
+1. For GPU support on Flax, refer to [Jax's installation](https://github.com/google/jax#pip-installation-gpu-cuda-installed-via-pip-easier) to make sure that you have Jax support for the corresponding CUDA version.
+2. To use the vLLM backend, you need a GPU with at least the Ampere architecture or newer and CUDA version 11.8.
+3. To see the backend options of each model supported by OpenLLM, see the Supported models section or run `openllm models`.
+
+## ๐Ÿ“ Quantization
+
+Quantization is a technique to reduce the storage and computation requirements for machine learning models, particularly during inference. By approximating floating-point numbers as integers (quantized values), quantization allows for faster computations, reduced memory footprint, and can make it feasible to deploy large models on resource-constrained devices.
+
+OpenLLM supports quantization through two methods: [bitsandbytes](https://github.com/TimDettmers/bitsandbytes) and [GPTQ](https://arxiv.org/abs/2210.17323). 
+ +To run a model using the `bitsandbytes` method for quantization, you can use the following command: + +```bash +openllm start opt --quantize int8 +``` + +To run inference withย `gptq`, simply passย `--quantize gptq`: ```bash openllm start falcon --model-id TheBloke/falcon-40b-instruct-GPTQ --quantize gptq --device 0 ``` > [!NOTE] -> In order to run GPTQ, make sure to install with -> `pip install "openllm[gptq]"`. The weights of all supported models should be -> quantized before serving. See -> [GPTQ-for-LLaMa](https://github.com/qwopqwop200/GPTQ-for-LLaMa) for more -> information on GPTQ quantisation. +> In order to run GPTQ, make sure you runย `pip install "openllm[gptq]"` first +> to install the dependency. The weights of all supported models should be quantized before serving. +> Seeย [GPTQ-for-LLaMa](https://github.com/qwopqwop200/GPTQ-for-LLaMa)ย for more information on GPTQ quantization. -### Fine-tuning support (Experimental) +## ๐Ÿ› ๏ธ Fine-tuning support (Experimental) -One can serve OpenLLM models with any PEFT-compatible layers with -`--adapter-id`: +[PEFT](https://huggingface.co/docs/peft/index), or Parameter-Efficient Fine-Tuning, is a methodology designed to fine-tune pre-trained models more efficiently. Instead of adjusting all model parameters, PEFT focuses on tuning only a subset, reducing computational and storage costs. [LoRA](https://huggingface.co/docs/peft/conceptual_guides/lora) (Low-Rank Adaptation) is one of the techniques supported by PEFT. It streamlines fine-tuning by using low-rank decomposition to represent weight updates, thereby drastically reducing the number of trainable parameters. + +With OpenLLM, you can take advantage of the fine-tuning feature by serving models with any PEFT-compatible layers using the `--adapter-id` option. For example: ```bash openllm start opt --model-id facebook/opt-6.7b --adapter-id aarnphm/opt-6-7b-quotes ``` -It also supports adapters from custom paths: +OpenLLM also provides flexibility by supporting adapters from custom file paths: ```bash openllm start opt --model-id facebook/opt-6.7b --adapter-id /path/to/adapters @@ -474,36 +841,31 @@ To use multiple adapters, use the following format: openllm start opt --model-id facebook/opt-6.7b --adapter-id aarnphm/opt-6.7b-lora --adapter-id aarnphm/opt-6.7b-lora:french_lora ``` -By default, the first adapter-id will be the default Lora layer, but optionally -users can change what Lora layer to use for inference via `/v1/adapters`: +By default, the first specified `adapter-id` is the default LoRA layer, but optionally you can specify a different LoRA layer for inference using the `/v1/adapters` endpoint: ```bash curl -X POST http://localhost:3000/v1/adapters --json '{"adapter_name": "vn_lora"}' ``` -Note that for multiple adapter-name and adapter-id, it is recommended to update -to use the default adapter before sending the inference, to avoid any -performance degradation +Note that if you are using multiple adapter names and IDs, it is recommended to set the default adapter before sending the inference to avoid any performance degradation. -To include this into the Bento, one can also provide a `--adapter-id` into -`openllm build`: +To include this into the Bento, you can specify theย `--adapter-id`ย option when using theย `openllm build` command: ```bash openllm build opt --model-id facebook/opt-6.7b --adapter-id ... ``` +If you use a relative path for `--adapter-id`, you need to add `--build-ctx`. + +```bash +openllm build opt --adapter-id ./path/to/adapter_id --build-ctx . 
+``` + > [!NOTE] -> We will gradually roll out support for fine-tuning all models. The -> following models contain fine-tuning support: OPT, Falcon, LlaMA. +> We will gradually roll out support for fine-tuning all models. +> Currently, the models supporting fine-tuning with OpenLLM include: OPT, Falcon, and LlaMA. -### Integrating a New Model - -OpenLLM encourages contributions by welcoming users to incorporate their custom -LLMs into the ecosystem. Check out -[Adding a New Model Guide](https://github.com/bentoml/OpenLLM/blob/main/openllm-python/ADDING_NEW_MODEL.md) -to see how you can do it yourself. - -### Embeddings +## ๐Ÿงฎ Embeddings OpenLLM provides embeddings endpoint for embeddings calculation. This can be accessed via `/v1/embeddings`. @@ -544,7 +906,7 @@ client.embed("I like to eat apples") > we will use a generic [BertModel](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) > for embeddings generation. The implementation is largely based on [`bentoml/sentence-embedding-bento`](https://github.com/bentoml/sentence-embedding-bento) -### Playground and Chat UI +## ๐Ÿฅ… Playground and Chat UI The following UIs are currently available for OpenLLM: @@ -683,7 +1045,7 @@ client.ask_agent( -## ๐Ÿš€ Deploying to Production +## ๐Ÿš€ Deploying models to production There are several ways to deploy your LLMs: diff --git a/openllm-python/pyproject.toml b/openllm-python/pyproject.toml index c2d225b1..b2219980 100644 --- a/openllm-python/pyproject.toml +++ b/openllm-python/pyproject.toml @@ -2,74 +2,74 @@ [build-system] build-backend = "hatchling.build" requires = [ - "hatchling==1.18.0", - "hatch-vcs==0.3.0", - "hatch-fancy-pypi-readme==23.1.0", + "hatchling==1.18.0", + "hatch-vcs==0.3.0", + "hatch-fancy-pypi-readme==23.1.0", ] [project] authors = [ - { name = "Aaron Pham", email = "aarnphm@bentoml.com" }, - { name = "BentoML Team", email = "contact@bentoml.com" }, + {name = "Aaron Pham",email = "aarnphm@bentoml.com"}, + {name = "BentoML Team",email = "contact@bentoml.com"}, ] classifiers = [ - "Development Status :: 5 - Production/Stable", - "Environment :: GPU :: NVIDIA CUDA", - "Environment :: GPU :: NVIDIA CUDA :: 12", - "Environment :: GPU :: NVIDIA CUDA :: 11.8", - "Environment :: GPU :: NVIDIA CUDA :: 11.7", - "License :: OSI Approved :: Apache Software License", - "Topic :: Scientific/Engineering :: Artificial Intelligence", - "Topic :: Software Development :: Libraries", - "Operating System :: OS Independent", - "Intended Audience :: Developers", - "Intended Audience :: Science/Research", - "Intended Audience :: System Administrators", - "Typing :: Typed", - "Programming Language :: Python", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3 :: Only", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", - "Programming Language :: Python :: 3.12", - "Programming Language :: Python :: Implementation :: CPython", - "Programming Language :: Python :: Implementation :: PyPy", + "Development Status :: 5 - Production/Stable", + "Environment :: GPU :: NVIDIA CUDA", + "Environment :: GPU :: NVIDIA CUDA :: 12", + "Environment :: GPU :: NVIDIA CUDA :: 11.8", + "Environment :: GPU :: NVIDIA CUDA :: 11.7", + "License :: OSI Approved :: Apache Software License", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "Topic :: Software Development :: Libraries", + "Operating System :: OS Independent", + "Intended Audience :: 
Developers", + "Intended Audience :: Science/Research", + "Intended Audience :: System Administrators", + "Typing :: Typed", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: Implementation :: PyPy", ] dependencies = [ - "bentoml[io]>=1.1.2", - "transformers[torch,tokenizers,accelerate]>=4.32.1", - "openllm-client", - "safetensors", - "optimum>=1.12.0", - "accelerate", - "ghapi", - "tabulate[widechars]>=0.9.0", - "click>=8.1.3", - "cuda-python;platform_system!=\"Darwin\"", - "bitsandbytes<0.42", + "bentoml[io]>=1.1.2", + "transformers[torch,tokenizers,accelerate]>=4.32.1", + "openllm-client", + "safetensors", + "optimum>=1.12.0", + "accelerate", + "ghapi", + "tabulate[widechars]>=0.9.0", + "click>=8.1.3", + "cuda-python;platform_system!=\"Darwin\"", + "bitsandbytes<0.42", ] description = "OpenLLM: Operating LLMs in production" dynamic = ["version", "readme"] keywords = [ - "MLOps", - "AI", - "BentoML", - "Model Serving", - "Model Deployment", - "LLMOps", - "Falcon", - "Vicuna", - "Llama 2", - "Fine tuning", - "Serverless", - "Large Language Model", - "Generative AI", - "StableLM", - "Alpaca", - "PyTorch", - "Transformers", + "MLOps", + "AI", + "BentoML", + "Model Serving", + "Model Deployment", + "LLMOps", + "Falcon", + "Vicuna", + "Llama 2", + "Fine tuning", + "Serverless", + "Large Language Model", + "Generative AI", + "StableLM", + "Alpaca", + "PyTorch", + "Transformers", ] license = "Apache-2.0" name = "openllm" diff --git a/tools/assert-model-table-latest.py b/tools/assert-model-table-latest.py deleted file mode 100755 index c1dc6efa..00000000 --- a/tools/assert-model-table-latest.py +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/env python3 -from __future__ import annotations -import os, sys -from markdown_it import MarkdownIt - -md = MarkdownIt() - -ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) -with open(os.path.join(ROOT, 'README.md'), 'r') as f: - readme = md.parse(f.read()) -sys.path.insert(0, os.path.join(ROOT, 'openllm-python', 'src')) -import openllm - -# NOTE: Currently, we only have one table in README, which is the Model readme. 
-table = [r for r in readme if r.type == 'html_block' and r.content.startswith('\n' -END_COMMENT = f'\n' - -ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) -sys.path.insert(0, os.path.join(ROOT, 'openllm-python', 'src')) -import openllm - -def main() -> int: - with open(os.path.join(ROOT, 'openllm-python', 'pyproject.toml'), 'r') as f: - deps = tomlkit.parse(f.read()).value['project']['optional-dependencies'] - with open(os.path.join(ROOT, 'README.md'), 'r') as f: - readme = f.readlines() - - start_index, stop_index = readme.index(START_COMMENT), readme.index(END_COMMENT) - formatted: dict[t.Literal['Model', 'Architecture', 'URL', 'Installation', 'Model Ids'], list[str | list[str]]] = { - 'Model': [], 'Architecture': [], 'URL': [], 'Model Ids': [], 'Installation': [], - } - max_install_len_div = 0 - for name, config_cls in openllm.CONFIG_MAPPING.items(): - dashed = inflection.dasherize(name) - formatted['Model'].append(dashed) - formatted['Architecture'].append(config_cls.__openllm_architecture__) - formatted['URL'].append(config_cls.__openllm_url__) - formatted['Model Ids'].append(config_cls.__openllm_model_ids__) - if dashed in deps: instruction = f'```bash\npip install "openllm[{dashed}]"\n```' - else: instruction = '```bash\npip install openllm\n```' - if len(instruction) > max_install_len_div: max_install_len_div = len(instruction) - formatted['Installation'].append(instruction) - meta: list[str] = ['\n', "\n"] - - # NOTE: headers - meta += ['\n'] - meta.extend([f'\n' for header in formatted.keys() if header not in ('URL',)]) - meta += ['\n'] - # NOTE: rows - for name, architecture, url, model_ids, installation in t.cast(t.Iterable[t.Tuple[str, str, str, t.List[str], str]], zip(*formatted.values())): - meta += '\n' - # configure architecture URL - cfg_cls = openllm.CONFIG_MAPPING[name] - if cfg_cls.__openllm_trust_remote_code__: arch = f'\n' - else: - arch = f"\n" - meta.extend([f'\n\n', arch]) - format_with_links: list[str] = [] - for lid in model_ids: - format_with_links.append(f'
  • {lid}
  • ') - meta.append('\n') - meta.append(f'\n') - meta += '\n' - meta.extend(['
    {header}
    {architecture}{architecture}{name}\n\n
      ' + '\n'.join(format_with_links) + '
    \n\n
    \n\n{installation}\n\n
    \n', '\n']) - - readme = readme[:start_index] + [START_COMMENT] + meta + [END_COMMENT] + readme[stop_index + 1:] - with open(os.path.join(ROOT, 'README.md'), 'w') as f: - f.writelines(readme) - return 0 - -if __name__ == '__main__': raise SystemExit(main()) diff --git a/wheels.sh b/wheels.sh index 65581f79..fac97021 100644 --- a/wheels.sh +++ b/wheels.sh @@ -1,6 +1,5 @@ #!/usr/bin/env bash - set -eo pipefail GIT_ROOT="$(git rev-parse --show-toplevel)"