diff --git a/.github/workflows/binary-releases.yml b/.github/workflows/binary-releases.yml index e28c7181..c27339f3 100644 --- a/.github/workflows/binary-releases.yml +++ b/.github/workflows/binary-releases.yml @@ -61,13 +61,7 @@ jobs: runs-on: ubuntu-latest needs: get_commit_message if: >- - contains(needs.get_commit_message.outputs.message, '[binary build]') || - github.event_name == 'workflow_dispatch' || - github.event_name == 'workflow_call' || - (github.event_name == 'pull_request' && - contains(github.event.pull_request.labels.*.name, '03 - Standalone Build')) || - (github.event_name == 'push' && - startsWith(github.ref, 'refs/tags/v') && ( ! endsWith(github.ref, 'dev0'))) + contains(needs.get_commit_message.outputs.message, '[binary build]') || github.event_name == 'workflow_dispatch' || github.event_name == 'workflow_call' || (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, '03 - Standalone Build')) || (github.event_name == 'push' && (startsWith(github.ref, 'refs/tags/v') || startsWith(github.ref, 'refs/heads/main'))) steps: - uses: actions/checkout@c85c95e3d7251135ab7dc9ce3241c5835cc595a9 # ratchet:actions/checkout@v3 with: diff --git a/.github/workflows/build-embedding.yml b/.github/workflows/build-embedding.yml new file mode 100644 index 00000000..a8e976f7 --- /dev/null +++ b/.github/workflows/build-embedding.yml @@ -0,0 +1,155 @@ +name: Build Embedding Bentos +on: + workflow_dispatch: + push: + branches: + - "main" + tags: + - "*" + paths: + - ".github/workflows/build-embedding.yml" + - "openllm-python/src/openllm/**" + pull_request: + branches: + - "main" + paths: + - ".github/workflows/build-embedding.yml" + - "openllm-python/src/openllm/**" +# https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#defaultsrun +defaults: + run: + shell: bash --noprofile --norc -exo pipefail {0} +env: + OPENLLM_OPT_MODEL_ID: facebook/opt-125m + BENTOML_HOME: ${{ github.workspace }}/bentoml + 
OPENLLM_DEV_BUILD: True + OPENLLM_DO_NOT_TRACK: True +jobs: + build-and-push-embedding-bento: + name: Build embedding container + runs-on: ubuntu-latest + concurrency: + group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + permissions: + contents: write + packages: write + # This is used to complete the identity challenge + # with sigstore/fulcio when running outside of PRs. + id-token: write + security-events: write + steps: + - uses: actions/checkout@c85c95e3d7251135ab7dc9ce3241c5835cc595a9 # ratchet:actions/checkout@v3 + with: + fetch-depth: 0 + - uses: bentoml/setup-bentoml-action@59beefe94e2e8f8ebbedf555fc86bd5d1ae0a708 # ratchet:bentoml/setup-bentoml-action@v1 + with: + bentoml-version: 'main' + python-version: '3.11' + - name: Inject slug/short variables + uses: rlespinasse/github-slug-action@102b1a064a9b145e56556e22b18b19c624538d94 # ratchet:rlespinasse/github-slug-action@v4.4.1 + - name: Set up QEMU + uses: docker/setup-qemu-action@2b82ce82d56a2a04d2637cd93a637ae1b359c0a7 # ratchet:docker/setup-qemu-action@v2.2.0 + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@4c0219f9ac95b02789c1075625400b2acbff50b1 # ratchet:docker/setup-buildx-action@v2.9.1 + with: + install: true + driver-opts: | + image=moby/buildkit:master + network=host + - name: Install cosign + if: github.event_name != 'pull_request' + uses: sigstore/cosign-installer@6e04d228eb30da1757ee4e1dd75a0ec73a653e06 # ratchet:sigstore/cosign-installer@v3.1.1 + with: + cosign-release: 'v2.1.1' + - name: Login to GitHub Container Registry + uses: docker/login-action@465a07811f14bebb1938fbed4728c6a1ff8901fc # ratchet:docker/login-action@v2.2.0 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + - name: Extract metadata tags and labels on PRs + if: github.event_name == 'pull_request' + id: meta-pr + uses: docker/metadata-action@818d4b7b91585d195f67373fd9cb0332e31a7175 # 
ratchet:docker/metadata-action@v4.6.0 + with: + images: ghcr.io/bentoml/openllm-embedding + tags: | + type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }} + - name: Extract metadata tags and labels for main, release or tag + if: github.event_name != 'pull_request' + id: meta + uses: docker/metadata-action@818d4b7b91585d195f67373fd9cb0332e31a7175 # ratchet:docker/metadata-action@v4.6.0 + with: + flavor: latest=auto + images: ghcr.io/bentoml/openllm-embedding + tags: | + type=semver,pattern={{version}} + type=semver,pattern={{major}}.{{minor}} + type=raw,value=latest,enable=${{ github.ref == format('refs/heads/{0}', github.event.repository.default_branch) }} + type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }} + - name: Build OPT Bento with base embeddings + id: bento-tag + run: | + pip install -e ${{ github.workspace }}/openllm-python + openllm build opt --serialisation legacy --bento-version sha-${{ env.GITHUB_SHA_SHORT }} --machine --dockerfile-template - < /etc/apt/apt.conf.d/keep-cache + {% call common.RUN(__enable_buildkit__) -%} {{ common.mount_cache(__lib_apt__) }} {{ common.mount_cache(__cache_apt__) }} {% endcall -%} set -eux && \ + apt-get update -y && \ + apt-get install -q -y --no-install-recommends --allow-remove-essential \ + ca-certificates gnupg2 bash build-essential {% if __options__system_packages is not none %}{{ __options__system_packages | join(' ') }}{% endif -%} + {% endblock %} + EOF + bento_tag=$(python -c "import openllm;print(str(openllm.build('opt',bento_version='sha-${{ env.GITHUB_SHA_SHORT }}',serialisation_format='legacy').tag))") + echo "tag=$bento_tag" >> $GITHUB_OUTPUT + - name: Build and push Embedding Bento + id: build-and-push + uses: bentoml/containerize-push-action@main # ratchet:exclude + with: + bento-tag: ${{ steps.bento-tag.outputs.tag }} + platforms: linux/amd64 + push: true + build-args: | + GIT_SHA=${{ env.GITHUB_SHA }} + DOCKER_LABEL=sha-${{ env.GITHUB_SHA_SHORT }} + tags: ${{ steps.meta.outputs.tags || steps.meta-pr.outputs.tags 
}} + labels: ${{ steps.meta.outputs.labels || steps.meta-pr.outputs.labels }} + - name: Sign the released image + if: ${{ github.event_name != 'pull_request' }} + env: + COSIGN_EXPERIMENTAL: "true" + run: echo "${{ steps.meta.outputs.tags }}" | xargs -I {} cosign sign --yes {}@${{ steps.build-and-push.outputs.digest }} + - name: Run Trivy in GitHub SBOM mode and submit results to Dependency Graph + uses: aquasecurity/trivy-action@559eb1224e654a86c844a795e6702a0742c60c72 # ratchet:aquasecurity/trivy-action@master + if: ${{ github.event_name != 'pull_request' }} + with: + image-ref: 'ghcr.io/bentoml/openllm-embedding:sha-${{ env.GITHUB_SHA_SHORT }}' + format: 'github' + output: 'dependency-results.sbom.json' + github-pat: ${{ secrets.UI_GITHUB_TOKEN }} + scanners: 'vuln' + - name: Run Trivy vulnerability scanner + uses: aquasecurity/trivy-action@559eb1224e654a86c844a795e6702a0742c60c72 # ratchet:aquasecurity/trivy-action@master + if: ${{ github.event_name != 'pull_request' }} + with: + image-ref: 'ghcr.io/bentoml/openllm-embedding:sha-${{ env.GITHUB_SHA_SHORT }}' + format: 'sarif' + output: 'trivy-results.sarif' + severity: 'CRITICAL' + scanners: 'vuln' + - name: Upload Trivy scan results to GitHub Security tab + uses: github/codeql-action/upload-sarif@a09933a12a80f87b87005513f0abb1494c27a716 # ratchet:github/codeql-action/upload-sarif@v2 + if: ${{ github.event_name != 'pull_request' }} + with: + sarif_file: 'trivy-results.sarif' diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 286b82ce..0972cd8d 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -41,12 +41,7 @@ jobs: runs-on: ubuntu-latest needs: get_commit_message if: >- - contains(needs.get_commit_message.outputs.message, '[ec2 build]') || - github.event_name == 'workflow_dispatch' || - (github.event_name == 'pull_request' && - contains(github.event.pull_request.labels.*.name, '00 - EC2 Build')) || - (github.event_name == 'push' && - startsWith(github.ref, 
'refs/tags/v') && ( ! endsWith(github.ref, 'dev0'))) + contains(needs.get_commit_message.outputs.message, '[ec2 build]') || github.event_name == 'workflow_dispatch' || (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, '00 - EC2 Build')) || (github.event_name == 'push' && (startsWith(github.ref, 'refs/tags/v') || startsWith(github.ref, 'refs/heads/main'))) env: EC2_INSTANCE_TYPE: g5.12xlarge EC2_AMI_ID: ami-089dafe9af191a0fd @@ -211,12 +206,7 @@ jobs: - get_commit_message runs-on: ubuntu-latest if: >- - (contains(needs.get_commit_message.outputs.message, '[ec2 build]') || - github.event_name == 'workflow_dispatch' || - (github.event_name == 'pull_request' && - contains(github.event.pull_request.labels.*.name, '00 - EC2 Build')) || - (github.event_name == 'push' && - startsWith(github.ref, 'refs/tags/v') && ( ! endsWith(github.ref, 'dev0')))) && always() + (contains(needs.get_commit_message.outputs.message, '[ec2 build]') || github.event_name == 'workflow_dispatch' || (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, '00 - EC2 Build')) || (github.event_name == 'push' && (startsWith(github.ref, 'refs/tags/v') || startsWith(github.ref, 'refs/heads/main')))) && always() steps: - name: Configure AWS credentials uses: aws-actions/configure-aws-credentials@5fd3084fc36e372ff1fff382a39b10d03659f355 # ratchet:aws-actions/configure-aws-credentials@v2 diff --git a/.github/workflows/clojure-frontend.yml b/.github/workflows/clojure-frontend.yml index 9a168f69..2772c91a 100644 --- a/.github/workflows/clojure-frontend.yml +++ b/.github/workflows/clojure-frontend.yml @@ -2,13 +2,18 @@ name: Build Clojure UI on: workflow_dispatch: push: + branches: [main] + tags: + - "*" paths: - 'contrib/clojure/**' + - ".github/workflows/clojure-frontend.yml" pull_request: types: [labeled, opened, synchronize, reopened] branches: [main] paths: - 'contrib/clojure/**' + - ".github/workflows/clojure-frontend.yml" # 
https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#defaultsrun defaults: run: @@ -37,12 +42,7 @@ jobs: runs-on: ubuntu-latest needs: get_commit_message if: >- - contains(needs.get_commit_message.outputs.message, '[clojure-ui build]') || - github.event_name == 'workflow_dispatch' || - (github.event_name == 'pull_request' && - contains(github.event.pull_request.labels.*.name, '01 - Clojure Build')) || - (github.event_name == 'push' && - startsWith(github.ref, 'refs/tags/v') && ( ! endsWith(github.ref, 'dev0'))) + contains(needs.get_commit_message.outputs.message, '[clojure-ui build]') || github.event_name == 'workflow_dispatch' || (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, '01 - Clojure Build')) || (github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && ( ! endsWith(github.ref, 'dev0'))) concurrency: group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }} cancel-in-progress: true @@ -104,6 +104,7 @@ jobs: with: context: contrib/clojure file: contrib/clojure/Dockerfile + platforms: linux/amd64,linux/arm64 push: true build-args: | GIT_SHA=${{ env.GITHUB_SHA }} diff --git a/.github/workflows/compile-pypi.yml b/.github/workflows/compile-pypi.yml index c48b817f..57d897ac 100644 --- a/.github/workflows/compile-pypi.yml +++ b/.github/workflows/compile-pypi.yml @@ -41,8 +41,8 @@ env: HATCH_VERBOSE: 2 CIBW_BUILD_FRONTEND: build CIBW_ENVIRONMENT_PASS_LINUX: > - HATCH_BUILD_HOOKS_ENABLE - MYPYPATH + HATCH_BUILD_HOOKS_ENABLE MYPYPATH + concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }} cancel-in-progress: true @@ -70,13 +70,7 @@ jobs: runs-on: ubuntu-latest needs: get_commit_message if: >- - contains(needs.get_commit_message.outputs.message, '[wheel build]') || - github.event_name == 'workflow_dispatch' || - github.event_name == 'workflow_call' || - (github.event_name == 'pull_request' - && 
contains(github.event.pull_request.labels.*.name, '02 - Wheel Build')) || - (github.event_name == 'push' && - startsWith(github.ref, 'refs/tags/v') && ( ! endsWith(github.ref, 'dev0'))) + contains(needs.get_commit_message.outputs.message, '[wheel build]') || github.event_name == 'workflow_dispatch' || github.event_name == 'workflow_call' || (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, '02 - Wheel Build')) || (github.event_name == 'push' && (startsWith(github.ref, 'refs/tags/v') || startsWith(github.ref, 'refs/heads/main'))) steps: - uses: actions/checkout@c85c95e3d7251135ab7dc9ce3241c5835cc595a9 # ratchet:actions/checkout@v3 with: @@ -121,13 +115,7 @@ jobs: macos_arch: "universal2" needs: get_commit_message if: >- - contains(needs.get_commit_message.outputs.message, '[wheel build]') || - github.event_name == 'workflow_dispatch' || - github.event_name == 'workflow_call' || - (github.event_name == 'pull_request' - && contains(github.event.pull_request.labels.*.name, '02 - Wheel Build')) || - (github.event_name == 'push' && - startsWith(github.ref, 'refs/tags/v') && ( ! endsWith(github.ref, 'dev0'))) + contains(needs.get_commit_message.outputs.message, '[wheel build]') || github.event_name == 'workflow_dispatch' || github.event_name == 'workflow_call' || (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, '02 - Wheel Build')) || (github.event_name == 'push' && (startsWith(github.ref, 'refs/tags/v') || startsWith(github.ref, 'refs/heads/main'))) steps: - uses: actions/checkout@c85c95e3d7251135ab7dc9ce3241c5835cc595a9 # ratchet:actions/checkout@v3 with: diff --git a/README.md b/README.md index 2e1d908e..3ce8b7fa 100644 --- a/README.md +++ b/README.md @@ -500,7 +500,7 @@ to see how you can do it yourself. ### Embeddings -OpenLLM tentatively provides embeddings endpoint for supported models. This can +OpenLLM provides embeddings endpoint for embeddings calculation. 
This can be accessed via `/v1/embeddings`. To use via CLI, simply call `openllm embed`: @@ -534,8 +534,10 @@ client.embed("I like to eat apples") ``` > [!NOTE] -> Currently, the following model family supports embeddings: Llama, T5 -> (Flan-T5, FastChat, etc.), ChatGLM +> Currently, the following model family supports embeddings calculation: Llama, T5 (Flan-T5, FastChat, etc.), ChatGLM +> For the remaining LLM that doesn't have specific embedding implementation, +> we will use a generic [BertModel](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) +> for embeddings generation. The implementation is largely based on [`bentoml/sentence-embedding-bento`](https://github.com/bentoml/sentence-embedding-bento) ### Playground and Chat UI @@ -543,7 +545,7 @@ The following UIs are currently available for OpenLLM: | UI | Owner | Type | Progress | |-----------------------------------------------------------------------------------|-----------------------------------------------|----------------------|----------| -| [Clojure](https://github.com/bentoml/OpenLLM/blob/main/contrib/clojure/README.md) | [@ GutZuFusss](https://github.com/GutZuFusss) | Community-maintained | 🔧 | +| [Clojure](https://github.com/bentoml/OpenLLM/blob/main/contrib/clojure/README.md) | [@GutZuFusss](https://github.com/GutZuFusss) | Community-maintained | 🔧 | | TS | BentoML Team | | 🚧 | ## ⚙️ Integrations diff --git a/changelog.d/227.feature.md b/changelog.d/227.feature.md new file mode 100644 index 00000000..a4508a1a --- /dev/null +++ b/changelog.d/227.feature.md @@ -0,0 +1,2 @@ +Added a generic embedding implementation largely based on https://github.com/bentoml/sentence-embedding-bento +For all unsupported models. diff --git a/openllm-python/README.md b/openllm-python/README.md index 1327dd53..3ce8b7fa 100644 --- a/openllm-python/README.md +++ b/openllm-python/README.md @@ -6,6 +6,8 @@

🦾 OpenLLM

pypi_status + + test_pypi_status Twitter @@ -493,12 +495,12 @@ openllm build opt --model-id facebook/opt-6.7b --adapter-id ... OpenLLM encourages contributions by welcoming users to incorporate their custom LLMs into the ecosystem. Check out -[Adding a New Model Guide](https://github.com/bentoml/OpenLLM/blob/main/ADDING_NEW_MODEL.md) +[Adding a New Model Guide](https://github.com/bentoml/OpenLLM/blob/main/openllm-python/ADDING_NEW_MODEL.md) to see how you can do it yourself. ### Embeddings -OpenLLM tentatively provides embeddings endpoint for supported models. This can +OpenLLM provides embeddings endpoint for embeddings calculation. This can be accessed via `/v1/embeddings`. To use via CLI, simply call `openllm embed`: @@ -532,8 +534,19 @@ client.embed("I like to eat apples") ``` > [!NOTE] -> Currently, the following model family supports embeddings: Llama, T5 -> (Flan-T5, FastChat, etc.), ChatGLM +> Currently, the following model family supports embeddings calculation: Llama, T5 (Flan-T5, FastChat, etc.), ChatGLM +> For the remaining LLM that doesn't have specific embedding implementation, +> we will use a generic [BertModel](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) +> for embeddings generation. 
The implementation is largely based on [`bentoml/sentence-embedding-bento`](https://github.com/bentoml/sentence-embedding-bento) + +### Playground and Chat UI + +The following UIs are currently available for OpenLLM: + +| UI | Owner | Type | Progress | +|-----------------------------------------------------------------------------------|-----------------------------------------------|----------------------|----------| +| [Clojure](https://github.com/bentoml/OpenLLM/blob/main/contrib/clojure/README.md) | [@GutZuFusss](https://github.com/GutZuFusss) | Community-maintained | 🔧 | +| TS | BentoML Team | | 🚧 | ## ⚙️ Integrations diff --git a/openllm-python/pyproject.toml b/openllm-python/pyproject.toml index 206ebfb7..6451bc64 100644 --- a/openllm-python/pyproject.toml +++ b/openllm-python/pyproject.toml @@ -2,80 +2,80 @@ [build-system] build-backend = "hatchling.build" requires = [ - "hatchling==1.18.0", - "hatch-vcs==0.3.0", - "hatch-fancy-pypi-readme==23.1.0", - "hatch-mypyc==0.16.0", + "hatchling==1.18.0", + "hatch-vcs==0.3.0", + "hatch-fancy-pypi-readme==23.1.0", + "hatch-mypyc==0.16.0", ] [project] authors = [ - {name = "Aaron Pham",email = "aarnphm@bentoml.com"}, - {name = "BentoML Team",email = "contact@bentoml.com"}, + { name = "Aaron Pham", email = "aarnphm@bentoml.com" }, + { name = "BentoML Team", email = "contact@bentoml.com" }, ] classifiers = [ - "Development Status :: 5 - Production/Stable", - "Environment :: GPU :: NVIDIA CUDA", - "Environment :: GPU :: NVIDIA CUDA :: 12", - "Environment :: GPU :: NVIDIA CUDA :: 11.8", - "Environment :: GPU :: NVIDIA CUDA :: 11.7", - "License :: OSI Approved :: Apache Software License", - "Topic :: Scientific/Engineering :: Artificial Intelligence", - "Topic :: Software Development :: Libraries", - "Operating System :: OS Independent", - "Intended Audience :: Developers", - "Intended Audience :: Science/Research", - "Intended Audience :: System Administrators", - "Typing :: Typed", - "Programming Language :: Python", - 
"Programming Language :: Python :: 3", - "Programming Language :: Python :: 3 :: Only", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", - "Programming Language :: Python :: 3.12", - "Programming Language :: Python :: Implementation :: CPython", - "Programming Language :: Python :: Implementation :: PyPy", + "Development Status :: 5 - Production/Stable", + "Environment :: GPU :: NVIDIA CUDA", + "Environment :: GPU :: NVIDIA CUDA :: 12", + "Environment :: GPU :: NVIDIA CUDA :: 11.8", + "Environment :: GPU :: NVIDIA CUDA :: 11.7", + "License :: OSI Approved :: Apache Software License", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "Topic :: Software Development :: Libraries", + "Operating System :: OS Independent", + "Intended Audience :: Developers", + "Intended Audience :: Science/Research", + "Intended Audience :: System Administrators", + "Typing :: Typed", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: Implementation :: PyPy", ] dependencies = [ - "bentoml[grpc,io]>=1.0.25", - "transformers[torch,tokenizers,accelerate]>=4.29.0", - "safetensors", - "optimum", - "attrs>=23.1.0", - "cattrs>=23.1.0", - "orjson", - "inflection", - "tabulate[widechars]>=0.9.0", - "httpx", - "click>=8.1.3", - "typing_extensions", - "mypy_extensions", - "ghapi", - "cuda-python;platform_system!=\"Darwin\"", - "bitsandbytes<0.42", + "bentoml[grpc,io]>=1.0.25", + "transformers[torch,tokenizers,accelerate]>=4.29.0", + "safetensors", + "optimum", + 
"attrs>=23.1.0", + "cattrs>=23.1.0", + "orjson", + "inflection", + "tabulate[widechars]>=0.9.0", + "httpx", + "click>=8.1.3", + "typing_extensions", + "mypy_extensions", + "ghapi", + "cuda-python;platform_system!=\"Darwin\"", + "bitsandbytes<0.42", ] description = "OpenLLM: Operating LLMs in production" dynamic = ["version", "readme"] keywords = [ - "MLOps", - "AI", - "BentoML", - "Model Serving", - "Model Deployment", - "LLMOps", - "Falcon", - "Vicuna", - "Llama 2", - "Fine tuning", - "Serverless", - "Large Language Model", - "Generative AI", - "StableLM", - "Alpaca", - "PyTorch", - "Transformers", + "MLOps", + "AI", + "BentoML", + "Model Serving", + "Model Deployment", + "LLMOps", + "Falcon", + "Vicuna", + "Llama 2", + "Fine tuning", + "Serverless", + "Large Language Model", + "Generative AI", + "StableLM", + "Alpaca", + "PyTorch", + "Transformers", ] license = "Apache-2.0" name = "openllm" @@ -103,21 +103,21 @@ Twitter = "https://twitter.com/bentomlai" [project.optional-dependencies] agents = ["transformers[agents]>=4.30", "diffusers", "soundfile"] all = [ - "openllm[agents]", - "openllm[baichuan]", - "openllm[chatglm]", - "openllm[falcon]", - "openllm[fine-tune]", - "openllm[flan-t5]", - "openllm[ggml]", - "openllm[gptq]", - "openllm[llama]", - "openllm[mpt]", - "openllm[openai]", - "openllm[opt]", - "openllm[playground]", - "openllm[starcoder]", - "openllm[vllm]", + "openllm[agents]", + "openllm[baichuan]", + "openllm[chatglm]", + "openllm[falcon]", + "openllm[fine-tune]", + "openllm[flan-t5]", + "openllm[ggml]", + "openllm[gptq]", + "openllm[llama]", + "openllm[mpt]", + "openllm[openai]", + "openllm[opt]", + "openllm[playground]", + "openllm[starcoder]", + "openllm[vllm]", ] baichuan = ["cpm-kernels", "sentencepiece"] chatglm = ["cpm-kernels", "sentencepiece"] @@ -141,12 +141,12 @@ source = "vcs" version-file = "src/openllm/_version.py" [tool.hatch.version.raw-options] git_describe_command = [ - "git", - "describe", - "--dirty", - "--tags", - "--long", - 
"--first-parent", + "git", + "describe", + "--dirty", + "--tags", + "--long", + "--first-parent", ] local_scheme = "no-local-version" root = ".." @@ -157,55 +157,54 @@ only-include = ["src/openllm"] sources = ["src"] [tool.hatch.build.targets.sdist] exclude = [ - "/.git_archival.txt", - "tests", - "/.python-version-default", - "ADDING_NEW_MODEL.md", + "/.git_archival.txt", + "tests", + "/.python-version-default", + "ADDING_NEW_MODEL.md", ] [tool.hatch.build.targets.wheel.hooks.mypyc] dependencies = [ - "hatch-mypyc==0.16.0", - "mypy==1.4.1", - # avoid https://github.com/pallets/click/issues/2558 - "click==8.1.3", - "bentoml==1.1.1", - "transformers>=4.31.0", - "pandas-stubs", - "types-psutil", - "types-tabulate", - "types-PyYAML", - "types-protobuf", + "hatch-mypyc==0.16.0", + "mypy==1.4.1", + # avoid https://github.com/pallets/click/issues/2558 + "click==8.1.3", + "bentoml==1.1.1", + "transformers>=4.31.0", + "pandas-stubs", + "types-psutil", + "types-tabulate", + "types-PyYAML", + "types-protobuf", ] enable-by-default = false +exclude = ["src/openllm/_service.py", "src/openllm/_typing_compat.py"] include = [ - "src/openllm/bundle", - "src/openllm/models/__init__.py", - "src/openllm/models/auto/__init__.py", - "src/openllm/utils/__init__.py", - "src/openllm/utils/codegen.py", - "src/openllm/__init__.py", - "src/openllm/_prompt.py", - "src/openllm/_schema.py", - "src/openllm/_quantisation.py", - "src/openllm/_generation.py", - "src/openllm/_strategies.py", - "src/openllm/exceptions.py", - "src/openllm/testing.py", + "src/openllm/models/__init__.py", + "src/openllm/models/auto/__init__.py", + "src/openllm/utils/__init__.py", + "src/openllm/__init__.py", + "src/openllm/_prompt.py", + "src/openllm/_schema.py", + "src/openllm/_quantisation.py", + "src/openllm/_generation.py", + "src/openllm/_strategies.py", + "src/openllm/exceptions.py", + "src/openllm/testing.py", ] # NOTE: This is consistent with pyproject.toml mypy-args = [ - "--strict", - # this is because all 
transient library doesn't have types - "--allow-subclassing-any", - "--follow-imports=skip", - "--check-untyped-defs", - "--ignore-missing-imports", - "--no-warn-return-any", - "--warn-unreachable", - "--no-warn-no-return", - "--no-warn-unused-ignores", - "--exclude='/src\\/openllm\\/playground\\/**'", - "--exclude='/src\\/openllm\\/_typing_compat\\.py$'", + "--strict", + # this is because all transient library doesn't have types + "--follow-imports=skip", + "--allow-subclassing-any", + "--check-untyped-defs", + "--ignore-missing-imports", + "--no-warn-return-any", + "--warn-unreachable", + "--no-warn-no-return", + "--no-warn-unused-ignores", + "--exclude='/src\\/openllm\\/playground\\/**'", + "--exclude='/src\\/openllm\\/_typing_compat\\.py$'", ] options = { verbose = true, strip_asserts = true, debug_level = "2", opt_level = "3", include_runtime_files = true } require-runtime-dependencies = true diff --git a/openllm-python/src/openllm/__init__.py b/openllm-python/src/openllm/__init__.py index 20e2245c..eb9048d5 100644 --- a/openllm-python/src/openllm/__init__.py +++ b/openllm-python/src/openllm/__init__.py @@ -28,7 +28,9 @@ else: _warnings.filterwarnings("ignore", message="Neither GITHUB_TOKEN nor GITHUB_JWT_TOKEN found: running as unauthenticated") _import_structure: dict[str, list[str]] = { - "exceptions": [], "models": [], "client": [], "bundle": [], "playground": [], "testing": [], "utils": ["infer_auto_class"], "serialisation": ["ggml", "transformers"], "cli._sdk": ["start", "start_grpc", "build", "import_model", "list_models"], "_llm": ["LLM", "Runner", "LLMRunner", "LLMRunnable", "LLMEmbeddings"], "_configuration": ["LLMConfig", "GenerationConfig", "SamplingParams"], "_generation": ["StopSequenceCriteria", "StopOnTokens", "LogitsProcessorList", "StoppingCriteriaList", "prepare_logits_processor"], "_quantisation": ["infer_quantisation_config"], "_schema": ["GenerationInput", "GenerationOutput", "MetadataOutput", "EmbeddingsOutput", "unmarshal_vllm_outputs", 
"HfAgentInput"], + "exceptions": [], "models": [], "client": [], "bundle": [], "playground": [], "testing": [], "utils": ["infer_auto_class"], "serialisation": ["ggml", "transformers"], "cli._sdk": ["start", "start_grpc", "build", "import_model", "list_models"], + "_llm": ["LLM", "Runner", "LLMRunner", "LLMRunnable", "LLMEmbeddings"], "_configuration": ["LLMConfig", "GenerationConfig", "SamplingParams"], "_generation": ["StopSequenceCriteria", "StopOnTokens", "LogitsProcessorList", "StoppingCriteriaList", "prepare_logits_processor"], + "_quantisation": ["infer_quantisation_config"], "_schema": ["GenerationInput", "GenerationOutput", "MetadataOutput", "EmbeddingsOutput", "unmarshal_vllm_outputs", "HfAgentInput"], "_embeddings": ["GenericEmbeddingRunnable"], "_strategies": ["CascadingResourceStrategy", "get_resource"], "models.auto": ["AutoConfig", "CONFIG_MAPPING", "MODEL_MAPPING_NAMES", "MODEL_FLAX_MAPPING_NAMES", "MODEL_TF_MAPPING_NAMES", "MODEL_VLLM_MAPPING_NAMES"], "models.chatglm": ["ChatGLMConfig"], "models.baichuan": ["BaichuanConfig"], "models.dolly_v2": ["DollyV2Config"], "models.falcon": ["FalconConfig"], "models.flan_t5": ["FlanT5Config"], "models.gpt_neox": ["GPTNeoXConfig"], "models.llama": ["LlamaConfig"], "models.mpt": ["MPTConfig"], "models.opt": ["OPTConfig"], "models.stablelm": ["StableLMConfig"], "models.starcoder": ["StarCoderConfig"] } COMPILED = _Path(__file__).suffix in (".pyd", ".so") @@ -40,6 +42,8 @@ if _t.TYPE_CHECKING: from ._llm import LLM as LLM, LLMEmbeddings as LLMEmbeddings, LLMRunnable as LLMRunnable, LLMRunner as LLMRunner, Runner as Runner from ._quantisation import infer_quantisation_config as infer_quantisation_config from ._schema import EmbeddingsOutput as EmbeddingsOutput, GenerationInput as GenerationInput, GenerationOutput as GenerationOutput, HfAgentInput as HfAgentInput, MetadataOutput as MetadataOutput, unmarshal_vllm_outputs as unmarshal_vllm_outputs + from ._embeddings import GenericEmbeddingRunnable as 
GenericEmbeddingRunnable + from ._strategies import CascadingResourceStrategy as CascadingResourceStrategy, get_resource as get_resource from .cli._sdk import build as build, import_model as import_model, list_models as list_models, start as start, start_grpc as start_grpc from .models.auto import CONFIG_MAPPING as CONFIG_MAPPING, MODEL_FLAX_MAPPING_NAMES as MODEL_FLAX_MAPPING_NAMES, MODEL_MAPPING_NAMES as MODEL_MAPPING_NAMES, MODEL_TF_MAPPING_NAMES as MODEL_TF_MAPPING_NAMES, MODEL_VLLM_MAPPING_NAMES as MODEL_VLLM_MAPPING_NAMES, AutoConfig as AutoConfig from .models.baichuan import BaichuanConfig as BaichuanConfig @@ -54,7 +58,7 @@ if _t.TYPE_CHECKING: from .models.stablelm import StableLMConfig as StableLMConfig from .models.starcoder import StarCoderConfig as StarCoderConfig from .serialisation import ggml as ggml, transformers as transformers - from openllm.utils import infer_auto_class as infer_auto_class + from .utils import infer_auto_class as infer_auto_class try: if not (utils.is_torch_available() and utils.is_cpm_kernels_available()): raise exceptions.MissingDependencyError diff --git a/openllm-python/src/openllm/_embeddings.py b/openllm-python/src/openllm/_embeddings.py new file mode 100644 index 00000000..1db41ca9 --- /dev/null +++ b/openllm-python/src/openllm/_embeddings.py @@ -0,0 +1,48 @@ +# See https://github.com/bentoml/sentence-embedding-bento for more information. 
+from __future__ import annotations +import bentoml, openllm, transformers, typing as t +from huggingface_hub import snapshot_download +from bentoml._internal.frameworks.transformers import MODULE_NAME, API_VERSION +from bentoml._internal.models.model import ModelOptions, ModelSignature +if t.TYPE_CHECKING: import torch + +_GENERIC_EMBEDDING_ID="sentence-transformers/all-MiniLM-L6-v2" +_BENTOMODEL_ID="sentence-transformers--all-MiniLM-L6-v2" + +def get_or_download(ids: str = _BENTOMODEL_ID) -> bentoml.Model: + try: + return bentoml.transformers.get(ids) + except bentoml.exceptions.NotFound: + model_signatures = {k: ModelSignature(batchable=False) for k in ("forward", "generate", "contrastive_search", "greedy_search", "sample", "beam_search", "beam_sample", "group_beam_search", "constrained_beam_search", "__call__")} + with bentoml.models.create(ids, module=MODULE_NAME, api_version=API_VERSION, options=ModelOptions(), context=openllm.utils.generate_context(framework_name="transformers"), labels={"runtime": "pt", "framework": "openllm"}, signatures=model_signatures) as bentomodel: + snapshot_download(_GENERIC_EMBEDDING_ID, local_dir=bentomodel.path, local_dir_use_symlinks=False, ignore_patterns=["*.safetensors","*.h5","*.ot","*.pdf","*.md",".gitattributes","LICENSE.txt"]) + return bentomodel + +class GenericEmbeddingRunnable(bentoml.Runnable): + SUPPORTED_RESOURCES = ("nvidia.com/gpu", "cpu") + SUPPORTS_CPU_MULTI_THREADING = True + def __init__(self) -> None: + self.device = "cuda" if openllm.utils.device_count() > 0 else "cpu" + self._bentomodel = get_or_download() + self.tokenizer = transformers.AutoTokenizer.from_pretrained(self._bentomodel.path) + self.model = transformers.AutoModel.from_pretrained(self._bentomodel.path) + self.model.to(self.device) + @bentoml.Runnable.method(batchable=True, batch_dim=0) + def encode(self, sentences: list[str]) -> t.Sequence[openllm.LLMEmbeddings]: + import torch, torch.nn.functional as F + encoded_input = 
self.tokenizer(sentences, padding=True, truncation=True, return_tensors="pt").to(self.device) + attention_mask = encoded_input["attention_mask"] + # Compute token embeddings + with torch.no_grad(): model_output = self.model(**encoded_input) + # Perform pooling and normalize + sentence_embeddings = F.normalize(self.mean_pooling(model_output, attention_mask), p=2, dim=1) + return [openllm.LLMEmbeddings(embeddings=sentence_embeddings.cpu().numpy(), num_tokens=int(torch.sum(attention_mask).item()))] + @staticmethod + def mean_pooling(model_output: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor: + import torch + # Mean Pooling - Take attention mask into account for correct averaging + token_embeddings = model_output[0] # First element of model_output contains all token embeddings + input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() + return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9) + +__all__ = ["GenericEmbeddingRunnable"] diff --git a/openllm-python/src/openllm/_llm.py b/openllm-python/src/openllm/_llm.py index 77313d89..6a43dde2 100644 --- a/openllm-python/src/openllm/_llm.py +++ b/openllm-python/src/openllm/_llm.py @@ -926,7 +926,7 @@ class LLM(LLMInterface[M, T], ReprMixin): scheduling_strategy = CascadingResourceStrategy generate_sig = ModelSignature.from_dict(t.cast("_ModelSignatureDict", ModelSignatureDict(batchable=False))) - embeddings_sig = ModelSignature.from_dict(t.cast("_ModelSignatureDict", ModelSignatureDict(batchable=False))) + embeddings_sig = ModelSignature.from_dict(t.cast("_ModelSignatureDict", ModelSignatureDict(batchable=True, batch_dim=0))) generate_iterator_sig = ModelSignature.from_dict(t.cast("_ModelSignatureDict", ModelSignatureDict(batchable=True))) # NOTE: returning the two langchain API's to the runner @@ -1036,8 +1036,8 @@ def llm_runnable_class(self: LLM[M, T], embeddings_sig: ModelSignature, generate 
logger.info("Successfully apply LoRA layer %s", adapter_name) @bentoml.Runnable.method(**method_signature(embeddings_sig)) - def embeddings(__self: _Runnable, prompt: str | list[str]) -> LLMEmbeddings: - return self.embeddings([prompt] if isinstance(prompt, str) else prompt) + def embeddings(__self: _Runnable, prompt: str | list[str]) -> t.Sequence[LLMEmbeddings]: + return [self.embeddings([prompt] if isinstance(prompt, str) else prompt)] @bentoml.Runnable.method(**method_signature(generate_sig)) def __call__(__self: _Runnable, prompt: str, **attrs: t.Any) -> list[t.Any]: diff --git a/openllm-python/src/openllm/_service.py b/openllm-python/src/openllm/_service.py index 7c80ee61..7ca3b3ef 100644 --- a/openllm-python/src/openllm/_service.py +++ b/openllm-python/src/openllm/_service.py @@ -1,9 +1,3 @@ -# mypy: disable-error-code="arg-type,misc" -"""The service definition for running any LLMService. - -For line with comment '# openllm: ...', it must not be modified as it is managed internally by OpenLLM. 
-Codegen can be found under 'openllm.utils.codegen' -""" from __future__ import annotations import os, warnings, orjson, bentoml, openllm, typing as t from starlette.applications import Starlette @@ -12,6 +6,7 @@ from starlette.routing import Route if t.TYPE_CHECKING: from starlette.requests import Request from starlette.responses import Response + from bentoml._internal.runner.runner import RunnerMethod # The following warnings from bitsandbytes, and probably not that important for users to see warnings.filterwarnings("ignore", message="MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization") warnings.filterwarnings("ignore", message="MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization") @@ -20,7 +15,10 @@ model = os.environ.get("OPENLLM_MODEL", "{__model_name__}") # openllm: model na adapter_map = os.environ.get("OPENLLM_ADAPTER_MAP", """{__model_adapter_map__}""") # openllm: model adapter map llm_config = openllm.AutoConfig.for_model(model) runner = openllm.Runner(model, llm_config=llm_config, ensure_available=False, adapter_map=orjson.loads(adapter_map)) -svc = bentoml.Service(name=f"llm-{llm_config['start_name']}-service", runners=[runner]) +generic_embedding_runner = bentoml.Runner(openllm.GenericEmbeddingRunnable, name="llm-generic-embedding", scheduling_strategy=openllm.CascadingResourceStrategy, max_batch_size=32, max_latency_ms=300) +runners: t.Sequence[bentoml.Runner] = [runner] +if not runner.supports_embeddings: runners.append(generic_embedding_runner) +svc = bentoml.Service(name=f"llm-{llm_config['start_name']}-service", runners=runners) @svc.api(route="/v1/generate", input=bentoml.io.JSON.from_sample({"prompt": "", "llm_config": llm_config.model_dump(flatten=True)}), output=bentoml.io.JSON.from_sample({"responses": [], "configuration": llm_config.model_dump(flatten=True)})) async def generate_v1(input_dict: dict[str, t.Any]) -> openllm.GenerationOutput: @@ -33,11 +31,11 @@ async def 
generate_v1(input_dict: dict[str, t.Any]) -> openllm.GenerationOutput: def metadata_v1(_: str) -> openllm.MetadataOutput: return openllm.MetadataOutput(timeout=llm_config["timeout"], model_name=llm_config["model_name"], framework=llm_config["env"]["framework_value"], model_id=runner.llm.model_id, configuration=llm_config.model_dump_json().decode(), supports_embeddings=runner.supports_embeddings, supports_hf_agent=runner.supports_hf_agent) -if runner.supports_embeddings: - @svc.api(route="/v1/embeddings", input=bentoml.io.JSON.from_sample(["Hey Jude, welcome to the jungle!", "What is the meaning of life?"]), output=bentoml.io.JSON.from_sample({"embeddings": [0.007917795330286026, -0.014421648345887661, 0.00481307040899992, 0.007331526838243008, -0.0066398633643984795, 0.00945580005645752, 0.0087016262114048, -0.010709521360695362, 0.012635177001357079, 0.010541186667978764, -0.00730888033285737, -0.001783102168701589, 0.02339819073677063, -0.010825827717781067, -0.015888236463069916, 0.01876218430697918, 0.0076906150206923485, 0.0009032754460349679, -0.010024012066423893, 0.01090280432254076, -0.008668390102684498, 0.02070549875497818, 0.0014594447566196322, -0.018775740638375282, -0.014814382418990135, 0.01796768605709076], "num_tokens": 20})) - async def embeddings_v1(phrases: list[str]) -> openllm.EmbeddingsOutput: - responses = await runner.embeddings.async_run(phrases) - return openllm.EmbeddingsOutput(embeddings=responses["embeddings"], num_tokens=responses["num_tokens"]) +@svc.api(route="/v1/embeddings", input=bentoml.io.JSON.from_sample(["Hey Jude, welcome to the jungle!", "What is the meaning of life?"]), output=bentoml.io.JSON.from_sample({"embeddings": [0.007917795330286026, -0.014421648345887661, 0.00481307040899992, 0.007331526838243008, -0.0066398633643984795, 0.00945580005645752, 0.0087016262114048, -0.010709521360695362, 0.012635177001357079, 0.010541186667978764, -0.00730888033285737, -0.001783102168701589, 0.02339819073677063, 
-0.010825827717781067, -0.015888236463069916, 0.01876218430697918, 0.0076906150206923485, 0.0009032754460349679, -0.010024012066423893, 0.01090280432254076, -0.008668390102684498, 0.02070549875497818, 0.0014594447566196322, -0.018775740638375282, -0.014814382418990135, 0.01796768605709076], "num_tokens": 20})) +async def embeddings_v1(phrases: list[str]) -> openllm.EmbeddingsOutput: + embed_call: RunnerMethod[bentoml.Runnable | openllm.LLMRunnable[t.Any, t.Any], [list[str]], t.Sequence[openllm.LLMEmbeddings]] = runner.embeddings if runner.supports_embeddings else generic_embedding_runner.encode + responses = (await embed_call.async_run(phrases))[0] + return openllm.EmbeddingsOutput(embeddings=responses["embeddings"], num_tokens=responses["num_tokens"]) if runner.supports_hf_agent and openllm.utils.is_transformers_supports_agent(): async def hf_agent(request: Request) -> Response: diff --git a/openllm-python/src/openllm/_strategies.py b/openllm-python/src/openllm/_strategies.py index e40d43c1..63255fa0 100644 --- a/openllm-python/src/openllm/_strategies.py +++ b/openllm-python/src/openllm/_strategies.py @@ -330,3 +330,5 @@ class CascadingResourceStrategy(bentoml.Strategy, ReprMixin): if idx >= len(gpus): raise ValueError(f"Number of available GPU ({gpus}) preceeds the given workers_per_resource {workers_per_resource}") dev = str(gpus[idx]) return dev + +__all__=["CascadingResourceStrategy", "get_resource"] diff --git a/openllm-python/src/openllm/_typing_compat.py b/openllm-python/src/openllm/_typing_compat.py index 8fe8d92e..dd6b35eb 100644 --- a/openllm-python/src/openllm/_typing_compat.py +++ b/openllm-python/src/openllm/_typing_compat.py @@ -82,7 +82,7 @@ class LLMRunner(bentoml.Runner, t.Generic[M, T]): supports_embeddings: bool supports_hf_agent: bool has_adapters: bool - embeddings: RunnerMethod[LLMRunnable[M, T], [list[str]], LLMEmbeddings] + embeddings: RunnerMethod[LLMRunnable[M, T], [list[str]], t.Sequence[LLMEmbeddings]] generate: 
RunnerMethod[LLMRunnable[M, T], [str], list[t.Any]] generate_one: RunnerMethod[LLMRunnable[M, T], [str, list[str]], t.Sequence[dict[t.Literal["generated_text"], str]]] generate_iterator: RunnerMethod[LLMRunnable[M, T], [str], t.Generator[t.Any, None, None]] diff --git a/openllm-python/src/openllm/bundle/_package.py b/openllm-python/src/openllm/bundle/_package.py index 288cf487..9e3bf20d 100644 --- a/openllm-python/src/openllm/bundle/_package.py +++ b/openllm-python/src/openllm/bundle/_package.py @@ -78,16 +78,15 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any], llm_fs: FS, extra_d return PythonOptions(packages=packages, wheels=wheels, lock_packages=False, extra_index_url=["https://download.pytorch.org/whl/cu118"]) def construct_docker_options(llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_resource: float, quantize: LiteralString | None, bettertransformer: bool | None, adapter_map: dict[str, str | None] | None, dockerfile_template: str | None, runtime: t.Literal["ggml", "transformers"], serialisation_format: t.Literal["safetensors", "legacy"], container_registry: LiteralContainerRegistry, container_version_strategy: LiteralContainerVersionStrategy) -> DockerOptions: - _bentoml_config_options = os.environ.pop("BENTOML_CONFIG_OPTIONS", "") - _bentoml_config_options_opts = ["tracing.sample_rate=1.0", f'runners."llm-{llm.config["start_name"]}-runner".traffic.timeout={llm.config["timeout"]}', f'api_server.traffic.timeout={llm.config["timeout"]}', f'runners."llm-{llm.config["start_name"]}-runner".traffic.timeout={llm.config["timeout"]}', f'runners."llm-{llm.config["start_name"]}-runner".workers_per_resource={workers_per_resource}'] - _bentoml_config_options += " " if _bentoml_config_options else "" + " ".join(_bentoml_config_options_opts) + from openllm.cli._factory import parse_config_options + environ = parse_config_options(llm.config, llm.config["timeout"], workers_per_resource, None, True, os.environ.copy()) env: openllm.utils.EnvVarMixin = 
llm.config["env"] if env["framework_value"] == "vllm": serialisation_format = "legacy" env_dict = { env.framework: env["framework_value"], env.config: f"'{llm.config.model_dump_json().decode()}'", env.model_id: f"/home/bentoml/bento/models/{llm.tag.path()}", "OPENLLM_MODEL": llm.config["model_name"], "OPENLLM_SERIALIZATION": serialisation_format, - "OPENLLM_ADAPTER_MAP": f"'{orjson.dumps(adapter_map).decode()}'", "BENTOML_DEBUG": str(True), "BENTOML_QUIET": str(False), "BENTOML_CONFIG_OPTIONS": f"'{_bentoml_config_options}'", + "OPENLLM_ADAPTER_MAP": f"'{orjson.dumps(adapter_map).decode()}'", "BENTOML_DEBUG": str(True), "BENTOML_QUIET": str(False), "BENTOML_CONFIG_OPTIONS": f"'{environ['BENTOML_CONFIG_OPTIONS']}'", } if adapter_map: env_dict["BITSANDBYTES_NOWELCOME"] = os.environ.get("BITSANDBYTES_NOWELCOME", "1") @@ -125,8 +124,7 @@ def create_bento(bento_tag: bentoml.Tag, llm_fs: FS, llm: openllm.LLM[t.Any, t.A ) bento = bentoml.Bento.create(build_config=build_config, version=bento_tag.version, build_ctx=llm_fs.getsyspath("/")) - # NOTE: the model_id_path here are only used for setting this environment variable within the container - # built with for BentoLLM. + # NOTE: the model_id_path here are only used for setting this environment variable within the container built with for BentoLLM. 
service_fs_path = fs.path.join("src", llm.config["service_name"]) service_path = bento._fs.getsyspath(service_fs_path) with open(service_path, "r") as f: diff --git a/openllm-python/src/openllm/cli/_factory.py b/openllm-python/src/openllm/cli/_factory.py index c93e649f..53b974ab 100644 --- a/openllm-python/src/openllm/cli/_factory.py +++ b/openllm-python/src/openllm/cli/_factory.py @@ -27,6 +27,7 @@ def parse_config_options(config: LLMConfig, server_timeout: int, workers_per_res if device: if len(device) > 1: _bentoml_config_options_opts.extend([f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"[{idx}]={dev}' for idx, dev in enumerate(device)]) else: _bentoml_config_options_opts.append(f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"=[{device[0]}]') + _bentoml_config_options_opts.append(f'runners."llm-generic-embedding".resources.cpu={openllm.get_resource({"cpu":"system"},"cpu")}') if cors: _bentoml_config_options_opts.extend(["api_server.http.cors.enabled=true", 'api_server.http.cors.access_control_allow_origins="*"']) _bentoml_config_options_opts.extend([f'api_server.http.cors.access_control_allow_methods[{idx}]="{it}"' for idx, it in enumerate(["GET", "OPTIONS", "POST", "HEAD", "PUT"])]) diff --git a/openllm-python/src/openllm/cli/_sdk.py b/openllm-python/src/openllm/cli/_sdk.py index 0e1d39d3..30bdc717 100644 --- a/openllm-python/src/openllm/cli/_sdk.py +++ b/openllm-python/src/openllm/cli/_sdk.py @@ -77,7 +77,7 @@ def _start(model_name: str, /, *, model_id: str | None = None, timeout: int = 30 return start_command_factory(start_command if not _serve_grpc else start_grpc_command, model_name, _context_settings=termui.CONTEXT_SETTINGS, _serve_grpc=_serve_grpc).main(args=args if len(args) > 0 else None, standalone_mode=False) @inject -def _build(model_name: str, /, *, model_id: str | None = None, model_version: str | None = None, quantize: t.Literal["int8", "int4", "gptq"] | None = None, bettertransformer: bool | None 
= None, adapter_map: dict[str, str | None] | None = None, build_ctx: str | None = None, enable_features: tuple[str, ...] | None = None, workers_per_resource: float | None = None, runtime: t.Literal["ggml", "transformers"] = "transformers", dockerfile_template: str | None = None, overwrite: bool = False, container_registry: LiteralContainerRegistry | None = None, container_version_strategy: LiteralContainerVersionStrategy | None = None, push: bool = False, containerize: bool = False, serialisation_format: t.Literal["safetensors", "legacy"] = "safetensors", additional_args: list[str] | None = None, bento_store: BentoStore = Provide[BentoMLContainer.bento_store]) -> bentoml.Bento: +def _build(model_name: str, /, *, model_id: str | None = None, model_version: str | None = None, bento_version: str | None = None, quantize: t.Literal["int8", "int4", "gptq"] | None = None, bettertransformer: bool | None = None, adapter_map: dict[str, str | None] | None = None, build_ctx: str | None = None, enable_features: tuple[str, ...] | None = None, workers_per_resource: float | None = None, runtime: t.Literal["ggml", "transformers"] = "transformers", dockerfile_template: str | None = None, overwrite: bool = False, container_registry: LiteralContainerRegistry | None = None, container_version_strategy: LiteralContainerVersionStrategy | None = None, push: bool = False, containerize: bool = False, serialisation_format: t.Literal["safetensors", "legacy"] = "safetensors", additional_args: list[str] | None = None, bento_store: BentoStore = Provide[BentoMLContainer.bento_store]) -> bentoml.Bento: """Package a LLM into a Bento. 
The LLM will be built into a BentoService with the following structure: @@ -92,6 +92,7 @@ def _build(model_name: str, /, *, model_id: str | None = None, model_version: st model_name: The model name to start this LLM model_id: Optional model id for this given LLM model_version: Optional model version for this given LLM + bento_version: Optional bento version for this given BentoLLM quantize: Quantize the model weights. This is only applicable for PyTorch models. Possible quantisation strategies: - int8: Quantize the model with 8bit (bitsandbytes required) @@ -126,7 +127,7 @@ def _build(model_name: str, /, *, model_id: str | None = None, model_version: st Returns: ``bentoml.Bento | str``: BentoLLM instance. This can be used to serve the LLM or can be pushed to BentoCloud. """ - args: list[str] = [sys.executable, "-m", "openllm", "build", model_name, "--machine", "--runtime", runtime, "--serialisation", serialisation_format,] + args: list[str] = [sys.executable, "-m", "openllm", "build", model_name, "--machine", "--runtime", runtime, "--serialisation", serialisation_format] if quantize and bettertransformer: raise OpenLLMException("'quantize' and 'bettertransformer' are currently mutually exclusive.") if quantize: args.extend(["--quantize", quantize]) if bettertransformer: args.append("--bettertransformer") @@ -140,6 +141,7 @@ def _build(model_name: str, /, *, model_id: str | None = None, model_version: st if overwrite: args.append("--overwrite") if adapter_map: args.extend([f"--adapter-id={k}{':'+v if v is not None else ''}" for k, v in adapter_map.items()]) if model_version: args.extend(["--model-version", model_version]) + if bento_version: args.extend(["--bento-version", bento_version]) if dockerfile_template: args.extend(["--dockerfile-template", dockerfile_template]) if container_registry is None: container_registry = "ecr" if container_version_strategy is None: container_version_strategy = "release" diff --git 
a/openllm-python/src/openllm/client/runtimes/http.py b/openllm-python/src/openllm/client/runtimes/http.py index f59f6f5a..a8a73bf7 100644 --- a/openllm-python/src/openllm/client/runtimes/http.py +++ b/openllm-python/src/openllm/client/runtimes/http.py @@ -20,7 +20,6 @@ class HTTPClient(BaseClient[DictStrAny]): def health(self) -> t.Any: return self._cached.health() def embed(self, prompt: t.Sequence[str] | str) -> openllm.EmbeddingsOutput: - if not self.supports_embeddings: raise ValueError("This model does not support embeddings.") if isinstance(prompt, str): prompt = [prompt] result = httpx.post(urljoin(self._address, f"/{self._api_version}/embeddings"), json=list(prompt), timeout=self.timeout).json() if in_async_context() else self.call("embeddings", list(prompt)) return openllm.EmbeddingsOutput(**result) @@ -62,7 +61,6 @@ class AsyncHTTPClient(BaseAsyncClient[DictStrAny]): async def health(self) -> t.Any: return await self._cached.async_health() async def embed(self, prompt: t.Sequence[str] | str) -> openllm.EmbeddingsOutput: - if not self.supports_embeddings: raise ValueError("This model does not support embeddings.") if isinstance(prompt, str): prompt = [prompt] res = await self.acall("embeddings", list(prompt)) return openllm.EmbeddingsOutput(**res) diff --git a/openllm-python/src/openllm/serialisation/transformers/__init__.py b/openllm-python/src/openllm/serialisation/transformers/__init__.py index 5ab9148d..0e0c07b6 100644 --- a/openllm-python/src/openllm/serialisation/transformers/__init__.py +++ b/openllm-python/src/openllm/serialisation/transformers/__init__.py @@ -106,8 +106,7 @@ def import_model(llm: openllm.LLM[M, T], *decls: t.Any, trust_remote_code: bool, else: # we will clone the all tings into the bentomodel path without loading model into memory snapshot_download(llm.model_id, local_dir=bentomodel.path, local_dir_use_symlinks=False, ignore_patterns=HfIgnore.ignore_patterns(llm)) - except Exception: - raise + except Exception: raise else: 
bentomodel.flush() # type: ignore[no-untyped-call] bentomodel.save(_model_store) @@ -117,7 +116,6 @@ def import_model(llm: openllm.LLM[M, T], *decls: t.Any, trust_remote_code: bool, # NOTE: We need to free up the cache after importing the model # in the case where users first run openllm start without the model available locally. if openllm.utils.is_torch_available() and torch.cuda.is_available(): torch.cuda.empty_cache() - return bentomodel def get(llm: openllm.LLM[M, T], auto_import: bool = False) -> bentoml.Model: diff --git a/openllm-python/src/openllm/utils/codegen.py b/openllm-python/src/openllm/utils/codegen.py index 49431d5c..994e8818 100644 --- a/openllm-python/src/openllm/utils/codegen.py +++ b/openllm-python/src/openllm/utils/codegen.py @@ -32,7 +32,7 @@ class ModelIdFormatter(ModelNameFormatter): class ModelAdapterMapFormatter(ModelNameFormatter): model_keyword: LiteralString = "__model_adapter_map__" -_service_file = Path(os.path.abspath("__file__")).parent.parent/"_service.py" +_service_file = Path(os.path.abspath(__file__)).parent.parent/"_service.py" def write_service(llm: openllm.LLM[t.Any, t.Any], adapter_map: dict[str, str | None] | None, llm_fs: FS) -> None: from openllm.utils import DEBUG model_name = llm.config["model_name"] diff --git a/pyproject.toml b/pyproject.toml index 2c427fb3..b3368009 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,61 +1,61 @@ [build-system] build-backend = "hatchling.build" requires = [ - "hatchling==1.18.0", - "hatch-vcs==0.3.0", - "hatch-fancy-pypi-readme==23.1.0", - "hatch-mypyc==0.16.0", + "hatchling==1.18.0", + "hatch-vcs==0.3.0", + "hatch-fancy-pypi-readme==23.1.0", + "hatch-mypyc==0.16.0", ] [project] authors = [ - { name = "Aaron Pham", email = "aarnphm@bentoml.com" }, - { name = "BentoML Team", email = "contact@bentoml.com" }, + { name = "Aaron Pham", email = "aarnphm@bentoml.com" }, + { name = "BentoML Team", email = "contact@bentoml.com" }, ] classifiers = [ - "Development Status :: 5 - 
Production/Stable", - "Environment :: GPU :: NVIDIA CUDA", - "Environment :: GPU :: NVIDIA CUDA :: 12", - "Environment :: GPU :: NVIDIA CUDA :: 11.8", - "Environment :: GPU :: NVIDIA CUDA :: 11.7", - "License :: OSI Approved :: Apache Software License", - "Topic :: Scientific/Engineering :: Artificial Intelligence", - "Topic :: Software Development :: Libraries", - "Operating System :: OS Independent", - "Intended Audience :: Developers", - "Intended Audience :: Science/Research", - "Intended Audience :: System Administrators", - "Typing :: Typed", - "Programming Language :: Python", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3 :: Only", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", - "Programming Language :: Python :: 3.12", - "Programming Language :: Python :: Implementation :: CPython", - "Programming Language :: Python :: Implementation :: PyPy", + "Development Status :: 5 - Production/Stable", + "Environment :: GPU :: NVIDIA CUDA", + "Environment :: GPU :: NVIDIA CUDA :: 12", + "Environment :: GPU :: NVIDIA CUDA :: 11.8", + "Environment :: GPU :: NVIDIA CUDA :: 11.7", + "License :: OSI Approved :: Apache Software License", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "Topic :: Software Development :: Libraries", + "Operating System :: OS Independent", + "Intended Audience :: Developers", + "Intended Audience :: Science/Research", + "Intended Audience :: System Administrators", + "Typing :: Typed", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 
Implementation :: CPython", + "Programming Language :: Python :: Implementation :: PyPy", ] description = "OpenLLM monorepo" dynamic = ["version", "readme", "dependencies"] keywords = [ - "MLOps", - "AI", - "BentoML", - "Model Serving", - "Model Deployment", - "LLMOps", - "Falcon", - "Vicuna", - "Llama 2", - "Fine tuning", - "Serverless", - "Large Language Model", - "Generative AI", - "StableLM", - "Alpaca", - "PyTorch", - "Transformers", + "MLOps", + "AI", + "BentoML", + "Model Serving", + "Model Deployment", + "LLMOps", + "Falcon", + "Vicuna", + "Llama 2", + "Fine tuning", + "Serverless", + "Large Language Model", + "Generative AI", + "StableLM", + "Alpaca", + "PyTorch", + "Transformers", ] license = "Apache-2.0" name = "openllm-monorepo" @@ -74,12 +74,12 @@ fallback-version = "0.0.0" source = "vcs" [tool.hatch.version.raw-options] git_describe_command = [ - "git", - "describe", - "--dirty", - "--tags", - "--long", - "--first-parent", + "git", + "describe", + "--dirty", + "--tags", + "--long", + "--first-parent", ] local_scheme = "no-local-version" [tool.hatch.metadata] @@ -151,59 +151,59 @@ toplevel = ["openllm"] [tool.ruff] extend-exclude = [ - "tools", - "examples", - "openllm-python/src/openllm/playground", - "openllm-python/src/openllm/__init__.py", - "openllm-python/src/openllm/_version.py", - "openllm-python/src/openllm/utils/dummy_*.py", - "openllm-python/src/openllm/models/__init__.py", + "tools", + "examples", + "openllm-python/src/openllm/playground", + "openllm-python/src/openllm/__init__.py", + "openllm-python/src/openllm/_version.py", + "openllm-python/src/openllm/utils/dummy_*.py", + "openllm-python/src/openllm/models/__init__.py", ] extend-include = ["*.ipynb"] extend-select = [ - "B", # flake8-bugbear - "I", # isort - "G", # flake8-logging-format - "D", # pydocstyle - "W", # pycodestyle - "Q", # flake8-quotes - "FA", # flake8-future-annotations - "S", # flake8-bandit - "TCH", # flake8-type-checking - "PLW", # pylint-warning - "PLR", # 
pylint-refactor - "PT", # flake8-pytest-style - "PERF", # perflint - "FLY", # flynt - "RUF", # Ruff-specific rules - "YTT", # flake8-2020 + "B", # flake8-bugbear + "I", # isort + "G", # flake8-logging-format + "D", # pydocstyle + "W", # pycodestyle + "Q", # flake8-quotes + "FA", # flake8-future-annotations + "S", # flake8-bandit + "TCH", # flake8-type-checking + "PLW", # pylint-warning + "PLR", # pylint-refactor + "PT", # flake8-pytest-style + "PERF", # perflint + "FLY", # flynt + "RUF", # Ruff-specific rules + "YTT", # flake8-2020 ] fix = true ignore = [ - "B027", # Allow non-abstract empty methods in abstract base classes - "FBT003", # Allow boolean positional values in function calls, like `dict.get(... True)` - "S105", # Ignore checks for possible passwords - "S106", - "S107", - "S603", # ignore subprocess.call - "PLR0911", - "PLR0912", - "PLR0913", - "PLR0915", - "PLR2004", # magic value to use constant - "E501", # ignore line length violation - "E401", # ignore multiple line import - "E702", - "I001", # unsorted imports - "D103", # Just missing docstring for magic methods. - "D102", - "D101", - "D100", - "TCH004", # don't move runtime import out, just warn about it - "RUF012", # mutable attributes to be used with ClassVar - "B905", # zip warning about strict, only applicable for 3.10+ - "D105", # magic docstring - "E701", # multiple statement on single line + "B027", # Allow non-abstract empty methods in abstract base classes + "FBT003", # Allow boolean positional values in function calls, like `dict.get(... True)` + "S105", # Ignore checks for possible passwords + "S106", + "S107", + "S603", # ignore subprocess.call + "PLR0911", + "PLR0912", + "PLR0913", + "PLR0915", + "PLR2004", # magic value to use constant + "E501", # ignore line length violation + "E401", # ignore multiple line import + "E702", + "I001", # unsorted imports + "D103", # Just missing docstring for magic methods. 
+ "D102", + "D101", + "D100", + "TCH004", # don't move runtime import out, just warn about it + "RUF012", # mutable attributes to be used with ClassVar + "B905", # zip warning about strict, only applicable for 3.10+ + "D105", # magic docstring + "E701", # multiple statement on single line ] line-length = 768 target-version = "py38" @@ -212,10 +212,10 @@ unfixable = ["TCH004"] [tool.ruff.flake8-type-checking] exempt-modules = ["typing", "typing_extensions", "openllm._typing_compat"] runtime-evaluated-base-classes = [ - "pydantic.BaseModel", - "openllm._configuration.LLMConfig", - "openllm._configuration.GenerationConfig", - "openllm._configuration.ModelSettings", + "pydantic.BaseModel", + "openllm._configuration.LLMConfig", + "openllm._configuration.GenerationConfig", + "openllm._configuration.ModelSettings", ] runtime-evaluated-decorators = ["attrs.define", "attrs.frozen", "trait"] [tool.ruff.pydocstyle] @@ -228,12 +228,12 @@ force-single-line = false force-wrap-aliases = true known-first-party = ["openllm", "bentoml"] known-third-party = [ - "transformers", - "click", - "huggingface_hub", - "torch", - "vllm", - "auto_gptq", + "transformers", + "click", + "huggingface_hub", + "torch", + "vllm", + "auto_gptq", ] lines-after-imports = 0 lines-between-types = 0 @@ -305,44 +305,44 @@ openllm = ["openllm-python/src/openllm", "*/openllm-python/src/openllm"] [tool.coverage.run] branch = true omit = [ - "__pypackages__/*", - "openllm-python/src/openllm/_version.py", - "openllm-python/src/openllm/playground/", - "openllm-python/src/openllm/__init__.py", - "openllm-python/src/openllm/__main__.py", - "openllm-python/src/openllm/utils/dummy_*.py", - "openllm-python/src/openllm/_typing_compat.py", + "__pypackages__/*", + "openllm-python/src/openllm/_version.py", + "openllm-python/src/openllm/playground/", + "openllm-python/src/openllm/__init__.py", + "openllm-python/src/openllm/__main__.py", + "openllm-python/src/openllm/utils/dummy_*.py", + 
"openllm-python/src/openllm/_typing_compat.py", ] source_pkgs = ["openllm"] [tool.coverage.report] exclude_lines = [ - "no cov", - "pragma: no cover", - "if __name__ == .__main__.:", - "if t.TYPE_CHECKING:", - "if _t.TYPE_CHECKING:", - 'if TYPE_CHECKING:', - 'if typing.TYPE_CHECKING:', - 'if t.TYPE_CHECKING and not MYPY:', - 'if DEBUG:', - 'if utils.DEBUG', - 'if openllm.utils.DEBUG', - '@_overload', - '@overload', - '@t.overload', - '@typing.overload', - 'raise NotImplementedError', - 'raise NotImplemented', - 'except MissingDependencyError:', + "no cov", + "pragma: no cover", + "if __name__ == .__main__.:", + "if t.TYPE_CHECKING:", + "if _t.TYPE_CHECKING:", + 'if TYPE_CHECKING:', + 'if typing.TYPE_CHECKING:', + 'if t.TYPE_CHECKING and not MYPY:', + 'if DEBUG:', + 'if utils.DEBUG', + 'if openllm.utils.DEBUG', + '@_overload', + '@overload', + '@t.overload', + '@typing.overload', + 'raise NotImplementedError', + 'raise NotImplemented', + 'except MissingDependencyError:', ] omit = [ - "__pypackages__/*", - "openllm-python/src/openllm/_version.py", - "openllm-python/src/openllm/playground/", - "openllm-python/src/openllm/__init__.py", - "openllm-python/src/openllm/__main__.py", - "openllm-python/src/openllm/utils/dummy_*.py", - "openllm-python/src/openllm/_typing_compat.py", + "__pypackages__/*", + "openllm-python/src/openllm/_version.py", + "openllm-python/src/openllm/playground/", + "openllm-python/src/openllm/__init__.py", + "openllm-python/src/openllm/__main__.py", + "openllm-python/src/openllm/utils/dummy_*.py", + "openllm-python/src/openllm/_typing_compat.py", ] precision = 2 show_missing = true @@ -353,19 +353,19 @@ show_missing = true [tool.pyright] analysis.useLibraryCodeForTypes = true exclude = [ - "__pypackages__/*", - "openllm-python/src/openllm/playground/", - "openllm-python/src/openllm/models/", - "openllm-python/src/openllm/__init__.py", - "openllm-python/src/openllm/__main__.py", - "openllm-python/src/openllm/utils/dummy_*.py", - 
"openllm-python/src/openllm/_typing_compat.py", - "openllm-python/tests", - "cz.py", - "hatch_build.py", - "tools", - "examples", - "typings", + "__pypackages__/*", + "openllm-python/src/openllm/playground/", + "openllm-python/src/openllm/models/", + "openllm-python/src/openllm/__init__.py", + "openllm-python/src/openllm/__main__.py", + "openllm-python/src/openllm/utils/dummy_*.py", + "openllm-python/src/openllm/_typing_compat.py", + "openllm-python/tests", + "cz.py", + "hatch_build.py", + "tools", + "examples", + "typings", ] executionEnvironments = [{ root = "openllm-python/src" }] pythonVersion = "3.12" @@ -382,14 +382,15 @@ typeCheckingMode = "strict" [tool.mypy] exclude = [ - "examples", - "tools", - "cz.py", - "openllm-python/tests", - "openllm-python/src/openllm/playground", - "openllm-python/src/openllm/utils/dummy_*.py", - "openllm-python/src/openllm/models", - "openllm-python/src/openllm/_typing_compat.py", + "examples", + "tools", + "cz.py", + "openllm-python/tests", + "openllm-python/src/openllm/playground", + "openllm-python/src/openllm/utils/dummy_*.py", + "openllm-python/src/openllm/models", + "openllm-python/src/openllm/_typing_compat.py", + "openllm-python/src/openllm/_service.py", ] modules = ["openllm"] mypy_path = "typings" @@ -404,24 +405,24 @@ warn_unused_ignores = false [[tool.mypy.overrides]] ignore_missing_imports = true module = [ - "IPython.*", - "numpy.*", - "tensorflow.*", - "torch.*", - "optimum.*", - "inflection.*", - "huggingface_hub.*", - "click_option_group.*", - "peft.*", - "auto_gptq.*", - "vllm.*", - "orjson.*", - "httpx.*", - "cloudpickle.*", - "circus.*", - "grpc_health.v1.*", - "transformers.*", - "ghapi.*", + "IPython.*", + "numpy.*", + "tensorflow.*", + "torch.*", + "optimum.*", + "inflection.*", + "huggingface_hub.*", + "click_option_group.*", + "peft.*", + "auto_gptq.*", + "vllm.*", + "orjson.*", + "httpx.*", + "cloudpickle.*", + "circus.*", + "grpc_health.v1.*", + "transformers.*", + "ghapi.*", ] [[tool.mypy.overrides]] 
ignore_errors = true diff --git a/tools/run-release-action b/tools/run-release-action index b454becd..501a851d 100755 --- a/tools/run-release-action +++ b/tools/run-release-action @@ -1,6 +1,6 @@ #!/usr/bin/env bash -set -e -o pipefail +set -ex -o pipefail # Function to print script usage print_usage() { @@ -55,21 +55,55 @@ else exit 1 fi +#get highest tags across all branches, not just the current branch +version="$(git describe --tags "$(git rev-list --tags --max-count=1)")" +VERSION="${version#v}" +# Save the current value of IFS to restore it later +OLD_IFS=$IFS +IFS='.' +# split into array +read -ra VERSION_BITS <<< "$VERSION" +# Restore the original value of IFS +IFS=$OLD_IFS +VNUM1=${VERSION_BITS[0]} +VNUM2=${VERSION_BITS[1]} +VNUM3=${VERSION_BITS[2]} + +if [[ $release == 'major' ]]; then + VNUM1=$((VNUM1 + 1)) + VNUM2=0 + VNUM3=0 +elif [[ $release == 'minor' ]]; then + VNUM2=$((VNUM2 + 1)) + VNUM3=0 +else + VNUM3=$((VNUM3 + 1)) +fi +#create new tag +RELEASE_TAG="v$VNUM1.$VNUM2.$VNUM3" + echo "Running release actions (create-releases.yml)..." echo '{"release_type": "'"$release"'"}' | gh workflow run create-releases.yml --repo bentoml/openllm --json -sleep 10 +sleep 20 -echo "Getting 'create-releases.yml' workflow id..." -WORKFLOW_ID=$(gh run list -w release --repo bentoml/openllm -L 1 --json databaseId | jq '.[]| .databaseId') - -echo "Waiting for workflow create-releases.yml to complete..." +echo "Waiting for new tags to be released from 'create-releases.yml'" while true; do - STATUS=$(gh run view "$WORKFLOW_ID" --json status --jq '.status') - if [[ "${STATUS}" == "completed" ]]; then + git pull --autostash --no-edit --gpg-sign --ff origin main + if git ls-remote -t --exit-code origin "refs/tags/${RELEASE_TAG}" &> /dev/null; then break fi sleep 10 done -sleep 10 +# sleep 5 +# echo "Building OpenLLM container for ${RELEASE_TAG}..." 
+# gh workflow run build.yml -R bentoml/openllm -r "${RELEASE_TAG}" +# +# sleep 5 +# echo "Building OpenLLM embedding container for ${RELEASE_TAG}..." +# gh workflow run build-embedding.yml -R bentoml/openllm -r "${RELEASE_TAG}" +# +# sleep 5 +# echo "Building Clojure UI (community-maintained) for ${RELEASE_TAG}..." +# gh workflow run clojure-frontend.yml -R bentoml/openllm -r "${RELEASE_TAG}"