feat(ci): running CI on paperspace (#998)

* chore: update tiny script

  Signed-off-by: paperspace <29749331+aarnphm@users.noreply.github.com>

* feat(ci): running on paperspace machines

  Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

* chore: update models and increase timeout readiness

  Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

* fix: schema validation for inputs and update client supporting stop

  Signed-off-by: paperspace <29749331+aarnphm@users.noreply.github.com>

* chore: update coverage config

  Signed-off-by: paperspace <29749331+aarnphm@users.noreply.github.com>

* chore: remove some non-essentials

  Signed-off-by: paperspace <29749331+aarnphm@users.noreply.github.com>

* chore: update locks

  Signed-off-by: paperspace <29749331+aarnphm@users.noreply.github.com>

---------

Signed-off-by: paperspace <29749331+aarnphm@users.noreply.github.com>
Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
2026-04-17 21:48:41 -04:00 · 2024-05-26 13:14:54 -04:00
parent a58e12d116
commit f248ea25cd
18 changed files with 180 additions and 760 deletions
--- a/.github/workflows/binary-releases.yml
+++ b/.github/workflows/binary-releases.yml
@@ -8,8 +8,6 @@ on:
    paths-ignore:
      - '*.md'
      - 'docs/**'
-      - 'bazel/**'
-      - 'typings/**'
      - 'changelog.d/**'
      - 'assets/**'
      - 'openllm-node/**'
@@ -19,8 +17,6 @@ on:
    paths-ignore:
      - '*.md'
      - 'docs/**'
-      - 'bazel/**'
-      - 'typings/**'
      - 'changelog.d/**'
      - 'assets/**'
      - 'openllm-node/**'
@@ -74,10 +70,10 @@ jobs:
        run: python -m pip install --upgrade build
      - name: Build
        run: |
-          bash local.sh
+          bash local.sh -e vllm
          python -m build -sw openllm-python/
      - name: Upload artifacts
-        uses: actions/upload-artifact@a8a3f3ad30e3422c9c7b888a15615d19a852ae32 # ratchet:actions/upload-artifact@v3
+        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808 # ratchet:actions/upload-artifact@v4
        with:
          name: binary-artefacts
          path: openllm-python/dist/*
@@ -146,7 +142,7 @@ jobs:
      - name: Install Hatch
        run: pip install -U hatch
      - name: Install Rust toolchain
-        uses: dtolnay/rust-toolchain@78c6b5541adb5849f5d72d15da722aedb26327ca # ratchet:dtolnay/rust-toolchain@stable
+        uses: dtolnay/rust-toolchain@d388a4836fcdbde0e50e395dc79a2670ccdef13f # ratchet:dtolnay/rust-toolchain@stable
        with:
          targets: ${{ matrix.job.target }}
      - name: Set up cross compiling
@@ -165,7 +161,7 @@ jobs:
          fi
      - name: Download Python artifacts
        if: ${{ !startsWith(github.event.ref, 'refs/tags') }}
-        uses: actions/download-artifact@9bc31d5ccc31df68ecc42ccf4149144866c47d8a # ratchet:actions/download-artifact@v3
+        uses: actions/download-artifact@65a9edc5881444af0b9093a5e628f2fe47ea3b2e # ratchet:actions/download-artifact@v4
        with:
          name: binary-artefacts
          path: openllm-python/dist
@@ -211,14 +207,14 @@ jobs:
          done
      - name: Upload staged archive
        if: runner.os != 'Linux'
-        uses: actions/upload-artifact@a8a3f3ad30e3422c9c7b888a15615d19a852ae32 # ratchet:actions/upload-artifact@v3
+        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808 # ratchet:actions/upload-artifact@v4
        with:
          name: staged-${{ runner.os }}
          path: openllm-python/packaging/*
          if-no-files-found: error
      - name: Upload archive
        if: runner.os == 'Linux'
-        uses: actions/upload-artifact@a8a3f3ad30e3422c9c7b888a15615d19a852ae32 # ratchet:actions/upload-artifact@v3
+        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808 # ratchet:actions/upload-artifact@v4
        with:
          name: standalone
          path: openllm-python/packaging/*
@@ -245,7 +241,7 @@ jobs:
      - name: Install PyOxidizer ${{ env.PYOXIDIZER_VERSION }}
        run: pip install pyoxidizer==${{ env.PYOXIDIZER_VERSION }}
      - name: Download staged binaries
-        uses: actions/download-artifact@9bc31d5ccc31df68ecc42ccf4149144866c47d8a # ratchet:actions/download-artifact@v3
+        uses: actions/download-artifact@65a9edc5881444af0b9093a5e628f2fe47ea3b2e # ratchet:actions/download-artifact@v4
        with:
          name: staged-${{ runner.os }}
          path: openllm-python/archives
@@ -274,13 +270,13 @@ jobs:
          mkdir installers
          mv build/*/release/*/*.{exe,msi} installers
      - name: Upload binaries
-        uses: actions/upload-artifact@a8a3f3ad30e3422c9c7b888a15615d19a852ae32 # ratchet:actions/upload-artifact@v3
+        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808 # ratchet:actions/upload-artifact@v4
        with:
          name: standalone
          path: openllm-python/archives/*
          if-no-files-found: error
      - name: Upload installers
-        uses: actions/upload-artifact@a8a3f3ad30e3422c9c7b888a15615d19a852ae32 # ratchet:actions/upload-artifact@v3
+        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808 # ratchet:actions/upload-artifact@v4
        with:
          name: installers
          path: openllm-python/installers/*
--- a/.github/workflows/build-pypi.yml
+++ b/.github/workflows/build-pypi.yml
@@ -86,9 +86,9 @@ jobs:
        run: hatch build
        working-directory: ${{ matrix.directory }}
      - name: Upload artifacts
-        uses: actions/upload-artifact@a8a3f3ad30e3422c9c7b888a15615d19a852ae32 # ratchet:actions/upload-artifact@v3
+        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808 # ratchet:actions/upload-artifact@v4
        with:
-          name: python-artefacts
+          name: python-artefacts-${{ matrix.directory }}
          path: ${{ matrix.directory }}/dist/*
          if-no-files-found: error
  check-download-artefacts:
@@ -98,9 +98,10 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Download Python artifacts
-        uses: actions/download-artifact@9bc31d5ccc31df68ecc42ccf4149144866c47d8a # ratchet:actions/download-artifact@v3
+        uses: actions/download-artifact@65a9edc5881444af0b9093a5e628f2fe47ea3b2e # ratchet:actions/download-artifact@v4
        with:
-          name: python-artefacts
+          pattern: python-artefacts-*
+          merge-multiple: true
          path: dist
      - name: dry ls
        run: ls -rthlaR
@@ -110,13 +111,13 @@ jobs:
    runs-on: ubuntu-latest
    permissions:
      id-token: write
-    # needs: [pure-wheels-sdist, mypyc]
    needs: [pure-wheels-sdist]
    steps:
      - name: Download Python artifacts
-        uses: actions/download-artifact@9bc31d5ccc31df68ecc42ccf4149144866c47d8a # ratchet:actions/download-artifact@v3
+        uses: actions/download-artifact@65a9edc5881444af0b9093a5e628f2fe47ea3b2e # ratchet:actions/download-artifact@v4
        with:
-          name: python-artefacts
+          pattern: python-artefacts-*
+          merge-multiple: true
          path: dist
      - name: Publish nightly wheels to test.pypi.org
        uses: pypa/gh-action-pypi-publish@81e9d935c883d0b210363ab89cf05f3894778450 # ratchet:pypa/gh-action-pypi-publish@release/v1
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -1,222 +0,0 @@
-name: Build and push OpenLLM base container
-on:
-  workflow_dispatch:
-  push:
-    branches:
-      - 'main'
-    tags:
-      - '*'
-    paths:
-      - 'openllm-python/src/openllm/**'
-      - 'openllm-python/src/openllm_cli/**'
-      - 'openllm-core/src/openllm_core/**'
-      - 'openllm-client/src/openllm_client/**'
-  pull_request:
-    branches:
-      - 'main'
-    paths:
-      - 'openllm-python/src/openllm/**'
-      - 'openllm-python/src/openllm_cli/**'
-      - 'openllm-core/src/openllm_core/**'
-      - 'openllm-client/src/openllm_client/**'
-    types: [labeled, opened, synchronize, reopened]
-  workflow_call:
-    inputs:
-      tags:
-        required: true
-        type: string
-env:
-  LINES: 120
-  COLUMNS: 120
-  OPENLLM_DO_NOT_TRACK: True
-  PYTHONUNBUFFERED: '1'
-  AWS_REGION: us-west-2
-concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}
-  cancel-in-progress: true
-jobs:
-  get_commit_message:
-    name: Get commit message
-    runs-on: ubuntu-latest
-    if: "github.repository == 'bentoml/OpenLLM'" # Don't run on fork repository
-    outputs:
-      message: ${{ steps.commit_message.outputs.message }}
-    steps:
-      - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # ratchet:actions/checkout@v4.1.6
-        # Gets the correct commit message for pull request
-        with:
-          ref: ${{ github.event.pull_request.head.sha }}
-      - name: Get commit message
-        id: commit_message
-        run: |
-          set -xe
-          COMMIT_MSG=$(git log --no-merges -1 --oneline)
-          echo "message=$COMMIT_MSG" >> $GITHUB_OUTPUT
-          echo github.ref ${{ github.ref }}
-  start-runner:
-    name: Start self-hosted EC2 runner
-    runs-on: ubuntu-latest
-    needs: get_commit_message
-    if: >-
-      contains(needs.get_commit_message.outputs.message, '[ec2 build]') || github.event_name == 'workflow_dispatch' || (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, '00 - EC2 Build')) || (github.event_name == 'push' && (startsWith(github.ref, 'refs/tags/v') || startsWith(github.ref, 'refs/heads/main')))
-    env:
-      EC2_INSTANCE_TYPE: t3.2xlarge
-      EC2_AMI_ID: ami-089dafe9af191a0fd
-      EC2_SUBNET_ID: subnet-0ca63188fe98788c1,subnet-05997205433b249d0,subnet-07ef5d3e974275fed,subnet-0161ef0151089bb0b
-      EC2_SECURITY_GROUP: sg-051366641bf2b8049
-    outputs:
-      label: ${{ steps.start-ec2-runner.outputs.label }}
-      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
-    steps:
-      - name: Configure AWS credentials
-        uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # ratchet:aws-actions/configure-aws-credentials@v4.0.2
-        with:
-          aws-access-key-id: ${{ secrets.AWS_PROD_ACCESS_KEY_ID }}
-          aws-secret-access-key: ${{ secrets.AWS_PROD_SECRET_ACCESS_KEY }}
-          aws-region: ${{ env.AWS_REGION }}
-      - name: Start EC2 Runner
-        id: start-ec2-runner
-        uses: aarnphm/ec2-github-runner@main # ratchet:exclude
-        with:
-          mode: start
-          github-token: ${{ secrets.OPENLLM_PAT }}
-          ec2-region: ${{ env.AWS_REGION }}
-          ec2-image-id: ${{ env.EC2_AMI_ID }}
-          ec2-instance-type: ${{ env.EC2_INSTANCE_TYPE }}
-          subnet-id: ${{ env.EC2_SUBNET_ID }}
-          security-group-id: ${{ env.EC2_SECURITY_GROUP }}
-  build-and-push-image:
-    name: Build and push OpenLLM base image
-    concurrency:
-      group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }}
-      cancel-in-progress: true
-    needs: start-runner
-    runs-on: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runner
-    permissions:
-      contents: write
-      packages: write
-      # This is used to complete the identity challenge
-      # with sigstore/fulcio when running outside of PRs.
-      id-token: write
-      security-events: write
-    steps:
-      - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # ratchet:actions/checkout@v4.1.6
-        with:
-          fetch-depth: 0
-          ref: '${{ inputs.tags }}'
-      - name: Inject slug/short variables
-        uses: rlespinasse/github-slug-action@797d68864753cbceedc271349d402da4590e6302 # ratchet:rlespinasse/github-slug-action@v4.5.0
-      - name: Set up QEMU
-        uses: docker/setup-qemu-action@68827325e0b33c7199eb31dd4e31fbe9023e06e3 # ratchet:docker/setup-qemu-action@v3.0.0
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@d70bba72b1f3fd22344832f00baa16ece964efeb # ratchet:docker/setup-buildx-action@v3.3.0
-        with:
-          install: true
-          driver-opts: |
-            image=moby/buildkit:master
-            network=host
-      - name: Install cosign
-        if: github.event_name != 'pull_request'
-        uses: sigstore/cosign-installer@59acb6260d9c0ba8f4a2f9d9b48431a222b68e20 # ratchet:sigstore/cosign-installer@v3.5.0
-        with:
-          cosign-release: 'v2.1.1'
-      - name: Login to GitHub Container Registry
-        uses: docker/login-action@e92390c5fb421da1463c202d546fed0ec5c39f20 # ratchet:docker/login-action@v3.1.0
-        if: github.event_name != 'pull_request'
-        with:
-          registry: ghcr.io
-          username: ${{ github.actor }}
-          password: ${{ secrets.GITHUB_TOKEN }}
-      - name: Extract metadata tags and labels for main, release or tag
-        if: github.event_name != 'pull_request'
-        id: meta
-        uses: docker/metadata-action@8e5442c4ef9f78752691e2d8f8d19755c6f78e81 # ratchet:docker/metadata-action@v5.5.1
-        with:
-          flavor: |
-            latest=auto
-          images: |
-            ghcr.io/bentoml/openllm
-          tags: |
-            type=semver,pattern={{version}}
-            type=semver,pattern={{major}}.{{minor}}
-            type=raw,value=latest,enable=${{ github.ref == format('refs/heads/{0}', github.event.repository.default_branch) }}
-            type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }}
-          labels: |
-            maintainer=aarnphm
-            org.opencontainers.image.source="https://github.com/bentoml/OpenLLM"
-      - name: Build and push Docker image
-        id: build-and-push
-        uses: docker/build-push-action@2cdde995de11925a030ce8070c3d77a52ffcf1c0 # ratchet:docker/build-push-action@v5.3.0
-        env:
-          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PROD_ACCESS_KEY_ID }}
-          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PROD_SECRET_ACCESS_KEY }}
-        with:
-          context: .
-          file: Dockerfile
-          push: true
-          platforms: 'linux/amd64'
-          build-args: |
-            GIT_SHA=${{ env.GITHUB_SHA }}
-            DOCKER_LABEL=sha-${{ env.GITHUB_SHA_SHORT }}
-          tags: ${{ steps.meta.outputs.tags || steps.meta-pr.outputs.tags }}
-          labels: ${{ steps.meta.outputs.labels || steps.meta-pr.outputs.labels }}
-          # TODO: Once https://github.com/aws/containers-roadmap/issues/876 is supported with OCI 1.1
-          # then move back to saving cache within the public repo. For now we will save the cache manifest within our internal S3 buckets.
-          # NOTE: the region of the S3 on prod is us-east-1, where the EC2 machine is at us-west-2
-          cache-from: type=s3,region=us-east-1,bucket=openllm-cache,name=y5w8i4y6
-          # @aarnphm: max is fine here, since we didn't do any custom code yet, so it is ok to cache every layer for optimal build time
-          # We also ignore-error for now, just upload anything to the blob storage
-          cache-to: type=s3,region=us-east-1,bucket=openllm-cache,name=y5w8i4y6,mode=max,compression=zstd,ignore-error=true
-      - name: Sign the released image
-        if: ${{ github.event_name != 'pull_request' }}
-        env:
-          COSIGN_EXPERIMENTAL: 'true'
-        run: echo "${{ steps.meta.outputs.tags }}" | xargs -I {} cosign sign --yes {}@${{ steps.build-and-push.outputs.digest }}
-      - name: Run Trivy in GitHub SBOM mode and submit results to Dependency Graph
-        uses: aquasecurity/trivy-action@b2933f565dbc598b29947660e66259e3c7bc8561 # ratchet:aquasecurity/trivy-action@master
-        if: ${{ github.event_name != 'pull_request' }}
-        with:
-          image-ref: 'ghcr.io/bentoml/openllm:sha-${{ env.GITHUB_SHA_SHORT }}'
-          format: 'github'
-          output: 'dependency-results.sbom.json'
-          github-pat: ${{ secrets.GITHUB_TOKEN }}
-          scanners: 'vuln'
-      - name: Run Trivy vulnerability scanner
-        uses: aquasecurity/trivy-action@b2933f565dbc598b29947660e66259e3c7bc8561 # ratchet:aquasecurity/trivy-action@master
-        if: ${{ github.event_name != 'pull_request' }}
-        with:
-          image-ref: 'ghcr.io/bentoml/openllm:sha-${{ env.GITHUB_SHA_SHORT }}'
-          format: 'sarif'
-          output: 'trivy-results.sarif'
-          severity: 'CRITICAL'
-          scanners: 'vuln'
-      - name: Upload Trivy scan results to GitHub Security tab
-        uses: github/codeql-action/upload-sarif@b7cec7526559c32f1616476ff32d17ba4c59b2d6 # ratchet:github/codeql-action/upload-sarif@v3.25.5
-        if: ${{ github.event_name != 'pull_request' }}
-        with:
-          sarif_file: 'trivy-results.sarif'
-  # TODO: Add snapshot tests here.
-  stop-runner:
-    name: Stop self-hosted EC2 runner
-    needs:
-      - start-runner
-      - build-and-push-image
-      - get_commit_message
-    runs-on: ubuntu-latest
-    if: >-
-      (contains(needs.get_commit_message.outputs.message, '[ec2 build]') || github.event_name == 'workflow_dispatch' || (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, '00 - EC2 Build')) || (github.event_name == 'push' && (startsWith(github.ref, 'refs/tags/v') || startsWith(github.ref, 'refs/heads/main')))) && always()
-    steps:
-      - name: Configure AWS credentials
-        uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # ratchet:aws-actions/configure-aws-credentials@v4.0.2
-        with:
-          aws-access-key-id: ${{ secrets.AWS_PROD_ACCESS_KEY_ID }}
-          aws-secret-access-key: ${{ secrets.AWS_PROD_SECRET_ACCESS_KEY }}
-          aws-region: ${{ env.AWS_REGION }}
-      - name: Stop EC2 runner
-        uses: aarnphm/ec2-github-runner@af796d217e24ecbbc5a2c49e780cd90616e2b962 # ratchet:aarnphm/ec2-github-runner@main
-        with:
-          mode: stop
-          github-token: ${{ secrets.OPENLLM_PAT }}
-          ec2-region: ${{ env.AWS_REGION }}
-          label: ${{ needs.start-runner.outputs.label }}
-          ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -1,31 +1,12 @@
-name: Continuous Integration
+name: CI
 on:
  workflow_call:
  push:
    branches: [main]
-    paths-ignore:
-      - 'docs/**'
-      - 'bazel/**'
-      - 'typings/**'
-      - '*.md'
-      - 'changelog.d/**'
-      - 'assets/**'
  pull_request:
    branches: [main]
-    paths-ignore:
-      - 'docs/**'
-      - 'bazel/**'
-      - 'typings/**'
-      - '*.md'
-      - 'changelog.d/**'
-      - 'assets/**'
 env:
-  LINES: 120
-  COLUMNS: 120
-  OPENLLM_DO_NOT_TRACK: True
-  PYTHONUNBUFFERED: '1'
-  HATCH_VERBOSE: 2
-# https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#defaultsrun
+  PAPERSPACE_API_KEY: ${{secrets.PAPERSPACE_API_KEY}}
 defaults:
  run:
    shell: bash --noprofile --norc -exo pipefail {0}
@@ -34,111 +15,35 @@ concurrency:
  cancel-in-progress: true
 jobs:
  tests:
-    runs-on: ${{ matrix.os }}
+    runs-on: ubuntu-latest
    if: ${{ github.event_name == 'pull_request' || github.event_name == 'push'|| github.event_name == 'workflow_call' }}
-    strategy:
-      fail-fast: false
-      matrix:
-        os: [ubuntu-latest]
-        python-version: ['3.9', '3.12']
-    name: tests (${{ matrix.python-version }}.${{ matrix.os }})
+    name: General API tests
    steps:
-      - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # ratchet:actions/checkout@v4.1.6
+      - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # ratchet:actions/checkout@v4
+      - uses: actions/setup-python@65d7f2d534ac1bc67fcd62888c5f4f3d2cb2b236 # ratchet:actions/setup-python@v4
        with:
-          fetch-depth: 0
-          ref: ${{ github.event.pull_request.head.sha }}
-      - uses: bentoml/setup-bentoml-action@862aa8fa0e0c3793fcca4bfe7a62717a497417e4 # ratchet:bentoml/setup-bentoml-action@v1
+          python-version-file: '.python-version-default'
+      - name: setup tooling
+        run: |
+          python -m pip install uv
+          uv pip install --system httpx
+      - name: startup machine
+        run: python tools/machines.py --start ${{ secrets.PAPERSPACE_MACHINE_ID }} || true
+      - name: executing remote ssh commands using password
+        uses: appleboy/ssh-action@029f5b4aeeeb58fdfe1410a5d17f967dacf36262 # ratchet:appleboy/ssh-action@v1.0.3
        with:
-          bentoml-version: 'main'
-          python-version: ${{ matrix.python-version }}
-      # - name: Run tests
-      #   run: hatch run tests:python
-      # - name: Disambiguate coverage filename
-      #   run: mv .coverage ".coverage.${{ matrix.os }}.${{ matrix.python-version }}"
-      # - name: Upload coverage data
-      #   uses: actions/upload-artifact@a8a3f3ad30e3422c9c7b888a15615d19a852ae32 # ratchet:actions/upload-artifact@v3
-      #   with:
-      #     name: coverage-data
-      #     path: .coverage.*
-  # coverage:
-  #   name: report-coverage
-  #   runs-on: ubuntu-latest
-  #   if: false
-  #   needs: tests
-  #   steps:
-  #     - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # ratchet:actions/checkout@v4.1.1
-  #       with:
-  #         fetch-depth: 0
-  #         ref: ${{ github.event.pull_request.head.sha }}
-  #     - uses: bentoml/setup-bentoml-action@862aa8fa0e0c3793fcca4bfe7a62717a497417e4 # ratchet:bentoml/setup-bentoml-action@v1
-  #       with:
-  #         bentoml-version: 'main'
-  #         python-version-file: .python-version-default
-  #     - name: Download coverage data
-  #       uses: actions/download-artifact@9bc31d5ccc31df68ecc42ccf4149144866c47d8a # ratchet:actions/download-artifact@v3
-  #       with:
-  #         name: coverage-data
-  #     - name: Combine coverage data
-  #       run: hatch run coverage:combine
-  #     - name: Export coverage reports
-  #       run: |
-  #         hatch run coverage:report-xml openllm-python
-  #         hatch run coverage:report-uncovered-html openllm-python
-  #     - name: Upload uncovered HTML report
-  #       uses: actions/upload-artifact@a8a3f3ad30e3422c9c7b888a15615d19a852ae32 # ratchet:actions/upload-artifact@v3
-  #       with:
-  #         name: uncovered-html-report
-  #         path: htmlcov
-  #     - name: Generate coverage summary
-  #       run: hatch run coverage:generate-summary
-  #     - name: Write coverage summary report
-  #       if: github.event_name == 'pull_request'
-  #       run: hatch run coverage:write-summary-report
-  #     - name: Update coverage pull request comment
-  #       if: github.event_name == 'pull_request' && !github.event.pull_request.head.repo.fork
-  #       uses: marocchino/sticky-pull-request-comment@331f8f5b4215f0445d3c07b4967662a32a2d3e31 # ratchet:marocchino/sticky-pull-request-comment@v2
-  #       with:
-  #         path: coverage-report.md
-  # cli-benchmark:
-  #   name: Check for CLI responsiveness
-  #   runs-on: ubuntu-latest
-  #   env:
-  #     HYPERFINE_VERSION: '1.12.0'
-  #   steps:
-  #     - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # ratchet:actions/checkout@v4.1.1
-  #       with:
-  #         fetch-depth: 0
-  #     - name: Install hyperfine
-  #       run: |
-  #         wget https://github.com/sharkdp/hyperfine/releases/download/v${HYPERFINE_VERSION}/hyperfine_${HYPERFINE_VERSION}_amd64.deb
-  #         sudo dpkg -i hyperfine_${HYPERFINE_VERSION}_amd64.deb
-  #     - uses: bentoml/setup-bentoml-action@862aa8fa0e0c3793fcca4bfe7a62717a497417e4 # ratchet:bentoml/setup-bentoml-action@v1
-  #       with:
-  #         bentoml-version: 'main'
-  #         python-version-file: .python-version-default
-  #     - name: Install self
-  #       run: bash local.sh
-  #     - name: Speed
-  #       run: hyperfine -m 100 --warmup 10 openllm
-  # brew-dry-run:
-  #   name: Running dry-run tests for brew
-  #   runs-on: macos-latest
-  #   steps:
-  #     - name: Install tap and dry-run
-  #       run: |
-  #         brew tap bentoml/openllm https://github.com/bentoml/openllm
-  #         brew install openllm
-  #         openllm --help
-  #         openllm models --show-available
-  # evergreen: # https://github.com/marketplace/actions/alls-green#why
-  #   if: always()
-  #   needs:
-  #     - tests
-  #     # - cli-benchmark
-  #     # - brew-dry-run
-  #   runs-on: ubuntu-latest
-  #   steps:
-  #     - name: Decide whether the needed jobs succeeded or failed
-  #       uses: re-actors/alls-green@05ac9388f0aebcb5727afa17fcccfecd6f8ec5fe # ratchet:re-actors/alls-green@release/v1
-  #       with:
-  #         jobs: ${{ toJSON(needs) }}
+          host: ${{secrets.PAPERSPACE_HOST}}
+          username: ${{secrets.PAPERSPACE_USERNAME}}
+          key: ${{secrets.PAPERSPACE_SSH_KEY}}
+          port: ${{secrets.PAPERSPACE_PORT}}
+          script: bash ci.sh --pr ${{github.event.number}}
+  evergreen: # https://github.com/marketplace/actions/alls-green#why
+    if: always()
+    needs:
+      - tests
+    runs-on: ubuntu-latest
+    steps:
+      - name: Decide whether the needed jobs succeeded or failed
+        uses: re-actors/alls-green@05ac9388f0aebcb5727afa17fcccfecd6f8ec5fe # ratchet:re-actors/alls-green@release/v1
+        with:
+          jobs: ${{ toJSON(needs) }}
--- a/.github/workflows/cleanup.yml
+++ b/.github/workflows/cleanup.yml
@@ -1,16 +0,0 @@
-name: Cleanup PR cache
-on:
-  pull_request:
-    types:
-      - closed
-jobs:
-  cleanup:
-    runs-on: ubuntu-latest
-    if: "github.repository == 'bentoml/OpenLLM'" # Don't run on fork repository
-    steps:
-      - name: Check out code
-        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # ratchet:actions/checkout@v4.1.6
-      - name: Cleanup
-        run: "gh extension install actions/gh-actions-cache\n\nREPO=${{ github.repository }}\nBRANCH=\"refs/pull/${{ github.event.pull_request.number }}/merge\"\n\necho \"Fetching list of cache key\"\ncacheKeysForPR=$(gh actions-cache list -R $REPO -B $BRANCH -L 100 | cut -f 1 )\n\n## Setting this to not fail the workflow while deleting cache keys. \nset +e\necho \"Deleting caches...\"\nfor cacheKey in $cacheKeysForPR\ndo\n    gh actions-cache delete $cacheKey -R $REPO -B $BRANCH --confirm\ndone\necho \"Done\"\n"
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
--- a/.github/workflows/create-releases.yml
+++ b/.github/workflows/create-releases.yml
@@ -102,7 +102,6 @@ jobs:
    needs:
      - release
      - publish-python
-      - binary-distribution
    runs-on: ubuntu-latest
    permissions:
      contents: write
@@ -155,62 +154,12 @@ jobs:
          popd &>/dev/null
          git add package.json openllm-node/package.json && git commit -S -sm "infra: bump to dev version of ${DEV_VERSION} [generated] [skip ci]"
          git push origin HEAD:main
-  binary-distribution:
-    if: github.repository_owner == 'bentoml'
-    needs: build-pypi
-    name: Create binary/wheels distribution
-    uses: bentoml/OpenLLM/.github/workflows/binary-releases.yml@main # ratchet:exclude
  release-notes:
    if: github.repository_owner == 'bentoml'
    needs:
      - release
      - publish-python
-      - binary-distribution
    name: Create release notes and setup for next cycle
    uses: bentoml/OpenLLM/.github/workflows/release-notes.yml@main # ratchet:exclude
    with:
      tags: ${{ needs.release.outputs.version }}
-  bump-homebrew-tap:
-    needs:
-      - release-notes
-      - prepare-next-dev-cycle
-      - release
-    runs-on: ubuntu-latest
-    permissions:
-      contents: write
-      id-token: write
-    steps:
-      - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # ratchet:actions/checkout@v4.1.6
-        with:
-          fetch-depth: 0
-      - uses: bentoml/setup-bentoml-action@862aa8fa0e0c3793fcca4bfe7a62717a497417e4 # ratchet:bentoml/setup-bentoml-action@v1
-        with:
-          bentoml-version: 'main'
-          python-version-file: .python-version-default
-      - name: Install jq and curl
-        run: sudo apt-get install -y jq curl
-      - name: Import bot's GPG key for signing commits
-        id: import-gpg-key
-        uses: crazy-max/ghaction-import-gpg@01dd5d3ca463c7f10f7f4f7b4f177225ac661ee4 # ratchet:crazy-max/ghaction-import-gpg@v6
-        with:
-          gpg_private_key: ${{ secrets.GPG_PRIVATE_KEY }}
-          passphrase: ${{ secrets.GPG_PASSPHRASE }}
-          git_config_global: true
-          git_user_signingkey: true
-          git_commit_gpgsign: true
-          git_tag_gpgsign: true
-      - name: Update current formula
-        env:
-          GIT_AUTHOR_NAME: ${{ steps.import-gpg-key.outputs.name }}
-          GIT_AUTHOR_EMAIL: ${{ steps.import-gpg-key.outputs.email }}
-          GIT_COMMITTER_NAME: ${{ steps.import-gpg-key.outputs.name }}
-          GIT_COMMITTER_EMAIL: ${{ steps.import-gpg-key.outputs.email }}
-          GITHUB_TOKEN: ${{ secrets.HOMEBREW_GITHUB_TOKEN }}
-        run: |
-          git pull --autostash --no-edit --gpg-sign --ff origin main
-          SEMVER="${{ needs.release.outputs.version }}"
-          SEMVER="${SEMVER#v}"
-          pip install fs jinja2 ghapi plumbum
-          ./tools/update-brew-tap.py
-          git add Formula && git commit -S -sm "infra: bump to homebrew tap release to ${SEMVER} [generated] [skip ci]"
-          git push origin HEAD:main
--- a/.github/workflows/cron.yml
+++ b/.github/workflows/cron.yml
@@ -1,59 +0,0 @@
-name: Cron update
-on:
-  workflow_dispatch:
-  schedule:
-    #        ┌───────────── minute (0 - 59)
-    #        │  ┌───────────── hour (0 - 23)
-    #        │  │ ┌───────────── day of the month (1 - 31)
-    #        │  │ │ ┌───────────── month (1 - 12 or JAN-DEC)
-    #        │  │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
-    #        │  │ │ │ │
-    - cron: '42 2 * * SUN-WED'
-# https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#defaultsrun
-defaults:
-  run:
-    shell: bash --noprofile --norc -exo pipefail {0}
-concurrency:
-  group: cron-${{ github.head_ref || github.run_id }}
-  cancel-in-progress: true
-jobs:
-  update-actions:
-    runs-on: 'ubuntu-latest'
-    name: Ratchet update
-    if: "github.repository == 'bentoml/OpenLLM'" # Don't run on fork repository
-    env:
-      ACTIONS_TOKEN: ${{ secrets.OPENLLM_PAT }}
-    steps:
-      - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # ratchet:actions/checkout@v4.1.6
-        with:
-          fetch-depth: 0
-      - name: Install jq and curl
-        run: sudo apt-get install -y jq curl
-      - name: Import bot's GPG key for signing commits
-        id: import-gpg-key
-        uses: crazy-max/ghaction-import-gpg@01dd5d3ca463c7f10f7f4f7b4f177225ac661ee4 # ratchet:crazy-max/ghaction-import-gpg@v6
-        with:
-          gpg_private_key: ${{ secrets.GPG_PRIVATE_KEY }}
-          passphrase: ${{ secrets.GPG_PASSPHRASE }}
-          git_config_global: true
-          git_user_signingkey: true
-          git_commit_gpgsign: true
-      - name: Locking dependencies
-        run: bash ./tools/update-actions.sh
-      - name: Create a PR
-        uses: peter-evans/create-pull-request@6d6857d36972b65feb161a90e484f2984215f83e # ratchet:peter-evans/create-pull-request@v6.0.5
-        env:
-          GIT_AUTHOR_NAME: ${{ steps.import-gpg-key.outputs.name }}
-          GIT_AUTHOR_EMAIL: ${{ steps.import-gpg-key.outputs.email }}
-          GIT_COMMITTER_NAME: ${{ steps.import-gpg-key.outputs.name }}
-          GIT_COMMITTER_EMAIL: ${{ steps.import-gpg-key.outputs.email }}
-          BRANCH_NAME: cron/ratchet
-        with:
-          title: 'ci: update lock actions [generated]'
-          commit-message: 'cron: ratchet update'
-          branch-suffix: timestamp
-          signoff: true
-          delete-branch: true
-          reviewers: aarnphm
-          author: ${{ env.GIT_AUTHOR_NAME }} <${{ env.GIT_AUTHOR_EMAIL }}>
-          branch: ${{ env.BRANCH_NAME }}
--- a/.github/workflows/release-notes.yml
+++ b/.github/workflows/release-notes.yml
@@ -35,46 +35,11 @@ jobs:
      - name: Create release notes
        run: ./.github/actions/create_release_and_archive.sh ${{ inputs.tags }}
      - name: Download Python artifacts
-        uses: actions/download-artifact@9bc31d5ccc31df68ecc42ccf4149144866c47d8a # ratchet:actions/download-artifact@v3
+        uses: actions/download-artifact@65a9edc5881444af0b9093a5e628f2fe47ea3b2e # ratchet:actions/download-artifact@v4
        with:
-          name: python-artefacts
+          pattern: python-artefacts-*
+          merge-multiple: true
          path: dist
-      # - name: Download Linux x86_64 compiled artifacts
-      #   uses: actions/download-artifact@7a1cd3216ca9260cd8022db641d960b1db4d1be4 # ratchet:actions/download-artifact@v3
-      #   with:
-      #     name: linux-x86_64-mypyc-wheels
-      #     path: dist
-      # - name: Download MacOS x86_64 compiled artifacts
-      #   uses: actions/download-artifact@7a1cd3216ca9260cd8022db641d960b1db4d1be4 # ratchet:actions/download-artifact@v3
-      #   with:
-      #     name: macos-x86_64-mypyc-wheels
-      #     path: dist
-      # - name: Download MacOS arm64 compiled artifacts
-      #   uses: actions/download-artifact@7a1cd3216ca9260cd8022db641d960b1db4d1be4 # ratchet:actions/download-artifact@v3
-      #   with:
-      #     name: macos-arm64-mypyc-wheels
-      #     path: dist
-      # - name: Download MacOS universal2 compiled artifacts
-      #   uses: actions/download-artifact@7a1cd3216ca9260cd8022db641d960b1db4d1be4 # ratchet:actions/download-artifact@v3
-      #   with:
-      #     name: macos-universal2-mypyc-wheels
-      #     path: dist
-      - name: Download binaries
-        uses: actions/download-artifact@9bc31d5ccc31df68ecc42ccf4149144866c47d8a # ratchet:actions/download-artifact@v3
-        with:
-          name: standalone
-          path: archives
-      - name: Download standalone MacOS
-        uses: actions/download-artifact@9bc31d5ccc31df68ecc42ccf4149144866c47d8a # ratchet:actions/download-artifact@v3
-        with:
-          name: staged-macOS
-          path: archives
-      # TODO: Uncomment me when I decided to bring back Windows support
-      # - name: Download installers
-      #   uses: actions/download-artifact@7a1cd3216ca9260cd8022db641d960b1db4d1be4 # ratchet:actions/download-artifact@v3
-      #   with:
-      #     name: installers
-      #     path: installers
      - name: Create release
        uses: softprops/action-gh-release@69320dbe05506a9a39fc8ae11030b214ec2d1f87 # ratchet:softprops/action-gh-release@v2.0.5
        with:
@@ -83,7 +48,6 @@ jobs:
          body_path: release_notes.txt
          fail_on_unmatched_files: true
          tag_name: '${{ inputs.tags }}'
-          # TODO: add instasllers/* once windows support is back
+          # TODO: add installers/* archives/* once windows support is back
          files: |-
            dist/*
-            archives/*
--- a/Formula/README.md
+++ b/Formula/README.md
@@ -1,13 +0,0 @@
-This directory hosts the brew tap for OpenLLM
-
-```bash
-brew tap bentoml/openllm https://github.com/bentoml/openllm
-
-brew install openllm
-```
-
-> [!NOTE]
-> This will install the generated release binary from GitHub release. If any releases doesn't include the binary, then you will need to install from pip
-
-> [!IMPORTANT]
-> If you want to install from source, please refer to [Development Guide](https://github.com/bentoml/openllm/tree/main/DEVELOPMENT.md).
--- a/Formula/openllm.rb
+++ b/Formula/openllm.rb
@@ -1,47 +0,0 @@
-# Generated by tools/update-brew-tap.py. DO NOT EDIT!
-# Please refers to the original template file Formula/openllm.rb.j2
-# vim: set ft=ruby:
-class Openllm < Formula
-  desc "OpenLLM: Operating LLMs in production"
-  homepage "https://github.com/bentoml/OpenLLM"
-  version "0.4.44"
-  license "Apache-2.0"
-  head "https://github.com/bentoml/OpenLLM, branch: main"
-  url "https://github.com/bentoml/OpenLLM/archive/v0.4.44.tar.gz"
-  sha256 "5158eee3c4b771d3cabd8827439c148d3d710a9b63c7ad98a58a607f587d3acc"
-
-  on_linux do
-    url "https://github.com/bentoml/OpenLLM/releases/download/v0.4.44/openllm-0.4.44-x86_64-unknown-linux-musl.tar.gz"
-    sha256 "d56d14b032ffdb0de0ea2a7575f6039726987adccc74b634b0e943a73679232a"
-  end
-  on_macos do
-    on_arm do
-      url "https://github.com/bentoml/OpenLLM/releases/download/v0.4.44/openllm-0.4.44-aarch64-apple-darwin.tar.gz"
-      sha256 "ef57c27ab684179c6cccc02cd60506ea6a721908b56e7f483dda0b9931c74cdd"
-    end
-    on_intel do
-      url "https://github.com/bentoml/OpenLLM/releases/download/v0.4.44/openllm-0.4.44-x86_64-apple-darwin.tar.gz"
-      sha256 "65c5f176362f00cf29187c6c2062141abfc8165e63a89845038d2921715d270c"
-    end
-  end
-
-  def install
-    on_linux do
-      bin.install "openllm-0.4.44-x86_64-unknown-linux-musl" => "openllm"
-    end
-  on_macos do
-    on_arm do
-      bin.install "openllm-0.4.44-aarch64-apple-darwin" => "openllm"
-    end
-    on_intel do
-      bin.install "openllm-0.4.44-x86_64-apple-darwin" => "openllm"
-    end
-  end
-    ohai "To get started, run: 'openllm --help'"
-    ohai "To see supported models, run: 'openllm models'"
-  end
-
-  test do
-    shell_output "#{bin}/openllm --version"
-  end
-end
--- a/Formula/openllm.rb.j2
+++ b/Formula/openllm.rb.j2
@@ -1,47 +0,0 @@
-# Generated by {{ __cmd__ }}. DO NOT EDIT!
-# Please refers to the original template file {{ __template_file__ }}
-# vim: set ft=ruby:
-class Openllm < Formula
-  desc "{{ name }}: {{ description }}"
-  homepage "{{ html_url }}"
-  version "{{ __tag__|replace('v', '') }}"
-  license "{{ license["spdx_id"] }}"
-  head "{{ html_url }}, branch: {{ default_branch }}"
-  url "{{ determine_release_url(svn_url, __tag__, 'archive') }}"
-  sha256 "{{ shadict['archive'] }}"
-
-  on_linux do
-    url "{{ determine_release_url(svn_url, __tag__, 'linux_intel') }}"
-    sha256 "{{ shadict['linux_intel'] }}"
-  end
-  on_macos do
-    on_arm do
-      url "{{ determine_release_url(svn_url, __tag__, 'macos_arm') }}"
-      sha256 "{{ shadict['macos_arm'] }}"
-    end
-    on_intel do
-      url "{{ determine_release_url(svn_url, __tag__, 'macos_intel') }}"
-      sha256 "{{ shadict['macos_intel'] }}"
-    end
-  end
-
-  def install
-    on_linux do
-      bin.install "openllm-{{ __tag__|replace('v', '') }}-{{ __gz_extension__['linux_intel'] }}" => "openllm"
-    end
-  on_macos do
-    on_arm do
-      bin.install "openllm-{{ __tag__|replace('v', '') }}-{{ __gz_extension__['macos_arm'] }}" => "openllm"
-    end
-    on_intel do
-      bin.install "openllm-{{ __tag__|replace('v', '') }}-{{ __gz_extension__['macos_intel'] }}" => "openllm"
-    end
-  end
-    ohai "To get started, run: 'openllm --help'"
-    ohai "To see supported models, run: 'openllm models'"
-  end
-
-  test do
-    shell_output "#{bin}/openllm --version"
-  end
-end
--- a/openllm-client/src/openllm_client/_http.py
+++ b/openllm-client/src/openllm_client/_http.py
@@ -86,6 +86,12 @@ class HTTPClient(Client):
    else:
      llm_config = {**self._config, **attrs}

+    if stop is not None:
+      if isinstance(stop, str):
+        stop = [stop]
+      else:
+        stop = list(stop)
+
    return self._post(
      f'/{self._api_version}/generate',
      response_cls=Response,
@@ -110,6 +116,13 @@ class HTTPClient(Client):
      llm_config = {**self._config, **llm_config, **attrs}
    else:
      llm_config = {**self._config, **attrs}
+
+    if stop is not None:
+      if isinstance(stop, str):
+        stop = [stop]
+      else:
+        stop = list(stop)
+
    return self._post(
      f'/{self._api_version}/generate_stream',
      response_cls=Response,
@@ -181,12 +194,18 @@ class AsyncHTTPClient(AsyncClient, pydantic.BaseModel):
      timeout = self.timeout
    if verify is None:
      verify = self._verify  # XXX: need to support this again
-    _metadata = await self._metadata
    _config = await self._config
    if llm_config is not None:
      llm_config = {**_config, **llm_config, **attrs}
    else:
      llm_config = {**_config, **attrs}
+
+    if stop is not None:
+      if isinstance(stop, str):
+        stop = [stop]
+      else:
+        stop = list(stop)
+
    return await self._post(
      f'/{self._api_version}/generate',
      response_cls=Response,
@@ -209,13 +228,18 @@ class AsyncHTTPClient(AsyncClient, pydantic.BaseModel):
      timeout = self.timeout
    if verify is None:
      verify = self._verify  # XXX: need to support this again
-    _metadata = await self._metadata
    _config = await self._config
    if llm_config is not None:
      llm_config = {**_config, **llm_config, **attrs}
    else:
      llm_config = {**_config, **attrs}

+    if stop is not None:
+      if isinstance(stop, str):
+        stop = [stop]
+      else:
+        stop = list(stop)
+
    async for response_chunk in await self._post(
      f'/{self._api_version}/generate_stream',
      response_cls=Response,
--- a/openllm-core/src/openllm_core/_schemas.py
+++ b/openllm-core/src/openllm_core/_schemas.py
@@ -52,7 +52,7 @@ class GenerationInput(pydantic.BaseModel):
      raise RuntimeError('This class is not meant to be used directly. Use "from_config" instead')
    super().__init__(**data)

-  @pydantic.field_validator('stop')
+  @pydantic.field_validator('stop', mode='before')
  @classmethod
  def stop_validator(cls, data: str | list[str] | t.Iterable[str] | None) -> list[str] | None:
    if data is None:
--- a/openllm-python/tests/conftest.py
+++ b/openllm-python/tests/conftest.py
@@ -1,16 +0,0 @@
-from __future__ import annotations
-
-import pytest, typing as t
-
-
-@pytest.fixture(
-  scope='function',
-  name='model_id',
-  params={
-    'meta-llama/Meta-Llama-3-8B-Instruct',
-    'casperhansen/llama-3-70b-instruct-awq',
-    'TheBloke/Nous-Hermes-2-Mixtral-8x7B-DPO-AWQ',
-  },
-)
-def fixture_model_id(request) -> t.Generator[str, None, None]:
-  yield request.param
--- a/openllm-python/tests/regression_test.py
+++ b/openllm-python/tests/regression_test.py
@@ -1,26 +1,27 @@
 from __future__ import annotations

-import pytest, subprocess, sys, openllm, bentoml, asyncio
-from openai import AsyncOpenAI
+import pytest, subprocess, sys, asyncio, openllm, bentoml
+from openai import OpenAI
 from openai.types.chat import ChatCompletionSystemMessageParam, ChatCompletionUserMessageParam

 SERVER_PORT = 53822


@pytest.mark.asyncio
-async def test_openai_compatible(model_id: str):
+async def test_openai_compatible():
+  model_id = 'meta-llama/Meta-Llama-3-8B-Instruct'
  server = subprocess.Popen([sys.executable, '-m', 'openllm', 'start', model_id, '--port', str(SERVER_PORT)])
-  await asyncio.sleep(5)
-  with bentoml.SyncHTTPClient(f'http://127.0.0.1:{SERVER_PORT}', server_ready_timeout=90) as client:
+  await asyncio.sleep(10)
+  with bentoml.SyncHTTPClient(f'http://127.0.0.1:{SERVER_PORT}', server_ready_timeout=120) as client:
    assert client.is_ready(30)

  try:
-    client = AsyncOpenAI(api_key='na', base_url=f'http://127.0.0.1:{SERVER_PORT}/v1')
-    serve_model = (await client.models.list()).data[0].id
+    client = OpenAI(api_key='na', base_url=f'http://127.0.0.1:{SERVER_PORT}/v1')
+    serve_model = client.models.list().data[0].id
    assert serve_model == openllm.utils.normalise_model_name(model_id)
-    streamable = await client.chat.completions.create(
+    streamable = client.chat.completions.create(
      model=serve_model,
-      max_tokens=512,
+      max_tokens=128,
      stream=False,
      messages=[
        ChatCompletionSystemMessageParam(
@@ -37,18 +38,27 @@ async def test_openai_compatible(model_id: str):


@pytest.mark.asyncio
-async def test_generate_endpoint(model_id: str):
-  server = subprocess.Popen([sys.executable, '-m', 'openllm', 'start', model_id, '--port', str(SERVER_PORT)])
-  await asyncio.sleep(5)
+async def test_generate_endpoint():
+  server = subprocess.Popen([
+    sys.executable,
+    '-m',
+    'openllm',
+    'start',
+    'microsoft/Phi-3-mini-4k-instruct',
+    '--trust-remote-code',
+    '--port',
+    str(SERVER_PORT),
+  ])
+  await asyncio.sleep(10)

-  with bentoml.SyncHTTPClient(f'http://127.0.0.1:{SERVER_PORT}', server_ready_timeout=90) as client:
+  with bentoml.SyncHTTPClient(f'http://127.0.0.1:{SERVER_PORT}', server_ready_timeout=120) as client:
    assert client.is_ready(30)

  try:
-    client = openllm.AsyncHTTPClient(f'http://127.0.0.1:{SERVER_PORT}', api_version='v1')
-    assert await client.health()
+    client = openllm.HTTPClient(f'http://127.0.0.1:{SERVER_PORT}', api_version='v1')
+    assert client.health()

-    response = await client.generate(
+    response = client.generate(
      'Tell me more about Apple as a company', stop='technology', llm_config={'temperature': 0.5, 'top_p': 0.2}
    )
    assert response is not None
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -163,8 +163,8 @@ testpaths = ["openllm-python/tests"]

 [tool.coverage.paths]
 openllm = [
-    "openllm-python/src/openllm",
-    "*/openllm-python/src/openllm",
+    "openllm-python/src/_openllm_tiny",
+    "*/openllm-python/src/_openllm_tiny",
    "openllm-client/src/openllm_client",
    "*/openllm-client/src/openllm_client",
    "openllm-core/src/openllm_core",
@@ -174,13 +174,11 @@ openllm = [
 branch = true
 omit = [
    "__pypackages__/*",
-    "openllm-python/src/openllm/_version.py",
-    "openllm-python/src/openllm/__init__.py",
-    "openllm-python/src/openllm/__main__.py",
    "openllm-core/src/openllm_core/_typing_compat.py",
-    "openllm-client/src/openllm_client/pb/**",
+    "openllm-core/src/openllm_core/_version.py",
+    "openllm-client/src/openllm_client/_version.py",
 ]
-source_pkgs = ["openllm", "openllm_core", "openllm_client"]
+source_pkgs = ["_openllm_tiny", "openllm_core", "openllm_client"]
 [tool.coverage.report]
 exclude_lines = [
    "no cov",
@@ -204,11 +202,9 @@ exclude_lines = [
 ]
 omit = [
    "__pypackages__/*",
-    "openllm-python/src/openllm/_version.py",
-    "openllm-python/src/openllm/__init__.py",
-    "openllm-python/src/openllm/__main__.py",
    "openllm-core/src/openllm_core/_typing_compat.py",
-    "openllm-client/src/openllm_client/pb/**",
+    "openllm-core/src/openllm_core/_version.py",
+    "openllm-client/src/openllm_client/_version.py",
 ]
 precision = 2
 show_missing = true
--- a/tools/machines.py
+++ b/tools/machines.py
@@ -0,0 +1,82 @@
+from __future__ import annotations
+
+import httpx,os,dataclasses,logging,time,argparse,typing as t
+
+if (ENV := os.getenv("PAPERSPACE_API_KEY")) is None: raise RuntimeError('This script requires setting "PAPERSPACE_API_KEY"')
+HEADERS = httpx.Headers({'Authorization': f'Bearer {ENV}', 'Accept': 'application/json'})
+API_URL = 'https://api.paperspace.com/v1'
+
+# NOTE(review): the root level is ERROR, so the logger.info() progress messages below are not emitted.
+logging.basicConfig(level=logging.ERROR)
+logger = logging.getLogger(__name__)
+
+@dataclasses.dataclass
+class Machine:
+  """Small wrapper around the machines endpoints of API_URL for the machine identified by ``id``."""
+  id: str
+  # One HTTP client per instance, preconfigured with the auth headers, base URL and a 60s timeout.
+  inner: httpx.Client = dataclasses.field(default_factory=lambda: httpx.Client(headers=HEADERS, base_url=API_URL, timeout=60), repr=False)
+
+  def close(self): self.inner.close()
+  def __del__(self): self.close()
+  def __enter__(self): return self
+  def __exit__(self, *_: t.Any) -> None: self.close()
+  @property
+  def metadata(self) -> dict[str, t.Any]:
+    """Decoded JSON description of the machine; every access issues a GET request."""
+    return self.inner.get(f'/machines/{self.id}').json()
+  @property
+  def status(self) -> t.Literal['off', 'ready', 'stopping', 'starting']: return self.metadata['state']
+  def start(self) -> bool:
+    """Request a start; True only when the API answered 200 and the machine was not already 'ready'."""
+    response = self.inner.patch(f'/machines/{self.id}/start')
+    if response.status_code == 400 or self.status == 'ready':
+      logger.error('machine is already running')
+      return False
+    elif response.status_code != 200:
+      # Report the failure to the caller; returning True here would make main() poll a machine that never starts.
+      logger.error('Error while starting machine "%s": %s', self.id, response.text)
+      return False
+    return True
+  def stop(self) -> bool:
+    """Request a stop; True only when the API answered 200 and the machine was not already 'off'."""
+    response = self.inner.patch(f'/machines/{self.id}/stop')
+    if response.status_code == 400 or self.status == 'off':
+      logger.error('machine is already off')
+      return False
+    elif response.status_code != 200:
+      # Report the failure to the caller; returning True here would make main() poll a machine that never stops.
+      logger.error('Error while stopping machine "%s": %s', self.id, response.text)
+      return False
+    return True
+
+def main():
+  """Start or stop the machine named on the command line, poll until the transition finishes, return the exit code."""
+  parser = argparse.ArgumentParser()
+  group = parser.add_mutually_exclusive_group(required=True)
+  group.add_argument('--start', metavar='ID')
+  group.add_argument('--stop', metavar='ID')
+  args = parser.parse_args()
+
+  if args.start:
+    with Machine(id=args.start) as machine:
+      if machine.start():
+        while machine.status != 'ready':
+          logger.info('Waiting for machine "%s" to be ready...', machine.id)
+          time.sleep(5)
+      else:
+        logger.error('Failed to start machine "%s"', machine.id)
+        return 1
+  elif args.stop:
+    with Machine(id=args.stop) as machine:
+      if machine.stop():
+        # 'off' is the stopped state (see Machine.stop); waiting for 'ready' here would not end after a successful stop.
+        while machine.status != 'off':
+          logger.info('Waiting for machine "%s" to stop...', machine.id)
+          time.sleep(5)
+      else:
+        logger.error('Failed to stop machine "%s"', machine.id)
+        return 1
+  return 0
+
+if __name__ == "__main__": raise SystemExit(main())
--- a/tools/update-brew-tap.py
+++ b/tools/update-brew-tap.py
@@ -1,75 +0,0 @@
-#!/usr/bin/env python3
-from __future__ import annotations
-import os, typing as t, fs
-from pathlib import Path
-from ghapi.all import GhApi
-from jinja2 import Environment
-from jinja2.loaders import FileSystemLoader
-from plumbum.cmd import curl, cut, shasum
-
-if t.TYPE_CHECKING:
-  from plumbum.commands.base import Pipeline
-
-# get git root from this file
-ROOT = Path(__file__).parent.parent
-
-_OWNER = 'bentoml'
-_REPO = 'openllm'
-
-_gz_strategies: dict[t.Literal['macos_arm', 'macos_intel', 'linux_intel'], str] = {
-  'macos_arm': 'aarch64-apple-darwin',
-  'macos_intel': 'x86_64-apple-darwin',
-  'linux_intel': 'x86_64-unknown-linux-musl',
-}
-
-
-def determine_release_url(
-  svn_url: str, tag: str, target: t.Literal['macos_arm', 'macos_intel', 'linux_intel', 'archive']
-) -> str:
-  if target == 'archive':
-    return f'{svn_url}/archive/{tag}.tar.gz'
-  return f"{svn_url}/releases/download/{tag}/openllm-{tag.replace('v', '')}-{_gz_strategies[target]}.tar.gz"
-
-
-# curl -sSL <svn_url>/archive/refs/tags/<tag>.tar.gz | shasum -a256 | cut -d'' -f1
-def get_release_hash_command(svn_url: str, tag: str) -> Pipeline:
-  return curl['-sSL', svn_url] | shasum['-a256'] | cut['-d', ' ', '-f1']
-
-
-def main() -> int:
-  api = GhApi(owner=_OWNER, repo=_REPO, authenticate=False)
-  _info = api.repos.get()
-  release_tag = api.repos.get_latest_release().name
-
-  shadict: dict[str, t.Any] = {
-    k: get_release_hash_command(determine_release_url(_info.svn_url, release_tag, k), release_tag)().strip()
-    for k in _gz_strategies
-  }
-  shadict['archive'] = get_release_hash_command(
-    determine_release_url(_info.svn_url, release_tag, 'archive'), release_tag
-  )().strip()
-
-  ENVIRONMENT = Environment(
-    extensions=['jinja2.ext.do', 'jinja2.ext.loopcontrols', 'jinja2.ext.debug'],
-    trim_blocks=True,
-    lstrip_blocks=True,
-    loader=FileSystemLoader((ROOT / 'Formula').__fspath__(), followlinks=True),
-  )
-  template_file = 'openllm.rb.j2'
-  with (ROOT / 'Formula' / 'openllm.rb').open('w') as f:
-    f.write(
-      ENVIRONMENT.get_template(template_file, globals={'determine_release_url': determine_release_url}).render(
-        shadict=shadict,
-        __tag__=release_tag,
-        __cmd__=fs.path.join(os.path.basename(os.path.dirname(__file__)), os.path.basename(__file__)),
-        __template_file__=fs.path.join('Formula', template_file),
-        __gz_extension__=_gz_strategies,
-        **_info,
-      )
-    )
-    f.write('\n')
-  return 0
-
-
-if __name__ == '__main__':
-  raise SystemExit(main())