mirror of https://github.com/bentoml/OpenLLM.git (synced 2025-12-23 23:57:46 -05:00)

chore: ready for 0.6 releases

Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
.gitattributes (vendored), 2 changes
@@ -1,5 +1,3 @@
-**/_next/ linguist-generated=true
-
 * text=auto eol=lf
 # Needed for setuptools-scm-git-archive
 .git_archival.txt export-subst
.github/INFRA.md (vendored), deleted (64 lines)
@@ -1,64 +0,0 @@
## OpenLLM CI/CD

> [!NOTE]
> All actions within this repository should always be locked to a specific version. We are using [ratchet](https://github.com/sethvargo/ratchet)
> for doing this via [this script](https://github.com/bentoml/OpenLLM/blob/main/tools/lock-actions.sh)

OpenLLM uses a GitHub Action to run all CI/CD workflows. It also use [pre-commit.ci](https://pre-commit.ci/) to run CI for all pre-commit hooks.

The folder structure of this are as follow:

```prolog
.
├── CODEOWNERS # Code owners
├── CODE_OF_CONDUCT.md # Code of conduct
├── ISSUE_TEMPLATE # Contains issue templates
├── SECURITY.md # Security policy
├── actions # Contains helpers script for all actions
├── assets # Contains static assets to be used throughout this repository
├── dependabot.yml # Dependabot configuration
└── workflows
    ├── binary-releases.yml # Build and publish binary releases
    ├── build.yml # Self-hosted EC2 runners
    ├── ci.yml # CI workflow
    ├── cleanup.yml # Cache cleanup
    ├── build-pypi.yml # Build PyPI packages
    ├── create-releases.yml # Create GitHub releases
    ├── cron.yml # Cron jobs
    └── release-notes.yml # Generate release notes
```

> [!IMPORTANT]
> All of the following jobs will and should only be run within the BentoML organisation and this repository.

### Self-hosted EC2 runners

The workflow for self-hosted EC2 runners is located in [build.yml](/.github/workflows/build.yml).
This workflow is currently used for building OpenLLM base images that contains all compiled kernels
for serving. It will then be published to the following registry:

- GitHub Container Registry (`ghcr.io/bentoml/openllm`): This is where users can extend the base image
with their own custom kernels or use as base for building Bentos

- AWS Elastic Container Registry (`public.ecr.aws/y5w8i4y6/bentoml/openllm`): This is where all Bento
created with `openllm` will be using. This is purely for build optimisation on BentoCloud.

There are a few ways to trigger this workflow:

- Automatically triggered when a new commit is pushed to the `main` branch and tag release

- On pull request: This will be triggered manually when the label `00 - EC2 Build`

- On commit with the following `[ec2 build]`

### Wheel compilation

The workflow for wheel compilation is located in [build-pypi.yml](/.github/workflows/build-pypi.yml).

To speed up CI, opt in to the following label `02 - Wheel Build` on pull request or add `[wheel build]` to commit message.

### Binary releases

The workflow for binary releases is located in [binary-releases.yml](/.github/workflows/binary-releases.yml).

To speed up CI, opt in to the following label `03 - Standalone Build` on pull request or add `[binary build]` to commit message.
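The commit-message triggers documented in the removed INFRA.md above lend themselves to a one-line illustration; this is only a hedged sketch, and the commit message text is hypothetical:

```bash
# Hypothetical commit that opts in to the wheel-compilation workflow by
# including the marker described in the (now removed) INFRA.md.
git commit -sm "fix: tighten client timeout [wheel build]"
```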
.github/actions/release.sh (vendored), 8 changes
@@ -48,14 +48,8 @@ fi
 release_package() {
 local version="$1"
 echo "Releasing version ${version}..."
-jq --arg release_version "${version}" '.version = $release_version' <package.json >package.json.tmp && mv package.json.tmp package.json
-pushd openllm-node &>/dev/null
-jq --arg release_version "${version}" '.version = $release_version' <package.json >package.json.tmp && mv package.json.tmp package.json
-popd &>/dev/null
 towncrier build --yes --version "${version}"
-./tools/dependencies.py --release-version "${version}"
-cp CHANGELOG.md openllm-python/CHANGELOG.md
-git add CHANGELOG.md openllm-python/pyproject.toml openllm-python/CHANGELOG.md changelog.d package.json openllm-node/package.json
+git add CHANGELOG.md changelog.d
 git commit -S -sm "infra: prepare for release ${version} [generated] [skip ci]"
 git push origin main
 echo "Releasing tag ${version}..." && git tag -a "v${version}" -sm "Release ${version} [generated by GitHub Actions]"
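As an aside, the changelog step kept in `release_package` can be previewed locally before a release; a hedged sketch, assuming towncrier's `--draft` flag and using an example version number:

```bash
# Compile the pending changelog.d fragments and print the result without
# writing CHANGELOG.md (the script itself runs with --yes instead of --draft).
towncrier build --draft --version 0.6.0
```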
.github/assets/agent.gif (vendored), binary file removed (not shown; size before: 7.7 MiB)
.github/dependabot.yml (vendored), 27 changes
@@ -1,28 +1,13 @@
 version: 2
 updates:
 - package-ecosystem: github-actions
-directory: '/'
+directory: "/"
 schedule:
-interval: 'weekly'
+interval: "weekly"
-day: 'monday'
+day: "monday"
-time: '09:00'
+time: "09:00"
 - package-ecosystem: pip
-directory: '/openllm-python'
+directory: "/"
 schedule:
-interval: 'weekly'
+interval: "weekly"
 open-pull-requests-limit: 5
-- package-ecosystem: pip
-directory: '/openllm-core'
-schedule:
-interval: 'weekly'
-open-pull-requests-limit: 5
-- package-ecosystem: pip
-directory: '/openllm-client'
-schedule:
-interval: 'weekly'
-open-pull-requests-limit: 5
-- package-ecosystem: npm
-directory: '/'
-schedule:
-interval: 'weekly'
-open-pull-requests-limit: 0
.github/workflows/binary-releases.yml (vendored), deleted (274 lines)
@@ -1,274 +0,0 @@
# modified workflow from ofek/hatch, big kudos to them.
name: Standalone build
on:
workflow_call:
workflow_dispatch:
push:
branches: [main]
paths-ignore:
- '*.md'
- 'changelog.d/**'
- 'assets/**'
- 'openllm-node/**'
- 'Formula/**'
pull_request:
branches: [main]
paths-ignore:
- '*.md'
- 'changelog.d/**'
- 'assets/**'
- 'openllm-node/**'
- 'Formula/**'
defaults:
run:
shell: bash --noprofile --norc -exo pipefail {0}
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}
cancel-in-progress: true
env:
APP_NAME: openllm
PYOXIDIZER_VERSION: '0.24.0'
HATCH_VERBOSE: 2
jobs:
get_commit_message:
name: Get commit message
runs-on: ubuntu-latest
if: "github.repository == 'bentoml/OpenLLM'" # Don't run on fork repository
outputs:
message: ${{ steps.commit_message.outputs.message }}
steps:
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # ratchet:actions/checkout@v4.1.7
# Gets the correct commit message for pull request
with:
ref: ${{ github.event.pull_request.head.sha }}
- name: Get commit message
id: commit_message
run: |
set -xe
COMMIT_MSG=$(git log --no-merges -1 --oneline)
echo "message=$COMMIT_MSG" >> $GITHUB_OUTPUT
echo github.ref ${{ github.ref }}
python-artifacts:
name: Build wheel and source distribution
runs-on: ubuntu-latest
needs: get_commit_message
if: >-
contains(needs.get_commit_message.outputs.message, '[binary build]') || github.event_name == 'workflow_dispatch' || github.event_name == 'workflow_call' || (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, '03 - Standalone Build')) || (github.event_name == 'push' && (startsWith(github.ref, 'refs/tags/v') || startsWith(github.ref, 'refs/heads/main')))
steps:
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # ratchet:actions/checkout@v4.1.7
with:
fetch-depth: 0
- uses: bentoml/setup-bentoml-action@862aa8fa0e0c3793fcca4bfe7a62717a497417e4 # ratchet:bentoml/setup-bentoml-action@v1
with:
bentoml-version: 'main'
python-version-file: .python-version-default
- name: Pull latest change
if: ${{ github.event_name != 'pull_request' }}
run: git pull --autostash --no-edit --gpg-sign --ff origin main
- name: Install build frontend
run: python -m pip install --upgrade build
- name: Build
run: python -m build -sw openllm-python/
- name: Upload artifacts
uses: actions/upload-artifact@0b2256b8c012f0828dc542b3febcab082c67f72b # ratchet:actions/upload-artifact@v4
with:
name: binary-artefacts
path: openllm-python/dist/*
if-no-files-found: error
binaries:
name: ${{ matrix.job.target }} (${{ matrix.job.os }})
needs:
- python-artifacts
defaults:
run:
working-directory: openllm-python
runs-on: ${{ matrix.job.os }}
strategy:
fail-fast: false
matrix:
job:
# Linux
- target: aarch64-unknown-linux-gnu
os: ubuntu-22.04
cross: true
- target: x86_64-unknown-linux-gnu
os: ubuntu-22.04
cross: true
- target: x86_64-unknown-linux-musl
os: ubuntu-22.04
cross: true
- target: i686-unknown-linux-gnu
os: ubuntu-22.04
cross: true
- target: powerpc64le-unknown-linux-gnu
os: ubuntu-22.04
cross: true
# macOS
- target: aarch64-apple-darwin
os: macos-12
- target: x86_64-apple-darwin
os: macos-12
outputs:
version: ${{ steps.version.outputs.version }}
env:
CARGO: cargo
CARGO_BUILD_TARGET: ${{ matrix.job.target }}
PYAPP_REPO: pyapp
PYAPP_VERSION: '0.10.1'
PYAPP_PIP_EXTERNAL: 'true'
steps:
- name: Checkout code
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # ratchet:actions/checkout@v4.1.7
with:
fetch-depth: 0
- name: Pull latest change
if: ${{ github.event_name != 'pull_request' }}
run: git pull --autostash --no-edit --gpg-sign --ff origin main
- name: Fetch PyApp
run: >-
mkdir $PYAPP_REPO && curl -L https://github.com/ofek/pyapp/releases/download/v$PYAPP_VERSION/source.tar.gz | tar --strip-components=1 -xzf - -C $PYAPP_REPO
- name: Set up Python
uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # ratchet:actions/setup-python@v5.1.0
with:
python-version-file: .python-version-default
- name: Install Hatch
run: pip install -U hatch
- name: Install Rust toolchain
uses: dtolnay/rust-toolchain@d388a4836fcdbde0e50e395dc79a2670ccdef13f # ratchet:dtolnay/rust-toolchain@stable
with:
targets: ${{ matrix.job.target }}
- name: Set up cross compiling
if: matrix.job.cross
uses: taiki-e/install-action@64e4e2f995104968c78bd697b253d55bf557af66 # ratchet:taiki-e/install-action@v2.41.11
with:
tool: cross
- name: Configure cross compiling
if: matrix.job.cross
run: echo "CARGO=cross" >> $GITHUB_ENV
- name: Configure target
run: |-
config_file="$PYAPP_REPO/.cargo/config_${{ matrix.job.target }}.toml"
if [[ -f "$config_file" ]]; then
mv "$config_file" "$PYAPP_REPO/.cargo/config.toml"
fi
- name: Download Python artifacts
if: ${{ !startsWith(github.event.ref, 'refs/tags') }}
uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # ratchet:actions/download-artifact@v4
with:
name: binary-artefacts
path: openllm-python/dist
- name: Configure embedded project
if: ${{ !startsWith(github.event.ref, 'refs/tags') }}
run: |-
cd dist
wheel="$(echo *.whl)"
mv "$wheel" "../$PYAPP_REPO"
echo "PYAPP_PROJECT_PATH=$wheel" >> $GITHUB_ENV
- name: Build binary
run: hatch build --target app
# Windows installers don't accept non-integer versions so we ubiquitously
# perform the following transformation: X.Y.Z.devN -> X.Y.Z.N
- name: Set project version
id: version
run: |-
old_version="$(hatch version)"
version="${old_version/dev/}"

if [[ "$version" != "$old_version" ]]; then
cd dist/app
find . -type f -iname "openllm-*" | while read -r file; do
binary=$(echo "$file" | sed 's/dev//')
mv "$file" "$binary"
done
fi

echo "version=$version" >> $GITHUB_OUTPUT
echo "$version"
- name: Archive binary
run: |-
mkdir packaging
cd dist/app
find . -type f -iname "openllm-*" | while read -r file; do
if [[ "$file" =~ -pc-windows- ]]; then
7z a "../../packaging/${file:0:-4}.zip" "$file"
else
binary="${file/dev/}"
chmod +x "$file"
tar -czf "../../packaging/$binary.tar.gz" "$file"
fi
done
- name: Upload staged archive
if: runner.os != 'Linux'
uses: actions/upload-artifact@0b2256b8c012f0828dc542b3febcab082c67f72b # ratchet:actions/upload-artifact@v4
with:
name: staged-${{ runner.os }}-${{ matrix.job.target }}
path: openllm-python/packaging/*
if-no-files-found: error
- name: Upload archive
if: runner.os == 'Linux'
uses: actions/upload-artifact@0b2256b8c012f0828dc542b3febcab082c67f72b # ratchet:actions/upload-artifact@v4
with:
name: standalone-${{ runner.os }}-${{ matrix.job.target }}
path: openllm-python/packaging/*
if-no-files-found: error
windows-packaging:
name: Build Windows installers
needs: binaries
if: false # ${{ github.event_name != 'pull_request' }}
runs-on: windows-2022
env:
VERSION: ${{ needs.binaries.outputs.version }}
defaults:
run:
working-directory: openllm-python
steps:
- name: Checkout code
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # ratchet:actions/checkout@v4.1.7
- name: Set up Python
uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # ratchet:actions/setup-python@v5.1.0
with:
python-version-file: .python-version-default
- name: Pull latest change
run: git pull --autostash --no-edit --gpg-sign --ff origin main
- name: Install PyOxidizer ${{ env.PYOXIDIZER_VERSION }}
run: pip install pyoxidizer==${{ env.PYOXIDIZER_VERSION }}
- name: Download staged binaries
uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # ratchet:actions/download-artifact@v4
with:
name: staged-${{ runner.os }}
path: openllm-python/archives
- name: Extract staged binaries
run: |-
mkdir -p bin
for f in archives/*; do
7z e "$f" -obin
done
# bin/<APP_NAME>-<VERSION>-<TARGET>.exe -> targets/<TARGET>/<APP_NAME>.exe
- name: Prepare binaries
run: |-
mkdir -p targets
for f in bin/*; do
if [[ "$f" =~ openllm-${{ env.VERSION }}-(.+).exe$ ]]; then
target="${BASH_REMATCH[1]}"
mkdir -p "targets/$target"
mv "$f" "targets/$target/${{ env.APP_NAME }}.exe"
fi
done
- name: Build installers
run: >-
pyoxidizer build windows_installers --release --var version ${{ env.VERSION }}
- name: Prepare installers
run: |-
mkdir installers
mv build/*/release/*/*.{exe,msi} installers
- name: Upload binaries
uses: actions/upload-artifact@0b2256b8c012f0828dc542b3febcab082c67f72b # ratchet:actions/upload-artifact@v4
with:
name: standalone
path: openllm-python/archives/*
if-no-files-found: error
- name: Upload installers
uses: actions/upload-artifact@0b2256b8c012f0828dc542b3febcab082c67f72b # ratchet:actions/upload-artifact@v4
with:
name: installers
path: openllm-python/installers/*
.github/workflows/build-pypi.yml (vendored), deleted (143 lines)
@@ -1,143 +0,0 @@
name: Compile PyPI package
on:
workflow_dispatch:
inputs:
tags:
description: 'Tag for releases'
required: true
type: string
workflow_call:
inputs:
tags:
required: true
type: string
outputs:
sucess:
description: 'Whether the build is successful or not'
value: ${{ jobs.sucess-build.outputs.success }}
push:
branches: [main]
paths-ignore:
- '*.md'
- 'changelog.d/**'
- 'assets/**'
pull_request:
branches: [main]
paths-ignore:
- '*.md'
- 'changelog.d/**'
- 'assets/**'
env:
LINES: 120
COLUMNS: 120
HATCH_VERBOSE: 2
CIBW_BUILD_FRONTEND: build
CIBW_ENVIRONMENT_PASS_LINUX: >
HATCH_BUILD_HOOKS_ENABLE MYPYPATH

concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}
cancel-in-progress: true
jobs:
get_commit_message:
name: Get commit message
runs-on: ubuntu-latest
if: "github.repository == 'bentoml/OpenLLM'" # Don't run on fork repository
outputs:
message: ${{ steps.commit_message.outputs.message }}
steps:
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # ratchet:actions/checkout@v4.1.7
# Gets the correct commit message for pull request
with:
ref: ${{ github.event.pull_request.head.sha }}
- name: Get commit message
id: commit_message
run: |
set -xe
COMMIT_MSG=$(git log --no-merges -1 --oneline)
echo "message=$COMMIT_MSG" >> $GITHUB_OUTPUT
echo github.ref ${{ github.ref }}
pure-wheels-sdist:
name: Pure wheels and sdist distribution (${{ matrix.directory }})
runs-on: ubuntu-latest
needs: get_commit_message
if: >-
contains(needs.get_commit_message.outputs.message, '[wheel build]') || github.event_name == 'workflow_dispatch' || github.event_name == 'workflow_call' || (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, '02 - Wheel Build')) || (github.event_name == 'push' && (startsWith(github.ref, 'refs/tags/v') || startsWith(github.ref, 'refs/heads/main')))
strategy:
fail-fast: false
matrix:
directory: ['openllm-core', 'openllm-python', 'openllm-client']
steps:
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # ratchet:actions/checkout@v4.1.7
with:
fetch-depth: 0
ref: '${{ inputs.tags }}'
- uses: bentoml/setup-bentoml-action@862aa8fa0e0c3793fcca4bfe7a62717a497417e4 # ratchet:bentoml/setup-bentoml-action@v1
with:
bentoml-version: 'main'
python-version-file: .python-version-default
- name: Build
run: hatch build
working-directory: ${{ matrix.directory }}
- name: Upload artifacts
uses: actions/upload-artifact@0b2256b8c012f0828dc542b3febcab082c67f72b # ratchet:actions/upload-artifact@v4
with:
name: python-artefacts-${{ matrix.directory }}
path: ${{ matrix.directory }}/dist/*
if-no-files-found: error
check-download-artefacts:
name: dry-run for downloading artefacts
if: github.event_name == 'pull_request'
needs: [pure-wheels-sdist]
runs-on: ubuntu-latest
steps:
- name: Download Python artifacts
uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # ratchet:actions/download-artifact@v4
with:
pattern: python-artefacts-*
merge-multiple: true
path: dist
- name: dry ls
run: ls -rthlaR
push-nightly:
name: Push nightly wheels
if: ${{ !github.event.repository.fork && github.event_name == 'push' }}
runs-on: ubuntu-latest
permissions:
id-token: write
needs: [pure-wheels-sdist]
steps:
- name: Download Python artifacts
uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # ratchet:actions/download-artifact@v4
with:
pattern: python-artefacts-*
merge-multiple: true
path: dist
- name: Publish nightly wheels to test.pypi.org
uses: pypa/gh-action-pypi-publish@ec4db0b4ddc65acdf4bff5fa45ac92d78b56bdf0 # ratchet:pypa/gh-action-pypi-publish@release/v1
with:
repository-url: https://test.pypi.org/legacy/
sucess-push: # https://github.com/marketplace/actions/alls-green#why
if: github.event_name == 'push'
needs: [push-nightly, pure-wheels-sdist]
runs-on: ubuntu-latest
outputs:
success: ${{ steps.everygreen.outputs.success }}
steps:
- name: Decide whether the needed jobs succeeded or failed
uses: re-actors/alls-green@05ac9388f0aebcb5727afa17fcccfecd6f8ec5fe # ratchet:re-actors/alls-green@release/v1
id: evergreen
with:
jobs: ${{ toJSON(needs) }}
sucess-pr: # https://github.com/marketplace/actions/alls-green#why
if: github.event_name == 'pull_request'
needs: [pure-wheels-sdist]
runs-on: ubuntu-latest
outputs:
success: ${{ steps.everygreen.outputs.success }}
steps:
- name: Decide whether the needed jobs succeeded or failed
uses: re-actors/alls-green@05ac9388f0aebcb5727afa17fcccfecd6f8ec5fe # ratchet:re-actors/alls-green@release/v1
id: evergreen
with:
jobs: ${{ toJSON(needs) }}
.github/workflows/ci.yml (vendored), deleted (71 lines)
@@ -1,71 +0,0 @@
name: CI
on:
workflow_call:
push:
branches: [main]
pull_request:
branches: [main]
env:
PAPERSPACE_API_KEY: ${{secrets.PAPERSPACE_API_KEY}}
PAPERSPACE_NETWORK_ID: ${{secrets.PAPERSPACE_NETWORK_ID}}
defaults:
run:
shell: bash --noprofile --norc -eo pipefail {0}
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}
cancel-in-progress: true
jobs:
tests:
runs-on: ubuntu-latest
if: ${{ github.event_name == 'pull_request' || github.event_name == 'push'|| github.event_name == 'workflow_call' }}
name: General API tests
steps:
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # ratchet:actions/checkout@v4
- uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # ratchet:actions/setup-python@v5
with:
python-version-file: '.python-version-default'
- name: setup tooling
run: |
python -m pip install uv
uv pip install --system httpx orjson
- name: startup machine
id: paperspace-machine
run: |
PUBLIC_IP=$(python tools/machines.py --ci-template ${{secrets.PAPERSPACE_TEMPLATE_ID}} --output-ip)
MACHINE_ID=$(python tools/machines.py --ci-template ${{secrets.PAPERSPACE_TEMPLATE_ID}} --output-id)
echo "::add-mask::$PUBLIC_IP"
echo "::add-mask::$MACHINE_ID"
echo "publicIp=$PUBLIC_IP" >> $GITHUB_OUTPUT
echo "machineId=$MACHINE_ID" >> $GITHUB_OUTPUT
- name: running regression tests (PR)
uses: appleboy/ssh-action@029f5b4aeeeb58fdfe1410a5d17f967dacf36262 # ratchet:appleboy/ssh-action@v1.0.3
if: github.event_name == 'pull_request'
with:
host: ${{ steps.paperspace-machine.outputs.publicIp }}
username: paperspace
key: ${{ secrets.PAPERSPACE_SSH_KEY }}
port: ${{ secrets.PAPERSPACE_PORT }}
command_timeout: 1h
script: /nix/var/nix/profiles/default/bin/nix run github:aarnphm/dix#openllm-ci --refresh -- --pr ${{github.event.number}}
- name: running regression tests (main)
uses: appleboy/ssh-action@029f5b4aeeeb58fdfe1410a5d17f967dacf36262 # ratchet:appleboy/ssh-action@v1.0.3
if: ${{ !github.event.repository.fork && github.event_name == 'push' }}
with:
host: ${{ steps.paperspace-machine.outputs.publicIp }}
username: paperspace
key: ${{ secrets.PAPERSPACE_SSH_KEY }}
port: ${{ secrets.PAPERSPACE_PORT }}
command_timeout: 1h
script: /nix/var/nix/profiles/default/bin/nix run github:aarnphm/dix#openllm-ci --accept-flake-config --refresh -- --head
- name: shutdown machine
run: python tools/machines.py --delete ${{ steps.paperspace-machine.outputs.machineId }}
evergreen: # https://github.com/marketplace/actions/alls-green#why
if: always()
needs:
- tests
runs-on: ubuntu-latest
steps:
- name: Decide whether the needed jobs succeeded or failed
uses: re-actors/alls-green@05ac9388f0aebcb5727afa17fcccfecd6f8ec5fe # ratchet:re-actors/alls-green@release/v1
with:
jobs: ${{ toJSON(needs) }}
.github/workflows/create-releases.yml (vendored), deleted (114 lines)
@@ -1,114 +0,0 @@
name: release
on:
workflow_dispatch:
inputs:
release_type:
description: 'Release type'
required: true
type: choice
default: 'patch'
options:
- major
- minor
- patch
- alpha
defaults:
run:
shell: bash --noprofile --norc -exo pipefail {0}
env:
LINES: 120
COLUMNS: 120
OPENLLM_DO_NOT_TRACK: True
PYTHONUNBUFFERED: '1'
HATCH_VERBOSE: 2
jobs:
release:
if: github.repository_owner == 'bentoml'
runs-on: ubuntu-latest
permissions:
contents: write
id-token: write
outputs:
version: ${{ steps.version.outputs.version }}
steps:
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # ratchet:actions/checkout@v4.1.7
with:
fetch-depth: 0
token: ${{ secrets.GITHUB_TOKEN }}
- uses: bentoml/setup-bentoml-action@862aa8fa0e0c3793fcca4bfe7a62717a497417e4 # ratchet:bentoml/setup-bentoml-action@v1
with:
bentoml-version: 'main'
python-version-file: .python-version-default
- name: Import bot's GPG key for signing commits
id: import-gpg
uses: crazy-max/ghaction-import-gpg@01dd5d3ca463c7f10f7f4f7b4f177225ac661ee4 # ratchet:crazy-max/ghaction-import-gpg@v6
with:
gpg_private_key: ${{ secrets.GPG_PRIVATE_KEY }}
passphrase: ${{ secrets.GPG_PASSPHRASE }}
git_config_global: true
git_user_signingkey: true
git_commit_gpgsign: true
git_tag_gpgsign: true
- name: Create a release
env:
GIT_AUTHOR_NAME: ${{ steps.import-gpg.outputs.name }}
GIT_AUTHOR_EMAIL: ${{ steps.import-gpg.outputs.email }}
GIT_COMMITTER_NAME: ${{ steps.import-gpg.outputs.name }}
GIT_COMMITTER_EMAIL: ${{ steps.import-gpg.outputs.email }}
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |-
pip install ghapi inflection tomlkit orjson
./.github/actions/release.sh --release ${{ inputs.release_type }}
- name: Set project version
id: version
run: |-
version="$(git describe --tags "$(git rev-list --tags --max-count=1)")"
echo "Release version: $version"
echo "version=$version" >> $GITHUB_OUTPUT
build-pypi:
if: github.repository_owner == 'bentoml'
needs:
- release
name: Compile PyPI distribution for OpenLLM
uses: bentoml/OpenLLM/.github/workflows/build-pypi.yml@main # ratchet:exclude
with:
tags: ${{ needs.release.outputs.version }}
publish-python:
needs:
- build-pypi
runs-on: ubuntu-latest
permissions:
id-token: write
steps:
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # ratchet:actions/checkout@v4.1.7
with:
ref: '${{ needs.release.outputs.version }}'
token: ${{ secrets.GITHUB_TOKEN }}
# NOTE: Keep this section in sync with build-pypi.yml
- name: Download Python artifacts
uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # ratchet:actions/download-artifact@v4
with:
pattern: python-artefacts-*
merge-multiple: true
path: dist
- name: Smoke test compiled artefacts
run: ls -R dist
- name: Publish to PyPI
uses: pypa/gh-action-pypi-publish@ec4db0b4ddc65acdf4bff5fa45ac92d78b56bdf0 # ratchet:pypa/gh-action-pypi-publish@release/v1
with:
print-hash: true
binary-distribution:
if: github.repository_owner == 'bentoml'
needs: publish-python
name: Create binary/wheels distribution
uses: bentoml/OpenLLM/.github/workflows/binary-releases.yml@main # ratchet:exclude
release-notes:
if: github.repository_owner == 'bentoml'
needs:
- release
- publish-python
- binary-distribution
name: Create release notes and setup for next cycle
uses: bentoml/OpenLLM/.github/workflows/release-notes.yml@main # ratchet:exclude
with:
tags: ${{ needs.release.outputs.version }}
.github/workflows/release-notes.yml (vendored), deleted (66 lines)
@@ -1,66 +0,0 @@
name: release-notes
on:
workflow_dispatch:
inputs:
tags:
description: 'Tag for releases'
required: true
type: string
workflow_call:
inputs:
tags:
required: true
type: string
env:
HATCH_VERBOSE: 2
defaults:
run:
shell: bash --noprofile --norc -exo pipefail {0}
jobs:
gen-release-notes:
if: github.repository_owner == 'bentoml'
runs-on: ubuntu-latest
permissions:
contents: write
id-token: write
steps:
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # ratchet:actions/checkout@v4.1.7
with:
fetch-depth: 0
ref: '${{ inputs.tags }}'
- uses: bentoml/setup-bentoml-action@862aa8fa0e0c3793fcca4bfe7a62717a497417e4 # ratchet:bentoml/setup-bentoml-action@v1
with:
bentoml-version: 'main'
python-version-file: .python-version-default
- name: Create release notes
run: ./.github/actions/create_release_and_archive.sh ${{ inputs.tags }}
- name: Download Python artifacts
uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # ratchet:actions/download-artifact@v4
with:
pattern: python-artefacts-*
merge-multiple: true
path: dist
- name: Download binaries
uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # ratchet:actions/download-artifact@v4
with:
pattern: standalone-*
merge-multiple: true
path: archives
- name: Download standalone MacOS
uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # ratchet:actions/download-artifact@v4
with:
pattern: staged-*
merge-multiple: true
path: archives
- name: Create release
uses: softprops/action-gh-release@a74c6b72af54cfa997e81df42d94703d6313a2d0 # ratchet:softprops/action-gh-release@v2.0.6
with:
# Use GH feature to populate the changelog automatically
generate_release_notes: true
body_path: release_notes.txt
fail_on_unmatched_files: true
tag_name: '${{ inputs.tags }}'
# TODO: add installers/* once windows support is back
files: |-
dist/*
archives/*
.github/workflows/releases.yml (vendored), new file (43 lines)
@@ -0,0 +1,43 @@
name: release
on:
push:
tags:
- "*"
jobs:
build:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Setup Python
uses: actions/setup-python@v5
with:
python-version-file: .python-version-default
- name: Build
run: pipx run build
- name: Upload artifacts
uses: actions/upload-artifact@0b2256b8c012f0828dc542b3febcab082c67f72b # ratchet:actions/upload-artifact@v4
with:
name: python-artefacts-openllm
path: dist/*
if-no-files-found: error
release:
if: github.repository_owner == 'bentoml'
needs:
- build
runs-on: ubuntu-latest
name: Release
permissions:
id-token: write
steps:
- name: Download Python artifacts
uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # ratchet:actions/download-artifact@v4
with:
pattern: python-artefacts-*
merge-multiple: true
path: dist
- name: dry ls
run: ls -rthlaR
- name: Publish to PyPI
uses: pypa/gh-action-pypi-publish@ec4db0b4ddc65acdf4bff5fa45ac92d78b56bdf0 # ratchet:pypa/gh-action-pypi-publish@release/v1
with:
print-hash: true
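For illustration, a hedged sketch of how this new tag-driven workflow would be triggered from a maintainer's checkout; the tag name is only an example:

```bash
# Pushing any tag matches the "*" filter above and starts the build job,
# whose artifacts the release job then publishes to PyPI.
git tag -a v0.6.0 -m "Release 0.6.0"
git push origin v0.6.0
```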
.gitignore (vendored), 3 changes
@@ -161,3 +161,6 @@ cython_debug/
 *.whl
 # Environments
 venv/
+.envrc
+.python-version
+_version.py
.npmrc, deleted (7 lines)
@@ -1,7 +0,0 @@
strict-peer-dependencies=false
shell-emulator=true
engine-strict=true
save-exact=true
unsafe-perm=true
prefer-offline=true
prefer-workspace-packages=true
(pre-commit configuration; file name not captured)
@@ -31,12 +31,7 @@ repos:
 - id: editorconfig-checker
 verbose: true
 alias: ec
-types_or: [python, javascript]
+types_or: [python]
-exclude: |
-(?x)^(
-openllm-python/src/openllm_cli/entrypoint.py |
-openllm-python/src/_openllm_tiny/_entrypoint.py
-)$
 - repo: meta
 hooks:
 - id: check-hooks-apply
@@ -55,11 +50,3 @@ repos:
 - id: check-added-large-files
 - id: debug-statements
 - id: check-merge-conflict
-- repo: https://github.com/econchick/interrogate
-rev: 1.7.0
-hooks:
-- id: interrogate
-verbose: true
-types: [python]
-exclude: ^(docs|tools|openllm-python/tests)
-args: [--config=pyproject.toml]
(Prettier configuration; file name not captured), deleted (9 lines)
@@ -1,9 +0,0 @@
{
"printWidth": 119,
"quoteProps": "as-needed",
"trailingComma": "none",
"useTabs": false,
"tabWidth": 2,
"semi": false,
"singleQuote": true
}
.python-version, new symbolic link (1 line)
@@ -0,0 +1 @@
/Users/aarnphm/workspace/bentoml/projects/openllm/upstream/.python-version-default
.ruff.toml, 56 changes
@@ -1,4 +1,3 @@
-exclude = ["tools", "examples", "openllm-python/src/openllm_cli/playground/"]
 extend-include = ["*.ipynb"]
 preview = true
 line-length = 119
@@ -12,38 +11,35 @@ skip-magic-trailing-comma = true
 docstring-code-format = true

 [lint]
-typing-modules = [
-"openllm_core._typing_compat",
-"openllm_client._typing_compat",
-]
 ignore = [
 "RUF012",
 "ANN", # Mypy is better at this
+"E722",
 ]
 select = [
 "F",
 "G", # flake8-logging-format
 "PERF", # perflint
 "RUF", # Ruff-specific rules
 "W6",
 "E71",
 "E72",
 "E112",
 "E113",
 # "E124",
 "E203",
 "E272",
 # "E303",
 # "E304",
 # "E501",
 # "E502",
 "E702",
 "E703",
 "E731",
 "W191",
 "W291",
 "W293",
 "UP039", # unnecessary-class-parentheses
 ]

 [lint.pydocstyle]
(TOML formatter configuration; file name not captured), deleted (2 lines)
@@ -1,2 +0,0 @@
[formatting]
indent_string = " "
(CHANGELOG.md; file name inferred from content)
@@ -18,6 +18,10 @@ This changelog is managed by towncrier and is compiled at release time.

 <!-- towncrier release notes start -->

+## [0.6.0](https://github.com/bentoml/openllm/tree/v0.6.0)
+No significant changes.
+
+
 ## [0.5.7](https://github.com/bentoml/openllm/tree/v0.5.7)
 No significant changes.

(developer setup guide; file name not captured)
@@ -44,6 +44,12 @@ Before you can start developing, you'll need to set up your environment:
 git branch --set-upstream-to=upstream/main
 ```
+
+6. (Optional) Link `.python-version-default` to `.python-version`:
+
+```bash
+ln .python-version-default .python-version
+```

 ## Development Workflow

 There are a few ways to contribute to the repository structure for OpenLLM:
130
README.md
130
README.md
@@ -34,8 +34,6 @@ OpenLLM helps developers **run any open-source LLMs**, such as Llama 2 and Mistr
|
|||||||
|
|
||||||
<br/>
|
<br/>
|
||||||
|
|
||||||
<!-- hatch-fancy-pypi-readme interim start -->
|
|
||||||
|
|
||||||
## 💾 TL/DR
|
## 💾 TL/DR
|
||||||
|
|
||||||
For starter, we provide two ways to quickly try out OpenLLM:
|
For starter, we provide two ways to quickly try out OpenLLM:
|
||||||
@@ -98,7 +96,6 @@ OpenLLM currently supports the following models. By default, OpenLLM doesn't inc
|
|||||||
|
|
||||||
<summary>Baichuan</summary>
|
<summary>Baichuan</summary>
|
||||||
|
|
||||||
|
|
||||||
### Quickstart
|
### Quickstart
|
||||||
|
|
||||||
Run the following command to quickly spin up a Baichuan server:
|
Run the following command to quickly spin up a Baichuan server:
|
||||||
@@ -106,23 +103,21 @@ Run the following command to quickly spin up a Baichuan server:
|
|||||||
```bash
|
```bash
|
||||||
openllm start baichuan-inc/baichuan-7b --trust-remote-code
|
openllm start baichuan-inc/baichuan-7b --trust-remote-code
|
||||||
```
|
```
|
||||||
|
|
||||||
You can run the following code in a different terminal to interact with the server:
|
You can run the following code in a different terminal to interact with the server:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
import openllm_client
|
import openllm_client
|
||||||
client = openllm_client.HTTPClient('http://localhost:3000')
|
client = openllm_client.HTTPClient('http://localhost:3000')
|
||||||
client.generate('What are large language models?')
|
client.generate('What are large language models?')
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
||||||
> **Note:** Any Baichuan variants can be deployed with OpenLLM. Visit the [HuggingFace Model Hub](https://huggingface.co/models?sort=trending&search=baichuan) to see more Baichuan-compatible models.
|
> **Note:** Any Baichuan variants can be deployed with OpenLLM. Visit the [HuggingFace Model Hub](https://huggingface.co/models?sort=trending&search=baichuan) to see more Baichuan-compatible models.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
### Supported models
|
### Supported models
|
||||||
|
|
||||||
You can specify any of the following Baichuan models via `openllm start`:
|
You can specify any of the following Baichuan models via `openllm start`:
|
||||||
|
|
||||||
|
|
||||||
- [baichuan-inc/baichuan2-7b-base](https://huggingface.co/baichuan-inc/baichuan2-7b-base)
|
- [baichuan-inc/baichuan2-7b-base](https://huggingface.co/baichuan-inc/baichuan2-7b-base)
|
||||||
- [baichuan-inc/baichuan2-7b-chat](https://huggingface.co/baichuan-inc/baichuan2-7b-chat)
|
- [baichuan-inc/baichuan2-7b-chat](https://huggingface.co/baichuan-inc/baichuan2-7b-chat)
|
||||||
- [baichuan-inc/baichuan2-13b-base](https://huggingface.co/baichuan-inc/baichuan2-13b-base)
|
- [baichuan-inc/baichuan2-13b-base](https://huggingface.co/baichuan-inc/baichuan2-13b-base)
|
||||||
@@ -134,7 +129,6 @@ You can specify any of the following Baichuan models via `openllm start`:
|
|||||||
|
|
||||||
<summary>ChatGLM</summary>
|
<summary>ChatGLM</summary>
|
||||||
|
|
||||||
|
|
||||||
### Quickstart
|
### Quickstart
|
||||||
|
|
||||||
Run the following command to quickly spin up a ChatGLM server:
|
Run the following command to quickly spin up a ChatGLM server:
|
||||||
@@ -142,23 +136,21 @@ Run the following command to quickly spin up a ChatGLM server:
|
|||||||
```bash
|
```bash
|
||||||
openllm start thudm/chatglm-6b --trust-remote-code
|
openllm start thudm/chatglm-6b --trust-remote-code
|
||||||
```
|
```
|
||||||
|
|
||||||
You can run the following code in a different terminal to interact with the server:
|
You can run the following code in a different terminal to interact with the server:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
import openllm_client
|
import openllm_client
|
||||||
client = openllm_client.HTTPClient('http://localhost:3000')
|
client = openllm_client.HTTPClient('http://localhost:3000')
|
||||||
client.generate('What are large language models?')
|
client.generate('What are large language models?')
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
||||||
> **Note:** Any ChatGLM variants can be deployed with OpenLLM. Visit the [HuggingFace Model Hub](https://huggingface.co/models?sort=trending&search=chatglm) to see more ChatGLM-compatible models.
|
> **Note:** Any ChatGLM variants can be deployed with OpenLLM. Visit the [HuggingFace Model Hub](https://huggingface.co/models?sort=trending&search=chatglm) to see more ChatGLM-compatible models.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
### Supported models
|
### Supported models
|
||||||
|
|
||||||
You can specify any of the following ChatGLM models via `openllm start`:
|
You can specify any of the following ChatGLM models via `openllm start`:
|
||||||
|
|
||||||
|
|
||||||
- [thudm/chatglm-6b](https://huggingface.co/thudm/chatglm-6b)
|
- [thudm/chatglm-6b](https://huggingface.co/thudm/chatglm-6b)
|
||||||
- [thudm/chatglm-6b-int8](https://huggingface.co/thudm/chatglm-6b-int8)
|
- [thudm/chatglm-6b-int8](https://huggingface.co/thudm/chatglm-6b-int8)
|
||||||
- [thudm/chatglm-6b-int4](https://huggingface.co/thudm/chatglm-6b-int4)
|
- [thudm/chatglm-6b-int4](https://huggingface.co/thudm/chatglm-6b-int4)
|
||||||
@@ -172,7 +164,6 @@ You can specify any of the following ChatGLM models via `openllm start`:
|
|||||||
|
|
||||||
<summary>Cohere</summary>
|
<summary>Cohere</summary>
|
||||||
|
|
||||||
|
|
||||||
### Quickstart
|
### Quickstart
|
||||||
|
|
||||||
Run the following command to quickly spin up a Cohere server:
|
Run the following command to quickly spin up a Cohere server:
|
||||||
@@ -180,23 +171,21 @@ Run the following command to quickly spin up a Cohere server:
|
|||||||
```bash
|
```bash
|
||||||
openllm start CohereForAI/c4ai-command-r-plus --trust-remote-code
|
openllm start CohereForAI/c4ai-command-r-plus --trust-remote-code
|
||||||
```
|
```
|
||||||
|
|
||||||
You can run the following code in a different terminal to interact with the server:
|
You can run the following code in a different terminal to interact with the server:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
import openllm_client
|
import openllm_client
|
||||||
client = openllm_client.HTTPClient('http://localhost:3000')
|
client = openllm_client.HTTPClient('http://localhost:3000')
|
||||||
client.generate('What are large language models?')
|
client.generate('What are large language models?')
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
||||||
> **Note:** Any Cohere variants can be deployed with OpenLLM. Visit the [HuggingFace Model Hub](https://huggingface.co/models?sort=trending&search=commandr) to see more Cohere-compatible models.
|
> **Note:** Any Cohere variants can be deployed with OpenLLM. Visit the [HuggingFace Model Hub](https://huggingface.co/models?sort=trending&search=commandr) to see more Cohere-compatible models.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
### Supported models
|
### Supported models
|
||||||
|
|
||||||
You can specify any of the following Cohere models via `openllm start`:
|
You can specify any of the following Cohere models via `openllm start`:
|
||||||
|
|
||||||
|
|
||||||
- [CohereForAI/c4ai-command-r-plus](https://huggingface.co/CohereForAI/c4ai-command-r-plus)
|
- [CohereForAI/c4ai-command-r-plus](https://huggingface.co/CohereForAI/c4ai-command-r-plus)
|
||||||
- [CohereForAI/c4ai-command-r-v01](https://huggingface.co/CohereForAI/c4ai-command-r-v01)
|
- [CohereForAI/c4ai-command-r-v01](https://huggingface.co/CohereForAI/c4ai-command-r-v01)
|
||||||
|
|
||||||
@@ -206,7 +195,6 @@ You can specify any of the following Cohere models via `openllm start`:
|
|||||||
|
|
||||||
<summary>Dbrx</summary>
|
<summary>Dbrx</summary>
|
||||||
|
|
||||||
|
|
||||||
### Quickstart
|
### Quickstart
|
||||||
|
|
||||||
Run the following command to quickly spin up a Dbrx server:
|
Run the following command to quickly spin up a Dbrx server:
|
||||||
@@ -214,23 +202,21 @@ Run the following command to quickly spin up a Dbrx server:
|
|||||||
```bash
|
```bash
|
||||||
openllm start databricks/dbrx-instruct --trust-remote-code
|
openllm start databricks/dbrx-instruct --trust-remote-code
|
||||||
```
|
```
|
||||||
|
|
||||||
You can run the following code in a different terminal to interact with the server:
|
You can run the following code in a different terminal to interact with the server:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
import openllm_client
|
import openllm_client
|
||||||
client = openllm_client.HTTPClient('http://localhost:3000')
|
client = openllm_client.HTTPClient('http://localhost:3000')
|
||||||
client.generate('What are large language models?')
|
client.generate('What are large language models?')
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
||||||
> **Note:** Any Dbrx variants can be deployed with OpenLLM. Visit the [HuggingFace Model Hub](https://huggingface.co/models?sort=trending&search=dbrx) to see more Dbrx-compatible models.
|
> **Note:** Any Dbrx variants can be deployed with OpenLLM. Visit the [HuggingFace Model Hub](https://huggingface.co/models?sort=trending&search=dbrx) to see more Dbrx-compatible models.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
### Supported models
|
### Supported models
|
||||||
|
|
||||||
You can specify any of the following Dbrx models via `openllm start`:
|
You can specify any of the following Dbrx models via `openllm start`:
|
||||||
|
|
||||||
|
|
||||||
- [databricks/dbrx-instruct](https://huggingface.co/databricks/dbrx-instruct)
|
- [databricks/dbrx-instruct](https://huggingface.co/databricks/dbrx-instruct)
|
||||||
- [databricks/dbrx-base](https://huggingface.co/databricks/dbrx-base)
|
- [databricks/dbrx-base](https://huggingface.co/databricks/dbrx-base)
|
||||||
|
|
||||||
@@ -240,7 +226,6 @@ You can specify any of the following Dbrx models via `openllm start`:
|
|||||||
|
|
||||||
<summary>DollyV2</summary>
|
<summary>DollyV2</summary>
|
||||||
|
|
||||||
|
|
||||||
### Quickstart
|
### Quickstart
|
||||||
|
|
||||||
Run the following command to quickly spin up a DollyV2 server:
|
Run the following command to quickly spin up a DollyV2 server:
|
||||||
@@ -248,23 +233,21 @@ Run the following command to quickly spin up a DollyV2 server:
|
|||||||
```bash
|
```bash
|
||||||
openllm start databricks/dolly-v2-3b --trust-remote-code
|
openllm start databricks/dolly-v2-3b --trust-remote-code
|
||||||
```
|
```
|
||||||
|
|
||||||
You can run the following code in a different terminal to interact with the server:
|
You can run the following code in a different terminal to interact with the server:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
import openllm_client
|
import openllm_client
|
||||||
client = openllm_client.HTTPClient('http://localhost:3000')
|
client = openllm_client.HTTPClient('http://localhost:3000')
|
||||||
client.generate('What are large language models?')
|
client.generate('What are large language models?')
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
||||||
> **Note:** Any DollyV2 variants can be deployed with OpenLLM. Visit the [HuggingFace Model Hub](https://huggingface.co/models?sort=trending&search=dolly_v2) to see more DollyV2-compatible models.
|
> **Note:** Any DollyV2 variants can be deployed with OpenLLM. Visit the [HuggingFace Model Hub](https://huggingface.co/models?sort=trending&search=dolly_v2) to see more DollyV2-compatible models.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
### Supported models
|
### Supported models
|
||||||
|
|
||||||
You can specify any of the following DollyV2 models via `openllm start`:
|
You can specify any of the following DollyV2 models via `openllm start`:
|
||||||
|
|
||||||
|
|
||||||
- [databricks/dolly-v2-3b](https://huggingface.co/databricks/dolly-v2-3b)
|
- [databricks/dolly-v2-3b](https://huggingface.co/databricks/dolly-v2-3b)
|
||||||
- [databricks/dolly-v2-7b](https://huggingface.co/databricks/dolly-v2-7b)
|
- [databricks/dolly-v2-7b](https://huggingface.co/databricks/dolly-v2-7b)
|
||||||
- [databricks/dolly-v2-12b](https://huggingface.co/databricks/dolly-v2-12b)
|
- [databricks/dolly-v2-12b](https://huggingface.co/databricks/dolly-v2-12b)
|
||||||
@@ -275,7 +258,6 @@ You can specify any of the following DollyV2 models via `openllm start`:

<summary>Falcon</summary>

### Quickstart

Run the following command to quickly spin up a Falcon server:

@@ -283,23 +265,21 @@ Run the following command to quickly spin up a Falcon server:

```bash
openllm start tiiuae/falcon-7b --trust-remote-code
```

You can run the following code in a different terminal to interact with the server:

```python
import openllm_client
client = openllm_client.HTTPClient('http://localhost:3000')
client.generate('What are large language models?')
```

> **Note:** Any Falcon variants can be deployed with OpenLLM. Visit the [HuggingFace Model Hub](https://huggingface.co/models?sort=trending&search=falcon) to see more Falcon-compatible models.

### Supported models

You can specify any of the following Falcon models via `openllm start`:

- [tiiuae/falcon-7b](https://huggingface.co/tiiuae/falcon-7b)
- [tiiuae/falcon-40b](https://huggingface.co/tiiuae/falcon-40b)
- [tiiuae/falcon-7b-instruct](https://huggingface.co/tiiuae/falcon-7b-instruct)
@@ -311,7 +291,6 @@ You can specify any of the following Falcon models via `openllm start`:

<summary>Gemma</summary>

### Quickstart

Run the following command to quickly spin up a Gemma server:

@@ -319,23 +298,21 @@ Run the following command to quickly spin up a Gemma server:

```bash
openllm start google/gemma-7b --trust-remote-code
```

You can run the following code in a different terminal to interact with the server:

```python
import openllm_client
client = openllm_client.HTTPClient('http://localhost:3000')
client.generate('What are large language models?')
```

> **Note:** Any Gemma variants can be deployed with OpenLLM. Visit the [HuggingFace Model Hub](https://huggingface.co/models?sort=trending&search=gemma) to see more Gemma-compatible models.

### Supported models

You can specify any of the following Gemma models via `openllm start`:

- [google/gemma-7b](https://huggingface.co/google/gemma-7b)
- [google/gemma-7b-it](https://huggingface.co/google/gemma-7b-it)
- [google/gemma-2b](https://huggingface.co/google/gemma-2b)
@@ -347,7 +324,6 @@ You can specify any of the following Gemma models via `openllm start`:

<summary>GPTNeoX</summary>

### Quickstart

Run the following command to quickly spin up a GPTNeoX server:

@@ -355,23 +331,21 @@ Run the following command to quickly spin up a GPTNeoX server:

```bash
openllm start eleutherai/gpt-neox-20b --trust-remote-code
```

You can run the following code in a different terminal to interact with the server:

```python
import openllm_client
client = openllm_client.HTTPClient('http://localhost:3000')
client.generate('What are large language models?')
```

> **Note:** Any GPTNeoX variants can be deployed with OpenLLM. Visit the [HuggingFace Model Hub](https://huggingface.co/models?sort=trending&search=gpt_neox) to see more GPTNeoX-compatible models.

### Supported models

You can specify any of the following GPTNeoX models via `openllm start`:

- [eleutherai/gpt-neox-20b](https://huggingface.co/eleutherai/gpt-neox-20b)

</details>
@@ -380,7 +354,6 @@ You can specify any of the following GPTNeoX models via `openllm start`:

<summary>Llama</summary>

### Quickstart

Run the following command to quickly spin up a Llama server:

@@ -388,23 +361,21 @@ Run the following command to quickly spin up a Llama server:

```bash
openllm start NousResearch/llama-2-7b-hf --trust-remote-code
```

You can run the following code in a different terminal to interact with the server:

```python
import openllm_client
client = openllm_client.HTTPClient('http://localhost:3000')
client.generate('What are large language models?')
```

> **Note:** Any Llama variants can be deployed with OpenLLM. Visit the [HuggingFace Model Hub](https://huggingface.co/models?sort=trending&search=llama) to see more Llama-compatible models.

### Supported models

You can specify any of the following Llama models via `openllm start`:

- [meta-llama/Llama-2-70b-chat-hf](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf)
- [meta-llama/Llama-2-13b-chat-hf](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf)
- [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf)
@@ -424,7 +395,6 @@ You can specify any of the following Llama models via `openllm start`:

<summary>Mistral</summary>

### Quickstart

Run the following command to quickly spin up a Mistral server:

@@ -432,23 +402,21 @@ Run the following command to quickly spin up a Mistral server:

```bash
openllm start mistralai/Mistral-7B-Instruct-v0.1 --trust-remote-code
```

You can run the following code in a different terminal to interact with the server:

```python
import openllm_client
client = openllm_client.HTTPClient('http://localhost:3000')
client.generate('What are large language models?')
```

> **Note:** Any Mistral variants can be deployed with OpenLLM. Visit the [HuggingFace Model Hub](https://huggingface.co/models?sort=trending&search=mistral) to see more Mistral-compatible models.

### Supported models

You can specify any of the following Mistral models via `openllm start`:

- [HuggingFaceH4/zephyr-7b-alpha](https://huggingface.co/HuggingFaceH4/zephyr-7b-alpha)
- [HuggingFaceH4/zephyr-7b-beta](https://huggingface.co/HuggingFaceH4/zephyr-7b-beta)
- [mistralai/Mistral-7B-Instruct-v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2)
@@ -461,7 +429,6 @@ You can specify any of the following Mistral models via `openllm start`:

<summary>Mixtral</summary>

### Quickstart

Run the following command to quickly spin up a Mixtral server:

@@ -469,23 +436,21 @@ Run the following command to quickly spin up a Mixtral server:

```bash
openllm start mistralai/Mixtral-8x7B-Instruct-v0.1 --trust-remote-code
```

You can run the following code in a different terminal to interact with the server:

```python
import openllm_client
client = openllm_client.HTTPClient('http://localhost:3000')
client.generate('What are large language models?')
```

> **Note:** Any Mixtral variants can be deployed with OpenLLM. Visit the [HuggingFace Model Hub](https://huggingface.co/models?sort=trending&search=mixtral) to see more Mixtral-compatible models.

### Supported models

You can specify any of the following Mixtral models via `openllm start`:

- [mistralai/Mixtral-8x7B-Instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1)
- [mistralai/Mixtral-8x7B-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1)

@@ -495,7 +460,6 @@ You can specify any of the following Mixtral models via `openllm start`:

<summary>MPT</summary>

### Quickstart

Run the following command to quickly spin up an MPT server:

@@ -503,23 +467,21 @@ Run the following command to quickly spin up an MPT server:

```bash
openllm start mosaicml/mpt-7b-instruct --trust-remote-code
```

You can run the following code in a different terminal to interact with the server:

```python
import openllm_client
client = openllm_client.HTTPClient('http://localhost:3000')
client.generate('What are large language models?')
```

> **Note:** Any MPT variants can be deployed with OpenLLM. Visit the [HuggingFace Model Hub](https://huggingface.co/models?sort=trending&search=mpt) to see more MPT-compatible models.

### Supported models

You can specify any of the following MPT models via `openllm start`:

- [mosaicml/mpt-7b](https://huggingface.co/mosaicml/mpt-7b)
- [mosaicml/mpt-7b-instruct](https://huggingface.co/mosaicml/mpt-7b-instruct)
- [mosaicml/mpt-7b-chat](https://huggingface.co/mosaicml/mpt-7b-chat)
@@ -534,7 +496,6 @@ You can specify any of the following MPT models via `openllm start`:

<summary>OPT</summary>

### Quickstart

Run the following command to quickly spin up an OPT server:

@@ -542,23 +503,21 @@ Run the following command to quickly spin up an OPT server:

```bash
openllm start facebook/opt-1.3b
```

You can run the following code in a different terminal to interact with the server:

```python
import openllm_client
client = openllm_client.HTTPClient('http://localhost:3000')
client.generate('What are large language models?')
```

> **Note:** Any OPT variants can be deployed with OpenLLM. Visit the [HuggingFace Model Hub](https://huggingface.co/models?sort=trending&search=opt) to see more OPT-compatible models.

### Supported models

You can specify any of the following OPT models via `openllm start`:

- [facebook/opt-125m](https://huggingface.co/facebook/opt-125m)
- [facebook/opt-350m](https://huggingface.co/facebook/opt-350m)
- [facebook/opt-1.3b](https://huggingface.co/facebook/opt-1.3b)
@@ -572,7 +531,6 @@ You can specify any of the following OPT models via `openllm start`:

<summary>Phi</summary>

### Quickstart

Run the following command to quickly spin up a Phi server:

@@ -580,23 +538,21 @@ Run the following command to quickly spin up a Phi server:

```bash
openllm start microsoft/Phi-3-mini-4k-instruct --trust-remote-code
```

You can run the following code in a different terminal to interact with the server:

```python
import openllm_client
client = openllm_client.HTTPClient('http://localhost:3000')
client.generate('What are large language models?')
```

> **Note:** Any Phi variants can be deployed with OpenLLM. Visit the [HuggingFace Model Hub](https://huggingface.co/models?sort=trending&search=phi) to see more Phi-compatible models.

### Supported models

You can specify any of the following Phi models via `openllm start`:

- [microsoft/Phi-3-mini-4k-instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct)
- [microsoft/Phi-3-mini-128k-instruct](https://huggingface.co/microsoft/Phi-3-mini-128k-instruct)
- [microsoft/Phi-3-small-8k-instruct](https://huggingface.co/microsoft/Phi-3-small-8k-instruct)
@@ -610,7 +566,6 @@ You can specify any of the following Phi models via `openllm start`:

<summary>Qwen</summary>

### Quickstart

Run the following command to quickly spin up a Qwen server:

@@ -618,23 +573,21 @@ Run the following command to quickly spin up a Qwen server:

```bash
openllm start qwen/Qwen-7B-Chat --trust-remote-code
```

You can run the following code in a different terminal to interact with the server:

```python
import openllm_client
client = openllm_client.HTTPClient('http://localhost:3000')
client.generate('What are large language models?')
```

> **Note:** Any Qwen variants can be deployed with OpenLLM. Visit the [HuggingFace Model Hub](https://huggingface.co/models?sort=trending&search=qwen) to see more Qwen-compatible models.

### Supported models

You can specify any of the following Qwen models via `openllm start`:

- [qwen/Qwen-7B-Chat](https://huggingface.co/qwen/Qwen-7B-Chat)
- [qwen/Qwen-7B-Chat-Int8](https://huggingface.co/qwen/Qwen-7B-Chat-Int8)
- [qwen/Qwen-7B-Chat-Int4](https://huggingface.co/qwen/Qwen-7B-Chat-Int4)
@@ -648,7 +601,6 @@ You can specify any of the following Qwen models via `openllm start`:

<summary>StableLM</summary>

### Quickstart

Run the following command to quickly spin up a StableLM server:

@@ -656,23 +608,21 @@ Run the following command to quickly spin up a StableLM server:

```bash
openllm start stabilityai/stablelm-tuned-alpha-3b --trust-remote-code
```

You can run the following code in a different terminal to interact with the server:

```python
import openllm_client
client = openllm_client.HTTPClient('http://localhost:3000')
client.generate('What are large language models?')
```

> **Note:** Any StableLM variants can be deployed with OpenLLM. Visit the [HuggingFace Model Hub](https://huggingface.co/models?sort=trending&search=stablelm) to see more StableLM-compatible models.

### Supported models

You can specify any of the following StableLM models via `openllm start`:

- [stabilityai/stablelm-tuned-alpha-3b](https://huggingface.co/stabilityai/stablelm-tuned-alpha-3b)
- [stabilityai/stablelm-tuned-alpha-7b](https://huggingface.co/stabilityai/stablelm-tuned-alpha-7b)
- [stabilityai/stablelm-base-alpha-3b](https://huggingface.co/stabilityai/stablelm-base-alpha-3b)
@@ -684,7 +634,6 @@ You can specify any of the following StableLM models via `openllm start`:

<summary>StarCoder</summary>

### Quickstart

Run the following command to quickly spin up a StarCoder server:

@@ -692,23 +641,21 @@ Run the following command to quickly spin up a StarCoder server:

```bash
openllm start bigcode/starcoder --trust-remote-code
```

You can run the following code in a different terminal to interact with the server:

```python
import openllm_client
client = openllm_client.HTTPClient('http://localhost:3000')
client.generate('What are large language models?')
```

> **Note:** Any StarCoder variants can be deployed with OpenLLM. Visit the [HuggingFace Model Hub](https://huggingface.co/models?sort=trending&search=starcoder) to see more StarCoder-compatible models.

### Supported models

You can specify any of the following StarCoder models via `openllm start`:

- [bigcode/starcoder](https://huggingface.co/bigcode/starcoder)
- [bigcode/starcoderbase](https://huggingface.co/bigcode/starcoderbase)

@@ -718,7 +665,6 @@ You can specify any of the following StarCoder models via `openllm start`:

<summary>Yi</summary>

### Quickstart

Run the following command to quickly spin up a Yi server:

@@ -726,23 +672,21 @@ Run the following command to quickly spin up a Yi server:

```bash
openllm start 01-ai/Yi-6B --trust-remote-code
```

You can run the following code in a different terminal to interact with the server:

```python
import openllm_client
client = openllm_client.HTTPClient('http://localhost:3000')
client.generate('What are large language models?')
```

> **Note:** Any Yi variants can be deployed with OpenLLM. Visit the [HuggingFace Model Hub](https://huggingface.co/models?sort=trending&search=yi) to see more Yi-compatible models.

### Supported models

You can specify any of the following Yi models via `openllm start`:

- [01-ai/Yi-6B](https://huggingface.co/01-ai/Yi-6B)
- [01-ai/Yi-34B](https://huggingface.co/01-ai/Yi-34B)
- [01-ai/Yi-6B-200K](https://huggingface.co/01-ai/Yi-6B-200K)
@@ -830,8 +774,6 @@ async for it in llm.astream('What is the difference between a duck and a goose?
  print(it, flush=True, end='')
```

<!-- hatch-fancy-pypi-readme interim stop -->

<!-- hatch-fancy-pypi-readme meta start -->

## 🚀 Deploying models to production
84 hatch.toml

@@ -1,84 +0,0 @@
[envs.default]
installer = "uv"
type = "virtual"
dependencies = [
  "openllm-core @ {root:uri}/openllm-core",
  "openllm-client @ {root:uri}/openllm-client",
  "openllm[chatglm,fine-tune] @ {root:uri}/openllm-python",
  # NOTE: To run all hooks
  "pre-commit",
  # NOTE: towncrier for changelog
  "towncrier",
  "tomlkit",
  # NOTE: For fancy PyPI readme
  "hatch-fancy-pypi-readme",
  # NOTE: For working with shell pipe
  "plumbum",
  # The below sync with mypyc deps and pre-commit mypy
  "types-psutil",
  "types-tabulate",
  "types-PyYAML",
  "types-protobuf",
]
[envs.default.scripts]
changelog = "towncrier build --version main --draft"
inplace-changelog = "towncrier build --version main --keep"
setup = [
  "pre-commit install",
  "- ln -s .python-version-default .python-version",
]
quality = [
  "bash ./local.sh --tools",
  "- pre-commit run --all-files",
  "- pnpm format",
]
tool = ["quality", "bash ./clean.sh", 'python ./cz.py']
[envs.tests]
installer = "uv"
type = "virtual"
dependencies = [
  "openllm @ {root:uri}/openllm-python",
  "openllm-core @ {root:uri}/openllm-core",
  "openllm-client @ {root:uri}/openllm-client",
  # NOTE: Tests strategies with Hypothesis and pytest, and snapshot testing with syrupy
  "coverage[toml]>=6.5",
  "filelock>=3.7.1",
  "pytest",
  "pytest-cov",
  "pytest-mock",
  "pytest-randomly",
  "pytest-rerunfailures",
  "pytest-asyncio>=0.21.0",
  "pytest-xdist[psutil]",
  "trustme",
  "hypothesis",
  "syrupy",
]
skip-install = false
template = "tests"
[envs.tests.scripts]
_run_script = "pytest --cov --cov-report={env:COVERAGE_REPORT:term-missing} --cov-config=pyproject.toml -vv"
python = "_run_script -r aR -x {args:openllm-python/tests}"
[envs.tests.overrides]
env.GITHUB_ACTIONS.env-vars = "COVERAGE_REPORT="
[envs.coverage]
dependencies = ["coverage[toml]>=6.5", "lxml", "orjson"]
detached = true
[envs.coverage.scripts]
combine = "coverage combine {args}"
generate-summary = "python tools/generate-coverage.py"
report-uncovered-html = "coverage html --skip-covered --skip-empty {args}"
report-xml = "coverage xml {args}"
write-summary-report = "python tools/write-coverage-report.py"
[envs.ui]
detached = true
[envs.ui.scripts]
server = "openllm start {args:facebook/opt-250m} --working-dir {root:uri} --cors --debug"
[envs.ci]
detached = true
[envs.ci.scripts]
client-stubs = "bash openllm-client/generate-grpc-stubs"
compile = "bash ./tools/compile.sh {args}"
recompile = ["bash ./clean.sh", "compile"]
edi = "bash local.sh"
lock = ["bash tools/lock-actions.sh"]
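
These environments and scripts are invoked through the hatch CLI; assuming hatch is installed, usage would look roughly like this (a sketch, not commands documented in this commit):

```bash
# default-env helpers defined under [envs.default.scripts]
hatch run setup
hatch run quality
# test suite defined under [envs.tests.scripts]
hatch run tests:python
# coverage helpers from the detached coverage env
hatch run coverage:combine
```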
@@ -1,338 +0,0 @@
import os
import random
import sys
from collections import defaultdict
from typing import Annotated, Optional

import questionary
import typer

from openllm_next.accelerator_spec import (
    DeploymentTarget,
    can_run,
    get_local_machine_spec,
)
from openllm_next.analytic import DO_NOT_TRACK, OpenLLMTyper
from openllm_next.clean import app as clean_app
from openllm_next.cloud import deploy as cloud_deploy
from openllm_next.cloud import ensure_cloud_context, get_cloud_machine_spec
from openllm_next.common import CHECKED, INTERACTIVE, VERBOSE_LEVEL, output
from openllm_next.local import run as local_run
from openllm_next.local import serve as local_serve
from openllm_next.model import app as model_app
from openllm_next.model import ensure_bento, list_bento
from openllm_next.repo import app as repo_app

app = OpenLLMTyper(
    help="`openllm hello` to get started. "
    "OpenLLM is a CLI tool to manage and deploy open source LLMs and"
    " get an OpenAI API compatible chat server in seconds.",
)

app.add_typer(repo_app, name="repo")
app.add_typer(model_app, name="model")
app.add_typer(clean_app, name="clean")


def _select_bento_name(models, target):
    from tabulate import tabulate

    options = []
    model_infos = [
        [model.repo.name, model.name, can_run(model, target)] for model in models
    ]
    model_name_groups = defaultdict(lambda: 0)
    for repo, name, score in model_infos:
        model_name_groups[(repo, name)] += score
    table_data = [
        [name, repo, CHECKED if score > 0 else ""]
        for (repo, name), score in model_name_groups.items()
    ]
    if not table_data:
        output("No model found", style="red")
        raise typer.Exit(1)
    table = tabulate(
        table_data,
        headers=["model", "repo", "locally runnable"],
    ).split("\n")
    headers = f"{table[0]}\n {table[1]}"

    options.append(questionary.Separator(headers))
    for table_data, table_line in zip(table_data, table[2:]):
        options.append(questionary.Choice(table_line, value=table_data[:2]))
    selected = questionary.select("Select a model", options).ask()
    if selected is None:
        raise typer.Exit(1)
    return selected


def _select_bento_version(models, target, bento_name, repo):
    from tabulate import tabulate

    model_infos = [
        [model, can_run(model, target)]
        for model in models
        if model.name == bento_name and model.repo.name == repo
    ]

    table_data = [
        [model.tag, CHECKED if score > 0 else ""]
        for model, score in model_infos
        if model.name == bento_name and model.repo.name == repo
    ]
    if not table_data:
        output(f"No model found for {bento_name} in {repo}", style="red")
        raise typer.Exit(1)
    table = tabulate(
        table_data,
        headers=["version", "locally runnable"],
    ).split("\n")

    options = []
    options.append(questionary.Separator(f"{table[0]}\n {table[1]}"))
    for table_data, table_line in zip(model_infos, table[2:]):
        options.append(questionary.Choice(table_line, value=table_data))
    selected = questionary.select("Select a version", options).ask()
    if selected is None:
        raise typer.Exit(1)
    return selected


def _select_target(bento, targets):
    from tabulate import tabulate

    options = []
    targets.sort(key=lambda x: can_run(bento, x), reverse=True)
    if not targets:
        output(
            "No available instance type, check your bentocloud account",
            style="red",
        )
        raise typer.Exit(1)

    table = tabulate(
        [
            [
                target.name,
                target.accelerators_repr,
                f"${target.price}",
                CHECKED if can_run(bento, target) else "insufficient res.",
            ]
            for target in targets
        ],
        headers=["instance type", "accelerator", "price/hr", "deployable"],
    ).split("\n")
    options.append(questionary.Separator(f"{table[0]}\n {table[1]}"))

    for target, line in zip(targets, table[2:]):
        options.append(
            questionary.Choice(
                f"{line}",
                value=target,
            )
        )
    selected = questionary.select("Select an instance type", options).ask()
    if selected is None:
        raise typer.Exit(1)
    return selected


def _select_action(bento, score):
    if score > 0:
        options = [
            questionary.Separator("Available actions"),
            questionary.Choice(
                "0. Run the model in terminal",
                value="run",
                shortcut_key="0",
            ),
            questionary.Separator(f" $ openllm run {bento}"),
            questionary.Separator(" "),
            questionary.Choice(
                "1. Serve the model locally and get a chat server",
                value="serve",
                shortcut_key="1",
            ),
            questionary.Separator(f" $ openllm serve {bento}"),
            questionary.Separator(" "),
            questionary.Choice(
                "2. Deploy the model to bentocloud and get a scalable chat server",
                value="deploy",
                shortcut_key="2",
            ),
            questionary.Separator(f" $ openllm deploy {bento}"),
        ]
    else:
        options = [
            questionary.Separator("Available actions"),
            questionary.Choice(
                "0. Run the model in terminal",
                value="run",
                disabled="insufficient res.",
                shortcut_key="0",
            ),
            questionary.Separator(f" $ openllm run {bento}"),
            questionary.Separator(" "),
            questionary.Choice(
                "1. Serve the model locally and get a chat server",
                value="serve",
                disabled="insufficient res.",
                shortcut_key="1",
            ),
            questionary.Separator(f" $ openllm serve {bento}"),
            questionary.Separator(" "),
            questionary.Choice(
                "2. Deploy the model to bentocloud and get a scalable chat server",
                value="deploy",
                shortcut_key="2",
            ),
            questionary.Separator(f" $ openllm deploy {bento}"),
        ]
    action = questionary.select("Select an action", options).ask()
    if action is None:
        raise typer.Exit(1)
    if action == "run":
        try:
            local_run(bento)
        finally:
            output("\nUse this command to run the action again:", style="green")
            output(f" $ openllm run {bento}", style="orange")
    elif action == "serve":
        try:
            local_serve(bento)
        finally:
            output("\nUse this command to run the action again:", style="green")
            output(f" $ openllm serve {bento}", style="orange")
    elif action == "deploy":
        ensure_cloud_context()
        targets = get_cloud_machine_spec()
        target = _select_target(bento, targets)
        try:
            cloud_deploy(bento, target)
        finally:
            output("\nUse this command to run the action again:", style="green")
            output(
                f" $ openllm deploy {bento} --instance-type {target.name}",
                style="orange",
            )


@app.command(help="get started interactively")
def hello():
    INTERACTIVE.set(True)
    VERBOSE_LEVEL.set(20)

    target = get_local_machine_spec()
    output(f" Detected Platform: {target.platform}", style="green")
    if target.accelerators:
        output(" Detected Accelerators: ", style="green")
        for a in target.accelerators:
            output(f"  - {a.model} {a.memory_size}GB", style="green")
    else:
        output(" Detected Accelerators: None", style="yellow")

    models = list_bento()
    if not models:
        output(
            "No model found, you probably need to update the model repo:",
            style="red",
        )
        output(
            "  $ openllm repo update",
            style="orange",
        )
        raise typer.Exit(1)

    bento_name, repo = _select_bento_name(models, target)
    bento, score = _select_bento_version(models, target, bento_name, repo)
    _select_action(bento, score)


@app.command(help="start an OpenAI API compatible chat server and chat in browser")
def serve(
    model: Annotated[str, typer.Argument()] = "",
    repo: Optional[str] = None,
    port: int = 3000,
    verbose: bool = False,
):
    if verbose:
        VERBOSE_LEVEL.set(20)
    target = get_local_machine_spec()
    bento = ensure_bento(model, target=target, repo_name=repo)
    local_serve(bento, port=port)


@app.command(help="run the model and chat in terminal")
def run(
    model: Annotated[str, typer.Argument()] = "",
    repo: Optional[str] = None,
    port: Optional[int] = None,
    timeout: int = 600,
    verbose: bool = False,
):
    if verbose:
        VERBOSE_LEVEL.set(20)
    target = get_local_machine_spec()
    bento = ensure_bento(model, target=target, repo_name=repo)
    if port is None:
        port = random.randint(30000, 40000)
    local_run(bento, port=port, timeout=timeout)


@app.command(
    help="deploy a production-ready OpenAI API compatible chat server to bentocloud ($100 free credit)",
)
def deploy(
    model: Annotated[str, typer.Argument()] = "",
    instance_type: Optional[str] = None,
    repo: Optional[str] = None,
    verbose: bool = False,
):
    if verbose:
        VERBOSE_LEVEL.set(20)
    bento = ensure_bento(model, repo_name=repo)
    if instance_type is not None:
        cloud_deploy(bento, DeploymentTarget(name=instance_type))
        return
    targets = get_cloud_machine_spec()
    targets = filter(lambda x: can_run(bento, x) > 0, targets)
    targets = sorted(targets, key=lambda x: can_run(bento, x), reverse=True)
    if not targets:
        output(
            "No available instance type, check your bentocloud account",
            style="red",
        )
        raise typer.Exit(1)
    target = targets[0]
    output(
        f"Recommended instance type: {target.name}",
        style="green",
    )
    cloud_deploy(bento, target)


@app.callback(invoke_without_command=True)
def typer_callback(
    verbose: int = 0,
    do_not_track: bool = typer.Option(
        False,
        "--do-not-track",
        help="Whether to disable usage tracking",
        envvar=DO_NOT_TRACK,
    ),
):
    if verbose:
        VERBOSE_LEVEL.set(verbose)
    if do_not_track:
        os.environ[DO_NOT_TRACK] = str(True)


def main():
    if sys.version_info < (3, 9):
        output("Python 3.9 or higher is required", style="red")
        sys.exit(1)
    app()


if __name__ == "__main__":
    main()
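
For reference, the Typer commands above translate to CLI invocations along these lines (the model ID is a placeholder; the flags are the options defined in the code):

```bash
openllm hello                                  # interactive walkthrough
openllm serve <model> --port 3000              # local OpenAI API compatible server
openllm run <model> --timeout 600              # chat with the model in the terminal
openllm deploy <model> --instance-type <type>  # deploy to BentoCloud
```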
@@ -1,166 +0,0 @@
from __future__ import annotations

import functools
import math
import typing
from types import SimpleNamespace

import psutil

from openllm_next.common import BentoInfo, DeploymentTarget, output


class Accelerator(SimpleNamespace):
    model: str
    memory_size: float

    def __gt__(self, other):
        return self.memory_size > other.memory_size

    def __eq__(self, other):
        return self.memory_size == other.memory_size

    def __repr__(self):
        return f"{self.model}({self.memory_size}GB)"


class Resource(SimpleNamespace):
    cpu: int = 0
    memory: float
    gpu: int = 0
    gpu_type: str = ""

    def __hash__(self):
        return hash((self.cpu, self.memory, self.gpu, self.gpu_type))

    def __bool__(self):
        return any(value is not None for value in self.__dict__.values())


ACCELERATOR_SPEC_DICT: dict[str, dict] = {
    "nvidia-gtx-1650": {"model": "GTX 1650", "memory_size": 4.0},
    "nvidia-gtx-1060": {"model": "GTX 1060", "memory_size": 6.0},
    "nvidia-gtx-1080-ti": {"model": "GTX 1080 Ti", "memory_size": 11.0},
    "nvidia-rtx-3060": {"model": "RTX 3060", "memory_size": 12.0},
    "nvidia-rtx-3060-ti": {"model": "RTX 3060 Ti", "memory_size": 8.0},
    "nvidia-rtx-3070-ti": {"model": "RTX 3070 Ti", "memory_size": 8.0},
    "nvidia-rtx-3080": {"model": "RTX 3080", "memory_size": 10.0},
    "nvidia-rtx-3080-ti": {"model": "RTX 3080 Ti", "memory_size": 12.0},
    "nvidia-rtx-3090": {"model": "RTX 3090", "memory_size": 24.0},
    "nvidia-rtx-4070-ti": {"model": "RTX 4070 Ti", "memory_size": 12.0},
    "nvidia-tesla-p4": {"model": "P4", "memory_size": 8.0},
    "nvidia-tesla-p100": {"model": "P100", "memory_size": 16.0},
    "nvidia-tesla-k80": {"model": "K80", "memory_size": 12.0},
    "nvidia-tesla-t4": {"model": "T4", "memory_size": 16.0},
    "nvidia-tesla-v100": {"model": "V100", "memory_size": 16.0},
    "nvidia-l4": {"model": "L4", "memory_size": 24.0},
    "nvidia-tesla-l4": {"model": "L4", "memory_size": 24.0},
    "nvidia-tesla-a10g": {"model": "A10G", "memory_size": 24.0},
    "nvidia-a100-80g": {"model": "A100", "memory_size": 80.0},
    "nvidia-a100-80gb": {"model": "A100", "memory_size": 80.0},
    "nvidia-tesla-a100": {"model": "A100", "memory_size": 40.0},
}

ACCELERATOR_SPECS: dict[str, Accelerator] = {
    key: Accelerator(**value) for key, value in ACCELERATOR_SPEC_DICT.items()
}


@functools.lru_cache
def get_local_machine_spec():
    if psutil.MACOS:
        return DeploymentTarget(accelerators=[], source="local", platform="macos")

    if psutil.WINDOWS:
        platform = "windows"
    elif psutil.LINUX:
        platform = "linux"
    else:
        raise NotImplementedError("Unsupported platform")

    from pynvml import (
        nvmlDeviceGetCount,
        nvmlDeviceGetCudaComputeCapability,
        nvmlDeviceGetHandleByIndex,
        nvmlDeviceGetMemoryInfo,
        nvmlDeviceGetName,
        nvmlInit,
        nvmlShutdown,
    )

    try:
        nvmlInit()
        device_count = nvmlDeviceGetCount()
        accelerators: list[Accelerator] = []
        for i in range(device_count):
            handle = nvmlDeviceGetHandleByIndex(i)
            name = nvmlDeviceGetName(handle)
            memory_info = nvmlDeviceGetMemoryInfo(handle)
            accelerators.append(
                Accelerator(
                    model=name, memory_size=math.ceil(int(memory_info.total) / 1024**3)
                )
            )
            compute_capability = nvmlDeviceGetCudaComputeCapability(handle)
            if compute_capability < (7, 5):
                output(
                    f"GPU {name} with compute capability {compute_capability} "
                    "may not be supported, 7.5 or higher is recommended. check "
                    "https://developer.nvidia.com/cuda-gpus for more information",
                    style="yellow",
                )
        nvmlShutdown()
        return DeploymentTarget(
            accelerators=accelerators,
            source="local",
            platform=platform,
        )
    except Exception as e:
        output(
            "Failed to get local GPU info. Ensure nvidia driver is installed to enable local GPU deployment",
            style="yellow",
        )
        output(f"Error: {e}", style="red", level=20)
        return DeploymentTarget(accelerators=[], source="local", platform=platform)


@functools.lru_cache()
def can_run(
    bento: typing.Union[Resource, BentoInfo],
    target: typing.Optional[DeploymentTarget] = None,
) -> float:
    """
    Calculate if the bento can be deployed on the target.
    """
    if target is None:
        target = get_local_machine_spec()

    resource_spec = Resource(**(bento.bento_yaml["services"][0]["config"].get("resources", {})))
    labels = bento.bento_yaml.get("labels", {})
    platforms = labels.get("platforms", "linux").split(",")

    if target.platform not in platforms:
        return 0.0

    # return 0.5 if no resource is specified
    if not resource_spec:
        return 0.5

    if resource_spec.gpu > 0:
        required_gpu = ACCELERATOR_SPECS[resource_spec.gpu_type]
        filtered_accelerators = [
            ac
            for ac in target.accelerators
            if ac.memory_size >= required_gpu.memory_size
        ]
        if resource_spec.gpu > len(filtered_accelerators):
            return 0.0
        return (
            required_gpu.memory_size
            * resource_spec.gpu
            / sum(ac.memory_size for ac in target.accelerators)
        )
    if target.accelerators:
        return 0.01 / sum(ac.memory_size for ac in target.accelerators)
    return 1.0
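
A rough usage sketch of the module above, assuming the bentos come from `openllm_next.model` (as the CLI module earlier does via `list_bento`/`ensure_bento`):

```python
from openllm_next.accelerator_spec import can_run, get_local_machine_spec
from openllm_next.model import list_bento

target = get_local_machine_spec()   # probes the platform and NVIDIA GPUs via pynvml
for bento in list_bento():          # every bento known to the configured repos
    score = can_run(bento, target)  # 0.0 means not deployable on this machine
    print(bento, score)
```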
@@ -1,118 +0,0 @@
from __future__ import annotations

import functools
import os
import re
import time
import typing
from abc import ABC

import attr
import click
import typer
import typer.core

DO_NOT_TRACK = "BENTOML_DO_NOT_TRACK"


class EventMeta(ABC):
    @property
    def event_name(self):
        # camel case to snake case
        event_name = re.sub(r"(?<!^)(?=[A-Z])", "_", self.__class__.__name__).lower()
        # remove "_event" suffix
        suffix_to_remove = "_event"
        if event_name.endswith(suffix_to_remove):
            event_name = event_name[: -len(suffix_to_remove)]
        return event_name


@attr.define
class CliEvent(EventMeta):
    cmd_group: str
    cmd_name: str
    duration_in_ms: float = attr.field(default=0)
    error_type: typing.Optional[str] = attr.field(default=None)
    return_code: typing.Optional[int] = attr.field(default=None)


@attr.define
class OpenllmCliEvent(CliEvent):
    pass


class OrderedCommands(typer.core.TyperGroup):
    def list_commands(self, _: click.Context) -> typing.Iterable[str]:
        return list(self.commands)


class OpenLLMTyper(typer.Typer):
    def __init__(self, *args: typing.Any, **kwargs: typing.Any):
        no_args_is_help = kwargs.pop("no_args_is_help", True)
        context_settings = kwargs.pop("context_settings", {})
        if "help_option_names" not in context_settings:
            context_settings["help_option_names"] = ("-h", "--help")
        if "max_content_width" not in context_settings:
            context_settings["max_content_width"] = int(
                os.environ.get("COLUMNS", str(120))
            )
        klass = kwargs.pop("cls", OrderedCommands)

        super().__init__(
            *args,
            cls=klass,
            no_args_is_help=no_args_is_help,
            context_settings=context_settings,
            **kwargs,
        )

    def command(self, *args: typing.Any, **kwargs: typing.Any):
        def decorator(f):
            @functools.wraps(f)
            @click.pass_context
            def wrapped(ctx: click.Context, *args, **kwargs):
                from bentoml._internal.utils.analytics import track

                do_not_track = (
                    os.environ.get(DO_NOT_TRACK, str(False)).lower() == "true"
                )

                # so we know that the root program is openllm
                command_name = ctx.info_name
                if ctx.parent.parent is not None:
                    # openllm model list
                    command_group = ctx.parent.info_name
                elif ctx.parent.info_name == ctx.find_root().info_name:
                    # openllm run
                    command_group = "openllm"

                if do_not_track:
                    return f(*args, **kwargs)
                start_time = time.time_ns()
                try:
                    return_value = f(*args, **kwargs)
                    duration_in_ns = time.time_ns() - start_time
                    track(
                        OpenllmCliEvent(
                            cmd_group=command_group,
                            cmd_name=command_name,
                            duration_in_ms=duration_in_ns / 1e6,
                        )
                    )
                    return return_value
                except BaseException as e:
                    duration_in_ns = time.time_ns() - start_time
                    track(
                        OpenllmCliEvent(
                            cmd_group=command_group,
                            cmd_name=command_name,
                            duration_in_ms=duration_in_ns / 1e6,
                            error_type=type(e).__name__,
                            return_code=2 if isinstance(e, KeyboardInterrupt) else 1,
                        )
                    )
                    raise

            return typer.Typer.command(self, *args, **kwargs)(wrapped)

        return decorator
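
A minimal sketch of how this subclass is meant to be used, mirroring the CLI module shown earlier; the example command itself is hypothetical:

```python
from openllm_next.analytic import OpenLLMTyper

app = OpenLLMTyper(help="demo app with usage tracking")

@app.command(help="hypothetical command; each invocation emits an OpenllmCliEvent")
def greet(name: str = "world"):
    print(f"hello {name}")

if __name__ == "__main__":
    app()
```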
@@ -1,75 +0,0 @@
import pathlib
import shutil

import questionary

from openllm_next.analytic import OpenLLMTyper
from openllm_next.common import (
    CONFIG_FILE,
    REPO_DIR,
    VENV_DIR,
    VERBOSE_LEVEL,
    output,
)

app = OpenLLMTyper(help="clean up and release disk space used by OpenLLM")


HUGGINGFACE_CACHE = pathlib.Path.home() / ".cache" / "huggingface" / "hub"


@app.command(help="Clean up all the cached models from huggingface")
def model_cache(verbose: bool = False):
    if verbose:
        VERBOSE_LEVEL.set(20)
    used_space = sum(f.stat().st_size for f in HUGGINGFACE_CACHE.rglob("*"))
    sure = questionary.confirm(
        f"This will remove all models cached by Huggingface (~{used_space / 1024 / 1024:.2f}MB), are you sure?"
    ).ask()
    if not sure:
        return
    shutil.rmtree(HUGGINGFACE_CACHE, ignore_errors=True)
    output("All models cached by Huggingface have been removed", style="green")


@app.command(help="Clean up all the virtual environments created by OpenLLM")
def venvs(verbose: bool = False):
    if verbose:
        VERBOSE_LEVEL.set(20)
    used_space = sum(f.stat().st_size for f in VENV_DIR.rglob("*"))
    sure = questionary.confirm(
        f"This will remove all virtual environments created by OpenLLM (~{used_space / 1024 / 1024:.2f}MB), are you sure?"
    ).ask()
    if not sure:
        return
    shutil.rmtree(VENV_DIR, ignore_errors=True)
    output("All virtual environments have been removed", style="green")


@app.command(help="Clean up all the repositories cloned by OpenLLM")
def repos(verbose: bool = False):
    if verbose:
        VERBOSE_LEVEL.set(20)
    shutil.rmtree(REPO_DIR, ignore_errors=True)
    output("All repositories have been removed", style="green")


@app.command(help="Reset configurations to default")
def configs(verbose: bool = False):
    if verbose:
        VERBOSE_LEVEL.set(20)
    shutil.rmtree(CONFIG_FILE, ignore_errors=True)
    output("All configurations have been reset", style="green")


@app.command(
    name="all",
    help="Clean up all above and bring OpenLLM to a fresh start",
)
def all_cache(verbose: bool = False):
    if verbose:
        VERBOSE_LEVEL.set(20)
    repos()
    venvs()
    model_cache()
    configs()
@@ -1,174 +0,0 @@
|
|||||||
import json
|
|
||||||
import os
|
|
||||||
import pathlib
|
|
||||||
import shutil
|
|
||||||
import subprocess
|
|
||||||
import typing
|
|
||||||
|
|
||||||
import typer
|
|
||||||
|
|
||||||
from openllm_next.accelerator_spec import ACCELERATOR_SPECS
|
|
||||||
from openllm_next.analytic import OpenLLMTyper
|
|
||||||
from openllm_next.common import (
|
|
||||||
INTERACTIVE,
|
|
||||||
BentoInfo,
|
|
||||||
DeploymentTarget,
|
|
||||||
output,
|
|
||||||
run_command,
|
|
||||||
)
|
|
||||||
|
|
||||||
app = OpenLLMTyper()
|
|
||||||
|
|
||||||
|
|
||||||
def _get_deploy_cmd(bento: BentoInfo, target: typing.Optional[DeploymentTarget] = None):
|
|
||||||
cmd = ["bentoml", "deploy", bento.bentoml_tag]
|
|
||||||
env = {
|
|
||||||
"BENTOML_HOME": f"{bento.repo.path}/bentoml",
|
|
||||||
}
|
|
||||||
|
|
||||||
required_envs = bento.bento_yaml.get("envs", [])
|
|
||||||
required_env_names = [env["name"] for env in required_envs if "name" in env]
|
|
||||||
if required_env_names:
|
|
||||||
output(
|
|
||||||
f"This model requires the following environment variables to run: {repr(required_env_names)}",
|
|
||||||
style="yellow",
|
|
||||||
)
|
|
||||||
|
|
||||||
for env_info in bento.bento_yaml.get("envs", []):
|
|
||||||
if "name" not in env_info:
|
|
||||||
continue
|
|
||||||
if os.environ.get(env_info["name"]):
|
|
||||||
default = os.environ[env_info["name"]]
|
|
||||||
elif "value" in env_info:
|
|
||||||
default = env_info["value"]
|
|
||||||
else:
|
|
||||||
default = ""
|
|
||||||
|
|
||||||
if INTERACTIVE.get():
|
|
||||||
import questionary
|
|
||||||
|
|
||||||
value = questionary.text(
|
|
||||||
f"{env_info['name']}:",
|
|
||||||
default=default,
|
|
||||||
).ask()
|
|
||||||
else:
|
|
||||||
if default == "":
|
|
||||||
output(
|
|
||||||
f"Environment variable {env_info['name']} is required but not provided",
|
|
||||||
style="red",
|
|
||||||
)
|
|
||||||
raise typer.Exit(1)
|
|
||||||
else:
|
|
||||||
value = default
|
|
||||||
|
|
||||||
if value is None:
|
|
||||||
raise typer.Exit(1)
|
|
||||||
cmd += ["--env", f"{env_info['name']}={value}"]
|
|
||||||
|
|
||||||
if target:
|
|
||||||
cmd += ["--instance-type", target.name]
|
|
||||||
|
|
||||||
assert (pathlib.Path.home() / "bentoml" / ".yatai.yaml").exists()
|
|
||||||
shutil.copy(
|
|
||||||
pathlib.Path.home() / "bentoml" / ".yatai.yaml",
|
|
||||||
bento.repo.path / "bentoml" / ".yatai.yaml",
|
|
||||||
)
|
|
||||||
|
|
||||||
return cmd, env, None
|
|
||||||
|
|
||||||
|
|
||||||
def ensure_cloud_context():
|
|
||||||
import questionary
|
|
||||||
|
|
||||||
cmd = ["bentoml", "cloud", "current-context"]
|
|
||||||
try:
|
|
||||||
result = subprocess.check_output(cmd, stderr=subprocess.DEVNULL)
|
|
||||||
context = json.loads(result)
|
|
||||||
output(f" bentoml already logged in: {context['endpoint']}", style="green")
|
|
||||||
except subprocess.CalledProcessError:
|
|
||||||
output(" bentoml not logged in", style="red")
|
|
||||||
if not INTERACTIVE.get():
|
|
||||||
output(
|
|
||||||
"\n get bentoml logged in by:",
|
|
||||||
)
|
|
||||||
output(
|
|
||||||
" $ bentoml cloud login",
|
|
||||||
style="orange",
|
|
||||||
)
|
|
||||||
output("")
|
|
||||||
output(
|
|
||||||
""" * you may need to visit https://cloud.bentoml.com to get an account. you can also bring your own bentoml cluster (BYOC) to your team from https://bentoml.com/contact""",
|
|
||||||
style="yellow",
|
|
||||||
)
|
|
||||||
raise typer.Exit(1)
|
|
||||||
else:
|
|
||||||
action = questionary.select(
|
|
||||||
"Choose an action:",
|
|
||||||
choices=[
|
|
||||||
"I have a BentoCloud account",
|
|
||||||
"get an account in two minutes",
|
|
||||||
],
|
|
||||||
).ask()
|
|
||||||
if action is None:
|
|
||||||
raise typer.Exit(1)
|
|
||||||
elif action == "get an account in two minutes":
|
|
||||||
output(
|
|
||||||
"Please visit https://cloud.bentoml.com to get your token",
|
|
||||||
style="yellow",
|
|
||||||
)
|
|
||||||
endpoint = questionary.text(
|
|
||||||
"Enter the endpoint: (similar to https://my-org.cloud.bentoml.com)"
|
|
||||||
).ask()
|
|
||||||
if endpoint is None:
|
|
||||||
raise typer.Exit(1)
|
|
||||||
token = questionary.text(
|
|
||||||
"Enter your token: (similar to cniluaxxxxxxxx)"
|
|
||||||
).ask()
|
|
||||||
if token is None:
|
|
||||||
raise typer.Exit(1)
|
|
||||||
cmd = [
|
|
||||||
"bentoml",
|
|
||||||
"cloud",
|
|
||||||
"login",
|
|
||||||
"--api-token",
|
|
||||||
token,
|
|
||||||
"--endpoint",
|
|
||||||
endpoint,
|
|
||||||
]
|
|
||||||
try:
|
|
||||||
result = subprocess.check_output(cmd)
|
|
||||||
output(" Logged in successfully", style="green")
|
|
||||||
except subprocess.CalledProcessError:
|
|
||||||
output(" Failed to login", style="red")
|
|
||||||
raise typer.Exit(1)
|
|
||||||
|
|
||||||
|
|
||||||
def get_cloud_machine_spec():
|
|
||||||
ensure_cloud_context()
|
|
||||||
cmd = ["bentoml", "deployment", "list-instance-types", "-o", "json"]
|
|
||||||
try:
|
|
||||||
result = subprocess.check_output(cmd, stderr=subprocess.DEVNULL)
|
|
||||||
instance_types = json.loads(result)
|
|
||||||
return [
|
|
||||||
DeploymentTarget(
|
|
||||||
source="cloud",
|
|
||||||
name=it["name"],
|
|
||||||
price=it["price"],
|
|
||||||
platform="linux",
|
|
||||||
accelerators=(
|
|
||||||
[ACCELERATOR_SPECS[it["gpu_type"]] for _ in range(int(it["gpu"]))]
|
|
||||||
if it.get("gpu") and it["gpu_type"] in ACCELERATOR_SPECS
|
|
||||||
else []
|
|
||||||
),
|
|
||||||
)
|
|
||||||
for it in instance_types
|
|
||||||
]
|
|
||||||
except (subprocess.CalledProcessError, json.JSONDecodeError):
|
|
||||||
output("Failed to get cloud instance types", style="red")
|
|
||||||
return []
|
|
||||||
|
|
||||||
|
|
||||||
def deploy(bento: BentoInfo, target: DeploymentTarget):
|
|
||||||
ensure_cloud_context()
|
|
||||||
cmd, env, cwd = _get_deploy_cmd(bento, target)
|
|
||||||
run_command(cmd, env=env, cwd=cwd)
|
|
||||||
@@ -1,422 +0,0 @@
|
|||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import asyncio
|
|
||||||
import functools
|
|
||||||
import hashlib
|
|
||||||
import io
|
|
||||||
import json
|
|
||||||
import os
|
|
||||||
import pathlib
|
|
||||||
import signal
|
|
||||||
import subprocess
|
|
||||||
import sys
|
|
||||||
import sysconfig
|
|
||||||
import typing
|
|
||||||
from contextlib import asynccontextmanager, contextmanager
|
|
||||||
from types import SimpleNamespace
|
|
||||||
|
|
||||||
import typer
|
|
||||||
import typer.core
|
|
||||||
|
|
||||||
ERROR_STYLE = "red"
|
|
||||||
SUCCESS_STYLE = "green"
|
|
||||||
|
|
||||||
|
|
||||||
CLLAMA_HOME = pathlib.Path.home() / ".openllm_next"
|
|
||||||
REPO_DIR = CLLAMA_HOME / "repos"
|
|
||||||
TEMP_DIR = CLLAMA_HOME / "temp"
|
|
||||||
VENV_DIR = CLLAMA_HOME / "venv"
|
|
||||||
|
|
||||||
REPO_DIR.mkdir(exist_ok=True, parents=True)
|
|
||||||
TEMP_DIR.mkdir(exist_ok=True, parents=True)
|
|
||||||
VENV_DIR.mkdir(exist_ok=True, parents=True)
|
|
||||||
|
|
||||||
CONFIG_FILE = CLLAMA_HOME / "config.json"
|
|
||||||
|
|
||||||
CHECKED = "☆"
|
|
||||||
|
|
||||||
T = typing.TypeVar("T")
|
|
||||||
|
|
||||||
|
|
||||||
class ContextVar(typing.Generic[T]):
|
|
||||||
def __init__(self, default: T):
|
|
||||||
self._stack: list[T] = []
|
|
||||||
self._default = default
|
|
||||||
|
|
||||||
def get(self) -> T:
|
|
||||||
if self._stack:
|
|
||||||
return self._stack[-1]
|
|
||||||
return self._default
|
|
||||||
|
|
||||||
def set(self, value):
|
|
||||||
self._stack.append(value)
|
|
||||||
|
|
||||||
@contextmanager
|
|
||||||
def patch(self, value):
|
|
||||||
self._stack.append(value)
|
|
||||||
try:
|
|
||||||
yield
|
|
||||||
finally:
|
|
||||||
self._stack.pop()
|
|
||||||
|
|
||||||
|
|
||||||
VERBOSE_LEVEL = ContextVar(10)
|
|
||||||
INTERACTIVE = ContextVar(False)
|
|
||||||
FORCE = ContextVar(False)
|
|
||||||
|
|
||||||
|
|
||||||
def output(content, level=0, style=None, end=None):
|
|
||||||
import questionary
|
|
||||||
|
|
||||||
if level > VERBOSE_LEVEL.get():
|
|
||||||
return
|
|
||||||
|
|
||||||
if not isinstance(content, str):
|
|
||||||
import pyaml
|
|
||||||
|
|
||||||
out = io.StringIO()
|
|
||||||
pyaml.pprint(
|
|
||||||
content,
|
|
||||||
dst=out,
|
|
||||||
sort_dicts=False,
|
|
||||||
sort_keys=False,
|
|
||||||
)
|
|
||||||
questionary.print(out.getvalue(), style=style, end="" if end is None else end)
|
|
||||||
out.close()
|
|
||||||
|
|
||||||
if isinstance(content, str):
|
|
||||||
questionary.print(content, style=style, end="\n" if end is None else end)
|
|
||||||
|
|
||||||
|
|
||||||
class Config(SimpleNamespace):
|
|
||||||
repos: dict[str, str] = {
|
|
||||||
"default": "git+https://github.com/bentoml/openllm-models@main"
|
|
||||||
}
|
|
||||||
default_repo: str = "default"
|
|
||||||
|
|
||||||
def tolist(self):
|
|
||||||
return dict(
|
|
||||||
repos=self.repos,
|
|
||||||
default_repo=self.default_repo,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def load_config():
|
|
||||||
if CONFIG_FILE.exists():
|
|
||||||
try:
|
|
||||||
with open(CONFIG_FILE) as f:
|
|
||||||
return Config(**json.load(f))
|
|
||||||
except json.JSONDecodeError:
|
|
||||||
return Config()
|
|
||||||
return Config()
|
|
||||||
|
|
||||||
|
|
||||||
def save_config(config):
|
|
||||||
with open(CONFIG_FILE, "w") as f:
|
|
||||||
json.dump(config.tolist(), f, indent=2)
|
|
||||||
|
|
||||||
|
|
||||||
class RepoInfo(SimpleNamespace):
|
|
||||||
name: str
|
|
||||||
path: pathlib.Path
|
|
||||||
url: str
|
|
||||||
server: str
|
|
||||||
owner: str
|
|
||||||
repo: str
|
|
||||||
branch: str
|
|
||||||
|
|
||||||
def tolist(self):
|
|
||||||
if VERBOSE_LEVEL.get() <= 0:
|
|
||||||
return f"{self.name} ({self.url})"
|
|
||||||
if VERBOSE_LEVEL.get() <= 10:
|
|
||||||
return dict(
|
|
||||||
name=self.name,
|
|
||||||
url=self.url,
|
|
||||||
path=str(self.path),
|
|
||||||
)
|
|
||||||
if VERBOSE_LEVEL.get() <= 20:
|
|
||||||
return dict(
|
|
||||||
name=self.name,
|
|
||||||
url=self.url,
|
|
||||||
path=str(self.path),
|
|
||||||
server=self.server,
|
|
||||||
owner=self.owner,
|
|
||||||
repo=self.repo,
|
|
||||||
branch=self.branch,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class BentoInfo(SimpleNamespace):
|
|
||||||
repo: RepoInfo
|
|
||||||
path: pathlib.Path
|
|
||||||
alias: str = ""
|
|
||||||
|
|
||||||
def __str__(self):
|
|
||||||
if self.repo.name == "default":
|
|
||||||
return f"{self.tag}"
|
|
||||||
else:
|
|
||||||
return f"{self.repo.name}/{self.tag}"
|
|
||||||
|
|
||||||
def __hash__(self):
|
|
||||||
return md5(str(self.path))
|
|
||||||
|
|
||||||
@property
|
|
||||||
def tag(self) -> str:
|
|
||||||
if self.alias:
|
|
||||||
return f"{self.path.parent.name}:{self.alias}"
|
|
||||||
return f"{self.path.parent.name}:{self.path.name}"
|
|
||||||
|
|
||||||
@property
|
|
||||||
def bentoml_tag(self) -> str:
|
|
||||||
return f"{self.path.parent.name}:{self.path.name}"
|
|
||||||
|
|
||||||
@property
|
|
||||||
def name(self) -> str:
|
|
||||||
return self.path.parent.name
|
|
||||||
|
|
||||||
@property
|
|
||||||
def version(self) -> str:
|
|
||||||
return self.path.name
|
|
||||||
|
|
||||||
@property
|
|
||||||
def labels(self) -> dict[str, str]:
|
|
||||||
return self.bento_yaml["labels"]
|
|
||||||
|
|
||||||
@functools.cached_property
|
|
||||||
def bento_yaml(self) -> dict:
|
|
||||||
import yaml
|
|
||||||
|
|
||||||
bento_file = self.path / "bento.yaml"
|
|
||||||
return yaml.safe_load(bento_file.read_text())
|
|
||||||
|
|
||||||
@functools.cached_property
|
|
||||||
def platforms(self) -> list[str]:
|
|
||||||
return self.bento_yaml["labels"].get("platforms", "linux").split(",")
|
|
||||||
|
|
||||||
@functools.cached_property
|
|
||||||
def pretty_yaml(self) -> dict:
|
|
||||||
def _pretty_routes(routes):
|
|
||||||
return {
|
|
||||||
route["route"]: {
|
|
||||||
"input": {
|
|
||||||
k: v["type"] for k, v in route["input"]["properties"].items()
|
|
||||||
},
|
|
||||||
"output": route["output"]["type"],
|
|
||||||
}
|
|
||||||
for route in routes
|
|
||||||
}
|
|
||||||
|
|
||||||
if len(self.bento_yaml["services"]) == 1:
|
|
||||||
pretty_yaml = {
|
|
||||||
"apis": _pretty_routes(self.bento_yaml["schema"]["routes"]),
|
|
||||||
"resources": self.bento_yaml["services"][0]["config"]["resources"],
|
|
||||||
"envs": self.bento_yaml["envs"],
|
|
||||||
"platforms": self.platforms,
|
|
||||||
}
|
|
||||||
return pretty_yaml
|
|
||||||
return self.bento_yaml
|
|
||||||
|
|
||||||
@functools.cached_property
|
|
||||||
def pretty_gpu(self) -> str:
|
|
||||||
from openllm_next.accelerator_spec import ACCELERATOR_SPECS
|
|
||||||
|
|
||||||
try:
|
|
||||||
resources = self.bento_yaml["services"][0]["config"]["resources"]
|
|
||||||
if resources["gpu"] > 1:
|
|
||||||
acc = ACCELERATOR_SPECS[resources["gpu_type"]]
|
|
||||||
return f"{acc.memory_size:.0f}Gx{resources['gpu']}"
|
|
||||||
elif resources["gpu"] > 0:
|
|
||||||
acc = ACCELERATOR_SPECS[resources["gpu_type"]]
|
|
||||||
return f"{acc.memory_size:.0f}G"
|
|
||||||
except KeyError:
|
|
||||||
pass
|
|
||||||
return ""
|
|
||||||
|
|
||||||
def tolist(self):
|
|
||||||
verbose = VERBOSE_LEVEL.get()
|
|
||||||
if verbose <= 0:
|
|
||||||
return str(self)
|
|
||||||
if verbose <= 10:
|
|
||||||
return dict(
|
|
||||||
tag=self.tag,
|
|
||||||
repo=self.repo.tolist(),
|
|
||||||
path=str(self.path),
|
|
||||||
model_card=self.pretty_yaml,
|
|
||||||
)
|
|
||||||
if verbose <= 20:
|
|
||||||
return dict(
|
|
||||||
tag=self.tag,
|
|
||||||
repo=self.repo.tolist(),
|
|
||||||
path=str(self.path),
|
|
||||||
bento_yaml=self.bento_yaml,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class VenvSpec(SimpleNamespace):
|
|
||||||
python_version: str
|
|
||||||
python_packages: dict[str, str]
|
|
||||||
name_prefix = ""
|
|
||||||
|
|
||||||
def __hash__(self):
|
|
||||||
return md5(
|
|
||||||
# self.python_version,
|
|
||||||
*sorted(self.python_packages),
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class Accelerator(SimpleNamespace):
|
|
||||||
model: str
|
|
||||||
memory_size: float
|
|
||||||
|
|
||||||
def __gt__(self, other):
|
|
||||||
return self.memory_size > other.memory_size
|
|
||||||
|
|
||||||
def __eq__(self, other):
|
|
||||||
return self.memory_size == other.memory_size
|
|
||||||
|
|
||||||
|
|
||||||
class DeploymentTarget(SimpleNamespace):
|
|
||||||
source: str = "local"
|
|
||||||
name: str = "local"
|
|
||||||
price: str = ""
|
|
||||||
platform = "linux"
|
|
||||||
accelerators: list[Accelerator]
|
|
||||||
|
|
||||||
def __hash__(self):
|
|
||||||
return hash(self.source)
|
|
||||||
|
|
||||||
@property
|
|
||||||
def accelerators_repr(self) -> str:
|
|
||||||
accs = {a.model for a in self.accelerators}
|
|
||||||
if len(accs) == 0:
|
|
||||||
return "null"
|
|
||||||
if len(accs) == 1:
|
|
||||||
a = self.accelerators[0]
|
|
||||||
return f"{a.model} x{len(self.accelerators)}"
|
|
||||||
return ", ".join((f"{a.model}" for a in self.accelerators))
|
|
||||||
|
|
||||||
|
|
||||||
def run_command(
|
|
||||||
cmd,
|
|
||||||
cwd=None,
|
|
||||||
env=None,
|
|
||||||
copy_env=True,
|
|
||||||
venv=None,
|
|
||||||
silent=False,
|
|
||||||
) -> subprocess.CompletedProcess:
|
|
||||||
import shlex
|
|
||||||
|
|
||||||
env = env or {}
|
|
||||||
cmd = [str(c) for c in cmd]
|
|
||||||
bin_dir = "Scripts" if os.name == "nt" else "bin"
|
|
||||||
if not silent:
|
|
||||||
output("\n")
|
|
||||||
if cwd:
|
|
||||||
output(f"$ cd {cwd}", style="orange")
|
|
||||||
if env:
|
|
||||||
for k, v in env.items():
|
|
||||||
output(f"$ export {k}={shlex.quote(v)}", style="orange")
|
|
||||||
if venv:
|
|
||||||
output(f"$ source {venv / 'bin' / 'activate'}", style="orange")
|
|
||||||
output(f"$ {' '.join(cmd)}", style="orange")
|
|
||||||
|
|
||||||
if venv:
|
|
||||||
py = venv / bin_dir / f"python{sysconfig.get_config_var('EXE')}"
|
|
||||||
else:
|
|
||||||
py = sys.executable
|
|
||||||
|
|
||||||
if copy_env:
|
|
||||||
env = {**os.environ, **env}
|
|
||||||
|
|
||||||
if cmd and cmd[0] == "bentoml":
|
|
||||||
cmd = [py, "-m", "bentoml"] + cmd[1:]
|
|
||||||
if cmd and cmd[0] == "python":
|
|
||||||
cmd = [py] + cmd[1:]
|
|
||||||
|
|
||||||
try:
|
|
||||||
if silent:
|
|
||||||
return subprocess.run( # type: ignore
|
|
||||||
cmd,
|
|
||||||
cwd=cwd,
|
|
||||||
env=env,
|
|
||||||
stdout=subprocess.DEVNULL,
|
|
||||||
stderr=subprocess.DEVNULL,
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
return subprocess.run(
|
|
||||||
cmd,
|
|
||||||
cwd=cwd,
|
|
||||||
env=env,
|
|
||||||
)
|
|
||||||
except subprocess.CalledProcessError:
|
|
||||||
output("Command failed", style="red")
|
|
||||||
raise typer.Exit(1)
|
|
||||||
|
|
||||||
|
|
||||||
async def stream_command_output(stream, style="gray"):
|
|
||||||
async for line in stream:
|
|
||||||
output(line.decode(), style=style, end="")
|
|
||||||
|
|
||||||
|
|
||||||
@asynccontextmanager
|
|
||||||
async def async_run_command(
|
|
||||||
cmd,
|
|
||||||
cwd=None,
|
|
||||||
env=None,
|
|
||||||
copy_env=True,
|
|
||||||
venv=None,
|
|
||||||
silent=True,
|
|
||||||
):
|
|
||||||
import shlex
|
|
||||||
|
|
||||||
env = env or {}
|
|
||||||
cmd = [str(c) for c in cmd]
|
|
||||||
|
|
||||||
if not silent:
|
|
||||||
output("\n")
|
|
||||||
if cwd:
|
|
||||||
output(f"$ cd {cwd}", style="orange")
|
|
||||||
if env:
|
|
||||||
for k, v in env.items():
|
|
||||||
output(f"$ export {k}={shlex.quote(v)}", style="orange")
|
|
||||||
if venv:
|
|
||||||
output(f"$ source {venv / 'bin' / 'activate'}", style="orange")
|
|
||||||
output(f"$ {' '.join(cmd)}", style="orange")
|
|
||||||
|
|
||||||
if venv:
|
|
||||||
py = venv / "bin" / "python"
|
|
||||||
else:
|
|
||||||
py = sys.executable
|
|
||||||
|
|
||||||
if copy_env:
|
|
||||||
env = {**os.environ, **env}
|
|
||||||
|
|
||||||
if cmd and cmd[0] == "bentoml":
|
|
||||||
cmd = [py, "-m", "bentoml"] + cmd[1:]
|
|
||||||
if cmd and cmd[0] == "python":
|
|
||||||
cmd = [py] + cmd[1:]
|
|
||||||
|
|
||||||
proc = None
|
|
||||||
try:
|
|
||||||
proc = await asyncio.create_subprocess_shell(
|
|
||||||
" ".join(map(str, cmd)),
|
|
||||||
stdout=asyncio.subprocess.PIPE,
|
|
||||||
stderr=asyncio.subprocess.PIPE,
|
|
||||||
cwd=cwd,
|
|
||||||
env=env,
|
|
||||||
)
|
|
||||||
yield proc
|
|
||||||
except subprocess.CalledProcessError:
|
|
||||||
output("Command failed", style="red")
|
|
||||||
raise typer.Exit(1)
|
|
||||||
finally:
|
|
||||||
if proc:
|
|
||||||
proc.send_signal(signal.SIGINT)
|
|
||||||
await proc.wait()
|
|
||||||
|
|
||||||
|
|
||||||
def md5(*strings: str) -> int:
|
|
||||||
m = hashlib.md5()
|
|
||||||
for s in strings:
|
|
||||||
m.update(s.encode())
|
|
||||||
return int(m.hexdigest(), 16)
|
|
||||||
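The ContextVar/output pair defined in the module above drives all verbosity gating in the CLI. A small illustrative sketch (assuming the removed module is importable as openllm_next.common): the default verbosity is 10, and output only prints when its level does not exceed the current value.

from openllm_next.common import VERBOSE_LEVEL, output

output("always shown", level=0)
with VERBOSE_LEVEL.patch(20):
    output("shown only while verbosity is raised to 20", level=20, style="green")
output("suppressed again at the default level of 10", level=20)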
@@ -1,117 +0,0 @@
|
|||||||
import asyncio
|
|
||||||
import time
|
|
||||||
|
|
||||||
import httpx
|
|
||||||
|
|
||||||
from openllm_next.common import (
|
|
||||||
BentoInfo,
|
|
||||||
async_run_command,
|
|
||||||
output,
|
|
||||||
run_command,
|
|
||||||
stream_command_output,
|
|
||||||
)
|
|
||||||
from openllm_next.venv import ensure_venv
|
|
||||||
|
|
||||||
|
|
||||||
def _get_serve_cmd(bento: BentoInfo, port: int = 3000):
|
|
||||||
cmd = ["bentoml", "serve", bento.bentoml_tag]
|
|
||||||
if port != 3000:
|
|
||||||
cmd += ["--port", str(port)]
|
|
||||||
env = {
|
|
||||||
"BENTOML_HOME": f"{bento.repo.path}/bentoml",
|
|
||||||
}
|
|
||||||
return cmd, env, None
|
|
||||||
|
|
||||||
|
|
||||||
def serve(
|
|
||||||
bento: BentoInfo,
|
|
||||||
port: int = 3000,
|
|
||||||
):
|
|
||||||
venv = ensure_venv(bento)
|
|
||||||
cmd, env, cwd = _get_serve_cmd(bento, port=port)
|
|
||||||
run_command(cmd, env=env, cwd=cwd, venv=venv)
|
|
||||||
|
|
||||||
|
|
||||||
async def _run_model(
|
|
||||||
bento: BentoInfo,
|
|
||||||
port: int = 3000,
|
|
||||||
timeout: int = 600,
|
|
||||||
):
|
|
||||||
venv = ensure_venv(bento)
|
|
||||||
cmd, env, cwd = _get_serve_cmd(bento, port)
|
|
||||||
async with async_run_command(
|
|
||||||
cmd,
|
|
||||||
env=env,
|
|
||||||
cwd=cwd,
|
|
||||||
venv=venv,
|
|
||||||
silent=False,
|
|
||||||
) as server_proc:
|
|
||||||
|
|
||||||
output(f"Model server started {server_proc.pid}")
|
|
||||||
|
|
||||||
stdout_streamer = None
|
|
||||||
stderr_streamer = None
|
|
||||||
start_time = time.time()
|
|
||||||
|
|
||||||
output("Model loading...", style="green")
|
|
||||||
for _ in range(timeout):
|
|
||||||
try:
|
|
||||||
resp = httpx.get(f"http://localhost:{port}/readyz", timeout=3)
|
|
||||||
if resp.status_code == 200:
|
|
||||||
break
|
|
||||||
except httpx.RequestError:
|
|
||||||
if time.time() - start_time > 30:
|
|
||||||
if not stdout_streamer:
|
|
||||||
stdout_streamer = asyncio.create_task(
|
|
||||||
stream_command_output(server_proc.stdout, style="gray")
|
|
||||||
)
|
|
||||||
if not stderr_streamer:
|
|
||||||
stderr_streamer = asyncio.create_task(
|
|
||||||
stream_command_output(server_proc.stderr, style="#BD2D0F")
|
|
||||||
)
|
|
||||||
await asyncio.sleep(1)
|
|
||||||
else:
|
|
||||||
output("Model failed to load", style="red")
|
|
||||||
server_proc.terminate()
|
|
||||||
return
|
|
||||||
|
|
||||||
if stdout_streamer:
|
|
||||||
stdout_streamer.cancel()
|
|
||||||
if stderr_streamer:
|
|
||||||
stderr_streamer.cancel()
|
|
||||||
|
|
||||||
output("Model is ready", style="green")
|
|
||||||
messages: list[dict[str, str]] = []
|
|
||||||
|
|
||||||
from openai import AsyncOpenAI
|
|
||||||
|
|
||||||
client = AsyncOpenAI(base_url=f"http://localhost:{port}/v1", api_key="local")
|
|
||||||
model_id = (await client.models.list()).data[0].id
|
|
||||||
while True:
|
|
||||||
try:
|
|
||||||
message = input("user: ")
|
|
||||||
if message == "":
|
|
||||||
output("empty message, please enter something", style="yellow")
|
|
||||||
continue
|
|
||||||
messages.append(dict(role="user", content=message))
|
|
||||||
output("assistant: ", end="", style="lightgreen")
|
|
||||||
assistant_message = ""
|
|
||||||
stream = await client.chat.completions.create(
|
|
||||||
model=model_id,
|
|
||||||
messages=messages, # type: ignore
|
|
||||||
stream=True,
|
|
||||||
)
|
|
||||||
async for chunk in stream:
|
|
||||||
text = chunk.choices[0].delta.content or ""
|
|
||||||
assistant_message += text
|
|
||||||
output(text, end="", style="lightgreen")
|
|
||||||
messages.append(dict(role="assistant", content=assistant_message))
|
|
||||||
output("")
|
|
||||||
except KeyboardInterrupt:
|
|
||||||
break
|
|
||||||
output("\nStopping model server...", style="green")
|
|
||||||
output("Stopped model server", style="green")
|
|
||||||
|
|
||||||
|
|
||||||
def run(bento: BentoInfo, port: int = 3000, timeout: int = 600):
|
|
||||||
asyncio.run(_run_model(bento, port=port, timeout=timeout))
|
|
||||||
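The serving loop above waits for the model server by polling its readiness endpoint before opening the chat prompt. A standalone sketch of that probe (the helper name is hypothetical; the /readyz path and the 3-second request timeout are taken from the code above):

import time
import httpx

def wait_until_ready(port: int = 3000, timeout: int = 600) -> bool:
    # returns True once GET /readyz answers 200, False if the deadline passes
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            if httpx.get(f"http://localhost:{port}/readyz", timeout=3).status_code == 200:
                return True
        except httpx.RequestError:
            pass
        time.sleep(1)
    return False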
@@ -1,173 +0,0 @@
import typing
from typing import Optional

import tabulate
import typer

from openllm_next.accelerator_spec import DeploymentTarget, can_run
from openllm_next.analytic import OpenLLMTyper
from openllm_next.common import (
    FORCE,
    VERBOSE_LEVEL,
    BentoInfo,
    load_config,
    output,
)
from openllm_next.repo import ensure_repo_updated, parse_repo_url

app = OpenLLMTyper(help="manage models")


@app.command()
def get(
    tag: str,
    repo: Optional[str] = None,
    verbose: bool = False,
):
    if verbose:
        VERBOSE_LEVEL.set(20)
    bento_info = ensure_bento(tag, repo_name=repo)
    if bento_info:
        output(bento_info)


@app.command(name="list")
def list_(
    tag: Optional[str] = None,
    repo: Optional[str] = None,
    verbose: bool = False,
):
    if verbose:
        VERBOSE_LEVEL.set(20)

    bentos = list_bento(tag=tag, repo_name=repo)
    bentos.sort(key=lambda x: x.name)

    seen = set()

    def is_seen(value):
        if value in seen:
            return True
        seen.add(value)
        return False

    table = tabulate.tabulate(
        [
            [
                "" if is_seen(bento.name) else bento.name,
                bento.tag,
                bento.repo.name,
                bento.pretty_gpu,
                ",".join(bento.platforms),
            ]
            for bento in bentos
        ],
        headers=["model", "version", "repo", "required VRAM", "platforms"],
    )
    output(table)


def ensure_bento(
    model: str,
    target: Optional[DeploymentTarget] = None,
    repo_name: Optional[str] = None,
) -> BentoInfo:
    bentos = list_bento(model, repo_name=repo_name)
    if len(bentos) == 0:
        output(f"No model found for {model}", style="red")
        raise typer.Exit(1)

    if len(bentos) == 1:
        if FORCE.get():
            output(f"Found model {bentos[0]}", style="green")
            return bentos[0]
        if target is None:
            return bentos[0]
        if can_run(bentos[0], target) <= 0:
            return bentos[0]
        output(f"Found model {bentos[0]}", style="green")
        return bentos[0]

    if target is None:
        output(
            f"Multiple models match {model}, did you mean one of these?",
            style="red",
        )
        for bento in bentos:
            output(f"  {bento}")
        raise typer.Exit(1)

    filtered = [bento for bento in bentos if can_run(bento, target) > 0]
    if len(filtered) == 0:
        output(f"No deployment target found for {model}", style="red")
        raise typer.Exit(1)

    if len(filtered) == 0:
        output(f"No deployment target found for {model}", style="red")
        raise typer.Exit(1)

    if len(bentos) > 1:
        output(
            f"Multiple models match {model}, did you mean one of these?",
            style="red",
        )
        for bento in bentos:
            output(f"  {bento}")
        raise typer.Exit(1)

    return bentos[0]


def list_bento(
    tag: typing.Optional[str] = None,
    repo_name: typing.Optional[str] = None,
    include_alias: bool = False,
) -> typing.List[BentoInfo]:
    ensure_repo_updated()

    if repo_name is not None:
        config = load_config()
        if repo_name not in config.repos:
            output(f"Repo `{repo_name}` not found, did you mean one of these?")
            for repo_name in config.repos:
                output(f"  {repo_name}")
            raise typer.Exit(1)

    if not tag:
        glob_pattern = "bentoml/bentos/*/*"
    elif ":" in tag:
        bento_name, version = tag.split(":")
        glob_pattern = f"bentoml/bentos/{bento_name}/{version}"
    else:
        glob_pattern = f"bentoml/bentos/{tag}/*"

    model_list = []
    config = load_config()
    for _repo_name, repo_url in config.repos.items():
        if repo_name is not None and _repo_name != repo_name:
            continue
        repo = parse_repo_url(repo_url, _repo_name)
        for path in repo.path.glob(glob_pattern):
            if path.is_dir() and (path / "bento.yaml").exists():
                model = BentoInfo(repo=repo, path=path)
            elif path.is_file():
                with open(path) as f:
                    origin_name = f.read().strip()
                origin_path = path.parent / origin_name
                model = BentoInfo(alias=path.name, repo=repo, path=origin_path)
            else:
                model = None
            if model:
                model_list.append(model)
    model_list.sort(key=lambda x: x.tag)
    if not include_alias:
        seen = set()
        model_list = [
            x
            for x in model_list
            if not (
                f"{x.bento_yaml['name']}:{x.bento_yaml['version']}" in seen
                or seen.add(f"{x.bento_yaml['name']}:{x.bento_yaml['version']}")
            )
        ]
    return model_list
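A brief usage sketch of the two helpers above (the tag value is hypothetical; it assumes a model repo has already been fetched with openllm repo update):

from openllm_next.model import ensure_bento, list_bento

for bento in list_bento("llama3.1"):  # glob-matches bentoml/bentos/llama3.1/*
    print(bento.tag, bento.repo.name)
bento = ensure_bento("llama3.1")      # exits unless the match is unambiguous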
@@ -1,203 +0,0 @@
|
|||||||
import datetime
|
|
||||||
import re
|
|
||||||
import shutil
|
|
||||||
|
|
||||||
import pyaml
|
|
||||||
import questionary
|
|
||||||
import typer
|
|
||||||
|
|
||||||
from openllm_next.analytic import OpenLLMTyper
|
|
||||||
from openllm_next.common import (
|
|
||||||
INTERACTIVE,
|
|
||||||
REPO_DIR,
|
|
||||||
VERBOSE_LEVEL,
|
|
||||||
RepoInfo,
|
|
||||||
load_config,
|
|
||||||
output,
|
|
||||||
save_config,
|
|
||||||
)
|
|
||||||
|
|
||||||
UPDATE_INTERVAL = datetime.timedelta(days=3)
|
|
||||||
|
|
||||||
app = OpenLLMTyper(help="manage repos")
|
|
||||||
|
|
||||||
|
|
||||||
@app.command()
|
|
||||||
def list(verbose: bool = False):
|
|
||||||
if verbose:
|
|
||||||
VERBOSE_LEVEL.set(20)
|
|
||||||
config = load_config()
|
|
||||||
pyaml.pprint(
|
|
||||||
[parse_repo_url(repo, name) for name, repo in config.repos.items()],
|
|
||||||
sort_dicts=False,
|
|
||||||
sort_keys=False,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@app.command()
|
|
||||||
def remove(name: str):
|
|
||||||
config = load_config()
|
|
||||||
if name not in config.repos:
|
|
||||||
output(f"Repo {name} does not exist", style="red")
|
|
||||||
return
|
|
||||||
|
|
||||||
del config.repos[name]
|
|
||||||
save_config(config)
|
|
||||||
output(f"Repo {name} removed", style="green")
|
|
||||||
|
|
||||||
|
|
||||||
def _complete_alias(repo_name: str):
|
|
||||||
from openllm_next.model import list_bento
|
|
||||||
|
|
||||||
for bento in list_bento(repo_name=repo_name):
|
|
||||||
alias = bento.labels.get("openllm_alias", "").strip()
|
|
||||||
if alias:
|
|
||||||
for a in alias.split(","):
|
|
||||||
with open(bento.path.parent / a, "w") as f:
|
|
||||||
f.write(bento.version)
|
|
||||||
|
|
||||||
|
|
||||||
@app.command()
|
|
||||||
def update():
|
|
||||||
import dulwich
|
|
||||||
import dulwich.errors
|
|
||||||
import dulwich.porcelain
|
|
||||||
|
|
||||||
config = load_config()
|
|
||||||
repos_in_use = set()
|
|
||||||
for repo_name, repo in config.repos.items():
|
|
||||||
repo = parse_repo_url(repo, repo_name)
|
|
||||||
repos_in_use.add((repo.server, repo.owner, repo.repo))
|
|
||||||
if repo.path.exists(): # TODO: use update instead of remove and clone
|
|
||||||
shutil.rmtree(repo.path, ignore_errors=True)
|
|
||||||
if not repo.path.exists():
|
|
||||||
repo.path.parent.mkdir(parents=True, exist_ok=True)
|
|
||||||
try:
|
|
||||||
dulwich.porcelain.clone(
|
|
||||||
f"https://{repo.server}/{repo.owner}/{repo.repo}.git",
|
|
||||||
str(repo.path),
|
|
||||||
checkout=True,
|
|
||||||
depth=1,
|
|
||||||
branch=repo.branch,
|
|
||||||
)
|
|
||||||
output("")
|
|
||||||
output(f"Repo `{repo.name}` updated", style="green")
|
|
||||||
except:
|
|
||||||
shutil.rmtree(repo.path, ignore_errors=True)
|
|
||||||
output(f"Failed to clone repo {repo.name}", style="red")
|
|
||||||
else:
|
|
||||||
try:
|
|
||||||
import dulwich.porcelain
|
|
||||||
|
|
||||||
dulwich.porcelain.pull(
|
|
||||||
str(repo.path),
|
|
||||||
f"https://{repo.server}/{repo.owner}/{repo.repo}.git",
|
|
||||||
refspecs=repo.branch,
|
|
||||||
force=True,
|
|
||||||
)
|
|
||||||
dulwich.porcelain.clean(str(repo.path), str(repo.path))
|
|
||||||
output("")
|
|
||||||
output(f"Repo `{repo.name}` updated", style="green")
|
|
||||||
except:
|
|
||||||
shutil.rmtree(repo.path, ignore_errors=True)
|
|
||||||
output(f"Failed to update repo {repo.name}", style="red")
|
|
||||||
for c in REPO_DIR.glob("*/*/*"):
|
|
||||||
repo_spec = tuple(c.parts[-3:])
|
|
||||||
if repo_spec not in repos_in_use:
|
|
||||||
shutil.rmtree(c, ignore_errors=True)
|
|
||||||
output(f"Removed unused repo cache {c}")
|
|
||||||
with open(REPO_DIR / "last_update", "w") as f:
|
|
||||||
f.write(datetime.datetime.now().isoformat())
|
|
||||||
for repo_name in config.repos:
|
|
||||||
_complete_alias(repo_name)
|
|
||||||
|
|
||||||
|
|
||||||
def ensure_repo_updated():
|
|
||||||
last_update_file = REPO_DIR / "last_update"
|
|
||||||
if not last_update_file.exists():
|
|
||||||
if INTERACTIVE.get():
|
|
||||||
choice = questionary.confirm(
|
|
||||||
"The repo cache is never updated, do you want to update it to fetch the latest model list?"
|
|
||||||
).ask()
|
|
||||||
if choice:
|
|
||||||
update()
|
|
||||||
return
|
|
||||||
else:
|
|
||||||
output(
|
|
||||||
"The repo cache is never updated, please run `openllm repo update` to fetch the latest model list",
|
|
||||||
style="red",
|
|
||||||
)
|
|
||||||
raise typer.Exit(1)
|
|
||||||
last_update = datetime.datetime.fromisoformat(last_update_file.read_text().strip())
|
|
||||||
if datetime.datetime.now() - last_update > UPDATE_INTERVAL:
|
|
||||||
if INTERACTIVE.get():
|
|
||||||
choice = questionary.confirm(
|
|
||||||
"The repo cache is outdated, do you want to update it to fetch the latest model list?"
|
|
||||||
).ask()
|
|
||||||
if choice:
|
|
||||||
update()
|
|
||||||
else:
|
|
||||||
output(
|
|
||||||
"The repo cache is outdated, please run `openllm repo update` to fetch the latest model list",
|
|
||||||
style="yellow",
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
GIT_REPO_RE = re.compile(
|
|
||||||
r"git\+https://(?P<server>.+)/(?P<owner>.+)/(?P<repo>.+?)(@(?P<branch>.+))?$"
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def parse_repo_url(repo_url, repo_name=None) -> RepoInfo:
|
|
||||||
"""
|
|
||||||
parse the git repo url to server, owner, repo name, branch
|
|
||||||
>>> parse_repo_url("git+https://github.com/bentoml/bentovllm@main")
|
|
||||||
('github.com', 'bentoml', 'bentovllm', 'main')
|
|
||||||
|
|
||||||
>>> parse_repo_url("git+https://github.com/bentoml/bentovllm")
|
|
||||||
('github.com', 'bentoml', 'bentovllm', 'main')
|
|
||||||
"""
|
|
||||||
match = GIT_REPO_RE.match(repo_url)
|
|
||||||
if not match:
|
|
||||||
raise ValueError(f"Invalid git repo url: {repo_url}")
|
|
||||||
server = match.group("server")
|
|
||||||
owner = match.group("owner")
|
|
||||||
repo = match.group("repo")
|
|
||||||
branch = match.group("branch") or "main"
|
|
||||||
path = REPO_DIR / server / owner / repo
|
|
||||||
return RepoInfo(
|
|
||||||
name=repo if repo_name is None else repo_name,
|
|
||||||
url=repo_url,
|
|
||||||
server=server,
|
|
||||||
owner=owner,
|
|
||||||
repo=repo,
|
|
||||||
branch=branch,
|
|
||||||
path=path,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@app.command()
|
|
||||||
def add(name: str, repo: str):
|
|
||||||
name = name.lower()
|
|
||||||
if not name.isidentifier():
|
|
||||||
output(
|
|
||||||
f"Invalid repo name: {name}, should only contain letters, numbers and underscores",
|
|
||||||
style="red",
|
|
||||||
)
|
|
||||||
return
|
|
||||||
|
|
||||||
config = load_config()
|
|
||||||
if name in config.repos:
|
|
||||||
override = questionary.confirm(
|
|
||||||
f"Repo {name} already exists({config.repos[name]}), override?"
|
|
||||||
).ask()
|
|
||||||
if not override:
|
|
||||||
return
|
|
||||||
|
|
||||||
config.repos[name] = repo
|
|
||||||
save_config(config)
|
|
||||||
output(f"Repo {name} added", style="green")
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
app()
|
|
||||||
@@ -1,164 +0,0 @@
|
|||||||
import functools
|
|
||||||
import os
|
|
||||||
import pathlib
|
|
||||||
import shutil
|
|
||||||
import typing
|
|
||||||
from typing import Iterable
|
|
||||||
|
|
||||||
import typer
|
|
||||||
|
|
||||||
from openllm_next.common import (
|
|
||||||
VENV_DIR,
|
|
||||||
VERBOSE_LEVEL,
|
|
||||||
BentoInfo,
|
|
||||||
VenvSpec,
|
|
||||||
output,
|
|
||||||
run_command,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@functools.lru_cache
|
|
||||||
def _resolve_packages(requirement: typing.Union[pathlib.Path, str]):
|
|
||||||
from pip_requirements_parser import RequirementsFile
|
|
||||||
|
|
||||||
requirements_txt = RequirementsFile.from_file(
|
|
||||||
str(requirement),
|
|
||||||
include_nested=True,
|
|
||||||
)
|
|
||||||
return requirements_txt.requirements
|
|
||||||
|
|
||||||
|
|
||||||
def _filter_preheat_packages(requirements: Iterable) -> list[str]:
|
|
||||||
PREHEAT_PIP_PACKAGES = ["torch", "vllm"]
|
|
||||||
|
|
||||||
deps: list[str] = []
|
|
||||||
for req in requirements:
|
|
||||||
if (
|
|
||||||
req.is_editable
|
|
||||||
or req.is_local_path
|
|
||||||
or req.is_url
|
|
||||||
or req.is_wheel
|
|
||||||
or not req.name
|
|
||||||
or not req.specifier
|
|
||||||
):
|
|
||||||
continue
|
|
||||||
for sp in req.specifier:
|
|
||||||
if sp.operator == "==" and req.name in PREHEAT_PIP_PACKAGES:
|
|
||||||
assert req.line is not None
|
|
||||||
deps.append(req.line)
|
|
||||||
break
|
|
||||||
return deps
|
|
||||||
|
|
||||||
|
|
||||||
@functools.lru_cache
|
|
||||||
def _resolve_bento_env_specs(bento: BentoInfo):
|
|
||||||
ver_file = bento.path / "env" / "python" / "version.txt"
|
|
||||||
assert ver_file.exists(), f"cannot find version file in {bento.path}"
|
|
||||||
|
|
||||||
lock_file = bento.path / "env" / "python" / "requirements.lock.txt"
|
|
||||||
if not lock_file.exists():
|
|
||||||
lock_file = bento.path / "env" / "python" / "requirements.txt"
|
|
||||||
|
|
||||||
reqs = _resolve_packages(lock_file)
|
|
||||||
preheat_packages = _filter_preheat_packages(reqs)
|
|
||||||
ver = ver_file.read_text().strip()
|
|
||||||
return (
|
|
||||||
VenvSpec(
|
|
||||||
python_version=ver,
|
|
||||||
python_packages=preheat_packages,
|
|
||||||
name_prefix=f"{bento.tag.replace(':', '_')}-1-",
|
|
||||||
),
|
|
||||||
VenvSpec(
|
|
||||||
python_version=ver,
|
|
||||||
python_packages=[v.line for v in reqs],
|
|
||||||
name_prefix=f"{bento.tag.replace(':', '_')}-2-",
|
|
||||||
),
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def _get_lib_dir(venv: pathlib.Path) -> pathlib.Path:
|
|
||||||
if os.name == "nt":
|
|
||||||
return venv / "Lib/site-packages"
|
|
||||||
else:
|
|
||||||
return next(venv.glob("lib/python*")) / "site-packages"
|
|
||||||
|
|
||||||
|
|
||||||
def _ensure_venv(
|
|
||||||
env_spec: VenvSpec,
|
|
||||||
parrent_venv: typing.Optional[pathlib.Path] = None,
|
|
||||||
) -> pathlib.Path:
|
|
||||||
venv = VENV_DIR / str(hash(env_spec))
|
|
||||||
if venv.exists() and not (venv / "DONE").exists():
|
|
||||||
shutil.rmtree(venv, ignore_errors=True)
|
|
||||||
if not venv.exists():
|
|
||||||
output(f"Installing model dependencies({venv})...", style="green")
|
|
||||||
|
|
||||||
venv_py = (
|
|
||||||
venv / "Scripts" / "python.exe"
|
|
||||||
if os.name == "nt"
|
|
||||||
else venv / "bin" / "python"
|
|
||||||
)
|
|
||||||
try:
|
|
||||||
run_command(
|
|
||||||
["python", "-m", "uv", "venv", venv],
|
|
||||||
silent=VERBOSE_LEVEL.get() < 10,
|
|
||||||
)
|
|
||||||
lib_dir = _get_lib_dir(venv)
|
|
||||||
if parrent_venv is not None:
|
|
||||||
parent_lib_dir = _get_lib_dir(parrent_venv)
|
|
||||||
with open(lib_dir / f"{parrent_venv.name}.pth", "w+") as f:
|
|
||||||
f.write(str(parent_lib_dir))
|
|
||||||
with open(venv / "requirements.txt", "w") as f:
|
|
||||||
f.write("\n".join(sorted(env_spec.python_packages)))
|
|
||||||
run_command(
|
|
||||||
[
|
|
||||||
"python",
|
|
||||||
"-m",
|
|
||||||
"uv",
|
|
||||||
"pip",
|
|
||||||
"install",
|
|
||||||
"-p",
|
|
||||||
str(venv_py),
|
|
||||||
"-r",
|
|
||||||
venv / "requirements.txt",
|
|
||||||
],
|
|
||||||
silent=VERBOSE_LEVEL.get() < 10,
|
|
||||||
)
|
|
||||||
with open(venv / "DONE", "w") as f:
|
|
||||||
f.write("DONE")
|
|
||||||
except Exception:
|
|
||||||
shutil.rmtree(venv, ignore_errors=True)
|
|
||||||
output(
|
|
||||||
f"Failed to install dependencies to {venv}. Cleaned up.",
|
|
||||||
style="red",
|
|
||||||
)
|
|
||||||
raise typer.Exit(1)
|
|
||||||
output(f"Successfully installed dependencies to {venv}.", style="green")
|
|
||||||
return venv
|
|
||||||
else:
|
|
||||||
return venv
|
|
||||||
|
|
||||||
|
|
||||||
def _ensure_venvs(env_spec_list: Iterable[VenvSpec]) -> pathlib.Path:
|
|
||||||
last_venv = None
|
|
||||||
for env_spec in env_spec_list:
|
|
||||||
last_venv = _ensure_venv(env_spec, last_venv)
|
|
||||||
assert last_venv is not None
|
|
||||||
return last_venv
|
|
||||||
|
|
||||||
|
|
||||||
def ensure_venv(bento: BentoInfo) -> pathlib.Path:
|
|
||||||
return _ensure_venvs(_resolve_bento_env_specs(bento))
|
|
||||||
|
|
||||||
|
|
||||||
def _check_venv(env_spec: VenvSpec) -> bool:
|
|
||||||
venv = VENV_DIR / str(hash(env_spec))
|
|
||||||
if not venv.exists():
|
|
||||||
return False
|
|
||||||
if venv.exists() and not (venv / "DONE").exists():
|
|
||||||
return False
|
|
||||||
return True
|
|
||||||
|
|
||||||
|
|
||||||
def check_venv(bento: BentoInfo) -> bool:
|
|
||||||
return all(_check_venv(env_spec) for env_spec in _resolve_bento_env_specs(bento))
|
|
||||||
package.json
@@ -1,57 +0,0 @@
{
  "name": "@bentoml/openllm-monorepo",
  "version": "0.5.7",
  "description": "OpenLLM: Operating LLMs in production",
  "author": "Aaron Pham <29749331+aarnphm@users.noreply.github.com>",
  "license": "Apache-2.0",
  "repository": {
    "type": "git",
    "url": "git+https://github.com/bentoml/OpenLLM.git"
  },
  "bugs": {
    "url": "https://github.com/bentoml/OpenLLM/issues"
  },
  "engines": {
    "node": ">=18"
  },
  "type": "module",
  "private": true,
  "scripts": {
    "lint": "eslint . --cache",
    "check": "npx prettier . --check",
    "format": "npx prettier . --write"
  },
  "packageManager": "pnpm@8.6.3",
  "devDependencies": {
    "@edge-runtime/vm": "3.1.6",
    "@next/eslint-plugin-next": "13.5.5",
    "@types/node": "20.8.7",
    "@typescript-eslint/eslint-plugin": "6.10.0",
    "@typescript-eslint/parser": "6.8.0",
    "eslint": "^8.53.0",
    "eslint-config-prettier": "9.0.0",
    "eslint-plugin-import": "2.29.0",
    "eslint-plugin-react": "7.33.2",
    "eslint-plugin-react-hooks": "4.6.0",
    "eslint-plugin-tailwindcss": "3.13.0",
    "eslint-plugin-unicorn": "48.0.1",
    "prettier": "^3.0.3",
    "prettier-plugin-pkg": "0.18.0",
    "prettier-plugin-tailwindcss": "0.5.6",
    "turbo": "1.10.16",
    "typescript": "5.2.2"
  },
  "pnpm": {
    "overrides": {
      "vitest": "^0.27.1",
      "react": "^18.2.0",
      "react-dom": "^18.2.0",
      "@mdx-js/react": "^2.3.0",
      "@vercel/analytics": "1.0.2",
      "next-themes": "^0.2.1",
      "next": "^13.4.8",
      "@types/react": "^18.0.15",
      "@types/react-dom": "^18.0.6"
    }
  }
}
pyproject.toml
@@ -1,34 +1,189 @@
-[build-system]
-requires = ["setuptools>=42", "wheel"]
-build-backend = "setuptools.build_meta"
-
 [project]
-name = "openllm-next"
-version = "0.0.1"
-description = "A description of your package."
-authors = [{name = "oasiszero", email = "oasis0.com@gmail.com"}]
-license = {file = "LICENSE"}
-dependencies = [
-  "bentoml",
-  "typer",
-  "questionary",
-  "pyaml",
-  "psutil",
-  "pathlib",
-  "pip_requirements_parser",
-  "nvidia-ml-py",
-  "dulwich",
-  "tabulate",
-  "uv",
-  "openai==1.35.9",
-]
+name = "openllm"
+description = "OpenLLM: Self-hosting LLMs Made Easy."
+readme = { file = "README.md", content-type = "text/markdown" }
+authors = [{ name = "BentoML Team", email = "contact@bentoml.com" }]
+dynamic = ["version"]
+classifiers = [
+  "Development Status :: 5 - Production/Stable",
+  "Environment :: GPU :: NVIDIA CUDA",
+  "Environment :: GPU :: NVIDIA CUDA :: 12",
+  "Environment :: GPU :: NVIDIA CUDA :: 11.8",
+  "Environment :: GPU :: NVIDIA CUDA :: 11.7",
+  "License :: OSI Approved :: Apache Software License",
+  "Topic :: Scientific/Engineering :: Artificial Intelligence",
+  "Topic :: Software Development :: Libraries",
+  "Operating System :: OS Independent",
+  "Intended Audience :: Developers",
+  "Intended Audience :: Science/Research",
+  "Intended Audience :: System Administrators",
+  "Typing :: Typed",
+  "Programming Language :: Python",
+  "Programming Language :: Python :: 3",
+  "Programming Language :: Python :: 3 :: Only",
+  "Programming Language :: Python :: 3.8",
+  "Programming Language :: Python :: 3.9",
+  "Programming Language :: Python :: 3.10",
+  "Programming Language :: Python :: 3.11",
+  "Programming Language :: Python :: 3.12",
+  "Programming Language :: Python :: Implementation :: CPython",
+  "Programming Language :: Python :: Implementation :: PyPy",
+]
+dependencies = [
+  "bentoml",
+  "typer",
+  "questionary",
+  "pyaml",
+  "psutil",
+  "pathlib",
+  "pip_requirements_parser",
+  "nvidia-ml-py",
+  "dulwich",
+  "tabulate",
+  "uv",
+  "openai==1.35.9",
+]
+keywords = [
+  "MLOps",
+  "AI",
+  "BentoML",
+  "Model Serving",
+  "Model Deployment",
+  "LLMOps",
+  "Falcon",
+  "Vicuna",
+  "Llama 2",
+  "Fine tuning",
+  "Serverless",
+  "Large Language Model",
+  "Generative AI",
+  "StableLM",
+  "Alpaca",
+  "PyTorch",
+  "Mistral",
+  "vLLM",
+  "Transformers",
+]
+license = "Apache-2.0"
+requires-python = ">=3.9"
 
 [project.scripts]
-openllm = "openllm_next.__main__:main"
+openllm = "openllm.__main__:app"
+
+[project.urls]
+Blog = "https://modelserving.com"
+Documentation = "https://github.com/bentoml/OpenLLM#readme"
+GitHub = "https://github.com/bentoml/OpenLLM"
+Homepage = "https://bentoml.com"
+Tracker = "https://github.com/bentoml/OpenLLM/issues"
+Twitter = "https://twitter.com/bentomlai"
 
 [tool.typer]
-src-dir = "openllm_next"
+src-dir = "src/openllm"
 
-[tool.isort]
-multi_line_output = 3
-include_trailing_comma = true
+[build-system]
+requires = [
+  "hatchling==1.25.0",
+  "hatch-vcs==0.4.0",
+  "hatch-fancy-pypi-readme==24.1.0",
+]
+build-backend = 'hatchling.build'
+
+[tool.hatch.version]
+source = "vcs"
+fallback-version = "0.0.0"
+[tool.hatch.build.hooks.vcs]
+version-file = "src/openllm/_version.py"
+[tool.hatch.version.raw-options]
+git_describe_command = [
+  "git",
+  "describe",
+  "--dirty",
+  "--tags",
+  "--long",
+  "--first-parent",
+]
+version_scheme = "post-release"
+fallback_version = "0.0.0"
+[tool.hatch.metadata]
+allow-direct-references = true
+[tool.hatch.build.targets.wheel]
+only-include = ["src/openllm"]
+sources = ["src"]
+[tool.hatch.build.targets.sdist]
+exclude = ["/.git_archival.txt", "/.python-version-default"]
+
+[tool.hatch.metadata.hooks.fancy-pypi-readme]
+content-type = "text/markdown"
+# PyPI doesn't support the <picture> tag.
+[[tool.hatch.metadata.hooks.fancy-pypi-readme.fragments]]
+text = """
+<p align="center">
+  <a href="https://github.com/bentoml/openllm">
+    <img src="https://raw.githubusercontent.com/bentoml/openllm/main/.github/assets/main-banner.png" alt="Banner for OpenLLM" />
+  </a>
+</p>
+"""
+[[tool.hatch.metadata.hooks.fancy-pypi-readme.fragments]]
+end-before = "\n<!-- hatch-fancy-pypi-readme intro stop -->"
+path = "README.md"
+start-after = "<!-- hatch-fancy-pypi-readme intro start -->\n"
+[[tool.hatch.metadata.hooks.fancy-pypi-readme.fragments]]
+text = """
+<p align="center">
+  <img src="https://raw.githubusercontent.com/bentoml/openllm/main/.github/assets/output.gif" alt="Gif showing OpenLLM Intro" />
+</p>
+"""
+[[tool.hatch.metadata.hooks.fancy-pypi-readme.fragments]]
+end-before = "\n<!-- hatch-fancy-pypi-readme meta stop -->"
+path = "README.md"
+start-after = "<!-- hatch-fancy-pypi-readme meta start -->\n"
+[[tool.hatch.metadata.hooks.fancy-pypi-readme.fragments]]
+text = """
+## Release Information
+"""
+[[tool.hatch.metadata.hooks.fancy-pypi-readme.fragments]]
+path = "CHANGELOG.md"
+pattern = "\n(###.+?\n)## "
+start-after = "<!-- towncrier release notes start -->"
+[[tool.hatch.metadata.hooks.fancy-pypi-readme.fragments]]
+text = """
+---
+[Click me for full changelog](https://github.com/bentoml/openllm/blob/main/CHANGELOG.md)
+"""
+
+[tool.towncrier]
+directory = "changelog.d"
+filename = "CHANGELOG.md"
+issue_format = "[#{issue}](https://github.com/bentoml/openllm/issues/{issue})"
+name = "openllm"
+start_string = "<!-- towncrier release notes start -->\n"
+template = "changelog.d/template.md.jinja"
+title_format = ""
+underlines = ["", "", ""]
+[[tool.towncrier.section]]
+path = ""
+[[tool.towncrier.type]]
+directory = "breaking"
+name = "Backwards-incompatible Changes"
+showcontent = true
+[[tool.towncrier.type]]
+directory = "deprecation"
+name = "Deprecations"
+showcontent = true
+[[tool.towncrier.type]]
+directory = "change"
+name = "Changes"
+showcontent = true
+[[tool.towncrier.type]]
+directory = "refactor"
+name = "Refactor"
+showcontent = true
+[[tool.towncrier.type]]
+directory = "feature"
+name = "Features"
+showcontent = true
+[[tool.towncrier.type]]
+directory = "fix"
+name = "Bug fix"
+showcontent = true
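Since the package version is now dynamic (hatch-vcs writes src/openllm/_version.py at build time), a quick sketch of reading the resolved version back at runtime (assuming a wheel built from this configuration is installed):

import importlib.metadata

print(importlib.metadata.version("openllm"))  # derived from the latest git tag, or 0.0.0 as a fallback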
|||||||
274
src/openllm/__main__.py
Normal file
274
src/openllm/__main__.py
Normal file
@@ -0,0 +1,274 @@
|
|||||||
|
import os
|
||||||
|
import random
|
||||||
|
import sys
|
||||||
|
import platform
|
||||||
|
import importlib.metadata
|
||||||
|
from collections import defaultdict
|
||||||
|
from typing import Annotated, Optional
|
||||||
|
|
||||||
|
import questionary
|
||||||
|
import typer
|
||||||
|
|
||||||
|
from openllm.accelerator_spec import DeploymentTarget, can_run, get_local_machine_spec
|
||||||
|
from openllm.analytic import DO_NOT_TRACK, OpenLLMTyper
|
||||||
|
from openllm.clean import app as clean_app
|
||||||
|
from openllm.cloud import deploy as cloud_deploy
|
||||||
|
from openllm.cloud import ensure_cloud_context, get_cloud_machine_spec
|
||||||
|
from openllm.common import CHECKED, INTERACTIVE, VERBOSE_LEVEL, output
|
||||||
|
from openllm.local import run as local_run
|
||||||
|
from openllm.local import serve as local_serve
|
||||||
|
from openllm.model import app as model_app
|
||||||
|
from openllm.model import ensure_bento, list_bento
|
||||||
|
from openllm.repo import app as repo_app
|
||||||
|
|
||||||
|
app = OpenLLMTyper(
|
||||||
|
help='`openllm hello` to get started. '
|
||||||
|
  'OpenLLM is a CLI tool to manage and deploy open source LLMs and'
  ' get an OpenAI API compatible chat server in seconds.'
)

app.add_typer(repo_app, name='repo')
app.add_typer(model_app, name='model')
app.add_typer(clean_app, name='clean')


def _select_bento_name(models, target):
  from tabulate import tabulate

  options = []
  model_infos = [[model.repo.name, model.name, can_run(model, target)] for model in models]
  model_name_groups = defaultdict(lambda: 0)
  for repo, name, score in model_infos:
    model_name_groups[(repo, name)] += score
  table_data = [[name, repo, CHECKED if score > 0 else ''] for (repo, name), score in model_name_groups.items()]
  if not table_data:
    output('No model found', style='red')
    raise typer.Exit(1)
  table = tabulate(table_data, headers=['model', 'repo', 'locally runnable']).split('\n')
  headers = f'{table[0]}\n {table[1]}'

  options.append(questionary.Separator(headers))
  for table_data, table_line in zip(table_data, table[2:]):
    options.append(questionary.Choice(table_line, value=table_data[:2]))
  selected = questionary.select('Select a model', options).ask()
  if selected is None:
    raise typer.Exit(1)
  return selected


def _select_bento_version(models, target, bento_name, repo):
  from tabulate import tabulate

  model_infos = [
    [model, can_run(model, target)] for model in models if model.name == bento_name and model.repo.name == repo
  ]

  table_data = [
    [model.tag, CHECKED if score > 0 else '']
    for model, score in model_infos
    if model.name == bento_name and model.repo.name == repo
  ]
  if not table_data:
    output(f'No model found for {bento_name} in {repo}', style='red')
    raise typer.Exit(1)
  table = tabulate(table_data, headers=['version', 'locally runnable']).split('\n')

  options = []
  options.append(questionary.Separator(f'{table[0]}\n {table[1]}'))
  for table_data, table_line in zip(model_infos, table[2:]):
    options.append(questionary.Choice(table_line, value=table_data))
  selected = questionary.select('Select a version', options).ask()
  if selected is None:
    raise typer.Exit(1)
  return selected


def _select_target(bento, targets):
  from tabulate import tabulate

  options = []
  targets.sort(key=lambda x: can_run(bento, x), reverse=True)
  if not targets:
    output('No available instance type, check your bentocloud account', style='red')
    raise typer.Exit(1)

  table = tabulate(
    [
      [
        target.name,
        target.accelerators_repr,
        f'${target.price}',
        CHECKED if can_run(bento, target) else 'insufficient res.',
      ]
      for target in targets
    ],
    headers=['instance type', 'accelerator', 'price/hr', 'deployable'],
  ).split('\n')
  options.append(questionary.Separator(f'{table[0]}\n {table[1]}'))

  for target, line in zip(targets, table[2:]):
    options.append(questionary.Choice(f'{line}', value=target))
  selected = questionary.select('Select an instance type', options).ask()
  if selected is None:
    raise typer.Exit(1)
  return selected


def _select_action(bento, score):
  if score > 0:
    options = [
      questionary.Separator('Available actions'),
      questionary.Choice('0. Run the model in terminal', value='run', shortcut_key='0'),
      questionary.Separator(f' $ openllm run {bento}'),
      questionary.Separator(' '),
      questionary.Choice('1. Serve the model locally and get a chat server', value='serve', shortcut_key='1'),
      questionary.Separator(f' $ openllm serve {bento}'),
      questionary.Separator(' '),
      questionary.Choice(
        '2. Deploy the model to bentocloud and get a scalable chat server', value='deploy', shortcut_key='2'
      ),
      questionary.Separator(f' $ openllm deploy {bento}'),
    ]
  else:
    options = [
      questionary.Separator('Available actions'),
      questionary.Choice('0. Run the model in terminal', value='run', disabled='insufficient res.', shortcut_key='0'),
      questionary.Separator(f' $ openllm run {bento}'),
      questionary.Separator(' '),
      questionary.Choice(
        '1. Serve the model locally and get a chat server',
        value='serve',
        disabled='insufficient res.',
        shortcut_key='1',
      ),
      questionary.Separator(f' $ openllm serve {bento}'),
      questionary.Separator(' '),
      questionary.Choice(
        '2. Deploy the model to bentocloud and get a scalable chat server', value='deploy', shortcut_key='2'
      ),
      questionary.Separator(f' $ openllm deploy {bento}'),
    ]
  action = questionary.select('Select an action', options).ask()
  if action is None:
    raise typer.Exit(1)
  if action == 'run':
    try:
      local_run(bento)
    finally:
      output('\nUse this command to run the action again:', style='green')
      output(f' $ openllm run {bento}', style='orange')
  elif action == 'serve':
    try:
      local_serve(bento)
    finally:
      output('\nUse this command to run the action again:', style='green')
      output(f' $ openllm serve {bento}', style='orange')
  elif action == 'deploy':
    ensure_cloud_context()
    targets = get_cloud_machine_spec()
    target = _select_target(bento, targets)
    try:
      cloud_deploy(bento, target)
    finally:
      output('\nUse this command to run the action again:', style='green')
      output(f' $ openllm deploy {bento} --instance-type {target.name}', style='orange')


@app.command(help='get started interactively')
def hello():
  INTERACTIVE.set(True)
  VERBOSE_LEVEL.set(20)

  target = get_local_machine_spec()
  output(f' Detected Platform: {target.platform}', style='green')
  if target.accelerators:
    output(' Detected Accelerators: ', style='green')
    for a in target.accelerators:
      output(f'  - {a.model} {a.memory_size}GB', style='green')
  else:
    output(' Detected Accelerators: None', style='yellow')

  models = list_bento()
  if not models:
    output('No model found, you probably need to update the model repo:', style='red')
    output(' $ openllm repo update', style='orange')
    raise typer.Exit(1)

  bento_name, repo = _select_bento_name(models, target)
  bento, score = _select_bento_version(models, target, bento_name, repo)
  _select_action(bento, score)


@app.command(help='start an OpenAI API compatible chat server and chat in browser')
def serve(
  model: Annotated[str, typer.Argument()] = '', repo: Optional[str] = None, port: int = 3000, verbose: bool = False
):
  if verbose:
    VERBOSE_LEVEL.set(20)
  target = get_local_machine_spec()
  bento = ensure_bento(model, target=target, repo_name=repo)
  local_serve(bento, port=port)


@app.command(help='run the model and chat in terminal')
def run(
  model: Annotated[str, typer.Argument()] = '',
  repo: Optional[str] = None,
  port: Optional[int] = None,
  timeout: int = 600,
  verbose: bool = False,
):
  if verbose:
    VERBOSE_LEVEL.set(20)
  target = get_local_machine_spec()
  bento = ensure_bento(model, target=target, repo_name=repo)
  if port is None:
    port = random.randint(30000, 40000)
  local_run(bento, port=port, timeout=timeout)


@app.command(help='deploy a production-ready OpenAI API compatible chat server to bentocloud ($100 free credit)')
def deploy(
  model: Annotated[str, typer.Argument()] = '',
  instance_type: Optional[str] = None,
  repo: Optional[str] = None,
  verbose: bool = False,
):
  if verbose:
    VERBOSE_LEVEL.set(20)
  bento = ensure_bento(model, repo_name=repo)
  if instance_type is not None:
    cloud_deploy(bento, DeploymentTarget(name=instance_type))
    return
  targets = get_cloud_machine_spec()
  targets = filter(lambda x: can_run(bento, x) > 0, targets)
  targets = sorted(targets, key=lambda x: can_run(bento, x), reverse=True)
  if not targets:
    output('No available instance type, check your bentocloud account', style='red')
    raise typer.Exit(1)
  target = targets[0]
  output(f'Recommended instance type: {target.name}', style='green')
  cloud_deploy(bento, target)


@app.callback(invoke_without_command=True)
def typer_callback(
  verbose: int = 0,
  do_not_track: bool = typer.Option(
    False, '--do-not-track', help='Whether to disable usage tracking', envvar=DO_NOT_TRACK
  ),
  version: bool = typer.Option(False, '--version', '-v', help='Show version'),
):
  if verbose:
    VERBOSE_LEVEL.set(verbose)
  if version:
    output(
      f"openllm, {importlib.metadata.version('openllm')}\nPython ({platform.python_implementation()}) {platform.python_version()}"
    )
    sys.exit(0)
  if do_not_track:
    os.environ[DO_NOT_TRACK] = str(True)


if __name__ == '__main__':
  app()
144
src/openllm/accelerator_spec.py
Normal file
@@ -0,0 +1,144 @@
from __future__ import annotations

import functools
import math
import typing
from types import SimpleNamespace

import psutil

from openllm.common import BentoInfo, DeploymentTarget, output


class Accelerator(SimpleNamespace):
  model: str
  memory_size: float

  def __gt__(self, other):
    return self.memory_size > other.memory_size

  def __eq__(self, other):
    return self.memory_size == other.memory_size

  def __repr__(self):
    return f'{self.model}({self.memory_size}GB)'


class Resource(SimpleNamespace):
  cpu: int = 0
  memory: float
  gpu: int = 0
  gpu_type: str = ''

  def __hash__(self):
    return hash((self.cpu, self.memory, self.gpu, self.gpu_type))

  def __bool__(self):
    return any(value is not None for value in self.__dict__.values())


ACCELERATOR_SPEC_DICT: dict[str, dict] = {
  'nvidia-gtx-1650': {'model': 'GTX 1650', 'memory_size': 4.0},
  'nvidia-gtx-1060': {'model': 'GTX 1060', 'memory_size': 6.0},
  'nvidia-gtx-1080-ti': {'model': 'GTX 1080 Ti', 'memory_size': 11.0},
  'nvidia-rtx-3060': {'model': 'RTX 3060', 'memory_size': 12.0},
  'nvidia-rtx-3060-ti': {'model': 'RTX 3060 Ti', 'memory_size': 8.0},
  'nvidia-rtx-3070-ti': {'model': 'RTX 3070 Ti', 'memory_size': 8.0},
  'nvidia-rtx-3080': {'model': 'RTX 3080', 'memory_size': 10.0},
  'nvidia-rtx-3080-ti': {'model': 'RTX 3080 Ti', 'memory_size': 12.0},
  'nvidia-rtx-3090': {'model': 'RTX 3090', 'memory_size': 24.0},
  'nvidia-rtx-4070-ti': {'model': 'RTX 4070 Ti', 'memory_size': 12.0},
  'nvidia-tesla-p4': {'model': 'P4', 'memory_size': 8.0},
  'nvidia-tesla-p100': {'model': 'P100', 'memory_size': 16.0},
  'nvidia-tesla-k80': {'model': 'K80', 'memory_size': 12.0},
  'nvidia-tesla-t4': {'model': 'T4', 'memory_size': 16.0},
  'nvidia-tesla-v100': {'model': 'V100', 'memory_size': 16.0},
  'nvidia-l4': {'model': 'L4', 'memory_size': 24.0},
  'nvidia-tesla-l4': {'model': 'L4', 'memory_size': 24.0},
  'nvidia-tesla-a10g': {'model': 'A10G', 'memory_size': 24.0},
  'nvidia-a100-80g': {'model': 'A100', 'memory_size': 80.0},
  'nvidia-a100-80gb': {'model': 'A100', 'memory_size': 80.0},
  'nvidia-tesla-a100': {'model': 'A100', 'memory_size': 40.0},
}


ACCELERATOR_SPECS: dict[str, Accelerator] = {key: Accelerator(**value) for key, value in ACCELERATOR_SPEC_DICT.items()}


@functools.lru_cache
def get_local_machine_spec():
  if psutil.MACOS:
    return DeploymentTarget(accelerators=[], source='local', platform='macos')

  if psutil.WINDOWS:
    platform = 'windows'
  elif psutil.LINUX:
    platform = 'linux'
  else:
    raise NotImplementedError('Unsupported platform')

  from pynvml import (
    nvmlDeviceGetCount,
    nvmlDeviceGetCudaComputeCapability,
    nvmlDeviceGetHandleByIndex,
    nvmlDeviceGetMemoryInfo,
    nvmlDeviceGetName,
    nvmlInit,
    nvmlShutdown,
  )

  try:
    nvmlInit()
    device_count = nvmlDeviceGetCount()
    accelerators: list[Accelerator] = []
    for i in range(device_count):
      handle = nvmlDeviceGetHandleByIndex(i)
      name = nvmlDeviceGetName(handle)
      memory_info = nvmlDeviceGetMemoryInfo(handle)
      accelerators.append(Accelerator(model=name, memory_size=math.ceil(int(memory_info.total) / 1024**3)))
      compute_capability = nvmlDeviceGetCudaComputeCapability(handle)
      if compute_capability < (7, 5):
        output(
          f'GPU {name} with compute capability {compute_capability} '
          'may not be supported, 7.5 or higher is recommended. check '
          'https://developer.nvidia.com/cuda-gpus for more information',
          style='yellow',
        )
    nvmlShutdown()
    return DeploymentTarget(accelerators=accelerators, source='local', platform=platform)
  except Exception as e:
    output(
      'Failed to get local GPU info. Ensure nvidia driver is installed to enable local GPU deployment', style='yellow'
    )
    output(f'Error: {e}', style='red', level=20)
    return DeploymentTarget(accelerators=[], source='local', platform=platform)


@functools.lru_cache()
def can_run(bento: typing.Union[Resource, BentoInfo], target: typing.Optional[DeploymentTarget] = None) -> float:
  """
  Calculate if the bento can be deployed on the target.
  """
  if target is None:
    target = get_local_machine_spec()

  resource_spec = Resource(**(bento.bento_yaml['services'][0]['config'].get('resources', {})))
  labels = bento.bento_yaml.get('labels', {})
  platforms = labels.get('platforms', 'linux').split(',')

  if target.platform not in platforms:
    return 0.0

  # return 0.5 if no resource is specified
  if not resource_spec:
    return 0.5

  if resource_spec.gpu > 0:
    required_gpu = ACCELERATOR_SPECS[resource_spec.gpu_type]
    filtered_accelerators = [ac for ac in target.accelerators if ac.memory_size >= required_gpu.memory_size]
    if resource_spec.gpu > len(filtered_accelerators):
      return 0.0
    return required_gpu.memory_size * resource_spec.gpu / sum(ac.memory_size for ac in target.accelerators)
  if target.accelerators:
    return 0.01 / sum(ac.memory_size for ac in target.accelerators)
  return 1.0
102
src/openllm/analytic.py
Normal file
@@ -0,0 +1,102 @@
from __future__ import annotations

import functools
import os
import re
import time
import typing
from abc import ABC

import attr
import click
import typer
import typer.core

DO_NOT_TRACK = 'BENTOML_DO_NOT_TRACK'


class EventMeta(ABC):
  @property
  def event_name(self):
    # camel case to snake case
    event_name = re.sub(r'(?<!^)(?=[A-Z])', '_', self.__class__.__name__).lower()
    # remove "_event" suffix
    suffix_to_remove = '_event'
    if event_name.endswith(suffix_to_remove):
      event_name = event_name[: -len(suffix_to_remove)]
    return event_name


@attr.define
class CliEvent(EventMeta):
  cmd_group: str
  cmd_name: str
  duration_in_ms: float = attr.field(default=0)
  error_type: typing.Optional[str] = attr.field(default=None)
  return_code: typing.Optional[int] = attr.field(default=None)


@attr.define
class OpenllmCliEvent(CliEvent):
  pass


class OrderedCommands(typer.core.TyperGroup):
  def list_commands(self, _: click.Context) -> typing.Iterable[str]:
    return list(self.commands)


class OpenLLMTyper(typer.Typer):
  def __init__(self, *args: typing.Any, **kwargs: typing.Any):
    no_args_is_help = kwargs.pop('no_args_is_help', True)
    context_settings = kwargs.pop('context_settings', {})
    if 'help_option_names' not in context_settings:
      context_settings['help_option_names'] = ('-h', '--help')
    if 'max_content_width' not in context_settings:
      context_settings['max_content_width'] = int(os.environ.get('COLUMNS', str(120)))
    klass = kwargs.pop('cls', OrderedCommands)

    super().__init__(*args, cls=klass, no_args_is_help=no_args_is_help, context_settings=context_settings, **kwargs)

  def command(self, *args: typing.Any, **kwargs: typing.Any):
    def decorator(f):
      @functools.wraps(f)
      @click.pass_context
      def wrapped(ctx: click.Context, *args, **kwargs):
        from bentoml._internal.utils.analytics import track

        do_not_track = os.environ.get(DO_NOT_TRACK, str(False)).lower() == 'true'

        # so we know that the root program is openllm
        command_name = ctx.info_name
        if ctx.parent.parent is not None:
          # openllm model list
          command_group = ctx.parent.info_name
        elif ctx.parent.info_name == ctx.find_root().info_name:
          # openllm run
          command_group = 'openllm'

        if do_not_track:
          return f(*args, **kwargs)
        start_time = time.time_ns()
        try:
          return_value = f(*args, **kwargs)
          duration_in_ns = time.time_ns() - start_time
          track(OpenllmCliEvent(cmd_group=command_group, cmd_name=command_name, duration_in_ms=duration_in_ns / 1e6))
          return return_value
        except BaseException as e:
          duration_in_ns = time.time_ns() - start_time
          track(
            OpenllmCliEvent(
              cmd_group=command_group,
              cmd_name=command_name,
              duration_in_ms=duration_in_ns / 1e6,
              error_type=type(e).__name__,
              return_code=2 if isinstance(e, KeyboardInterrupt) else 1,
            )
          )
          raise

      return typer.Typer.command(self, *args, **kwargs)(wrapped)

    return decorator
66
src/openllm/clean.py
Normal file
@@ -0,0 +1,66 @@
import pathlib
import shutil

import questionary

from openllm.analytic import OpenLLMTyper
from openllm.common import CONFIG_FILE, REPO_DIR, VENV_DIR, VERBOSE_LEVEL, output

app = OpenLLMTyper(help='clean up and release disk space used by OpenLLM')


HUGGINGFACE_CACHE = pathlib.Path.home() / '.cache' / 'huggingface' / 'hub'


@app.command(help='Clean up all the cached models from huggingface')
def model_cache(verbose: bool = False):
  if verbose:
    VERBOSE_LEVEL.set(20)
  used_space = sum(f.stat().st_size for f in HUGGINGFACE_CACHE.rglob('*'))
  sure = questionary.confirm(
    f'This will remove all models cached by Huggingface (~{used_space / 1024 / 1024:.2f}MB), are you sure?'
  ).ask()
  if not sure:
    return
  shutil.rmtree(HUGGINGFACE_CACHE, ignore_errors=True)
  output('All models cached by Huggingface have been removed', style='green')


@app.command(help='Clean up all the virtual environments created by OpenLLM')
def venvs(verbose: bool = False):
  if verbose:
    VERBOSE_LEVEL.set(20)
  used_space = sum(f.stat().st_size for f in VENV_DIR.rglob('*'))
  sure = questionary.confirm(
    f'This will remove all virtual environments created by OpenLLM (~{used_space / 1024 / 1024:.2f}MB), are you sure?'
  ).ask()
  if not sure:
    return
  shutil.rmtree(VENV_DIR, ignore_errors=True)
  output('All virtual environments have been removed', style='green')


@app.command(help='Clean up all the repositories cloned by OpenLLM')
def repos(verbose: bool = False):
  if verbose:
    VERBOSE_LEVEL.set(20)
  shutil.rmtree(REPO_DIR, ignore_errors=True)
  output('All repositories have been removed', style='green')


@app.command(help='Reset configurations to default')
def configs(verbose: bool = False):
  if verbose:
    VERBOSE_LEVEL.set(20)
  shutil.rmtree(CONFIG_FILE, ignore_errors=True)
  output('All configurations have been reset', style='green')


@app.command(name='all', help='Clean up all above and bring OpenLLM to a fresh start')
def all_cache(verbose: bool = False):
  if verbose:
    VERBOSE_LEVEL.set(20)
  repos()
  venvs()
  model_cache()
  configs()
130
src/openllm/cloud.py
Normal file
@@ -0,0 +1,130 @@
import json
import os
import pathlib
import shutil
import subprocess
import typing

import typer

from openllm.accelerator_spec import ACCELERATOR_SPECS
from openllm.analytic import OpenLLMTyper
from openllm.common import INTERACTIVE, BentoInfo, DeploymentTarget, output, run_command

app = OpenLLMTyper()


def _get_deploy_cmd(bento: BentoInfo, target: typing.Optional[DeploymentTarget] = None):
  cmd = ['bentoml', 'deploy', bento.bentoml_tag]
  env = {'BENTOML_HOME': f'{bento.repo.path}/bentoml'}

  required_envs = bento.bento_yaml.get('envs', [])
  required_env_names = [env['name'] for env in required_envs if 'name' in env]
  if required_env_names:
    output(f'This model requires the following environment variables to run: {required_env_names!r}', style='yellow')

  for env_info in bento.bento_yaml.get('envs', []):
    if 'name' not in env_info:
      continue
    if os.environ.get(env_info['name']):
      default = os.environ[env_info['name']]
    elif 'value' in env_info:
      default = env_info['value']
    else:
      default = ''

    if INTERACTIVE.get():
      import questionary

      value = questionary.text(f"{env_info['name']}:", default=default).ask()
    else:
      if default == '':
        output(f"Environment variable {env_info['name']} is required but not provided", style='red')
        raise typer.Exit(1)
      else:
        value = default

    if value is None:
      raise typer.Exit(1)
    cmd += ['--env', f"{env_info['name']}={value}"]

  if target:
    cmd += ['--instance-type', target.name]

  assert (pathlib.Path.home() / 'bentoml' / '.yatai.yaml').exists()
  shutil.copy(pathlib.Path.home() / 'bentoml' / '.yatai.yaml', bento.repo.path / 'bentoml' / '.yatai.yaml')

  return cmd, env, None


def ensure_cloud_context():
  import questionary

  cmd = ['bentoml', 'cloud', 'current-context']
  try:
    result = subprocess.check_output(cmd, stderr=subprocess.DEVNULL)
    context = json.loads(result)
    output(f" bentoml already logged in: {context['endpoint']}", style='green')
  except subprocess.CalledProcessError:
    output(' bentoml not logged in', style='red')
    if not INTERACTIVE.get():
      output('\n get bentoml logged in by:')
      output(' $ bentoml cloud login', style='orange')
      output('')
      output(
        """ * you may need to visit https://cloud.bentoml.com to get an account. you can also bring your own bentoml cluster (BYOC) to your team from https://bentoml.com/contact""",
        style='yellow',
      )
      raise typer.Exit(1)
    else:
      action = questionary.select(
        'Choose an action:', choices=['I have a BentoCloud account', 'get an account in two minutes']
      ).ask()
      if action is None:
        raise typer.Exit(1)
      elif action == 'get an account in two minutes':
        output('Please visit https://cloud.bentoml.com to get your token', style='yellow')
      endpoint = questionary.text('Enter the endpoint: (similar to https://my-org.cloud.bentoml.com)').ask()
      if endpoint is None:
        raise typer.Exit(1)
      token = questionary.text('Enter your token: (similar to cniluaxxxxxxxx)').ask()
      if token is None:
        raise typer.Exit(1)
      cmd = ['bentoml', 'cloud', 'login', '--api-token', token, '--endpoint', endpoint]
      try:
        result = subprocess.check_output(cmd)
        output(' Logged in successfully', style='green')
      except subprocess.CalledProcessError:
        output(' Failed to login', style='red')
        raise typer.Exit(1)


def get_cloud_machine_spec():
  ensure_cloud_context()
  cmd = ['bentoml', 'deployment', 'list-instance-types', '-o', 'json']
  try:
    result = subprocess.check_output(cmd, stderr=subprocess.DEVNULL)
    instance_types = json.loads(result)
    return [
      DeploymentTarget(
        source='cloud',
        name=it['name'],
        price=it['price'],
        platform='linux',
        accelerators=(
          [ACCELERATOR_SPECS[it['gpu_type']] for _ in range(int(it['gpu']))]
          if it.get('gpu') and it['gpu_type'] in ACCELERATOR_SPECS
          else []
        ),
      )
      for it in instance_types
    ]
  except (subprocess.CalledProcessError, json.JSONDecodeError):
    output('Failed to get cloud instance types', style='red')
    return []


def deploy(bento: BentoInfo, target: DeploymentTarget):
  ensure_cloud_context()
  cmd, env, cwd = _get_deploy_cmd(bento, target)
  run_command(cmd, env=env, cwd=cwd)
370
src/openllm/common.py
Normal file
@@ -0,0 +1,370 @@
from __future__ import annotations

import asyncio
import functools
import hashlib
import io
import json
import os
import pathlib
import signal
import subprocess
import sys
import sysconfig
import typing
from contextlib import asynccontextmanager, contextmanager
from types import SimpleNamespace

import typer
import typer.core

ERROR_STYLE = 'red'
SUCCESS_STYLE = 'green'


CLLAMA_HOME = pathlib.Path.home() / '.openllm'
REPO_DIR = CLLAMA_HOME / 'repos'
TEMP_DIR = CLLAMA_HOME / 'temp'
VENV_DIR = CLLAMA_HOME / 'venv'

REPO_DIR.mkdir(exist_ok=True, parents=True)
TEMP_DIR.mkdir(exist_ok=True, parents=True)
VENV_DIR.mkdir(exist_ok=True, parents=True)

CONFIG_FILE = CLLAMA_HOME / 'config.json'

CHECKED = '☆'

T = typing.TypeVar('T')


class ContextVar(typing.Generic[T]):
  def __init__(self, default: T):
    self._stack: list[T] = []
    self._default = default

  def get(self) -> T:
    if self._stack:
      return self._stack[-1]
    return self._default

  def set(self, value):
    self._stack.append(value)

  @contextmanager
  def patch(self, value):
    self._stack.append(value)
    try:
      yield
    finally:
      self._stack.pop()


VERBOSE_LEVEL = ContextVar(10)
INTERACTIVE = ContextVar(False)
FORCE = ContextVar(False)


def output(content, level=0, style=None, end=None):
  import questionary

  if level > VERBOSE_LEVEL.get():
    return

  if not isinstance(content, str):
    import pyaml

    out = io.StringIO()
    pyaml.pprint(content, dst=out, sort_dicts=False, sort_keys=False)
    questionary.print(out.getvalue(), style=style, end='' if end is None else end)
    out.close()

  if isinstance(content, str):
    questionary.print(content, style=style, end='\n' if end is None else end)


class Config(SimpleNamespace):
  repos: dict[str, str] = {'default': 'git+https://github.com/bentoml/openllm-models@main'}
  default_repo: str = 'default'

  def tolist(self):
    return dict(repos=self.repos, default_repo=self.default_repo)


def load_config():
  if CONFIG_FILE.exists():
    try:
      with open(CONFIG_FILE) as f:
        return Config(**json.load(f))
    except json.JSONDecodeError:
      return Config()
  return Config()


def save_config(config):
  with open(CONFIG_FILE, 'w') as f:
    json.dump(config.tolist(), f, indent=2)


class RepoInfo(SimpleNamespace):
  name: str
  path: pathlib.Path
  url: str
  server: str
  owner: str
  repo: str
  branch: str

  def tolist(self):
    if VERBOSE_LEVEL.get() <= 0:
      return f'{self.name} ({self.url})'
    if VERBOSE_LEVEL.get() <= 10:
      return dict(name=self.name, url=self.url, path=str(self.path))
    if VERBOSE_LEVEL.get() <= 20:
      return dict(
        name=self.name,
        url=self.url,
        path=str(self.path),
        server=self.server,
        owner=self.owner,
        repo=self.repo,
        branch=self.branch,
      )


class BentoInfo(SimpleNamespace):
  repo: RepoInfo
  path: pathlib.Path
  alias: str = ''

  def __str__(self):
    if self.repo.name == 'default':
      return f'{self.tag}'
    else:
      return f'{self.repo.name}/{self.tag}'

  def __hash__(self):
    return md5(str(self.path))

  @property
  def tag(self) -> str:
    if self.alias:
      return f'{self.path.parent.name}:{self.alias}'
    return f'{self.path.parent.name}:{self.path.name}'

  @property
  def bentoml_tag(self) -> str:
    return f'{self.path.parent.name}:{self.path.name}'

  @property
  def name(self) -> str:
    return self.path.parent.name

  @property
  def version(self) -> str:
    return self.path.name

  @property
  def labels(self) -> dict[str, str]:
    return self.bento_yaml['labels']

  @functools.cached_property
  def bento_yaml(self) -> dict:
    import yaml

    bento_file = self.path / 'bento.yaml'
    return yaml.safe_load(bento_file.read_text())

  @functools.cached_property
  def platforms(self) -> list[str]:
    return self.bento_yaml['labels'].get('platforms', 'linux').split(',')

  @functools.cached_property
  def pretty_yaml(self) -> dict:
    def _pretty_routes(routes):
      return {
        route['route']: {
          'input': {k: v['type'] for k, v in route['input']['properties'].items()},
          'output': route['output']['type'],
        }
        for route in routes
      }

    if len(self.bento_yaml['services']) == 1:
      pretty_yaml = {
        'apis': _pretty_routes(self.bento_yaml['schema']['routes']),
        'resources': self.bento_yaml['services'][0]['config']['resources'],
        'envs': self.bento_yaml['envs'],
        'platforms': self.platforms,
      }
      return pretty_yaml
    return self.bento_yaml

  @functools.cached_property
  def pretty_gpu(self) -> str:
    from openllm.accelerator_spec import ACCELERATOR_SPECS

    try:
      resources = self.bento_yaml['services'][0]['config']['resources']
      if resources['gpu'] > 1:
        acc = ACCELERATOR_SPECS[resources['gpu_type']]
        return f"{acc.memory_size:.0f}Gx{resources['gpu']}"
      elif resources['gpu'] > 0:
        acc = ACCELERATOR_SPECS[resources['gpu_type']]
        return f'{acc.memory_size:.0f}G'
    except KeyError:
      pass
    return ''

  def tolist(self):
    verbose = VERBOSE_LEVEL.get()
    if verbose <= 0:
      return str(self)
    if verbose <= 10:
      return dict(tag=self.tag, repo=self.repo.tolist(), path=str(self.path), model_card=self.pretty_yaml)
    if verbose <= 20:
      return dict(tag=self.tag, repo=self.repo.tolist(), path=str(self.path), bento_yaml=self.bento_yaml)


class VenvSpec(SimpleNamespace):
  python_version: str
  python_packages: dict[str, str]
  name_prefix = ''

  def __hash__(self):
    return md5(
      # self.python_version,
      *sorted(self.python_packages)
    )


class Accelerator(SimpleNamespace):
  model: str
  memory_size: float

  def __gt__(self, other):
    return self.memory_size > other.memory_size

  def __eq__(self, other):
    return self.memory_size == other.memory_size


class DeploymentTarget(SimpleNamespace):
  source: str = 'local'
  name: str = 'local'
  price: str = ''
  platform = 'linux'
  accelerators: list[Accelerator]

  def __hash__(self):
    return hash(self.source)

  @property
  def accelerators_repr(self) -> str:
    accs = {a.model for a in self.accelerators}
    if len(accs) == 0:
      return 'null'
    if len(accs) == 1:
      a = self.accelerators[0]
      return f'{a.model} x{len(self.accelerators)}'
    return ', '.join((f'{a.model}' for a in self.accelerators))


def run_command(cmd, cwd=None, env=None, copy_env=True, venv=None, silent=False) -> subprocess.CompletedProcess:
  import shlex

  env = env or {}
  cmd = [str(c) for c in cmd]
  bin_dir = 'Scripts' if os.name == 'nt' else 'bin'
  if not silent:
    output('\n')
    if cwd:
      output(f'$ cd {cwd}', style='orange')
    if env:
      for k, v in env.items():
        output(f'$ export {k}={shlex.quote(v)}', style='orange')
    if venv:
      output(f"$ source {venv / 'bin' / 'activate'}", style='orange')
    output(f"$ {' '.join(cmd)}", style='orange')

  if venv:
    py = venv / bin_dir / f"python{sysconfig.get_config_var('EXE')}"
  else:
    py = sys.executable

  if copy_env:
    env = {**os.environ, **env}

  if cmd and cmd[0] == 'bentoml':
    cmd = [py, '-m', 'bentoml'] + cmd[1:]
  if cmd and cmd[0] == 'python':
    cmd = [py] + cmd[1:]

  try:
    if silent:
      return subprocess.run(  # type: ignore
        cmd, cwd=cwd, env=env, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
      )
    else:
      return subprocess.run(cmd, cwd=cwd, env=env)
  except subprocess.CalledProcessError:
    output('Command failed', style='red')
    raise typer.Exit(1)


async def stream_command_output(stream, style='gray'):
  async for line in stream:
    output(line.decode(), style=style, end='')


@asynccontextmanager
async def async_run_command(cmd, cwd=None, env=None, copy_env=True, venv=None, silent=True):
  import shlex

  env = env or {}
  cmd = [str(c) for c in cmd]

  if not silent:
    output('\n')
    if cwd:
      output(f'$ cd {cwd}', style='orange')
    if env:
      for k, v in env.items():
        output(f'$ export {k}={shlex.quote(v)}', style='orange')
    if venv:
      output(f"$ source {venv / 'bin' / 'activate'}", style='orange')
    output(f"$ {' '.join(cmd)}", style='orange')

  if venv:
    py = venv / 'bin' / 'python'
  else:
    py = sys.executable

  if copy_env:
    env = {**os.environ, **env}

  if cmd and cmd[0] == 'bentoml':
    cmd = [py, '-m', 'bentoml'] + cmd[1:]
  if cmd and cmd[0] == 'python':
    cmd = [py] + cmd[1:]

  proc = None
  try:
    proc = await asyncio.create_subprocess_shell(
      ' '.join(map(str, cmd)), stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE, cwd=cwd, env=env
    )
    yield proc
  except subprocess.CalledProcessError:
    output('Command failed', style='red')
    raise typer.Exit(1)
  finally:
    if proc:
      proc.send_signal(signal.SIGINT)
      await proc.wait()


def md5(*strings: str) -> int:
  m = hashlib.md5()
  for s in strings:
    m.update(s.encode())
  return int(m.hexdigest(), 16)
91
src/openllm/local.py
Normal file
@@ -0,0 +1,91 @@
import asyncio
import time

import httpx

from openllm.common import BentoInfo, async_run_command, output, run_command, stream_command_output
from openllm.venv import ensure_venv


def _get_serve_cmd(bento: BentoInfo, port: int = 3000):
  cmd = ['bentoml', 'serve', bento.bentoml_tag]
  if port != 3000:
    cmd += ['--port', str(port)]
  env = {'BENTOML_HOME': f'{bento.repo.path}/bentoml'}
  return cmd, env, None


def serve(bento: BentoInfo, port: int = 3000):
  venv = ensure_venv(bento)
  cmd, env, cwd = _get_serve_cmd(bento, port=port)
  run_command(cmd, env=env, cwd=cwd, venv=venv)


async def _run_model(bento: BentoInfo, port: int = 3000, timeout: int = 600):
  venv = ensure_venv(bento)
  cmd, env, cwd = _get_serve_cmd(bento, port)
  async with async_run_command(cmd, env=env, cwd=cwd, venv=venv, silent=False) as server_proc:
    output(f'Model server started {server_proc.pid}')

    stdout_streamer = None
    stderr_streamer = None
    start_time = time.time()

    output('Model loading...', style='green')
    for _ in range(timeout):
      try:
        resp = httpx.get(f'http://localhost:{port}/readyz', timeout=3)
        if resp.status_code == 200:
          break
      except httpx.RequestError:
        if time.time() - start_time > 30:
          if not stdout_streamer:
            stdout_streamer = asyncio.create_task(stream_command_output(server_proc.stdout, style='gray'))
          if not stderr_streamer:
            stderr_streamer = asyncio.create_task(stream_command_output(server_proc.stderr, style='#BD2D0F'))
        await asyncio.sleep(1)
    else:
      output('Model failed to load', style='red')
      server_proc.terminate()
      return

    if stdout_streamer:
      stdout_streamer.cancel()
    if stderr_streamer:
      stderr_streamer.cancel()

    output('Model is ready', style='green')
    messages: list[dict[str, str]] = []

    from openai import AsyncOpenAI

    client = AsyncOpenAI(base_url=f'http://localhost:{port}/v1', api_key='local')
    model_id = (await client.models.list()).data[0].id
    while True:
      try:
        message = input('user: ')
        if message == '':
          output('empty message, please enter something', style='yellow')
          continue
        messages.append(dict(role='user', content=message))
        output('assistant: ', end='', style='lightgreen')
        assistant_message = ''
        stream = await client.chat.completions.create(
          model=model_id,
          messages=messages,  # type: ignore
          stream=True,
        )
        async for chunk in stream:
          text = chunk.choices[0].delta.content or ''
          assistant_message += text
          output(text, end='', style='lightgreen')
        messages.append(dict(role='assistant', content=assistant_message))
        output('')
      except KeyboardInterrupt:
        break
    output('\nStopping model server...', style='green')
  output('Stopped model server', style='green')


def run(bento: BentoInfo, port: int = 3000, timeout: int = 600):
  asyncio.run(_run_model(bento, port=port, timeout=timeout))
147
src/openllm/model.py
Normal file
@@ -0,0 +1,147 @@
import typing
from typing import Optional

import tabulate
import typer

from openllm.accelerator_spec import DeploymentTarget, can_run
from openllm.analytic import OpenLLMTyper
from openllm.common import FORCE, VERBOSE_LEVEL, BentoInfo, load_config, output
from openllm.repo import ensure_repo_updated, parse_repo_url

app = OpenLLMTyper(help='manage models')


@app.command()
def get(tag: str, repo: Optional[str] = None, verbose: bool = False):
  if verbose:
    VERBOSE_LEVEL.set(20)
  bento_info = ensure_bento(tag, repo_name=repo)
  if bento_info:
    output(bento_info)


@app.command(name='list')
def list_(tag: Optional[str] = None, repo: Optional[str] = None, verbose: bool = False):
  if verbose:
    VERBOSE_LEVEL.set(20)

  bentos = list_bento(tag=tag, repo_name=repo)
  bentos.sort(key=lambda x: x.name)

  seen = set()

  def is_seen(value):
    if value in seen:
      return True
    seen.add(value)
    return False

  table = tabulate.tabulate(
    [
      [
        '' if is_seen(bento.name) else bento.name,
        bento.tag,
        bento.repo.name,
        bento.pretty_gpu,
        ','.join(bento.platforms),
      ]
      for bento in bentos
    ],
    headers=['model', 'version', 'repo', 'required VRAM', 'platforms'],
  )
  output(table)


def ensure_bento(model: str, target: Optional[DeploymentTarget] = None, repo_name: Optional[str] = None) -> BentoInfo:
  bentos = list_bento(model, repo_name=repo_name)
  if len(bentos) == 0:
    output(f'No model found for {model}', style='red')
    raise typer.Exit(1)

  if len(bentos) == 1:
    if FORCE.get():
      output(f'Found model {bentos[0]}', style='green')
      return bentos[0]
    if target is None:
      return bentos[0]
    if can_run(bentos[0], target) <= 0:
      return bentos[0]
    output(f'Found model {bentos[0]}', style='green')
    return bentos[0]

  if target is None:
    output(f'Multiple models match {model}, did you mean one of these?', style='red')
    for bento in bentos:
      output(f' {bento}')
    raise typer.Exit(1)

  filtered = [bento for bento in bentos if can_run(bento, target) > 0]
  if len(filtered) == 0:
    output(f'No deployment target found for {model}', style='red')
    raise typer.Exit(1)

  if len(filtered) == 0:
    output(f'No deployment target found for {model}', style='red')
    raise typer.Exit(1)

  if len(bentos) > 1:
    output(f'Multiple models match {model}, did you mean one of these?', style='red')
    for bento in bentos:
      output(f' {bento}')
    raise typer.Exit(1)

  return bentos[0]


def list_bento(
  tag: typing.Optional[str] = None, repo_name: typing.Optional[str] = None, include_alias: bool = False
) -> typing.List[BentoInfo]:
  ensure_repo_updated()

  if repo_name is not None:
    config = load_config()
    if repo_name not in config.repos:
      output(f'Repo `{repo_name}` not found, did you mean one of these?')
      for repo_name in config.repos:
        output(f' {repo_name}')
      raise typer.Exit(1)

  if not tag:
    glob_pattern = 'bentoml/bentos/*/*'
  elif ':' in tag:
    bento_name, version = tag.split(':')
    glob_pattern = f'bentoml/bentos/{bento_name}/{version}'
  else:
    glob_pattern = f'bentoml/bentos/{tag}/*'

  model_list = []
  config = load_config()
  for _repo_name, repo_url in config.repos.items():
    if repo_name is not None and _repo_name != repo_name:
      continue
    repo = parse_repo_url(repo_url, _repo_name)
    for path in repo.path.glob(glob_pattern):
      if path.is_dir() and (path / 'bento.yaml').exists():
        model = BentoInfo(repo=repo, path=path)
      elif path.is_file():
        with open(path) as f:
          origin_name = f.read().strip()
        origin_path = path.parent / origin_name
        model = BentoInfo(alias=path.name, repo=repo, path=origin_path)
      else:
        model = None
      if model:
        model_list.append(model)
  model_list.sort(key=lambda x: x.tag)
  if not include_alias:
    seen = set()
    model_list = [
      x
      for x in model_list
      if not (
        f"{x.bento_yaml['name']}:{x.bento_yaml['version']}" in seen
        or seen.add(f"{x.bento_yaml['name']}:{x.bento_yaml['version']}")
      )
    ]
  return model_list
179
src/openllm/repo.py
Normal file
@@ -0,0 +1,179 @@
import datetime
import re
import shutil

import pyaml
import questionary
import typer

from openllm.analytic import OpenLLMTyper
from openllm.common import INTERACTIVE, REPO_DIR, VERBOSE_LEVEL, RepoInfo, load_config, output, save_config

UPDATE_INTERVAL = datetime.timedelta(days=3)

app = OpenLLMTyper(help='manage repos')


@app.command()
def list(verbose: bool = False):
  if verbose:
    VERBOSE_LEVEL.set(20)
  config = load_config()
  pyaml.pprint([parse_repo_url(repo, name) for name, repo in config.repos.items()], sort_dicts=False, sort_keys=False)


@app.command()
def remove(name: str):
  config = load_config()
  if name not in config.repos:
    output(f'Repo {name} does not exist', style='red')
    return

  del config.repos[name]
  save_config(config)
  output(f'Repo {name} removed', style='green')


def _complete_alias(repo_name: str):
  from openllm.model import list_bento

  for bento in list_bento(repo_name=repo_name):
    alias = bento.labels.get('openllm_alias', '').strip()
    if alias:
      for a in alias.split(','):
        with open(bento.path.parent / a, 'w') as f:
          f.write(bento.version)


@app.command()
def update():
  import dulwich
  import dulwich.errors
  import dulwich.porcelain

  config = load_config()
  repos_in_use = set()
  for repo_name, repo in config.repos.items():
    repo = parse_repo_url(repo, repo_name)
    repos_in_use.add((repo.server, repo.owner, repo.repo))
    if repo.path.exists():  # TODO: use update instead of remove and clone
      shutil.rmtree(repo.path, ignore_errors=True)
    if not repo.path.exists():
      repo.path.parent.mkdir(parents=True, exist_ok=True)
      try:
        dulwich.porcelain.clone(
          f'https://{repo.server}/{repo.owner}/{repo.repo}.git',
          str(repo.path),
          checkout=True,
          depth=1,
          branch=repo.branch,
        )
        output('')
        output(f'Repo `{repo.name}` updated', style='green')
      except:
        shutil.rmtree(repo.path, ignore_errors=True)
        output(f'Failed to clone repo {repo.name}', style='red')
    else:
      try:
        import dulwich.porcelain

        dulwich.porcelain.pull(
          str(repo.path), f'https://{repo.server}/{repo.owner}/{repo.repo}.git', refspecs=repo.branch, force=True
        )
        dulwich.porcelain.clean(str(repo.path), str(repo.path))
        output('')
        output(f'Repo `{repo.name}` updated', style='green')
      except:
        shutil.rmtree(repo.path, ignore_errors=True)
        output(f'Failed to update repo {repo.name}', style='red')
  for c in REPO_DIR.glob('*/*/*'):
    repo_spec = tuple(c.parts[-3:])
    if repo_spec not in repos_in_use:
      shutil.rmtree(c, ignore_errors=True)
      output(f'Removed unused repo cache {c}')
  with open(REPO_DIR / 'last_update', 'w') as f:
    f.write(datetime.datetime.now().isoformat())
  for repo_name in config.repos:
    _complete_alias(repo_name)


def ensure_repo_updated():
  last_update_file = REPO_DIR / 'last_update'
  if not last_update_file.exists():
    if INTERACTIVE.get():
      choice = questionary.confirm(
        'The repo cache is never updated, do you want to update it to fetch the latest model list?'
      ).ask()
      if choice:
        update()
      return
    else:
      output(
        'The repo cache is never updated, please run `openllm repo update` to fetch the latest model list', style='red'
      )
      raise typer.Exit(1)
  last_update = datetime.datetime.fromisoformat(last_update_file.read_text().strip())
  if datetime.datetime.now() - last_update > UPDATE_INTERVAL:
    if INTERACTIVE.get():
      choice = questionary.confirm(
        'The repo cache is outdated, do you want to update it to fetch the latest model list?'
      ).ask()
      if choice:
        update()
    else:
      output(
        'The repo cache is outdated, please run `openllm repo update` to fetch the latest model list', style='yellow'
      )


GIT_REPO_RE = re.compile(r'git\+https://(?P<server>.+)/(?P<owner>.+)/(?P<repo>.+?)(@(?P<branch>.+))?$')


def parse_repo_url(repo_url, repo_name=None) -> RepoInfo:
  """
  parse the git repo url to server, owner, repo name, branch
  >>> parse_repo_url('git+https://github.com/bentoml/bentovllm@main')
  ('github.com', 'bentoml', 'bentovllm', 'main')

  >>> parse_repo_url('git+https://github.com/bentoml/bentovllm')
  ('github.com', 'bentoml', 'bentovllm', 'main')
  """
  match = GIT_REPO_RE.match(repo_url)
  if not match:
    raise ValueError(f'Invalid git repo url: {repo_url}')
  server = match.group('server')
  owner = match.group('owner')
  repo = match.group('repo')
  branch = match.group('branch') or 'main'
  path = REPO_DIR / server / owner / repo
  return RepoInfo(
    name=repo if repo_name is None else repo_name,
    url=repo_url,
    server=server,
    owner=owner,
    repo=repo,
    branch=branch,
    path=path,
  )


@app.command()
def add(name: str, repo: str):
  name = name.lower()
  if not name.isidentifier():
    output(f'Invalid repo name: {name}, should only contain letters, numbers and underscores', style='red')
    return

  config = load_config()
  if name in config.repos:
    override = questionary.confirm(f'Repo {name} already exists({config.repos[name]}), override?').ask()
    if not override:
      return

  config.repos[name] = repo
  save_config(config)
  output(f'Repo {name} added', style='green')


if __name__ == '__main__':
  app()
118
src/openllm/venv.py
Normal file
118
src/openllm/venv.py
Normal file
@@ -0,0 +1,118 @@
import functools
import os
import pathlib
import shutil
import typing
from typing import Iterable

import typer

from openllm.common import VENV_DIR, VERBOSE_LEVEL, BentoInfo, VenvSpec, output, run_command


@functools.lru_cache
def _resolve_packages(requirement: typing.Union[pathlib.Path, str]):
  from pip_requirements_parser import RequirementsFile

  requirements_txt = RequirementsFile.from_file(str(requirement), include_nested=True)
  return requirements_txt.requirements


def _filter_preheat_packages(requirements: Iterable) -> list[str]:
  PREHEAT_PIP_PACKAGES = ['torch', 'vllm']

  deps: list[str] = []
  for req in requirements:
    if req.is_editable or req.is_local_path or req.is_url or req.is_wheel or not req.name or not req.specifier:
      continue
    for sp in req.specifier:
      if sp.operator == '==' and req.name in PREHEAT_PIP_PACKAGES:
        assert req.line is not None
        deps.append(req.line)
        break
  return deps


@functools.lru_cache
def _resolve_bento_env_specs(bento: BentoInfo):
  ver_file = bento.path / 'env' / 'python' / 'version.txt'
  assert ver_file.exists(), f'cannot find version file in {bento.path}'

  lock_file = bento.path / 'env' / 'python' / 'requirements.lock.txt'
  if not lock_file.exists():
    lock_file = bento.path / 'env' / 'python' / 'requirements.txt'

  reqs = _resolve_packages(lock_file)
  preheat_packages = _filter_preheat_packages(reqs)
  ver = ver_file.read_text().strip()
  return (
    VenvSpec(python_version=ver, python_packages=preheat_packages, name_prefix=f"{bento.tag.replace(':', '_')}-1-"),
    VenvSpec(
      python_version=ver, python_packages=[v.line for v in reqs], name_prefix=f"{bento.tag.replace(':', '_')}-2-"
    ),
  )


def _get_lib_dir(venv: pathlib.Path) -> pathlib.Path:
  if os.name == 'nt':
    return venv / 'Lib/site-packages'
  else:
    return next(venv.glob('lib/python*')) / 'site-packages'


def _ensure_venv(env_spec: VenvSpec, parent_venv: typing.Optional[pathlib.Path] = None) -> pathlib.Path:
  venv = VENV_DIR / str(hash(env_spec))
  if venv.exists() and not (venv / 'DONE').exists():
    shutil.rmtree(venv, ignore_errors=True)
  if not venv.exists():
    output(f'Installing model dependencies({venv})...', style='green')

    venv_py = venv / 'Scripts' / 'python.exe' if os.name == 'nt' else venv / 'bin' / 'python'
    try:
      run_command(['python', '-m', 'uv', 'venv', venv], silent=VERBOSE_LEVEL.get() < 10)
      lib_dir = _get_lib_dir(venv)
      if parent_venv is not None:
        parent_lib_dir = _get_lib_dir(parent_venv)
        with open(lib_dir / f'{parent_venv.name}.pth', 'w+') as f:
          f.write(str(parent_lib_dir))
      with open(venv / 'requirements.txt', 'w') as f:
        f.write('\n'.join(sorted(env_spec.python_packages)))
      run_command(
        ['python', '-m', 'uv', 'pip', 'install', '-p', str(venv_py), '-r', venv / 'requirements.txt'],
        silent=VERBOSE_LEVEL.get() < 10,
      )
      with open(venv / 'DONE', 'w') as f:
        f.write('DONE')
    except Exception:
      shutil.rmtree(venv, ignore_errors=True)
      output(f'Failed to install dependencies to {venv}. Cleaned up.', style='red')
      raise typer.Exit(1)
    output(f'Successfully installed dependencies to {venv}.', style='green')
    return venv
  else:
    return venv


def _ensure_venvs(env_spec_list: Iterable[VenvSpec]) -> pathlib.Path:
  last_venv = None
  for env_spec in env_spec_list:
    last_venv = _ensure_venv(env_spec, last_venv)
  assert last_venv is not None
  return last_venv


def ensure_venv(bento: BentoInfo) -> pathlib.Path:
  return _ensure_venvs(_resolve_bento_env_specs(bento))


def _check_venv(env_spec: VenvSpec) -> bool:
  venv = VENV_DIR / str(hash(env_spec))
  if not venv.exists():
    return False
  if venv.exists() and not (venv / 'DONE').exists():
    return False
  return True


def check_venv(bento: BentoInfo) -> bool:
  return all(_check_venv(env_spec) for env_spec in _resolve_bento_env_specs(bento))
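A minimal usage sketch (not part of this commit) of how the two public helpers above might be driven; it assumes `openllm` is installed and a `BentoInfo` instance is already resolved:

```python
# Illustrative only: prepare the layered virtual environment for a Bento.
import pathlib

from openllm.venv import check_venv, ensure_venv


def prepare(bento) -> pathlib.Path:
  # check_venv is cheap: it only verifies that both venv layers exist and are marked DONE.
  if check_venv(bento):
    print('virtual environment already cached')
  # ensure_venv is idempotent: cached venvs are reused, broken ones are rebuilt.
  return ensure_venv(bento)
```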
@@ -1,7 +0,0 @@
#!/usr/bin/env bash
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &> /dev/null && pwd)
pip install -U mypy==1.7.0 "build<1" hatchling==1.18.0 hatch-vcs==0.3.0 hatch-fancy-pypi-readme==23.1.0 hatch-mypyc==0.16.0
HATCH_VERBOSE=3 MYPY_CONFIG_FILE_DIR="${SCRIPT_DIR}" HATCH_BUILD_HOOKS_ENABLE=1 python -m build openllm-client -w -C--global-option=--verbose "$@"
HATCH_VERBOSE=3 MYPY_CONFIG_FILE_DIR="${SCRIPT_DIR}" HATCH_BUILD_HOOKS_ENABLE=1 python -m build openllm-core -w -C--global-option=--verbose "$@"
HATCH_VERBOSE=3 MYPY_CONFIG_FILE_DIR="${SCRIPT_DIR}" HATCH_BUILD_HOOKS_ENABLE=1 python -m build openllm-python -w -C--global-option=--verbose "$@"
hatch clean
@@ -1,284 +0,0 @@
#!/usr/bin/env python3
from __future__ import annotations
import dataclasses
import os
import typing as t

import tomlkit
from ghapi.all import GhApi

if t.TYPE_CHECKING:
  from tomlkit.items import Array, Table

ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

_OWNER, _REPO = 'bentoml', 'openllm'


@dataclasses.dataclass(frozen=True)
class Classifier:
  identifier: t.Dict[str, str] = dataclasses.field(
    default_factory=lambda: {
      'status': 'Development Status',
      'environment': 'Environment',
      'license': 'License',
      'topic': 'Topic',
      'os': 'Operating System',
      'audience': 'Intended Audience',
      'typing': 'Typing',
      'language': 'Programming Language',
    }
  )
  joiner: str = ' :: '

  @staticmethod
  def status() -> dict[int, str]:
    return {
      v: status
      for v, status in zip(
        range(1, 8),
        [
          '1 - Planning',
          '2 - Pre-Alpha',
          '3 - Alpha',
          '4 - Beta',
          '5 - Production/Stable',
          '6 - Mature',
          '7 - Inactive',
        ],
      )
    }

  @staticmethod
  def apache() -> str:
    return Classifier.create_classifier('license', 'OSI Approved', 'Apache Software License')

  @staticmethod
  def create_classifier(identifier: str, *decls: t.Any) -> str:
    cls_ = Classifier()
    if identifier not in cls_.identifier:
      raise ValueError(f'{identifier} is not yet supported (supported alias: {Classifier.identifier})')
    return cls_.joiner.join([cls_.identifier[identifier], *decls])

  @staticmethod
  def create_python_classifier(
    implementation: list[str] | None = None, supported_version: list[str] | None = None
  ) -> list[str]:
    if supported_version is None:
      supported_version = ['3.8', '3.9', '3.10', '3.11', '3.12']
    if implementation is None:
      implementation = ['CPython', 'PyPy']
    base = [
      Classifier.create_classifier('language', 'Python'),
      Classifier.create_classifier('language', 'Python', '3'),
    ]
    base.append(Classifier.create_classifier('language', 'Python', '3', 'Only'))
    base.extend([Classifier.create_classifier('language', 'Python', version) for version in supported_version])
    base.extend([
      Classifier.create_classifier('language', 'Python', 'Implementation', impl) for impl in implementation
    ])
    return base

  @staticmethod
  def create_status_classifier(level: int) -> str:
    return Classifier.create_classifier('status', Classifier.status()[level])


@dataclasses.dataclass(frozen=True)
class Dependencies:
  name: str
  git_repo_url: t.Optional[str] = None
  branch: t.Optional[str] = None
  extensions: t.Optional[t.List[str]] = None
  subdirectory: t.Optional[str] = None
  requires_gpu: bool = False
  lower_constraint: t.Optional[str] = None
  upper_constraint: t.Optional[str] = None
  platform: t.Optional[t.Tuple[t.Literal['Linux', 'Windows', 'Darwin'], t.Literal['eq', 'ne']]] = None

  def with_options(self, **kwargs: t.Any) -> Dependencies:
    return dataclasses.replace(self, **kwargs)

  @property
  def has_constraint(self) -> bool:
    return self.lower_constraint is not None or self.upper_constraint is not None

  @property
  def pypi_extensions(self) -> str:
    return '' if self.extensions is None else f"[{','.join(self.extensions)}]"

  @staticmethod
  def platform_restriction(platform: t.LiteralString, op: t.Literal['eq', 'ne'] = 'eq') -> str:
    return f'platform_system{"==" if op == "eq" else "!="}"{platform}"'

  def to_str(self) -> str:
    deps: list[str] = []
    if self.lower_constraint is not None and self.upper_constraint is not None:
      dep = f'{self.name}{self.pypi_extensions}>={self.lower_constraint},<{self.upper_constraint}'
    elif self.lower_constraint is not None:
      dep = f'{self.name}{self.pypi_extensions}>={self.lower_constraint}'
    elif self.upper_constraint is not None:
      dep = f'{self.name}{self.pypi_extensions}<{self.upper_constraint}'
    elif self.subdirectory is not None:
      dep = f'{self.name}{self.pypi_extensions} @ git+https://github.com/{self.git_repo_url}.git#subdirectory={self.subdirectory}'
    elif self.branch is not None:
      dep = f'{self.name}{self.pypi_extensions} @ git+https://github.com/{self.git_repo_url}.git@{self.branch}'
    else:
      dep = f'{self.name}{self.pypi_extensions}'
    deps.append(dep)
    if self.platform:
      deps.append(self.platform_restriction(*self.platform))
    return ';'.join(deps)

  @classmethod
  def from_tuple(cls, *decls: t.Any) -> Dependencies:
    return cls(*decls)


_LOWER_BENTOML_CONSTRAINT = '1.2.16'
_BENTOML_EXT = ['io']


fname = f'{os.path.basename(os.path.dirname(__file__))}/{os.path.basename(__file__)}'


def correct_style(it: t.Any) -> t.Any:
  return it


def create_classifiers() -> Array:
  arr = correct_style(tomlkit.array())
  arr.extend([
    Classifier.create_status_classifier(5),
    Classifier.create_classifier('environment', 'GPU', 'NVIDIA CUDA'),
    Classifier.create_classifier('environment', 'GPU', 'NVIDIA CUDA', '12'),
    Classifier.create_classifier('environment', 'GPU', 'NVIDIA CUDA', '11.8'),
    Classifier.create_classifier('environment', 'GPU', 'NVIDIA CUDA', '11.7'),
    Classifier.apache(),
    Classifier.create_classifier('topic', 'Scientific/Engineering', 'Artificial Intelligence'),
    Classifier.create_classifier('topic', 'Software Development', 'Libraries'),
    Classifier.create_classifier('os', 'OS Independent'),
    Classifier.create_classifier('audience', 'Developers'),
    Classifier.create_classifier('audience', 'Science/Research'),
    Classifier.create_classifier('audience', 'System Administrators'),
    Classifier.create_classifier('typing', 'Typed'),
    *Classifier.create_python_classifier(),
  ])
  return arr.multiline(True)


def create_url_table(_info: t.Any) -> Table:
  table = tomlkit.table()
  _urls = {
    'Blog': 'https://modelserving.com',
    'Chat': 'https://discord.gg/openllm',
    'Documentation': 'https://github.com/bentoml/openllm#readme',
    'GitHub': _info.html_url,
    'History': f'{_info.html_url}/blob/main/CHANGELOG.md',
    'Homepage': _info.homepage,
    'Tracker': f'{_info.html_url}/issues',
    'Twitter': 'https://twitter.com/bentomlai',
  }
  table.update({k: v for k, v in sorted(_urls.items())})
  return table


def build_system() -> Table:
  table = tomlkit.table()
  table.add('build-backend', 'hatchling.build')
  requires_array = correct_style(tomlkit.array())
  requires_array.extend(['hatchling==1.18.0', 'hatch-vcs==0.3.0', 'hatch-fancy-pypi-readme==23.1.0'])
  table.add('requires', requires_array.multiline(True))
  return table


def keywords() -> Array:
  arr = correct_style(tomlkit.array())
  arr.extend([
    'MLOps',
    'AI',
    'BentoML',
    'Model Serving',
    'Model Deployment',
    'LLMOps',
    'Falcon',
    'Vicuna',
    'Llama 2',
    'Fine tuning',
    'Serverless',
    'Large Language Model',
    'Generative AI',
    'StableLM',
    'Alpaca',
    'PyTorch',
    'Mistral',
    'vLLM',
    'Transformers',
  ])
  return arr.multiline(True)


def build_cli_extensions() -> Table:
  table = tomlkit.table()
  table.update({'openllm': '_openllm_tiny._entrypoint:cli'})
  return table


def main(args) -> int:
  api = GhApi(owner=_OWNER, repo=_REPO, authenticate=False)
  _info = api.repos.get()
  with open(os.path.join(ROOT, 'openllm-python', 'pyproject.toml'), 'r') as f:
    pyproject = tomlkit.parse(f.read())

  if args.release_version is not None:
    release_version = args.release_version
  else:
    try:
      release_version = api.repos.get_latest_release()['tag_name'].lstrip('v')
    except Exception as err:
      raise err

  _BASE_DEPENDENCIES = [
    Dependencies(name='bentoml', extensions=_BENTOML_EXT, lower_constraint=_LOWER_BENTOML_CONSTRAINT),
    Dependencies(name='openllm-client', lower_constraint=release_version),
    Dependencies(name='openllm-core', lower_constraint=release_version),
    Dependencies(name='safetensors'),
    Dependencies(name='vllm', lower_constraint='0.4.3'),
    Dependencies(name='ghapi'),
    Dependencies(name='einops'),
    Dependencies(name='sentencepiece'),
    Dependencies(name='scipy'),
    Dependencies(name='click', lower_constraint='8.1.3'),
    Dependencies(name='cuda-python', platform=('Darwin', 'ne')),
  ]

  dependencies_array = correct_style(tomlkit.array())
  dependencies_array.extend([v.to_str() for v in _BASE_DEPENDENCIES])
  # dynamic field
  dyn_arr = tomlkit.array()
  dyn_arr.extend(['version', 'readme'])

  pyproject['build-system'] = build_system()
  pyproject['project']['classifiers'] = create_classifiers()
  pyproject['project']['dependencies'] = dependencies_array.multiline(True)
  pyproject['project']['description'] = f'{_info.name}: {_info.description}'
  pyproject['project']['dynamic'] = dyn_arr
  pyproject['project']['keywords'] = keywords()
  pyproject['project']['license'] = _info.license.spdx_id
  pyproject['project']['name'] = f'{_info.name.lower()}'
  pyproject['project']['requires-python'] = '>=3.8'

  pyproject['project']['urls'] = create_url_table(_info)
  pyproject['project']['scripts'] = build_cli_extensions()

  with open(os.path.join(ROOT, 'openllm-python', 'pyproject.toml'), 'w') as f:
    f.write(tomlkit.dumps(pyproject))
  return 0


if __name__ == '__main__':
  import argparse

  parser = argparse.ArgumentParser()
  parser.add_argument('--release-version', type=str, default=None)
  raise SystemExit(main(parser.parse_args()))
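For reference, a small sketch (illustrative only; it simply exercises the `Dependencies` class defined in the deleted script above) of the PEP 508 strings that `to_str` produces:

```python
# Mirrors how the _BASE_DEPENDENCIES entries above are rendered into pyproject.toml.
print(Dependencies(name='bentoml', extensions=['io'], lower_constraint='1.2.16').to_str())
# -> bentoml[io]>=1.2.16
print(Dependencies(name='cuda-python', platform=('Darwin', 'ne')).to_str())
# -> cuda-python;platform_system!="Darwin"
```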
@@ -1,53 +0,0 @@
#!/usr/bin/env python3
from __future__ import annotations
from collections import defaultdict
from pathlib import Path

import orjson
from lxml import etree

ROOT = Path(__file__).resolve().parent.parent

PACKAGES = {'openllm-python/src/openllm/': 'openllm'}


def main() -> int:
  coverage_report = ROOT / 'coverage.xml'
  root = etree.fromstring(coverage_report.read_text())

  raw_package_data: defaultdict[str, dict[str, int]] = defaultdict(lambda: {'hits': 0, 'misses': 0})
  for package in root.find('packages'):
    for module in package.find('classes'):
      filename = module.attrib['filename']
      for relative_path, package_name in PACKAGES.items():
        if filename.startswith(relative_path):
          data = raw_package_data[package_name]
          break
      else:
        message = f'unknown package: {module}'
        raise ValueError(message)

      for line in module.find('lines'):
        if line.attrib['hits'] == '1':
          data['hits'] += 1
        else:
          data['misses'] += 1

  total_statements_covered = 0
  total_statements = 0
  coverage_data = {}
  for package_name, data in sorted(raw_package_data.items()):
    statements_covered = data['hits']
    statements = statements_covered + data['misses']
    total_statements_covered += statements_covered
    total_statements += statements
    coverage_data[package_name] = {'statements_covered': statements_covered, 'statements': statements}
  coverage_data['total'] = {'statements_covered': total_statements_covered, 'statements': total_statements}

  coverage_summary = ROOT / 'coverage-summary.json'
  coverage_summary.write_text(orjson.dumps(coverage_data, option=orjson.OPT_INDENT_2).decode(), encoding='utf-8')
  return 0


if __name__ == '__main__':
  raise SystemExit(main())
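The resulting `coverage-summary.json` has a simple per-package shape; a hypothetical example (the numbers are invented for illustration):

```python
# Shape of the data written above: one entry per package plus a 'total' rollup.
example_summary = {
  'openllm': {'statements_covered': 1800, 'statements': 2000},
  'total': {'statements_covered': 1800, 'statements': 2000},
}
```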
@@ -1,14 +0,0 @@
#!/usr/bin/env bash

set -ex

GIT_ROOT=$(git rev-parse --show-toplevel)

cd "$GIT_ROOT" || exit 1

[[ -x "$(command -v docker)" ]] || (
  echo "docker not found. Make sure to have docker running to run this job."
  exit 1
)

RATCHET_EXP_KEEP_NEWLINES=true find "${GIT_ROOT}/.github/workflows" -type f -iname '*.yml' -exec docker run -it --rm -v "${PWD}":"${PWD}" -w "${PWD}" -e RATCHET_EXP_KEEP_NEWLINES=true ghcr.io/sethvargo/ratchet:0.4.0 pin {} \;
@@ -1,91 +0,0 @@
from __future__ import annotations

import httpx,os,dataclasses,datetime,time,argparse,typing as t
if (ENV:=os.getenv("PAPERSPACE_API_KEY")) is None:raise RuntimeError('This script requires setting "PAPERSPACE_API_KEY"')
HEADERS=httpx.Headers({'Authorization': f'Bearer {ENV}', 'Accept': 'application/json'})
API_URL='https://api.paperspace.com/v1'

@dataclasses.dataclass
class Machine:
  id:str
  inner:httpx.Client=dataclasses.field(default_factory=lambda:httpx.Client(headers=HEADERS,base_url=API_URL,timeout=60),repr=False)
  def close(self):self.inner.close()
  def __del__(self):self.close()
  def __enter__(self):return self
  def __exit__(self, *_: t.Any)->None:self.close()
  @property
  def metadata(self)->dict[str,t.Any]:return self.inner.get(f'/machines/{self.id}').json()
  @property
  def status(self)->t.Literal['off','ready','stopping','starting']:return self.metadata['state']
  def start(self)->bool:
    response=self.inner.patch(f'/machines/{self.id}/start')
    if response.status_code==400 or self.status=='ready':
      print('machine is already running')
      return False
    elif response.status_code!=200:raise ValueError(f'Error while starting machine: {response.json()}')
    return True
  def stop(self)->bool:
    response=self.inner.patch(f'/machines/{self.id}/stop')
    if response.status_code==400 or self.status=='off':
      print('machine is already off')
      return False
    elif response.status_code!=200:raise ValueError(f'Error while stopping machine {response.json()}')
    return True
  @classmethod
  def ci(cls,template_id:str)->Machine:
    client = httpx.Client(headers=HEADERS,base_url=API_URL,timeout=60)
    machines=client.get('/machines',params=dict(limit=1,name='openllm-ci')).json()
    if len(machines['items']) == 1:
      return cls(id=machines['items'][0]['id'],inner=client)
    response=client.post('/machines',json=dict(
      name=f'openllm-ci-{datetime.datetime.now().timestamp()}', machineType='A4000',templateId=template_id,
      networkId=os.getenv("PAPERSPACE_NETWORK_ID"), diskSize=500,region='ny2',publicIpType='dynamic',startOnCreate=True))
    if response.status_code!=200:raise ValueError(f'Failed while creating a machine: {response.json()}')
    return cls(id=response.json()['data']['id'],inner=client)
  def actions(self,ip:bool=False,id:bool=False)->str:
    if ip:return self.metadata["publicIp"]
    if id:return self.id
    raise ValueError('cannot be all false.')
def main():
  parser=argparse.ArgumentParser()
  group=parser.add_mutually_exclusive_group(required=True)
  group.add_argument('--start',metavar='ID')
  group.add_argument('--stop',metavar='ID')
  group.add_argument('--delete',metavar='ID')
  group.add_argument('--ci-template',metavar='ID')
  parser.add_argument('--output-ip',action='store_true')
  parser.add_argument('--output-id',action='store_true')
  args=parser.parse_args()

  if args.ci_template:
    machine=Machine.ci(args.ci_template)
    while machine.status!='ready':time.sleep(5)
    print(machine.actions(args.output_ip,args.output_id))
    machine.close()
  elif args.delete:
    with httpx.Client(headers=HEADERS, base_url=API_URL, timeout=60) as client:
      response=client.delete(f'/machines/{args.delete}')
      if response.status_code!=200:
        print('Error while deleting machine', response.json())
        return 1
  elif args.start:
    with Machine(id=args.start) as machine:
      if machine.start():
        while machine.status!='ready':
          print('Waiting for machine to be ready...')
          time.sleep(5)
      else:
        print('Failed to start machine "%s"', machine.id)
        return 1
  elif args.stop:
    with Machine(id=args.stop) as machine:
      if machine.stop():
        while machine.status!='ready':
          print('Waiting for machine to stop...')
          time.sleep(5)
      else:
        print('Failed to stop machine')
        return 1
  return 0

if __name__ == "__main__":raise SystemExit(main())
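A hedged sketch (not part of the repository) of how the `Machine` helper above could be exercised directly; the machine id is a placeholder and `PAPERSPACE_API_KEY` must be set:

```python
# Illustrative local usage of the Paperspace Machine wrapper defined above.
import time

with Machine(id='ps-machine-id') as m:  # 'ps-machine-id' is a hypothetical id
  if m.start():
    while m.status != 'ready':  # poll until the Paperspace API reports the machine is up
      time.sleep(5)
  print(m.metadata['publicIp'])
```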
@@ -1,14 +0,0 @@
#!/usr/bin/env bash

set -eo pipefail

GIT_ROOT="$(git rev-parse --show-toplevel)"
cd "$GIT_ROOT" || exit 1

mirror() {
  cp "$1" "$2"
}

mirror README.md openllm-python/README.md
mirror LICENSE.md openllm-python/LICENSE.md
mirror CHANGELOG.md openllm-python/CHANGELOG.md
@@ -1,6 +0,0 @@
ipython
jupyter
tomlkit
ghapi
pre-commit
orjson
@@ -1,148 +0,0 @@
#!/usr/bin/env bash

set -e -o pipefail

# Function to print script usage
print_usage() {
  echo "Usage: $0 [--release <major|minor|patch|alpha>]"
}

# Function to validate release argument
validate_release() {
  local release=$1

  if [[ $release == "major" || $release == "minor" || $release == "patch" || $release == "alpha" ]]; then
    return 0
  else
    return 1
  fi
}

check_membership() {
  local org="BentoML"
  local username=$(gh api user | jq -r '.login')
  if gh api orgs/$org/members/$username -q '.message' | grep -q "Not Found"; then
    echo "ERROR: You must be a member of $org to run this script."
    exit 1
  fi
}

for cmd in gh jq; do
  if ! command -v "$cmd" 2>&1 >/dev/null; then
    echo "ERROR: $cmd not installed. Aborting..."
    exit 1
  fi
done

check_membership

# Check if release flag is provided
if [[ $1 == "--release" ]]; then
  # Check if release argument is provided
  if [[ -z $2 ]]; then
    echo "Error: No release argument provided."
    print_usage
    exit 1
  fi

  release=$2

  if ! validate_release "$release"; then
    echo "Error: Invalid release argument. Only 'major', 'minor', 'patch', or 'alpha' are allowed."
    print_usage
    exit 1
  fi
else
  echo "Error: Unknown option or no option provided."
  print_usage
  exit 1
fi

# Get the current version and separate the alpha part if it exists
version="$(git describe --tags "$(git rev-list --tags --max-count=1)")"
VERSION="${version#v}"

# Initialize variables for alpha versioning
ALPHA=""
ALPHA_NUM=0

# Check if current version is an alpha version and split accordingly
if [[ $VERSION =~ -alpha ]]; then
  IFS='-' read -r BASE_VERSION ALPHA <<<"$VERSION"
  if [[ $ALPHA =~ [.] ]]; then
    IFS='.' read -r ALPHA ALPHA_NUM <<<"$ALPHA"
  fi
else
  BASE_VERSION="$VERSION"
fi

# Save the current value of IFS to restore it later and split the base version
OLD_IFS=$IFS
IFS='.'
read -ra VERSION_BITS <<<"$BASE_VERSION"
IFS=$OLD_IFS

# Assign split version numbers
VNUM1=${VERSION_BITS[0]}
VNUM2=${VERSION_BITS[1]}
VNUM3=${VERSION_BITS[2]}

# Adjust the version numbers based on the release type
if [[ $release == 'major' ]]; then
  VNUM1=$((VNUM1 + 1))
  VNUM2=0
  VNUM3=0
  ALPHA="" # Reset alpha for major release
elif [[ $release == 'minor' ]]; then
  if [[ -n $ALPHA ]]; then
    ALPHA="" # Remove alpha suffix for minor release from an alpha version
  else
    VNUM2=$((VNUM2 + 1))
    VNUM3=0
  fi
elif [[ $release == 'patch' ]]; then
  VNUM3=$((VNUM3 + 1))
  ALPHA="" # Reset alpha for patch release
elif [[ $release == 'alpha' ]]; then
  if [ -n "$ALPHA" ]; then
    ALPHA_NUM=$((ALPHA_NUM + 1))
  else
    VNUM2=$((VNUM2 + 1))
    VNUM3=0
    ALPHA="alpha"
    ALPHA_NUM=0
  fi
fi

# Construct the new version string
if [ -n "$ALPHA" ]; then
  if ((ALPHA_NUM > 0)); then
    RELEASE_TAG="v$VNUM1.$VNUM2.$VNUM3-alpha.$ALPHA_NUM"
  else
    RELEASE_TAG="v$VNUM1.$VNUM2.$VNUM3-alpha"
  fi
else
  RELEASE_TAG="v$VNUM1.$VNUM2.$VNUM3"
fi

echo "Releasing version: $RELEASE_TAG"

if [[ -v DRYRUN ]]; then
  exit 0
fi

echo "Running release actions (create-releases.yml)..."
echo '{"release_type": "'"$release"'"}' | gh workflow run create-releases.yml --repo bentoml/openllm --json

sleep 20

set -x

echo "Waiting for new tags to be released from 'create-releases.yml'"
while true; do
  git pull --autostash --no-edit --gpg-sign --ff origin main
  if git ls-remote -t --exit-code origin "refs/tags/${RELEASE_TAG}" &>/dev/null; then
    break
  fi
  sleep 10
done
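A readable Python sketch of the version-bump rules the bash above implements (illustrative only, not part of the repository; `bump` is a hypothetical helper):

```python
def bump(version: str, release: str) -> str:
  # version is the current tag without the leading 'v', e.g. '0.5.7' or '0.6.0-alpha.1'
  base, _, alpha = version.partition('-')
  alpha_num = 0
  if '.' in alpha:
    alpha, alpha_num = alpha.split('.')[0], int(alpha.split('.')[1])
  v1, v2, v3 = (int(x) for x in base.split('.'))
  if release == 'major':
    v1, v2, v3, alpha = v1 + 1, 0, 0, ''
  elif release == 'minor':
    if alpha: alpha = ''          # promoting an alpha drops the suffix
    else: v2, v3 = v2 + 1, 0
  elif release == 'patch':
    v3, alpha = v3 + 1, ''
  elif release == 'alpha':
    if alpha: alpha_num += 1      # bump alpha.N on an existing pre-release
    else: v2, v3, alpha, alpha_num = v2 + 1, 0, 'alpha', 0
  tag = f'v{v1}.{v2}.{v3}'
  if alpha:
    tag += f'-alpha.{alpha_num}' if alpha_num > 0 else '-alpha'
  return tag

assert bump('0.5.7', 'patch') == 'v0.5.8'
assert bump('0.5.7', 'alpha') == 'v0.6.0-alpha'
assert bump('0.6.0-alpha', 'alpha') == 'v0.6.0-alpha.1'
assert bump('0.6.0-alpha', 'minor') == 'v0.6.0'
```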
@@ -1,9 +0,0 @@
pytest
pytest-cov
pytest-mock
pytest-randomly
pytest-rerunfailures
pytest-asyncio>=0.21.0
coverage[toml]
filelock>=3.7.1
pytest-xdist[psutil]
@@ -1,27 +0,0 @@
#!/usr/bin/env bash

set -e

DEBUG=${DEBUG:-false}
[[ "${DEBUG}" == "true" ]] && set -x

GIT_ROOT=$(git rev-parse --show-toplevel)

cd "$GIT_ROOT" || exit 1

[[ -x "$(command -v docker)" ]] || (
  echo "docker not found. Make sure to have docker running to run this job."
  exit 1
)

docker version &>/dev/null || (
  echo "docker is not healthy. Make sure to have docker running"
  exit 1
)

[[ -z "${ACTIONS_TOKEN}" ]] && (
  echo "ACTIONS_TOKEN not found. Make sure to have ACTIONS_TOKEN set to run this job."
  exit 1
)

find "${GIT_ROOT}/.github/workflows" -type f -iname '*.yml' -exec docker run --rm -v "${PWD}":"${PWD}" -w "${PWD}" -e ACTIONS_TOKEN -e RATCHET_EXP_KEEP_NEWLINES=true ghcr.io/sethvargo/ratchet:0.4.0 update {} \;
@@ -1,170 +0,0 @@
#!/usr/bin/env python3
from __future__ import annotations
import os, sys
from pathlib import Path

# currently we are assuming the indentation level is 2 for comments
START_COMMENT = f'# {os.path.basename(__file__)}: start\n'
END_COMMENT = f'# {os.path.basename(__file__)}: stop\n'
# Stubs for auto class
START_AUTO_STUBS_COMMENT = f'# {os.path.basename(__file__)}: auto stubs start\n'
END_AUTO_STUBS_COMMENT = f'# {os.path.basename(__file__)}: auto stubs stop\n'
# Stubs for actual imports
START_IMPORT_STUBS_COMMENT = f'# {os.path.basename(__file__)}: import stubs start\n'
END_IMPORT_STUBS_COMMENT = f'# {os.path.basename(__file__)}: import stubs stop\n'

ROOT = Path(__file__).parent.parent
_TARGET_FILE = ROOT / 'openllm-core' / 'src' / 'openllm_core' / '_configuration.py'
_TARGET_AUTO_FILE = ROOT / 'openllm-core' / 'src' / 'openllm_core' / 'config' / 'configuration_auto.py'
_TARGET_CORE_INIT_FILE = ROOT / 'openllm-core' / 'src' / 'openllm_core' / 'config' / '__init__.py'
_TARGET_INIT_FILE = ROOT / 'openllm-python' / 'src' / 'openllm' / '__init__.pyi'
_TARGET_IMPORT_UTILS_FILE = ROOT / 'openllm-core' / 'src' / 'openllm_core' / 'utils' / 'import_utils.pyi'

sys.path.insert(0, (ROOT / 'openllm-core' / 'src').__fspath__())
from openllm_core._configuration import GenerationConfig, ModelSettings
from openllm_core.config.configuration_auto import CONFIG_MAPPING_NAMES
from openllm_core.utils import codegen, import_utils as iutils
# from openllm_core.utils.peft import PeftType


def process_annotations(annotations: str) -> str:
  if 'NotRequired' in annotations:
    return annotations[len('NotRequired[') : -1]
  elif 'Required' in annotations:
    return annotations[len('Required[') : -1]
  else:
    return annotations


_transformed = {'fine_tune_strategies': 't.Dict[AdapterType, FineTuneConfig]'}


def main() -> int:
  with _TARGET_FILE.open('r') as f:
    processed = f.readlines()

  start_idx, end_idx = processed.index(' ' * 2 + START_COMMENT), processed.index(' ' * 2 + END_COMMENT)

  # NOTE: inline runtime __getitem__ overload process
  lines: list[str] = []
  lines.append(' ' * 2 + '# NOTE: ModelSettings arguments\n')
  for keys, ForwardRef in codegen.get_annotations(ModelSettings).items():
    lines.extend([
      ' ' * 2 + line
      for line in [
        '@overload\n',
        f"def __getitem__(self, item: t.Literal['{keys}']) -> {process_annotations(ForwardRef.__forward_arg__)}: ...\n",
      ]
    ])
  lines.append(' ' * 2 + '# NOTE: GenerationConfig arguments\n')
  generation_config_anns = codegen.get_annotations(GenerationConfig)
  for keys, type_pep563 in generation_config_anns.items():
    lines.extend([
      ' ' * 2 + line
      for line in ['@overload\n', f"def __getitem__(self, item: t.Literal['{keys}']) -> {type_pep563}: ...\n"]
    ])
  # backward compatible
  lines.extend([
    ' ' * 2 + line
    for line in [
      '@overload\n',
      "def __getitem__(self, item: t.Literal['max_new_tokens']) -> int: ...\n",
      '@overload\n',
      "def __getitem__(self, item: t.Literal['start_name']) -> str: ...\n",
      '@overload\n',
      "def __getitem__(self, item: t.Literal['model_name']) -> str: ...\n",
    ]
  ])

  processed = (
    processed[:start_idx] + [' ' * 2 + START_COMMENT, *lines, ' ' * 2 + END_COMMENT] + processed[end_idx + 1 :]
  )
  with _TARGET_FILE.open('w') as f:
    f.writelines(processed)

  with _TARGET_AUTO_FILE.open('r') as f:
    processed = f.readlines()

  start_auto_stubs_idx, end_auto_stubs_idx = (
    processed.index(' ' * 2 + START_AUTO_STUBS_COMMENT),
    processed.index(' ' * 2 + END_AUTO_STUBS_COMMENT),
  )
  lines = []
  for model, class_name in CONFIG_MAPPING_NAMES.items():
    lines.extend([
      ' ' * 2 + line
      for line in [
        '@t.overload\n',
        '@classmethod\n',
        f"def for_model(cls, model_name: t.Literal['{model}'], **attrs: t.Any) -> openllm_core.config.{class_name}: ...\n",
      ]
    ])
  lines.extend([
    ' ' * 2 + line
    for line in [
      '@t.overload\n',
      '@classmethod\n',
      'def for_model(cls, model_name: LiteralString, **attrs: t.Any) -> openllm_core.LLMConfig: ...\n',
    ]
  ])
  processed = (
    processed[:start_auto_stubs_idx]
    + [' ' * 2 + START_AUTO_STUBS_COMMENT, *lines, ' ' * 2 + END_AUTO_STUBS_COMMENT]
    + processed[end_auto_stubs_idx + 1 :]
  )
  with _TARGET_AUTO_FILE.open('w') as f:
    f.writelines(processed)

  with _TARGET_INIT_FILE.open('r') as f:
    processed = f.readlines()

  start_import_stubs_idx, end_import_stubs_idx = (
    processed.index(START_IMPORT_STUBS_COMMENT),
    processed.index(END_IMPORT_STUBS_COMMENT),
  )
  mm = {
    '_configuration': ('GenerationConfig', 'LLMConfig'),
    '_schemas': ('GenerationInput', 'GenerationOutput', 'MetadataOutput', 'MessageParam'),
    'utils': ('api',),
  }
  lines = [
    'from openllm_client import AsyncHTTPClient as AsyncHTTPClient, HTTPClient as HTTPClient',
    f'from openllm_core.config import CONFIG_MAPPING as CONFIG_MAPPING, CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES, AutoConfig as AutoConfig, {", ".join([a+" as "+a for a in CONFIG_MAPPING_NAMES.values()])}',
  ]
  lines.extend([
    f'from openllm_core.{module} import {", ".join([a+" as "+a for a in attr])}' for module, attr in mm.items()
  ])
  processed = (
    processed[:start_import_stubs_idx]
    + [START_IMPORT_STUBS_COMMENT, '\n'.join(lines) + '\n', END_IMPORT_STUBS_COMMENT]
    + processed[end_import_stubs_idx + 1 :]
  )
  with _TARGET_INIT_FILE.open('w') as f:
    f.writelines(processed)

  lines = [
    '# fmt: off\n',
    f'# AUTOGENERATED BY {os.path.basename(__file__)}. DO NOT EDIT\n',
    'from .configuration_auto import CONFIG_MAPPING as CONFIG_MAPPING, CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES, AutoConfig as AutoConfig\n',
    *[f'from .configuration_{k} import {a} as {a}\n' for k, a in CONFIG_MAPPING_NAMES.items()],
  ]
  with _TARGET_CORE_INIT_FILE.open('w') as f:
    f.writelines(lines)

  lines = [
    '# fmt: off\n',
    f'# AUTOGENERATED BY {os.path.basename(__file__)}. DO NOT EDIT\n',
    'import typing as t\n',
    'def is_autoawq_available() -> bool: ...\n',
    'def is_vllm_available() -> bool: ...\n',
    *[f'def {k}() -> bool: ...\n' for k in iutils.caller],
    'ENV_VARS_TRUE_VALUES: t.Set[str] = ...\n',
  ]
  with _TARGET_IMPORT_UTILS_FILE.open('w') as f:
    f.writelines(lines)

  return 0


if __name__ == '__main__':
  raise SystemExit(main())
@@ -1,88 +0,0 @@
#!/usr/bin/env python3
import concurrent.futures
import configparser
import os
from typing import List

_MYPY_CONFIG = {
  'pretty': 'true',
  'python_version': '3.9',
  'show_error_codes': 'true',
  'strict': 'true',
  'plugins': 'pydantic.mypy',
  'ignore_missing_imports': 'true',
  'warn_unreachable': 'true',
  'explicit_package_bases': 'true',
}


# Function to find .pyi files in a given directory
def pyi_in_subdir(directory: str, git_root: str) -> List[str]:
  pyi_files = []
  for root, _, files in os.walk(directory):
    for file in files:
      if file.endswith('.pyi') or file == '_typing_compat.py' or '_openllm_tiny' in file:
        full_path = os.path.join(root, file)
        # Convert to relative path with respect to the git root
        relative_path = os.path.relpath(full_path, git_root)
        pyi_files.append(relative_path)
  return pyi_files


def find_pyi_files(git_root: str) -> List[str]:
  # List all subdirectories
  subdirectories = [
    os.path.join(git_root, name)
    for name in os.listdir(git_root)
    if os.path.isdir(os.path.join(git_root, name)) and name not in ['venv', '.git', '.venv']
  ]

  # Use a thread pool to execute searches concurrently
  with concurrent.futures.ThreadPoolExecutor() as executor:
    # Map of future to subdirectory
    future_to_subdir = {executor.submit(pyi_in_subdir, subdir, git_root): subdir for subdir in subdirectories}

    all_pyi_files = set()
    for future in concurrent.futures.as_completed(future_to_subdir):
      pyi_files = future.result()
      all_pyi_files.update(pyi_files)

  return list(all_pyi_files)


# Function to update mypy.ini file
def update_mypy_ini(pyi_files: List[str], mypy_ini_path: str) -> int:
  config = configparser.ConfigParser()
  config.read(mypy_ini_path)

  # Existing files from mypy.ini
  existing_files = config.get('mypy', 'files', fallback='').split(', ')

  # Add new .pyi files if they are not already in the list
  updated_files = existing_files + [f for f in pyi_files if f not in existing_files]

  # Update the 'files' entry
  config['mypy']['files'] = ', '.join(updated_files)
  for key, value in _MYPY_CONFIG.items():
    config.set('mypy', key, value)

  # Write changes back to mypy.ini
  with open(mypy_ini_path, 'w') as configfile:
    configfile.write(
      f'# The following is autogenerated by {os.path.join(os.path.basename(os.path.dirname(__file__)), os.path.basename(__file__))}\n'
    )
    config.write(configfile)
  # Remove last newline if exists
  with open(mypy_ini_path, 'rb+') as file:
    file.seek(-1, os.SEEK_END)
    if file.read(1) == b'\n':
      file.seek(-1, os.SEEK_END)
      file.truncate()
  return 0


ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
MYPY_CONFIG = os.path.join(ROOT, 'mypy.ini')

if __name__ == '__main__':
  raise SystemExit(update_mypy_ini(find_pyi_files(ROOT), MYPY_CONFIG))
@@ -1,64 +0,0 @@
#!/usr/bin/env python3
import os, shutil, sys, tomlkit

START_COMMENT = f'<!-- {os.path.basename(__file__)}: start -->\n'
END_COMMENT = f'<!-- {os.path.basename(__file__)}: stop -->\n'

ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, os.path.join(ROOT, 'openllm-core', 'src'))
from openllm_core.config import CONFIG_MAPPING
from openllm_core.config.configuration_auto import CONFIG_TO_ALIAS_NAMES


def markdown_noteblock(text: str):
  return ['\n', f'> **Note:** {text}\n']


def main() -> int:
  with open(os.path.join(ROOT, 'README.md'), 'r') as f:
    readme = f.readlines()

  start_index, stop_index = readme.index(START_COMMENT), readme.index(END_COMMENT)

  content = []

  for it in CONFIG_MAPPING.values():
    it = it()
    architecture_name = it.__class__.__name__[:-6]
    details_block = ['<details>\n', f'<summary>{architecture_name}</summary>\n\n', '### Quickstart\n']
    nitem = CONFIG_TO_ALIAS_NAMES[it.__class__.__name__]
    details_block.extend([
      f'Run the following command to quickly spin up a {architecture_name} server:\n',
      f"""\
```bash
openllm start {it['default_id']}{'' if not it['trust_remote_code'] else ' --trust-remote-code'}
```""",
      'You can run the following code in a different terminal to interact with the server:',
      """\
```python
import openllm_client
client = openllm_client.HTTPClient('http://localhost:3000')
client.generate('What are large language models?')
```""",
      *markdown_noteblock(
        f'Any {architecture_name} variants can be deployed with OpenLLM. Visit the [HuggingFace Model Hub](https://huggingface.co/models?sort=trending&search={nitem}) to see more {architecture_name}-compatible models.\n'
      ),
      '\n### Supported models\n',
      f'You can specify any of the following {architecture_name} models via `openllm start`:\n\n',
    ])
    list_ids = [f'- [{model_id}](https://huggingface.co/{model_id})' for model_id in it['model_ids']]
    details_block.extend(list_ids)
    details_block.append('\n</details>\n\n')

    content.append('\n'.join(details_block))

  readme = readme[:start_index] + [START_COMMENT] + content + [END_COMMENT] + readme[stop_index + 1 :]
  with open(os.path.join(ROOT, 'README.md'), 'w') as f:
    f.writelines(readme)

  shutil.copyfile(os.path.join(ROOT, 'README.md'), os.path.join(ROOT, 'openllm-python', 'README.md'))
  return 0


if __name__ == '__main__':
  raise SystemExit(main())
@@ -1,46 +0,0 @@
#!/usr/bin/env python3
from __future__ import annotations
from decimal import ROUND_DOWN, Decimal
from pathlib import Path
import orjson

PRECISION = Decimal('.01')

ROOT = Path(__file__).resolve().parent.parent


def main() -> int:
  coverage_summary = ROOT / 'coverage-summary.json'

  coverage_data = orjson.loads(coverage_summary.read_text(encoding='utf-8'))
  total_data = coverage_data.pop('total')

  lines = ['\n', 'Package | Statements\n', '------- | ----------\n']

  for package, data in sorted(coverage_data.items()):
    statements_covered = data['statements_covered']
    statements = data['statements']

    rate = Decimal(statements_covered) / Decimal(statements) * 100
    rate = rate.quantize(PRECISION, rounding=ROUND_DOWN)
    lines.append(f'{package} | {100 if rate == 100 else rate}% ({statements_covered} / {statements})\n')

  total_statements_covered = total_data['statements_covered']
  total_statements = total_data['statements']
  total_rate = Decimal(total_statements_covered) / Decimal(total_statements) * 100
  total_rate = total_rate.quantize(PRECISION, rounding=ROUND_DOWN)
  color = 'ok' if float(total_rate) >= 95 else 'critical'
  lines.insert(0, f'\n')

  lines.append(
    f'**Summary** | {100 if total_rate == 100 else total_rate}% ({total_statements_covered} / {total_statements})\n'
  )

  coverage_report = ROOT / 'coverage-report.md'
  with coverage_report.open('w', encoding='utf-8') as f:
    f.write(''.join(lines))
  return 0


if __name__ == '__main__':
  raise SystemExit(main())
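For illustration, a small sketch (not from the repository; numbers invented) of how a single row of the generated `coverage-report.md` is computed and truncated by the logic above:

```python
from decimal import ROUND_DOWN, Decimal

# Truncate, not round: 1800/2000 -> 90.00 with two decimal places, as in the script.
rate = (Decimal(1800) / Decimal(2000) * 100).quantize(Decimal('.01'), rounding=ROUND_DOWN)
print(f'openllm | {rate}% (1800 / 2000)')  # -> openllm | 90.00% (1800 / 2000)
```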
26  wheels.sh
@@ -1,26 +0,0 @@
#!/usr/bin/env bash

set -eo pipefail

GIT_ROOT="$(git rev-parse --show-toplevel)"
cd "$GIT_ROOT" || exit 1

mkdir -p dist

pushd openllm-client &>/dev/null

python -m build -w && mv dist/* ../dist

popd &>/dev/null

pushd openllm-core &>/dev/null

python -m build -w && mv dist/* ../dist

popd &>/dev/null

pushd openllm-python &>/dev/null

python -m build -w && mv dist/* ../dist

popd &>/dev/null