chore: ready for 0.6 releases

Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
Aaron Pham
2024-07-09 22:05:43 -04:00
parent 61997de34d
commit f4d822125e
60 changed files with 1930 additions and 4083 deletions

.gitattributes (2 changes)

@@ -1,5 +1,3 @@
**/_next/ linguist-generated=true
* text=auto eol=lf
# Needed for setuptools-scm-git-archive
.git_archival.txt export-subst

.github/INFRA.md (64 changes)

@@ -1,64 +0,0 @@
## OpenLLM CI/CD
> [!NOTE]
> All actions within this repository should always be locked to a specific version. We are using [ratchet](https://github.com/sethvargo/ratchet)
> for doing this via [this script](https://github.com/bentoml/OpenLLM/blob/main/tools/lock-actions.sh)
OpenLLM uses GitHub Actions to run all CI/CD workflows. It also uses [pre-commit.ci](https://pre-commit.ci/) to run CI for all pre-commit hooks.
The folder structure is as follows:
```prolog
.
├── CODEOWNERS # Code owners
├── CODE_OF_CONDUCT.md # Code of conduct
├── ISSUE_TEMPLATE # Contains issue templates
├── SECURITY.md # Security policy
├── actions # Contains helpers script for all actions
├── assets # Contains static assets to be used throughout this repository
├── dependabot.yml # Dependabot configuration
└── workflows
├── binary-releases.yml # Build and publish binary releases
├── build.yml # Self-hosted EC2 runners
├── ci.yml # CI workflow
├── cleanup.yml # Cache cleanup
├── build-pypi.yml # Build PyPI packages
├── create-releases.yml # Create GitHub releases
├── cron.yml # Cron jobs
└── release-notes.yml # Generate release notes
```
> [!IMPORTANT]
> All of the following jobs should only be run within the BentoML organisation and this repository.
### Self-hosted EC2 runners
The workflow for self-hosted EC2 runners is located in [build.yml](/.github/workflows/build.yml).
This workflow is currently used for building OpenLLM base images that contain all compiled kernels
for serving. The images are then published to the following registries:
- GitHub Container Registry (`ghcr.io/bentoml/openllm`): users can extend this base image
with their own custom kernels or use it as a base for building Bentos (see the pull example below).
- AWS Elastic Container Registry (`public.ecr.aws/y5w8i4y6/bentoml/openllm`): the base image
used by all Bentos created with `openllm`. This registry exists purely for build optimisation on BentoCloud.
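For illustration, pulling the public base image might look like this; the `latest` tag is an assumption, so check the registry for published tags:
```bash
# Hypothetical pull of the published base image (tag name is an assumption).
docker pull ghcr.io/bentoml/openllm:latest
```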
There are a few ways to trigger this workflow:
- Automatically, when a new commit is pushed to the `main` branch or a release tag is pushed
- On pull requests, by applying the `00 - EC2 Build` label
- On commits whose message contains `[ec2 build]` (see the example below)
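As a quick illustration, opting into the EC2 build could look like the following; the commit message and PR number are made up, only the `[ec2 build]` marker and label name come from this document:
```bash
# Opt a commit into the EC2 base-image build via the commit-message marker.
git commit -S -sm "chore(kernels): refresh base image [ec2 build]"

# Or opt a pull request in by applying the label (PR number is hypothetical).
gh pr edit 1234 --add-label "00 - EC2 Build"
```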
### Wheel compilation
The workflow for wheel compilation is located in [build-pypi.yml](/.github/workflows/build-pypi.yml).
To keep CI fast, this workflow is opt-in: apply the `02 - Wheel Build` label on a pull request or add `[wheel build]` to the commit message.
### Binary releases
The workflow for binary releases is located in [binary-releases.yml](/.github/workflows/binary-releases.yml).
To keep CI fast, this workflow is also opt-in: apply the `03 - Standalone Build` label on a pull request or add `[binary build]` to the commit message (see the example below).
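The same opt-in pattern applies to both of these workflows; a sketch, with illustrative commit messages (only the `[wheel build]` and `[binary build]` markers come from the sections above):
```bash
# Opt this commit into the pure-Python wheel build.
git commit -sm "fix(core): tidy config parsing [wheel build]"

# Opt this commit into the standalone binary build.
git commit -sm "feat(cli): bundle the new entrypoint [binary build]"
```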


@@ -48,14 +48,8 @@ fi
release_package() {
local version="$1"
echo "Releasing version ${version}..."
jq --arg release_version "${version}" '.version = $release_version' <package.json >package.json.tmp && mv package.json.tmp package.json
pushd openllm-node &>/dev/null
jq --arg release_version "${version}" '.version = $release_version' <package.json >package.json.tmp && mv package.json.tmp package.json
popd &>/dev/null
towncrier build --yes --version "${version}"
./tools/dependencies.py --release-version "${version}"
cp CHANGELOG.md openllm-python/CHANGELOG.md
git add CHANGELOG.md openllm-python/pyproject.toml openllm-python/CHANGELOG.md changelog.d package.json openllm-node/package.json
git add CHANGELOG.md changelog.d
git commit -S -sm "infra: prepare for release ${version} [generated] [skip ci]"
git push origin main
echo "Releasing tag ${version}..." && git tag -a "v${version}" -sm "Release ${version} [generated by GitHub Actions]"

Binary file not shown (removed asset, 7.7 MiB).


@@ -1,28 +1,13 @@
version: 2
updates:
- package-ecosystem: github-actions
directory: '/'
directory: "/"
schedule:
interval: 'weekly'
day: 'monday'
time: '09:00'
interval: "weekly"
day: "monday"
time: "09:00"
- package-ecosystem: pip
directory: '/openllm-python'
directory: "/"
schedule:
interval: 'weekly'
interval: "weekly"
open-pull-requests-limit: 5
- package-ecosystem: pip
directory: '/openllm-core'
schedule:
interval: 'weekly'
open-pull-requests-limit: 5
- package-ecosystem: pip
directory: '/openllm-client'
schedule:
interval: 'weekly'
open-pull-requests-limit: 5
- package-ecosystem: npm
directory: '/'
schedule:
interval: 'weekly'
open-pull-requests-limit: 0


@@ -1,274 +0,0 @@
# modified workflow from ofek/hatch, big kudos to them.
name: Standalone build
on:
workflow_call:
workflow_dispatch:
push:
branches: [main]
paths-ignore:
- '*.md'
- 'changelog.d/**'
- 'assets/**'
- 'openllm-node/**'
- 'Formula/**'
pull_request:
branches: [main]
paths-ignore:
- '*.md'
- 'changelog.d/**'
- 'assets/**'
- 'openllm-node/**'
- 'Formula/**'
defaults:
run:
shell: bash --noprofile --norc -exo pipefail {0}
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}
cancel-in-progress: true
env:
APP_NAME: openllm
PYOXIDIZER_VERSION: '0.24.0'
HATCH_VERBOSE: 2
jobs:
get_commit_message:
name: Get commit message
runs-on: ubuntu-latest
if: "github.repository == 'bentoml/OpenLLM'" # Don't run on fork repository
outputs:
message: ${{ steps.commit_message.outputs.message }}
steps:
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # ratchet:actions/checkout@v4.1.7
# Gets the correct commit message for pull request
with:
ref: ${{ github.event.pull_request.head.sha }}
- name: Get commit message
id: commit_message
run: |
set -xe
COMMIT_MSG=$(git log --no-merges -1 --oneline)
echo "message=$COMMIT_MSG" >> $GITHUB_OUTPUT
echo github.ref ${{ github.ref }}
python-artifacts:
name: Build wheel and source distribution
runs-on: ubuntu-latest
needs: get_commit_message
if: >-
contains(needs.get_commit_message.outputs.message, '[binary build]') || github.event_name == 'workflow_dispatch' || github.event_name == 'workflow_call' || (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, '03 - Standalone Build')) || (github.event_name == 'push' && (startsWith(github.ref, 'refs/tags/v') || startsWith(github.ref, 'refs/heads/main')))
steps:
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # ratchet:actions/checkout@v4.1.7
with:
fetch-depth: 0
- uses: bentoml/setup-bentoml-action@862aa8fa0e0c3793fcca4bfe7a62717a497417e4 # ratchet:bentoml/setup-bentoml-action@v1
with:
bentoml-version: 'main'
python-version-file: .python-version-default
- name: Pull latest change
if: ${{ github.event_name != 'pull_request' }}
run: git pull --autostash --no-edit --gpg-sign --ff origin main
- name: Install build frontend
run: python -m pip install --upgrade build
- name: Build
run: python -m build -sw openllm-python/
- name: Upload artifacts
uses: actions/upload-artifact@0b2256b8c012f0828dc542b3febcab082c67f72b # ratchet:actions/upload-artifact@v4
with:
name: binary-artefacts
path: openllm-python/dist/*
if-no-files-found: error
binaries:
name: ${{ matrix.job.target }} (${{ matrix.job.os }})
needs:
- python-artifacts
defaults:
run:
working-directory: openllm-python
runs-on: ${{ matrix.job.os }}
strategy:
fail-fast: false
matrix:
job:
# Linux
- target: aarch64-unknown-linux-gnu
os: ubuntu-22.04
cross: true
- target: x86_64-unknown-linux-gnu
os: ubuntu-22.04
cross: true
- target: x86_64-unknown-linux-musl
os: ubuntu-22.04
cross: true
- target: i686-unknown-linux-gnu
os: ubuntu-22.04
cross: true
- target: powerpc64le-unknown-linux-gnu
os: ubuntu-22.04
cross: true
# macOS
- target: aarch64-apple-darwin
os: macos-12
- target: x86_64-apple-darwin
os: macos-12
outputs:
version: ${{ steps.version.outputs.version }}
env:
CARGO: cargo
CARGO_BUILD_TARGET: ${{ matrix.job.target }}
PYAPP_REPO: pyapp
PYAPP_VERSION: '0.10.1'
PYAPP_PIP_EXTERNAL: 'true'
steps:
- name: Checkout code
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # ratchet:actions/checkout@v4.1.7
with:
fetch-depth: 0
- name: Pull latest change
if: ${{ github.event_name != 'pull_request' }}
run: git pull --autostash --no-edit --gpg-sign --ff origin main
- name: Fetch PyApp
run: >-
mkdir $PYAPP_REPO && curl -L https://github.com/ofek/pyapp/releases/download/v$PYAPP_VERSION/source.tar.gz | tar --strip-components=1 -xzf - -C $PYAPP_REPO
- name: Set up Python
uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # ratchet:actions/setup-python@v5.1.0
with:
python-version-file: .python-version-default
- name: Install Hatch
run: pip install -U hatch
- name: Install Rust toolchain
uses: dtolnay/rust-toolchain@d388a4836fcdbde0e50e395dc79a2670ccdef13f # ratchet:dtolnay/rust-toolchain@stable
with:
targets: ${{ matrix.job.target }}
- name: Set up cross compiling
if: matrix.job.cross
uses: taiki-e/install-action@64e4e2f995104968c78bd697b253d55bf557af66 # ratchet:taiki-e/install-action@v2.41.11
with:
tool: cross
- name: Configure cross compiling
if: matrix.job.cross
run: echo "CARGO=cross" >> $GITHUB_ENV
- name: Configure target
run: |-
config_file="$PYAPP_REPO/.cargo/config_${{ matrix.job.target }}.toml"
if [[ -f "$config_file" ]]; then
mv "$config_file" "$PYAPP_REPO/.cargo/config.toml"
fi
- name: Download Python artifacts
if: ${{ !startsWith(github.event.ref, 'refs/tags') }}
uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # ratchet:actions/download-artifact@v4
with:
name: binary-artefacts
path: openllm-python/dist
- name: Configure embedded project
if: ${{ !startsWith(github.event.ref, 'refs/tags') }}
run: |-
cd dist
wheel="$(echo *.whl)"
mv "$wheel" "../$PYAPP_REPO"
echo "PYAPP_PROJECT_PATH=$wheel" >> $GITHUB_ENV
- name: Build binary
run: hatch build --target app
# Windows installers don't accept non-integer versions so we ubiquitously
# perform the following transformation: X.Y.Z.devN -> X.Y.Z.N
- name: Set project version
id: version
run: |-
old_version="$(hatch version)"
version="${old_version/dev/}"
if [[ "$version" != "$old_version" ]]; then
cd dist/app
find . -type f -iname "openllm-*" | while read -r file; do
binary=$(echo "$file" | sed 's/dev//')
mv "$file" "$binary"
done
fi
echo "version=$version" >> $GITHUB_OUTPUT
echo "$version"
- name: Archive binary
run: |-
mkdir packaging
cd dist/app
find . -type f -iname "openllm-*" | while read -r file; do
if [[ "$file" =~ -pc-windows- ]]; then
7z a "../../packaging/${file:0:-4}.zip" "$file"
else
binary="${file/dev/}"
chmod +x "$file"
tar -czf "../../packaging/$binary.tar.gz" "$file"
fi
done
- name: Upload staged archive
if: runner.os != 'Linux'
uses: actions/upload-artifact@0b2256b8c012f0828dc542b3febcab082c67f72b # ratchet:actions/upload-artifact@v4
with:
name: staged-${{ runner.os }}-${{ matrix.job.target }}
path: openllm-python/packaging/*
if-no-files-found: error
- name: Upload archive
if: runner.os == 'Linux'
uses: actions/upload-artifact@0b2256b8c012f0828dc542b3febcab082c67f72b # ratchet:actions/upload-artifact@v4
with:
name: standalone-${{ runner.os }}-${{ matrix.job.target }}
path: openllm-python/packaging/*
if-no-files-found: error
windows-packaging:
name: Build Windows installers
needs: binaries
if: false # ${{ github.event_name != 'pull_request' }}
runs-on: windows-2022
env:
VERSION: ${{ needs.binaries.outputs.version }}
defaults:
run:
working-directory: openllm-python
steps:
- name: Checkout code
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # ratchet:actions/checkout@v4.1.7
- name: Set up Python
uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # ratchet:actions/setup-python@v5.1.0
with:
python-version-file: .python-version-default
- name: Pull latest change
run: git pull --autostash --no-edit --gpg-sign --ff origin main
- name: Install PyOxidizer ${{ env.PYOXIDIZER_VERSION }}
run: pip install pyoxidizer==${{ env.PYOXIDIZER_VERSION }}
- name: Download staged binaries
uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # ratchet:actions/download-artifact@v4
with:
name: staged-${{ runner.os }}
path: openllm-python/archives
- name: Extract staged binaries
run: |-
mkdir -p bin
for f in archives/*; do
7z e "$f" -obin
done
# bin/<APP_NAME>-<VERSION>-<TARGET>.exe -> targets/<TARGET>/<APP_NAME>.exe
- name: Prepare binaries
run: |-
mkdir -p targets
for f in bin/*; do
if [[ "$f" =~ openllm-${{ env.VERSION }}-(.+).exe$ ]]; then
target="${BASH_REMATCH[1]}"
mkdir -p "targets/$target"
mv "$f" "targets/$target/${{ env.APP_NAME }}.exe"
fi
done
- name: Build installers
run: >-
pyoxidizer build windows_installers --release --var version ${{ env.VERSION }}
- name: Prepare installers
run: |-
mkdir installers
mv build/*/release/*/*.{exe,msi} installers
- name: Upload binaries
uses: actions/upload-artifact@0b2256b8c012f0828dc542b3febcab082c67f72b # ratchet:actions/upload-artifact@v4
with:
name: standalone
path: openllm-python/archives/*
if-no-files-found: error
- name: Upload installers
uses: actions/upload-artifact@0b2256b8c012f0828dc542b3febcab082c67f72b # ratchet:actions/upload-artifact@v4
with:
name: installers
path: openllm-python/installers/*


@@ -1,143 +0,0 @@
name: Compile PyPI package
on:
workflow_dispatch:
inputs:
tags:
description: 'Tag for releases'
required: true
type: string
workflow_call:
inputs:
tags:
required: true
type: string
outputs:
sucess:
description: 'Whether the build is successful or not'
value: ${{ jobs.sucess-build.outputs.success }}
push:
branches: [main]
paths-ignore:
- '*.md'
- 'changelog.d/**'
- 'assets/**'
pull_request:
branches: [main]
paths-ignore:
- '*.md'
- 'changelog.d/**'
- 'assets/**'
env:
LINES: 120
COLUMNS: 120
HATCH_VERBOSE: 2
CIBW_BUILD_FRONTEND: build
CIBW_ENVIRONMENT_PASS_LINUX: >
HATCH_BUILD_HOOKS_ENABLE MYPYPATH
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}
cancel-in-progress: true
jobs:
get_commit_message:
name: Get commit message
runs-on: ubuntu-latest
if: "github.repository == 'bentoml/OpenLLM'" # Don't run on fork repository
outputs:
message: ${{ steps.commit_message.outputs.message }}
steps:
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # ratchet:actions/checkout@v4.1.7
# Gets the correct commit message for pull request
with:
ref: ${{ github.event.pull_request.head.sha }}
- name: Get commit message
id: commit_message
run: |
set -xe
COMMIT_MSG=$(git log --no-merges -1 --oneline)
echo "message=$COMMIT_MSG" >> $GITHUB_OUTPUT
echo github.ref ${{ github.ref }}
pure-wheels-sdist:
name: Pure wheels and sdist distribution (${{ matrix.directory }})
runs-on: ubuntu-latest
needs: get_commit_message
if: >-
contains(needs.get_commit_message.outputs.message, '[wheel build]') || github.event_name == 'workflow_dispatch' || github.event_name == 'workflow_call' || (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, '02 - Wheel Build')) || (github.event_name == 'push' && (startsWith(github.ref, 'refs/tags/v') || startsWith(github.ref, 'refs/heads/main')))
strategy:
fail-fast: false
matrix:
directory: ['openllm-core', 'openllm-python', 'openllm-client']
steps:
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # ratchet:actions/checkout@v4.1.7
with:
fetch-depth: 0
ref: '${{ inputs.tags }}'
- uses: bentoml/setup-bentoml-action@862aa8fa0e0c3793fcca4bfe7a62717a497417e4 # ratchet:bentoml/setup-bentoml-action@v1
with:
bentoml-version: 'main'
python-version-file: .python-version-default
- name: Build
run: hatch build
working-directory: ${{ matrix.directory }}
- name: Upload artifacts
uses: actions/upload-artifact@0b2256b8c012f0828dc542b3febcab082c67f72b # ratchet:actions/upload-artifact@v4
with:
name: python-artefacts-${{ matrix.directory }}
path: ${{ matrix.directory }}/dist/*
if-no-files-found: error
check-download-artefacts:
name: dry-run for downloading artefacts
if: github.event_name == 'pull_request'
needs: [pure-wheels-sdist]
runs-on: ubuntu-latest
steps:
- name: Download Python artifacts
uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # ratchet:actions/download-artifact@v4
with:
pattern: python-artefacts-*
merge-multiple: true
path: dist
- name: dry ls
run: ls -rthlaR
push-nightly:
name: Push nightly wheels
if: ${{ !github.event.repository.fork && github.event_name == 'push' }}
runs-on: ubuntu-latest
permissions:
id-token: write
needs: [pure-wheels-sdist]
steps:
- name: Download Python artifacts
uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # ratchet:actions/download-artifact@v4
with:
pattern: python-artefacts-*
merge-multiple: true
path: dist
- name: Publish nightly wheels to test.pypi.org
uses: pypa/gh-action-pypi-publish@ec4db0b4ddc65acdf4bff5fa45ac92d78b56bdf0 # ratchet:pypa/gh-action-pypi-publish@release/v1
with:
repository-url: https://test.pypi.org/legacy/
sucess-push: # https://github.com/marketplace/actions/alls-green#why
if: github.event_name == 'push'
needs: [push-nightly, pure-wheels-sdist]
runs-on: ubuntu-latest
outputs:
success: ${{ steps.evergreen.outputs.success }}
steps:
- name: Decide whether the needed jobs succeeded or failed
uses: re-actors/alls-green@05ac9388f0aebcb5727afa17fcccfecd6f8ec5fe # ratchet:re-actors/alls-green@release/v1
id: evergreen
with:
jobs: ${{ toJSON(needs) }}
sucess-pr: # https://github.com/marketplace/actions/alls-green#why
if: github.event_name == 'pull_request'
needs: [pure-wheels-sdist]
runs-on: ubuntu-latest
outputs:
success: ${{ steps.evergreen.outputs.success }}
steps:
- name: Decide whether the needed jobs succeeded or failed
uses: re-actors/alls-green@05ac9388f0aebcb5727afa17fcccfecd6f8ec5fe # ratchet:re-actors/alls-green@release/v1
id: evergreen
with:
jobs: ${{ toJSON(needs) }}


@@ -1,71 +0,0 @@
name: CI
on:
workflow_call:
push:
branches: [main]
pull_request:
branches: [main]
env:
PAPERSPACE_API_KEY: ${{secrets.PAPERSPACE_API_KEY}}
PAPERSPACE_NETWORK_ID: ${{secrets.PAPERSPACE_NETWORK_ID}}
defaults:
run:
shell: bash --noprofile --norc -eo pipefail {0}
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}
cancel-in-progress: true
jobs:
tests:
runs-on: ubuntu-latest
if: ${{ github.event_name == 'pull_request' || github.event_name == 'push' || github.event_name == 'workflow_call' }}
name: General API tests
steps:
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # ratchet:actions/checkout@v4
- uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # ratchet:actions/setup-python@v5
with:
python-version-file: '.python-version-default'
- name: setup tooling
run: |
python -m pip install uv
uv pip install --system httpx orjson
- name: startup machine
id: paperspace-machine
run: |
PUBLIC_IP=$(python tools/machines.py --ci-template ${{secrets.PAPERSPACE_TEMPLATE_ID}} --output-ip)
MACHINE_ID=$(python tools/machines.py --ci-template ${{secrets.PAPERSPACE_TEMPLATE_ID}} --output-id)
echo "::add-mask::$PUBLIC_IP"
echo "::add-mask::$MACHINE_ID"
echo "publicIp=$PUBLIC_IP" >> $GITHUB_OUTPUT
echo "machineId=$MACHINE_ID" >> $GITHUB_OUTPUT
- name: running regression tests (PR)
uses: appleboy/ssh-action@029f5b4aeeeb58fdfe1410a5d17f967dacf36262 # ratchet:appleboy/ssh-action@v1.0.3
if: github.event_name == 'pull_request'
with:
host: ${{ steps.paperspace-machine.outputs.publicIp }}
username: paperspace
key: ${{ secrets.PAPERSPACE_SSH_KEY }}
port: ${{ secrets.PAPERSPACE_PORT }}
command_timeout: 1h
script: /nix/var/nix/profiles/default/bin/nix run github:aarnphm/dix#openllm-ci --refresh -- --pr ${{github.event.number}}
- name: running regression tests (main)
uses: appleboy/ssh-action@029f5b4aeeeb58fdfe1410a5d17f967dacf36262 # ratchet:appleboy/ssh-action@v1.0.3
if: ${{ !github.event.repository.fork && github.event_name == 'push' }}
with:
host: ${{ steps.paperspace-machine.outputs.publicIp }}
username: paperspace
key: ${{ secrets.PAPERSPACE_SSH_KEY }}
port: ${{ secrets.PAPERSPACE_PORT }}
command_timeout: 1h
script: /nix/var/nix/profiles/default/bin/nix run github:aarnphm/dix#openllm-ci --accept-flake-config --refresh -- --head
- name: shutdown machine
run: python tools/machines.py --delete ${{ steps.paperspace-machine.outputs.machineId }}
evergreen: # https://github.com/marketplace/actions/alls-green#why
if: always()
needs:
- tests
runs-on: ubuntu-latest
steps:
- name: Decide whether the needed jobs succeeded or failed
uses: re-actors/alls-green@05ac9388f0aebcb5727afa17fcccfecd6f8ec5fe # ratchet:re-actors/alls-green@release/v1
with:
jobs: ${{ toJSON(needs) }}


@@ -1,114 +0,0 @@
name: release
on:
workflow_dispatch:
inputs:
release_type:
description: 'Release type'
required: true
type: choice
default: 'patch'
options:
- major
- minor
- patch
- alpha
defaults:
run:
shell: bash --noprofile --norc -exo pipefail {0}
env:
LINES: 120
COLUMNS: 120
OPENLLM_DO_NOT_TRACK: True
PYTHONUNBUFFERED: '1'
HATCH_VERBOSE: 2
jobs:
release:
if: github.repository_owner == 'bentoml'
runs-on: ubuntu-latest
permissions:
contents: write
id-token: write
outputs:
version: ${{ steps.version.outputs.version }}
steps:
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # ratchet:actions/checkout@v4.1.7
with:
fetch-depth: 0
token: ${{ secrets.GITHUB_TOKEN }}
- uses: bentoml/setup-bentoml-action@862aa8fa0e0c3793fcca4bfe7a62717a497417e4 # ratchet:bentoml/setup-bentoml-action@v1
with:
bentoml-version: 'main'
python-version-file: .python-version-default
- name: Import bot's GPG key for signing commits
id: import-gpg
uses: crazy-max/ghaction-import-gpg@01dd5d3ca463c7f10f7f4f7b4f177225ac661ee4 # ratchet:crazy-max/ghaction-import-gpg@v6
with:
gpg_private_key: ${{ secrets.GPG_PRIVATE_KEY }}
passphrase: ${{ secrets.GPG_PASSPHRASE }}
git_config_global: true
git_user_signingkey: true
git_commit_gpgsign: true
git_tag_gpgsign: true
- name: Create a release
env:
GIT_AUTHOR_NAME: ${{ steps.import-gpg.outputs.name }}
GIT_AUTHOR_EMAIL: ${{ steps.import-gpg.outputs.email }}
GIT_COMMITTER_NAME: ${{ steps.import-gpg.outputs.name }}
GIT_COMMITTER_EMAIL: ${{ steps.import-gpg.outputs.email }}
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |-
pip install ghapi inflection tomlkit orjson
./.github/actions/release.sh --release ${{ inputs.release_type }}
- name: Set project version
id: version
run: |-
version="$(git describe --tags "$(git rev-list --tags --max-count=1)")"
echo "Release version: $version"
echo "version=$version" >> $GITHUB_OUTPUT
build-pypi:
if: github.repository_owner == 'bentoml'
needs:
- release
name: Compile PyPI distribution for OpenLLM
uses: bentoml/OpenLLM/.github/workflows/build-pypi.yml@main # ratchet:exclude
with:
tags: ${{ needs.release.outputs.version }}
publish-python:
needs:
- build-pypi
runs-on: ubuntu-latest
permissions:
id-token: write
steps:
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # ratchet:actions/checkout@v4.1.7
with:
ref: '${{ needs.release.outputs.version }}'
token: ${{ secrets.GITHUB_TOKEN }}
# NOTE: Keep this section in sync with build-pypi.yml
- name: Download Python artifacts
uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # ratchet:actions/download-artifact@v4
with:
pattern: python-artefacts-*
merge-multiple: true
path: dist
- name: Smoke test compiled artefacts
run: ls -R dist
- name: Publish to PyPI
uses: pypa/gh-action-pypi-publish@ec4db0b4ddc65acdf4bff5fa45ac92d78b56bdf0 # ratchet:pypa/gh-action-pypi-publish@release/v1
with:
print-hash: true
binary-distribution:
if: github.repository_owner == 'bentoml'
needs: publish-python
name: Create binary/wheels distribution
uses: bentoml/OpenLLM/.github/workflows/binary-releases.yml@main # ratchet:exclude
release-notes:
if: github.repository_owner == 'bentoml'
needs:
- release
- publish-python
- binary-distribution
name: Create release notes and setup for next cycle
uses: bentoml/OpenLLM/.github/workflows/release-notes.yml@main # ratchet:exclude
with:
tags: ${{ needs.release.outputs.version }}


@@ -1,66 +0,0 @@
name: release-notes
on:
workflow_dispatch:
inputs:
tags:
description: 'Tag for releases'
required: true
type: string
workflow_call:
inputs:
tags:
required: true
type: string
env:
HATCH_VERBOSE: 2
defaults:
run:
shell: bash --noprofile --norc -exo pipefail {0}
jobs:
gen-release-notes:
if: github.repository_owner == 'bentoml'
runs-on: ubuntu-latest
permissions:
contents: write
id-token: write
steps:
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # ratchet:actions/checkout@v4.1.7
with:
fetch-depth: 0
ref: '${{ inputs.tags }}'
- uses: bentoml/setup-bentoml-action@862aa8fa0e0c3793fcca4bfe7a62717a497417e4 # ratchet:bentoml/setup-bentoml-action@v1
with:
bentoml-version: 'main'
python-version-file: .python-version-default
- name: Create release notes
run: ./.github/actions/create_release_and_archive.sh ${{ inputs.tags }}
- name: Download Python artifacts
uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # ratchet:actions/download-artifact@v4
with:
pattern: python-artefacts-*
merge-multiple: true
path: dist
- name: Download binaries
uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # ratchet:actions/download-artifact@v4
with:
pattern: standalone-*
merge-multiple: true
path: archives
- name: Download standalone MacOS
uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # ratchet:actions/download-artifact@v4
with:
pattern: staged-*
merge-multiple: true
path: archives
- name: Create release
uses: softprops/action-gh-release@a74c6b72af54cfa997e81df42d94703d6313a2d0 # ratchet:softprops/action-gh-release@v2.0.6
with:
# Use GH feature to populate the changelog automatically
generate_release_notes: true
body_path: release_notes.txt
fail_on_unmatched_files: true
tag_name: '${{ inputs.tags }}'
# TODO: add installers/* once windows support is back
files: |-
dist/*
archives/*

.github/workflows/releases.yml (new file, 43 lines)

@@ -0,0 +1,43 @@
name: release
on:
push:
tags:
- "*"
jobs:
build:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Setup Python
uses: actions/setup-python@v5
with:
python-version-file: .python-version-default
- name: Build
run: pipx run build
- name: Upload artifacts
uses: actions/upload-artifact@0b2256b8c012f0828dc542b3febcab082c67f72b # ratchet:actions/upload-artifact@v4
with:
name: python-artefacts-openllm
path: dist/*
if-no-files-found: error
release:
if: github.repository_owner == 'bentoml'
needs:
- build
runs-on: ubuntu-latest
name: Release
permissions:
id-token: write
steps:
- name: Download Python artifacts
uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # ratchet:actions/download-artifact@v4
with:
pattern: python-artefacts-*
merge-multiple: true
path: dist
- name: dry ls
run: ls -rthlaR
- name: Publish to PyPI
uses: pypa/gh-action-pypi-publish@ec4db0b4ddc65acdf4bff5fa45ac92d78b56bdf0 # ratchet:pypa/gh-action-pypi-publish@release/v1
with:
print-hash: true
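With this new workflow, a release is published by pushing a tag. A minimal sketch, with an illustrative version number (the signed, annotated tag mirrors the release helper earlier in this commit):
```bash
# Pushing a tag triggers the build and PyPI publish jobs in releases.yml.
git tag -a "v0.6.0" -sm "Release 0.6.0"
git push origin "v0.6.0"
```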

.gitignore (3 changes)

@@ -161,3 +161,6 @@ cython_debug/
*.whl
# Environments
venv/
.envrc
.python-version
_version.py

.npmrc (7 changes)

@@ -1,7 +0,0 @@
strict-peer-dependencies=false
shell-emulator=true
engine-strict=true
save-exact=true
unsafe-perm=true
prefer-offline=true
prefer-workspace-packages=true


@@ -31,12 +31,7 @@ repos:
- id: editorconfig-checker
verbose: true
alias: ec
types_or: [python, javascript]
exclude: |
(?x)^(
openllm-python/src/openllm_cli/entrypoint.py |
openllm-python/src/_openllm_tiny/_entrypoint.py
)$
types_or: [python]
- repo: meta
hooks:
- id: check-hooks-apply
@@ -55,11 +50,3 @@ repos:
- id: check-added-large-files
- id: debug-statements
- id: check-merge-conflict
- repo: https://github.com/econchick/interrogate
rev: 1.7.0
hooks:
- id: interrogate
verbose: true
types: [python]
exclude: ^(docs|tools|openllm-python/tests)
args: [--config=pyproject.toml]


@@ -1,9 +0,0 @@
{
"printWidth": 119,
"quoteProps": "as-needed",
"trailingComma": "none",
"useTabs": false,
"tabWidth": 2,
"semi": false,
"singleQuote": true
}

.python-version (new symbolic link)

@@ -0,0 +1 @@
/Users/aarnphm/workspace/bentoml/projects/openllm/upstream/.python-version-default


@@ -1,4 +1,3 @@
exclude = ["tools", "examples", "openllm-python/src/openllm_cli/playground/"]
extend-include = ["*.ipynb"]
preview = true
line-length = 119
@@ -12,13 +11,10 @@ skip-magic-trailing-comma = true
docstring-code-format = true
[lint]
typing-modules = [
"openllm_core._typing_compat",
"openllm_client._typing_compat",
]
ignore = [
"RUF012",
"ANN", # Mypy is better at this
"E722",
]
select = [
"F",


@@ -1,2 +0,0 @@
[formatting]
indent_string = " "


@@ -18,6 +18,10 @@ This changelog is managed by towncrier and is compiled at release time.
<!-- towncrier release notes start -->
## [0.6.0](https://github.com/bentoml/openllm/tree/v0.6.0)
No significant changes.
## [0.5.7](https://github.com/bentoml/openllm/tree/v0.5.7)
No significant changes.


@@ -44,6 +44,12 @@ Before you can start developing, you'll need to set up your environment:
git branch --set-upstream-to=upstream/main
```
6. (Optional) Link `.python-version-default` to `.python-version`:
```bash
ln .python-version-default .python-version
```
## Development Workflow
There are a few ways to contribute to the repository structure for OpenLLM:

README.md (130 changes)

@@ -34,8 +34,6 @@ OpenLLM helps developers **run any open-source LLMs**, such as Llama 2 and Mistr
<br/>
<!-- hatch-fancy-pypi-readme interim start -->
## 💾 TL/DR
For starters, we provide two ways to quickly try out OpenLLM:
@@ -98,7 +96,6 @@ OpenLLM currently supports the following models. By default, OpenLLM doesn't inc
<summary>Baichuan</summary>
### Quickstart
Run the following command to quickly spin up a Baichuan server:
@@ -106,23 +103,21 @@ Run the following command to quickly spin up a Baichuan server:
```bash
openllm start baichuan-inc/baichuan-7b --trust-remote-code
```
You can run the following code in a different terminal to interact with the server:
```python
import openllm_client
client = openllm_client.HTTPClient('http://localhost:3000')
client.generate('What are large language models?')
```
> **Note:** Any Baichuan variants can be deployed with OpenLLM. Visit the [HuggingFace Model Hub](https://huggingface.co/models?sort=trending&search=baichuan) to see more Baichuan-compatible models.
### Supported models
You can specify any of the following Baichuan models via `openllm start`:
- [baichuan-inc/baichuan2-7b-base](https://huggingface.co/baichuan-inc/baichuan2-7b-base)
- [baichuan-inc/baichuan2-7b-chat](https://huggingface.co/baichuan-inc/baichuan2-7b-chat)
- [baichuan-inc/baichuan2-13b-base](https://huggingface.co/baichuan-inc/baichuan2-13b-base)
@@ -134,7 +129,6 @@ You can specify any of the following Baichuan models via `openllm start`:
<summary>ChatGLM</summary>
### Quickstart
Run the following command to quickly spin up a ChatGLM server:
@@ -142,23 +136,21 @@ Run the following command to quickly spin up a ChatGLM server:
```bash
openllm start thudm/chatglm-6b --trust-remote-code
```
You can run the following code in a different terminal to interact with the server:
```python
import openllm_client
client = openllm_client.HTTPClient('http://localhost:3000')
client.generate('What are large language models?')
```
> **Note:** Any ChatGLM variants can be deployed with OpenLLM. Visit the [HuggingFace Model Hub](https://huggingface.co/models?sort=trending&search=chatglm) to see more ChatGLM-compatible models.
### Supported models
You can specify any of the following ChatGLM models via `openllm start`:
- [thudm/chatglm-6b](https://huggingface.co/thudm/chatglm-6b)
- [thudm/chatglm-6b-int8](https://huggingface.co/thudm/chatglm-6b-int8)
- [thudm/chatglm-6b-int4](https://huggingface.co/thudm/chatglm-6b-int4)
@@ -172,7 +164,6 @@ You can specify any of the following ChatGLM models via `openllm start`:
<summary>Cohere</summary>
### Quickstart
Run the following command to quickly spin up a Cohere server:
@@ -180,23 +171,21 @@ Run the following command to quickly spin up a Cohere server:
```bash
openllm start CohereForAI/c4ai-command-r-plus --trust-remote-code
```
You can run the following code in a different terminal to interact with the server:
```python
import openllm_client
client = openllm_client.HTTPClient('http://localhost:3000')
client.generate('What are large language models?')
```
> **Note:** Any Cohere variants can be deployed with OpenLLM. Visit the [HuggingFace Model Hub](https://huggingface.co/models?sort=trending&search=commandr) to see more Cohere-compatible models.
### Supported models
You can specify any of the following Cohere models via `openllm start`:
- [CohereForAI/c4ai-command-r-plus](https://huggingface.co/CohereForAI/c4ai-command-r-plus)
- [CohereForAI/c4ai-command-r-v01](https://huggingface.co/CohereForAI/c4ai-command-r-v01)
@@ -206,7 +195,6 @@ You can specify any of the following Cohere models via `openllm start`:
<summary>Dbrx</summary>
### Quickstart
Run the following command to quickly spin up a Dbrx server:
@@ -214,23 +202,21 @@ Run the following command to quickly spin up a Dbrx server:
```bash
openllm start databricks/dbrx-instruct --trust-remote-code
```
You can run the following code in a different terminal to interact with the server:
```python
import openllm_client
client = openllm_client.HTTPClient('http://localhost:3000')
client.generate('What are large language models?')
```
> **Note:** Any Dbrx variants can be deployed with OpenLLM. Visit the [HuggingFace Model Hub](https://huggingface.co/models?sort=trending&search=dbrx) to see more Dbrx-compatible models.
### Supported models
You can specify any of the following Dbrx models via `openllm start`:
- [databricks/dbrx-instruct](https://huggingface.co/databricks/dbrx-instruct)
- [databricks/dbrx-base](https://huggingface.co/databricks/dbrx-base)
@@ -240,7 +226,6 @@ You can specify any of the following Dbrx models via `openllm start`:
<summary>DollyV2</summary>
### Quickstart
Run the following command to quickly spin up a DollyV2 server:
@@ -248,23 +233,21 @@ Run the following command to quickly spin up a DollyV2 server:
```bash
openllm start databricks/dolly-v2-3b --trust-remote-code
```
You can run the following code in a different terminal to interact with the server:
```python
import openllm_client
client = openllm_client.HTTPClient('http://localhost:3000')
client.generate('What are large language models?')
```
> **Note:** Any DollyV2 variants can be deployed with OpenLLM. Visit the [HuggingFace Model Hub](https://huggingface.co/models?sort=trending&search=dolly_v2) to see more DollyV2-compatible models.
### Supported models
You can specify any of the following DollyV2 models via `openllm start`:
- [databricks/dolly-v2-3b](https://huggingface.co/databricks/dolly-v2-3b)
- [databricks/dolly-v2-7b](https://huggingface.co/databricks/dolly-v2-7b)
- [databricks/dolly-v2-12b](https://huggingface.co/databricks/dolly-v2-12b)
@@ -275,7 +258,6 @@ You can specify any of the following DollyV2 models via `openllm start`:
<summary>Falcon</summary>
### Quickstart
Run the following command to quickly spin up a Falcon server:
@@ -283,23 +265,21 @@ Run the following command to quickly spin up a Falcon server:
```bash
openllm start tiiuae/falcon-7b --trust-remote-code
```
You can run the following code in a different terminal to interact with the server:
```python
import openllm_client
client = openllm_client.HTTPClient('http://localhost:3000')
client.generate('What are large language models?')
```
> **Note:** Any Falcon variants can be deployed with OpenLLM. Visit the [HuggingFace Model Hub](https://huggingface.co/models?sort=trending&search=falcon) to see more Falcon-compatible models.
### Supported models
You can specify any of the following Falcon models via `openllm start`:
- [tiiuae/falcon-7b](https://huggingface.co/tiiuae/falcon-7b)
- [tiiuae/falcon-40b](https://huggingface.co/tiiuae/falcon-40b)
- [tiiuae/falcon-7b-instruct](https://huggingface.co/tiiuae/falcon-7b-instruct)
@@ -311,7 +291,6 @@ You can specify any of the following Falcon models via `openllm start`:
<summary>Gemma</summary>
### Quickstart
Run the following command to quickly spin up a Gemma server:
@@ -319,23 +298,21 @@ Run the following command to quickly spin up a Gemma server:
```bash
openllm start google/gemma-7b --trust-remote-code
```
You can run the following code in a different terminal to interact with the server:
```python
import openllm_client
client = openllm_client.HTTPClient('http://localhost:3000')
client.generate('What are large language models?')
```
> **Note:** Any Gemma variants can be deployed with OpenLLM. Visit the [HuggingFace Model Hub](https://huggingface.co/models?sort=trending&search=gemma) to see more Gemma-compatible models.
### Supported models
You can specify any of the following Gemma models via `openllm start`:
- [google/gemma-7b](https://huggingface.co/google/gemma-7b)
- [google/gemma-7b-it](https://huggingface.co/google/gemma-7b-it)
- [google/gemma-2b](https://huggingface.co/google/gemma-2b)
@@ -347,7 +324,6 @@ You can specify any of the following Gemma models via `openllm start`:
<summary>GPTNeoX</summary>
### Quickstart
Run the following command to quickly spin up a GPTNeoX server:
@@ -355,23 +331,21 @@ Run the following command to quickly spin up a GPTNeoX server:
```bash
openllm start eleutherai/gpt-neox-20b --trust-remote-code
```
You can run the following code in a different terminal to interact with the server:
```python
import openllm_client
client = openllm_client.HTTPClient('http://localhost:3000')
client.generate('What are large language models?')
```
> **Note:** Any GPTNeoX variants can be deployed with OpenLLM. Visit the [HuggingFace Model Hub](https://huggingface.co/models?sort=trending&search=gpt_neox) to see more GPTNeoX-compatible models.
### Supported models
You can specify any of the following GPTNeoX models via `openllm start`:
- [eleutherai/gpt-neox-20b](https://huggingface.co/eleutherai/gpt-neox-20b)
</details>
@@ -380,7 +354,6 @@ You can specify any of the following GPTNeoX models via `openllm start`:
<summary>Llama</summary>
### Quickstart
Run the following command to quickly spin up a Llama server:
@@ -388,23 +361,21 @@ Run the following command to quickly spin up a Llama server:
```bash
openllm start NousResearch/llama-2-7b-hf --trust-remote-code
```
You can run the following code in a different terminal to interact with the server:
```python
import openllm_client
client = openllm_client.HTTPClient('http://localhost:3000')
client.generate('What are large language models?')
```
> **Note:** Any Llama variants can be deployed with OpenLLM. Visit the [HuggingFace Model Hub](https://huggingface.co/models?sort=trending&search=llama) to see more Llama-compatible models.
### Supported models
You can specify any of the following Llama models via `openllm start`:
- [meta-llama/Llama-2-70b-chat-hf](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf)
- [meta-llama/Llama-2-13b-chat-hf](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf)
- [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf)
@@ -424,7 +395,6 @@ You can specify any of the following Llama models via `openllm start`:
<summary>Mistral</summary>
### Quickstart
Run the following command to quickly spin up a Mistral server:
@@ -432,23 +402,21 @@ Run the following command to quickly spin up a Mistral server:
```bash
openllm start mistralai/Mistral-7B-Instruct-v0.1 --trust-remote-code
```
You can run the following code in a different terminal to interact with the server:
```python
import openllm_client
client = openllm_client.HTTPClient('http://localhost:3000')
client.generate('What are large language models?')
```
> **Note:** Any Mistral variants can be deployed with OpenLLM. Visit the [HuggingFace Model Hub](https://huggingface.co/models?sort=trending&search=mistral) to see more Mistral-compatible models.
### Supported models
You can specify any of the following Mistral models via `openllm start`:
- [HuggingFaceH4/zephyr-7b-alpha](https://huggingface.co/HuggingFaceH4/zephyr-7b-alpha)
- [HuggingFaceH4/zephyr-7b-beta](https://huggingface.co/HuggingFaceH4/zephyr-7b-beta)
- [mistralai/Mistral-7B-Instruct-v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2)
@@ -461,7 +429,6 @@ You can specify any of the following Mistral models via `openllm start`:
<summary>Mixtral</summary>
### Quickstart
Run the following command to quickly spin up a Mixtral server:
@@ -469,23 +436,21 @@ Run the following command to quickly spin up a Mixtral server:
```bash
openllm start mistralai/Mixtral-8x7B-Instruct-v0.1 --trust-remote-code
```
You can run the following code in a different terminal to interact with the server:
```python
import openllm_client
client = openllm_client.HTTPClient('http://localhost:3000')
client.generate('What are large language models?')
```
> **Note:** Any Mixtral variants can be deployed with OpenLLM. Visit the [HuggingFace Model Hub](https://huggingface.co/models?sort=trending&search=mixtral) to see more Mixtral-compatible models.
### Supported models
You can specify any of the following Mixtral models via `openllm start`:
- [mistralai/Mixtral-8x7B-Instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1)
- [mistralai/Mixtral-8x7B-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1)
@@ -495,7 +460,6 @@ You can specify any of the following Mixtral models via `openllm start`:
<summary>MPT</summary>
### Quickstart
Run the following command to quickly spin up an MPT server:
@@ -503,23 +467,21 @@ Run the following command to quickly spin up a MPT server:
```bash
openllm start mosaicml/mpt-7b-instruct --trust-remote-code
```
You can run the following code in a different terminal to interact with the server:
```python
import openllm_client
client = openllm_client.HTTPClient('http://localhost:3000')
client.generate('What are large language models?')
```
> **Note:** Any MPT variants can be deployed with OpenLLM. Visit the [HuggingFace Model Hub](https://huggingface.co/models?sort=trending&search=mpt) to see more MPT-compatible models.
### Supported models
You can specify any of the following MPT models via `openllm start`:
- [mosaicml/mpt-7b](https://huggingface.co/mosaicml/mpt-7b)
- [mosaicml/mpt-7b-instruct](https://huggingface.co/mosaicml/mpt-7b-instruct)
- [mosaicml/mpt-7b-chat](https://huggingface.co/mosaicml/mpt-7b-chat)
@@ -534,7 +496,6 @@ You can specify any of the following MPT models via `openllm start`:
<summary>OPT</summary>
### Quickstart
Run the following command to quickly spin up an OPT server:
@@ -542,23 +503,21 @@ Run the following command to quickly spin up a OPT server:
```bash
openllm start facebook/opt-1.3b
```
You can run the following code in a different terminal to interact with the server:
```python
import openllm_client
client = openllm_client.HTTPClient('http://localhost:3000')
client.generate('What are large language models?')
```
> **Note:** Any OPT variants can be deployed with OpenLLM. Visit the [HuggingFace Model Hub](https://huggingface.co/models?sort=trending&search=opt) to see more OPT-compatible models.
### Supported models
You can specify any of the following OPT models via `openllm start`:
- [facebook/opt-125m](https://huggingface.co/facebook/opt-125m)
- [facebook/opt-350m](https://huggingface.co/facebook/opt-350m)
- [facebook/opt-1.3b](https://huggingface.co/facebook/opt-1.3b)
@@ -572,7 +531,6 @@ You can specify any of the following OPT models via `openllm start`:
<summary>Phi</summary>
### Quickstart
Run the following command to quickly spin up a Phi server:
@@ -580,23 +538,21 @@ Run the following command to quickly spin up a Phi server:
```bash
openllm start microsoft/Phi-3-mini-4k-instruct --trust-remote-code
```
You can run the following code in a different terminal to interact with the server:
```python
import openllm_client
client = openllm_client.HTTPClient('http://localhost:3000')
client.generate('What are large language models?')
```
> **Note:** Any Phi variants can be deployed with OpenLLM. Visit the [HuggingFace Model Hub](https://huggingface.co/models?sort=trending&search=phi) to see more Phi-compatible models.
### Supported models
You can specify any of the following Phi models via `openllm start`:
- [microsoft/Phi-3-mini-4k-instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct)
- [microsoft/Phi-3-mini-128k-instruct](https://huggingface.co/microsoft/Phi-3-mini-128k-instruct)
- [microsoft/Phi-3-small-8k-instruct](https://huggingface.co/microsoft/Phi-3-small-8k-instruct)
@@ -610,7 +566,6 @@ You can specify any of the following Phi models via `openllm start`:
<summary>Qwen</summary>
### Quickstart
Run the following command to quickly spin up a Qwen server:
@@ -618,23 +573,21 @@ Run the following command to quickly spin up a Qwen server:
```bash
openllm start qwen/Qwen-7B-Chat --trust-remote-code
```
You can run the following code in a different terminal to interact with the server:
```python
import openllm_client
client = openllm_client.HTTPClient('http://localhost:3000')
client.generate('What are large language models?')
```
> **Note:** Any Qwen variants can be deployed with OpenLLM. Visit the [HuggingFace Model Hub](https://huggingface.co/models?sort=trending&search=qwen) to see more Qwen-compatible models.
### Supported models
You can specify any of the following Qwen models via `openllm start`:
- [qwen/Qwen-7B-Chat](https://huggingface.co/qwen/Qwen-7B-Chat)
- [qwen/Qwen-7B-Chat-Int8](https://huggingface.co/qwen/Qwen-7B-Chat-Int8)
- [qwen/Qwen-7B-Chat-Int4](https://huggingface.co/qwen/Qwen-7B-Chat-Int4)
@@ -648,7 +601,6 @@ You can specify any of the following Qwen models via `openllm start`:
<summary>StableLM</summary>
### Quickstart
Run the following command to quickly spin up a StableLM server:
@@ -656,23 +608,21 @@ Run the following command to quickly spin up a StableLM server:
```bash
openllm start stabilityai/stablelm-tuned-alpha-3b --trust-remote-code
```
You can run the following code in a different terminal to interact with the server:
```python
import openllm_client
client = openllm_client.HTTPClient('http://localhost:3000')
client.generate('What are large language models?')
```
> **Note:** Any StableLM variants can be deployed with OpenLLM. Visit the [HuggingFace Model Hub](https://huggingface.co/models?sort=trending&search=stablelm) to see more StableLM-compatible models.
### Supported models
You can specify any of the following StableLM models via `openllm start`:
- [stabilityai/stablelm-tuned-alpha-3b](https://huggingface.co/stabilityai/stablelm-tuned-alpha-3b)
- [stabilityai/stablelm-tuned-alpha-7b](https://huggingface.co/stabilityai/stablelm-tuned-alpha-7b)
- [stabilityai/stablelm-base-alpha-3b](https://huggingface.co/stabilityai/stablelm-base-alpha-3b)
@@ -684,7 +634,6 @@ You can specify any of the following StableLM models via `openllm start`:
<summary>StarCoder</summary>
### Quickstart
Run the following command to quickly spin up a StarCoder server:
@@ -692,23 +641,21 @@ Run the following command to quickly spin up a StarCoder server:
```bash
openllm start bigcode/starcoder --trust-remote-code
```
You can run the following code in a different terminal to interact with the server:
```python
import openllm_client
client = openllm_client.HTTPClient('http://localhost:3000')
client.generate('What are large language models?')
```
> **Note:** Any StarCoder variants can be deployed with OpenLLM. Visit the [HuggingFace Model Hub](https://huggingface.co/models?sort=trending&search=starcoder) to see more StarCoder-compatible models.
### Supported models
You can specify any of the following StarCoder models via `openllm start`:
- [bigcode/starcoder](https://huggingface.co/bigcode/starcoder)
- [bigcode/starcoderbase](https://huggingface.co/bigcode/starcoderbase)
@@ -718,7 +665,6 @@ You can specify any of the following StarCoder models via `openllm start`:
<summary>Yi</summary>
### Quickstart
Run the following command to quickly spin up a Yi server:
@@ -726,23 +672,21 @@ Run the following command to quickly spin up a Yi server:
```bash
openllm start 01-ai/Yi-6B --trust-remote-code
```
You can run the following code in a different terminal to interact with the server:
```python
import openllm_client
client = openllm_client.HTTPClient('http://localhost:3000')
client.generate('What are large language models?')
```
> **Note:** Any Yi variants can be deployed with OpenLLM. Visit the [HuggingFace Model Hub](https://huggingface.co/models?sort=trending&search=yi) to see more Yi-compatible models.
### Supported models
You can specify any of the following Yi models via `openllm start`:
- [01-ai/Yi-6B](https://huggingface.co/01-ai/Yi-6B)
- [01-ai/Yi-34B](https://huggingface.co/01-ai/Yi-34B)
- [01-ai/Yi-6B-200K](https://huggingface.co/01-ai/Yi-6B-200K)
@@ -830,8 +774,6 @@ async for it in llm.astream('What is the difference between a duck and a goose?
print(it, flush=True, end='')
```
<!-- hatch-fancy-pypi-readme interim stop -->
<!-- hatch-fancy-pypi-readme meta start -->
## 🚀 Deploying models to production


@@ -1,84 +0,0 @@
[envs.default]
installer = "uv"
type = "virtual"
dependencies = [
"openllm-core @ {root:uri}/openllm-core",
"openllm-client @ {root:uri}/openllm-client",
"openllm[chatglm,fine-tune] @ {root:uri}/openllm-python",
# NOTE: To run all hooks
"pre-commit",
# NOTE: towncrier for changelog
"towncrier",
"tomlkit",
# NOTE: For fancy PyPI readme
"hatch-fancy-pypi-readme",
# NOTE: For working with shell pipe
"plumbum",
# The below sync with mypyc deps and pre-commit mypy
"types-psutil",
"types-tabulate",
"types-PyYAML",
"types-protobuf",
]
[envs.default.scripts]
changelog = "towncrier build --version main --draft"
inplace-changelog = "towncrier build --version main --keep"
setup = [
"pre-commit install",
"- ln -s .python-version-default .python-version",
]
quality = [
"bash ./local.sh --tools",
"- pre-commit run --all-files",
"- pnpm format",
]
tool = ["quality", "bash ./clean.sh", 'python ./cz.py']
[envs.tests]
installer = "uv"
type = "virtual"
dependencies = [
"openllm @ {root:uri}/openllm-python",
"openllm-core @ {root:uri}/openllm-core",
"openllm-client @ {root:uri}/openllm-client",
# NOTE: Tests strategies with Hypothesis and pytest, and snapshot testing with syrupy
"coverage[toml]>=6.5",
"filelock>=3.7.1",
"pytest",
"pytest-cov",
"pytest-mock",
"pytest-randomly",
"pytest-rerunfailures",
"pytest-asyncio>=0.21.0",
"pytest-xdist[psutil]",
"trustme",
"hypothesis",
"syrupy",
]
skip-install = false
template = "tests"
[envs.tests.scripts]
_run_script = "pytest --cov --cov-report={env:COVERAGE_REPORT:term-missing} --cov-config=pyproject.toml -vv"
python = "_run_script -r aR -x {args:openllm-python/tests}"
[envs.tests.overrides]
env.GITHUB_ACTIONS.env-vars = "COVERAGE_REPORT="
[envs.coverage]
dependencies = ["coverage[toml]>=6.5", "lxml", "orjson"]
detached = true
[envs.coverage.scripts]
combine = "coverage combine {args}"
generate-summary = "python tools/generate-coverage.py"
report-uncovered-html = "coverage html --skip-covered --skip-empty {args}"
report-xml = "coverage xml {args}"
write-summary-report = "python tools/write-coverage-report.py"
[envs.ui]
detached = true
[envs.ui.scripts]
server = "openllm start {args:facebook/opt-250m} --working-dir {root:uri} --cors --debug"
[envs.ci]
detached = true
[envs.ci.scripts]
client-stubs = "bash openllm-client/generate-grpc-stubs"
compile = "bash ./tools/compile.sh {args}"
recompile = ["bash ./clean.sh", "compile"]
edi = "bash local.sh"
lock = ["bash tools/lock-actions.sh"]


@@ -1,338 +0,0 @@
import os
import random
import sys
from collections import defaultdict
from typing import Annotated, Optional
import questionary
import typer
from openllm_next.accelerator_spec import (
DeploymentTarget,
can_run,
get_local_machine_spec,
)
from openllm_next.analytic import DO_NOT_TRACK, OpenLLMTyper
from openllm_next.clean import app as clean_app
from openllm_next.cloud import deploy as cloud_deploy
from openllm_next.cloud import ensure_cloud_context, get_cloud_machine_spec
from openllm_next.common import CHECKED, INTERACTIVE, VERBOSE_LEVEL, output
from openllm_next.local import run as local_run
from openllm_next.local import serve as local_serve
from openllm_next.model import app as model_app
from openllm_next.model import ensure_bento, list_bento
from openllm_next.repo import app as repo_app
app = OpenLLMTyper(
help="`openllm hello` to get started. "
"OpenLLM is a CLI tool to manage and deploy open source LLMs and"
" get an OpenAI API compatible chat server in seconds.",
)
app.add_typer(repo_app, name="repo")
app.add_typer(model_app, name="model")
app.add_typer(clean_app, name="clean")
def _select_bento_name(models, target):
from tabulate import tabulate
options = []
model_infos = [
[model.repo.name, model.name, can_run(model, target)] for model in models
]
model_name_groups = defaultdict(lambda: 0)
for repo, name, score in model_infos:
model_name_groups[(repo, name)] += score
table_data = [
[name, repo, CHECKED if score > 0 else ""]
for (repo, name), score in model_name_groups.items()
]
if not table_data:
output("No model found", style="red")
raise typer.Exit(1)
table = tabulate(
table_data,
headers=["model", "repo", "locally runnable"],
).split("\n")
headers = f"{table[0]}\n {table[1]}"
options.append(questionary.Separator(headers))
for table_data, table_line in zip(table_data, table[2:]):
options.append(questionary.Choice(table_line, value=table_data[:2]))
selected = questionary.select("Select a model", options).ask()
if selected is None:
raise typer.Exit(1)
return selected
def _select_bento_version(models, target, bento_name, repo):
from tabulate import tabulate
model_infos = [
[model, can_run(model, target)]
for model in models
if model.name == bento_name and model.repo.name == repo
]
table_data = [
[model.tag, CHECKED if score > 0 else ""]
for model, score in model_infos
if model.name == bento_name and model.repo.name == repo
]
if not table_data:
output(f"No model found for {bento_name} in {repo}", style="red")
raise typer.Exit(1)
table = tabulate(
table_data,
headers=["version", "locally runnable"],
).split("\n")
options = []
options.append(questionary.Separator(f"{table[0]}\n {table[1]}"))
for table_data, table_line in zip(model_infos, table[2:]):
options.append(questionary.Choice(table_line, value=table_data))
selected = questionary.select("Select a version", options).ask()
if selected is None:
raise typer.Exit(1)
return selected
def _select_target(bento, targets):
from tabulate import tabulate
options = []
targets.sort(key=lambda x: can_run(bento, x), reverse=True)
if not targets:
output(
"No available instance type, check your bentocloud account",
style="red",
)
raise typer.Exit(1)
table = tabulate(
[
[
target.name,
target.accelerators_repr,
f"${target.price}",
CHECKED if can_run(bento, target) else "insufficient res.",
]
for target in targets
],
headers=["instance type", "accelerator", "price/hr", "deployable"],
).split("\n")
options.append(questionary.Separator(f"{table[0]}\n {table[1]}"))
for target, line in zip(targets, table[2:]):
options.append(
questionary.Choice(
f"{line}",
value=target,
)
)
selected = questionary.select("Select an instance type", options).ask()
if selected is None:
raise typer.Exit(1)
return selected
def _select_action(bento, score):
if score > 0:
options = [
questionary.Separator("Available actions"),
questionary.Choice(
"0. Run the model in terminal",
value="run",
shortcut_key="0",
),
questionary.Separator(f" $ openllm run {bento}"),
questionary.Separator(" "),
questionary.Choice(
"1. Serve the model locally and get a chat server",
value="serve",
shortcut_key="1",
),
questionary.Separator(f" $ openllm serve {bento}"),
questionary.Separator(" "),
questionary.Choice(
"2. Deploy the model to bentocloud and get a scalable chat server",
value="deploy",
shortcut_key="2",
),
questionary.Separator(f" $ openllm deploy {bento}"),
]
else:
options = [
questionary.Separator("Available actions"),
questionary.Choice(
"0. Run the model in terminal",
value="run",
disabled="insufficient res.",
shortcut_key="0",
),
questionary.Separator(f" $ openllm run {bento}"),
questionary.Separator(" "),
questionary.Choice(
"1. Serve the model locally and get a chat server",
value="serve",
disabled="insufficient res.",
shortcut_key="1",
),
questionary.Separator(f" $ openllm serve {bento}"),
questionary.Separator(" "),
questionary.Choice(
"2. Deploy the model to bentocloud and get a scalable chat server",
value="deploy",
shortcut_key="2",
),
questionary.Separator(f" $ openllm deploy {bento}"),
]
action = questionary.select("Select an action", options).ask()
if action is None:
raise typer.Exit(1)
if action == "run":
try:
local_run(bento)
finally:
output("\nUse this command to run the action again:", style="green")
output(f" $ openllm run {bento}", style="orange")
elif action == "serve":
try:
local_serve(bento)
finally:
output("\nUse this command to run the action again:", style="green")
output(f" $ openllm serve {bento}", style="orange")
elif action == "deploy":
ensure_cloud_context()
targets = get_cloud_machine_spec()
target = _select_target(bento, targets)
try:
cloud_deploy(bento, target)
finally:
output("\nUse this command to run the action again:", style="green")
output(
f" $ openllm deploy {bento} --instance-type {target.name}",
style="orange",
)
@app.command(help="get started interactively")
def hello():
INTERACTIVE.set(True)
VERBOSE_LEVEL.set(20)
target = get_local_machine_spec()
output(f" Detected Platform: {target.platform}", style="green")
if target.accelerators:
output(" Detected Accelerators: ", style="green")
for a in target.accelerators:
output(f" - {a.model} {a.memory_size}GB", style="green")
else:
output(" Detected Accelerators: None", style="yellow")
models = list_bento()
if not models:
output(
"No model found, you probably need to update the model repo:",
style="red",
)
output(
" $ openllm repo update",
style="orange",
)
raise typer.Exit(1)
bento_name, repo = _select_bento_name(models, target)
bento, score = _select_bento_version(models, target, bento_name, repo)
_select_action(bento, score)
@app.command(help="start an OpenAI API compatible chat server and chat in browser")
def serve(
model: Annotated[str, typer.Argument()] = "",
repo: Optional[str] = None,
port: int = 3000,
verbose: bool = False,
):
if verbose:
VERBOSE_LEVEL.set(20)
target = get_local_machine_spec()
bento = ensure_bento(model, target=target, repo_name=repo)
local_serve(bento, port=port)
@app.command(help="run the model and chat in terminal")
def run(
model: Annotated[str, typer.Argument()] = "",
repo: Optional[str] = None,
port: Optional[int] = None,
timeout: int = 600,
verbose: bool = False,
):
if verbose:
VERBOSE_LEVEL.set(20)
target = get_local_machine_spec()
bento = ensure_bento(model, target=target, repo_name=repo)
if port is None:
port = random.randint(30000, 40000)
local_run(bento, port=port, timeout=timeout)
@app.command(
help="deploy an production-ready OpenAI API compatible chat server to bentocloud ($100 free credit)",
)
def deploy(
model: Annotated[str, typer.Argument()] = "",
instance_type: Optional[str] = None,
repo: Optional[str] = None,
verbose: bool = False,
):
if verbose:
VERBOSE_LEVEL.set(20)
bento = ensure_bento(model, repo_name=repo)
if instance_type is not None:
cloud_deploy(bento, DeploymentTarget(name=instance_type))
return
targets = get_cloud_machine_spec()
targets = filter(lambda x: can_run(bento, x) > 0, targets)
targets = sorted(targets, key=lambda x: can_run(bento, x), reverse=True)
if not targets:
output(
"No available instance type, check your bentocloud account",
style="red",
)
raise typer.Exit(1)
target = targets[0]
output(
f"Recommended instance type: {target.name}",
style="green",
)
cloud_deploy(bento, target)
@app.callback(invoke_without_command=True)
def typer_callback(
verbose: int = 0,
do_not_track: bool = typer.Option(
False,
"--do-not-track",
help="Whether to disable usage tracking",
envvar=DO_NOT_TRACK,
),
):
if verbose:
VERBOSE_LEVEL.set(verbose)
if do_not_track:
os.environ[DO_NOT_TRACK] = str(True)
def main():
if sys.version_info < (3, 9):
output("Python 3.8 or higher is required", style="red")
sys.exit(1)
app()
if __name__ == "__main__":
main()

View File

@@ -1,166 +0,0 @@
from __future__ import annotations
import functools
import math
import typing
from types import SimpleNamespace
import psutil
from openllm_next.common import BentoInfo, DeploymentTarget, output
class Accelerator(SimpleNamespace):
model: str
memory_size: float
def __gt__(self, other):
return self.memory_size > other.memory_size
def __eq__(self, other):
return self.memory_size == other.memory_size
def __repr__(self):
return f"{self.model}({self.memory_size}GB)"
class Resource(SimpleNamespace):
cpu: int = 0
memory: float
gpu: int = 0
gpu_type: str = ""
def __hash__(self):
return hash((self.cpu, self.memory, self.gpu, self.gpu_type))
def __bool__(self):
return any(value is not None for value in self.__dict__.values())
ACCELERATOR_SPEC_DICT: dict[str, dict] = {
"nvidia-gtx-1650": {"model": "GTX 1650", "memory_size": 4.0},
"nvidia-gtx-1060": {"model": "GTX 1060", "memory_size": 6.0},
"nvidia-gtx-1080-ti": {"model": "GTX 1080 Ti", "memory_size": 11.0},
"nvidia-rtx-3060": {"model": "RTX 3060", "memory_size": 12.0},
"nvidia-rtx-3060-ti": {"model": "RTX 3060 Ti", "memory_size": 8.0},
"nvidia-rtx-3070-ti": {"model": "RTX 3070 Ti", "memory_size": 8.0},
"nvidia-rtx-3080": {"model": "RTX 3080", "memory_size": 10.0},
"nvidia-rtx-3080-ti": {"model": "RTX 3080 Ti", "memory_size": 12.0},
"nvidia-rtx-3090": {"model": "RTX 3090", "memory_size": 24.0},
"nvidia-rtx-4070-ti": {"model": "RTX 4070 Ti", "memory_size": 12.0},
"nvidia-tesla-p4": {"model": "P4", "memory_size": 8.0},
"nvidia-tesla-p100": {"model": "P100", "memory_size": 16.0},
"nvidia-tesla-k80": {"model": "K80", "memory_size": 12.0},
"nvidia-tesla-t4": {"model": "T4", "memory_size": 16.0},
"nvidia-tesla-v100": {"model": "V100", "memory_size": 16.0},
"nvidia-l4": {"model": "L4", "memory_size": 24.0},
"nvidia-tesla-l4": {"model": "L4", "memory_size": 24.0},
"nvidia-tesla-a10g": {"model": "A10G", "memory_size": 24.0},
"nvidia-a100-80g": {"model": "A100", "memory_size": 80.0},
"nvidia-a100-80gb": {"model": "A100", "memory_size": 80.0},
"nvidia-tesla-a100": {"model": "A100", "memory_size": 40.0},
}
ACCELERATOR_SPECS: dict[str, Accelerator] = {
key: Accelerator(**value) for key, value in ACCELERATOR_SPEC_DICT.items()
}
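# Probe the local machine: macOS reports no accelerators; on Linux/Windows, NVIDIA GPUs are queried via pynvml.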
@functools.lru_cache
def get_local_machine_spec():
if psutil.MACOS:
return DeploymentTarget(accelerators=[], source="local", platform="macos")
if psutil.WINDOWS:
platform = "windows"
elif psutil.LINUX:
platform = "linux"
else:
raise NotImplementedError(f"Unsupported platform")
from pynvml import (
nvmlDeviceGetCount,
nvmlDeviceGetCudaComputeCapability,
nvmlDeviceGetHandleByIndex,
nvmlDeviceGetMemoryInfo,
nvmlDeviceGetName,
nvmlInit,
nvmlShutdown,
)
try:
nvmlInit()
device_count = nvmlDeviceGetCount()
accelerators: list[Accelerator] = []
for i in range(device_count):
handle = nvmlDeviceGetHandleByIndex(i)
name = nvmlDeviceGetName(handle)
memory_info = nvmlDeviceGetMemoryInfo(handle)
accelerators.append(
Accelerator(
model=name, memory_size=math.ceil(int(memory_info.total) / 1024**3)
)
)
compute_capability = nvmlDeviceGetCudaComputeCapability(handle)
if compute_capability < (7, 5):
output(
f"GPU {name} with compute capability {compute_capability} "
"may not be supported, 7.5 or higher is recommended. check "
"https://developer.nvidia.com/cuda-gpus for more information",
style="yellow",
)
nvmlShutdown()
return DeploymentTarget(
accelerators=accelerators,
source="local",
platform=platform,
)
except Exception as e:
output(
f"Failed to get local GPU info. Ensure nvidia driver is installed to enable local GPU deployment",
style="yellow",
)
output(f"Error: {e}", style="red", level=20)
return DeploymentTarget(accelerators=[], source="local", platform=platform)
@functools.lru_cache()
def can_run(
bento: typing.Union[Resource, BentoInfo],
target: typing.Optional[DeploymentTarget] = None,
) -> float:
"""
Calculate if the bento can be deployed on the target.
"""
if target is None:
target = get_local_machine_spec()
resource_spec = Resource(**(bento.bento_yaml["services"][0]["config"].get("resources", {})))
labels = bento.bento_yaml.get("labels", {})
platforms = labels.get("platforms", "linux").split(",")
if target.platform not in platforms:
return 0.0
# return 0.5 if no resource spec is provided
if not resource_spec:
return 0.5
if resource_spec.gpu > 0:
required_gpu = ACCELERATOR_SPECS[resource_spec.gpu_type]
filtered_accelerators = [
ac
for ac in target.accelerators
if ac.memory_size >= required_gpu.memory_size
]
if resource_spec.gpu > len(filtered_accelerators):
return 0.0
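# Score = required GPU memory x GPU count over the target's total GPU memory, so tighter-fitting targets rank higher.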
return (
required_gpu.memory_size
* resource_spec.gpu
/ sum(ac.memory_size for ac in target.accelerators)
)
if target.accelerators:
return 0.01 / sum(ac.memory_size for ac in target.accelerators)
return 1.0

View File

@@ -1,118 +0,0 @@
from __future__ import annotations
import functools
import os
import re
import time
import typing
from abc import ABC
import attr
import click
import typer
import typer.core
DO_NOT_TRACK = "BENTOML_DO_NOT_TRACK"
class EventMeta(ABC):
@property
def event_name(self):
# camel case to snake case
event_name = re.sub(r"(?<!^)(?=[A-Z])", "_", self.__class__.__name__).lower()
# remove "_event" suffix
suffix_to_remove = "_event"
if event_name.endswith(suffix_to_remove):
event_name = event_name[: -len(suffix_to_remove)]
return event_name
@attr.define
class CliEvent(EventMeta):
cmd_group: str
cmd_name: str
duration_in_ms: float = attr.field(default=0)
error_type: typing.Optional[str] = attr.field(default=None)
return_code: typing.Optional[int] = attr.field(default=None)
@attr.define
class OpenllmCliEvent(CliEvent):
pass
class OrderedCommands(typer.core.TyperGroup):
def list_commands(self, _: click.Context) -> typing.Iterable[str]:
return list(self.commands)
class OpenLLMTyper(typer.Typer):
def __init__(self, *args: typing.Any, **kwargs: typing.Any):
no_args_is_help = kwargs.pop("no_args_is_help", True)
context_settings = kwargs.pop("context_settings", {})
if "help_option_names" not in context_settings:
context_settings["help_option_names"] = ("-h", "--help")
if "max_content_width" not in context_settings:
context_settings["max_content_width"] = int(
os.environ.get("COLUMNS", str(120))
)
klass = kwargs.pop("cls", OrderedCommands)
super().__init__(
*args,
cls=klass,
no_args_is_help=no_args_is_help,
context_settings=context_settings,
**kwargs,
)
def command(self, *args: typing.Any, **kwargs: typing.Any):
def decorator(f):
@functools.wraps(f)
@click.pass_context
def wrapped(ctx: click.Context, *args, **kwargs):
from bentoml._internal.utils.analytics import track
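# every CLI command is wrapped to emit an anonymous usage event (duration, error type) unless BENTOML_DO_NOT_TRACK is set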
do_not_track = (
os.environ.get(DO_NOT_TRACK, str(False)).lower() == "true"
)
# attribute the event to the right command group
command_name = ctx.info_name
if ctx.parent.parent is not None:
# e.g. `openllm model list`
command_group = ctx.parent.info_name
elif ctx.parent.info_name == ctx.find_root().info_name:
# e.g. `openllm run`
command_group = "openllm"
else:
command_group = "openllm"
if do_not_track:
return f(*args, **kwargs)
start_time = time.time_ns()
try:
return_value = f(*args, **kwargs)
duration_in_ns = time.time_ns() - start_time
track(
OpenllmCliEvent(
cmd_group=command_group,
cmd_name=command_name,
duration_in_ms=duration_in_ns / 1e6,
)
)
return return_value
except BaseException as e:
duration_in_ns = time.time_ns() - start_time
track(
OpenllmCliEvent(
cmd_group=command_group,
cmd_name=command_name,
duration_in_ms=duration_in_ns / 1e6,
error_type=type(e).__name__,
return_code=2 if isinstance(e, KeyboardInterrupt) else 1,
)
)
raise
return typer.Typer.command(self, *args, **kwargs)(wrapped)
return decorator

View File

@@ -1,75 +0,0 @@
import pathlib
import shutil
import questionary
from openllm_next.analytic import OpenLLMTyper
from openllm_next.common import (
CONFIG_FILE,
REPO_DIR,
VENV_DIR,
VERBOSE_LEVEL,
output,
)
app = OpenLLMTyper(help="clean up and release disk space used by OpenLLM")
HUGGINGFACE_CACHE = pathlib.Path.home() / ".cache" / "huggingface" / "hub"
@app.command(help="Clean up all the cached models from huggingface")
def model_cache(verbose: bool = False):
if verbose:
VERBOSE_LEVEL.set(20)
used_space = sum(f.stat().st_size for f in HUGGINGFACE_CACHE.rglob("*"))
sure = questionary.confirm(
f"This will remove all models cached by Huggingface (~{used_space / 1024 / 1024:.2f}MB), are you sure?"
).ask()
if not sure:
return
shutil.rmtree(HUGGINGFACE_CACHE, ignore_errors=True)
output("All models cached by Huggingface have been removed", style="green")
@app.command(help="Clean up all the virtual environments created by OpenLLM")
def venvs(verbose: bool = False):
if verbose:
VERBOSE_LEVEL.set(20)
used_space = sum(f.stat().st_size for f in VENV_DIR.rglob("*"))
sure = questionary.confirm(
f"This will remove all virtual environments created by OpenLLM (~{used_space / 1024 / 1024:.2f}MB), are you sure?"
).ask()
if not sure:
return
shutil.rmtree(VENV_DIR, ignore_errors=True)
output("All virtual environments have been removed", style="green")
@app.command(help="Clean up all the repositories cloned by OpenLLM")
def repos(verbose: bool = False):
if verbose:
VERBOSE_LEVEL.set(20)
shutil.rmtree(REPO_DIR, ignore_errors=True)
output("All repositories have been removed", style="green")
@app.command(help="Reset configurations to default")
def configs(verbose: bool = False):
if verbose:
VERBOSE_LEVEL.set(20)
shutil.rmtree(CONFIG_FILE, ignore_errors=True)
output("All configurations have been reset", style="green")
@app.command(
name="all",
help="Clean up all above and bring OpenLLM to a fresh start",
)
def all_cache(verbose: bool = False):
if verbose:
VERBOSE_LEVEL.set(20)
repos()
venvs()
model_cache()
configs()

View File

@@ -1,174 +0,0 @@
import json
import os
import pathlib
import shutil
import subprocess
import typing
import typer
from openllm_next.accelerator_spec import ACCELERATOR_SPECS
from openllm_next.analytic import OpenLLMTyper
from openllm_next.common import (
INTERACTIVE,
BentoInfo,
DeploymentTarget,
output,
run_command,
)
app = OpenLLMTyper()
def _get_deploy_cmd(bento: BentoInfo, target: typing.Optional[DeploymentTarget] = None):
cmd = ["bentoml", "deploy", bento.bentoml_tag]
env = {
"BENTOML_HOME": f"{bento.repo.path}/bentoml",
}
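# collect env vars declared in bento.yaml: prompt when interactive, otherwise fall back to the current environment or the declared default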
required_envs = bento.bento_yaml.get("envs", [])
required_env_names = [env["name"] for env in required_envs if "name" in env]
if required_env_names:
output(
f"This model requires the following environment variables to run: {repr(required_env_names)}",
style="yellow",
)
for env_info in bento.bento_yaml.get("envs", []):
if "name" not in env_info:
continue
if os.environ.get(env_info["name"]):
default = os.environ[env_info["name"]]
elif "value" in env_info:
default = env_info["value"]
else:
default = ""
if INTERACTIVE.get():
import questionary
value = questionary.text(
f"{env_info['name']}:",
default=default,
).ask()
else:
if default == "":
output(
f"Environment variable {env_info['name']} is required but not provided",
style="red",
)
raise typer.Exit(1)
else:
value = default
if value is None:
raise typer.Exit(1)
cmd += ["--env", f"{env_info['name']}={value}"]
if target:
cmd += ["--instance-type", target.name]
assert (pathlib.Path.home() / "bentoml" / ".yatai.yaml").exists()
shutil.copy(
pathlib.Path.home() / "bentoml" / ".yatai.yaml",
bento.repo.path / "bentoml" / ".yatai.yaml",
)
return cmd, env, None
def ensure_cloud_context():
import questionary
cmd = ["bentoml", "cloud", "current-context"]
try:
result = subprocess.check_output(cmd, stderr=subprocess.DEVNULL)
context = json.loads(result)
output(f" bentoml already logged in: {context['endpoint']}", style="green")
except subprocess.CalledProcessError:
output(" bentoml not logged in", style="red")
if not INTERACTIVE.get():
output(
"\n log in to BentoML with:",
)
output(
" $ bentoml cloud login",
style="orange",
)
output("")
output(
""" * You may need to visit https://cloud.bentoml.com to create an account. You can also bring your own BentoML cluster (BYOC) for your team via https://bentoml.com/contact""",
style="yellow",
)
raise typer.Exit(1)
else:
action = questionary.select(
"Choose an action:",
choices=[
"I have a BentoCloud account",
"get an account in two minutes",
],
).ask()
if action is None:
raise typer.Exit(1)
elif action == "get an account in two minutes":
output(
"Please visit https://cloud.bentoml.com to get your token",
style="yellow",
)
endpoint = questionary.text(
"Enter the endpoint: (similar to https://my-org.cloud.bentoml.com)"
).ask()
if endpoint is None:
raise typer.Exit(1)
token = questionary.text(
"Enter your token: (similar to cniluaxxxxxxxx)"
).ask()
if token is None:
raise typer.Exit(1)
cmd = [
"bentoml",
"cloud",
"login",
"--api-token",
token,
"--endpoint",
endpoint,
]
try:
result = subprocess.check_output(cmd)
output(" Logged in successfully", style="green")
except subprocess.CalledProcessError:
output(" Failed to login", style="red")
raise typer.Exit(1)
def get_cloud_machine_spec():
ensure_cloud_context()
cmd = ["bentoml", "deployment", "list-instance-types", "-o", "json"]
try:
result = subprocess.check_output(cmd, stderr=subprocess.DEVNULL)
instance_types = json.loads(result)
return [
DeploymentTarget(
source="cloud",
name=it["name"],
price=it["price"],
platform="linux",
accelerators=(
[ACCELERATOR_SPECS[it["gpu_type"]] for _ in range(int(it["gpu"]))]
if it.get("gpu") and it["gpu_type"] in ACCELERATOR_SPECS
else []
),
)
for it in instance_types
]
except (subprocess.CalledProcessError, json.JSONDecodeError):
output("Failed to get cloud instance types", style="red")
return []
def deploy(bento: BentoInfo, target: DeploymentTarget):
ensure_cloud_context()
cmd, env, cwd = _get_deploy_cmd(bento, target)
run_command(cmd, env=env, cwd=cwd)

View File

@@ -1,422 +0,0 @@
from __future__ import annotations
import asyncio
import functools
import hashlib
import io
import json
import os
import pathlib
import signal
import subprocess
import sys
import sysconfig
import typing
from contextlib import asynccontextmanager, contextmanager
from types import SimpleNamespace
import typer
import typer.core
ERROR_STYLE = "red"
SUCCESS_STYLE = "green"
CLLAMA_HOME = pathlib.Path.home() / ".openllm_next"
REPO_DIR = CLLAMA_HOME / "repos"
TEMP_DIR = CLLAMA_HOME / "temp"
VENV_DIR = CLLAMA_HOME / "venv"
REPO_DIR.mkdir(exist_ok=True, parents=True)
TEMP_DIR.mkdir(exist_ok=True, parents=True)
VENV_DIR.mkdir(exist_ok=True, parents=True)
CONFIG_FILE = CLLAMA_HOME / "config.json"
CHECKED = ""
T = typing.TypeVar("T")
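# Minimal stack-based context variable: set() pushes a value, patch() temporarily overrides it within a with-block.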
class ContextVar(typing.Generic[T]):
def __init__(self, default: T):
self._stack: list[T] = []
self._default = default
def get(self) -> T:
if self._stack:
return self._stack[-1]
return self._default
def set(self, value):
self._stack.append(value)
@contextmanager
def patch(self, value):
self._stack.append(value)
try:
yield
finally:
self._stack.pop()
VERBOSE_LEVEL = ContextVar(10)
INTERACTIVE = ContextVar(False)
FORCE = ContextVar(False)
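# Print helper: drops messages above the current VERBOSE_LEVEL and pretty-prints non-string payloads as YAML.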
def output(content, level=0, style=None, end=None):
import questionary
if level > VERBOSE_LEVEL.get():
return
if not isinstance(content, str):
import pyaml
out = io.StringIO()
pyaml.pprint(
content,
dst=out,
sort_dicts=False,
sort_keys=False,
)
questionary.print(out.getvalue(), style=style, end="" if end is None else end)
out.close()
if isinstance(content, str):
questionary.print(content, style=style, end="\n" if end is None else end)
class Config(SimpleNamespace):
repos: dict[str, str] = {
"default": "git+https://github.com/bentoml/openllm-models@main"
}
default_repo: str = "default"
def tolist(self):
return dict(
repos=self.repos,
default_repo=self.default_repo,
)
def load_config():
if CONFIG_FILE.exists():
try:
with open(CONFIG_FILE) as f:
return Config(**json.load(f))
except json.JSONDecodeError:
return Config()
return Config()
def save_config(config):
with open(CONFIG_FILE, "w") as f:
json.dump(config.tolist(), f, indent=2)
class RepoInfo(SimpleNamespace):
name: str
path: pathlib.Path
url: str
server: str
owner: str
repo: str
branch: str
def tolist(self):
if VERBOSE_LEVEL.get() <= 0:
return f"{self.name} ({self.url})"
if VERBOSE_LEVEL.get() <= 10:
return dict(
name=self.name,
url=self.url,
path=str(self.path),
)
if VERBOSE_LEVEL.get() <= 20:
return dict(
name=self.name,
url=self.url,
path=str(self.path),
server=self.server,
owner=self.owner,
repo=self.repo,
branch=self.branch,
)
class BentoInfo(SimpleNamespace):
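# Wraps a bento directory at <repo>/bentoml/bentos/<name>/<version>; alias files next to a version map a short tag to it.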
repo: RepoInfo
path: pathlib.Path
alias: str = ""
def __str__(self):
if self.repo.name == "default":
return f"{self.tag}"
else:
return f"{self.repo.name}/{self.tag}"
def __hash__(self):
return md5(str(self.path))
@property
def tag(self) -> str:
if self.alias:
return f"{self.path.parent.name}:{self.alias}"
return f"{self.path.parent.name}:{self.path.name}"
@property
def bentoml_tag(self) -> str:
return f"{self.path.parent.name}:{self.path.name}"
@property
def name(self) -> str:
return self.path.parent.name
@property
def version(self) -> str:
return self.path.name
@property
def labels(self) -> dict[str, str]:
return self.bento_yaml["labels"]
@functools.cached_property
def bento_yaml(self) -> dict:
import yaml
bento_file = self.path / "bento.yaml"
return yaml.safe_load(bento_file.read_text())
@functools.cached_property
def platforms(self) -> list[str]:
return self.bento_yaml["labels"].get("platforms", "linux").split(",")
@functools.cached_property
def pretty_yaml(self) -> dict:
def _pretty_routes(routes):
return {
route["route"]: {
"input": {
k: v["type"] for k, v in route["input"]["properties"].items()
},
"output": route["output"]["type"],
}
for route in routes
}
if len(self.bento_yaml["services"]) == 1:
pretty_yaml = {
"apis": _pretty_routes(self.bento_yaml["schema"]["routes"]),
"resources": self.bento_yaml["services"][0]["config"]["resources"],
"envs": self.bento_yaml["envs"],
"platforms": self.platforms,
}
return pretty_yaml
return self.bento_yaml
@functools.cached_property
def pretty_gpu(self) -> str:
from openllm_next.accelerator_spec import ACCELERATOR_SPECS
try:
resources = self.bento_yaml["services"][0]["config"]["resources"]
if resources["gpu"] > 1:
acc = ACCELERATOR_SPECS[resources["gpu_type"]]
return f"{acc.memory_size:.0f}Gx{resources['gpu']}"
elif resources["gpu"] > 0:
acc = ACCELERATOR_SPECS[resources["gpu_type"]]
return f"{acc.memory_size:.0f}G"
except KeyError:
pass
return ""
def tolist(self):
verbose = VERBOSE_LEVEL.get()
if verbose <= 0:
return str(self)
if verbose <= 10:
return dict(
tag=self.tag,
repo=self.repo.tolist(),
path=str(self.path),
model_card=self.pretty_yaml,
)
if verbose <= 20:
return dict(
tag=self.tag,
repo=self.repo.tolist(),
path=str(self.path),
bento_yaml=self.bento_yaml,
)
class VenvSpec(SimpleNamespace):
python_version: str
python_packages: dict[str, str]
name_prefix = ""
def __hash__(self):
return md5(
# self.python_version,
*sorted(self.python_packages),
)
class Accelerator(SimpleNamespace):
model: str
memory_size: float
def __gt__(self, other):
return self.memory_size > other.memory_size
def __eq__(self, other):
return self.memory_size == other.memory_size
class DeploymentTarget(SimpleNamespace):
source: str = "local"
name: str = "local"
price: str = ""
platform = "linux"
accelerators: list[Accelerator]
def __hash__(self):
return hash(self.source)
@property
def accelerators_repr(self) -> str:
accs = {a.model for a in self.accelerators}
if len(accs) == 0:
return "null"
if len(accs) == 1:
a = self.accelerators[0]
return f"{a.model} x{len(self.accelerators)}"
return ", ".join((f"{a.model}" for a in self.accelerators))
def run_command(
cmd,
cwd=None,
env=None,
copy_env=True,
venv=None,
silent=False,
) -> subprocess.CompletedProcess:
import shlex
env = env or {}
cmd = [str(c) for c in cmd]
bin_dir = "Scripts" if os.name == "nt" else "bin"
if not silent:
output("\n")
if cwd:
output(f"$ cd {cwd}", style="orange")
if env:
for k, v in env.items():
output(f"$ export {k}={shlex.quote(v)}", style="orange")
if venv:
output(f"$ source {venv / 'bin' / 'activate'}", style="orange")
output(f"$ {' '.join(cmd)}", style="orange")
if venv:
py = venv / bin_dir / f"python{sysconfig.get_config_var('EXE')}"
else:
py = sys.executable
if copy_env:
env = {**os.environ, **env}
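# route `bentoml`/`python` invocations through the selected interpreter so they run inside the target venv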
if cmd and cmd[0] == "bentoml":
cmd = [py, "-m", "bentoml"] + cmd[1:]
if cmd and cmd[0] == "python":
cmd = [py] + cmd[1:]
try:
if silent:
return subprocess.run( # type: ignore
cmd,
cwd=cwd,
env=env,
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
)
else:
return subprocess.run(
cmd,
cwd=cwd,
env=env,
)
except subprocess.CalledProcessError:
output("Command failed", style="red")
raise typer.Exit(1)
async def stream_command_output(stream, style="gray"):
async for line in stream:
output(line.decode(), style=style, end="")
@asynccontextmanager
async def async_run_command(
cmd,
cwd=None,
env=None,
copy_env=True,
venv=None,
silent=True,
):
import shlex
env = env or {}
cmd = [str(c) for c in cmd]
if not silent:
output("\n")
if cwd:
output(f"$ cd {cwd}", style="orange")
if env:
for k, v in env.items():
output(f"$ export {k}={shlex.quote(v)}", style="orange")
if venv:
output(f"$ source {venv / 'bin' / 'activate'}", style="orange")
output(f"$ {' '.join(cmd)}", style="orange")
if venv:
py = venv / "bin" / "python"
else:
py = sys.executable
if copy_env:
env = {**os.environ, **env}
if cmd and cmd[0] == "bentoml":
cmd = [py, "-m", "bentoml"] + cmd[1:]
if cmd and cmd[0] == "python":
cmd = [py] + cmd[1:]
proc = None
try:
proc = await asyncio.create_subprocess_shell(
" ".join(map(str, cmd)),
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE,
cwd=cwd,
env=env,
)
yield proc
except subprocess.CalledProcessError:
output("Command failed", style="red")
raise typer.Exit(1)
finally:
if proc:
proc.send_signal(signal.SIGINT)
await proc.wait()
def md5(*strings: str) -> int:
m = hashlib.md5()
for s in strings:
m.update(s.encode())
return int(m.hexdigest(), 16)

View File

@@ -1,117 +0,0 @@
import asyncio
import time
import httpx
from openllm_next.common import (
BentoInfo,
async_run_command,
output,
run_command,
stream_command_output,
)
from openllm_next.venv import ensure_venv
def _get_serve_cmd(bento: BentoInfo, port: int = 3000):
cmd = ["bentoml", "serve", bento.bentoml_tag]
if port != 3000:
cmd += ["--port", str(port)]
env = {
"BENTOML_HOME": f"{bento.repo.path}/bentoml",
}
return cmd, env, None
def serve(
bento: BentoInfo,
port: int = 3000,
):
venv = ensure_venv(bento)
cmd, env, cwd = _get_serve_cmd(bento, port=port)
run_command(cmd, env=env, cwd=cwd, venv=venv)
async def _run_model(
bento: BentoInfo,
port: int = 3000,
timeout: int = 600,
):
venv = ensure_venv(bento)
cmd, env, cwd = _get_serve_cmd(bento, port)
async with async_run_command(
cmd,
env=env,
cwd=cwd,
venv=venv,
silent=False,
) as server_proc:
output(f"Model server started {server_proc.pid}")
stdout_streamer = None
stderr_streamer = None
start_time = time.time()
output("Model loading...", style="green")
for _ in range(timeout):
try:
resp = httpx.get(f"http://localhost:{port}/readyz", timeout=3)
if resp.status_code == 200:
break
except httpx.RequestError:
if time.time() - start_time > 30:
if not stdout_streamer:
stdout_streamer = asyncio.create_task(
stream_command_output(server_proc.stdout, style="gray")
)
if not stderr_streamer:
stderr_streamer = asyncio.create_task(
stream_command_output(server_proc.stderr, style="#BD2D0F")
)
await asyncio.sleep(1)
else:
output("Model failed to load", style="red")
server_proc.terminate()
return
if stdout_streamer:
stdout_streamer.cancel()
if stderr_streamer:
stderr_streamer.cancel()
output("Model is ready", style="green")
messages: list[dict[str, str]] = []
from openai import AsyncOpenAI
client = AsyncOpenAI(base_url=f"http://localhost:{port}/v1", api_key="local")
model_id = (await client.models.list()).data[0].id
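# minimal chat REPL: keep the running transcript and stream completions from the local OpenAI-compatible endpoint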
while True:
try:
message = input("user: ")
if message == "":
output("empty message, please enter something", style="yellow")
continue
messages.append(dict(role="user", content=message))
output("assistant: ", end="", style="lightgreen")
assistant_message = ""
stream = await client.chat.completions.create(
model=model_id,
messages=messages, # type: ignore
stream=True,
)
async for chunk in stream:
text = chunk.choices[0].delta.content or ""
assistant_message += text
output(text, end="", style="lightgreen")
messages.append(dict(role="assistant", content=assistant_message))
output("")
except KeyboardInterrupt:
break
output("\nStopping model server...", style="green")
output("Stopped model server", style="green")
def run(bento: BentoInfo, port: int = 3000, timeout: int = 600):
asyncio.run(_run_model(bento, port=port, timeout=timeout))

View File

@@ -1,173 +0,0 @@
import typing
from typing import Optional
import tabulate
import typer
from openllm_next.accelerator_spec import DeploymentTarget, can_run
from openllm_next.analytic import OpenLLMTyper
from openllm_next.common import (
FORCE,
VERBOSE_LEVEL,
BentoInfo,
load_config,
output,
)
from openllm_next.repo import ensure_repo_updated, parse_repo_url
app = OpenLLMTyper(help="manage models")
@app.command()
def get(
tag: str,
repo: Optional[str] = None,
verbose: bool = False,
):
if verbose:
VERBOSE_LEVEL.set(20)
bento_info = ensure_bento(tag, repo_name=repo)
if bento_info:
output(bento_info)
@app.command(name="list")
def list_(
tag: Optional[str] = None,
repo: Optional[str] = None,
verbose: bool = False,
):
if verbose:
VERBOSE_LEVEL.set(20)
bentos = list_bento(tag=tag, repo_name=repo)
bentos.sort(key=lambda x: x.name)
seen = set()
def is_seen(value):
if value in seen:
return True
seen.add(value)
return False
table = tabulate.tabulate(
[
[
"" if is_seen(bento.name) else bento.name,
bento.tag,
bento.repo.name,
bento.pretty_gpu,
",".join(bento.platforms),
]
for bento in bentos
],
headers=["model", "version", "repo", "required VRAM", "platforms"],
)
output(table)
def ensure_bento(
model: str,
target: Optional[DeploymentTarget] = None,
repo_name: Optional[str] = None,
) -> BentoInfo:
bentos = list_bento(model, repo_name=repo_name)
if len(bentos) == 0:
output(f"No model found for {model}", style="red")
raise typer.Exit(1)
if len(bentos) == 1:
if FORCE.get():
output(f"Found model {bentos[0]}", style="green")
return bentos[0]
if target is None:
return bentos[0]
if can_run(bentos[0], target) <= 0:
return bentos[0]
output(f"Found model {bentos[0]}", style="green")
return bentos[0]
if target is None:
output(
f"Multiple models match {model}, did you mean one of these?",
style="red",
)
for bento in bentos:
output(f" {bento}")
raise typer.Exit(1)
filtered = [bento for bento in bentos if can_run(bento, target) > 0]
if len(filtered) == 0:
output(f"No deployment target found for {model}", style="red")
raise typer.Exit(1)
if len(bentos) > 1:
output(
f"Multiple models match {model}, did you mean one of these?",
style="red",
)
for bento in bentos:
output(f" {bento}")
raise typer.Exit(1)
return bentos[0]
def list_bento(
tag: typing.Optional[str] = None,
repo_name: typing.Optional[str] = None,
include_alias: bool = False,
) -> typing.List[BentoInfo]:
ensure_repo_updated()
if repo_name is not None:
config = load_config()
if repo_name not in config.repos:
output(f"Repo `{repo_name}` not found, did you mean one of these?")
for repo_name in config.repos:
output(f" {repo_name}")
raise typer.Exit(1)
if not tag:
glob_pattern = "bentoml/bentos/*/*"
elif ":" in tag:
bento_name, version = tag.split(":")
glob_pattern = f"bentoml/bentos/{bento_name}/{version}"
else:
glob_pattern = f"bentoml/bentos/{tag}/*"
model_list = []
config = load_config()
for _repo_name, repo_url in config.repos.items():
if repo_name is not None and _repo_name != repo_name:
continue
repo = parse_repo_url(repo_url, _repo_name)
for path in repo.path.glob(glob_pattern):
if path.is_dir() and (path / "bento.yaml").exists():
model = BentoInfo(repo=repo, path=path)
elif path.is_file():
with open(path) as f:
origin_name = f.read().strip()
origin_path = path.parent / origin_name
model = BentoInfo(alias=path.name, repo=repo, path=origin_path)
else:
model = None
if model:
model_list.append(model)
model_list.sort(key=lambda x: x.tag)
if not include_alias:
seen = set()
model_list = [
x
for x in model_list
if not (
f"{x.bento_yaml['name']}:{x.bento_yaml['version']}" in seen
or seen.add(f"{x.bento_yaml['name']}:{x.bento_yaml['version']}")
)
]
return model_list

View File

@@ -1,203 +0,0 @@
import datetime
import re
import shutil
import pyaml
import questionary
import typer
from openllm_next.analytic import OpenLLMTyper
from openllm_next.common import (
INTERACTIVE,
REPO_DIR,
VERBOSE_LEVEL,
RepoInfo,
load_config,
output,
save_config,
)
UPDATE_INTERVAL = datetime.timedelta(days=3)
app = OpenLLMTyper(help="manage repos")
@app.command()
def list(verbose: bool = False):
if verbose:
VERBOSE_LEVEL.set(20)
config = load_config()
pyaml.pprint(
[parse_repo_url(repo, name) for name, repo in config.repos.items()],
sort_dicts=False,
sort_keys=False,
)
@app.command()
def remove(name: str):
config = load_config()
if name not in config.repos:
output(f"Repo {name} does not exist", style="red")
return
del config.repos[name]
save_config(config)
output(f"Repo {name} removed", style="green")
def _complete_alias(repo_name: str):
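# materialize each bento's `openllm_alias` label as alias files next to the version directory so short tags resolve to a concrete version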
from openllm_next.model import list_bento
for bento in list_bento(repo_name=repo_name):
alias = bento.labels.get("openllm_alias", "").strip()
if alias:
for a in alias.split(","):
with open(bento.path.parent / a, "w") as f:
f.write(bento.version)
@app.command()
def update():
import dulwich
import dulwich.errors
import dulwich.porcelain
config = load_config()
repos_in_use = set()
for repo_name, repo in config.repos.items():
repo = parse_repo_url(repo, repo_name)
repos_in_use.add((repo.server, repo.owner, repo.repo))
if repo.path.exists(): # TODO: use update instead of remove and clone
shutil.rmtree(repo.path, ignore_errors=True)
if not repo.path.exists():
repo.path.parent.mkdir(parents=True, exist_ok=True)
try:
dulwich.porcelain.clone(
f"https://{repo.server}/{repo.owner}/{repo.repo}.git",
str(repo.path),
checkout=True,
depth=1,
branch=repo.branch,
)
output("")
output(f"Repo `{repo.name}` updated", style="green")
except:
shutil.rmtree(repo.path, ignore_errors=True)
output(f"Failed to clone repo {repo.name}", style="red")
else:
try:
import dulwich.porcelain
dulwich.porcelain.pull(
str(repo.path),
f"https://{repo.server}/{repo.owner}/{repo.repo}.git",
refspecs=repo.branch,
force=True,
)
dulwich.porcelain.clean(str(repo.path), str(repo.path))
output("")
output(f"Repo `{repo.name}` updated", style="green")
except:
shutil.rmtree(repo.path, ignore_errors=True)
output(f"Failed to update repo {repo.name}", style="red")
for c in REPO_DIR.glob("*/*/*"):
repo_spec = tuple(c.parts[-3:])
if repo_spec not in repos_in_use:
shutil.rmtree(c, ignore_errors=True)
output(f"Removed unused repo cache {c}")
with open(REPO_DIR / "last_update", "w") as f:
f.write(datetime.datetime.now().isoformat())
for repo_name in config.repos:
_complete_alias(repo_name)
def ensure_repo_updated():
last_update_file = REPO_DIR / "last_update"
if not last_update_file.exists():
if INTERACTIVE.get():
choice = questionary.confirm(
"The repo cache has never been updated, do you want to update it to fetch the latest model list?"
).ask()
if choice:
update()
return
else:
output(
"The repo cache has never been updated, please run `openllm repo update` to fetch the latest model list",
style="red",
)
raise typer.Exit(1)
last_update = datetime.datetime.fromisoformat(last_update_file.read_text().strip())
if datetime.datetime.now() - last_update > UPDATE_INTERVAL:
if INTERACTIVE.get():
choice = questionary.confirm(
"The repo cache is outdated, do you want to update it to fetch the latest model list?"
).ask()
if choice:
update()
else:
output(
"The repo cache is outdated, please run `openllm repo update` to fetch the latest model list",
style="yellow",
)
GIT_REPO_RE = re.compile(
r"git\+https://(?P<server>.+)/(?P<owner>.+)/(?P<repo>.+?)(@(?P<branch>.+))?$"
)
def parse_repo_url(repo_url, repo_name=None) -> RepoInfo:
"""
parse the git repo url to server, owner, repo name, branch
>>> parse_repo_url("git+https://github.com/bentoml/bentovllm@main")
('github.com', 'bentoml', 'bentovllm', 'main')
>>> parse_repo_url("git+https://github.com/bentoml/bentovllm")
('github.com', 'bentoml', 'bentovllm', 'main')
"""
match = GIT_REPO_RE.match(repo_url)
if not match:
raise ValueError(f"Invalid git repo url: {repo_url}")
server = match.group("server")
owner = match.group("owner")
repo = match.group("repo")
branch = match.group("branch") or "main"
path = REPO_DIR / server / owner / repo
return RepoInfo(
name=repo if repo_name is None else repo_name,
url=repo_url,
server=server,
owner=owner,
repo=repo,
branch=branch,
path=path,
)
@app.command()
def add(name: str, repo: str):
name = name.lower()
if not name.isidentifier():
output(
f"Invalid repo name: {name}, should only contain letters, numbers and underscores",
style="red",
)
return
config = load_config()
if name in config.repos:
override = questionary.confirm(
f"Repo {name} already exists({config.repos[name]}), override?"
).ask()
if not override:
return
config.repos[name] = repo
save_config(config)
output(f"Repo {name} added", style="green")
if __name__ == "__main__":
app()

View File

@@ -1,164 +0,0 @@
import functools
import os
import pathlib
import shutil
import typing
from typing import Iterable
import typer
from openllm_next.common import (
VENV_DIR,
VERBOSE_LEVEL,
BentoInfo,
VenvSpec,
output,
run_command,
)
@functools.lru_cache
def _resolve_packages(requirement: typing.Union[pathlib.Path, str]):
from pip_requirements_parser import RequirementsFile
requirements_txt = RequirementsFile.from_file(
str(requirement),
include_nested=True,
)
return requirements_txt.requirements
def _filter_preheat_packages(requirements: Iterable) -> list[str]:
PREHEAT_PIP_PACKAGES = ["torch", "vllm"]
deps: list[str] = []
for req in requirements:
if (
req.is_editable
or req.is_local_path
or req.is_url
or req.is_wheel
or not req.name
or not req.specifier
):
continue
for sp in req.specifier:
if sp.operator == "==" and req.name in PREHEAT_PIP_PACKAGES:
assert req.line is not None
deps.append(req.line)
break
return deps
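# Each bento resolves to two layered venv specs: a small "preheat" spec (pinned torch/vllm, shared across bentos) plus the full requirements installed on top.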
@functools.lru_cache
def _resolve_bento_env_specs(bento: BentoInfo):
ver_file = bento.path / "env" / "python" / "version.txt"
assert ver_file.exists(), f"cannot find version file in {bento.path}"
lock_file = bento.path / "env" / "python" / "requirements.lock.txt"
if not lock_file.exists():
lock_file = bento.path / "env" / "python" / "requirements.txt"
reqs = _resolve_packages(lock_file)
preheat_packages = _filter_preheat_packages(reqs)
ver = ver_file.read_text().strip()
return (
VenvSpec(
python_version=ver,
python_packages=preheat_packages,
name_prefix=f"{bento.tag.replace(':', '_')}-1-",
),
VenvSpec(
python_version=ver,
python_packages=[v.line for v in reqs],
name_prefix=f"{bento.tag.replace(':', '_')}-2-",
),
)
def _get_lib_dir(venv: pathlib.Path) -> pathlib.Path:
if os.name == "nt":
return venv / "Lib/site-packages"
else:
return next(venv.glob("lib/python*")) / "site-packages"
def _ensure_venv(
env_spec: VenvSpec,
parent_venv: typing.Optional[pathlib.Path] = None,
) -> pathlib.Path:
venv = VENV_DIR / str(hash(env_spec))
if venv.exists() and not (venv / "DONE").exists():
shutil.rmtree(venv, ignore_errors=True)
if not venv.exists():
output(f"Installing model dependencies({venv})...", style="green")
venv_py = (
venv / "Scripts" / "python.exe"
if os.name == "nt"
else venv / "bin" / "python"
)
try:
run_command(
["python", "-m", "uv", "venv", venv],
silent=VERBOSE_LEVEL.get() < 10,
)
lib_dir = _get_lib_dir(venv)
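# expose the parent venv's site-packages through a .pth file so preheat packages are reused instead of reinstalled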
if parent_venv is not None:
parent_lib_dir = _get_lib_dir(parent_venv)
with open(lib_dir / f"{parent_venv.name}.pth", "w+") as f:
f.write(str(parent_lib_dir))
with open(venv / "requirements.txt", "w") as f:
f.write("\n".join(sorted(env_spec.python_packages)))
run_command(
[
"python",
"-m",
"uv",
"pip",
"install",
"-p",
str(venv_py),
"-r",
venv / "requirements.txt",
],
silent=VERBOSE_LEVEL.get() < 10,
)
with open(venv / "DONE", "w") as f:
f.write("DONE")
except Exception:
shutil.rmtree(venv, ignore_errors=True)
output(
f"Failed to install dependencies to {venv}. Cleaned up.",
style="red",
)
raise typer.Exit(1)
output(f"Successfully installed dependencies to {venv}.", style="green")
return venv
else:
return venv
def _ensure_venvs(env_spec_list: Iterable[VenvSpec]) -> pathlib.Path:
last_venv = None
for env_spec in env_spec_list:
last_venv = _ensure_venv(env_spec, last_venv)
assert last_venv is not None
return last_venv
def ensure_venv(bento: BentoInfo) -> pathlib.Path:
return _ensure_venvs(_resolve_bento_env_specs(bento))
def _check_venv(env_spec: VenvSpec) -> bool:
venv = VENV_DIR / str(hash(env_spec))
if not venv.exists():
return False
if venv.exists() and not (venv / "DONE").exists():
return False
return True
def check_venv(bento: BentoInfo) -> bool:
return all(_check_venv(env_spec) for env_spec in _resolve_bento_env_specs(bento))

View File

@@ -1,57 +0,0 @@
{
"name": "@bentoml/openllm-monorepo",
"version": "0.5.7",
"description": "OpenLLM: Operating LLMs in production",
"author": "Aaron Pham <29749331+aarnphm@users.noreply.github.com>",
"license": "Apache-2.0",
"repository": {
"type": "git",
"url": "git+https://github.com/bentoml/OpenLLM.git"
},
"bugs": {
"url": "https://github.com/bentoml/OpenLLM/issues"
},
"engines": {
"node": ">=18"
},
"type": "module",
"private": true,
"scripts": {
"lint": "eslint . --cache",
"check": "npx prettier . --check",
"format": "npx prettier . --write"
},
"packageManager": "pnpm@8.6.3",
"devDependencies": {
"@edge-runtime/vm": "3.1.6",
"@next/eslint-plugin-next": "13.5.5",
"@types/node": "20.8.7",
"@typescript-eslint/eslint-plugin": "6.10.0",
"@typescript-eslint/parser": "6.8.0",
"eslint": "^8.53.0",
"eslint-config-prettier": "9.0.0",
"eslint-plugin-import": "2.29.0",
"eslint-plugin-react": "7.33.2",
"eslint-plugin-react-hooks": "4.6.0",
"eslint-plugin-tailwindcss": "3.13.0",
"eslint-plugin-unicorn": "48.0.1",
"prettier": "^3.0.3",
"prettier-plugin-pkg": "0.18.0",
"prettier-plugin-tailwindcss": "0.5.6",
"turbo": "1.10.16",
"typescript": "5.2.2"
},
"pnpm": {
"overrides": {
"vitest": "^0.27.1",
"react": "^18.2.0",
"react-dom": "^18.2.0",
"@mdx-js/react": "^2.3.0",
"@vercel/analytics": "1.0.2",
"next-themes": "^0.2.1",
"next": "^13.4.8",
"@types/react": "^18.0.15",
"@types/react-dom": "^18.0.6"
}
}
}

View File

@@ -1,13 +1,34 @@
[build-system]
requires = ["setuptools>=42", "wheel"]
build-backend = "setuptools.build_meta"
[project]
name = "openllm-next"
version = "0.0.1"
description = "A description of your package."
authors = [{name = "oasiszero", email = "oasis0.com@gmail.com"}]
license = {file = "LICENSE"}
name = "openllm"
description = "OpenLLM: Self-hosting LLMs Made Easy."
readme = { file = "README.md", content-type = "text/markdown" }
authors = [{ name = "BentoML Team", email = "contact@bentoml.com" }]
dynamic = ["version"]
classifiers = [
"Development Status :: 5 - Production/Stable",
"Environment :: GPU :: NVIDIA CUDA",
"Environment :: GPU :: NVIDIA CUDA :: 12",
"Environment :: GPU :: NVIDIA CUDA :: 11.8",
"Environment :: GPU :: NVIDIA CUDA :: 11.7",
"License :: OSI Approved :: Apache Software License",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
"Topic :: Software Development :: Libraries",
"Operating System :: OS Independent",
"Intended Audience :: Developers",
"Intended Audience :: Science/Research",
"Intended Audience :: System Administrators",
"Typing :: Typed",
"Programming Language :: Python",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3 :: Only",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: Implementation :: CPython",
"Programming Language :: Python :: Implementation :: PyPy",
]
dependencies = [
"bentoml",
"typer",
@@ -22,13 +43,147 @@ dependencies = [
"uv",
"openai==1.35.9",
]
keywords = [
"MLOps",
"AI",
"BentoML",
"Model Serving",
"Model Deployment",
"LLMOps",
"Falcon",
"Vicuna",
"Llama 2",
"Fine tuning",
"Serverless",
"Large Language Model",
"Generative AI",
"StableLM",
"Alpaca",
"PyTorch",
"Mistral",
"vLLM",
"Transformers",
]
license = "Apache-2.0"
requires-python = ">=3.9"
[project.scripts]
openllm = "openllm_next.__main__:main"
openllm = "openllm.__main__:app"
[project.urls]
Blog = "https://modelserving.com"
Documentation = "https://github.com/bentoml/OpenLLM#readme"
GitHub = "https://github.com/bentoml/OpenLLM"
Homepage = "https://bentoml.com"
Tracker = "https://github.com/bentoml/OpenLLM/issues"
Twitter = "https://twitter.com/bentomlai"
[tool.typer]
src-dir = "openllm_next"
src-dir = "src/openllm"
[tool.isort]
multi_line_output = 3
include_trailing_comma = true
[build-system]
requires = [
"hatchling==1.25.0",
"hatch-vcs==0.4.0",
"hatch-fancy-pypi-readme==24.1.0",
]
build-backend = 'hatchling.build'
[tool.hatch.version]
source = "vcs"
fallback-version = "0.0.0"
[tool.hatch.build.hooks.vcs]
version-file = "src/openllm/_version.py"
[tool.hatch.version.raw-options]
git_describe_command = [
"git",
"describe",
"--dirty",
"--tags",
"--long",
"--first-parent",
]
version_scheme = "post-release"
fallback_version = "0.0.0"
[tool.hatch.metadata]
allow-direct-references = true
[tool.hatch.build.targets.wheel]
only-include = ["src/openllm"]
sources = ["src"]
[tool.hatch.build.targets.sdist]
exclude = ["/.git_archival.txt", "/.python-version-default"]
[tool.hatch.metadata.hooks.fancy-pypi-readme]
content-type = "text/markdown"
# PyPI doesn't support the <picture> tag.
[[tool.hatch.metadata.hooks.fancy-pypi-readme.fragments]]
text = """
<p align="center">
<a href="https://github.com/bentoml/openllm">
<img src="https://raw.githubusercontent.com/bentoml/openllm/main/.github/assets/main-banner.png" alt="Banner for OpenLLM" />
</a>
</p>
"""
[[tool.hatch.metadata.hooks.fancy-pypi-readme.fragments]]
end-before = "\n<!-- hatch-fancy-pypi-readme intro stop -->"
path = "README.md"
start-after = "<!-- hatch-fancy-pypi-readme intro start -->\n"
[[tool.hatch.metadata.hooks.fancy-pypi-readme.fragments]]
text = """
<p align="center">
<img src="https://raw.githubusercontent.com/bentoml/openllm/main/.github/assets/output.gif" alt="Gif showing OpenLLM Intro" />
</p>
"""
[[tool.hatch.metadata.hooks.fancy-pypi-readme.fragments]]
end-before = "\n<!-- hatch-fancy-pypi-readme meta stop -->"
path = "README.md"
start-after = "<!-- hatch-fancy-pypi-readme meta start -->\n"
[[tool.hatch.metadata.hooks.fancy-pypi-readme.fragments]]
text = """
## Release Information
"""
[[tool.hatch.metadata.hooks.fancy-pypi-readme.fragments]]
path = "CHANGELOG.md"
pattern = "\n(###.+?\n)## "
start-after = "<!-- towncrier release notes start -->"
[[tool.hatch.metadata.hooks.fancy-pypi-readme.fragments]]
text = """
---
[Click me for full changelog](https://github.com/bentoml/openllm/blob/main/CHANGELOG.md)
"""
[tool.towncrier]
directory = "changelog.d"
filename = "CHANGELOG.md"
issue_format = "[#{issue}](https://github.com/bentoml/openllm/issues/{issue})"
name = "openllm"
start_string = "<!-- towncrier release notes start -->\n"
template = "changelog.d/template.md.jinja"
title_format = ""
underlines = ["", "", ""]
[[tool.towncrier.section]]
path = ""
[[tool.towncrier.type]]
directory = "breaking"
name = "Backwards-incompatible Changes"
showcontent = true
[[tool.towncrier.type]]
directory = "deprecation"
name = "Deprecations"
showcontent = true
[[tool.towncrier.type]]
directory = "change"
name = "Changes"
showcontent = true
[[tool.towncrier.type]]
directory = "refactor"
name = "Refactor"
showcontent = true
[[tool.towncrier.type]]
directory = "feature"
name = "Features"
showcontent = true
[[tool.towncrier.type]]
directory = "fix"
name = "Bug fix"
showcontent = true

274
src/openllm/__main__.py Normal file
View File

@@ -0,0 +1,274 @@
import os
import random
import sys
import platform
import importlib.metadata
from collections import defaultdict
from typing import Annotated, Optional
import questionary
import typer
from openllm.accelerator_spec import DeploymentTarget, can_run, get_local_machine_spec
from openllm.analytic import DO_NOT_TRACK, OpenLLMTyper
from openllm.clean import app as clean_app
from openllm.cloud import deploy as cloud_deploy
from openllm.cloud import ensure_cloud_context, get_cloud_machine_spec
from openllm.common import CHECKED, INTERACTIVE, VERBOSE_LEVEL, output
from openllm.local import run as local_run
from openllm.local import serve as local_serve
from openllm.model import app as model_app
from openllm.model import ensure_bento, list_bento
from openllm.repo import app as repo_app
app = OpenLLMTyper(
help='`openllm hello` to get started. '
'OpenLLM is a CLI tool to manage and deploy open source LLMs and'
' get an OpenAI API compatible chat server in seconds.'
)
app.add_typer(repo_app, name='repo')
app.add_typer(model_app, name='model')
app.add_typer(clean_app, name='clean')
def _select_bento_name(models, target):
from tabulate import tabulate
options = []
model_infos = [[model.repo.name, model.name, can_run(model, target)] for model in models]
model_name_groups = defaultdict(lambda: 0)
for repo, name, score in model_infos:
model_name_groups[(repo, name)] += score
table_data = [[name, repo, CHECKED if score > 0 else ''] for (repo, name), score in model_name_groups.items()]
if not table_data:
output('No model found', style='red')
raise typer.Exit(1)
table = tabulate(table_data, headers=['model', 'repo', 'locally runnable']).split('\n')
headers = f'{table[0]}\n {table[1]}'
options.append(questionary.Separator(headers))
for row, table_line in zip(table_data, table[2:]):
options.append(questionary.Choice(table_line, value=row[:2]))
selected = questionary.select('Select a model', options).ask()
if selected is None:
raise typer.Exit(1)
return selected
def _select_bento_version(models, target, bento_name, repo):
from tabulate import tabulate
model_infos = [
[model, can_run(model, target)] for model in models if model.name == bento_name and model.repo.name == repo
]
table_data = [
[model.tag, CHECKED if score > 0 else '']
for model, score in model_infos
if model.name == bento_name and model.repo.name == repo
]
if not table_data:
output(f'No model found for {bento_name} in {repo}', style='red')
raise typer.Exit(1)
table = tabulate(table_data, headers=['version', 'locally runnable']).split('\n')
options = []
options.append(questionary.Separator(f'{table[0]}\n {table[1]}'))
for row, table_line in zip(model_infos, table[2:]):
options.append(questionary.Choice(table_line, value=row))
selected = questionary.select('Select a version', options).ask()
if selected is None:
raise typer.Exit(1)
return selected
def _select_target(bento, targets):
from tabulate import tabulate
options = []
targets.sort(key=lambda x: can_run(bento, x), reverse=True)
if not targets:
output('No available instance type, check your bentocloud account', style='red')
raise typer.Exit(1)
table = tabulate(
[
[
target.name,
target.accelerators_repr,
f'${target.price}',
CHECKED if can_run(bento, target) else 'insufficient res.',
]
for target in targets
],
headers=['instance type', 'accelerator', 'price/hr', 'deployable'],
).split('\n')
options.append(questionary.Separator(f'{table[0]}\n {table[1]}'))
for target, line in zip(targets, table[2:]):
options.append(questionary.Choice(f'{line}', value=target))
selected = questionary.select('Select an instance type', options).ask()
if selected is None:
raise typer.Exit(1)
return selected
def _select_action(bento, score):
if score > 0:
options = [
questionary.Separator('Available actions'),
questionary.Choice('0. Run the model in terminal', value='run', shortcut_key='0'),
questionary.Separator(f' $ openllm run {bento}'),
questionary.Separator(' '),
questionary.Choice('1. Serve the model locally and get a chat server', value='serve', shortcut_key='1'),
questionary.Separator(f' $ openllm serve {bento}'),
questionary.Separator(' '),
questionary.Choice(
'2. Deploy the model to bentocloud and get a scalable chat server', value='deploy', shortcut_key='2'
),
questionary.Separator(f' $ openllm deploy {bento}'),
]
else:
options = [
questionary.Separator('Available actions'),
questionary.Choice('0. Run the model in terminal', value='run', disabled='insufficient res.', shortcut_key='0'),
questionary.Separator(f' $ openllm run {bento}'),
questionary.Separator(' '),
questionary.Choice(
'1. Serve the model locally and get a chat server',
value='serve',
disabled='insufficient res.',
shortcut_key='1',
),
questionary.Separator(f' $ openllm serve {bento}'),
questionary.Separator(' '),
questionary.Choice(
'2. Deploy the model to bentocloud and get a scalable chat server', value='deploy', shortcut_key='2'
),
questionary.Separator(f' $ openllm deploy {bento}'),
]
action = questionary.select('Select an action', options).ask()
if action is None:
raise typer.Exit(1)
if action == 'run':
try:
local_run(bento)
finally:
output('\nUse this command to run the action again:', style='green')
output(f' $ openllm run {bento}', style='orange')
elif action == 'serve':
try:
local_serve(bento)
finally:
output('\nUse this command to run the action again:', style='green')
output(f' $ openllm serve {bento}', style='orange')
elif action == 'deploy':
ensure_cloud_context()
targets = get_cloud_machine_spec()
target = _select_target(bento, targets)
try:
cloud_deploy(bento, target)
finally:
output('\nUse this command to run the action again:', style='green')
output(f' $ openllm deploy {bento} --instance-type {target.name}', style='orange')
@app.command(help='get started interactively')
def hello():
INTERACTIVE.set(True)
VERBOSE_LEVEL.set(20)
target = get_local_machine_spec()
output(f' Detected Platform: {target.platform}', style='green')
if target.accelerators:
output(' Detected Accelerators: ', style='green')
for a in target.accelerators:
output(f' - {a.model} {a.memory_size}GB', style='green')
else:
output(' Detected Accelerators: None', style='yellow')
models = list_bento()
if not models:
output('No model found, you probably need to update the model repo:', style='red')
output(' $ openllm repo update', style='orange')
raise typer.Exit(1)
bento_name, repo = _select_bento_name(models, target)
bento, score = _select_bento_version(models, target, bento_name, repo)
_select_action(bento, score)
@app.command(help='start an OpenAI API compatible chat server and chat in browser')
def serve(
model: Annotated[str, typer.Argument()] = '', repo: Optional[str] = None, port: int = 3000, verbose: bool = False
):
if verbose:
VERBOSE_LEVEL.set(20)
target = get_local_machine_spec()
bento = ensure_bento(model, target=target, repo_name=repo)
local_serve(bento, port=port)
@app.command(help='run the model and chat in terminal')
def run(
model: Annotated[str, typer.Argument()] = '',
repo: Optional[str] = None,
port: Optional[int] = None,
timeout: int = 600,
verbose: bool = False,
):
if verbose:
VERBOSE_LEVEL.set(20)
target = get_local_machine_spec()
bento = ensure_bento(model, target=target, repo_name=repo)
if port is None:
port = random.randint(30000, 40000)
local_run(bento, port=port, timeout=timeout)
@app.command(help='deploy a production-ready OpenAI API compatible chat server to bentocloud ($100 free credit)')
def deploy(
model: Annotated[str, typer.Argument()] = '',
instance_type: Optional[str] = None,
repo: Optional[str] = None,
verbose: bool = False,
):
if verbose:
VERBOSE_LEVEL.set(20)
bento = ensure_bento(model, repo_name=repo)
if instance_type is not None:
cloud_deploy(bento, DeploymentTarget(name=instance_type))
return
targets = get_cloud_machine_spec()
targets = filter(lambda x: can_run(bento, x) > 0, targets)
targets = sorted(targets, key=lambda x: can_run(bento, x), reverse=True)
if not targets:
output('No available instance type, check your bentocloud account', style='red')
raise typer.Exit(1)
target = targets[0]
output(f'Recommended instance type: {target.name}', style='green')
cloud_deploy(bento, target)
@app.callback(invoke_without_command=True)
def typer_callback(
verbose: int = 0,
do_not_track: bool = typer.Option(
False, '--do-not-track', help='Whether to disable usage tracking', envvar=DO_NOT_TRACK
),
version: bool = typer.Option(False, '--version', '-v', help='Show version'),
):
if verbose:
VERBOSE_LEVEL.set(verbose)
if version:
output(
f"openllm, {importlib.metadata.version('openllm')}\nPython ({platform.python_implementation()}) {platform.python_version()}"
)
sys.exit(0)
if do_not_track:
os.environ[DO_NOT_TRACK] = str(True)
if __name__ == '__main__':
app()

View File

@@ -0,0 +1,144 @@
from __future__ import annotations
import functools
import math
import typing
from types import SimpleNamespace
import psutil
from openllm.common import BentoInfo, DeploymentTarget, output
class Accelerator(SimpleNamespace):
model: str
memory_size: float
def __gt__(self, other):
return self.memory_size > other.memory_size
def __eq__(self, other):
return self.memory_size == other.memory_size
def __repr__(self):
return f'{self.model}({self.memory_size}GB)'
class Resource(SimpleNamespace):
cpu: int = 0
memory: float
gpu: int = 0
gpu_type: str = ''
def __hash__(self):
return hash((self.cpu, self.memory, self.gpu, self.gpu_type))
def __bool__(self):
return any(value is not None for value in self.__dict__.values())
ACCELERATOR_SPEC_DICT: dict[str, dict] = {
'nvidia-gtx-1650': {'model': 'GTX 1650', 'memory_size': 4.0},
'nvidia-gtx-1060': {'model': 'GTX 1060', 'memory_size': 6.0},
'nvidia-gtx-1080-ti': {'model': 'GTX 1080 Ti', 'memory_size': 11.0},
'nvidia-rtx-3060': {'model': 'RTX 3060', 'memory_size': 12.0},
'nvidia-rtx-3060-ti': {'model': 'RTX 3060 Ti', 'memory_size': 8.0},
'nvidia-rtx-3070-ti': {'model': 'RTX 3070 Ti', 'memory_size': 8.0},
'nvidia-rtx-3080': {'model': 'RTX 3080', 'memory_size': 10.0},
'nvidia-rtx-3080-ti': {'model': 'RTX 3080 Ti', 'memory_size': 12.0},
'nvidia-rtx-3090': {'model': 'RTX 3090', 'memory_size': 24.0},
'nvidia-rtx-4070-ti': {'model': 'RTX 4070 Ti', 'memory_size': 12.0},
'nvidia-tesla-p4': {'model': 'P4', 'memory_size': 8.0},
'nvidia-tesla-p100': {'model': 'P100', 'memory_size': 16.0},
'nvidia-tesla-k80': {'model': 'K80', 'memory_size': 12.0},
'nvidia-tesla-t4': {'model': 'T4', 'memory_size': 16.0},
'nvidia-tesla-v100': {'model': 'V100', 'memory_size': 16.0},
'nvidia-l4': {'model': 'L4', 'memory_size': 24.0},
'nvidia-tesla-l4': {'model': 'L4', 'memory_size': 24.0},
'nvidia-tesla-a10g': {'model': 'A10G', 'memory_size': 24.0},
'nvidia-a100-80g': {'model': 'A100', 'memory_size': 80.0},
'nvidia-a100-80gb': {'model': 'A100', 'memory_size': 80.0},
'nvidia-tesla-a100': {'model': 'A100', 'memory_size': 40.0},
}
ACCELERATOR_SPECS: dict[str, Accelerator] = {key: Accelerator(**value) for key, value in ACCELERATOR_SPEC_DICT.items()}
@functools.lru_cache
def get_local_machine_spec():
if psutil.MACOS:
return DeploymentTarget(accelerators=[], source='local', platform='macos')
if psutil.WINDOWS:
platform = 'windows'
elif psutil.LINUX:
platform = 'linux'
else:
raise NotImplementedError('Unsupported platform')
from pynvml import (
nvmlDeviceGetCount,
nvmlDeviceGetCudaComputeCapability,
nvmlDeviceGetHandleByIndex,
nvmlDeviceGetMemoryInfo,
nvmlDeviceGetName,
nvmlInit,
nvmlShutdown,
)
try:
nvmlInit()
device_count = nvmlDeviceGetCount()
accelerators: list[Accelerator] = []
for i in range(device_count):
handle = nvmlDeviceGetHandleByIndex(i)
name = nvmlDeviceGetName(handle)
memory_info = nvmlDeviceGetMemoryInfo(handle)
accelerators.append(Accelerator(model=name, memory_size=math.ceil(int(memory_info.total) / 1024**3)))
compute_capability = nvmlDeviceGetCudaComputeCapability(handle)
if compute_capability < (7, 5):
output(
f'GPU {name} with compute capability {compute_capability} '
'may not be supported, 7.5 or higher is recommended. check '
'https://developer.nvidia.com/cuda-gpus for more information',
style='yellow',
)
nvmlShutdown()
return DeploymentTarget(accelerators=accelerators, source='local', platform=platform)
except Exception as e:
output(
'Failed to get local GPU info. Ensure nvidia driver is installed to enable local GPU deployment', style='yellow'
)
output(f'Error: {e}', style='red', level=20)
return DeploymentTarget(accelerators=[], source='local', platform=platform)
@functools.lru_cache()
def can_run(bento: typing.Union[Resource, BentoInfo], target: typing.Optional[DeploymentTarget] = None) -> float:
"""
Calculate if the bento can be deployed on the target.
"""
if target is None:
target = get_local_machine_spec()
resource_spec = Resource(**(bento.bento_yaml['services'][0]['config'].get('resources', {})))
labels = bento.bento_yaml.get('labels', {})
platforms = labels.get('platforms', 'linux').split(',')
if target.platform not in platforms:
return 0.0
# give a neutral score of 0.5 when the bento declares no resource requirements
if not resource_spec:
return 0.5
if resource_spec.gpu > 0:
required_gpu = ACCELERATOR_SPECS[resource_spec.gpu_type]
filtered_accelerators = [ac for ac in target.accelerators if ac.memory_size >= required_gpu.memory_size]
if resource_spec.gpu > len(filtered_accelerators):
return 0.0
return required_gpu.memory_size * resource_spec.gpu / sum(ac.memory_size for ac in target.accelerators)
if target.accelerators:
return 0.01 / sum(ac.memory_size for ac in target.accelerators)
return 1.0
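A minimal usage sketch of the helpers above (illustrative only; the local GPU list is an assumption, and it presumes the `openllm` package from this commit is importable):
# Illustrative: mirror the GPU-fit scoring that can_run() applies to a deployment target.
from openllm.accelerator_spec import ACCELERATOR_SPECS, Accelerator

required = ACCELERATOR_SPECS['nvidia-l4']                       # 24 GB per requested GPU
local_gpus = [Accelerator(model='RTX 3090', memory_size=24.0)]  # hypothetical local hardware

# same filtering rule as can_run(): only GPUs with enough memory count
usable = [gpu for gpu in local_gpus if gpu.memory_size >= required.memory_size]
if usable:
    # ratio of required memory to total available memory; closer to 1.0 means a tighter fit
    score = required.memory_size * 1 / sum(gpu.memory_size for gpu in local_gpus)
    print(f'fit score: {score:.2f}')  # -> fit score: 1.00
else:
    print('the requested GPU does not fit on this machine')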

102
src/openllm/analytic.py Normal file
View File

@@ -0,0 +1,102 @@
from __future__ import annotations
import functools
import os
import re
import time
import typing
from abc import ABC
import attr
import click
import typer
import typer.core
DO_NOT_TRACK = 'BENTOML_DO_NOT_TRACK'
class EventMeta(ABC):
@property
def event_name(self):
# camel case to snake case
event_name = re.sub(r'(?<!^)(?=[A-Z])', '_', self.__class__.__name__).lower()
# remove "_event" suffix
suffix_to_remove = '_event'
if event_name.endswith(suffix_to_remove):
event_name = event_name[: -len(suffix_to_remove)]
return event_name
@attr.define
class CliEvent(EventMeta):
cmd_group: str
cmd_name: str
duration_in_ms: float = attr.field(default=0)
error_type: typing.Optional[str] = attr.field(default=None)
return_code: typing.Optional[int] = attr.field(default=None)
@attr.define
class OpenllmCliEvent(CliEvent):
pass
class OrderedCommands(typer.core.TyperGroup):
def list_commands(self, _: click.Context) -> typing.Iterable[str]:
return list(self.commands)
class OpenLLMTyper(typer.Typer):
def __init__(self, *args: typing.Any, **kwargs: typing.Any):
no_args_is_help = kwargs.pop('no_args_is_help', True)
context_settings = kwargs.pop('context_settings', {})
if 'help_option_names' not in context_settings:
context_settings['help_option_names'] = ('-h', '--help')
if 'max_content_width' not in context_settings:
context_settings['max_content_width'] = int(os.environ.get('COLUMNS', str(120)))
klass = kwargs.pop('cls', OrderedCommands)
super().__init__(*args, cls=klass, no_args_is_help=no_args_is_help, context_settings=context_settings, **kwargs)
def command(self, *args: typing.Any, **kwargs: typing.Any):
def decorator(f):
@functools.wraps(f)
@click.pass_context
def wrapped(ctx: click.Context, *args, **kwargs):
from bentoml._internal.utils.analytics import track
do_not_track = os.environ.get(DO_NOT_TRACK, str(False)).lower() == 'true'
# so we know that the root program is openllm
command_name = ctx.info_name
if ctx.parent.parent is not None:
# openllm model list
command_group = ctx.parent.info_name
elif ctx.parent.info_name == ctx.find_root().info_name:
# openllm run
command_group = 'openllm'
else:
# fallback so command_group is always defined before it is used below
command_group = 'openllm'
if do_not_track:
return f(*args, **kwargs)
start_time = time.time_ns()
try:
return_value = f(*args, **kwargs)
duration_in_ns = time.time_ns() - start_time
track(OpenllmCliEvent(cmd_group=command_group, cmd_name=command_name, duration_in_ms=duration_in_ns / 1e6))
return return_value
except BaseException as e:
duration_in_ns = time.time_ns() - start_time
track(
OpenllmCliEvent(
cmd_group=command_group,
cmd_name=command_name,
duration_in_ms=duration_in_ns / 1e6,
error_type=type(e).__name__,
return_code=2 if isinstance(e, KeyboardInterrupt) else 1,
)
)
raise
return typer.Typer.command(self, *args, **kwargs)(wrapped)
return decorator
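A small sketch of how the decorator is meant to be consumed (the `hello` command is hypothetical; tracking can be disabled with `BENTOML_DO_NOT_TRACK=true`):
# Hypothetical sub-command built on OpenLLMTyper; the wrapper above times the call
# and emits an OpenllmCliEvent unless BENTOML_DO_NOT_TRACK is set to true.
from openllm.analytic import OpenLLMTyper

demo = OpenLLMTyper(help='demo app')

@demo.command(help='say hello')
def hello(name: str = 'world'):
    print(f'hello, {name}')

if __name__ == '__main__':
    demo()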

66
src/openllm/clean.py Normal file
View File

@@ -0,0 +1,66 @@
import pathlib
import shutil
import questionary
from openllm.analytic import OpenLLMTyper
from openllm.common import CONFIG_FILE, REPO_DIR, VENV_DIR, VERBOSE_LEVEL, output
app = OpenLLMTyper(help='clean up and release disk space used by OpenLLM')
HUGGINGFACE_CACHE = pathlib.Path.home() / '.cache' / 'huggingface' / 'hub'
@app.command(help='Clean up all the cached models from huggingface')
def model_cache(verbose: bool = False):
if verbose:
VERBOSE_LEVEL.set(20)
used_space = sum(f.stat().st_size for f in HUGGINGFACE_CACHE.rglob('*'))
sure = questionary.confirm(
f'This will remove all models cached by Huggingface (~{used_space / 1024 / 1024:.2f}MB), are you sure?'
).ask()
if not sure:
return
shutil.rmtree(HUGGINGFACE_CACHE, ignore_errors=True)
output('All models cached by Huggingface have been removed', style='green')
@app.command(help='Clean up all the virtual environments created by OpenLLM')
def venvs(verbose: bool = False):
if verbose:
VERBOSE_LEVEL.set(20)
used_space = sum(f.stat().st_size for f in VENV_DIR.rglob('*'))
sure = questionary.confirm(
f'This will remove all virtual environments created by OpenLLM (~{used_space / 1024 / 1024:.2f}MB), are you sure?'
).ask()
if not sure:
return
shutil.rmtree(VENV_DIR, ignore_errors=True)
output('All virtual environments have been removed', style='green')
@app.command(help='Clean up all the repositories cloned by OpenLLM')
def repos(verbose: bool = False):
if verbose:
VERBOSE_LEVEL.set(20)
shutil.rmtree(REPO_DIR, ignore_errors=True)
output('All repositories have been removed', style='green')
@app.command(help='Reset configurations to default')
def configs(verbose: bool = False):
if verbose:
VERBOSE_LEVEL.set(20)
CONFIG_FILE.unlink(missing_ok=True)
output('All configurations have been reset', style='green')
@app.command(name='all', help='Clean up all of the above and bring OpenLLM back to a fresh start')
def all_cache(verbose: bool = False):
if verbose:
VERBOSE_LEVEL.set(20)
repos()
venvs()
model_cache()
configs()
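For orientation, a dry-run sketch of the paths the commands above delete (sizes computed the same way as in model_cache and venvs):
# Illustrative: report what `openllm clean all` would remove, without deleting anything.
from openllm.clean import HUGGINGFACE_CACHE
from openllm.common import CONFIG_FILE, REPO_DIR, VENV_DIR

for label, path in [
    ('HF model cache', HUGGINGFACE_CACHE),
    ('cloned repos', REPO_DIR),
    ('virtual envs', VENV_DIR),
    ('config file', CONFIG_FILE),
]:
    size = sum(f.stat().st_size for f in path.rglob('*') if f.is_file()) if path.is_dir() else 0
    print(f'{label}: {path} (~{size / 1024 / 1024:.1f}MB)')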

130
src/openllm/cloud.py Normal file
View File

@@ -0,0 +1,130 @@
import json
import os
import pathlib
import shutil
import subprocess
import typing
import typer
from openllm.accelerator_spec import ACCELERATOR_SPECS
from openllm.analytic import OpenLLMTyper
from openllm.common import INTERACTIVE, BentoInfo, DeploymentTarget, output, run_command
app = OpenLLMTyper()
def _get_deploy_cmd(bento: BentoInfo, target: typing.Optional[DeploymentTarget] = None):
cmd = ['bentoml', 'deploy', bento.bentoml_tag]
env = {'BENTOML_HOME': f'{bento.repo.path}/bentoml'}
required_envs = bento.bento_yaml.get('envs', [])
required_env_names = [env['name'] for env in required_envs if 'name' in env]
if required_env_names:
output(f'This model requires the following environment variables to run: {required_env_names!r}', style='yellow')
for env_info in bento.bento_yaml.get('envs', []):
if 'name' not in env_info:
continue
if os.environ.get(env_info['name']):
default = os.environ[env_info['name']]
elif 'value' in env_info:
default = env_info['value']
else:
default = ''
if INTERACTIVE.get():
import questionary
value = questionary.text(f"{env_info['name']}:", default=default).ask()
else:
if default == '':
output(f"Environment variable {env_info['name']} is required but not provided", style='red')
raise typer.Exit(1)
else:
value = default
if value is None:
raise typer.Exit(1)
cmd += ['--env', f"{env_info['name']}={value}"]
if target:
cmd += ['--instance-type', target.name]
assert (pathlib.Path.home() / 'bentoml' / '.yatai.yaml').exists()
shutil.copy(pathlib.Path.home() / 'bentoml' / '.yatai.yaml', bento.repo.path / 'bentoml' / '.yatai.yaml')
return cmd, env, None
def ensure_cloud_context():
import questionary
cmd = ['bentoml', 'cloud', 'current-context']
try:
result = subprocess.check_output(cmd, stderr=subprocess.DEVNULL)
context = json.loads(result)
output(f" bentoml already logged in: {context['endpoint']}", style='green')
except subprocess.CalledProcessError:
output(' bentoml not logged in', style='red')
if not INTERACTIVE.get():
output('\n log in to bentoml by running:')
output(' $ bentoml cloud login', style='orange')
output('')
output(
""" * You may need to visit https://cloud.bentoml.com to create an account. You can also bring your own BentoML cluster (BYOC) to your team via https://bentoml.com/contact""",
style='yellow',
)
raise typer.Exit(1)
else:
action = questionary.select(
'Choose an action:', choices=['I have a BentoCloud account', 'get an account in two minutes']
).ask()
if action is None:
raise typer.Exit(1)
elif action == 'get an account in two minutes':
output('Please visit https://cloud.bentoml.com to get your token', style='yellow')
endpoint = questionary.text('Enter the endpoint: (similar to https://my-org.cloud.bentoml.com)').ask()
if endpoint is None:
raise typer.Exit(1)
token = questionary.text('Enter your token: (similar to cniluaxxxxxxxx)').ask()
if token is None:
raise typer.Exit(1)
cmd = ['bentoml', 'cloud', 'login', '--api-token', token, '--endpoint', endpoint]
try:
result = subprocess.check_output(cmd)
output(' Logged in successfully', style='green')
except subprocess.CalledProcessError:
output(' Failed to login', style='red')
raise typer.Exit(1)
def get_cloud_machine_spec():
ensure_cloud_context()
cmd = ['bentoml', 'deployment', 'list-instance-types', '-o', 'json']
try:
result = subprocess.check_output(cmd, stderr=subprocess.DEVNULL)
instance_types = json.loads(result)
return [
DeploymentTarget(
source='cloud',
name=it['name'],
price=it['price'],
platform='linux',
accelerators=(
[ACCELERATOR_SPECS[it['gpu_type']] for _ in range(int(it['gpu']))]
if it.get('gpu') and it['gpu_type'] in ACCELERATOR_SPECS
else []
),
)
for it in instance_types
]
except (subprocess.CalledProcessError, json.JSONDecodeError):
output('Failed to get cloud instance types', style='red')
return []
def deploy(bento: BentoInfo, target: DeploymentTarget):
ensure_cloud_context()
cmd, env, cwd = _get_deploy_cmd(bento, target)
run_command(cmd, env=env, cwd=cwd)
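A rough sketch of how these pieces compose, assuming a logged-in BentoCloud context and a BentoInfo resolved elsewhere (e.g. via openllm.model.ensure_bento):
# Sketch only: deploy a bento to the best-fitting BentoCloud instance type.
from openllm.accelerator_spec import can_run
from openllm.cloud import deploy, get_cloud_machine_spec

def deploy_best_fit(bento):
    targets = [t for t in get_cloud_machine_spec() if can_run(bento, t) > 0]
    if not targets:
        raise RuntimeError('no BentoCloud instance type can run this bento')
    # a higher can_run() score means a tighter memory fit, so prefer it
    best = max(targets, key=lambda t: can_run(bento, t))
    deploy(bento, best)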

370
src/openllm/common.py Normal file
View File

@@ -0,0 +1,370 @@
from __future__ import annotations
import asyncio
import functools
import hashlib
import io
import json
import os
import pathlib
import signal
import subprocess
import sys
import sysconfig
import typing
from contextlib import asynccontextmanager, contextmanager
from types import SimpleNamespace
import typer
import typer.core
ERROR_STYLE = 'red'
SUCCESS_STYLE = 'green'
CLLAMA_HOME = pathlib.Path.home() / '.openllm'
REPO_DIR = CLLAMA_HOME / 'repos'
TEMP_DIR = CLLAMA_HOME / 'temp'
VENV_DIR = CLLAMA_HOME / 'venv'
REPO_DIR.mkdir(exist_ok=True, parents=True)
TEMP_DIR.mkdir(exist_ok=True, parents=True)
VENV_DIR.mkdir(exist_ok=True, parents=True)
CONFIG_FILE = CLLAMA_HOME / 'config.json'
CHECKED = ''
T = typing.TypeVar('T')
class ContextVar(typing.Generic[T]):
def __init__(self, default: T):
self._stack: list[T] = []
self._default = default
def get(self) -> T:
if self._stack:
return self._stack[-1]
return self._default
def set(self, value):
self._stack.append(value)
@contextmanager
def patch(self, value):
self._stack.append(value)
try:
yield
finally:
self._stack.pop()
VERBOSE_LEVEL = ContextVar(10)
INTERACTIVE = ContextVar(False)
FORCE = ContextVar(False)
def output(content, level=0, style=None, end=None):
import questionary
if level > VERBOSE_LEVEL.get():
return
if not isinstance(content, str):
import pyaml
out = io.StringIO()
pyaml.pprint(content, dst=out, sort_dicts=False, sort_keys=False)
questionary.print(out.getvalue(), style=style, end='' if end is None else end)
out.close()
if isinstance(content, str):
questionary.print(content, style=style, end='\n' if end is None else end)
class Config(SimpleNamespace):
repos: dict[str, str] = {'default': 'git+https://github.com/bentoml/openllm-models@main'}
default_repo: str = 'default'
def tolist(self):
return dict(repos=self.repos, default_repo=self.default_repo)
def load_config():
if CONFIG_FILE.exists():
try:
with open(CONFIG_FILE) as f:
return Config(**json.load(f))
except json.JSONDecodeError:
return Config()
return Config()
def save_config(config):
with open(CONFIG_FILE, 'w') as f:
json.dump(config.tolist(), f, indent=2)
class RepoInfo(SimpleNamespace):
name: str
path: pathlib.Path
url: str
server: str
owner: str
repo: str
branch: str
def tolist(self):
if VERBOSE_LEVEL.get() <= 0:
return f'{self.name} ({self.url})'
if VERBOSE_LEVEL.get() <= 10:
return dict(name=self.name, url=self.url, path=str(self.path))
if VERBOSE_LEVEL.get() <= 20:
return dict(
name=self.name,
url=self.url,
path=str(self.path),
server=self.server,
owner=self.owner,
repo=self.repo,
branch=self.branch,
)
class BentoInfo(SimpleNamespace):
repo: RepoInfo
path: pathlib.Path
alias: str = ''
def __str__(self):
if self.repo.name == 'default':
return f'{self.tag}'
else:
return f'{self.repo.name}/{self.tag}'
def __hash__(self):
return md5(str(self.path))
@property
def tag(self) -> str:
if self.alias:
return f'{self.path.parent.name}:{self.alias}'
return f'{self.path.parent.name}:{self.path.name}'
@property
def bentoml_tag(self) -> str:
return f'{self.path.parent.name}:{self.path.name}'
@property
def name(self) -> str:
return self.path.parent.name
@property
def version(self) -> str:
return self.path.name
@property
def labels(self) -> dict[str, str]:
return self.bento_yaml['labels']
@functools.cached_property
def bento_yaml(self) -> dict:
import yaml
bento_file = self.path / 'bento.yaml'
return yaml.safe_load(bento_file.read_text())
@functools.cached_property
def platforms(self) -> list[str]:
return self.bento_yaml['labels'].get('platforms', 'linux').split(',')
@functools.cached_property
def pretty_yaml(self) -> dict:
def _pretty_routes(routes):
return {
route['route']: {
'input': {k: v['type'] for k, v in route['input']['properties'].items()},
'output': route['output']['type'],
}
for route in routes
}
if len(self.bento_yaml['services']) == 1:
pretty_yaml = {
'apis': _pretty_routes(self.bento_yaml['schema']['routes']),
'resources': self.bento_yaml['services'][0]['config']['resources'],
'envs': self.bento_yaml['envs'],
'platforms': self.platforms,
}
return pretty_yaml
return self.bento_yaml
@functools.cached_property
def pretty_gpu(self) -> str:
from openllm.accelerator_spec import ACCELERATOR_SPECS
try:
resources = self.bento_yaml['services'][0]['config']['resources']
if resources['gpu'] > 1:
acc = ACCELERATOR_SPECS[resources['gpu_type']]
return f"{acc.memory_size:.0f}Gx{resources['gpu']}"
elif resources['gpu'] > 0:
acc = ACCELERATOR_SPECS[resources['gpu_type']]
return f'{acc.memory_size:.0f}G'
except KeyError:
pass
return ''
def tolist(self):
verbose = VERBOSE_LEVEL.get()
if verbose <= 0:
return str(self)
if verbose <= 10:
return dict(tag=self.tag, repo=self.repo.tolist(), path=str(self.path), model_card=self.pretty_yaml)
if verbose <= 20:
return dict(tag=self.tag, repo=self.repo.tolist(), path=str(self.path), bento_yaml=self.bento_yaml)
class VenvSpec(SimpleNamespace):
python_version: str
python_packages: list[str]
name_prefix: str = ''
def __hash__(self):
return md5(
# self.python_version,
*sorted(self.python_packages)
)
class Accelerator(SimpleNamespace):
model: str
memory_size: float
def __gt__(self, other):
return self.memory_size > other.memory_size
def __eq__(self, other):
return self.memory_size == other.memory_size
class DeploymentTarget(SimpleNamespace):
source: str = 'local'
name: str = 'local'
price: str = ''
platform: str = 'linux'
accelerators: list[Accelerator]
def __hash__(self):
return hash(self.source)
@property
def accelerators_repr(self) -> str:
accs = {a.model for a in self.accelerators}
if len(accs) == 0:
return 'null'
if len(accs) == 1:
a = self.accelerators[0]
return f'{a.model} x{len(self.accelerators)}'
return ', '.join((f'{a.model}' for a in self.accelerators))
def run_command(cmd, cwd=None, env=None, copy_env=True, venv=None, silent=False) -> subprocess.CompletedProcess:
import shlex
env = env or {}
cmd = [str(c) for c in cmd]
bin_dir = 'Scripts' if os.name == 'nt' else 'bin'
if not silent:
output('\n')
if cwd:
output(f'$ cd {cwd}', style='orange')
if env:
for k, v in env.items():
output(f'$ export {k}={shlex.quote(v)}', style='orange')
if venv:
output(f"$ source {venv / 'bin' / 'activate'}", style='orange')
output(f"$ {' '.join(cmd)}", style='orange')
if venv:
py = venv / bin_dir / f"python{sysconfig.get_config_var('EXE')}"
else:
py = sys.executable
if copy_env:
env = {**os.environ, **env}
if cmd and cmd[0] == 'bentoml':
cmd = [py, '-m', 'bentoml'] + cmd[1:]
if cmd and cmd[0] == 'python':
cmd = [py] + cmd[1:]
try:
if silent:
return subprocess.run( # type: ignore
cmd, cwd=cwd, env=env, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
)
else:
return subprocess.run(cmd, cwd=cwd, env=env)
except subprocess.CalledProcessError:
output('Command failed', style='red')
raise typer.Exit(1)
async def stream_command_output(stream, style='gray'):
async for line in stream:
output(line.decode(), style=style, end='')
@asynccontextmanager
async def async_run_command(cmd, cwd=None, env=None, copy_env=True, venv=None, silent=True):
import shlex
env = env or {}
cmd = [str(c) for c in cmd]
if not silent:
output('\n')
if cwd:
output(f'$ cd {cwd}', style='orange')
if env:
for k, v in env.items():
output(f'$ export {k}={shlex.quote(v)}', style='orange')
if venv:
output(f"$ source {venv / 'bin' / 'activate'}", style='orange')
output(f"$ {' '.join(cmd)}", style='orange')
if venv:
py = venv / 'bin' / 'python'
else:
py = sys.executable
if copy_env:
env = {**os.environ, **env}
if cmd and cmd[0] == 'bentoml':
cmd = [py, '-m', 'bentoml'] + cmd[1:]
if cmd and cmd[0] == 'python':
cmd = [py] + cmd[1:]
proc = None
try:
proc = await asyncio.create_subprocess_shell(
' '.join(map(str, cmd)), stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE, cwd=cwd, env=env
)
yield proc
except subprocess.CalledProcessError:
output('Command failed', style='red')
raise typer.Exit(1)
finally:
if proc:
proc.send_signal(signal.SIGINT)
await proc.wait()
def md5(*strings: str) -> int:
m = hashlib.md5()
for s in strings:
m.update(s.encode())
return int(m.hexdigest(), 16)
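A short sketch of how the ContextVar/output pair behaves (illustrative only):
# Illustrative: output() drops messages whose level exceeds the current VERBOSE_LEVEL (default 10).
from openllm.common import VERBOSE_LEVEL, output

output('always shown', style='green')   # level defaults to 0, so it is printed
output('debug detail', level=20)        # suppressed at the default verbosity

with VERBOSE_LEVEL.patch(20):
    output('debug detail', level=20)    # visible only inside the patched block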

91
src/openllm/local.py Normal file
View File

@@ -0,0 +1,91 @@
import asyncio
import time
import httpx
from openllm.common import BentoInfo, async_run_command, output, run_command, stream_command_output
from openllm.venv import ensure_venv
def _get_serve_cmd(bento: BentoInfo, port: int = 3000):
cmd = ['bentoml', 'serve', bento.bentoml_tag]
if port != 3000:
cmd += ['--port', str(port)]
env = {'BENTOML_HOME': f'{bento.repo.path}/bentoml'}
return cmd, env, None
def serve(bento: BentoInfo, port: int = 3000):
venv = ensure_venv(bento)
cmd, env, cwd = _get_serve_cmd(bento, port=port)
run_command(cmd, env=env, cwd=cwd, venv=venv)
async def _run_model(bento: BentoInfo, port: int = 3000, timeout: int = 600):
venv = ensure_venv(bento)
cmd, env, cwd = _get_serve_cmd(bento, port)
async with async_run_command(cmd, env=env, cwd=cwd, venv=venv, silent=False) as server_proc:
output(f'Model server started {server_proc.pid}')
stdout_streamer = None
stderr_streamer = None
start_time = time.time()
output('Model loading...', style='green')
for _ in range(timeout):
try:
resp = httpx.get(f'http://localhost:{port}/readyz', timeout=3)
if resp.status_code == 200:
break
except httpx.RequestError:
if time.time() - start_time > 30:
if not stdout_streamer:
stdout_streamer = asyncio.create_task(stream_command_output(server_proc.stdout, style='gray'))
if not stderr_streamer:
stderr_streamer = asyncio.create_task(stream_command_output(server_proc.stderr, style='#BD2D0F'))
await asyncio.sleep(1)
else:
output('Model failed to load', style='red')
server_proc.terminate()
return
if stdout_streamer:
stdout_streamer.cancel()
if stderr_streamer:
stderr_streamer.cancel()
output('Model is ready', style='green')
messages: list[dict[str, str]] = []
from openai import AsyncOpenAI
client = AsyncOpenAI(base_url=f'http://localhost:{port}/v1', api_key='local')
model_id = (await client.models.list()).data[0].id
while True:
try:
message = input('user: ')
if message == '':
output('empty message, please enter something', style='yellow')
continue
messages.append(dict(role='user', content=message))
output('assistant: ', end='', style='lightgreen')
assistant_message = ''
stream = await client.chat.completions.create(
model=model_id,
messages=messages, # type: ignore
stream=True,
)
async for chunk in stream:
text = chunk.choices[0].delta.content or ''
assistant_message += text
output(text, end='', style='lightgreen')
messages.append(dict(role='assistant', content=assistant_message))
output('')
except KeyboardInterrupt:
break
output('\nStopping model server...', style='green')
output('Stopped model server', style='green')
def run(bento: BentoInfo, port: int = 3000, timeout: int = 600):
asyncio.run(_run_model(bento, port=port, timeout=timeout))
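For reference, a minimal client-side sketch against a server started with serve(); the port matches the default above and the prompt is an assumption:
# Assumes an OpenLLM server is already listening on http://localhost:3000 (e.g. `openllm serve <model>`).
from openai import OpenAI

client = OpenAI(base_url='http://localhost:3000/v1', api_key='local')
model_id = client.models.list().data[0].id  # the server exposes a single model
resp = client.chat.completions.create(
    model=model_id,
    messages=[{'role': 'user', 'content': 'Hello!'}],
)
print(resp.choices[0].message.content)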

147
src/openllm/model.py Normal file
View File

@@ -0,0 +1,147 @@
import typing
from typing import Optional
import tabulate
import typer
from openllm.accelerator_spec import DeploymentTarget, can_run
from openllm.analytic import OpenLLMTyper
from openllm.common import FORCE, VERBOSE_LEVEL, BentoInfo, load_config, output
from openllm.repo import ensure_repo_updated, parse_repo_url
app = OpenLLMTyper(help='manage models')
@app.command()
def get(tag: str, repo: Optional[str] = None, verbose: bool = False):
if verbose:
VERBOSE_LEVEL.set(20)
bento_info = ensure_bento(tag, repo_name=repo)
if bento_info:
output(bento_info)
@app.command(name='list')
def list_(tag: Optional[str] = None, repo: Optional[str] = None, verbose: bool = False):
if verbose:
VERBOSE_LEVEL.set(20)
bentos = list_bento(tag=tag, repo_name=repo)
bentos.sort(key=lambda x: x.name)
seen = set()
def is_seen(value):
if value in seen:
return True
seen.add(value)
return False
table = tabulate.tabulate(
[
[
'' if is_seen(bento.name) else bento.name,
bento.tag,
bento.repo.name,
bento.pretty_gpu,
','.join(bento.platforms),
]
for bento in bentos
],
headers=['model', 'version', 'repo', 'required VRAM', 'platforms'],
)
output(table)
def ensure_bento(model: str, target: Optional[DeploymentTarget] = None, repo_name: Optional[str] = None) -> BentoInfo:
bentos = list_bento(model, repo_name=repo_name)
if len(bentos) == 0:
output(f'No model found for {model}', style='red')
raise typer.Exit(1)
if len(bentos) == 1:
if FORCE.get():
output(f'Found model {bentos[0]}', style='green')
return bentos[0]
if target is None:
return bentos[0]
if can_run(bentos[0], target) <= 0:
return bentos[0]
output(f'Found model {bentos[0]}', style='green')
return bentos[0]
if target is None:
output(f'Multiple models match {model}, did you mean one of these?', style='red')
for bento in bentos:
output(f' {bento}')
raise typer.Exit(1)
filtered = [bento for bento in bentos if can_run(bento, target) > 0]
if len(filtered) == 0:
output(f'No deployment target found for {model}', style='red')
raise typer.Exit(1)
if len(filtered) > 1:
output(f'Multiple models match {model}, did you mean one of these?', style='red')
for bento in filtered:
output(f' {bento}')
raise typer.Exit(1)
return filtered[0]
def list_bento(
tag: typing.Optional[str] = None, repo_name: typing.Optional[str] = None, include_alias: bool = False
) -> typing.List[BentoInfo]:
ensure_repo_updated()
if repo_name is not None:
config = load_config()
if repo_name not in config.repos:
output(f'Repo `{repo_name}` not found, did you mean one of these?')
for existing_repo_name in config.repos:
output(f' {existing_repo_name}')
raise typer.Exit(1)
if not tag:
glob_pattern = 'bentoml/bentos/*/*'
elif ':' in tag:
bento_name, version = tag.split(':')
glob_pattern = f'bentoml/bentos/{bento_name}/{version}'
else:
glob_pattern = f'bentoml/bentos/{tag}/*'
model_list = []
config = load_config()
for _repo_name, repo_url in config.repos.items():
if repo_name is not None and _repo_name != repo_name:
continue
repo = parse_repo_url(repo_url, _repo_name)
for path in repo.path.glob(glob_pattern):
if path.is_dir() and (path / 'bento.yaml').exists():
model = BentoInfo(repo=repo, path=path)
elif path.is_file():
with open(path) as f:
origin_name = f.read().strip()
origin_path = path.parent / origin_name
model = BentoInfo(alias=path.name, repo=repo, path=origin_path)
else:
model = None
if model:
model_list.append(model)
model_list.sort(key=lambda x: x.tag)
if not include_alias:
seen = set()
model_list = [
x
for x in model_list
if not (
f"{x.bento_yaml['name']}:{x.bento_yaml['version']}" in seen
or seen.add(f"{x.bento_yaml['name']}:{x.bento_yaml['version']}")
)
]
return model_list
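A small sketch of how these helpers are consumed programmatically (the model tags below are placeholders, not guaranteed entries of the default repo):
# Illustrative: list matching bentos, then resolve one that fits the local machine.
from openllm.accelerator_spec import get_local_machine_spec
from openllm.model import ensure_bento, list_bento

for bento in list_bento('llama3.1'):  # hypothetical tag prefix
    print(bento.tag, bento.repo.name, bento.pretty_gpu)

target = get_local_machine_spec()
bento = ensure_bento('llama3.1:8b-instruct-fp16', target=target)  # hypothetical version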

179
src/openllm/repo.py Normal file
View File

@@ -0,0 +1,179 @@
import datetime
import re
import shutil
import pyaml
import questionary
import typer
from openllm.analytic import OpenLLMTyper
from openllm.common import INTERACTIVE, REPO_DIR, VERBOSE_LEVEL, RepoInfo, load_config, output, save_config
UPDATE_INTERVAL = datetime.timedelta(days=3)
app = OpenLLMTyper(help='manage repos')
@app.command()
def list(verbose: bool = False):
if verbose:
VERBOSE_LEVEL.set(20)
config = load_config()
pyaml.pprint([parse_repo_url(repo, name) for name, repo in config.repos.items()], sort_dicts=False, sort_keys=False)
@app.command()
def remove(name: str):
config = load_config()
if name not in config.repos:
output(f'Repo {name} does not exist', style='red')
return
del config.repos[name]
save_config(config)
output(f'Repo {name} removed', style='green')
def _complete_alias(repo_name: str):
from openllm.model import list_bento
for bento in list_bento(repo_name=repo_name):
alias = bento.labels.get('openllm_alias', '').strip()
if alias:
for a in alias.split(','):
with open(bento.path.parent / a, 'w') as f:
f.write(bento.version)
@app.command()
def update():
import dulwich
import dulwich.errors
import dulwich.porcelain
config = load_config()
repos_in_use = set()
for repo_name, repo in config.repos.items():
repo = parse_repo_url(repo, repo_name)
repos_in_use.add((repo.server, repo.owner, repo.repo))
if repo.path.exists(): # TODO: use update instead of remove and clone
shutil.rmtree(repo.path, ignore_errors=True)
if not repo.path.exists():
repo.path.parent.mkdir(parents=True, exist_ok=True)
try:
dulwich.porcelain.clone(
f'https://{repo.server}/{repo.owner}/{repo.repo}.git',
str(repo.path),
checkout=True,
depth=1,
branch=repo.branch,
)
output('')
output(f'Repo `{repo.name}` updated', style='green')
except Exception:
shutil.rmtree(repo.path, ignore_errors=True)
output(f'Failed to clone repo {repo.name}', style='red')
else:
try:
import dulwich.porcelain
dulwich.porcelain.pull(
str(repo.path), f'https://{repo.server}/{repo.owner}/{repo.repo}.git', refspecs=repo.branch, force=True
)
dulwich.porcelain.clean(str(repo.path), str(repo.path))
output('')
output(f'Repo `{repo.name}` updated', style='green')
except Exception:
shutil.rmtree(repo.path, ignore_errors=True)
output(f'Failed to update repo {repo.name}', style='red')
for c in REPO_DIR.glob('*/*/*'):
repo_spec = tuple(c.parts[-3:])
if repo_spec not in repos_in_use:
shutil.rmtree(c, ignore_errors=True)
output(f'Removed unused repo cache {c}')
with open(REPO_DIR / 'last_update', 'w') as f:
f.write(datetime.datetime.now().isoformat())
for repo_name in config.repos:
_complete_alias(repo_name)
def ensure_repo_updated():
last_update_file = REPO_DIR / 'last_update'
if not last_update_file.exists():
if INTERACTIVE.get():
choice = questionary.confirm(
'The repo cache has never been updated, do you want to update it now to fetch the latest model list?'
).ask()
if choice:
update()
return
else:
output(
'The repo cache has never been updated, please run `openllm repo update` to fetch the latest model list', style='red'
)
raise typer.Exit(1)
last_update = datetime.datetime.fromisoformat(last_update_file.read_text().strip())
if datetime.datetime.now() - last_update > UPDATE_INTERVAL:
if INTERACTIVE.get():
choice = questionary.confirm(
'The repo cache is outdated, do you want to update it to fetch the latest model list?'
).ask()
if choice:
update()
else:
output(
'The repo cache is outdated, please run `openllm repo update` to fetch the latest model list', style='yellow'
)
GIT_REPO_RE = re.compile(r'git\+https://(?P<server>.+)/(?P<owner>.+)/(?P<repo>.+?)(@(?P<branch>.+))?$')
def parse_repo_url(repo_url, repo_name=None) -> RepoInfo:
"""
parse the git repo url to server, owner, repo name, branch
>>> parse_repo_url('git+https://github.com/bentoml/bentovllm@main')
('github.com', 'bentoml', 'bentovllm', 'main')
>>> parse_repo_url('git+https://github.com/bentoml/bentovllm')
('github.com', 'bentoml', 'bentovllm', 'main')
"""
match = GIT_REPO_RE.match(repo_url)
if not match:
raise ValueError(f'Invalid git repo url: {repo_url}')
server = match.group('server')
owner = match.group('owner')
repo = match.group('repo')
branch = match.group('branch') or 'main'
path = REPO_DIR / server / owner / repo
return RepoInfo(
name=repo if repo_name is None else repo_name,
url=repo_url,
server=server,
owner=owner,
repo=repo,
branch=branch,
path=path,
)
@app.command()
def add(name: str, repo: str):
name = name.lower()
if not name.isidentifier():
output(f'Invalid repo name: {name}, should only contain letters, numbers and underscores', style='red')
return
config = load_config()
if name in config.repos:
override = questionary.confirm(f'Repo {name} already exists ({config.repos[name]}), override?').ask()
if not override:
return
config.repos[name] = repo
save_config(config)
output(f'Repo {name} added', style='green')
if __name__ == '__main__':
app()
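An illustrative round-trip through parse_repo_url; the first URL is the default repo from openllm.common.Config, the second is a made-up one:
# Illustrative: how repo URLs are decomposed into RepoInfo fields.
from openllm.repo import parse_repo_url

info = parse_repo_url('git+https://github.com/bentoml/openllm-models@main', 'default')
print(info.server, info.owner, info.repo, info.branch)  # -> github.com bentoml openllm-models main

info = parse_repo_url('git+https://github.com/acme/my-models', 'my_repo')  # hypothetical repo
print(info.branch)  # -> 'main' is assumed when no @branch suffix is given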

118
src/openllm/venv.py Normal file
View File

@@ -0,0 +1,118 @@
import functools
import os
import pathlib
import shutil
import typing
from typing import Iterable
import typer
from openllm.common import VENV_DIR, VERBOSE_LEVEL, BentoInfo, VenvSpec, output, run_command
@functools.lru_cache
def _resolve_packages(requirement: typing.Union[pathlib.Path, str]):
from pip_requirements_parser import RequirementsFile
requirements_txt = RequirementsFile.from_file(str(requirement), include_nested=True)
return requirements_txt.requirements
def _filter_preheat_packages(requirements: Iterable) -> list[str]:
PREHEAT_PIP_PACKAGES = ['torch', 'vllm']
deps: list[str] = []
for req in requirements:
if req.is_editable or req.is_local_path or req.is_url or req.is_wheel or not req.name or not req.specifier:
continue
for sp in req.specifier:
if sp.operator == '==' and req.name in PREHEAT_PIP_PACKAGES:
assert req.line is not None
deps.append(req.line)
break
return deps
@functools.lru_cache
def _resolve_bento_env_specs(bento: BentoInfo):
ver_file = bento.path / 'env' / 'python' / 'version.txt'
assert ver_file.exists(), f'cannot find version file in {bento.path}'
lock_file = bento.path / 'env' / 'python' / 'requirements.lock.txt'
if not lock_file.exists():
lock_file = bento.path / 'env' / 'python' / 'requirements.txt'
reqs = _resolve_packages(lock_file)
preheat_packages = _filter_preheat_packages(reqs)
ver = ver_file.read_text().strip()
return (
VenvSpec(python_version=ver, python_packages=preheat_packages, name_prefix=f"{bento.tag.replace(':', '_')}-1-"),
VenvSpec(
python_version=ver, python_packages=[v.line for v in reqs], name_prefix=f"{bento.tag.replace(':', '_')}-2-"
),
)
def _get_lib_dir(venv: pathlib.Path) -> pathlib.Path:
if os.name == 'nt':
return venv / 'Lib/site-packages'
else:
return next(venv.glob('lib/python*')) / 'site-packages'
def _ensure_venv(env_spec: VenvSpec, parent_venv: typing.Optional[pathlib.Path] = None) -> pathlib.Path:
venv = VENV_DIR / str(hash(env_spec))
if venv.exists() and not (venv / 'DONE').exists():
shutil.rmtree(venv, ignore_errors=True)
if not venv.exists():
output(f'Installing model dependencies ({venv})...', style='green')
venv_py = venv / 'Scripts' / 'python.exe' if os.name == 'nt' else venv / 'bin' / 'python'
try:
run_command(['python', '-m', 'uv', 'venv', venv], silent=VERBOSE_LEVEL.get() < 10)
lib_dir = _get_lib_dir(venv)
if parent_venv is not None:
parent_lib_dir = _get_lib_dir(parent_venv)
with open(lib_dir / f'{parent_venv.name}.pth', 'w+') as f:
f.write(str(parent_lib_dir))
with open(venv / 'requirements.txt', 'w') as f:
f.write('\n'.join(sorted(env_spec.python_packages)))
run_command(
['python', '-m', 'uv', 'pip', 'install', '-p', str(venv_py), '-r', venv / 'requirements.txt'],
silent=VERBOSE_LEVEL.get() < 10,
)
with open(venv / 'DONE', 'w') as f:
f.write('DONE')
except Exception:
shutil.rmtree(venv, ignore_errors=True)
output(f'Failed to install dependencies to {venv}. Cleaned up.', style='red')
raise typer.Exit(1)
output(f'Successfully installed dependencies to {venv}.', style='green')
return venv
else:
return venv
def _ensure_venvs(env_spec_list: Iterable[VenvSpec]) -> pathlib.Path:
last_venv = None
for env_spec in env_spec_list:
last_venv = _ensure_venv(env_spec, last_venv)
assert last_venv is not None
return last_venv
def ensure_venv(bento: BentoInfo) -> pathlib.Path:
return _ensure_venvs(_resolve_bento_env_specs(bento))
def _check_venv(env_spec: VenvSpec) -> bool:
venv = VENV_DIR / str(hash(env_spec))
if not venv.exists():
return False
if venv.exists() and not (venv / 'DONE').exists():
return False
return True
def check_venv(bento: BentoInfo) -> bool:
return all(_check_venv(env_spec) for env_spec in _resolve_bento_env_specs(bento))
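A sketch of how the layered venvs above are used: the pinned heavyweight packages (torch/vllm) land in a first venv that the full per-bento venv links to via a .pth file, so rebuilds reuse the big wheels. Checking and creating one explicitly (the tag is hypothetical):
# Illustrative: reuse a prepared environment when present, otherwise build it.
from openllm.model import ensure_bento
from openllm.venv import check_venv, ensure_venv

bento = ensure_bento('llama3.1:8b-instruct-fp16')  # hypothetical tag
if not check_venv(bento):
    print('virtual environment not prepared yet, it will be created (downloads pinned wheels)')
venv_path = ensure_venv(bento)  # idempotent: returns the cached venv once its DONE marker exists
print('python environment at', venv_path)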

View File

@@ -1,7 +0,0 @@
#!/usr/bin/env bash
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &> /dev/null && pwd)
pip install -U mypy==1.7.0 "build<1" hatchling==1.18.0 hatch-vcs==0.3.0 hatch-fancy-pypi-readme==23.1.0 hatch-mypyc==0.16.0
HATCH_VERBOSE=3 MYPY_CONFIG_FILE_DIR="${SCRIPT_DIR}" HATCH_BUILD_HOOKS_ENABLE=1 python -m build openllm-client -w -C--global-option=--verbose "$@"
HATCH_VERBOSE=3 MYPY_CONFIG_FILE_DIR="${SCRIPT_DIR}" HATCH_BUILD_HOOKS_ENABLE=1 python -m build openllm-core -w -C--global-option=--verbose "$@"
HATCH_VERBOSE=3 MYPY_CONFIG_FILE_DIR="${SCRIPT_DIR}" HATCH_BUILD_HOOKS_ENABLE=1 python -m build openllm-python -w -C--global-option=--verbose "$@"
hatch clean

View File

@@ -1,284 +0,0 @@
#!/usr/bin/env python3
from __future__ import annotations
import dataclasses
import os
import typing as t
import tomlkit
from ghapi.all import GhApi
if t.TYPE_CHECKING:
from tomlkit.items import Array, Table
ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
_OWNER, _REPO = 'bentoml', 'openllm'
@dataclasses.dataclass(frozen=True)
class Classifier:
identifier: t.Dict[str, str] = dataclasses.field(
default_factory=lambda: {
'status': 'Development Status',
'environment': 'Environment',
'license': 'License',
'topic': 'Topic',
'os': 'Operating System',
'audience': 'Intended Audience',
'typing': 'Typing',
'language': 'Programming Language',
}
)
joiner: str = ' :: '
@staticmethod
def status() -> dict[int, str]:
return {
v: status
for v, status in zip(
range(1, 8),
[
'1 - Planning',
'2 - Pre-Alpha',
'3 - Alpha',
'4 - Beta',
'5 - Production/Stable',
'6 - Mature',
'7 - Inactive',
],
)
}
@staticmethod
def apache() -> str:
return Classifier.create_classifier('license', 'OSI Approved', 'Apache Software License')
@staticmethod
def create_classifier(identifier: str, *decls: t.Any) -> str:
cls_ = Classifier()
if identifier not in cls_.identifier:
raise ValueError(f'{identifier} is not yet supported (supported alias: {Classifier.identifier})')
return cls_.joiner.join([cls_.identifier[identifier], *decls])
@staticmethod
def create_python_classifier(
implementation: list[str] | None = None, supported_version: list[str] | None = None
) -> list[str]:
if supported_version is None:
supported_version = ['3.8', '3.9', '3.10', '3.11', '3.12']
if implementation is None:
implementation = ['CPython', 'PyPy']
base = [
Classifier.create_classifier('language', 'Python'),
Classifier.create_classifier('language', 'Python', '3'),
]
base.append(Classifier.create_classifier('language', 'Python', '3', 'Only'))
base.extend([Classifier.create_classifier('language', 'Python', version) for version in supported_version])
base.extend([
Classifier.create_classifier('language', 'Python', 'Implementation', impl) for impl in implementation
])
return base
@staticmethod
def create_status_classifier(level: int) -> str:
return Classifier.create_classifier('status', Classifier.status()[level])
@dataclasses.dataclass(frozen=True)
class Dependencies:
name: str
git_repo_url: t.Optional[str] = None
branch: t.Optional[str] = None
extensions: t.Optional[t.List[str]] = None
subdirectory: t.Optional[str] = None
requires_gpu: bool = False
lower_constraint: t.Optional[str] = None
upper_constraint: t.Optional[str] = None
platform: t.Optional[t.Tuple[t.Literal['Linux', 'Windows', 'Darwin'], t.Literal['eq', 'ne']]] = None
def with_options(self, **kwargs: t.Any) -> Dependencies:
return dataclasses.replace(self, **kwargs)
@property
def has_constraint(self) -> bool:
return self.lower_constraint is not None or self.upper_constraint is not None
@property
def pypi_extensions(self) -> str:
return '' if self.extensions is None else f"[{','.join(self.extensions)}]"
@staticmethod
def platform_restriction(platform: t.LiteralString, op: t.Literal['eq', 'ne'] = 'eq') -> str:
return f'platform_system{"==" if op == "eq" else "!="}"{platform}"'
def to_str(self) -> str:
deps: list[str] = []
if self.lower_constraint is not None and self.upper_constraint is not None:
dep = f'{self.name}{self.pypi_extensions}>={self.lower_constraint},<{self.upper_constraint}'
elif self.lower_constraint is not None:
dep = f'{self.name}{self.pypi_extensions}>={self.lower_constraint}'
elif self.upper_constraint is not None:
dep = f'{self.name}{self.pypi_extensions}<{self.upper_constraint}'
elif self.subdirectory is not None:
dep = f'{self.name}{self.pypi_extensions} @ git+https://github.com/{self.git_repo_url}.git#subdirectory={self.subdirectory}'
elif self.branch is not None:
dep = f'{self.name}{self.pypi_extensions} @ git+https://github.com/{self.git_repo_url}.git@{self.branch}'
else:
dep = f'{self.name}{self.pypi_extensions}'
deps.append(dep)
if self.platform:
deps.append(self.platform_restriction(*self.platform))
return ';'.join(deps)
@classmethod
def from_tuple(cls, *decls: t.Any) -> Dependencies:
return cls(*decls)
_LOWER_BENTOML_CONSTRAINT = '1.2.16'
_BENTOML_EXT = ['io']
fname = f'{os.path.basename(os.path.dirname(__file__))}/{os.path.basename(__file__)}'
def correct_style(it: t.Any) -> t.Any:
return it
def create_classifiers() -> Array:
arr = correct_style(tomlkit.array())
arr.extend([
Classifier.create_status_classifier(5),
Classifier.create_classifier('environment', 'GPU', 'NVIDIA CUDA'),
Classifier.create_classifier('environment', 'GPU', 'NVIDIA CUDA', '12'),
Classifier.create_classifier('environment', 'GPU', 'NVIDIA CUDA', '11.8'),
Classifier.create_classifier('environment', 'GPU', 'NVIDIA CUDA', '11.7'),
Classifier.apache(),
Classifier.create_classifier('topic', 'Scientific/Engineering', 'Artificial Intelligence'),
Classifier.create_classifier('topic', 'Software Development', 'Libraries'),
Classifier.create_classifier('os', 'OS Independent'),
Classifier.create_classifier('audience', 'Developers'),
Classifier.create_classifier('audience', 'Science/Research'),
Classifier.create_classifier('audience', 'System Administrators'),
Classifier.create_classifier('typing', 'Typed'),
*Classifier.create_python_classifier(),
])
return arr.multiline(True)
def create_url_table(_info: t.Any) -> Table:
table = tomlkit.table()
_urls = {
'Blog': 'https://modelserving.com',
'Chat': 'https://discord.gg/openllm',
'Documentation': 'https://github.com/bentoml/openllm#readme',
'GitHub': _info.html_url,
'History': f'{_info.html_url}/blob/main/CHANGELOG.md',
'Homepage': _info.homepage,
'Tracker': f'{_info.html_url}/issues',
'Twitter': 'https://twitter.com/bentomlai',
}
table.update({k: v for k, v in sorted(_urls.items())})
return table
def build_system() -> Table:
table = tomlkit.table()
table.add('build-backend', 'hatchling.build')
requires_array = correct_style(tomlkit.array())
requires_array.extend(['hatchling==1.18.0', 'hatch-vcs==0.3.0', 'hatch-fancy-pypi-readme==23.1.0'])
table.add('requires', requires_array.multiline(True))
return table
def keywords() -> Array:
arr = correct_style(tomlkit.array())
arr.extend([
'MLOps',
'AI',
'BentoML',
'Model Serving',
'Model Deployment',
'LLMOps',
'Falcon',
'Vicuna',
'Llama 2',
'Fine tuning',
'Serverless',
'Large Language Model',
'Generative AI',
'StableLM',
'Alpaca',
'PyTorch',
'Mistral',
'vLLM',
'Transformers',
])
return arr.multiline(True)
def build_cli_extensions() -> Table:
table = tomlkit.table()
table.update({'openllm': '_openllm_tiny._entrypoint:cli'})
return table
def main(args) -> int:
api = GhApi(owner=_OWNER, repo=_REPO, authenticate=False)
_info = api.repos.get()
with open(os.path.join(ROOT, 'openllm-python', 'pyproject.toml'), 'r') as f:
pyproject = tomlkit.parse(f.read())
if args.release_version is not None:
release_version = args.release_version
else:
try:
release_version = api.repos.get_latest_release()['tag_name'].lstrip('v')
except Exception as err:
raise err
_BASE_DEPENDENCIES = [
Dependencies(name='bentoml', extensions=_BENTOML_EXT, lower_constraint=_LOWER_BENTOML_CONSTRAINT),
Dependencies(name='openllm-client', lower_constraint=release_version),
Dependencies(name='openllm-core', lower_constraint=release_version),
Dependencies(name='safetensors'),
Dependencies(name='vllm', lower_constraint='0.4.3'),
Dependencies(name='ghapi'),
Dependencies(name='einops'),
Dependencies(name='sentencepiece'),
Dependencies(name='scipy'),
Dependencies(name='click', lower_constraint='8.1.3'),
Dependencies(name='cuda-python', platform=('Darwin', 'ne')),
]
dependencies_array = correct_style(tomlkit.array())
dependencies_array.extend([v.to_str() for v in _BASE_DEPENDENCIES])
# dynamic field
dyn_arr = tomlkit.array()
dyn_arr.extend(['version', 'readme'])
pyproject['build-system'] = build_system()
pyproject['project']['classifiers'] = create_classifiers()
pyproject['project']['dependencies'] = dependencies_array.multiline(True)
pyproject['project']['description'] = f'{_info.name}: {_info.description}'
pyproject['project']['dynamic'] = dyn_arr
pyproject['project']['keywords'] = keywords()
pyproject['project']['license'] = _info.license.spdx_id
pyproject['project']['name'] = f'{_info.name.lower()}'
pyproject['project']['requires-python'] = '>=3.8'
pyproject['project']['urls'] = create_url_table(_info)
pyproject['project']['scripts'] = build_cli_extensions()
with open(os.path.join(ROOT, 'openllm-python', 'pyproject.toml'), 'w') as f:
f.write(tomlkit.dumps(pyproject))
return 0
if __name__ == '__main__':
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('--release-version', type=str, default=None)
raise SystemExit(main(parser.parse_args()))

View File

@@ -1,53 +0,0 @@
#!/usr/bin/env python3
from __future__ import annotations
from collections import defaultdict
from pathlib import Path
import orjson
from lxml import etree
ROOT = Path(__file__).resolve().parent.parent
PACKAGES = {'openllm-python/src/openllm/': 'openllm'}
def main() -> int:
coverage_report = ROOT / 'coverage.xml'
root = etree.fromstring(coverage_report.read_text())
raw_package_data: defaultdict[str, dict[str, int]] = defaultdict(lambda: {'hits': 0, 'misses': 0})
for package in root.find('packages'):
for module in package.find('classes'):
filename = module.attrib['filename']
for relative_path, package_name in PACKAGES.items():
if filename.startswith(relative_path):
data = raw_package_data[package_name]
break
else:
message = f'unknown package: {module}'
raise ValueError(message)
for line in module.find('lines'):
if line.attrib['hits'] == '1':
data['hits'] += 1
else:
data['misses'] += 1
total_statements_covered = 0
total_statements = 0
coverage_data = {}
for package_name, data in sorted(raw_package_data.items()):
statements_covered = data['hits']
statements = statements_covered + data['misses']
total_statements_covered += statements_covered
total_statements += statements
coverage_data[package_name] = {'statements_covered': statements_covered, 'statements': statements}
coverage_data['total'] = {'statements_covered': total_statements_covered, 'statements': total_statements}
coverage_summary = ROOT / 'coverage-summary.json'
coverage_summary.write_text(orjson.dumps(coverage_data, option=orjson.OPT_INDENT_2).decode(), encoding='utf-8')
return 0
if __name__ == '__main__':
raise SystemExit(main())

View File

@@ -1,14 +0,0 @@
#!/usr/bin/env bash
set -ex
GIT_ROOT=$(git rev-parse --show-toplevel)
cd "$GIT_ROOT" || exit 1
[[ -x "$(command -v docker)" ]] || (
echo "docker not found. Make sure to have docker running to run this job."
exit 1
)
RATCHET_EXP_KEEP_NEWLINES=true find "${GIT_ROOT}/.github/workflows" -type f -iname '*.yml' -exec docker run -it --rm -v "${PWD}":"${PWD}" -w "${PWD}" -e RATCHET_EXP_KEEP_NEWLINES=true ghcr.io/sethvargo/ratchet:0.4.0 pin {} \;

View File

@@ -1,91 +0,0 @@
from __future__ import annotations
import httpx,os,dataclasses,datetime,time,argparse,typing as t
if (ENV:=os.getenv("PAPERSPACE_API_KEY")) is None:raise RuntimeError('This script requires setting "PAPERSPACE_API_KEY"')
HEADERS=httpx.Headers({'Authorization': f'Bearer {ENV}', 'Accept': 'application/json'})
API_URL='https://api.paperspace.com/v1'
@dataclasses.dataclass
class Machine:
id:str
inner:httpx.Client=dataclasses.field(default_factory=lambda:httpx.Client(headers=HEADERS,base_url=API_URL,timeout=60),repr=False)
def close(self):self.inner.close()
def __del__(self):self.close()
def __enter__(self):return self
def __exit__(self, *_: t.Any)->None:self.close()
@property
def metadata(self)->dict[str,t.Any]:return self.inner.get(f'/machines/{self.id}').json()
@property
def status(self)->t.Literal['off','ready','stopping','starting']:return self.metadata['state']
def start(self)->bool:
response=self.inner.patch(f'/machines/{self.id}/start')
if response.status_code==400 or self.status=='ready':
print('machine is already running')
return False
elif response.status_code!=200:raise ValueError(f'Error while starting machine: {response.json()}')
return True
def stop(self)->bool:
response=self.inner.patch(f'/machines/{self.id}/stop')
if response.status_code==400 or self.status=='off':
print('machine is already off')
return False
elif response.status_code!=200:raise ValueError(f'Error while stopping machine {response.json()}')
return True
@classmethod
def ci(cls,template_id:str)->Machine:
client = httpx.Client(headers=HEADERS,base_url=API_URL,timeout=60)
machines=client.get('/machines',params=dict(limit=1,name='openllm-ci')).json()
if len(machines['items']) == 1:
return cls(id=machines['items'][0]['id'],inner=client)
response=client.post('/machines',json=dict(
name=f'openllm-ci-{datetime.datetime.now().timestamp()}', machineType='A4000',templateId=template_id,
networkId=os.getenv("PAPERSPACE_NETWORK_ID"), diskSize=500,region='ny2',publicIpType='dynamic',startOnCreate=True))
if response.status_code!=200:raise ValueError(f'Failed while creating a machine: {response.json()}')
return cls(id=response.json()['data']['id'],inner=client)
def actions(self,ip:bool=False,id:bool=False)->str:
if ip:return self.metadata["publicIp"]
if id:return self.id
raise ValueError('cannot be all false.')
def main():
parser=argparse.ArgumentParser()
group=parser.add_mutually_exclusive_group(required=True)
group.add_argument('--start',metavar='ID')
group.add_argument('--stop',metavar='ID')
group.add_argument('--delete',metavar='ID')
group.add_argument('--ci-template',metavar='ID')
parser.add_argument('--output-ip',action='store_true')
parser.add_argument('--output-id',action='store_true')
args=parser.parse_args()
if args.ci_template:
machine=Machine.ci(args.ci_template)
while machine.status!='ready':time.sleep(5)
print(machine.actions(args.output_ip,args.output_id))
machine.close()
elif args.delete:
with httpx.Client(headers=HEADERS, base_url=API_URL, timeout=60) as client:
response=client.delete(f'/machines/{args.delete}')
if response.status_code!=200:
print('Error while deleting machine', response.json())
return 1
elif args.start:
with Machine(id=args.start) as machine:
if machine.start():
while machine.status!='ready':
print('Waiting for machine to be ready...')
time.sleep(5)
else:
print('Failed to start machine "%s"', machine.id)
return 1
elif args.stop:
with Machine(id=args.stop) as machine:
if machine.stop():
while machine.status!='off':
print('Waiting for machine to stop...')
time.sleep(5)
else:
print('Failed to stop machine')
return 1
return 0
if __name__ == "__main__":raise SystemExit(main())

View File

@@ -1,14 +0,0 @@
#!/usr/bin/env bash
set -eo pipefail
GIT_ROOT="$(git rev-parse --show-toplevel)"
cd "$GIT_ROOT" || exit 1
mirror() {
cp "$1" "$2"
}
mirror README.md openllm-python/README.md
mirror LICENSE.md openllm-python/LICENSE.md
mirror CHANGELOG.md openllm-python/CHANGELOG.md

View File

@@ -1,6 +0,0 @@
ipython
jupyter
tomlkit
ghapi
pre-commit
orjson

View File

@@ -1,148 +0,0 @@
#!/usr/bin/env bash
set -e -o pipefail
# Function to print script usage
print_usage() {
echo "Usage: $0 [--release <major|minor|patch|alpha>]"
}
# Function to validate release argument
validate_release() {
local release=$1
if [[ $release == "major" || $release == "minor" || $release == "patch" || $release == "alpha" ]]; then
return 0
else
return 1
fi
}
check_membership() {
local org="BentoML"
local username=$(gh api user | jq -r '.login')
if gh api orgs/$org/members/$username -q '.message' | grep -q "Not Found"; then
echo "ERROR: You must be a member of $org to run this script."
exit 1
fi
}
for cmd in gh jq; do
if ! command -v "$cmd" @ >&1 >/dev/null; then
echo "ERROR: $cmd not installed. Aborting..."
exit 1
fi
done
check_membership
# Check if release flag is provided
if [[ $1 == "--release" ]]; then
# Check if release argument is provided
if [[ -z $2 ]]; then
echo "Error: No release argument provided."
print_usage
exit 1
fi
release=$2
if ! validate_release "$release"; then
echo "Error: Invalid release argument. Only 'major', 'minor', 'patch', or 'alpha' are allowed."
print_usage
exit 1
fi
else
echo "Error: Unknown option or no option provided."
print_usage
exit 1
fi
# Get the current version and separate the alpha part if it exists
version="$(git describe --tags "$(git rev-list --tags --max-count=1)")"
VERSION="${version#v}"
# Initialize variables for alpha versioning
ALPHA=""
ALPHA_NUM=0
# Check if current version is an alpha version and split accordingly
if [[ $VERSION =~ -alpha ]]; then
IFS='-' read -r BASE_VERSION ALPHA <<<"$VERSION"
if [[ $ALPHA =~ [.] ]]; then
IFS='.' read -r ALPHA ALPHA_NUM <<<"$ALPHA"
fi
else
BASE_VERSION="$VERSION"
fi
# Save the current value of IFS to restore it later and split the base version
OLD_IFS=$IFS
IFS='.'
read -ra VERSION_BITS <<<"$BASE_VERSION"
IFS=$OLD_IFS
# Assign split version numbers
VNUM1=${VERSION_BITS[0]}
VNUM2=${VERSION_BITS[1]}
VNUM3=${VERSION_BITS[2]}
# Adjust the version numbers based on the release type
if [[ $release == 'major' ]]; then
VNUM1=$((VNUM1 + 1))
VNUM2=0
VNUM3=0
ALPHA="" # Reset alpha for major release
elif [[ $release == 'minor' ]]; then
if [[ -n $ALPHA ]]; then
ALPHA="" # Remove alpha suffix for minor release from an alpha version
else
VNUM2=$((VNUM2 + 1))
VNUM3=0
fi
elif [[ $release == 'patch' ]]; then
VNUM3=$((VNUM3 + 1))
ALPHA="" # Reset alpha for patch release
elif [[ $release == 'alpha' ]]; then
if [ -n "$ALPHA" ]; then
ALPHA_NUM=$((ALPHA_NUM + 1))
else
VNUM2=$((VNUM2 + 1))
VNUM3=0
ALPHA="alpha"
ALPHA_NUM=0
fi
fi
# Construct the new version string
if [ -n "$ALPHA" ]; then
if ((ALPHA_NUM > 0)); then
RELEASE_TAG="v$VNUM1.$VNUM2.$VNUM3-alpha.$ALPHA_NUM"
else
RELEASE_TAG="v$VNUM1.$VNUM2.$VNUM3-alpha"
fi
else
RELEASE_TAG="v$VNUM1.$VNUM2.$VNUM3"
fi
echo "Releasing version: $RELEASE_TAG"
if [[ -v DRYRUN ]]; then
exit 0
fi
echo "Running release actions (create-releases.yml)..."
echo '{"release_type": "'"$release"'"}' | gh workflow run create-releases.yml --repo bentoml/openllm --json
sleep 20
set -x
echo "Waiting for new tags to be released from 'create-releases.yml'"
while true; do
git pull --autostash --no-edit --gpg-sign --ff origin main
if git ls-remote -t --exit-code origin "refs/tags/${RELEASE_TAG}" &>/dev/null; then
break
fi
sleep 10
done

View File

@@ -1,9 +0,0 @@
pytest
pytest-cov
pytest-mock
pytest-randomly
pytest-rerunfailures
pytest-asyncio>=0.21.0
coverage[toml]
filelock>=3.7.1
pytest-xdist[psutil]

View File

@@ -1,27 +0,0 @@
#!/usr/bin/env bash
set -e
DEBUG=${DEBUG:-false}
[[ "${DEBUG}" == "true" ]] && set -x
GIT_ROOT=$(git rev-parse --show-toplevel)
cd "$GIT_ROOT" || exit 1
[[ -x "$(command -v docker)" ]] || (
echo "docker not found. Make sure to have docker running to run this job."
exit 1
)
docker version &>/dev/null || (
echo "docker is not healthy. Make sure to have docker running"
exit 1
)
[[ -z "${ACTIONS_TOKEN}" ]] && (
echo "ACTIONS_TOKEN not found. Make sure to have ACTIONS_TOKEN set to run this job."
exit 1
)
find "${GIT_ROOT}/.github/workflows" -type f -iname '*.yml' -exec docker run --rm -v "${PWD}":"${PWD}" -w "${PWD}" -e ACTIONS_TOKEN -e RATCHET_EXP_KEEP_NEWLINES=true ghcr.io/sethvargo/ratchet:0.4.0 update {} \;

View File

@@ -1,170 +0,0 @@
#!/usr/bin/env python3
from __future__ import annotations
import os, sys
from pathlib import Path
# currently we are assuming the indentation level is 2 for comments
START_COMMENT = f'# {os.path.basename(__file__)}: start\n'
END_COMMENT = f'# {os.path.basename(__file__)}: stop\n'
# Stubs for auto class
START_AUTO_STUBS_COMMENT = f'# {os.path.basename(__file__)}: auto stubs start\n'
END_AUTO_STUBS_COMMENT = f'# {os.path.basename(__file__)}: auto stubs stop\n'
# Stubs for actual imports
START_IMPORT_STUBS_COMMENT = f'# {os.path.basename(__file__)}: import stubs start\n'
END_IMPORT_STUBS_COMMENT = f'# {os.path.basename(__file__)}: import stubs stop\n'
ROOT = Path(__file__).parent.parent
_TARGET_FILE = ROOT / 'openllm-core' / 'src' / 'openllm_core' / '_configuration.py'
_TARGET_AUTO_FILE = ROOT / 'openllm-core' / 'src' / 'openllm_core' / 'config' / 'configuration_auto.py'
_TARGET_CORE_INIT_FILE = ROOT / 'openllm-core' / 'src' / 'openllm_core' / 'config' / '__init__.py'
_TARGET_INIT_FILE = ROOT / 'openllm-python' / 'src' / 'openllm' / '__init__.pyi'
_TARGET_IMPORT_UTILS_FILE = ROOT / 'openllm-core' / 'src' / 'openllm_core' / 'utils' / 'import_utils.pyi'
sys.path.insert(0, (ROOT / 'openllm-core' / 'src').__fspath__())
from openllm_core._configuration import GenerationConfig, ModelSettings
from openllm_core.config.configuration_auto import CONFIG_MAPPING_NAMES
from openllm_core.utils import codegen, import_utils as iutils
# from openllm_core.utils.peft import PeftType
def process_annotations(annotations: str) -> str:
if 'NotRequired' in annotations:
return annotations[len('NotRequired[') : -1]
elif 'Required' in annotations:
return annotations[len('Required[') : -1]
else:
return annotations
_transformed = {'fine_tune_strategies': 't.Dict[AdapterType, FineTuneConfig]'}
def main() -> int:
with _TARGET_FILE.open('r') as f:
processed = f.readlines()
start_idx, end_idx = processed.index(' ' * 2 + START_COMMENT), processed.index(' ' * 2 + END_COMMENT)
# NOTE: inline runtime __getitem__ overload process
lines: list[str] = []
lines.append(' ' * 2 + '# NOTE: ModelSettings arguments\n')
for keys, ForwardRef in codegen.get_annotations(ModelSettings).items():
lines.extend([
' ' * 2 + line
for line in [
'@overload\n',
f"def __getitem__(self, item: t.Literal['{keys}']) -> {process_annotations(ForwardRef.__forward_arg__)}: ...\n",
]
])
lines.append(' ' * 2 + '# NOTE: GenerationConfig arguments\n')
generation_config_anns = codegen.get_annotations(GenerationConfig)
for keys, type_pep563 in generation_config_anns.items():
lines.extend([
' ' * 2 + line
for line in ['@overload\n', f"def __getitem__(self, item: t.Literal['{keys}']) -> {type_pep563}: ...\n"]
])
# backward compatible
lines.extend([
' ' * 2 + line
for line in [
'@overload\n',
"def __getitem__(self, item: t.Literal['max_new_tokens']) -> int: ...\n",
'@overload\n',
"def __getitem__(self, item: t.Literal['start_name']) -> str: ...\n",
'@overload\n',
"def __getitem__(self, item: t.Literal['model_name']) -> str: ...\n",
]
])
processed = (
processed[:start_idx] + [' ' * 2 + START_COMMENT, *lines, ' ' * 2 + END_COMMENT] + processed[end_idx + 1 :]
)
with _TARGET_FILE.open('w') as f:
f.writelines(processed)
with _TARGET_AUTO_FILE.open('r') as f:
processed = f.readlines()
start_auto_stubs_idx, end_auto_stubs_idx = (
processed.index(' ' * 2 + START_AUTO_STUBS_COMMENT),
processed.index(' ' * 2 + END_AUTO_STUBS_COMMENT),
)
lines = []
for model, class_name in CONFIG_MAPPING_NAMES.items():
lines.extend([
' ' * 2 + line
for line in [
'@t.overload\n',
'@classmethod\n',
f"def for_model(cls, model_name: t.Literal['{model}'], **attrs: t.Any) -> openllm_core.config.{class_name}: ...\n",
]
])
lines.extend([
' ' * 2 + line
for line in [
'@t.overload\n',
'@classmethod\n',
'def for_model(cls, model_name: LiteralString, **attrs: t.Any) -> openllm_core.LLMConfig: ...\n',
]
])
processed = (
processed[:start_auto_stubs_idx]
+ [' ' * 2 + START_AUTO_STUBS_COMMENT, *lines, ' ' * 2 + END_AUTO_STUBS_COMMENT]
+ processed[end_auto_stubs_idx + 1 :]
)
with _TARGET_AUTO_FILE.open('w') as f:
f.writelines(processed)
with _TARGET_INIT_FILE.open('r') as f:
processed = f.readlines()
start_import_stubs_idx, end_import_stubs_idx = (
processed.index(START_IMPORT_STUBS_COMMENT),
processed.index(END_IMPORT_STUBS_COMMENT),
)
mm = {
'_configuration': ('GenerationConfig', 'LLMConfig'),
'_schemas': ('GenerationInput', 'GenerationOutput', 'MetadataOutput', 'MessageParam'),
'utils': ('api',),
}
lines = [
'from openllm_client import AsyncHTTPClient as AsyncHTTPClient, HTTPClient as HTTPClient',
    f'from openllm_core.config import CONFIG_MAPPING as CONFIG_MAPPING, CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES, AutoConfig as AutoConfig, {", ".join([a+" as "+a for a in CONFIG_MAPPING_NAMES.values()])}',
]
lines.extend([
f'from openllm_core.{module} import {", ".join([a+" as "+a for a in attr])}' for module, attr in mm.items()
])
processed = (
processed[:start_import_stubs_idx]
+ [START_IMPORT_STUBS_COMMENT, '\n'.join(lines) + '\n', END_IMPORT_STUBS_COMMENT]
+ processed[end_import_stubs_idx + 1 :]
)
with _TARGET_INIT_FILE.open('w') as f:
f.writelines(processed)
lines = [
'# fmt: off\n',
f'# AUTOGENERATED BY {os.path.basename(__file__)}. DO NOT EDIT\n',
'from .configuration_auto import CONFIG_MAPPING as CONFIG_MAPPING, CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES, AutoConfig as AutoConfig\n',
*[f'from .configuration_{k} import {a} as {a}\n' for k, a in CONFIG_MAPPING_NAMES.items()],
]
with _TARGET_CORE_INIT_FILE.open('w') as f:
f.writelines(lines)
lines = [
'# fmt: off\n',
f'# AUTOGENERATED BY {os.path.basename(__file__)}. DO NOT EDIT\n',
'import typing as t\n',
'def is_autoawq_available() -> bool: ...\n',
'def is_vllm_available() -> bool: ...\n',
*[f'def {k}() -> bool: ...\n' for k in iutils.caller],
'ENV_VARS_TRUE_VALUES: t.Set[str] = ...\n',
]
with _TARGET_IMPORT_UTILS_FILE.open('w') as f:
f.writelines(lines)
return 0
if __name__ == '__main__':
raise SystemExit(main())
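
For reference, a minimal sketch of what the script above writes between its start/stop markers, using a hypothetical `ModelSettings` key `backend` annotated as `NotRequired[str]` (the real run iterates `codegen.get_annotations(ModelSettings)`):

```python
# Minimal sketch with a hypothetical key; not the actual ModelSettings schema.
def process_annotations(annotations: str) -> str:
    if 'NotRequired' in annotations:
        return annotations[len('NotRequired[') : -1]
    elif 'Required' in annotations:
        return annotations[len('Required[') : -1]
    return annotations

key, forward_arg = 'backend', 'NotRequired[str]'
stub = [
    '  @overload\n',
    f"  def __getitem__(self, item: t.Literal['{key}']) -> {process_annotations(forward_arg)}: ...\n",
]
print(''.join(stub), end='')
# prints (indented two spaces, as inserted into _configuration.py):
#   @overload
#   def __getitem__(self, item: t.Literal['backend']) -> str: ...
```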

View File

@@ -1,88 +0,0 @@
#!/usr/bin/env python3
import concurrent.futures
import configparser
import os
from typing import List
_MYPY_CONFIG = {
'pretty': 'true',
'python_version': '3.9',
'show_error_codes': 'true',
'strict': 'true',
'plugins': 'pydantic.mypy',
'ignore_missing_imports': 'true',
'warn_unreachable': 'true',
'explicit_package_bases': 'true',
}
# Function to find .pyi files in a given directory
def pyi_in_subdir(directory: str, git_root: str) -> List[str]:
pyi_files = []
for root, _, files in os.walk(directory):
for file in files:
if file.endswith('.pyi') or file == '_typing_compat.py' or '_openllm_tiny' in file:
full_path = os.path.join(root, file)
# Convert to relative path with respect to the git root
relative_path = os.path.relpath(full_path, git_root)
pyi_files.append(relative_path)
return pyi_files
def find_pyi_files(git_root: str) -> List[str]:
# List all subdirectories
subdirectories = [
os.path.join(git_root, name)
for name in os.listdir(git_root)
if os.path.isdir(os.path.join(git_root, name)) and name not in ['venv', '.git', '.venv']
]
# Use a thread pool to execute searches concurrently
with concurrent.futures.ThreadPoolExecutor() as executor:
# Map of future to subdirectory
future_to_subdir = {executor.submit(pyi_in_subdir, subdir, git_root): subdir for subdir in subdirectories}
all_pyi_files = set()
for future in concurrent.futures.as_completed(future_to_subdir):
pyi_files = future.result()
all_pyi_files.update(pyi_files)
return list(all_pyi_files)
# Function to update mypy.ini file
def update_mypy_ini(pyi_files: List[str], mypy_ini_path: str) -> int:
config = configparser.ConfigParser()
config.read(mypy_ini_path)
# Existing files from mypy.ini
existing_files = config.get('mypy', 'files', fallback='').split(', ')
# Add new .pyi files if they are not already in the list
updated_files = existing_files + [f for f in pyi_files if f not in existing_files]
# Update the 'files' entry
config['mypy']['files'] = ', '.join(updated_files)
for key, value in _MYPY_CONFIG.items():
config.set('mypy', key, value)
# Write changes back to mypy.ini
with open(mypy_ini_path, 'w') as configfile:
configfile.write(
f'# The following is autogenerated by {os.path.join(os.path.basename(os.path.dirname(__file__)), os.path.basename(__file__))}\n'
)
config.write(configfile)
# Remove last newline if exists
with open(mypy_ini_path, 'rb+') as file:
file.seek(-1, os.SEEK_END)
if file.read(1) == b'\n':
file.seek(-1, os.SEEK_END)
file.truncate()
return 0
ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
MYPY_CONFIG = os.path.join(ROOT, 'mypy.ini')
if __name__ == '__main__':
raise SystemExit(update_mypy_ini(find_pyi_files(ROOT), MYPY_CONFIG))
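
A self-contained sketch of the merge step in `update_mypy_ini`, with hypothetical paths and an in-memory config standing in for the real `mypy.ini`:

```python
import configparser

# Sketch only: hypothetical paths, in-memory config instead of mypy.ini.
config = configparser.ConfigParser()
config['mypy'] = {'files': 'openllm-python/src/openllm/__init__.pyi'}

new_pyi = [
    'openllm-core/src/openllm_core/utils/import_utils.pyi',
    'openllm-python/src/openllm/__init__.pyi',  # already listed, so skipped
]
existing = config.get('mypy', 'files', fallback='').split(', ')
config['mypy']['files'] = ', '.join(existing + [f for f in new_pyi if f not in existing])
print(config['mypy']['files'])
# openllm-python/src/openllm/__init__.pyi, openllm-core/src/openllm_core/utils/import_utils.pyi
```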

View File

@@ -1,64 +0,0 @@
#!/usr/bin/env python3
import os, shutil, sys, tomlkit
START_COMMENT = f'<!-- {os.path.basename(__file__)}: start -->\n'
END_COMMENT = f'<!-- {os.path.basename(__file__)}: stop -->\n'
ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, os.path.join(ROOT, 'openllm-core', 'src'))
from openllm_core.config import CONFIG_MAPPING
from openllm_core.config.configuration_auto import CONFIG_TO_ALIAS_NAMES
def markdown_noteblock(text: str):
return ['\n', f'> **Note:** {text}\n']
def main() -> int:
with open(os.path.join(ROOT, 'README.md'), 'r') as f:
readme = f.readlines()
start_index, stop_index = readme.index(START_COMMENT), readme.index(END_COMMENT)
content = []
for it in CONFIG_MAPPING.values():
it = it()
architecture_name = it.__class__.__name__[:-6]
details_block = ['<details>\n', f'<summary>{architecture_name}</summary>\n\n', '### Quickstart\n']
nitem = CONFIG_TO_ALIAS_NAMES[it.__class__.__name__]
details_block.extend([
f'Run the following command to quickly spin up a {architecture_name} server:\n',
f"""\
```bash
openllm start {it['default_id']}{'' if not it['trust_remote_code'] else ' --trust-remote-code'}
```""",
'You can run the following code in a different terminal to interact with the server:',
"""\
```python
import openllm_client
client = openllm_client.HTTPClient('http://localhost:3000')
client.generate('What are large language models?')
```""",
*markdown_noteblock(
f'Any {architecture_name} variants can be deployed with OpenLLM. Visit the [HuggingFace Model Hub](https://huggingface.co/models?sort=trending&search={nitem}) to see more {architecture_name}-compatible models.\n'
),
'\n### Supported models\n',
f'You can specify any of the following {architecture_name} models via `openllm start`:\n\n',
])
list_ids = [f'- [{model_id}](https://huggingface.co/{model_id})' for model_id in it['model_ids']]
details_block.extend(list_ids)
details_block.append('\n</details>\n\n')
content.append('\n'.join(details_block))
readme = readme[:start_index] + [START_COMMENT] + content + [END_COMMENT] + readme[stop_index + 1 :]
with open(os.path.join(ROOT, 'README.md'), 'w') as f:
f.writelines(readme)
shutil.copyfile(os.path.join(ROOT, 'README.md'), os.path.join(ROOT, 'openllm-python', 'README.md'))
return 0
if __name__ == '__main__':
raise SystemExit(main())
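
To illustrate the shape of the README section this script generates, a sketch with made-up values (`Mistral`, `mistralai/Mistral-7B-Instruct-v0.1`); the real script reads these from `CONFIG_MAPPING` and `CONFIG_TO_ALIAS_NAMES`, and the inner bash code fence is elided here to keep the sketch short:

```python
# Sketch with made-up values; not pulled from the actual CONFIG_MAPPING.
architecture_name, default_id = 'Mistral', 'mistralai/Mistral-7B-Instruct-v0.1'
details_block = [
    '<details>',
    f'<summary>{architecture_name}</summary>\n',
    '### Quickstart',
    f'Run the following command to quickly spin up a {architecture_name} server:',
    f'    openllm start {default_id}',
    '\n### Supported models\n',
    f'- [{default_id}](https://huggingface.co/{default_id})',
    '\n</details>\n',
]
print('\n'.join(details_block))
```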

View File

@@ -1,46 +0,0 @@
#!/usr/bin/env python3
from __future__ import annotations
from decimal import ROUND_DOWN, Decimal
from pathlib import Path
import orjson
PRECISION = Decimal('.01')
ROOT = Path(__file__).resolve().parent.parent
def main() -> int:
coverage_summary = ROOT / 'coverage-summary.json'
coverage_data = orjson.loads(coverage_summary.read_text(encoding='utf-8'))
total_data = coverage_data.pop('total')
lines = ['\n', 'Package | Statements\n', '------- | ----------\n']
for package, data in sorted(coverage_data.items()):
statements_covered = data['statements_covered']
statements = data['statements']
rate = Decimal(statements_covered) / Decimal(statements) * 100
rate = rate.quantize(PRECISION, rounding=ROUND_DOWN)
lines.append(f'{package} | {100 if rate == 100 else rate}% ({statements_covered} / {statements})\n')
total_statements_covered = total_data['statements_covered']
total_statements = total_data['statements']
total_rate = Decimal(total_statements_covered) / Decimal(total_statements) * 100
total_rate = total_rate.quantize(PRECISION, rounding=ROUND_DOWN)
color = 'ok' if float(total_rate) >= 95 else 'critical'
lines.insert(0, f'![Code Coverage](https://img.shields.io/badge/coverage-{total_rate}%25-{color}?style=flat)\n')
lines.append(
f'**Summary** | {100 if total_rate == 100 else total_rate}% ({total_statements_covered} / {total_statements})\n'
)
coverage_report = ROOT / 'coverage-report.md'
with coverage_report.open('w', encoding='utf-8') as f:
f.write(''.join(lines))
return 0
if __name__ == '__main__':
raise SystemExit(main())
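
Note that coverage rates are truncated with `ROUND_DOWN` rather than rounded, so a package at 99.738% reports as 99.73%; a quick sketch with illustrative numbers:

```python
from decimal import ROUND_DOWN, Decimal

# Illustrative numbers only: 764 of 766 statements covered.
PRECISION = Decimal('.01')
rate = (Decimal(764) / Decimal(766) * 100).quantize(PRECISION, rounding=ROUND_DOWN)
print(f'{rate}%')  # 99.73% (plain rounding would give 99.74%)
```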

View File

@@ -1,26 +0,0 @@
#!/usr/bin/env bash
set -eo pipefail
GIT_ROOT="$(git rev-parse --show-toplevel)"
cd "$GIT_ROOT" || exit 1
mkdir -p dist
pushd openllm-client &>/dev/null
python -m build -w && mv dist/* ../dist
popd &>/dev/null
pushd openllm-core &>/dev/null
python -m build -w && mv dist/* ../dist
popd &>/dev/null
pushd openllm-python &>/dev/null
python -m build -w && mv dist/* ../dist
popd &>/dev/null