diff --git a/.editorconfig b/.editorconfig index b0815bbc..3fcb4be2 100644 --- a/.editorconfig +++ b/.editorconfig @@ -9,5 +9,5 @@ charset = utf-8 indent_style = space indent_size = 2 -[src/openllm/cli/entrypoint.py] +[openllm-python/src/openllm/cli/entrypoint.py] indent_size = unset diff --git a/.gitattributes b/.gitattributes index 1925cb33..226718d4 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,10 +1,10 @@ -nightly-requirements.txt linguist-generated=true -nightly-requirements-gpu.txt linguist-generated=true -tests/models/__snapshots__/* linguist-generated=true +openllm-python/tests/models/__snapshots__/* linguist-generated=true typings/**/*.pyi linguist-generated=true Formula/openllm.rb linguist-generated=true -src/openllm/utils/dummy_*.py linguist-generated=true -src/openllm/models/__init__.py linguist-generated=true +openllm-python/src/openllm/utils/dummy_*.py linguist-generated=true +openllm-python/src/openllm/models/__init__.py linguist-generated=true +openllm-python/README.md linguist-generated=true * text=auto eol=lf # Needed for setuptools-scm-git-archive .git_archival.txt export-subst +openllm-python/.git_archival.txt export-subst diff --git a/.github/actions/create_release_and_archive.sh b/.github/actions/create_release_and_archive.sh index 30529c09..51e98f18 100755 --- a/.github/actions/create_release_and_archive.sh +++ b/.github/actions/create_release_and_archive.sh @@ -26,7 +26,7 @@ All available models: \`\`\`openllm models\`\`\` To start a LLM: \`\`\`python -m openllm start opt\`\`\` -To run OpenLLM within a container environment (requires GPUs): \`\`\`docker run --gpus all -it --entrypoint=/bin/bash -P ghcr.io/bentoml/openllm:${TAG} openllm --help\`\`\` +To run OpenLLM within a container environment (requires GPUs): \`\`\`docker run --gpus all -it -P ghcr.io/bentoml/openllm:${TAG} start opt\`\`\` Find more information about this release in the [CHANGELOG.md](https://github.com/bentoml/OpenLLM/blob/main/CHANGELOG.md) diff --git a/.github/actions/release.sh b/.github/actions/release.sh index 1b91427a..c6525d5a 100755 --- a/.github/actions/release.sh +++ b/.github/actions/release.sh @@ -47,11 +47,11 @@ release_package() { local version="$1" echo "Releasing version ${version}..." jq --arg release_version "${version}" '.version = $release_version' < package.json > package.json.tmp && mv package.json.tmp package.json - pushd src/openllm-node &>/dev/null + pushd openllm-node &>/dev/null jq --arg release_version "${version}" '.version = $release_version' < package.json > package.json.tmp && mv package.json.tmp package.json popd &>/dev/null towncrier build --yes --version "${version}" - git add CHANGELOG.md changelog.d package.json src/openllm-node/package.json + git add CHANGELOG.md changelog.d package.json openllm-node/package.json git commit -S -sm "infra: prepare for release ${version} [generated] [skip ci]" git push origin main echo "Releasing tag ${version}..." 
&& git tag -a "v${version}" -sm "Release ${version} [generated by GitHub Actions]" diff --git a/assets/agent.gif b/.github/assets/agent.gif similarity index 100% rename from assets/agent.gif rename to .github/assets/agent.gif diff --git a/assets/main-banner.png b/.github/assets/main-banner.png similarity index 100% rename from assets/main-banner.png rename to .github/assets/main-banner.png diff --git a/assets/output.gif b/.github/assets/output.gif similarity index 100% rename from assets/output.gif rename to .github/assets/output.gif diff --git a/.github/dependabot.yml b/.github/dependabot.yml index dc2ca198..9cef2e59 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -1,17 +1,3 @@ -# Copyright 2023 BentoML Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - version: 2 updates: - package-ecosystem: github-actions @@ -22,7 +8,7 @@ updates: time: '09:00' # maintain required dependencies - package-ecosystem: pip - directory: '/' + directory: '/openllm-python' schedule: interval: 'weekly' open-pull-requests-limit: 5 diff --git a/.github/workflows/auto-bot.yml b/.github/workflows/auto-bot.yml index 4f4cc5e0..a868079c 100644 --- a/.github/workflows/auto-bot.yml +++ b/.github/workflows/auto-bot.yml @@ -13,7 +13,7 @@ jobs: if: ${{ github.actor == 'dependabot[bot]' }} steps: - name: Wait for tests to succeed - uses: lewagon/wait-on-check-action@v1.3.1 + uses: lewagon/wait-on-check-action@e106e5c43e8ca1edea6383a39a01c5ca495fd812 # ratchet:lewagon/wait-on-check-action@v1.3.1 with: ref: ${{ github.ref }} check-name: evergreen diff --git a/.github/workflows/binary-releases.yml b/.github/workflows/binary-releases.yml index fc464fa4..aa9d2c92 100644 --- a/.github/workflows/binary-releases.yml +++ b/.github/workflows/binary-releases.yml @@ -6,12 +6,14 @@ on: pull_request: branches: [main] paths-ignore: + - '*.md' - 'docs/**' - 'bazel/**' - 'typings/**' - - '*.md' - 'changelog.d/**' - 'assets/**' + - 'openllm-node/**' + - 'Formula/**' defaults: run: shell: bash --noprofile --norc -exo pipefail {0} @@ -29,7 +31,7 @@ jobs: runs-on: ubuntu-latest if: ${{ github.actor != 'dependabot[bot]' }} steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@c85c95e3d7251135ab7dc9ce3241c5835cc595a9 # ratchet:actions/checkout@v3 with: fetch-depth: 0 - name: Setup CI @@ -41,11 +43,12 @@ jobs: run: python -m pip install --upgrade build - name: Build run: python -m build + working-directory: ./openllm-python - name: Upload artifacts - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce # ratchet:actions/upload-artifact@v3 with: name: binary-artefacts - path: dist/* + path: openllm-python/dist/* if-no-files-found: error binaries: name: ${{ matrix.job.target }} (${{ matrix.job.os }}) @@ -92,7 +95,7 @@ jobs: PYAPP_PIP_EXTERNAL: 'true' steps: - name: Checkout code - uses: actions/checkout@v3 + uses: actions/checkout@c85c95e3d7251135ab7dc9ce3241c5835cc595a9 # ratchet:actions/checkout@v3 with: fetch-depth: 0 - name: Pull latest 
change @@ -102,18 +105,18 @@ jobs: run: >- mkdir $PYAPP_REPO && curl -L https://github.com/ofek/pyapp/releases/download/v$PYAPP_VERSION/source.tar.gz | tar --strip-components=1 -xzf - -C $PYAPP_REPO - name: Set up Python ${{ env.PYTHON_VERSION }} - uses: actions/setup-python@v4 + uses: actions/setup-python@61a6322f88396a6271a6ee3565807d608ecaddd1 # ratchet:actions/setup-python@v4 with: python-version: ${{ env.PYTHON_VERSION }} - name: Install Hatch run: pip install -U hatch - name: Install Rust toolchain - uses: dtolnay/rust-toolchain@stable + uses: dtolnay/rust-toolchain@00b49be78f40fba4e87296b2ead62868750bdd83 # ratchet:dtolnay/rust-toolchain@stable with: targets: ${{ matrix.job.target }} - name: Set up cross compiling if: matrix.job.cross - uses: taiki-e/install-action@v2 + uses: taiki-e/install-action@5265bea15b0d6367d8e293f306c880f1d0271190 # ratchet:taiki-e/install-action@v2 with: tool: cross - name: Configure cross compiling @@ -127,7 +130,7 @@ jobs: fi - name: Download Python artifacts if: ${{ !startsWith(github.event.ref, 'refs/tags') }} - uses: actions/download-artifact@v3 + uses: actions/download-artifact@9bc31d5ccc31df68ecc42ccf4149144866c47d8a # ratchet:actions/download-artifact@v3 with: name: binary-artefacts path: dist @@ -173,14 +176,14 @@ jobs: done - name: Upload staged archive if: runner.os != 'Linux' - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce # ratchet:actions/upload-artifact@v3 with: name: staged-${{ runner.os }} path: packaging/* if-no-files-found: error - name: Upload archive if: runner.os == 'Linux' - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce # ratchet:actions/upload-artifact@v3 with: name: standalone path: packaging/* @@ -194,9 +197,9 @@ jobs: if: ${{ github.event_name != 'pull_request' }} steps: - name: Checkout code - uses: actions/checkout@v3 + uses: actions/checkout@c85c95e3d7251135ab7dc9ce3241c5835cc595a9 # ratchet:actions/checkout@v3 - name: Set up Python ${{ env.PYTHON_VERSION }} - uses: actions/setup-python@v4 + uses: actions/setup-python@61a6322f88396a6271a6ee3565807d608ecaddd1 # ratchet:actions/setup-python@v4 with: python-version: ${{ env.PYTHON_VERSION }} - name: Pull latest change @@ -204,7 +207,7 @@ jobs: - name: Install PyOxidizer ${{ env.PYOXIDIZER_VERSION }} run: pip install pyoxidizer==${{ env.PYOXIDIZER_VERSION }} - name: Download staged binaries - uses: actions/download-artifact@v3 + uses: actions/download-artifact@9bc31d5ccc31df68ecc42ccf4149144866c47d8a # ratchet:actions/download-artifact@v3 with: name: staged-${{ runner.os }} path: archives @@ -233,13 +236,13 @@ jobs: mkdir installers mv build/*/release/*/*.{exe,msi} installers - name: Upload binaries - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce # ratchet:actions/upload-artifact@v3 with: name: standalone path: archives/* if-no-files-found: error - name: Upload installers - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce # ratchet:actions/upload-artifact@v3 with: name: installers path: installers/* diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 172a4ec9..fe5ee927 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -11,8 +11,8 @@ on: - "main" paths: - ".github/workflows/build.yaml" - - "src/openllm/bundle/oci/Dockerfile" - - "src/openllm/**" + - 
"openllm-python/src/openllm/bundle/oci/Dockerfile" + - "openllm-python/src/openllm/**" env: LINES: 120 COLUMNS: 120 @@ -37,14 +37,14 @@ jobs: ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }} steps: - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@v2 + uses: aws-actions/configure-aws-credentials@5fd3084fc36e372ff1fff382a39b10d03659f355 # ratchet:aws-actions/configure-aws-credentials@v2 with: aws-access-key-id: ${{ secrets.AWS_PROD_ACCESS_KEY_ID }} aws-secret-access-key: ${{ secrets.AWS_PROD_SECRET_ACCESS_KEY }} aws-region: ${{ env.AWS_REGION }} - name: Start EC2 Runner id: start-ec2-runner - uses: aarnphm/ec2-github-runner@main + uses: aarnphm/ec2-github-runner@af796d217e24ecbbc5a2c49e780cd90616e2b962 # ratchet:aarnphm/ec2-github-runner@main with: mode: start github-token: ${{ secrets.OPENLLM_PAT }} @@ -68,15 +68,15 @@ jobs: id-token: write security-events: write steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@c85c95e3d7251135ab7dc9ce3241c5835cc595a9 # ratchet:actions/checkout@v3 with: fetch-depth: 1 - name: Inject slug/short variables - uses: rlespinasse/github-slug-action@v4.4.1 + uses: rlespinasse/github-slug-action@102b1a064a9b145e56556e22b18b19c624538d94 # ratchet:rlespinasse/github-slug-action@v4.4.1 - name: Set up QEMU - uses: docker/setup-qemu-action@v2.2.0 + uses: docker/setup-qemu-action@2b82ce82d56a2a04d2637cd93a637ae1b359c0a7 # ratchet:docker/setup-qemu-action@v2.2.0 - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v2.9.1 + uses: docker/setup-buildx-action@4c0219f9ac95b02789c1075625400b2acbff50b1 # ratchet:docker/setup-buildx-action@v2.9.1 with: install: true driver-opts: | @@ -84,18 +84,18 @@ jobs: network=host - name: Install cosign if: github.event_name != 'pull_request' - uses: sigstore/cosign-installer@v3.1.1 + uses: sigstore/cosign-installer@6e04d228eb30da1757ee4e1dd75a0ec73a653e06 # ratchet:sigstore/cosign-installer@v3.1.1 with: cosign-release: 'v2.1.1' - name: Login to GitHub Container Registry - uses: docker/login-action@v2.2.0 + uses: docker/login-action@465a07811f14bebb1938fbed4728c6a1ff8901fc # ratchet:docker/login-action@v2.2.0 if: github.event_name != 'pull_request' with: registry: ghcr.io username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} - name: Login to public ECR - uses: docker/login-action@v2.2.0 + uses: docker/login-action@465a07811f14bebb1938fbed4728c6a1ff8901fc # ratchet:docker/login-action@v2.2.0 with: registry: public.ecr.aws username: ${{ secrets.AWS_ACCESS_KEY_ID }} @@ -105,7 +105,7 @@ jobs: - name: Extract metadata tags and labels on PRs if: github.event_name == 'pull_request' id: meta-pr - uses: docker/metadata-action@v4.6.0 + uses: docker/metadata-action@818d4b7b91585d195f67373fd9cb0332e31a7175 # ratchet:docker/metadata-action@v4.6.0 with: images: | public.ecr.aws/y5w8i4y6/bentoml/openllm @@ -116,7 +116,7 @@ jobs: - name: Extract metadata tags and labels for main, release or tag if: github.event_name != 'pull_request' id: meta - uses: docker/metadata-action@v4.6.0 + uses: docker/metadata-action@818d4b7b91585d195f67373fd9cb0332e31a7175 # ratchet:docker/metadata-action@v4.6.0 with: flavor: | latest=auto @@ -133,13 +133,13 @@ jobs: org.opencontainers.image.source="https://github.com/bentoml/OpenLLM" - name: Build and push Docker image id: build-and-push - uses: docker/build-push-action@v4 + uses: docker/build-push-action@2eb1c1961a95fc15694676618e422e8ba1d63825 # ratchet:docker/build-push-action@v4 env: AWS_ACCESS_KEY_ID: ${{ 
secrets.AWS_PROD_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PROD_SECRET_ACCESS_KEY }} with: context: . - file: src/openllm/bundle/oci/Dockerfile + file: openllm-python/src/openllm/bundle/oci/Dockerfile push: true platforms: 'linux/amd64' build-args: | @@ -160,7 +160,7 @@ jobs: COSIGN_EXPERIMENTAL: "true" run: echo "${{ steps.meta.outputs.tags }}" | xargs -I {} cosign sign --yes {}@${{ steps.build-and-push.outputs.digest }} - name: Run Trivy in GitHub SBOM mode and submit results to Dependency Graph - uses: aquasecurity/trivy-action@master + uses: aquasecurity/trivy-action@559eb1224e654a86c844a795e6702a0742c60c72 # ratchet:aquasecurity/trivy-action@master if: ${{ github.event_name != 'pull_request' }} with: image-ref: 'ghcr.io/bentoml/openllm:sha-${{ env.GITHUB_SHA_SHORT }}' @@ -169,7 +169,7 @@ jobs: github-pat: ${{ secrets.GITHUB_TOKEN }} scanners: 'vuln' - name: Run Trivy vulnerability scanner - uses: aquasecurity/trivy-action@master + uses: aquasecurity/trivy-action@559eb1224e654a86c844a795e6702a0742c60c72 # ratchet:aquasecurity/trivy-action@master if: ${{ github.event_name != 'pull_request' }} with: image-ref: 'ghcr.io/bentoml/openllm:sha-${{ env.GITHUB_SHA_SHORT }}' @@ -178,7 +178,7 @@ jobs: severity: 'CRITICAL' scanners: 'vuln' - name: Upload Trivy scan results to GitHub Security tab - uses: github/codeql-action/upload-sarif@v2 + uses: github/codeql-action/upload-sarif@a09933a12a80f87b87005513f0abb1494c27a716 # ratchet:github/codeql-action/upload-sarif@v2 if: ${{ github.event_name != 'pull_request' }} with: sarif_file: 'trivy-results.sarif' @@ -192,13 +192,13 @@ jobs: if: ${{ always() }} # required to stop the runner even if the error happened in the previous jobs steps: - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@v2 + uses: aws-actions/configure-aws-credentials@5fd3084fc36e372ff1fff382a39b10d03659f355 # ratchet:aws-actions/configure-aws-credentials@v2 with: aws-access-key-id: ${{ secrets.AWS_PROD_ACCESS_KEY_ID }} aws-secret-access-key: ${{ secrets.AWS_PROD_SECRET_ACCESS_KEY }} aws-region: ${{ env.AWS_REGION }} - name: Stop EC2 runner - uses: aarnphm/ec2-github-runner@main + uses: aarnphm/ec2-github-runner@af796d217e24ecbbc5a2c49e780cd90616e2b962 # ratchet:aarnphm/ec2-github-runner@main with: mode: stop github-token: ${{ secrets.OPENLLM_PAT }} diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0c27ae8a..f86b2c91 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -34,7 +34,7 @@ jobs: runs-on: ubuntu-latest name: code-quality steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@c85c95e3d7251135ab7dc9ce3241c5835cc595a9 # ratchet:actions/checkout@v3 with: fetch-depth: 0 - name: Setup CI @@ -42,8 +42,8 @@ jobs: with: python-version: ${{ env.STABLE_PYTHON_VERSION }} - name: Install OpenLLM - run: pip install -e . 
- - uses: pre-commit/action@v3.0.0 + run: pip install -e ./openllm-python + - uses: pre-commit/action@646c83fcd040023954eafda54b4db0192ce70507 # ratchet:pre-commit/action@v3.0.0 with: extra_args: --verbose tests: @@ -58,7 +58,7 @@ jobs: - os: 'windows-latest' name: tests (${{ matrix.python-version }}.${{ matrix.os }}) steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@c85c95e3d7251135ab7dc9ce3241c5835cc595a9 # ratchet:actions/checkout@v3 with: fetch-depth: 0 - name: Setup CI @@ -70,17 +70,18 @@ jobs: - name: Disambiguate coverage filename run: mv .coverage ".coverage.${{ matrix.os }}.${{ matrix.python-version }}" - name: Upload coverage data - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce # ratchet:actions/upload-artifact@v3 with: name: coverage-data path: .coverage.* coverage: name: report-coverage runs-on: ubuntu-latest + if: false needs: - tests steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@c85c95e3d7251135ab7dc9ce3241c5835cc595a9 # ratchet:actions/checkout@v3 with: fetch-depth: 0 - name: Setup CI @@ -88,17 +89,17 @@ jobs: with: python-version: ${{ env.STABLE_PYTHON_VERSION }} - name: Download coverage data - uses: actions/download-artifact@v3 + uses: actions/download-artifact@9bc31d5ccc31df68ecc42ccf4149144866c47d8a # ratchet:actions/download-artifact@v3 with: name: coverage-data - name: Combine coverage data run: hatch run coverage:combine - name: Export coverage reports run: | - hatch run coverage:report-xml - hatch run coverage:report-uncovered-html + hatch run coverage:report-xml openllm-python + hatch run coverage:report-uncovered-html openllm-python - name: Upload uncovered HTML report - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce # ratchet:actions/upload-artifact@v3 with: name: uncovered-html-report path: htmlcov @@ -109,7 +110,7 @@ jobs: run: hatch run coverage:write-summary-report - name: Update coverage pull request comment if: github.event_name == 'pull_request' && !github.event.pull_request.head.repo.fork - uses: marocchino/sticky-pull-request-comment@v2 + uses: marocchino/sticky-pull-request-comment@f6a2580ed520ae15da6076e7410b088d1c5dddd9 # ratchet:marocchino/sticky-pull-request-comment@v2 with: path: coverage-report.md cli-benchmark: @@ -118,7 +119,7 @@ jobs: env: HYPERFINE_VERSION: '1.12.0' steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@c85c95e3d7251135ab7dc9ce3241c5835cc595a9 # ratchet:actions/checkout@v3 with: fetch-depth: 0 - name: Install hyperfine @@ -130,7 +131,7 @@ jobs: with: python-version: ${{ env.STABLE_PYTHON_VERSION }} - name: Install self - run: pip install . 
+ run: pip install ./openllm-python - name: Speed run: hyperfine -m 100 --warmup 10 openllm brew-dry-run: @@ -146,7 +147,6 @@ jobs: evergreen: # https://github.com/marketplace/actions/alls-green#why if: always() needs: - - coverage - tests - quality - cli-benchmark @@ -154,7 +154,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Decide whether the needed jobs succeeded or failed - uses: re-actors/alls-green@release/v1 + uses: re-actors/alls-green@05ac9388f0aebcb5727afa17fcccfecd6f8ec5fe # ratchet:re-actors/alls-green@release/v1 with: jobs: ${{ toJSON(needs) }} concurrency: diff --git a/.github/workflows/cleanup.yml b/.github/workflows/cleanup.yml index 7b9d7051..0cab3126 100644 --- a/.github/workflows/cleanup.yml +++ b/.github/workflows/cleanup.yml @@ -8,7 +8,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Check out code - uses: actions/checkout@v3 + uses: actions/checkout@c85c95e3d7251135ab7dc9ce3241c5835cc595a9 # ratchet:actions/checkout@v3 - name: Cleanup run: "gh extension install actions/gh-actions-cache\n\nREPO=${{ github.repository }}\nBRANCH=\"refs/pull/${{ github.event.pull_request.number }}/merge\"\n\necho \"Fetching list of cache key\"\ncacheKeysForPR=$(gh actions-cache list -R $REPO -B $BRANCH -L 100 | cut -f 1 )\n\n## Setting this to not fail the workflow while deleting cache keys. \nset +e\necho \"Deleting caches...\"\nfor cacheKey in $cacheKeysForPR\ndo\n gh actions-cache delete $cacheKey -R $REPO -B $BRANCH --confirm\ndone\necho \"Done\"\n" env: diff --git a/.github/workflows/compile-pypi.yml b/.github/workflows/compile-pypi.yml index 0aa2b985..e2e8c742 100644 --- a/.github/workflows/compile-pypi.yml +++ b/.github/workflows/compile-pypi.yml @@ -35,7 +35,7 @@ jobs: name: Pure wheels and sdist distribution runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@c85c95e3d7251135ab7dc9ce3241c5835cc595a9 # ratchet:actions/checkout@v3 with: fetch-depth: 0 ref: '${{ inputs.tags }}' @@ -43,15 +43,19 @@ jobs: uses: ./.github/actions/setup-repo - name: Build run: hatch build + working-directory: openllm-python - name: Upload artifacts - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce # ratchet:actions/upload-artifact@v3 with: name: python-artefacts - path: dist/* + path: openllm-python/dist/* if-no-files-found: error mypyc: name: Compiled mypyc wheels (${{ matrix.name }}) runs-on: ${{ matrix.os }} + defaults: + run: + working-directory: ./openllm-python strategy: fail-fast: false matrix: @@ -71,22 +75,26 @@ jobs: name: macos-universal2 macos_arch: "universal2" steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@c85c95e3d7251135ab7dc9ce3241c5835cc595a9 # ratchet:actions/checkout@v3 with: fetch-depth: 0 ref: '${{ inputs.tags }}' - name: Setup CI uses: ./.github/actions/setup-repo with: - python-version: 3.9 + python-version: 3.8 - name: Build wheels via cibuildwheel - uses: pypa/cibuildwheel@v2.15.0 + uses: pypa/cibuildwheel@39a63b5912f086dd459cf6fcb13dcdd3fe3bc24d # ratchet:pypa/cibuildwheel@v2.15.0 + with: + package-dir: openllm-python + config-file: pyproject.toml env: + CIBW_PRERELEASE_PYTHONS: True CIBW_BEFORE_BUILD_MACOS: "rustup target add aarch64-apple-darwin" CIBW_ARCHS_MACOS: "${{ matrix.macos_arch }}" MYPYPATH: /project/typings - name: Upload wheels as workflow artifacts - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce # ratchet:actions/upload-artifact@v3 with: name: ${{ matrix.name }}-mypyc-wheels path: 
./wheelhouse/*.whl @@ -98,27 +106,27 @@ jobs: steps: # NOTE: Keep this section in sync with compile-pypi.yml - name: Download Python artifacts - uses: actions/download-artifact@v3 + uses: actions/download-artifact@9bc31d5ccc31df68ecc42ccf4149144866c47d8a # ratchet:actions/download-artifact@v3 with: name: python-artefacts path: dist - name: Download Linux x86_64 compiled artifacts - uses: actions/download-artifact@v3 + uses: actions/download-artifact@9bc31d5ccc31df68ecc42ccf4149144866c47d8a # ratchet:actions/download-artifact@v3 with: name: linux-x86_64-mypyc-wheels path: dist - name: Download MacOS x86_64 compiled artifacts - uses: actions/download-artifact@v3 + uses: actions/download-artifact@9bc31d5ccc31df68ecc42ccf4149144866c47d8a # ratchet:actions/download-artifact@v3 with: name: macos-x86_64-mypyc-wheels path: dist - name: Download MacOS arm64 compiled artifacts - uses: actions/download-artifact@v3 + uses: actions/download-artifact@9bc31d5ccc31df68ecc42ccf4149144866c47d8a # ratchet:actions/download-artifact@v3 with: name: macos-arm64-mypyc-wheels path: dist - name: Download MacOS universal2 compiled artifacts - uses: actions/download-artifact@v3 + uses: actions/download-artifact@9bc31d5ccc31df68ecc42ccf4149144866c47d8a # ratchet:actions/download-artifact@v3 with: name: macos-universal2-mypyc-wheels path: dist @@ -134,7 +142,7 @@ jobs: success: ${{ steps.everygreen.outputs.success }} steps: - name: Decide whether the needed jobs succeeded or failed - uses: re-actors/alls-green@release/v1 + uses: re-actors/alls-green@05ac9388f0aebcb5727afa17fcccfecd6f8ec5fe # ratchet:re-actors/alls-green@release/v1 id: evergreen with: jobs: ${{ toJSON(needs) }} diff --git a/.github/workflows/create-releases.yml b/.github/workflows/create-releases.yml index 276a72cf..aef5c489 100644 --- a/.github/workflows/create-releases.yml +++ b/.github/workflows/create-releases.yml @@ -30,7 +30,7 @@ jobs: outputs: version: ${{ steps.version.outputs.version }} steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@c85c95e3d7251135ab7dc9ce3241c5835cc595a9 # ratchet:actions/checkout@v3 with: fetch-depth: 0 token: ${{ secrets.OPENLLM_PAT }} @@ -40,7 +40,7 @@ jobs: python-version: '3.11' - name: Import bot's GPG key for signing commits id: import-gpg - uses: crazy-max/ghaction-import-gpg@v5 + uses: crazy-max/ghaction-import-gpg@72b6676b71ab476b77e676928516f6982eef7a41 # ratchet:crazy-max/ghaction-import-gpg@v5 with: gpg_private_key: ${{ secrets.GPG_PRIVATE_KEY }} passphrase: ${{ secrets.GPG_PASSPHRASE }} @@ -68,7 +68,7 @@ jobs: needs: - release name: Compile PyPI distribution for OpenLLM - uses: bentoml/OpenLLM/.github/workflows/compile-pypi.yml@main + uses: bentoml/OpenLLM/.github/workflows/compile-pypi.yml@2d33100d729008e322209b090f9dcbb40840b5f4 # ratchet:bentoml/OpenLLM/.github/workflows/compile-pypi.yml@main with: tags: ${{ needs.release.outputs.version }} publish-python: @@ -78,38 +78,40 @@ jobs: permissions: id-token: write steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@c85c95e3d7251135ab7dc9ce3241c5835cc595a9 # ratchet:actions/checkout@v3 with: ref: '${{ needs.release.outputs.version }}' token: ${{ secrets.OPENLLM_PAT }} # NOTE: Keep this section in sync with compile-pypi.yml - name: Download Python artifacts - uses: actions/download-artifact@v3 + uses: actions/download-artifact@9bc31d5ccc31df68ecc42ccf4149144866c47d8a # ratchet:actions/download-artifact@v3 with: name: python-artefacts path: dist - name: Download Linux x86_64 compiled artifacts - uses: actions/download-artifact@v3 + uses: 
actions/download-artifact@9bc31d5ccc31df68ecc42ccf4149144866c47d8a # ratchet:actions/download-artifact@v3 with: name: linux-x86_64-mypyc-wheels path: dist - name: Download MacOS x86_64 compiled artifacts - uses: actions/download-artifact@v3 + uses: actions/download-artifact@9bc31d5ccc31df68ecc42ccf4149144866c47d8a # ratchet:actions/download-artifact@v3 with: name: macos-x86_64-mypyc-wheels path: dist - name: Download MacOS arm64 compiled artifacts - uses: actions/download-artifact@v3 + uses: actions/download-artifact@9bc31d5ccc31df68ecc42ccf4149144866c47d8a # ratchet:actions/download-artifact@v3 with: name: macos-arm64-mypyc-wheels path: dist - name: Download MacOS universal2 compiled artifacts - uses: actions/download-artifact@v3 + uses: actions/download-artifact@9bc31d5ccc31df68ecc42ccf4149144866c47d8a # ratchet:actions/download-artifact@v3 with: name: macos-universal2-mypyc-wheels path: dist + - name: Smoke test compiled artefacts + run: ls -R dist - name: Publish to PyPI - uses: pypa/gh-action-pypi-publish@release/v1 + uses: pypa/gh-action-pypi-publish@b7f401de30cb6434a1e19f805ff006643653240e # ratchet:pypa/gh-action-pypi-publish@release/v1 with: print-hash: true prepare-next-dev-cycle: @@ -122,18 +124,18 @@ jobs: contents: write id-token: write steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@c85c95e3d7251135ab7dc9ce3241c5835cc595a9 # ratchet:actions/checkout@v3 with: fetch-depth: 0 - name: Setup python - uses: actions/setup-python@v4 + uses: actions/setup-python@61a6322f88396a6271a6ee3565807d608ecaddd1 # ratchet:actions/setup-python@v4 - name: Install dependencies run: pip install hatch towncrier - name: Install jq and curl run: sudo apt-get install -y jq curl - name: Import bot's GPG key for signing commits id: import-gpg-key - uses: crazy-max/ghaction-import-gpg@v5 + uses: crazy-max/ghaction-import-gpg@72b6676b71ab476b77e676928516f6982eef7a41 # ratchet:crazy-max/ghaction-import-gpg@v5 with: gpg_private_key: ${{ secrets.GPG_PRIVATE_KEY }} passphrase: ${{ secrets.GPG_PASSPHRASE }} @@ -162,16 +164,16 @@ jobs: DEV_VERSION="$VNUM1.$VNUM2.$VNUM3.dev0" echo "Bumping version to ${DEV_VERSION}..." 
jq --arg release_version "${DEV_VERSION}" '.version = $release_version' < package.json > package.json.tmp && mv package.json.tmp package.json - pushd src/openllm-node &>/dev/null + pushd openllm-node &>/dev/null jq --arg release_version "${DEV_VERSION}" '.version = $release_version' < package.json > package.json.tmp && mv package.json.tmp package.json popd &>/dev/null - git add package.json src/openllm-node/package.json && git commit -S -sm "infra: bump to dev version of ${DEV_VERSION} [generated] [skip ci]" + git add package.json openllm-node/package.json && git commit -S -sm "infra: bump to dev version of ${DEV_VERSION} [generated] [skip ci]" git push origin HEAD:main binary-distribution: if: github.repository_owner == 'bentoml' needs: compile-pypi name: Create binary/wheels distribution - uses: bentoml/OpenLLM/.github/workflows/binary-releases.yml@main + uses: bentoml/OpenLLM/.github/workflows/binary-releases.yml@2d33100d729008e322209b090f9dcbb40840b5f4 # ratchet:bentoml/OpenLLM/.github/workflows/binary-releases.yml@main release-notes: if: github.repository_owner == 'bentoml' needs: @@ -179,7 +181,7 @@ jobs: - publish-python - binary-distribution name: Create release notes and setup for next cycle - uses: bentoml/OpenLLM/.github/workflows/release-notes.yml@main + uses: bentoml/OpenLLM/.github/workflows/release-notes.yml@2d33100d729008e322209b090f9dcbb40840b5f4 # ratchet:bentoml/OpenLLM/.github/workflows/release-notes.yml@main with: tags: ${{ needs.release.outputs.version }} bump-homebrew-tap: @@ -192,7 +194,7 @@ jobs: contents: write id-token: write steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@c85c95e3d7251135ab7dc9ce3241c5835cc595a9 # ratchet:actions/checkout@v3 with: fetch-depth: 0 - name: Setup CI @@ -203,7 +205,7 @@ jobs: run: sudo apt-get install -y jq curl - name: Import bot's GPG key for signing commits id: import-gpg-key - uses: crazy-max/ghaction-import-gpg@v5 + uses: crazy-max/ghaction-import-gpg@72b6676b71ab476b77e676928516f6982eef7a41 # ratchet:crazy-max/ghaction-import-gpg@v5 with: gpg_private_key: ${{ secrets.GPG_PRIVATE_KEY }} passphrase: ${{ secrets.GPG_PASSPHRASE }} diff --git a/.github/workflows/release-notes.yml b/.github/workflows/release-notes.yml index b0e35acf..dc4a68d1 100644 --- a/.github/workflows/release-notes.yml +++ b/.github/workflows/release-notes.yml @@ -24,7 +24,7 @@ jobs: contents: write id-token: write steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@c85c95e3d7251135ab7dc9ce3241c5835cc595a9 # ratchet:actions/checkout@v3 with: fetch-depth: 0 ref: '${{ inputs.tags }}' @@ -35,47 +35,47 @@ jobs: - name: Create release notes run: ./.github/actions/create_release_and_archive.sh ${{ inputs.tags }} - name: Download Python artifacts - uses: actions/download-artifact@v3 + uses: actions/download-artifact@9bc31d5ccc31df68ecc42ccf4149144866c47d8a # ratchet:actions/download-artifact@v3 with: name: python-artefacts path: dist - name: Download Linux x86_64 compiled artifacts - uses: actions/download-artifact@v3 + uses: actions/download-artifact@9bc31d5ccc31df68ecc42ccf4149144866c47d8a # ratchet:actions/download-artifact@v3 with: name: linux-x86_64-mypyc-wheels path: dist - name: Download MacOS x86_64 compiled artifacts - uses: actions/download-artifact@v3 + uses: actions/download-artifact@9bc31d5ccc31df68ecc42ccf4149144866c47d8a # ratchet:actions/download-artifact@v3 with: name: macos-x86_64-mypyc-wheels path: dist - name: Download MacOS arm64 compiled artifacts - uses: actions/download-artifact@v3 + uses: 
actions/download-artifact@9bc31d5ccc31df68ecc42ccf4149144866c47d8a # ratchet:actions/download-artifact@v3 with: name: macos-arm64-mypyc-wheels path: dist - name: Download MacOS universal2 compiled artifacts - uses: actions/download-artifact@v3 + uses: actions/download-artifact@9bc31d5ccc31df68ecc42ccf4149144866c47d8a # ratchet:actions/download-artifact@v3 with: name: macos-universal2-mypyc-wheels path: dist - name: Download binaries - uses: actions/download-artifact@v3 + uses: actions/download-artifact@9bc31d5ccc31df68ecc42ccf4149144866c47d8a # ratchet:actions/download-artifact@v3 with: name: standalone path: archives - name: Download standalone MacOS - uses: actions/download-artifact@v3 + uses: actions/download-artifact@9bc31d5ccc31df68ecc42ccf4149144866c47d8a # ratchet:actions/download-artifact@v3 with: name: staged-macOS path: archives - name: Download installers - uses: actions/download-artifact@v3 + uses: actions/download-artifact@9bc31d5ccc31df68ecc42ccf4149144866c47d8a # ratchet:actions/download-artifact@v3 with: name: installers path: installers - name: Create release - uses: softprops/action-gh-release@v1 + uses: softprops/action-gh-release@de2c0eb89ae2a093876385947365aca7b0e5f844 # ratchet:softprops/action-gh-release@v1 with: # Use GH feature to populate the changelog automatically generate_release_notes: true diff --git a/.gitignore b/.gitignore index 45dff7a8..5496bede 100644 --- a/.gitignore +++ b/.gitignore @@ -141,4 +141,4 @@ pyapp /target .pdm-python -/src/openllm/_version.py +/openllm-python/src/openllm/_version.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 80826181..e754b4ac 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -14,7 +14,7 @@ ci: autoupdate_schedule: weekly - skip: [check-models-table-update, changelog-dry-run, pyright, yapf, mypy] + skip: [check-models-table-update, changelog-dry-run, pyright, yapf, mypy, sync-readme] autofix_commit_msg: "ci: auto fixes from pre-commit.ci\n\nFor more information, see https://pre-commit.ci" autoupdate_commit_msg: 'ci: pre-commit autoupdate [pre-commit.ci]' autofix_prs: false @@ -37,7 +37,7 @@ repos: - id: interrogate verbose: true types: [python] - exclude: ^(docs|tools|tests) + exclude: ^(docs|tools|openllm-python/tests) args: [--config=pyproject.toml] - repo: https://github.com/pre-commit/mirrors-mypy rev: v1.5.0 @@ -49,9 +49,11 @@ repos: examples/.*| tools/.*| tests/.*| - src/openllm/playground/.*| + openllm-python/src/openllm/playground/.*| + openllm-python/tests/.*| .github/.*| - cz.py + cz.py | + hatch_build.py )$ additional_dependencies: - click==8.1.3 @@ -72,7 +74,7 @@ repos: verbose: true exclude: | (?x)^( - tests/models/.* + openllm-python/tests/models/.* )$ - id: check-yaml args: ['--unsafe'] @@ -93,6 +95,12 @@ repos: language: script verbose: true files: README.md + - id: sync-readme + name: sync readme with python core library + entry: ./tools/sync-readme + language: script + verbose: true + files: README.md - id: changelog-dry-run name: Running changelog dry-run entry: hatch run changelog diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index 48d853e5..6061e67d 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -65,24 +65,31 @@ Before you can start developing, you'll need to set up your environment: This will automatically enter a virtual environment and update the relevant dependencies. 
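The note below (and the `pip install -e ./openllm-python` lines this diff adds to `ci.yml` and `hatch.toml`) all reduce to one editable install pointed at the relocated package directory. A minimal sketch of scripting it, assuming a checkout of the repository root; the helper name is hypothetical:

```python
# Hypothetical helper mirroring the `pip install -e ./openllm-python`
# command used in this diff's ci.yml and hatch.toml changes.
import subprocess
import sys


def editable_install(repo_root: str = ".") -> None:
    # Use the active interpreter's pip so the editable install
    # lands in the current environment.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-e", f"{repo_root}/openllm-python"])


if __name__ == "__main__":
    editable_install()
```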
+> [!NOTE]
+> If you want an editable install, make sure to install it from the `openllm-python` folder.
+
## Project Structure

Here's a high-level overview of our project structure:

-```
+```prolog
openllm/
-├── examples # Usage demonstration scripts
-├── src
-│ ├── openllm # openllm core
-│ └── openllm-node # openllm nodejs library
-├── tests # Automated Tests
-├── tools # Utilities Script
-├── typings # Typing Checking Utilities Module and Classes
-├── DEVELOPMENT.md # The project's Developer Guide
-├── LICENSE # Use terms and conditions
-├── package.json # Node.js or JavaScript dependencies
-├── pyproject.toml # Python Project Specification File (PEP 518)
-└── README.md # The project's README file
+├── ADDING_NEW_MODEL.md # How to add a new model
+├── CHANGELOG.md # Generated changelog
+├── CITATION.cff # Citation File Format
+├── DEVELOPMENT.md # The project's Developer Guide
+├── Formula # Homebrew Formula
+├── LICENSE.md # Use terms and conditions
+├── README.md # The project's README file
+├── STYLE.md # The project's Style Guide
+├── cz.py # code-golf commitizen
+├── examples # Usage demonstration scripts
+├── openllm-node # openllm node library
+├── openllm-python # openllm python library
+│   └── src
+│      └── openllm # openllm core implementation
+├── pyproject.toml # Python Project Specification File (PEP 518)
+└── tools # Utility scripts
```

## Development Workflow

@@ -167,13 +174,25 @@ hatch run compile
```

> [!IMPORTANT]
-> This will compiled some performance sensitive modules with mypyc. The compiled `.so` or `.pyd` can be found
-> under `/src/openllm`. If you run into any issue, run `hatch run recompile`
+> This will compile some performance-sensitive modules with mypyc.
+> The compiled `.so` or `.pyd` can be found
+> under `/openllm-python/src/openllm`. If you run into any issue, run `hatch run recompile`.

## Style

See [STYLE.md](STYLE.md) for our style guide.

+## Working with OpenLLM's CI/CD
+
+After you change or update anything CI-related under `.github`, run `./tools/lock-actions` to lock the action versions.
+
+## Install from a git archive
+
+```bash
+pip install 'https://github.com/bentoml/OpenLLM/archive/main.tar.gz#subdirectory=openllm-python'
+```
+
+
## Releasing a New Version

To release a new version, use `./tools/run-release-action`. It requires `gh`,
diff --git a/README.md b/README.md
index adbfc459..1327dd53 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-![Banner for OpenLLM](/assets/main-banner.png)
+![Banner for OpenLLM](/.github/assets/main-banner.png)
@@ -58,7 +58,7 @@ Images or deploy as serverless endpoint via
-![Gif showing OpenLLM Intro](/assets/output.gif)
+![Gif showing OpenLLM Intro](/.github/assets/output.gif)
@@ -136,9 +136,9 @@ openllm start flan-t5 --model-id google/flan-t5-large
```

> [!NOTE]
-> `openllm` also supports all variants of fine-tuning weights,
-> custom model path as well as quantized weights for any of the supported models
-> as long as it can be loaded with the model architecture. Refer to
+> `openllm` also supports all variants of fine-tuning weights, custom
+> model paths as well as quantized weights for any of the supported models, as
+> long as they can be loaded with the model architecture. Refer to
> [supported models](https://github.com/bentoml/OpenLLM/tree/main#-supported-models)
> section for models' architecture.
@@ -532,7 +532,8 @@ client.embed("I like to eat apples")
```

> [!NOTE]
-> Currently, the following model family supports embeddings: Llama, T5 (Flan-T5, FastChat, etc.), ChatGLM
+> Currently, the following model families support embeddings: Llama, T5
+> (Flan-T5, FastChat, etc.), ChatGLM

## ⚙️ Integrations

@@ -582,8 +583,9 @@ llm("What is the difference between a duck and a goose? And why there are so man
```

> [!IMPORTANT]
-> By default, OpenLLM use `safetensors` format for saving models. If the model doesn't support safetensors,
-> make sure to pass `serialisation="legacy"` to use the legacy PyTorch bin format.
+> By default, OpenLLM uses the `safetensors` format for saving models.
+> If the model doesn't support safetensors, make sure to pass
+> `serialisation="legacy"` to use the legacy PyTorch bin format.

`langchain.llms.OpenLLM` has the capability to interact with remote OpenLLM
Server. Given there is an OpenLLM server deployed elsewhere, you can connect to
@@ -638,8 +640,8 @@ agent.run("Is the following `text` positive or negative?", text="I don't like ho
```

> [!IMPORTANT]
-> Only `starcoder` is currently supported with Agent integration. The
-> example above was also run with four T4s on EC2 `g4dn.12xlarge`
+> Only `starcoder` is currently supported with Agent integration.
+> The example above was also run with four T4s on EC2 `g4dn.12xlarge`.

If you want to use OpenLLM client to ask questions to the running agent, you can also do so:

@@ -657,7 +659,7 @@ client.ask_agent(
-![Gif showing Agent integration](/assets/agent.gif)
+![Gif showing Agent integration](/.github/assets/agent.gif)
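The hunks above only re-wrap the callouts; the LangChain code samples around them are elided from this diff. For orientation, connecting `langchain.llms.OpenLLM` to an already-running server looks roughly like the sketch below — the `server_url`/`server_type` keyword arguments are assumed from LangChain's wrapper and are not shown in this diff:

```python
# Sketch: point LangChain's OpenLLM wrapper at a server started elsewhere
# with `openllm start`, instead of loading a model in-process.
from langchain.llms import OpenLLM

llm = OpenLLM(server_url="http://localhost:3000", server_type="http")
print(llm("What is the difference between a duck and a goose?"))
```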
@@ -751,7 +753,7 @@ OpenLLM collects usage data to enhance user experience and improve the product. We only report OpenLLM's internal API calls and ensure maximum privacy by excluding sensitive information. We will never collect user code, model data, or stack traces. For usage tracking, check out the -[code](./src/openllm/utils/analytics.py). +[code](https://github.com/bentoml/OpenLLM/blob/main/openllm-python/src/openllm/utils/analytics.py). You can opt out of usage tracking by using the `--do-not-track` CLI option: diff --git a/compile.sh b/compile.sh index c3e8a361..8b1e843a 100644 --- a/compile.sh +++ b/compile.sh @@ -1,4 +1,4 @@ #!/usr/bin/env bash SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &> /dev/null && pwd) -HATCH_VERBOSE=3 MYPY_CONFIG_FILE_DIR="${SCRIPT_DIR}" HATCH_BUILD_HOOKS_ENABLE=1 MYPYPATH="${SCRIPT_DIR}/typings" python -m build -w -C--global-option=--verbose "$@" +HATCH_VERBOSE=3 MYPY_CONFIG_FILE_DIR="${SCRIPT_DIR}" HATCH_BUILD_HOOKS_ENABLE=1 MYPYPATH="${SCRIPT_DIR}/typings" python -m build openllm-python -w -C--global-option=--verbose "$@" hatch clean diff --git a/cz.py b/cz.py index e363e732..8068b77a 100755 --- a/cz.py +++ b/cz.py @@ -1,29 +1,24 @@ #!/usr/bin/env python3 from __future__ import annotations -import itertools -import os -import token -import tokenize - +import itertools, os, token, tokenize from tabulate import tabulate TOKEN_WHITELIST = [token.OP, token.NAME, token.NUMBER, token.STRING] - -if __name__ == "__main__": +def main() -> int: headers = ["Name", "Lines", "Tokens/Line"] table = [] - for path, _subdirs, files in os.walk(os.path.join("src", "openllm")): + for path, _, files in os.walk(os.path.join("openllm-python", "src", "openllm")): for name in files: if not name.endswith(".py"): continue filepath = os.path.join(path, name) with tokenize.open(filepath) as file_: tokens = [t for t in tokenize.generate_tokens(file_.readline) if t.type in TOKEN_WHITELIST] token_count, line_count = len(tokens), len(set([t.start[0] for t in tokens])) - table.append([filepath, line_count, token_count / line_count if line_count != 0 else 0]) - + table.append([filepath.replace(os.path.join("openllm-python","src"), ""), line_count, token_count / line_count if line_count != 0 else 0]) print(tabulate([headers, *sorted(table, key=lambda x: -x[1])], headers="firstrow", floatfmt=".1f") + "\n") - for dir_name, group in itertools.groupby(sorted([(x[0].rsplit("/", 1)[0], x[1]) for x in table]), key=lambda x: x[0]): - print(f"{dir_name:40s} : {sum([x[1] for x in group]):6d}") - + print(f"{dir_name:35s} : {sum([x[1] for x in group]):6d}") print(f"\ntotal line count: {sum([x[1] for x in table])}") + return 0 + +if __name__ == "__main__": raise SystemExit(main()) diff --git a/hatch.toml b/hatch.toml index 027abb15..6a75a848 100644 --- a/hatch.toml +++ b/hatch.toml @@ -5,7 +5,7 @@ content-type = "text/markdown" text = """

- Banner for OpenLLM + Banner for OpenLLM

@@ -18,7 +18,7 @@ start-after = "\n" text = """

- Gif showing OpenLLM Intro + Gif showing OpenLLM Intro

""" [[metadata.hooks.fancy-pypi-readme.fragments]] @@ -29,7 +29,7 @@ start-after = "\n" text = """

- Gif showing Agent integration + Gif showing Agent integration

""" [[metadata.hooks.fancy-pypi-readme.fragments]] @@ -55,6 +55,7 @@ text = """ """ [envs.default] dependencies = [ + "openllm[opt,chatglm,fine-tune] @ {root:uri}/openllm-python", # NOTE: To run all hooks "pre-commit", # NOTE: towncrier for changelog @@ -73,6 +74,9 @@ dependencies = [ "types-PyYAML", "types-protobuf", ] +pre-install-commands = [ + "pip install -e ./openllm-python", # +] [envs.default.scripts] changelog = "towncrier build --version main --draft" check-stubs = ["./tools/update-config-stubs.py", "./tools/update-models-import.py", "update-dummy"] @@ -92,6 +96,7 @@ typing = ["- pre-commit run mypy {args:-a}", "- pre-commit run pyright {args:-a} update-dummy = ["- ./tools/update-dummy.py", "./tools/update-dummy.py"] [envs.tests] dependencies = [ + "openllm[opt,chatglm,fine-tune] @ {root:uri}/openllm-python", # NOTE: interact with docker for container tests. "docker", # NOTE: Tests strategies with Hypothesis and pytest, and snapshot testing with syrupy @@ -108,15 +113,14 @@ dependencies = [ "hypothesis", "syrupy", ] -features = ["flan-t5", "baichuan"] skip-install = false template = "tests" [envs.tests.scripts] _run_script = "pytest --cov --cov-report={env:COVERAGE_REPORT:term-missing} --cov-config=pyproject.toml -vv" -distributed = "_run_script --reruns 5 --reruns-delay 3 --ignore tests/models -n 3 -r aR {args:tests}" -models = "_run_script -s {args:tests/models}" -python = "_run_script --reruns 5 --reruns-delay 3 --ignore tests/models -r aR {args:tests}" -snapshot-models = "_run_script -s --snapshot-update {args:tests/models}" +distributed = "_run_script --reruns 5 --reruns-delay 3 --ignore openllm-python/tests/models -n 3 -r aR {args:openllm-python/tests}" +models = "_run_script -s {args:openllm-python/tests/models}" +python = "_run_script --reruns 5 --reruns-delay 3 --ignore openllm-python/tests/models -r aR {args:openllm-python/tests}" +snapshot-models = "_run_script -s --snapshot-update {args:openllm-python/tests/models}" [envs.tests.overrides] env.GITHUB_ACTIONS.env-vars = "COVERAGE_REPORT=" [envs.coverage] @@ -125,6 +129,6 @@ detached = true [envs.coverage.scripts] combine = "coverage combine {args}" generate-summary = "python tools/generate-coverage.py" -report-uncovered-html = "coverage html --skip-covered --skip-empty" -report-xml = "coverage xml" +report-uncovered-html = "coverage html --skip-covered --skip-empty {args}" +report-xml = "coverage xml {args}" write-summary-report = "python tools/write-coverage-report.py" diff --git a/hatch_build.py b/hatch_build.py new file mode 100644 index 00000000..07d2c476 --- /dev/null +++ b/hatch_build.py @@ -0,0 +1,7 @@ +from __future__ import annotations +import os, typing as t +from hatchling.metadata.plugin.interface import MetadataHookInterface + +class CustomMetadataHook(MetadataHookInterface): + def update(self, metadata: dict[str, t.Any]) -> None: + if os.environ.get("HATCH_ENV_ACTIVE", "not-dev") != "dev": metadata["dependencies"] = [f"openllm[opt,chatglm,fine-tune]=={metadata['version']}"] diff --git a/nightly-requirements-gpu.txt b/nightly-requirements-gpu.txt deleted file mode 100644 index 387d3ab8..00000000 --- a/nightly-requirements-gpu.txt +++ /dev/null @@ -1,5 +0,0 @@ -# This file is generated by `tools/dependencies.py`. # DO NOT EDIT -# For Jax, Flax, Tensorflow, PyTorch CUDA support, please refers to their official installation for your specific setup. 
--r nightly-requirements.txt --e .[all] -vllm @ git+https://github.com/vllm-project/vllm.git@main diff --git a/nightly-requirements.txt b/nightly-requirements.txt deleted file mode 100644 index 7ec37722..00000000 --- a/nightly-requirements.txt +++ /dev/null @@ -1,9 +0,0 @@ -# This file is generated by `tools/dependencies.py`. DO NOT EDIT --e .[playground,flan-t5] -bentoml[grpc,io] @ git+https://github.com/bentoml/bentoml.git@main -peft @ git+https://github.com/huggingface/peft.git@main -transformers[torch,tokenizers,accelerate] @ git+https://github.com/huggingface/transformers.git@main -optimum @ git+https://github.com/huggingface/optimum.git@main -accelerate @ git+https://github.com/huggingface/accelerate.git@main -bitsandbytes @ git+https://github.com/TimDettmers/bitsandbytes.git@main -trl @ git+https://github.com/lvwerra/trl.git@main diff --git a/src/openllm-node/package.json b/openllm-node/package.json similarity index 100% rename from src/openllm-node/package.json rename to openllm-node/package.json diff --git a/src/openllm-node/src/index.ts b/openllm-node/src/index.ts similarity index 100% rename from src/openllm-node/src/index.ts rename to openllm-node/src/index.ts diff --git a/src/openllm-node/tsconfig.cjs.json b/openllm-node/tsconfig.cjs.json similarity index 100% rename from src/openllm-node/tsconfig.cjs.json rename to openllm-node/tsconfig.cjs.json diff --git a/src/openllm-node/tsconfig.json b/openllm-node/tsconfig.json similarity index 100% rename from src/openllm-node/tsconfig.json rename to openllm-node/tsconfig.json diff --git a/openllm-python/.git_archival.txt b/openllm-python/.git_archival.txt new file mode 120000 index 00000000..d7a42b25 --- /dev/null +++ b/openllm-python/.git_archival.txt @@ -0,0 +1 @@ +../.git_archival.txt \ No newline at end of file diff --git a/openllm-python/LICENSE.md b/openllm-python/LICENSE.md new file mode 100644 index 00000000..7fd0f057 --- /dev/null +++ b/openllm-python/LICENSE.md @@ -0,0 +1,194 @@ +Apache License +============== + +_Version 2.0, January 2004_ +_<>_ + +### Terms and Conditions for use, reproduction, and distribution + +#### 1. Definitions + +“License” shall mean the terms and conditions for use, reproduction, and +distribution as defined by Sections 1 through 9 of this document. + +“Licensor” shall mean the copyright owner or entity authorized by the copyright +owner that is granting the License. + +“Legal Entity” shall mean the union of the acting entity and all other entities +that control, are controlled by, or are under common control with that entity. +For the purposes of this definition, “control” means **(i)** the power, direct or +indirect, to cause the direction or management of such entity, whether by +contract or otherwise, or **(ii)** ownership of fifty percent (50%) or more of the +outstanding shares, or **(iii)** beneficial ownership of such entity. + +“You” (or “Your”) shall mean an individual or Legal Entity exercising +permissions granted by this License. + +“Source” form shall mean the preferred form for making modifications, including +but not limited to software source code, documentation source, and configuration +files. + +“Object” form shall mean any form resulting from mechanical transformation or +translation of a Source form, including but not limited to compiled object code, +generated documentation, and conversions to other media types. 
+ +“Work” shall mean the work of authorship, whether in Source or Object form, made +available under the License, as indicated by a copyright notice that is included +in or attached to the work (an example is provided in the Appendix below). + +“Derivative Works” shall mean any work, whether in Source or Object form, that +is based on (or derived from) the Work and for which the editorial revisions, +annotations, elaborations, or other modifications represent, as a whole, an +original work of authorship. For the purposes of this License, Derivative Works +shall not include works that remain separable from, or merely link (or bind by +name) to the interfaces of, the Work and Derivative Works thereof. + +“Contribution” shall mean any work of authorship, including the original version +of the Work and any modifications or additions to that Work or Derivative Works +thereof, that is intentionally submitted to Licensor for inclusion in the Work +by the copyright owner or by an individual or Legal Entity authorized to submit +on behalf of the copyright owner. For the purposes of this definition, +“submitted” means any form of electronic, verbal, or written communication sent +to the Licensor or its representatives, including but not limited to +communication on electronic mailing lists, source code control systems, and +issue tracking systems that are managed by, or on behalf of, the Licensor for +the purpose of discussing and improving the Work, but excluding communication +that is conspicuously marked or otherwise designated in writing by the copyright +owner as “Not a Contribution.” + +“Contributor” shall mean Licensor and any individual or Legal Entity on behalf +of whom a Contribution has been received by Licensor and subsequently +incorporated within the Work. + +#### 2. Grant of Copyright License + +Subject to the terms and conditions of this License, each Contributor hereby +grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, +irrevocable copyright license to reproduce, prepare Derivative Works of, +publicly display, publicly perform, sublicense, and distribute the Work and such +Derivative Works in Source or Object form. + +#### 3. Grant of Patent License + +Subject to the terms and conditions of this License, each Contributor hereby +grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, +irrevocable (except as stated in this section) patent license to make, have +made, use, offer to sell, sell, import, and otherwise transfer the Work, where +such license applies only to those patent claims licensable by such Contributor +that are necessarily infringed by their Contribution(s) alone or by combination +of their Contribution(s) with the Work to which such Contribution(s) was +submitted. If You institute patent litigation against any entity (including a +cross-claim or counterclaim in a lawsuit) alleging that the Work or a +Contribution incorporated within the Work constitutes direct or contributory +patent infringement, then any patent licenses granted to You under this License +for that Work shall terminate as of the date such litigation is filed. + +#### 4. 
Redistribution + +You may reproduce and distribute copies of the Work or Derivative Works thereof +in any medium, with or without modifications, and in Source or Object form, +provided that You meet the following conditions: + +* **(a)** You must give any other recipients of the Work or Derivative Works a copy of +this License; and +* **(b)** You must cause any modified files to carry prominent notices stating that You +changed the files; and +* **(c)** You must retain, in the Source form of any Derivative Works that You distribute, +all copyright, patent, trademark, and attribution notices from the Source form +of the Work, excluding those notices that do not pertain to any part of the +Derivative Works; and +* **(d)** If the Work includes a “NOTICE” text file as part of its distribution, then any +Derivative Works that You distribute must include a readable copy of the +attribution notices contained within such NOTICE file, excluding those notices +that do not pertain to any part of the Derivative Works, in at least one of the +following places: within a NOTICE text file distributed as part of the +Derivative Works; within the Source form or documentation, if provided along +with the Derivative Works; or, within a display generated by the Derivative +Works, if and wherever such third-party notices normally appear. The contents of +the NOTICE file are for informational purposes only and do not modify the +License. You may add Your own attribution notices within Derivative Works that +You distribute, alongside or as an addendum to the NOTICE text from the Work, +provided that such additional attribution notices cannot be construed as +modifying the License. + +You may add Your own copyright statement to Your modifications and may provide +additional or different license terms and conditions for use, reproduction, or +distribution of Your modifications, or for any such Derivative Works as a whole, +provided Your use, reproduction, and distribution of the Work otherwise complies +with the conditions stated in this License. + +#### 5. Submission of Contributions + +Unless You explicitly state otherwise, any Contribution intentionally submitted +for inclusion in the Work by You to the Licensor shall be under the terms and +conditions of this License, without any additional terms or conditions. +Notwithstanding the above, nothing herein shall supersede or modify the terms of +any separate license agreement you may have executed with Licensor regarding +such Contributions. + +#### 6. Trademarks + +This License does not grant permission to use the trade names, trademarks, +service marks, or product names of the Licensor, except as required for +reasonable and customary use in describing the origin of the Work and +reproducing the content of the NOTICE file. + +#### 7. Disclaimer of Warranty + +Unless required by applicable law or agreed to in writing, Licensor provides the +Work (and each Contributor provides its Contributions) on an “AS IS” BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, +including, without limitation, any warranties or conditions of TITLE, +NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are +solely responsible for determining the appropriateness of using or +redistributing the Work and assume any risks associated with Your exercise of +permissions under this License. + +#### 8. 
Limitation of Liability + +In no event and under no legal theory, whether in tort (including negligence), +contract, or otherwise, unless required by applicable law (such as deliberate +and grossly negligent acts) or agreed to in writing, shall any Contributor be +liable to You for damages, including any direct, indirect, special, incidental, +or consequential damages of any character arising as a result of this License or +out of the use or inability to use the Work (including but not limited to +damages for loss of goodwill, work stoppage, computer failure or malfunction, or +any and all other commercial damages or losses), even if such Contributor has +been advised of the possibility of such damages. + +#### 9. Accepting Warranty or Additional Liability + +While redistributing the Work or Derivative Works thereof, You may choose to +offer, and charge a fee for, acceptance of support, warranty, indemnity, or +other liability obligations and/or rights consistent with this License. However, +in accepting such obligations, You may act only on Your own behalf and on Your +sole responsibility, not on behalf of any other Contributor, and only if You +agree to indemnify, defend, and hold each Contributor harmless for any liability +incurred by, or claims asserted against, such Contributor by reason of your +accepting any such warranty or additional liability. + +_END OF TERMS AND CONDITIONS_ + +### APPENDIX: How to apply the Apache License to your work + +To apply the Apache License to your work, attach the following boilerplate +notice, with the fields enclosed by brackets `[]` replaced with your own +identifying information. (Don't include the brackets!) The text should be +enclosed in the appropriate comment syntax for the file format. We also +recommend that a file or class name and description of purpose be included on +the same “printed page” as the copyright notice for easier identification within +third-party archives. + + Copyright 2023 Atalaya Tech Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/openllm-python/README.md b/openllm-python/README.md new file mode 100644 index 00000000..1327dd53 --- /dev/null +++ b/openllm-python/README.md @@ -0,0 +1,786 @@ +![Banner for OpenLLM](/.github/assets/main-banner.png) + + + +
+

🦾 OpenLLM

+ + pypi_status + + Twitter + + Discord + + ci + + pre-commit.ci status +
+ + python_version + + Hatch + + code style + + Ruff + + types - mypy + + types - pyright +
+

An open platform for operating large language models (LLMs) in production.
+ Fine-tune, serve, deploy, and monitor any LLMs with ease.

+ +
+
+## 📖 Introduction
+
+With OpenLLM, you can run inference with any open-source large language model,
+deploy models to the cloud or on-premises, and build powerful AI apps.
+
+🚂 **State-of-the-art LLMs**: built-in support for a wide range of open-source
+LLMs and model runtimes, including Llama 2, StableLM, Falcon, Dolly, Flan-T5,
+ChatGLM, StarCoder, and more.
+
+🔥 **Flexible APIs**: serve LLMs over a RESTful API or gRPC with one command;
+query via the Web UI, CLI, our Python/JavaScript client, or any HTTP client.
+
+⛓️ **Freedom To Build**: first-class support for LangChain, BentoML, and Hugging
+Face that allows you to easily create your own AI apps by composing LLMs with
+other models and services.
+
+🎯 **Streamlined Deployment**: automatically generate your LLM server Docker
+images or deploy as serverless endpoints via
+[☁️ BentoCloud](https://l.bentoml.com/bento-cloud).
+
+🤖️ **Bring your own LLM**: fine-tune any LLM to suit your needs with
+`LLM.tuning()`. (Coming soon)
+
+
+
+![Gif showing OpenLLM Intro](/.github/assets/output.gif)
+
+
+
+
+## 🏃 Getting Started
+
+To use OpenLLM, you need to have Python 3.8 (or newer) and `pip` installed on
+your system. We highly recommend using a virtual environment to prevent package
+conflicts.
+
+You can install OpenLLM using pip as follows:
+
+```bash
+pip install openllm
+```
+
+To verify that it is installed correctly, run:
+
+```
+$ openllm -h
+
+Usage: openllm [OPTIONS] COMMAND [ARGS]...
+
+ ██████╗ ██████╗ ███████╗███╗ ██╗██╗ ██╗ ███╗ ███╗
+ ██╔═══██╗██╔══██╗██╔════╝████╗ ██║██║ ██║ ████╗ ████║
+ ██║ ██║██████╔╝█████╗ ██╔██╗ ██║██║ ██║ ██╔████╔██║
+ ██║ ██║██╔═══╝ ██╔══╝ ██║╚██╗██║██║ ██║ ██║╚██╔╝██║
+ ╚██████╔╝██║ ███████╗██║ ╚████║███████╗███████╗██║ ╚═╝ ██║
+ ╚═════╝ ╚═╝ ╚══════╝╚═╝ ╚═══╝╚══════╝╚══════╝╚═╝ ╚═╝
+
+ An open platform for operating large language models in production.
+ Fine-tune, serve, deploy, and monitor any LLMs with ease.
+```
+
+### Starting an LLM Server
+
+To start an LLM server, use `openllm start`. For example, to start an
+[`OPT`](https://huggingface.co/docs/transformers/model_doc/opt) server, do the
+following:
+
+```bash
+openllm start opt
+```
+
+Following this, a Web UI will be accessible at http://localhost:3000, where you
+can experiment with the endpoints and sample input prompts.
+
+OpenLLM provides a built-in Python client, allowing you to interact with the
+model. In a different terminal window or a Jupyter notebook, create a client to
+start interacting with the model:
+
+```python
+import openllm
+client = openllm.client.HTTPClient('http://localhost:3000')
+client.query('Explain to me the difference between "further" and "farther"')
+```
+
+You can also use the `openllm query` command to query the model from the
+terminal:
+
+```bash
+export OPENLLM_ENDPOINT=http://localhost:3000
+openllm query 'Explain to me the difference between "further" and "farther"'
+```
+
+Visit `http://localhost:3000/docs.json` for OpenLLM's API specification.
+
+OpenLLM seamlessly supports many models and their variants. You can also
+specify different variants of the model to be served by providing the
+`--model-id` argument, e.g.:
+
+```bash
+openllm start flan-t5 --model-id google/flan-t5-large
+```
+
+> [!NOTE]
+> `openllm` also supports fine-tuned weights, custom model paths, and
+> quantized weights for any of the supported models, as long as they can be
+> loaded with the model architecture. Refer to the
+> [supported models](https://github.com/bentoml/OpenLLM/tree/main#-supported-models)
+> section for each model's architecture.
+
+Use the `openllm models` command to see the list of models and their variants
+supported in OpenLLM.
+
+## 🧩 Supported Models
+
+The following models are currently supported in OpenLLM. By default, OpenLLM
+doesn't include dependencies to run all models. The extra model-specific
+dependencies can be installed with the instructions below (a short example
+session follows the table):
+
ModelArchitectureModel IdsInstallation
chatglmChatGLMForConditionalGeneration + + + + + +```bash +pip install "openllm[chatglm]" +``` + +
dolly-v2GPTNeoXForCausalLM + + + + + +```bash +pip install openllm +``` + +
falconFalconForCausalLM + + + + + +```bash +pip install "openllm[falcon]" +``` + +
flan-t5T5ForConditionalGeneration + + + + + +```bash +pip install "openllm[flan-t5]" +``` + +
gpt-neoxGPTNeoXForCausalLM + + + + + +```bash +pip install openllm +``` + +
llamaLlamaForCausalLM + + + + + +```bash +pip install "openllm[llama]" +``` + +
mptMPTForCausalLM + + + + + +```bash +pip install "openllm[mpt]" +``` + +
optOPTForCausalLM + + + + + +```bash +pip install "openllm[opt]" +``` + +
stablelmGPTNeoXForCausalLM + + + + + +```bash +pip install openllm +``` + +
starcoderGPTBigCodeForCausalLM + + + + + +```bash +pip install "openllm[starcoder]" +``` + +
baichuanBaiChuanForCausalLM + + + + + +```bash +pip install "openllm[baichuan]" +``` + +
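+As a quick sketch of how the table above is meant to be used (a hypothetical
+session; `mpt` is one of the supported models listed above, and the actual
+`openllm models` output may differ):
+
+```bash
+# list the models and variants this OpenLLM installation supports
+openllm models
+
+# install the model-specific extra, then serve the model
+pip install "openllm[mpt]"
+openllm start mpt
+```
+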
+
+
+
+### Runtime Implementations (Experimental)
+
+Different LLMs may have multiple runtime implementations. For instance, they
+might use PyTorch (`pt`), TensorFlow (`tf`), or Flax (`flax`).
+
+If you wish to specify a particular runtime for a model, you can do so by
+setting the `OPENLLM_{MODEL_NAME}_FRAMEWORK={runtime}` environment variable
+before running `openllm start`.
+
+For example, if you want to use the TensorFlow (`tf`) implementation for the
+`flan-t5` model, you can use the following command:
+
+```bash
+OPENLLM_FLAN_T5_FRAMEWORK=tf openllm start flan-t5
+```
+
+> [!NOTE]
+> For GPU support on Flax, refer to
+> [Jax's installation](https://github.com/google/jax#pip-installation-gpu-cuda-installed-via-pip-easier)
+> to make sure that you have Jax support for the corresponding CUDA version.
+
+### Quantisation
+
+OpenLLM supports quantisation with
+[bitsandbytes](https://github.com/TimDettmers/bitsandbytes) and
+[GPTQ](https://arxiv.org/abs/2210.17323). For example, to serve with `int8`
+quantisation:
+
+```bash
+openllm start mpt --quantize int8
+```
+
+To run inference with `gptq`, simply pass `--quantize gptq`:
+
+```bash
+openllm start falcon --model-id TheBloke/falcon-40b-instruct-GPTQ --quantize gptq --device 0
+```
+
+> [!NOTE]
+> In order to run GPTQ, make sure to install with
+> `pip install "openllm[gptq]"`. The weights of all supported models should be
+> quantized before serving. See
+> [GPTQ-for-LLaMa](https://github.com/qwopqwop200/GPTQ-for-LLaMa) for more
+> information on GPTQ quantisation.
+
+### Fine-tuning support (Experimental)
+
+You can serve OpenLLM models with any PEFT-compatible layers via
+`--adapter-id`:
+
+```bash
+openllm start opt --model-id facebook/opt-6.7b --adapter-id aarnphm/opt-6-7b-quotes
+```
+
+It also supports adapters from custom paths:
+
+```bash
+openllm start opt --model-id facebook/opt-6.7b --adapter-id /path/to/adapters
+```
+
+To use multiple adapters, use the following format:
+
+```bash
+openllm start opt --model-id facebook/opt-6.7b --adapter-id aarnphm/opt-6.7b-lora --adapter-id aarnphm/opt-6.7b-lora:french_lora
+```
+
+By default, the first `--adapter-id` will be the default LoRA layer, but
+optionally you can change which LoRA layer to use for inference via
+`/v1/adapters`:
+
+```bash
+curl -X POST http://localhost:3000/v1/adapters --json '{"adapter_name": "vn_lora"}'
+```
+
+Note that when using multiple adapter names and IDs, it is recommended to switch
+back to the default adapter before sending inference requests, to avoid any
+performance degradation.
+
+To include adapters in the Bento, you can also provide a `--adapter-id` to
+`openllm build`:
+
+```bash
+openllm build opt --model-id facebook/opt-6.7b --adapter-id ...
+```
+
+> [!NOTE]
+> We will gradually roll out support for fine-tuning all models. The
+> following models currently support fine-tuning: OPT, Falcon, LLaMA.
+
+### Integrating a New Model
+
+OpenLLM encourages contributions by welcoming users to incorporate their custom
+LLMs into the ecosystem. Check out the
+[Adding a New Model Guide](https://github.com/bentoml/OpenLLM/blob/main/ADDING_NEW_MODEL.md)
+to see how you can do it yourself.
+
+### Embeddings
+
+OpenLLM tentatively provides an embeddings endpoint for supported models. It can
+be accessed via `/v1/embeddings`.
+
+To use it via the CLI, simply call `openllm embed`:
+
+```bash
+openllm embed --endpoint http://localhost:3000 "I like to eat apples" -o json
+{
+  "embeddings": [
+    0.006569798570126295,
+    -0.031249752268195152,
+    -0.008072729222476482,
+    0.00847396720200777,
+    -0.005293501541018486,
+    ......
+    -0.002078012563288212,
+    -0.00676426338031888,
+    -0.002022686880081892
+  ],
+  "num_tokens": 9
+}
+```
+
+To invoke this endpoint, use `client.embed` from the Python SDK:
+
+```python
+import openllm
+
+client = openllm.client.HTTPClient("http://localhost:3000")
+
+client.embed("I like to eat apples")
+```
+
+> [!NOTE]
+> Currently, the following model families support embeddings: Llama, T5
+> (Flan-T5, FastChat, etc.), and ChatGLM.
+
+## ⚙️ Integrations
+
+OpenLLM is not just a standalone product; it's a building block designed to
+integrate with other powerful tools easily. We currently offer integration with
+[BentoML](https://github.com/bentoml/BentoML),
+[LangChain](https://github.com/hwchase17/langchain), and
+[Transformers Agents](https://huggingface.co/docs/transformers/transformers_agents).
+
+### BentoML
+
+OpenLLM models can be integrated as a
+[Runner](https://docs.bentoml.com/en/latest/concepts/runner.html) in your
+BentoML service. These runners have a `generate` method that takes a string as a
+prompt and returns a corresponding output string. This allows you to plug and
+play any OpenLLM model with your existing ML workflow.
+
+```python
+import bentoml
+import openllm
+from bentoml.io import Text
+
+model = "opt"
+
+llm_config = openllm.AutoConfig.for_model(model)
+llm_runner = openllm.Runner(model, llm_config=llm_config)
+
+svc = bentoml.Service(
+    name="llm-opt-service", runners=[llm_runner]
+)
+
+@svc.api(input=Text(), output=Text())
+async def prompt(input_text: str) -> str:
+    answer = await llm_runner.generate(input_text)
+    return answer
+```
+
+### [LangChain](https://python.langchain.com/docs/ecosystem/integrations/openllm)
+
+To quickly start a local LLM with `langchain`, simply do the following:
+
+```python
+from langchain.llms import OpenLLM
+
+llm = OpenLLM(model_name="llama", model_id='meta-llama/Llama-2-7b-hf')
+
+llm("What is the difference between a duck and a goose? And why are there so many geese in Canada?")
+```
+
+> [!IMPORTANT]
+> By default, OpenLLM uses the `safetensors` format for saving models.
+> If the model doesn't support safetensors, make sure to pass
+> `serialisation="legacy"` to use the legacy PyTorch bin format.
+
+`langchain.llms.OpenLLM` can also interact with a remote OpenLLM server. Given
+an OpenLLM server deployed elsewhere, you can connect to it by specifying its
+URL:
+
+```python
+from langchain.llms import OpenLLM
+
+llm = OpenLLM(server_url='http://44.23.123.1:3000', server_type='grpc')
+llm("What is the difference between a duck and a goose? And why are there so many geese in Canada?")
+```
+
+To integrate a LangChain agent with BentoML, you can do the following:
+
+```python
+import bentoml
+from bentoml.io import Text
+from langchain.agents import AgentType, initialize_agent, load_tools
+from langchain.llms import OpenLLM
+
+llm = OpenLLM(
+    model_name='flan-t5',
+    model_id='google/flan-t5-large',
+    embedded=False,
+    serialisation="legacy"
+)
+tools = load_tools(["serpapi", "llm-math"], llm=llm)
+agent = initialize_agent(
+    tools, llm, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION
+)
+svc = bentoml.Service("langchain-openllm", runners=[llm.runner])
+
+@svc.api(input=Text(), output=Text())
+def chat(input_text: str):
+    return agent.run(input_text)
+```
+
+> [!NOTE]
+> You can find more examples under the
+> [examples](https://github.com/bentoml/OpenLLM/tree/main/examples) folder.
+
+### Transformers Agents
+
+OpenLLM seamlessly integrates with
+[Transformers Agents](https://huggingface.co/docs/transformers/transformers_agents).
+
+> [!WARNING]
+> The Transformers Agent is still at an experimental stage. It is
+> recommended to install OpenLLM with `pip install -r nightly-requirements.txt`
+> to get the latest API updates for the Hugging Face agent.
+
+```python
+import transformers
+
+agent = transformers.HfAgent("http://localhost:3000/hf/agent")  # URL that runs the OpenLLM server
+
+agent.run("Is the following `text` positive or negative?", text="I don't like how this model is generating inputs")
+```
+
+> [!IMPORTANT]
+> Only `starcoder` is currently supported with Agent integration.
+> The example above was also run with four T4s on EC2 `g4dn.12xlarge`.
+
+If you want to use the OpenLLM client to ask questions to the running agent, you
+can also do so:
+
+```python
+import openllm
+
+client = openllm.client.HTTPClient("http://localhost:3000")
+
+client.ask_agent(
+    task="Is the following `text` positive or negative?",
+    text="What are you thinking about?",
+)
+```
+
+
+
+![Gif showing Agent integration](/.github/assets/agent.gif)
+
+
+
+
+## 🚀 Deploying to Production
+
+There are several ways to deploy your LLMs:
+
+### 🐳 Docker container
+
+1. **Building a Bento**: With OpenLLM, you can easily build a Bento for a
+   specific model, like `dolly-v2`, using the `build` command:
+
+   ```bash
+   openllm build dolly-v2
+   ```
+
+   A
+   [Bento](https://docs.bentoml.com/en/latest/concepts/bento.html#what-is-a-bento),
+   in BentoML, is the unit of distribution. It packages your program's source
+   code, models, files, artefacts, and dependencies.
+
+2. **Containerize your Bento**
+
+   ```bash
+   bentoml containerize <name:version>
+   ```
+
+   This generates an OCI-compatible Docker image that can be deployed anywhere
+   Docker runs. For best scalability and reliability of your LLM service in
+   production, we recommend deploying with BentoCloud.
+
+### ☁️ BentoCloud
+
+Deploy OpenLLM with [BentoCloud](https://www.bentoml.com/bento-cloud/), the
+serverless cloud for shipping and scaling AI applications.
+
+1. **Create a BentoCloud account:** [sign up here](https://bentoml.com/cloud)
+   for early access.
+
+2. **Log into your BentoCloud account:**
+
+   ```bash
+   bentoml cloud login --api-token <your-api-token> --endpoint <bento-cloud-endpoint>
+   ```
+
+> [!NOTE]
+> Replace `<your-api-token>` and `<bento-cloud-endpoint>` with your
+> specific API token and the BentoCloud endpoint respectively.
+
+3. **Building a Bento**: With OpenLLM, you can easily build a Bento for a
+   specific model, such as `dolly-v2`:
+
+   ```bash
+   openllm build dolly-v2
+   ```
+
+4. **Pushing a Bento**: Push your freshly-built Bento service to BentoCloud via
+   the `push` command:
+
+   ```bash
+   bentoml push <name:version>
+   ```
+
+5. **Deploying a Bento**: Deploy your LLMs to BentoCloud with a single
+   `bentoml deployment create` command following the
+   [deployment instructions](https://docs.bentoml.com/en/latest/reference/cli.html#bentoml-deployment-create).
+
+## 👥 Community
+
+Engage with like-minded individuals passionate about LLMs, AI, and more on our
+[Discord](https://l.bentoml.com/join-openllm-discord)!
+
+OpenLLM is actively maintained by the BentoML team. Feel free to reach out and
+join us in our pursuit to make LLMs more accessible and easy to use 👉
+[Join our Slack community!](https://l.bentoml.com/join-slack)
+
+## 🎁 Contributing
+
+We welcome contributions! If you're interested in enhancing OpenLLM's
+capabilities or have any questions, don't hesitate to reach out in our
+[discord channel](https://l.bentoml.com/join-openllm-discord).
+
+Check out our
+[Developer Guide](https://github.com/bentoml/OpenLLM/blob/main/DEVELOPMENT.md)
+if you wish to contribute to OpenLLM's codebase.
+
+## 🍇 Telemetry
+
+OpenLLM collects usage data to enhance user experience and improve the product.
+We only report OpenLLM's internal API calls and ensure maximum privacy by
+excluding sensitive information. We will never collect user code, model data, or
+stack traces. For usage tracking, check out the
+[code](https://github.com/bentoml/OpenLLM/blob/main/openllm-python/src/openllm/utils/analytics.py).
+
+You can opt out of usage tracking by using the `--do-not-track` CLI option:
+
+```bash
+openllm [command] --do-not-track
+```
+
+Or by setting the environment variable `OPENLLM_DO_NOT_TRACK=True`:
+
+```bash
+export OPENLLM_DO_NOT_TRACK=True
+```
+
+## 📔 Citation
+
+If you use OpenLLM in your research, we provide a [citation](./CITATION.cff) to
+use:
+
+```bibtex
+@software{Pham_OpenLLM_Operating_LLMs_2023,
+  author = {Pham, Aaron and Yang, Chaoyu and Sheng, Sean and Zhao, Shenyang and Lee, Sauyon and Jiang, Bo and Dong, Fog and Guan, Xipeng and Ming, Frost},
+  license = {Apache-2.0},
+  month = jun,
+  title = {{OpenLLM: Operating LLMs in production}},
+  url = {https://github.com/bentoml/OpenLLM},
+  year = {2023}
+}
+```
+
+
diff --git a/openllm-python/pyproject.toml b/openllm-python/pyproject.toml
new file mode 100644
index 00000000..3a41a86e
--- /dev/null
+++ b/openllm-python/pyproject.toml
@@ -0,0 +1,207 @@
+# NOTE: PEP517 is managed via ./tools/dependencies.py
+[build-system]
+build-backend = "hatchling.build"
+requires = [
+  "hatchling==1.18.0",
+  "hatch-vcs==0.3.0",
+  "hatch-fancy-pypi-readme==23.1.0",
+  "hatch-mypyc==0.16.0",
+]
+
+[project]
+authors = [
+  {name = "Aaron Pham",email = "aarnphm@bentoml.com"},
+  {name = "BentoML Team",email = "contact@bentoml.com"},
+]
+classifiers = [
+  "Development Status :: 5 - Production/Stable",
+  "Environment :: GPU :: NVIDIA CUDA",
+  "Environment :: GPU :: NVIDIA CUDA :: 12",
+  "Environment :: GPU :: NVIDIA CUDA :: 11.8",
+  "Environment :: GPU :: NVIDIA CUDA :: 11.7",
+  "License :: OSI Approved :: Apache Software License",
+  "Topic :: Scientific/Engineering :: Artificial Intelligence",
+  "Topic :: Software Development :: Libraries",
+  "Operating System :: OS Independent",
+  "Intended Audience :: Developers",
+  "Intended Audience :: Science/Research",
+  "Intended Audience :: System Administrators",
+  "Typing :: Typed",
+  "Programming Language :: Python",
+  "Programming Language :: Python :: 3",
+  "Programming Language :: Python :: 3 :: Only",
+  "Programming Language :: Python :: 3.8",
+  "Programming Language :: Python :: 3.9",
+  "Programming Language :: Python :: 3.10",
+  "Programming Language :: Python :: 3.11",
+  "Programming Language :: Python :: 3.12",
+  "Programming Language :: Python :: Implementation :: CPython",
+  "Programming Language :: Python :: Implementation :: PyPy",
+]
+dependencies = [
+  "bentoml[grpc,io]>=1.0.25",
+  "transformers[torch,tokenizers,accelerate]>=4.29.0",
+  "safetensors",
+  "optimum",
+  "attrs>=23.1.0",
+  "cattrs>=23.1.0",
+  "orjson",
+  "inflection",
+  "tabulate[widechars]>=0.9.0",
+  "httpx",
+  "click>=8.1.3",
+  "typing_extensions",
+  "mypy_extensions",
+  "ghapi",
+  "cuda-python;platform_system!=\"Darwin\"",
+  "bitsandbytes<0.42",
+]
+description = "OpenLLM: Operating LLMs in production"
+dynamic = ["version", "readme"]
+keywords = [
+  "MLOps",
+  "AI",
+  "BentoML",
+  "Model Serving",
+  "Model Deployment",
+  "LLMOps",
+  "Falcon",
+  "Vicuna",
+  "Llama 2",
+  "Fine tuning",
+  "Serverless",
+  "Large Language Model",
+  "Generative AI",
+  "StableLM",
+  "Alpaca",
+  "PyTorch",
+  "Transformers",
+]
+license = "Apache-2.0"
+name = "openllm"
+requires-python = ">=3.8"
+[project.scripts]
+openllm = "openllm.cli.entrypoint:cli"
+openllm-build-base-container = "openllm.cli.extension.build_base_container:cli"
+openllm-dive-bentos = "openllm.cli.extension.dive_bentos:cli"
+openllm-get-containerfile = "openllm.cli.extension.get_containerfile:cli"
+openllm-get-prompt = "openllm.cli.extension.get_prompt:cli"
+openllm-list-bentos = "openllm.cli.extension.list_bentos:cli"
"openllm.cli.extension.list_bentos:cli" +openllm-list-models = "openllm.cli.extension.list_models:cli" +openllm-playground = "openllm.cli.extension.playground:cli" + +[project.urls] +Blog = "https://modelserving.com" +Chat = "https://discord.gg/openllm" +Documentation = "https://github.com/bentoml/openllm#readme" +GitHub = "https://github.com/bentoml/OpenLLM" +History = "https://github.com/bentoml/OpenLLM/blob/main/CHANGELOG.md" +Homepage = "https://bentoml.com" +Tracker = "https://github.com/bentoml/OpenLLM/issues" +Twitter = "https://twitter.com/bentomlai" + +[project.optional-dependencies] +agents = ["transformers[agents]>=4.30", "diffusers", "soundfile"] +all = [ + "openllm[agents]", + "openllm[baichuan]", + "openllm[chatglm]", + "openllm[falcon]", + "openllm[fine-tune]", + "openllm[flan-t5]", + "openllm[ggml]", + "openllm[gptq]", + "openllm[llama]", + "openllm[mpt]", + "openllm[openai]", + "openllm[opt]", + "openllm[playground]", + "openllm[starcoder]", + "openllm[vllm]", +] +baichuan = ["cpm-kernels", "sentencepiece"] +chatglm = ["cpm-kernels", "sentencepiece"] +falcon = ["einops", "xformers"] +fine-tune = ["peft>=0.4.0", "bitsandbytes", "datasets", "accelerate", "trl"] +flan-t5 = ["flax", "jax", "jaxlib", "tensorflow", "keras"] +ggml = ["ctransformers"] +gptq = ["auto-gptq[triton]"] +llama = ["fairscale", "sentencepiece"] +mpt = ["triton", "einops"] +openai = ["openai", "tiktoken"] +opt = ["flax", "jax", "jaxlib", "tensorflow", "keras"] +playground = ["jupyter", "notebook", "ipython", "jupytext", "nbformat"] +starcoder = ["bitsandbytes"] +vllm = ["vllm", "ray"] + +[tool.hatch.version] +fallback-version = "0.0.0" +source = "vcs" +[tool.hatch.build.hooks.vcs] +version-file = "src/openllm/_version.py" +[tool.hatch.version.raw-options] +git_describe_command = [ + "git", + "describe", + "--dirty", + "--tags", + "--long", + "--first-parent", +] +local_scheme = "no-local-version" +root = ".." 
+[tool.hatch.metadata] +allow-direct-references = true +[tool.hatch.build.targets.wheel] +only-include = ["src/openllm"] +sources = ["src"] +dev-mode-dirs = ["."] +[tool.hatch.build.targets.sdist] +exclude = ["/.git_archival.txt"] +[tool.hatch.build.targets.wheel.hooks.mypyc] +dependencies = [ + "hatch-mypyc==0.16.0", + "mypy==1.4.1", + # avoid https://github.com/pallets/click/issues/2558 + "click==8.1.3", + "bentoml==1.1.1", + "transformers>=4.31.0", + "pandas-stubs", + "types-psutil", + "types-tabulate", + "types-PyYAML", + "types-protobuf", +] +enable-by-default = false +include = [ + "src/openllm/bundle", + "src/openllm/models/__init__.py", + "src/openllm/models/auto/__init__.py", + "src/openllm/utils/__init__.py", + "src/openllm/utils/codegen.py", + "src/openllm/__init__.py", + "src/openllm/_prompt.py", + "src/openllm/_schema.py", + "src/openllm/_quantisation.py", + "src/openllm/_generation.py", + "src/openllm/_strategies.py", + "src/openllm/exceptions.py", + "src/openllm/testing.py", +] +# NOTE: This is consistent with pyproject.toml +mypy-args = [ + "--strict", + # this is because all transient library doesn't have types + "--allow-subclassing-any", + "--follow-imports=skip", + "--check-untyped-defs", + "--ignore-missing-imports", + "--no-warn-return-any", + "--warn-unreachable", + "--no-warn-no-return", + "--no-warn-unused-ignores", + "--exclude='/src\\/openllm\\/playground\\/**'", + "--exclude='/src\\/openllm\\/_typing_compat\\.py$'", +] +options = { verbose = true, strip_asserts = true, debug_level = "2", opt_level = "3", include_runtime_files = true } +require-runtime-dependencies = true diff --git a/src/openllm/__init__.py b/openllm-python/src/openllm/__init__.py similarity index 98% rename from src/openllm/__init__.py rename to openllm-python/src/openllm/__init__.py index aafd18e4..e1d45aec 100644 --- a/src/openllm/__init__.py +++ b/openllm-python/src/openllm/__init__.py @@ -155,7 +155,7 @@ else: from .models.opt import TFOPT as TFOPT # NOTE: update this to sys.modules[__name__] once mypy_extensions can recognize __spec__ -__lazy = utils.LazyModule(__name__, globals()["__file__"], _import_structure, extra_objects={"COMPILED": COMPILED}) +__lazy = utils.LazyModule(__name__, _os.path.abspath("__file__"), _import_structure, extra_objects={"COMPILED": COMPILED}) __all__ = __lazy.__all__ __dir__ = __lazy.__dir__ __getattr__ = __lazy.__getattr__ diff --git a/src/openllm/__main__.py b/openllm-python/src/openllm/__main__.py similarity index 100% rename from src/openllm/__main__.py rename to openllm-python/src/openllm/__main__.py diff --git a/src/openllm/_configuration.py b/openllm-python/src/openllm/_configuration.py similarity index 99% rename from src/openllm/_configuration.py rename to openllm-python/src/openllm/_configuration.py index 682cfcdb..22cb65c9 100644 --- a/src/openllm/_configuration.py +++ b/openllm-python/src/openllm/_configuration.py @@ -1082,7 +1082,7 @@ class LLMConfig(_ConfigAttr): def keys(self) -> list[str]: return list(self.__openllm_accepted_keys__) + list(self.__openllm_extras__) def values(self) -> list[t.Any]: return ([getattr(self, k.name) for k in attr.fields(self.__class__)] + [getattr(self.generation_config, k.name) for k in attr.fields(self.__openllm_generation_class__)] + [getattr(self.sampling_config, k.name) for k in attr.fields(self.__openllm_sampling_class__)] + list(self.__openllm_extras__.values())) def items(self) -> list[tuple[str, t.Any]]: return ([(k.name, getattr(self, k.name)) for k in attr.fields(self.__class__)] + [(k.name, 
getattr(self.generation_config, k.name)) for k in attr.fields(self.__openllm_generation_class__)] + [(k.name, getattr(self.sampling_config, k.name)) for k in attr.fields(self.__openllm_sampling_class__)] + list(self.__openllm_extras__.items())) - def __iter__(self) -> t.Iterable[str]: return iter(self.keys()) + def __iter__(self) -> t.Iterator[str]: return iter(self.keys()) def __contains__(self, item: t.Any) -> bool: if item in self.__openllm_extras__: return True return item in self.__openllm_accepted_keys__ @@ -1230,7 +1230,7 @@ class LLMConfig(_ConfigAttr): bentoml_cattr.register_unstructure_hook_factory(lambda cls: lenient_issubclass(cls, LLMConfig), lambda cls: make_dict_unstructure_fn(cls, bentoml_cattr, _cattrs_omit_if_default=False, _cattrs_use_linecache=True)) -def structure_llm_config(data: DictStrAny, cls: type[LLMConfig]) -> LLMConfig: +def structure_llm_config(data: t.Any, cls: type[LLMConfig]) -> LLMConfig: """Structure a dictionary to a LLMConfig object. Essentially, if the given dictionary contains a 'generation_config' key, then we will @@ -1240,7 +1240,6 @@ def structure_llm_config(data: DictStrAny, cls: type[LLMConfig]) -> LLMConfig: parse the remaining keys into LLMConfig.generation_config """ if not isinstance(data, dict): raise RuntimeError(f"Expected a dictionary, but got {type(data)}") - cls_attrs = {k: v for k, v in data.items() if k in cls.__openllm_accepted_keys__} generation_cls_fields = attr.fields_dict(cls.__openllm_generation_class__) if "generation_config" in data: diff --git a/src/openllm/_generation.py b/openllm-python/src/openllm/_generation.py similarity index 98% rename from src/openllm/_generation.py rename to openllm-python/src/openllm/_generation.py index bf00d0b0..8139b0fd 100644 --- a/src/openllm/_generation.py +++ b/openllm-python/src/openllm/_generation.py @@ -1,3 +1,4 @@ +# mypy: disable-error-code="misc" from __future__ import annotations import typing as t, transformers if t.TYPE_CHECKING: import torch, openllm diff --git a/src/openllm/_llm.py b/openllm-python/src/openllm/_llm.py similarity index 99% rename from src/openllm/_llm.py rename to openllm-python/src/openllm/_llm.py index 8fd78d06..77313d89 100644 --- a/src/openllm/_llm.py +++ b/openllm-python/src/openllm/_llm.py @@ -66,7 +66,7 @@ else: torch = LazyLoader("torch", globals(), "torch") peft = LazyLoader("peft", globals(), "peft") -ResolvedAdaptersMapping = t.Dict[AdapterType, t.Dict[t.Union[str, t.Literal["default"]], t.Tuple["peft.PeftConfig", str]]] +ResolvedAdaptersMapping = t.Dict[AdapterType, t.Dict[str, t.Tuple["peft.PeftConfig", str]]] logger = logging.getLogger(__name__) class ModelSignatureDict(t.TypedDict, total=False): diff --git a/src/openllm/_prompt.py b/openllm-python/src/openllm/_prompt.py similarity index 100% rename from src/openllm/_prompt.py rename to openllm-python/src/openllm/_prompt.py diff --git a/src/openllm/_quantisation.py b/openllm-python/src/openllm/_quantisation.py similarity index 99% rename from src/openllm/_quantisation.py rename to openllm-python/src/openllm/_quantisation.py index b1123d4a..06f110f8 100644 --- a/src/openllm/_quantisation.py +++ b/openllm-python/src/openllm/_quantisation.py @@ -6,7 +6,7 @@ if sys.version_info[:2] >= (3, 11): from typing import overload else: from typing_extensions import overload if t.TYPE_CHECKING: from ._llm import LLM - from ._types import DictStrAny + from ._typing_compat import DictStrAny autogptq, torch, transformers = LazyLoader("autogptq", globals(), "auto_gptq"), LazyLoader("torch", globals(), "torch"), 
LazyLoader("transformers", globals(), "transformers") diff --git a/src/openllm/_schema.py b/openllm-python/src/openllm/_schema.py similarity index 100% rename from src/openllm/_schema.py rename to openllm-python/src/openllm/_schema.py diff --git a/src/openllm/_service.py b/openllm-python/src/openllm/_service.py similarity index 100% rename from src/openllm/_service.py rename to openllm-python/src/openllm/_service.py diff --git a/src/openllm/_strategies.py b/openllm-python/src/openllm/_strategies.py similarity index 98% rename from src/openllm/_strategies.py rename to openllm-python/src/openllm/_strategies.py index 7a71e9d1..e40d43c1 100644 --- a/src/openllm/_strategies.py +++ b/openllm-python/src/openllm/_strategies.py @@ -230,7 +230,11 @@ class CascadingResourceStrategy(bentoml.Strategy, ReprMixin): TODO: Support CloudTPUResource """ @classmethod - def get_worker_count(cls, runnable_class: type[bentoml.Runnable], resource_request: dict[str, t.Any] | None, workers_per_resource: int | float) -> int: + def get_worker_count(cls, runnable_class: type[bentoml.Runnable], resource_request: dict[str, t.Any] | None, workers_per_resource: float) -> int: + """Return the number of workers to be used for the given runnable class. + + Note that for all available GPU, the number of workers will always be 1. + """ if resource_request is None: resource_request = system_resources() # use NVIDIA kind = "nvidia.com/gpu" diff --git a/src/openllm/_typing_compat.py b/openllm-python/src/openllm/_typing_compat.py similarity index 100% rename from src/openllm/_typing_compat.py rename to openllm-python/src/openllm/_typing_compat.py diff --git a/src/openllm/bundle/__init__.py b/openllm-python/src/openllm/bundle/__init__.py similarity index 100% rename from src/openllm/bundle/__init__.py rename to openllm-python/src/openllm/bundle/__init__.py diff --git a/src/openllm/bundle/_package.py b/openllm-python/src/openllm/bundle/_package.py similarity index 93% rename from src/openllm/bundle/_package.py rename to openllm-python/src/openllm/bundle/_package.py index 71deb419..288cf487 100644 --- a/src/openllm/bundle/_package.py +++ b/openllm-python/src/openllm/bundle/_package.py @@ -77,7 +77,7 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any], llm_fs: FS, extra_d if built_wheels is not None: wheels.append(llm_fs.getsyspath(f"/{built_wheels.split('/')[-1]}")) return PythonOptions(packages=packages, wheels=wheels, lock_packages=False, extra_index_url=["https://download.pytorch.org/whl/cu118"]) -def construct_docker_options(llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_resource: int | float, quantize: LiteralString | None, bettertransformer: bool | None, adapter_map: dict[str, str | None] | None, dockerfile_template: str | None, runtime: t.Literal["ggml", "transformers"], serialisation_format: t.Literal["safetensors", "legacy"], container_registry: LiteralContainerRegistry, container_version_strategy: LiteralContainerVersionStrategy) -> DockerOptions: +def construct_docker_options(llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_resource: float, quantize: LiteralString | None, bettertransformer: bool | None, adapter_map: dict[str, str | None] | None, dockerfile_template: str | None, runtime: t.Literal["ggml", "transformers"], serialisation_format: t.Literal["safetensors", "legacy"], container_registry: LiteralContainerRegistry, container_version_strategy: LiteralContainerVersionStrategy) -> DockerOptions: _bentoml_config_options = os.environ.pop("BENTOML_CONFIG_OPTIONS", "") _bentoml_config_options_opts = 
["tracing.sample_rate=1.0", f'runners."llm-{llm.config["start_name"]}-runner".traffic.timeout={llm.config["timeout"]}', f'api_server.traffic.timeout={llm.config["timeout"]}', f'runners."llm-{llm.config["start_name"]}-runner".traffic.timeout={llm.config["timeout"]}', f'runners."llm-{llm.config["start_name"]}-runner".workers_per_resource={workers_per_resource}'] _bentoml_config_options += " " if _bentoml_config_options else "" + " ".join(_bentoml_config_options_opts) @@ -100,7 +100,7 @@ def construct_docker_options(llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_ return DockerOptions(base_image=f"{oci.CONTAINER_NAMES[container_registry]}:{oci.get_base_container_tag(container_version_strategy)}", env=env_dict, dockerfile_template=dockerfile_template) @inject -def create_bento(bento_tag: bentoml.Tag, llm_fs: FS, llm: openllm.LLM[t.Any, t.Any], workers_per_resource: str | int | float, quantize: LiteralString | None, bettertransformer: bool | None, dockerfile_template: str | None, adapter_map: dict[str, str | None] | None = None, extra_dependencies: tuple[str, ...] | None = None, +def create_bento(bento_tag: bentoml.Tag, llm_fs: FS, llm: openllm.LLM[t.Any, t.Any], workers_per_resource: str | float, quantize: LiteralString | None, bettertransformer: bool | None, dockerfile_template: str | None, adapter_map: dict[str, str | None] | None = None, extra_dependencies: tuple[str, ...] | None = None, runtime: t.Literal[ "ggml", "transformers"] = "transformers", serialisation_format: t.Literal["safetensors", "legacy"] = "safetensors", container_registry: LiteralContainerRegistry = "ecr", container_version_strategy: LiteralContainerVersionStrategy = "release", _bento_store: BentoStore = Provide[BentoMLContainer.bento_store], _model_store: ModelStore = Provide[BentoMLContainer.model_store]) -> bentoml.Bento: framework_envvar = llm.config["env"]["framework_value"] diff --git a/src/openllm/bundle/oci/Dockerfile b/openllm-python/src/openllm/bundle/oci/Dockerfile similarity index 93% rename from src/openllm/bundle/oci/Dockerfile rename to openllm-python/src/openllm/bundle/oci/Dockerfile index 829e661c..17a8ecd0 100644 --- a/src/openllm/bundle/oci/Dockerfile +++ b/openllm-python/src/openllm/bundle/oci/Dockerfile @@ -146,8 +146,8 @@ COPY --from=flash-attn-v2-builder /usr/src/flash-attention-v2/build/lib.linux-x8 COPY --from=auto-gptq-builder /usr/src/AutoGPTQ/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages # Install required dependencies -COPY src src -COPY hatch.toml README.md CHANGELOG.md pyproject.toml ./ +COPY openllm-python/src src +COPY hatch.toml README.md CHANGELOG.md openllm-python/pyproject.toml ./ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ build-essential \ @@ -158,7 +158,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins rm -rf /var/lib/apt/lists/* # Install all required dependencies -RUN --mount=type=cache,target=/root/.cache/pip pip install "ray==2.6.0" "einops" "jax[cuda11_local]" "torch>=2.0.1" xformers -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html ".[opt,mpt,fine-tune,llama,chatglm]" -v --no-cache-dir +RUN --mount=type=cache,target=/root/.cache/pip pip install "ray==2.6.0" "einops" "jax[cuda11_local]" "torch>=2.0.1" xformers -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html ".[opt,mpt,fine-tune,llama,chatglm]" -v --no-cache-dir FROM base-container diff --git a/src/openllm/bundle/oci/__init__.py 
b/openllm-python/src/openllm/bundle/oci/__init__.py similarity index 98% rename from src/openllm/bundle/oci/__init__.py rename to openllm-python/src/openllm/bundle/oci/__init__.py index fda395f5..b2ece9dc 100644 --- a/src/openllm/bundle/oci/__init__.py +++ b/openllm-python/src/openllm/bundle/oci/__init__.py @@ -8,7 +8,7 @@ from openllm.utils.lazy import VersionInfo if t.TYPE_CHECKING: from ghapi import all - from openllm._typing_compat import RefTuple + from openllm._typing_compat import RefTuple, LiteralString all = openllm.utils.LazyLoader("all", globals(), "ghapi.all") # noqa: F811 @@ -78,7 +78,7 @@ class RefResolver: return _RefTuple((*version, "release" if _use_base_strategy else "custom")) @classmethod @functools.lru_cache(maxsize=64) - def from_strategy(cls, strategy_or_version: t.Literal["release", "nightly"] | str | None = None) -> RefResolver: + def from_strategy(cls, strategy_or_version: t.Literal["release", "nightly"] | LiteralString | None = None) -> RefResolver: # using default strategy if strategy_or_version is None or strategy_or_version == "release": return cls(*cls._release_ref()) elif strategy_or_version == "latest": return cls("latest", "0.0.0", "latest") diff --git a/src/openllm/cli/__init__.py b/openllm-python/src/openllm/cli/__init__.py similarity index 100% rename from src/openllm/cli/__init__.py rename to openllm-python/src/openllm/cli/__init__.py diff --git a/src/openllm/cli/_factory.py b/openllm-python/src/openllm/cli/_factory.py similarity index 100% rename from src/openllm/cli/_factory.py rename to openllm-python/src/openllm/cli/_factory.py diff --git a/src/openllm/cli/_sdk.py b/openllm-python/src/openllm/cli/_sdk.py similarity index 96% rename from src/openllm/cli/_sdk.py rename to openllm-python/src/openllm/cli/_sdk.py index 27ab8069..6fba8e74 100644 --- a/src/openllm/cli/_sdk.py +++ b/openllm-python/src/openllm/cli/_sdk.py @@ -77,7 +77,7 @@ def _start(model_name: str, /, *, model_id: str | None = None, timeout: int = 30 return start_command_factory(start_command if not _serve_grpc else start_grpc_command, model_name, _context_settings=termui.CONTEXT_SETTINGS, _serve_grpc=_serve_grpc).main(args=args if len(args) > 0 else None, standalone_mode=False) @inject -def _build(model_name: str, /, *, model_id: str | None = None, model_version: str | None = None, quantize: t.Literal["int8", "int4", "gptq"] | None = None, bettertransformer: bool | None = None, adapter_map: dict[str, str | None] | None = None, build_ctx: str | None = None, enable_features: tuple[str, ...] | None = None, workers_per_resource: int | float | None = None, runtime: t.Literal["ggml", "transformers"] = "transformers", dockerfile_template: str | None = None, overwrite: bool = False, container_registry: LiteralContainerRegistry | None = None, container_version_strategy: LiteralContainerVersionStrategy | None = None, push: bool = False, containerize: bool = False, serialisation_format: t.Literal["safetensors", "legacy"] = "safetensors", additional_args: list[str] | None = None, bento_store: BentoStore = Provide[BentoMLContainer.bento_store]) -> bentoml.Bento: +def _build(model_name: str, /, *, model_id: str | None = None, model_version: str | None = None, quantize: t.Literal["int8", "int4", "gptq"] | None = None, bettertransformer: bool | None = None, adapter_map: dict[str, str | None] | None = None, build_ctx: str | None = None, enable_features: tuple[str, ...] 
| None = None, workers_per_resource: float | None = None, runtime: t.Literal["ggml", "transformers"] = "transformers", dockerfile_template: str | None = None, overwrite: bool = False, container_registry: LiteralContainerRegistry | None = None, container_version_strategy: LiteralContainerVersionStrategy | None = None, push: bool = False, containerize: bool = False, serialisation_format: t.Literal["safetensors", "legacy"] = "safetensors", additional_args: list[str] | None = None, bento_store: BentoStore = Provide[BentoMLContainer.bento_store]) -> bentoml.Bento: """Package a LLM into a Bento. The LLM will be built into a BentoService with the following structure: diff --git a/src/openllm/cli/entrypoint.py b/openllm-python/src/openllm/cli/entrypoint.py similarity index 100% rename from src/openllm/cli/entrypoint.py rename to openllm-python/src/openllm/cli/entrypoint.py diff --git a/src/openllm/cli/extension/__init__.py b/openllm-python/src/openllm/cli/extension/__init__.py similarity index 100% rename from src/openllm/cli/extension/__init__.py rename to openllm-python/src/openllm/cli/extension/__init__.py diff --git a/src/openllm/cli/extension/build_base_container.py b/openllm-python/src/openllm/cli/extension/build_base_container.py similarity index 100% rename from src/openllm/cli/extension/build_base_container.py rename to openllm-python/src/openllm/cli/extension/build_base_container.py diff --git a/src/openllm/cli/extension/dive_bentos.py b/openllm-python/src/openllm/cli/extension/dive_bentos.py similarity index 100% rename from src/openllm/cli/extension/dive_bentos.py rename to openllm-python/src/openllm/cli/extension/dive_bentos.py diff --git a/src/openllm/cli/extension/get_containerfile.py b/openllm-python/src/openllm/cli/extension/get_containerfile.py similarity index 100% rename from src/openllm/cli/extension/get_containerfile.py rename to openllm-python/src/openllm/cli/extension/get_containerfile.py diff --git a/src/openllm/cli/extension/get_prompt.py b/openllm-python/src/openllm/cli/extension/get_prompt.py similarity index 100% rename from src/openllm/cli/extension/get_prompt.py rename to openllm-python/src/openllm/cli/extension/get_prompt.py diff --git a/src/openllm/cli/extension/list_bentos.py b/openllm-python/src/openllm/cli/extension/list_bentos.py similarity index 100% rename from src/openllm/cli/extension/list_bentos.py rename to openllm-python/src/openllm/cli/extension/list_bentos.py diff --git a/src/openllm/cli/extension/list_models.py b/openllm-python/src/openllm/cli/extension/list_models.py similarity index 100% rename from src/openllm/cli/extension/list_models.py rename to openllm-python/src/openllm/cli/extension/list_models.py diff --git a/src/openllm/cli/extension/playground.py b/openllm-python/src/openllm/cli/extension/playground.py similarity index 100% rename from src/openllm/cli/extension/playground.py rename to openllm-python/src/openllm/cli/extension/playground.py diff --git a/src/openllm/cli/termui.py b/openllm-python/src/openllm/cli/termui.py similarity index 100% rename from src/openllm/cli/termui.py rename to openllm-python/src/openllm/cli/termui.py diff --git a/src/openllm/client/__init__.py b/openllm-python/src/openllm/client/__init__.py similarity index 100% rename from src/openllm/client/__init__.py rename to openllm-python/src/openllm/client/__init__.py diff --git a/src/openllm/client/runtimes/__init__.py b/openllm-python/src/openllm/client/runtimes/__init__.py similarity index 100% rename from src/openllm/client/runtimes/__init__.py rename to 
openllm-python/src/openllm/client/runtimes/__init__.py diff --git a/src/openllm/client/runtimes/base.py b/openllm-python/src/openllm/client/runtimes/base.py similarity index 100% rename from src/openllm/client/runtimes/base.py rename to openllm-python/src/openllm/client/runtimes/base.py diff --git a/src/openllm/client/runtimes/grpc.py b/openllm-python/src/openllm/client/runtimes/grpc.py similarity index 100% rename from src/openllm/client/runtimes/grpc.py rename to openllm-python/src/openllm/client/runtimes/grpc.py diff --git a/src/openllm/client/runtimes/http.py b/openllm-python/src/openllm/client/runtimes/http.py similarity index 100% rename from src/openllm/client/runtimes/http.py rename to openllm-python/src/openllm/client/runtimes/http.py diff --git a/src/openllm/exceptions.py b/openllm-python/src/openllm/exceptions.py similarity index 100% rename from src/openllm/exceptions.py rename to openllm-python/src/openllm/exceptions.py diff --git a/src/openllm/models/__init__.py b/openllm-python/src/openllm/models/__init__.py similarity index 100% rename from src/openllm/models/__init__.py rename to openllm-python/src/openllm/models/__init__.py diff --git a/src/openllm/models/auto/__init__.py b/openllm-python/src/openllm/models/auto/__init__.py similarity index 100% rename from src/openllm/models/auto/__init__.py rename to openllm-python/src/openllm/models/auto/__init__.py diff --git a/src/openllm/models/auto/configuration_auto.py b/openllm-python/src/openllm/models/auto/configuration_auto.py similarity index 100% rename from src/openllm/models/auto/configuration_auto.py rename to openllm-python/src/openllm/models/auto/configuration_auto.py diff --git a/src/openllm/models/auto/factory.py b/openllm-python/src/openllm/models/auto/factory.py similarity index 100% rename from src/openllm/models/auto/factory.py rename to openllm-python/src/openllm/models/auto/factory.py diff --git a/src/openllm/models/auto/modeling_auto.py b/openllm-python/src/openllm/models/auto/modeling_auto.py similarity index 100% rename from src/openllm/models/auto/modeling_auto.py rename to openllm-python/src/openllm/models/auto/modeling_auto.py diff --git a/src/openllm/models/auto/modeling_flax_auto.py b/openllm-python/src/openllm/models/auto/modeling_flax_auto.py similarity index 100% rename from src/openllm/models/auto/modeling_flax_auto.py rename to openllm-python/src/openllm/models/auto/modeling_flax_auto.py diff --git a/src/openllm/models/auto/modeling_tf_auto.py b/openllm-python/src/openllm/models/auto/modeling_tf_auto.py similarity index 100% rename from src/openllm/models/auto/modeling_tf_auto.py rename to openllm-python/src/openllm/models/auto/modeling_tf_auto.py diff --git a/src/openllm/models/auto/modeling_vllm_auto.py b/openllm-python/src/openllm/models/auto/modeling_vllm_auto.py similarity index 100% rename from src/openllm/models/auto/modeling_vllm_auto.py rename to openllm-python/src/openllm/models/auto/modeling_vllm_auto.py diff --git a/src/openllm/models/baichuan/__init__.py b/openllm-python/src/openllm/models/baichuan/__init__.py similarity index 100% rename from src/openllm/models/baichuan/__init__.py rename to openllm-python/src/openllm/models/baichuan/__init__.py diff --git a/src/openllm/models/baichuan/configuration_baichuan.py b/openllm-python/src/openllm/models/baichuan/configuration_baichuan.py similarity index 100% rename from src/openllm/models/baichuan/configuration_baichuan.py rename to openllm-python/src/openllm/models/baichuan/configuration_baichuan.py diff --git 
a/src/openllm/models/baichuan/modeling_baichuan.py b/openllm-python/src/openllm/models/baichuan/modeling_baichuan.py similarity index 100% rename from src/openllm/models/baichuan/modeling_baichuan.py rename to openllm-python/src/openllm/models/baichuan/modeling_baichuan.py diff --git a/src/openllm/models/baichuan/modeling_vllm_baichuan.py b/openllm-python/src/openllm/models/baichuan/modeling_vllm_baichuan.py similarity index 100% rename from src/openllm/models/baichuan/modeling_vllm_baichuan.py rename to openllm-python/src/openllm/models/baichuan/modeling_vllm_baichuan.py diff --git a/src/openllm/models/chatglm/__init__.py b/openllm-python/src/openllm/models/chatglm/__init__.py similarity index 100% rename from src/openllm/models/chatglm/__init__.py rename to openllm-python/src/openllm/models/chatglm/__init__.py diff --git a/src/openllm/models/chatglm/configuration_chatglm.py b/openllm-python/src/openllm/models/chatglm/configuration_chatglm.py similarity index 100% rename from src/openllm/models/chatglm/configuration_chatglm.py rename to openllm-python/src/openllm/models/chatglm/configuration_chatglm.py diff --git a/src/openllm/models/chatglm/modeling_chatglm.py b/openllm-python/src/openllm/models/chatglm/modeling_chatglm.py similarity index 100% rename from src/openllm/models/chatglm/modeling_chatglm.py rename to openllm-python/src/openllm/models/chatglm/modeling_chatglm.py diff --git a/src/openllm/models/dolly_v2/__init__.py b/openllm-python/src/openllm/models/dolly_v2/__init__.py similarity index 100% rename from src/openllm/models/dolly_v2/__init__.py rename to openllm-python/src/openllm/models/dolly_v2/__init__.py diff --git a/src/openllm/models/dolly_v2/configuration_dolly_v2.py b/openllm-python/src/openllm/models/dolly_v2/configuration_dolly_v2.py similarity index 100% rename from src/openllm/models/dolly_v2/configuration_dolly_v2.py rename to openllm-python/src/openllm/models/dolly_v2/configuration_dolly_v2.py diff --git a/src/openllm/models/dolly_v2/modeling_dolly_v2.py b/openllm-python/src/openllm/models/dolly_v2/modeling_dolly_v2.py similarity index 100% rename from src/openllm/models/dolly_v2/modeling_dolly_v2.py rename to openllm-python/src/openllm/models/dolly_v2/modeling_dolly_v2.py diff --git a/src/openllm/models/dolly_v2/modeling_vllm_dolly_v2.py b/openllm-python/src/openllm/models/dolly_v2/modeling_vllm_dolly_v2.py similarity index 100% rename from src/openllm/models/dolly_v2/modeling_vllm_dolly_v2.py rename to openllm-python/src/openllm/models/dolly_v2/modeling_vllm_dolly_v2.py diff --git a/src/openllm/models/falcon/__init__.py b/openllm-python/src/openllm/models/falcon/__init__.py similarity index 100% rename from src/openllm/models/falcon/__init__.py rename to openllm-python/src/openllm/models/falcon/__init__.py diff --git a/src/openllm/models/falcon/configuration_falcon.py b/openllm-python/src/openllm/models/falcon/configuration_falcon.py similarity index 100% rename from src/openllm/models/falcon/configuration_falcon.py rename to openllm-python/src/openllm/models/falcon/configuration_falcon.py diff --git a/src/openllm/models/falcon/modeling_falcon.py b/openllm-python/src/openllm/models/falcon/modeling_falcon.py similarity index 100% rename from src/openllm/models/falcon/modeling_falcon.py rename to openllm-python/src/openllm/models/falcon/modeling_falcon.py diff --git a/src/openllm/models/flan_t5/__init__.py b/openllm-python/src/openllm/models/flan_t5/__init__.py similarity index 100% rename from src/openllm/models/flan_t5/__init__.py rename to 
openllm-python/src/openllm/models/flan_t5/__init__.py
diff --git a/src/openllm/models/flan_t5/configuration_flan_t5.py b/openllm-python/src/openllm/models/flan_t5/configuration_flan_t5.py
similarity index 100%
rename from src/openllm/models/flan_t5/configuration_flan_t5.py
rename to openllm-python/src/openllm/models/flan_t5/configuration_flan_t5.py
diff --git a/src/openllm/models/flan_t5/modeling_flan_t5.py b/openllm-python/src/openllm/models/flan_t5/modeling_flan_t5.py
similarity index 100%
rename from src/openllm/models/flan_t5/modeling_flan_t5.py
rename to openllm-python/src/openllm/models/flan_t5/modeling_flan_t5.py
diff --git a/src/openllm/models/flan_t5/modeling_flax_flan_t5.py b/openllm-python/src/openllm/models/flan_t5/modeling_flax_flan_t5.py
similarity index 100%
rename from src/openllm/models/flan_t5/modeling_flax_flan_t5.py
rename to openllm-python/src/openllm/models/flan_t5/modeling_flax_flan_t5.py
diff --git a/src/openllm/models/flan_t5/modeling_tf_flan_t5.py b/openllm-python/src/openllm/models/flan_t5/modeling_tf_flan_t5.py
similarity index 100%
rename from src/openllm/models/flan_t5/modeling_tf_flan_t5.py
rename to openllm-python/src/openllm/models/flan_t5/modeling_tf_flan_t5.py
diff --git a/src/openllm/models/gpt_neox/__init__.py b/openllm-python/src/openllm/models/gpt_neox/__init__.py
similarity index 100%
rename from src/openllm/models/gpt_neox/__init__.py
rename to openllm-python/src/openllm/models/gpt_neox/__init__.py
diff --git a/src/openllm/models/gpt_neox/configuration_gpt_neox.py b/openllm-python/src/openllm/models/gpt_neox/configuration_gpt_neox.py
similarity index 100%
rename from src/openllm/models/gpt_neox/configuration_gpt_neox.py
rename to openllm-python/src/openllm/models/gpt_neox/configuration_gpt_neox.py
diff --git a/src/openllm/models/gpt_neox/modeling_gpt_neox.py b/openllm-python/src/openllm/models/gpt_neox/modeling_gpt_neox.py
similarity index 100%
rename from src/openllm/models/gpt_neox/modeling_gpt_neox.py
rename to openllm-python/src/openllm/models/gpt_neox/modeling_gpt_neox.py
diff --git a/src/openllm/models/gpt_neox/modeling_vllm_gpt_neox.py b/openllm-python/src/openllm/models/gpt_neox/modeling_vllm_gpt_neox.py
similarity index 100%
rename from src/openllm/models/gpt_neox/modeling_vllm_gpt_neox.py
rename to openllm-python/src/openllm/models/gpt_neox/modeling_vllm_gpt_neox.py
diff --git a/src/openllm/models/llama/__init__.py b/openllm-python/src/openllm/models/llama/__init__.py
similarity index 100%
rename from src/openllm/models/llama/__init__.py
rename to openllm-python/src/openllm/models/llama/__init__.py
diff --git a/src/openllm/models/llama/configuration_llama.py b/openllm-python/src/openllm/models/llama/configuration_llama.py
similarity index 100%
rename from src/openllm/models/llama/configuration_llama.py
rename to openllm-python/src/openllm/models/llama/configuration_llama.py
diff --git a/src/openllm/models/llama/modeling_llama.py b/openllm-python/src/openllm/models/llama/modeling_llama.py
similarity index 100%
rename from src/openllm/models/llama/modeling_llama.py
rename to openllm-python/src/openllm/models/llama/modeling_llama.py
diff --git a/src/openllm/models/llama/modeling_vllm_llama.py b/openllm-python/src/openllm/models/llama/modeling_vllm_llama.py
similarity index 100%
rename from src/openllm/models/llama/modeling_vllm_llama.py
rename to openllm-python/src/openllm/models/llama/modeling_vllm_llama.py
diff --git a/src/openllm/models/mpt/__init__.py b/openllm-python/src/openllm/models/mpt/__init__.py
similarity index 100%
rename from src/openllm/models/mpt/__init__.py
rename to openllm-python/src/openllm/models/mpt/__init__.py
diff --git a/src/openllm/models/mpt/configuration_mpt.py b/openllm-python/src/openllm/models/mpt/configuration_mpt.py
similarity index 100%
rename from src/openllm/models/mpt/configuration_mpt.py
rename to openllm-python/src/openllm/models/mpt/configuration_mpt.py
diff --git a/src/openllm/models/mpt/modeling_mpt.py b/openllm-python/src/openllm/models/mpt/modeling_mpt.py
similarity index 100%
rename from src/openllm/models/mpt/modeling_mpt.py
rename to openllm-python/src/openllm/models/mpt/modeling_mpt.py
diff --git a/src/openllm/models/mpt/modeling_vllm_mpt.py b/openllm-python/src/openllm/models/mpt/modeling_vllm_mpt.py
similarity index 100%
rename from src/openllm/models/mpt/modeling_vllm_mpt.py
rename to openllm-python/src/openllm/models/mpt/modeling_vllm_mpt.py
diff --git a/src/openllm/models/opt/__init__.py b/openllm-python/src/openllm/models/opt/__init__.py
similarity index 100%
rename from src/openllm/models/opt/__init__.py
rename to openllm-python/src/openllm/models/opt/__init__.py
diff --git a/src/openllm/models/opt/configuration_opt.py b/openllm-python/src/openllm/models/opt/configuration_opt.py
similarity index 100%
rename from src/openllm/models/opt/configuration_opt.py
rename to openllm-python/src/openllm/models/opt/configuration_opt.py
diff --git a/src/openllm/models/opt/modeling_flax_opt.py b/openllm-python/src/openllm/models/opt/modeling_flax_opt.py
similarity index 100%
rename from src/openllm/models/opt/modeling_flax_opt.py
rename to openllm-python/src/openllm/models/opt/modeling_flax_opt.py
diff --git a/src/openllm/models/opt/modeling_opt.py b/openllm-python/src/openllm/models/opt/modeling_opt.py
similarity index 100%
rename from src/openllm/models/opt/modeling_opt.py
rename to openllm-python/src/openllm/models/opt/modeling_opt.py
diff --git a/src/openllm/models/opt/modeling_tf_opt.py b/openllm-python/src/openllm/models/opt/modeling_tf_opt.py
similarity index 100%
rename from src/openllm/models/opt/modeling_tf_opt.py
rename to openllm-python/src/openllm/models/opt/modeling_tf_opt.py
diff --git a/src/openllm/models/opt/modeling_vllm_opt.py b/openllm-python/src/openllm/models/opt/modeling_vllm_opt.py
similarity index 100%
rename from src/openllm/models/opt/modeling_vllm_opt.py
rename to openllm-python/src/openllm/models/opt/modeling_vllm_opt.py
diff --git a/src/openllm/models/stablelm/__init__.py b/openllm-python/src/openllm/models/stablelm/__init__.py
similarity index 100%
rename from src/openllm/models/stablelm/__init__.py
rename to openllm-python/src/openllm/models/stablelm/__init__.py
diff --git a/src/openllm/models/stablelm/configuration_stablelm.py b/openllm-python/src/openllm/models/stablelm/configuration_stablelm.py
similarity index 100%
rename from src/openllm/models/stablelm/configuration_stablelm.py
rename to openllm-python/src/openllm/models/stablelm/configuration_stablelm.py
diff --git a/src/openllm/models/stablelm/modeling_stablelm.py b/openllm-python/src/openllm/models/stablelm/modeling_stablelm.py
similarity index 100%
rename from src/openllm/models/stablelm/modeling_stablelm.py
rename to openllm-python/src/openllm/models/stablelm/modeling_stablelm.py
diff --git a/src/openllm/models/stablelm/modeling_vllm_stablelm.py b/openllm-python/src/openllm/models/stablelm/modeling_vllm_stablelm.py
similarity index 100%
rename from src/openllm/models/stablelm/modeling_vllm_stablelm.py
rename to openllm-python/src/openllm/models/stablelm/modeling_vllm_stablelm.py
diff --git a/src/openllm/models/starcoder/__init__.py b/openllm-python/src/openllm/models/starcoder/__init__.py
similarity index 100%
rename from src/openllm/models/starcoder/__init__.py
rename to openllm-python/src/openllm/models/starcoder/__init__.py
diff --git a/src/openllm/models/starcoder/configuration_starcoder.py b/openllm-python/src/openllm/models/starcoder/configuration_starcoder.py
similarity index 100%
rename from src/openllm/models/starcoder/configuration_starcoder.py
rename to openllm-python/src/openllm/models/starcoder/configuration_starcoder.py
diff --git a/src/openllm/models/starcoder/modeling_starcoder.py b/openllm-python/src/openllm/models/starcoder/modeling_starcoder.py
similarity index 100%
rename from src/openllm/models/starcoder/modeling_starcoder.py
rename to openllm-python/src/openllm/models/starcoder/modeling_starcoder.py
diff --git a/src/openllm/models/starcoder/modeling_vllm_starcoder.py b/openllm-python/src/openllm/models/starcoder/modeling_vllm_starcoder.py
similarity index 100%
rename from src/openllm/models/starcoder/modeling_vllm_starcoder.py
rename to openllm-python/src/openllm/models/starcoder/modeling_vllm_starcoder.py
diff --git a/src/openllm/playground/README.md b/openllm-python/src/openllm/playground/README.md
similarity index 100%
rename from src/openllm/playground/README.md
rename to openllm-python/src/openllm/playground/README.md
diff --git a/src/openllm/playground/__init__.py b/openllm-python/src/openllm/playground/__init__.py
similarity index 100%
rename from src/openllm/playground/__init__.py
rename to openllm-python/src/openllm/playground/__init__.py
diff --git a/src/openllm/playground/_meta.yml b/openllm-python/src/openllm/playground/_meta.yml
similarity index 100%
rename from src/openllm/playground/_meta.yml
rename to openllm-python/src/openllm/playground/_meta.yml
diff --git a/src/openllm/playground/falcon_tuned.py b/openllm-python/src/openllm/playground/falcon_tuned.py
similarity index 100%
rename from src/openllm/playground/falcon_tuned.py
rename to openllm-python/src/openllm/playground/falcon_tuned.py
diff --git a/src/openllm/playground/features.py b/openllm-python/src/openllm/playground/features.py
similarity index 100%
rename from src/openllm/playground/features.py
rename to openllm-python/src/openllm/playground/features.py
diff --git a/src/openllm/playground/llama2_qlora.py b/openllm-python/src/openllm/playground/llama2_qlora.py
similarity index 100%
rename from src/openllm/playground/llama2_qlora.py
rename to openllm-python/src/openllm/playground/llama2_qlora.py
diff --git a/src/openllm/playground/opt_tuned.py b/openllm-python/src/openllm/playground/opt_tuned.py
similarity index 100%
rename from src/openllm/playground/opt_tuned.py
rename to openllm-python/src/openllm/playground/opt_tuned.py
diff --git a/src/openllm/py.typed b/openllm-python/src/openllm/py.typed
similarity index 100%
rename from src/openllm/py.typed
rename to openllm-python/src/openllm/py.typed
diff --git a/src/openllm/serialisation/__init__.py b/openllm-python/src/openllm/serialisation/__init__.py
similarity index 92%
rename from src/openllm/serialisation/__init__.py
rename to openllm-python/src/openllm/serialisation/__init__.py
index 495f85c4..4adadbe2 100644
--- a/src/openllm/serialisation/__init__.py
+++ b/openllm-python/src/openllm/serialisation/__init__.py
@@ -26,7 +26,7 @@ from __future__ import annotations
 import importlib, typing as t
 import cloudpickle, fs, openllm
 from bentoml._internal.models.model import CUSTOM_OBJECTS_FILENAME
-from openllm._typing_compat import M, T, ParamSpec, Concatenate
+from openllm._typing_compat import M, T, ParamSpec
 if t.TYPE_CHECKING: import bentoml
@@ -62,9 +62,12 @@ def load_tokenizer(llm: openllm.LLM[t.Any, T], **tokenizer_attrs: t.Any) -> T:
   else: tokenizer.add_special_tokens({"pad_token": "[PAD]"})
   return tokenizer
+class _Caller(t.Protocol[P]):
+  def __call__(self, llm: openllm.LLM[M, T], *args: P.args, **kwargs: P.kwargs) -> t.Any: ...
+
 _extras = ["get", "import_model", "save_pretrained", "load_model"]
-def _make_dispatch_function(fn: str) -> t.Callable[Concatenate[openllm.LLM[t.Any, t.Any], P], t.Any]:
-  def caller(llm: openllm.LLM[t.Any, t.Any], *args: P.args, **kwargs: P.kwargs) -> t.Any:
+def _make_dispatch_function(fn: str) -> _Caller[P]:
+  def caller(llm: openllm.LLM[M, T], *args: P.args, **kwargs: P.kwargs) -> t.Any:
     """Generic function dispatch to correct serialisation submodules based on LLM runtime.

     > [!NOTE] See 'openllm.serialisation.transformers' if 'llm.runtime="transformers"'
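The hunk above drops `Concatenate` because a callback `Protocol` parametrised over a `ParamSpec` can express the same "LLM first, then forwarded arguments" shape. A minimal standalone sketch of that pattern, assuming Python 3.10+ and using a stand-in `FakeLLM` class rather than OpenLLM's real types:

```python
from __future__ import annotations
import typing as t

P = t.ParamSpec("P")  # typing.ParamSpec needs Python >= 3.10; use typing_extensions on 3.8/3.9

class FakeLLM:
  """Stand-in for openllm.LLM, only for this sketch."""
  runtime = "transformers"

class _Caller(t.Protocol[P]):
  # A callable that always takes the LLM first, then forwards everything else.
  def __call__(self, llm: FakeLLM, *args: P.args, **kwargs: P.kwargs) -> t.Any: ...

def _make_dispatch_function(fn: str) -> _Caller[P]:
  def caller(llm: FakeLLM, *args: P.args, **kwargs: P.kwargs) -> t.Any:
    # The real implementation resolves a serialisation submodule from llm.runtime
    # and calls getattr(module, fn); here we only demonstrate the typed shape.
    return f"dispatch {fn} for runtime={llm.runtime!r} args={args} kwargs={kwargs}"
  return caller

load_model = _make_dispatch_function("load_model")
print(load_model(FakeLLM(), "facebook/opt-125m", device_map="auto"))
```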
diff --git a/src/openllm/serialisation/constants.py b/openllm-python/src/openllm/serialisation/constants.py
similarity index 100%
rename from src/openllm/serialisation/constants.py
rename to openllm-python/src/openllm/serialisation/constants.py
diff --git a/src/openllm/serialisation/ggml.py b/openllm-python/src/openllm/serialisation/ggml.py
similarity index 100%
rename from src/openllm/serialisation/ggml.py
rename to openllm-python/src/openllm/serialisation/ggml.py
diff --git a/src/openllm/serialisation/transformers/__init__.py b/openllm-python/src/openllm/serialisation/transformers/__init__.py
similarity index 100%
rename from src/openllm/serialisation/transformers/__init__.py
rename to openllm-python/src/openllm/serialisation/transformers/__init__.py
diff --git a/src/openllm/serialisation/transformers/_helpers.py b/openllm-python/src/openllm/serialisation/transformers/_helpers.py
similarity index 100%
rename from src/openllm/serialisation/transformers/_helpers.py
rename to openllm-python/src/openllm/serialisation/transformers/_helpers.py
diff --git a/src/openllm/serialisation/transformers/weights.py b/openllm-python/src/openllm/serialisation/transformers/weights.py
similarity index 100%
rename from src/openllm/serialisation/transformers/weights.py
rename to openllm-python/src/openllm/serialisation/transformers/weights.py
diff --git a/src/openllm/testing.py b/openllm-python/src/openllm/testing.py
similarity index 100%
rename from src/openllm/testing.py
rename to openllm-python/src/openllm/testing.py
diff --git a/src/openllm/utils/__init__.py b/openllm-python/src/openllm/utils/__init__.py
similarity index 97%
rename from src/openllm/utils/__init__.py
rename to openllm-python/src/openllm/utils/__init__.py
index cc39040f..147a1b16 100644
--- a/src/openllm/utils/__init__.py
+++ b/openllm-python/src/openllm/utils/__init__.py
@@ -84,7 +84,7 @@ def non_intrusive_setattr(obj: t.Any, name: str, value: t.Any) -> None:
   _setattr = functools.partial(setattr, obj) if isinstance(obj, type) else _object_setattr.__get__(obj)
   if not hasattr(obj, name): _setattr(name, value)
-def field_env_key(model_name: str, key: str, suffix: str | t.Literal[""] | None = None) -> str: return "_".join(filter(None, map(str.upper, ["OPENLLM", model_name, suffix.strip("_") if suffix else "", key])))
+def field_env_key(model_name: str, key: str, suffix: str | None = None) -> str: return "_".join(filter(None, map(str.upper, ["OPENLLM", model_name, suffix.strip("_") if suffix else "", key])))
 # Special debug flag controlled via OPENLLMDEVDEBUG
 DEBUG: bool = sys.flags.dev_mode or (not sys.flags.ignore_environment and bool(os.environ.get(DEV_DEBUG_VAR)))
@@ -242,7 +242,7 @@ def normalize_attrs_to_model_tokenizer_pair(**attrs: t.Any) -> tuple[dict[str, t
   if k.startswith(_TOKENIZER_PREFIX): del attrs[k]
   return attrs, tokenizer_attrs
-def infer_auto_class(implementation: LiteralRuntime) -> type[openllm.AutoLLM] | type[openllm.AutoTFLLM] | type[openllm.AutoFlaxLLM] | type[openllm.AutoVLLM]:
+def infer_auto_class(implementation: LiteralRuntime) -> type[openllm.AutoLLM | openllm.AutoTFLLM | openllm.AutoFlaxLLM | openllm.AutoVLLM]:
   import openllm
   if implementation == "tf": return openllm.AutoTFLLM
   elif implementation == "flax": return openllm.AutoFlaxLLM
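`field_env_key` above derives environment-variable names from a model name, a field key, and an optional suffix. A quick standalone check of the naming scheme it produces (the function body here is copied from the hunk above so the example runs on its own):

```python
from __future__ import annotations

def field_env_key(model_name: str, key: str, suffix: str | None = None) -> str:
  # Same body as the hunk above: upper-case every non-empty part and join with "_".
  return "_".join(filter(None, map(str.upper, ["OPENLLM", model_name, suffix.strip("_") if suffix else "", key])))

assert field_env_key("llama", "temperature") == "OPENLLM_LLAMA_TEMPERATURE"
assert field_env_key("flan_t5", "max_new_tokens", suffix="generation_") == "OPENLLM_FLAN_T5_GENERATION_MAX_NEW_TOKENS"
```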
diff --git a/src/openllm/utils/analytics.py b/openllm-python/src/openllm/utils/analytics.py
similarity index 100%
rename from src/openllm/utils/analytics.py
rename to openllm-python/src/openllm/utils/analytics.py
diff --git a/src/openllm/utils/codegen.py b/openllm-python/src/openllm/utils/codegen.py
similarity index 100%
rename from src/openllm/utils/codegen.py
rename to openllm-python/src/openllm/utils/codegen.py
diff --git a/src/openllm/utils/dantic.py b/openllm-python/src/openllm/utils/dantic.py
similarity index 100%
rename from src/openllm/utils/dantic.py
rename to openllm-python/src/openllm/utils/dantic.py
diff --git a/src/openllm/utils/dummy_flax_objects.py b/openllm-python/src/openllm/utils/dummy_flax_objects.py
similarity index 100%
rename from src/openllm/utils/dummy_flax_objects.py
rename to openllm-python/src/openllm/utils/dummy_flax_objects.py
diff --git a/src/openllm/utils/dummy_pt_objects.py b/openllm-python/src/openllm/utils/dummy_pt_objects.py
similarity index 100%
rename from src/openllm/utils/dummy_pt_objects.py
rename to openllm-python/src/openllm/utils/dummy_pt_objects.py
diff --git a/src/openllm/utils/dummy_tf_objects.py b/openllm-python/src/openllm/utils/dummy_tf_objects.py
similarity index 100%
rename from src/openllm/utils/dummy_tf_objects.py
rename to openllm-python/src/openllm/utils/dummy_tf_objects.py
diff --git a/src/openllm/utils/dummy_vllm_objects.py b/openllm-python/src/openllm/utils/dummy_vllm_objects.py
similarity index 100%
rename from src/openllm/utils/dummy_vllm_objects.py
rename to openllm-python/src/openllm/utils/dummy_vllm_objects.py
diff --git a/src/openllm/utils/import_utils.py b/openllm-python/src/openllm/utils/import_utils.py
similarity index 100%
rename from src/openllm/utils/import_utils.py
rename to openllm-python/src/openllm/utils/import_utils.py
diff --git a/src/openllm/utils/lazy.py b/openllm-python/src/openllm/utils/lazy.py
similarity index 99%
rename from src/openllm/utils/lazy.py
rename to openllm-python/src/openllm/utils/lazy.py
index 4989709f..da8c46dd 100644
--- a/src/openllm/utils/lazy.py
+++ b/openllm-python/src/openllm/utils/lazy.py
@@ -21,7 +21,7 @@ class VersionInfo:
     if not isinstance(cmp, tuple): raise NotImplementedError
     if not (1 <= len(cmp) <= 4): raise NotImplementedError
     return t.cast(t.Tuple[int, int, int, str], attr.astuple(self)[:len(cmp)]), t.cast(t.Tuple[int, int, int, str], cmp)
-  def __eq__(self, other: object) -> bool:
+  def __eq__(self, other: t.Any) -> bool:
     try: us, them = self._ensure_tuple(other)
     except NotImplementedError: return NotImplemented
     return us == them
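The `__eq__` shown in the lazy.py hunk follows the standard rich-comparison protocol: on an operand it cannot compare, it returns `NotImplemented` so Python can try the reflected operation instead of raising. A minimal sketch of that pattern outside of `VersionInfo` (the `Version` class here is illustrative, not OpenLLM's):

```python
from __future__ import annotations
import typing as t

class Version:
  def __init__(self, major: int, minor: int) -> None:
    self.major, self.minor = major, minor

  def _ensure_tuple(self, other: t.Any) -> tuple[tuple[int, int], tuple[int, int]]:
    # Mirrors VersionInfo._ensure_tuple: reject operands we cannot meaningfully compare.
    if not (isinstance(other, tuple) and len(other) == 2): raise NotImplementedError
    return (self.major, self.minor), other

  def __eq__(self, other: t.Any) -> bool:
    try: us, them = self._ensure_tuple(other)
    except NotImplementedError: return NotImplemented  # let Python try the reflected comparison
    return us == them

print(Version(0, 2) == (0, 2))  # True
print(Version(0, 2) == "0.2")   # False: both sides yield NotImplemented, so Python falls back to identity
```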
diff --git a/src/openllm/utils/representation.py b/openllm-python/src/openllm/utils/representation.py
similarity index 100%
rename from src/openllm/utils/representation.py
rename to openllm-python/src/openllm/utils/representation.py
diff --git a/tests/__init__.py b/openllm-python/tests/__init__.py
similarity index 100%
rename from tests/__init__.py
rename to openllm-python/tests/__init__.py
diff --git a/tests/_strategies/__init__.py b/openllm-python/tests/_strategies/__init__.py
similarity index 100%
rename from tests/_strategies/__init__.py
rename to openllm-python/tests/_strategies/__init__.py
diff --git a/tests/_strategies/_configuration.py b/openllm-python/tests/_strategies/_configuration.py
similarity index 100%
rename from tests/_strategies/_configuration.py
rename to openllm-python/tests/_strategies/_configuration.py
diff --git a/tests/compiled_test.py b/openllm-python/tests/compiled_test.py
similarity index 100%
rename from tests/compiled_test.py
rename to openllm-python/tests/compiled_test.py
diff --git a/tests/configuration_test.py b/openllm-python/tests/configuration_test.py
similarity index 100%
rename from tests/configuration_test.py
rename to openllm-python/tests/configuration_test.py
diff --git a/tests/conftest.py b/openllm-python/tests/conftest.py
similarity index 100%
rename from tests/conftest.py
rename to openllm-python/tests/conftest.py
diff --git a/tests/models/__init__.py b/openllm-python/tests/models/__init__.py
similarity index 100%
rename from tests/models/__init__.py
rename to openllm-python/tests/models/__init__.py
diff --git a/tests/models/__snapshots__/flan_t5_test/test_flan_t5[container].json b/openllm-python/tests/models/__snapshots__/flan_t5_test/test_flan_t5[container].json
similarity index 100%
rename from tests/models/__snapshots__/flan_t5_test/test_flan_t5[container].json
rename to openllm-python/tests/models/__snapshots__/flan_t5_test/test_flan_t5[container].json
diff --git a/tests/models/__snapshots__/flan_t5_test/test_flan_t5[local].json b/openllm-python/tests/models/__snapshots__/flan_t5_test/test_flan_t5[local].json
similarity index 100%
rename from tests/models/__snapshots__/flan_t5_test/test_flan_t5[local].json
rename to openllm-python/tests/models/__snapshots__/flan_t5_test/test_flan_t5[local].json
diff --git a/tests/models/__snapshots__/opt_test/test_opt_125m[container].json b/openllm-python/tests/models/__snapshots__/opt_test/test_opt_125m[container].json
similarity index 100%
rename from tests/models/__snapshots__/opt_test/test_opt_125m[container].json
rename to openllm-python/tests/models/__snapshots__/opt_test/test_opt_125m[container].json
diff --git a/tests/models/__snapshots__/opt_test/test_opt_125m[local].json b/openllm-python/tests/models/__snapshots__/opt_test/test_opt_125m[local].json
similarity index 100%
rename from tests/models/__snapshots__/opt_test/test_opt_125m[local].json
rename to openllm-python/tests/models/__snapshots__/opt_test/test_opt_125m[local].json
diff --git a/tests/models/conftest.py b/openllm-python/tests/models/conftest.py
similarity index 100%
rename from tests/models/conftest.py
rename to openllm-python/tests/models/conftest.py
diff --git a/tests/models/flan_t5_test.py b/openllm-python/tests/models/flan_t5_test.py
similarity index 100%
rename from tests/models/flan_t5_test.py
rename to openllm-python/tests/models/flan_t5_test.py
diff --git a/tests/models/opt_test.py b/openllm-python/tests/models/opt_test.py
similarity index 100%
rename from tests/models/opt_test.py
rename to openllm-python/tests/models/opt_test.py
diff --git a/tests/models_test.py b/openllm-python/tests/models_test.py
similarity index 100%
rename from tests/models_test.py
rename to openllm-python/tests/models_test.py
diff --git a/tests/package_test.py b/openllm-python/tests/package_test.py
similarity index 100%
rename from tests/package_test.py
rename to openllm-python/tests/package_test.py
diff --git a/tests/strategies_test.py b/openllm-python/tests/strategies_test.py
similarity index 100%
rename from tests/strategies_test.py
rename to openllm-python/tests/strategies_test.py
diff --git a/package.json b/package.json
index 9e74db7d..8f122de0 100644
--- a/package.json
+++ b/package.json
@@ -9,7 +9,7 @@
     "node": ">=16"
   },
   "workspaces": [
-    "src/openllm-node"
+    "openllm-node"
   ],
   "private": true,
   "devDependencies": {
diff --git a/pyproject.toml b/pyproject.toml
index 864723b8..1d25eb75 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,11 +1,17 @@
-# NOTE: The following are managed by ./tools/dependencies.py
-# project.classifiers, project.dependencies, project.optional-dependencies, project.urls
+# NOTE: PEP517 is managed via ./tools/dependencies.py
 [build-system]
 build-backend = "hatchling.build"
-requires = ["hatchling", "hatch-vcs", "hatch-fancy-pypi-readme", "hatch-mypyc==0.16.0"]
-
+requires = [
+  "hatchling==1.18.0",
+  "hatch-vcs==0.3.0",
+  "hatch-fancy-pypi-readme==23.1.0",
+  "hatch-mypyc==0.16.0",
+]
 [project]
-authors = [{ name = "Aaron Pham", email = "aarnphm@bentoml.com" }]
+authors = [
+  { name = "Aaron Pham", email = "aarnphm@bentoml.com" },
+  { name = "BentoML Team", email = "contact@bentoml.com" },
+]
 classifiers = [
   "Development Status :: 5 - Production/Stable",
   "Environment :: GPU :: NVIDIA CUDA",
@@ -31,26 +37,8 @@ classifiers = [
   "Programming Language :: Python :: Implementation :: CPython",
   "Programming Language :: Python :: Implementation :: PyPy",
 ]
-dependencies = [
-  "bentoml[grpc,io]>=1.0.25",
-  "transformers[torch,tokenizers,accelerate]>=4.29.0",
-  "safetensors",
-  "optimum",
-  "attrs>=23.1.0",
-  "cattrs>=23.1.0",
-  "orjson",
-  "inflection",
-  "tabulate[widechars]>=0.9.0",
-  "httpx",
-  "click>=8.1.3",
-  "typing_extensions",
-  "mypy_extensions",
-  "ghapi",
-  "cuda-python;platform_system!=\"Darwin\"",
-  "bitsandbytes<0.42",
-]
-description = 'OpenLLM: Operating LLMs in production'
-dynamic = ["version", "readme"]
+description = "OpenLLM: Operating LLMs in production"
+dynamic = ["version", "readme", "dependencies"]
 keywords = [
   "MLOps",
   "AI",
@@ -58,6 +46,11 @@ keywords = [
   "Model Serving",
   "Model Deployment",
   "LLMOps",
+  "Falcon",
+  "Vicuna",
+  "Llama 2",
+  "Fine tuning",
+  "Serverless",
   "Large Language Model",
   "Generative AI",
   "StableLM",
@@ -68,60 +61,38 @@ keywords = [
 license = "Apache-2.0"
 name = "openllm"
 requires-python = ">=3.8"
-
-[project.scripts]
-openllm = "openllm.cli.entrypoint:cli"
-openllm-build-base-container = "openllm.cli.extension.build_base_container:cli"
-openllm-dive-bentos = "openllm.cli.extension.dive_bentos:cli"
-openllm-get-containerfile = "openllm.cli.extension.get_containerfile:cli"
-openllm-get-prompt = "openllm.cli.extension.get_prompt:cli"
-openllm-list-bentos = "openllm.cli.extension.list_bentos:cli"
-openllm-list-models = "openllm.cli.extension.list_models:cli"
-openllm-playground = "openllm.cli.extension.playground:cli"
-
 [project.urls]
 Blog = "https://modelserving.com"
 Chat = "https://discord.gg/openllm"
 Documentation = "https://github.com/bentoml/openllm#readme"
-GitHub = "https://github.com/bentoml/openllm"
-History = "https://github.com/bentoml/openllm/blob/main/CHANGELOG.md"
+GitHub = "https://github.com/bentoml/OpenLLM"
+History = "https://github.com/bentoml/OpenLLM/blob/main/CHANGELOG.md"
 Homepage = "https://bentoml.com"
-Tracker = "https://github.com/bentoml/openllm/issues"
+Tracker = "https://github.com/bentoml/OpenLLM/issues"
 Twitter = "https://twitter.com/bentomlai"
-[project.optional-dependencies]
-agents = ["transformers[agents]>=4.30", "diffusers", "soundfile"]
-all = [
-  "openllm[agents]",
-  "openllm[baichuan]",
-  "openllm[chatglm]",
-  "openllm[falcon]",
-  "openllm[fine-tune]",
-  "openllm[flan-t5]",
-  "openllm[ggml]",
-  "openllm[gptq]",
-  "openllm[llama]",
-  "openllm[mpt]",
-  "openllm[openai]",
-  "openllm[opt]",
-  "openllm[playground]",
-  "openllm[starcoder]",
-  "openllm[vllm]",
+[tool.hatch.build]
+exclude = ["*"]
+dev-mode-dirs = ["openllm-python"]
+[tool.hatch.build.sources]
+"openllm-python/src/openllm" = "openllm"
+[tool.hatch.metadata.hooks.custom]
+
+[tool.hatch.version]
+fallback-version = "0.0.0"
+source = "vcs"
+[tool.hatch.version.raw-options]
+git_describe_command = [
+  "git",
+  "describe",
+  "--dirty",
+  "--tags",
+  "--long",
+  "--first-parent",
 ]
-baichuan = ["cpm-kernels", "sentencepiece"]
-chatglm = ["cpm-kernels", "sentencepiece"]
-falcon = ["einops", "xformers"]
-fine-tune = ["peft>=0.4.0", "bitsandbytes", "datasets", "accelerate", "trl"]
-flan-t5 = ["flax", "jax", "jaxlib", "tensorflow", "keras"]
-ggml = ["ctransformers"]
-gptq = ["auto-gptq[triton]"]
-llama = ["fairscale", "sentencepiece"]
-mpt = ["triton", "einops"]
-openai = ["openai", "tiktoken"]
-opt = ["flax", "jax", "jaxlib", "tensorflow", "keras"]
-playground = ["jupyter", "notebook", "ipython", "jupytext", "nbformat"]
-starcoder = ["bitsandbytes"]
-vllm = ["vllm", "ray"]
+local_scheme = "no-local-version"
+[tool.hatch.metadata]
+allow-direct-references = true

 [tool.cibuildwheel]
 build-verbosity = 1
@@ -187,20 +158,15 @@ whitelist-regex = ["test_.*"]
 [tool.check-wheel-contents]
 toplevel = ["openllm"]
-[tool.pytest.ini_options]
-addopts = ["-rfEX", "-pno:warnings", "--snapshot-warn-unused"]
-python_files = ["test_*.py", "*_test.py"]
-testpaths = ["tests"]
-
 [tool.ruff]
 extend-exclude = [
   "tools",
   "examples",
-  "src/openllm/playground",
-  "src/openllm/__init__.py",
-  "src/openllm/_version.py",
-  "src/openllm/utils/dummy_*.py",
-  "src/openllm/models/__init__.py",
+  "openllm-python/src/openllm/playground",
+  "openllm-python/src/openllm/__init__.py",
+  "openllm-python/src/openllm/_version.py",
+  "openllm-python/src/openllm/utils/dummy_*.py",
+  "openllm-python/src/openllm/models/__init__.py",
 ]
 extend-include = ["*.ipynb"]
 extend-select = [
@@ -216,7 +182,6 @@ extend-select = [
   "PLW", # pylint-warning
   "PLR", # pylint-refactor
   "PT", # flake8-pytest-style
-  "PYI", # flake8-pyi
   "PERF", # perflint
   "FLY", # flynt
   "RUF", # Ruff-specific rules
@@ -239,7 +204,6 @@ ignore = [
   "E401", # ignore multiple line import
   "E702",
   "I001", # unsorted imports
-  "PYI021", # ignore docstring in stubs, as pyright will include docstring in stubs.
   "D103", # Just missing docstring for magic methods.
   "D102",
   "D101",
@@ -255,7 +219,7 @@ target-version = "py38"
 typing-modules = ["openllm._typing_compat"]
 unfixable = ["TCH004"]
 [tool.ruff.flake8-type-checking]
-exempt-modules = ["typing", "typing_extensions", "."]
+exempt-modules = ["typing", "typing_extensions", "openllm._typing_compat"]
 runtime-evaluated-base-classes = [
   "pydantic.BaseModel",
   "openllm._configuration.LLMConfig",
@@ -272,7 +236,14 @@ combine-as-imports = true
 force-single-line = false
 force-wrap-aliases = true
 known-first-party = ["openllm", "bentoml"]
-known-third-party = ["transformers", "click", "huggingface_hub", "torch", "vllm", "auto_gptq"]
+known-third-party = [
+  "transformers",
+  "click",
+  "huggingface_hub",
+  "torch",
+  "vllm",
+  "auto_gptq",
+]
 lines-after-imports = 0
 lines-between-types = 0
 no-lines-before = ["future", "standard-library"]
@@ -281,19 +252,14 @@ required-imports = ["from __future__ import annotations"]
 [tool.ruff.flake8-quotes]
 avoid-escape = false
 [tool.ruff.extend-per-file-ignores]
-"src/openllm/_service.py" = ["E401"]
-"src/openllm/cli/entrypoint.py" = ["D301"]
-"src/openllm/client/runtimes/*" = ["D107"]
-"src/openllm/models/**" = ["E", "D", "F"]
-"src/openllm/utils/import_utils.py" = ["PLW0603"]
-"tests/**/*" = [
-  "S101",
-  "TID252",
-  "D", # No docstring in tests
-  "PT011", # ignore too broad raises, as it can be use pytest.raises().match()
-  "S307", # Ignore eval(compile) as it is a known script execution
-]
-"typings/**" = ["D", "F", "E", "PYI002"]
+"openllm-python/src/openllm/_service.py" = ["E401"]
+"openllm-python/src/openllm/cli/entrypoint.py" = ["D301"]
+"openllm-python/src/openllm/client/runtimes/*" = ["D107"]
+"openllm-python/src/openllm/models/**" = ["E", "D", "F"]
+"openllm-python/src/openllm/utils/import_utils.py" = ["PLW0603"]
+"openllm-python/src/openllm/_configuration.py" = ["F811"]
+"openllm-python/tests/**/*" = ["S101", "TID252", "D", "PT011", "S307"]
+"typings/**" = ["D", "F", "E"]

 [tool.yapf]
 ALIGN_CLOSING_BRACKET_WITH_VISUAL_INDENT = true
@@ -338,18 +304,23 @@ SPLIT_PENALTY_BEFORE_IF_EXPR = 10000
 SPLIT_PENALTY_COMPREHENSION = 3000
 SPLIT_PENALTY_FOR_ADDED_LINE_SPLIT = 8000

+[tool.pytest.ini_options]
+addopts = ["-rfEX", "-pno:warnings", "--snapshot-warn-unused"]
+python_files = ["test_*.py", "*_test.py"]
+testpaths = ["openllm-python/tests"]
+
 [tool.coverage.paths]
 openllm = ["src/openllm", "*/openllm/src/openllm"]
 [tool.coverage.run]
 branch = true
 omit = [
   "__pypackages__/*",
-  "src/openllm/_version.py",
-  "src/openllm/playground/",
-  "src/openllm/__init__.py",
-  "src/openllm/__main__.py",
-  "src/openllm/utils/dummy_*.py",
-  "src/openllm/_typing_compat.py",
+  "openllm-python/src/openllm/_version.py",
+  "openllm-python/src/openllm/playground/",
+  "openllm-python/src/openllm/__init__.py",
+  "openllm-python/src/openllm/__main__.py",
+  "openllm-python/src/openllm/utils/dummy_*.py",
+  "openllm-python/src/openllm/_typing_compat.py",
 ]
 source_pkgs = ["openllm"]
 [tool.coverage.report]
@@ -375,32 +346,35 @@ exclude_lines = [
 ]
 omit = [
   "__pypackages__/*",
-  "src/openllm/_version.py",
-  "src/openllm/playground/",
-  "src/openllm/__init__.py",
-  "src/openllm/__main__.py",
-  "src/openllm/utils/dummy_*.py",
-  "src/openllm/_typing_compat.py",
+  "openllm-python/src/openllm/_version.py",
+  "openllm-python/src/openllm/playground/",
+  "openllm-python/src/openllm/__init__.py",
+  "openllm-python/src/openllm/__main__.py",
+  "openllm-python/src/openllm/utils/dummy_*.py",
+  "openllm-python/src/openllm/_typing_compat.py",
 ]
 precision = 2
 show_missing = true
+# NOTE: The following strict type checkers are in use:
+# - mypy: for both runtime and static type checking (since we compile the wheels)
+# - pyright: for static type checking only, as we want to use newer typing features
 [tool.pyright]
 analysis.useLibraryCodeForTypes = true
 exclude = [
   "__pypackages__/*",
-  "src/openllm/playground/",
-  "src/openllm/models/",
-  "src/openllm/__init__.py",
-  "src/openllm/__main__.py",
-  "src/openllm/utils/dummy_*.py",
-  "src/openllm/_typing_compat.py",
+  "openllm-python/src/openllm/playground/",
+  "openllm-python/src/openllm/models/",
+  "openllm-python/src/openllm/__init__.py",
+  "openllm-python/src/openllm/__main__.py",
+  "openllm-python/src/openllm/utils/dummy_*.py",
+  "openllm-python/src/openllm/_typing_compat.py",
+  "openllm-python/tests",
   "tools",
   "examples",
-  "tests",
 ]
-include = ["src/openllm"]
-pythonVersion = "3.8"
+include = ["openllm-python/src/openllm"]
+pythonVersion = "3.12"
 reportMissingImports = "warning"
 reportMissingTypeStubs = false
 reportPrivateUsage = "warning"
@@ -413,16 +387,21 @@ typeCheckingMode = "strict"

 [tool.mypy]
 exclude = [
-  "src/openllm/playground/",
-  "src/openllm/utils/dummy_*.py",
-  "src/openllm/models",
-  "src/openllm/_typing_compat.py",
+  "examples",
+  "tools",
+  "cz.py",
+  "openllm-python/tests",
+  "openllm-python/src/openllm/playground",
+  "openllm-python/src/openllm/utils/dummy_*.py",
+  "openllm-python/src/openllm/models",
+  "openllm-python/src/openllm/_typing_compat.py",
 ]
 modules = ["openllm"]
 mypy_path = "typings"
 pretty = true
 python_version = "3.8"
 show_error_codes = true
+strict = true
 warn_no_return = false
 warn_return_any = false
 warn_unreachable = true
@@ -452,67 +431,3 @@ module = [
 [[tool.mypy.overrides]]
 ignore_errors = true
 module = ["openllm.models.*", "openllm.playground.*", "openllm._typing_compat"]
-
-[tool.hatch.version]
-fallback-version = "0.0.0"
-source = "vcs"
-[tool.hatch.build.hooks.vcs]
-version-file = "src/openllm/_version.py"
-[tool.hatch.version.raw-options]
-git_describe_command = ["git", "describe", "--dirty", "--tags", "--long", "--first-parent"]
-local_scheme = "no-local-version"
-[tool.hatch.metadata]
-allow-direct-references = true
-[tool.hatch.build.targets.wheel]
-exclude = ["/src/openllm-node"]
-only-include = ["src"]
-sources = ["src"]
-[tool.hatch.build.targets.sdist]
-exclude = ["/.github", "/typings", "/examples", "/assets", "/changelog.d", "/tools", ".git"]
-[tool.hatch.build.targets.wheel.hooks.mypyc]
-dependencies = [
-  "hatch-mypyc==0.16.0",
-  "mypy==1.4.1",
-  # avoid https://github.com/pallets/click/issues/2558
-  "click==8.1.3",
-  "bentoml==1.1.1",
-  "transformers>=4.31.0",
-  "pandas-stubs",
-  "types-psutil",
-  "types-tabulate",
-  "types-PyYAML",
-  "types-protobuf",
-]
-enable-by-default = false
-include = [
-  "src/openllm/bundle",
-  "src/openllm/models/__init__.py",
-  "src/openllm/models/auto/__init__.py",
-  "src/openllm/utils/__init__.py",
-  "src/openllm/utils/codegen.py",
-  "src/openllm/__init__.py",
-  "src/openllm/_prompt.py",
-  "src/openllm/_schema.py",
-  "src/openllm/_quantisation.py",
-  "src/openllm/_generation.py",
-  "src/openllm/_strategies.py",
-  "src/openllm/exceptions.py",
-  "src/openllm/testing.py",
-]
-# NOTE: This is consistent with pyproject.toml
-mypy-args = [
-  "--strict",
-  # this is because all transient library doesn't have types
-  "--allow-subclassing-any",
-  "--follow-imports=skip",
-  "--check-untyped-defs",
-  "--ignore-missing-imports",
-  "--no-warn-return-any",
-  "--warn-unreachable",
-  "--no-warn-no-return",
-  "--no-warn-unused-ignores",
"--exclude='/src\\/openllm\\/playground\\/**'", - "--exclude='/src\\/openllm\\/_typing_compat\\.py$'", -] -options = { verbose = true, strip_asserts = true, debug_level = "2", opt_level = "3", include_runtime_files = true } -require-runtime-dependencies = true diff --git a/taplo.toml b/taplo.toml deleted file mode 100644 index 994b031b..00000000 --- a/taplo.toml +++ /dev/null @@ -1,7 +0,0 @@ -include = ["*.toml"] - -[formatting] -align_entries = false -column_width = 120 -indent_string = " " -reorder_keys = true diff --git a/tools/assert-model-table-latest b/tools/assert-model-table-latest index c481c1fa..c0221136 100755 --- a/tools/assert-model-table-latest +++ b/tools/assert-model-table-latest @@ -1,25 +1,21 @@ #!/usr/bin/env python3 - from __future__ import annotations -import os -import subprocess -import sys - +import os, subprocess, sys from markdown_it import MarkdownIt md = MarkdownIt() ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) -with open(os.path.join(ROOT, "README.md"), "r") as f: - readme = md.parse(f.read()) +with open(os.path.join(ROOT, "README.md"), "r") as f: readme = md.parse(f.read()) +sys.path.insert(0, os.path.join(ROOT,"openllm-python","src")) +import openllm # NOTE: Currently, we only have one table in README, which is the Model readme. table = [r for r in readme if r.type == "html_block" and r.content.startswith(" dict[int, str]: - return {v: status for v, status in zip(range(1, 8), ["1 - Planning", "2 - Pre-Alpha", "3 - Alpha", "4 - Beta", "5 - Production/Stable", "6 - Mature", "7 - Inactive",],)} - + def status() -> dict[int, str]: return {v: status for v, status in zip(range(1, 8), ["1 - Planning", "2 - Pre-Alpha", "3 - Alpha", "4 - Beta", "5 - Production/Stable", "6 - Mature", "7 - Inactive",],)} @staticmethod - def apache() -> str: - return Classifier.create_classifier("license", "OSI Approved", "Apache Software License") - + def apache() -> str: return Classifier.create_classifier("license", "OSI Approved", "Apache Software License") @staticmethod def create_classifier(identifier: str, *decls: t.Any) -> str: cls_ = Classifier() if identifier not in cls_.identifier: raise ValueError(f"{identifier} is not yet supported (supported alias: {Classifier.identifier})") return cls_.joiner.join([cls_.identifier[identifier], *decls]) - @staticmethod def create_python_classifier(implementation: list[str] | None = None, supported_version: list[str] | None = None) -> list[str]: - if supported_version is None: - supported_version = ["3.8", "3.9", "3.10", "3.11", "3.12"] - if implementation is None: - implementation = ["CPython", "PyPy"] + if supported_version is None: supported_version = ["3.8", "3.9", "3.10", "3.11", "3.12"] + if implementation is None: implementation = ["CPython", "PyPy"] base = [Classifier.create_classifier("language", "Python"), Classifier.create_classifier("language", "Python", "3"),] base.append(Classifier.create_classifier("language", "Python", "3", "Only")) base.extend([Classifier.create_classifier("language", "Python", version) for version in supported_version]) base.extend([Classifier.create_classifier("language", "Python", "Implementation", impl) for impl in implementation]) return base - @staticmethod - def create_status_classifier(level: int) -> str: - return Classifier.create_classifier("status", Classifier.status()[level]) + def create_status_classifier(level: int) -> str: return Classifier.create_classifier("status", Classifier.status()[level]) @dataclasses.dataclass(frozen=True) class Dependencies: @@ -62,47 +49,26 @@ 
class Dependencies: lower_constraint: t.Optional[str] = None upper_constraint: t.Optional[str] = None platform: t.Optional[t.Tuple[t.Literal["Linux", "Windows", "Darwin"], t.Literal["eq", "ne"]]] = None - - def with_options(self, **kwargs: t.Any) -> Dependencies: - return dataclasses.replace(self, **kwargs) - + def with_options(self, **kwargs: t.Any) -> Dependencies: return dataclasses.replace(self, **kwargs) @property - def has_constraint(self) -> bool: - return self.lower_constraint is not None or self.upper_constraint is not None - + def has_constraint(self) -> bool: return self.lower_constraint is not None or self.upper_constraint is not None @property - def pypi_extensions(self) -> str: - return "" if self.extensions is None else f"[{','.join(self.extensions)}]" - + def pypi_extensions(self) -> str: return "" if self.extensions is None else f"[{','.join(self.extensions)}]" @staticmethod - def platform_restriction(platform: t.LiteralString, op: t.Literal["eq", "ne"] = "eq") -> str: - return f'platform_system{"==" if op == "eq" else "!="}"{platform}"' - + def platform_restriction(platform: t.LiteralString, op: t.Literal["eq", "ne"] = "eq") -> str: return f'platform_system{"==" if op == "eq" else "!="}"{platform}"' def to_str(self) -> str: deps: list[str] = [] - if self.lower_constraint is not None and self.upper_constraint is not None: - dep = f"{self.name}{self.pypi_extensions}>={self.lower_constraint},<{self.upper_constraint}" - elif self.lower_constraint is not None: - dep = f"{self.name}{self.pypi_extensions}>={self.lower_constraint}" - elif self.upper_constraint is not None: - dep = f"{self.name}{self.pypi_extensions}<{self.upper_constraint}" - elif self.subdirectory is not None: - dep = f"{self.name}{self.pypi_extensions} @ git+https://github.com/{self.git_repo_url}.git#subdirectory={self.subdirectory}" - elif self.branch is not None: - dep = f"{self.name}{self.pypi_extensions} @ git+https://github.com/{self.git_repo_url}.git@{self.branch}" - else: - dep = f"{self.name}{self.pypi_extensions}" - + if self.lower_constraint is not None and self.upper_constraint is not None: dep = f"{self.name}{self.pypi_extensions}>={self.lower_constraint},<{self.upper_constraint}" + elif self.lower_constraint is not None: dep = f"{self.name}{self.pypi_extensions}>={self.lower_constraint}" + elif self.upper_constraint is not None: dep = f"{self.name}{self.pypi_extensions}<{self.upper_constraint}" + elif self.subdirectory is not None: dep = f"{self.name}{self.pypi_extensions} @ git+https://github.com/{self.git_repo_url}.git#subdirectory={self.subdirectory}" + elif self.branch is not None: dep = f"{self.name}{self.pypi_extensions} @ git+https://github.com/{self.git_repo_url}.git@{self.branch}" + else: dep = f"{self.name}{self.pypi_extensions}" deps.append(dep) - - if self.platform: - deps.append(self.platform_restriction(*self.platform)) - + if self.platform: deps.append(self.platform_restriction(*self.platform)) return ";".join(deps) - @classmethod - def from_tuple(cls, *decls: t.Any) -> Dependencies: - return cls(*decls) + def from_tuple(cls, *decls: t.Any) -> Dependencies: return cls(*decls) _BENTOML_EXT = ["grpc", "io"] _TRANSFORMERS_EXT = ["torch", "tokenizers", "accelerate"] @@ -126,11 +92,6 @@ _BASE_DEPENDENCIES = [ Dependencies(name="bitsandbytes", upper_constraint="0.42"), # 0.41 works with CUDA 11.8 ] -_NIGHTLY_MAPPING: dict[str, Dependencies] = { - "bentoml": Dependencies.from_tuple("bentoml", "bentoml/bentoml", "main", _BENTOML_EXT), "peft": Dependencies.from_tuple("peft", 
"huggingface/peft", "main", None), "transformers": Dependencies.from_tuple("transformers", "huggingface/transformers", "main", _TRANSFORMERS_EXT), "optimum": Dependencies.from_tuple("optimum", "huggingface/optimum", "main", None), - "accelerate": Dependencies.from_tuple("accelerate", "huggingface/accelerate", "main", None), "bitsandbytes": Dependencies.from_tuple("bitsandbytes", "TimDettmers/bitsandbytes", "main", None), "trl": Dependencies.from_tuple("trl", "lvwerra/trl", "main", None), "vllm": Dependencies.from_tuple("vllm", "vllm-project/vllm", "main", None, None, True, None), -} - _ALL_RUNTIME_DEPS = ["flax", "jax", "jaxlib", "tensorflow", "keras"] FINE_TUNE_DEPS = ["peft>=0.4.0", "bitsandbytes", "datasets", "accelerate", "trl"] FLAN_T5_DEPS = _ALL_RUNTIME_DEPS @@ -185,48 +146,89 @@ def create_optional_table() -> Table: return table -def create_url_table() -> Table: +def create_url_table(_info: t.Any) -> Table: table = tomlkit.table() _urls = { - "Blog": "https://modelserving.com", "Chat": "https://discord.gg/openllm", "Documentation": "https://github.com/bentoml/openllm#readme", "GitHub": "https://github.com/bentoml/openllm", "History": "https://github.com/bentoml/openllm/blob/main/CHANGELOG.md", "Homepage": "https://bentoml.com", "Tracker": "https://github.com/bentoml/openllm/issues", + "Blog": "https://modelserving.com", "Chat": "https://discord.gg/openllm", "Documentation": "https://github.com/bentoml/openllm#readme", + "GitHub": _info.html_url, + "History": f"{_info.html_url}/blob/main/CHANGELOG.md", + "Homepage": _info.homepage, + "Tracker": f"{_info.html_url}/issues", "Twitter": "https://twitter.com/bentomlai", } table.update({k: v for k, v in sorted(_urls.items())}) return table +def build_system() -> Table: + table = tomlkit.table() + table.add("build-backend", "hatchling.build") + requires_array = tomlkit.array() + requires_array.extend(["hatchling==1.18.0", "hatch-vcs==0.3.0", "hatch-fancy-pypi-readme==23.1.0", "hatch-mypyc==0.16.0"]) + table.add("requires", requires_array.multiline(True)) + return table + +def authors() -> Array: + arr = tomlkit.array() + arr.append(dict(name="Aaron Pham", email="aarnphm@bentoml.com")) + arr.append(dict(name="BentoML Team", email="contact@bentoml.com")) + return arr.multiline(True) + +def keywords() -> Array: + arr = tomlkit.array() + arr.extend([ + "MLOps", + "AI", + "BentoML", + "Model Serving", + "Model Deployment", + "LLMOps", + "Falcon", + "Vicuna", + "Llama 2", + "Fine tuning", + "Serverless", + "Large Language Model", + "Generative AI", + "StableLM", + "Alpaca", + "PyTorch", + "Transformers"]) + return arr + def build_cli_extensions() -> Table: table = tomlkit.table() ext: dict[str, str] = {"openllm": "openllm.cli.entrypoint:cli"} - ext.update({f"openllm-{inflection.dasherize(ke)}": f"openllm.cli.extension.{ke}:cli" for ke in sorted([fname[:-3] for fname in os.listdir(os.path.abspath(os.path.join(ROOT, "src", "openllm", "cli", "extension"))) if fname.endswith(".py") and not fname.startswith("__")])}) + ext.update({f"openllm-{inflection.dasherize(ke)}": f"openllm.cli.extension.{ke}:cli" for ke in sorted([fname[:-3] for fname in os.listdir(os.path.abspath(os.path.join(ROOT, "openllm-python", "src", "openllm", "cli", "extension"))) if fname.endswith(".py") and not fname.startswith("__")])}) table.update(ext) return table def main() -> int: - with open(os.path.join(ROOT, "pyproject.toml"), "r") as f: - pyproject = tomlkit.parse(f.read()) + api = GhApi(owner=_OWNER, repo=_REPO, authenticate=False) + _info = api.repos.get() + with 
open(os.path.join(ROOT, "openllm-python", "pyproject.toml"), "r") as f: pyproject = tomlkit.parse(f.read()) dependencies_array = tomlkit.array() dependencies_array.extend([v.to_str() for v in _BASE_DEPENDENCIES]) + # dynamic field + dyn_arr = tomlkit.array() + dyn_arr.extend(["version", "readme"]) - pyproject["project"]["urls"] = create_url_table() - pyproject["project"]["scripts"] = build_cli_extensions() + pyproject["build-system"] = build_system() + pyproject["project"]["authors"] = authors() pyproject["project"]["classifiers"] = create_classifiers() - pyproject["project"]["optional-dependencies"] = create_optional_table() pyproject["project"]["dependencies"] = dependencies_array.multiline(True) + pyproject["project"]["description"] = f"{_info.name}: {_info.description}" + pyproject["project"]["dynamic"] = dyn_arr + pyproject["project"]["keywords"] = keywords().multiline(True) + pyproject["project"]["license"] = _info.license.spdx_id + pyproject["project"]["name"] = f"{_info.name.lower()}" + pyproject["project"]["requires-python"] = ">=3.8" - with open(os.path.join(ROOT, "pyproject.toml"), "w") as f: - f.write(tomlkit.dumps(pyproject)) - - with open(os.path.join(ROOT, "nightly-requirements.txt"), "w") as f: - f.write(f"# This file is generated by `{fname}`. DO NOT EDIT\n-e .[playground,flan-t5]\n") - f.writelines([f"{v.to_str()}\n" for v in _NIGHTLY_MAPPING.values() if not v.requires_gpu]) - with open(os.path.join(ROOT, "nightly-requirements-gpu.txt"), "w") as f: - f.write(f"# This file is generated by `{fname}`. # DO NOT EDIT\n") - f.write("# For Jax, Flax, Tensorflow, PyTorch CUDA support, please refers to their official installation for your specific setup.\n") - f.write("-r nightly-requirements.txt\n-e .[all]\n") - f.writelines([f"{v.to_str()}\n" for v in _NIGHTLY_MAPPING.values() if v.requires_gpu]) + pyproject["project"]["urls"] = create_url_table(_info) + pyproject["project"]["scripts"] = build_cli_extensions() + pyproject["project"]["optional-dependencies"] = create_optional_table() + with open(os.path.join(ROOT, "openllm-python", "pyproject.toml"), "w") as f: f.write(tomlkit.dumps(pyproject)) return 0 -if __name__ == "__main__": - raise SystemExit(main()) +if __name__ == "__main__": raise SystemExit(main()) diff --git a/tools/generate-coverage.py b/tools/generate-coverage.py index 12196f3f..65132443 100755 --- a/tools/generate-coverage.py +++ b/tools/generate-coverage.py @@ -8,10 +8,10 @@ from lxml import etree ROOT = Path(__file__).resolve().parent.parent -PACKAGES = {"src/openllm/": "openllm"} +PACKAGES = {"openllm-python/src/openllm/": "openllm"} def main() -> int: - coverage_report = ROOT / "coverage.xml" + coverage_report = ROOT/"coverage.xml" root = etree.fromstring(coverage_report.read_text()) raw_package_data: defaultdict[str, dict[str, int]] = defaultdict(lambda: {"hits": 0, "misses": 0}) @@ -27,10 +27,8 @@ def main() -> int: raise ValueError(message) for line in module.find("lines"): - if line.attrib["hits"] == "1": - data["hits"] += 1 - else: - data["misses"] += 1 + if line.attrib["hits"] == "1": data["hits"] += 1 + else: data["misses"] += 1 total_statements_covered = 0 total_statements = 0 @@ -40,14 +38,11 @@ def main() -> int: statements = statements_covered + data["misses"] total_statements_covered += statements_covered total_statements += statements - coverage_data[package_name] = {"statements_covered": statements_covered, "statements": statements} coverage_data["total"] = {"statements_covered": total_statements_covered, "statements": total_statements} - 
coverage_summary = ROOT / "coverage-summary.json" + coverage_summary = ROOT/"coverage-summary.json" coverage_summary.write_text(orjson.dumps(coverage_data, option=orjson.OPT_INDENT_2).decode(), encoding="utf-8") - return 0 -if __name__ == "__main__": - raise SystemExit(main()) +if __name__ == "__main__": raise SystemExit(main()) diff --git a/tools/lock-actions b/tools/lock-actions new file mode 100755 index 00000000..cda8f826 --- /dev/null +++ b/tools/lock-actions @@ -0,0 +1,14 @@ +#!/usr/bin/env bash + +set -ex + +GIT_ROOT=$(git rev-parse --show-toplevel) + +cd "$GIT_ROOT" || exit 1 + +[[ -x "$(command -v docker)" ]] || ( + echo "docker not found. Make sure to have docker running to run this job." + exit 1 +) + +find "${GIT_ROOT}/.github/workflows" -type f -iname '*.yml' -exec docker run -it --rm -v "${PWD}":"${PWD}" -w "${PWD}" ghcr.io/sethvargo/ratchet:0.4.0 pin {} \; diff --git a/tools/sync-readme b/tools/sync-readme new file mode 100755 index 00000000..3de68ab4 --- /dev/null +++ b/tools/sync-readme @@ -0,0 +1,9 @@ +#!/usr/bin/env bash + +if ! git diff --quiet README.md; then + cp README.md openllm-python/README.md + exit 1 +else + echo "README.md is up to date" + exit 0 +fi diff --git a/tools/update-brew-tap.py b/tools/update-brew-tap.py index ed9091da..43dd7bed 100755 --- a/tools/update-brew-tap.py +++ b/tools/update-brew-tap.py @@ -22,8 +22,7 @@ def determine_release_url(svn_url: str, tag: str, target: t.Literal["macos_arm", return f"{svn_url}/releases/download/{tag}/openllm-{tag.replace('v', '')}-{_gz_strategies[target]}.tar.gz" # curl -sSL /archive/refs/tags/.tar.gz | shasum -a256 | cut -d'' -f1 -def get_release_hash_command(svn_url: str, tag: str) -> Pipeline: - return curl["-sSL", svn_url] | shasum["-a256"] | cut["-d", " ", "-f1"] +def get_release_hash_command(svn_url: str, tag: str) -> Pipeline: return curl["-sSL", svn_url] | shasum["-a256"] | cut["-d", " ", "-f1"] def main() -> int: api = GhApi(owner=_OWNER, repo=_REPO, authenticate=False) @@ -35,7 +34,7 @@ def main() -> int: ENVIRONMENT = Environment(extensions=["jinja2.ext.do", "jinja2.ext.loopcontrols", "jinja2.ext.debug"], trim_blocks=True, lstrip_blocks=True, loader=FileSystemLoader((ROOT / "Formula").__fspath__(), followlinks=True)) template_file = "openllm.rb.j2" - with (ROOT / "Formula" / "openllm.rb").open("w") as f: + with (ROOT/"Formula"/"openllm.rb").open("w") as f: f.write(ENVIRONMENT.get_template(template_file, globals={"determine_release_url": determine_release_url}).render(shadict=shadict, __tag__=release_tag, __cmd__=fs.path.join(os.path.basename(os.path.dirname(__file__)), os.path.basename(__file__)), __template_file__=fs.path.join("Formula", template_file), __gz_extension__=_gz_strategies, **_info)) f.write("\n") return 0 diff --git a/tools/update-config-stubs.py b/tools/update-config-stubs.py index 957ee0bd..dd4bad3e 100755 --- a/tools/update-config-stubs.py +++ b/tools/update-config-stubs.py @@ -1,12 +1,8 @@ #!/usr/bin/env python3 from __future__ import annotations -import importlib -import os +import os, sys from pathlib import Path -import openllm -from openllm._configuration import GenerationConfig, ModelSettings, PeftType, SamplingParams - # currently we are assuming the indentatio level is 2 for comments START_COMMENT = f"# {os.path.basename(__file__)}: start\n" END_COMMENT = f"# {os.path.basename(__file__)}: stop\n" @@ -15,8 +11,12 @@ END_SPECIAL_COMMENT = f"# {os.path.basename(__file__)}: special stop\n" START_ATTRS_COMMENT = f"# {os.path.basename(__file__)}: attrs start\n" END_ATTRS_COMMENT = f"# 
{os.path.basename(__file__)}: attrs stop\n" -_TARGET_FILE = Path(__file__).parent.parent/"src"/"openllm"/"_configuration.py" -_imported = importlib.import_module(ModelSettings.__module__) +ROOT = Path(__file__).parent.parent +_TARGET_FILE = ROOT/"openllm-python"/"src"/"openllm"/"_configuration.py" + +sys.path.insert(0, (ROOT/"openllm-python"/"src").__fspath__()) +from openllm._configuration import GenerationConfig, ModelSettings, PeftType, SamplingParams +from openllm.utils import codegen def process_annotations(annotations: str) -> str: if "NotRequired" in annotations: return annotations[len("NotRequired["):-1] @@ -93,34 +93,31 @@ def main() -> int: # NOTE: inline stubs __config__ attrs representation special_attrs_lines: list[str] = [] - for keys, ForwardRef in openllm.utils.codegen.get_annotations(ModelSettings).items(): special_attrs_lines.append(f"{' ' * 4}{keys}: {_transformed.get(keys, process_annotations(ForwardRef.__forward_arg__))}\n") - + for keys, ForwardRef in codegen.get_annotations(ModelSettings).items(): special_attrs_lines.append(f"{' ' * 4}{keys}: {_transformed.get(keys, process_annotations(ForwardRef.__forward_arg__))}\n") # NOTE: inline stubs for _ConfigAttr type stubs config_attr_lines: list[str] = [] - for keys, ForwardRef in openllm.utils.codegen.get_annotations(ModelSettings).items(): + for keys, ForwardRef in codegen.get_annotations(ModelSettings).items(): config_attr_lines.extend([" "*4 + line for line in [f"__openllm_{keys}__: {_transformed.get(keys, process_annotations(ForwardRef.__forward_arg__))} = Field(None)\n", f'"""{_value_docstring[keys]}"""\n',]]) - # NOTE: inline runtime __getitem__ overload process lines: list[str] = [] lines.append(" "*2 + "# NOTE: ModelSettings arguments\n") - for keys, ForwardRef in openllm.utils.codegen.get_annotations(ModelSettings).items(): lines.extend([" "*2 + line for line in ["@overload\n" if "overload" in dir(_imported) else "@t.overload\n", f'def __getitem__(self, item: t.Literal["{keys}"]) -> {_transformed.get(keys, process_annotations(ForwardRef.__forward_arg__))}: ...\n',]]) + for keys, ForwardRef in codegen.get_annotations(ModelSettings).items(): lines.extend([" "*2 + line for line in ["@overload\n", f'def __getitem__(self, item: t.Literal["{keys}"]) -> {_transformed.get(keys, process_annotations(ForwardRef.__forward_arg__))}: ...\n',]]) # special case variables: generation_class, extras, sampling_class lines.append(" "*2 + "# NOTE: generation_class, sampling_class and extras arguments\n") lines.extend([ - " "*2 + line for line in [ - "@overload\n" if "overload" in dir(_imported) else "@t.overload\n", 'def __getitem__(self, item: t.Literal["generation_class"]) -> t.Type[openllm.GenerationConfig]: ...\n', "@overload\n" if "overload" in dir(_imported) else "@t.overload\n", 'def __getitem__(self, item: t.Literal["sampling_class"]) -> t.Type[openllm.SamplingParams]: ...\n', "@overload\n" - if "overload" in dir(_imported) else "@t.overload\n", 'def __getitem__(self, item: t.Literal["extras"]) -> t.Dict[str, t.Any]: ...\n', - ] - ]) + " "*2 + line for line in [ + "@overload\n", 'def __getitem__(self, item: t.Literal["generation_class"]) -> t.Type[openllm.GenerationConfig]: ...\n', + "@overload\n", 'def __getitem__(self, item: t.Literal["sampling_class"]) -> t.Type[openllm.SamplingParams]: ...\n', + "@overload\n", 'def __getitem__(self, item: t.Literal["extras"]) -> t.Dict[str, t.Any]: ...\n', + ]]) lines.append(" "*2 + "# NOTE: GenerationConfig arguments\n") - generation_config_anns = 
openllm.utils.codegen.get_annotations(GenerationConfig) - for keys, type_pep563 in generation_config_anns.items(): lines.extend([" "*2 + line for line in ["@overload\n" if "overload" in dir(_imported) else "@t.overload\n", f'def __getitem__(self, item: t.Literal["{keys}"]) -> {type_pep563}: ...\n',]]) + generation_config_anns = codegen.get_annotations(GenerationConfig) + for keys, type_pep563 in generation_config_anns.items(): lines.extend([" "*2 + line for line in ["@overload\n", f'def __getitem__(self, item: t.Literal["{keys}"]) -> {type_pep563}: ...\n']]) lines.append(" "*2 + "# NOTE: SamplingParams arguments\n") - for keys, type_pep563 in openllm.utils.codegen.get_annotations(SamplingParams).items(): - if keys not in generation_config_anns: lines.extend([" "*2 + line for line in ["@overload\n" if "overload" in dir(_imported) else "@t.overload\n", f'def __getitem__(self, item: t.Literal["{keys}"]) -> {type_pep563}: ...\n',]]) - + for keys, type_pep563 in codegen.get_annotations(SamplingParams).items(): + if keys not in generation_config_anns: lines.extend([" "*2 + line for line in ["@overload\n", f'def __getitem__(self, item: t.Literal["{keys}"]) -> {type_pep563}: ...\n',]]) lines.append(" "*2 + "# NOTE: PeftType arguments\n") - for keys in PeftType._member_names_: lines.extend([" "*2 + line for line in ["@overload\n" if "overload" in dir(_imported) else "@t.overload\n", f'def __getitem__(self, item: t.Literal["{keys.lower()}"]) -> dict[str, t.Any]: ...\n',]]) + for keys in PeftType._member_names_: lines.extend([" "*2 + line for line in ["@overload\n", f'def __getitem__(self, item: t.Literal["{keys.lower()}"]) -> dict[str, t.Any]: ...\n',]]) processed = processed[:start_attrs_idx] + [" "*4 + START_ATTRS_COMMENT, *special_attrs_lines, " "*4 + END_ATTRS_COMMENT] + processed[end_attrs_idx + 1:start_stub_idx] + [" "*4 + START_SPECIAL_COMMENT, *config_attr_lines, " "*4 + END_SPECIAL_COMMENT] + processed[end_stub_idx + 1:start_idx] + [" "*2 + START_COMMENT, *lines, " "*2 + END_COMMENT] + processed[end_idx + 1:] with _TARGET_FILE.open("w") as f: f.writelines(processed) diff --git a/tools/update-dummy.py b/tools/update-dummy.py index 07cd9071..78038ee3 100755 --- a/tools/update-dummy.py +++ b/tools/update-dummy.py @@ -1,23 +1,23 @@ #!/usr/bin/env python3 from __future__ import annotations -import os -import typing as t +import os, typing as t, sys from pathlib import Path - -import openllm -from openllm._configuration import LiteralRuntime - -if t.TYPE_CHECKING: - from collections import OrderedDict - _ROOT = Path(__file__).parent.parent -config_requirements = {k:[_.replace("-", "_") for _ in v.__openllm_requirements__] if v.__openllm_requirements__ else None for k,v in openllm.CONFIG_MAPPING.items()} + +sys.path.insert(0, (_ROOT/"openllm-python"/"src").__fspath__()) +from openllm._configuration import LiteralRuntime +from openllm.models import auto +from openllm import CONFIG_MAPPING + +if t.TYPE_CHECKING: from collections import OrderedDict + +config_requirements = {k:[_.replace("-", "_") for _ in v.__openllm_requirements__] if v.__openllm_requirements__ else None for k,v in CONFIG_MAPPING.items()} _dependencies: dict[LiteralRuntime,str] = {k:v for k,v in zip(LiteralRuntime.__args__, ("torch", "tensorflow", "flax", "vllm"))} _auto: dict[str,str] = {k:v for k,v in zip(LiteralRuntime.__args__, ("AutoLLM", "AutoTFLLM", "AutoFlaxLLM", "AutoVLLM"))} -def get_target_dummy_file(framework: LiteralRuntime) -> Path: return _ROOT.joinpath("src","openllm","utils",f"dummy_{framework}_objects.py") +def 
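What update-config-stubs.py writes between its marker comments is a run of `@overload`-decorated `__getitem__` stubs, one per known config key. An illustration of the generated shape; the keys below are examples only, not the full generated set (the `extras` overload is taken verbatim from the tool):

```python
# Illustrative output only: the real stubs are generated into openllm/_configuration.py
# between "# update-config-stubs.py: start" and "# update-config-stubs.py: stop".
import typing as t
from typing import overload

class LLMConfigStub:
  @overload
  def __getitem__(self, item: t.Literal["max_new_tokens"]) -> int: ...
  @overload
  def __getitem__(self, item: t.Literal["temperature"]) -> float: ...
  @overload
  def __getitem__(self, item: t.Literal["extras"]) -> t.Dict[str, t.Any]: ...
  def __getitem__(self, item: str) -> t.Any:
    # Single runtime implementation behind all overloads; a real config would do a lookup here.
    raise NotImplementedError
```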
diff --git a/tools/update-dummy.py b/tools/update-dummy.py
index 07cd9071..78038ee3 100755
--- a/tools/update-dummy.py
+++ b/tools/update-dummy.py
@@ -1,23 +1,23 @@
 #!/usr/bin/env python3
 from __future__ import annotations
-import os
-import typing as t
+import os, typing as t, sys
 from pathlib import Path
-
-import openllm
-from openllm._configuration import LiteralRuntime
-
-if t.TYPE_CHECKING:
-  from collections import OrderedDict
-
 _ROOT = Path(__file__).parent.parent
-config_requirements = {k:[_.replace("-", "_") for _ in v.__openllm_requirements__] if v.__openllm_requirements__ else None for k,v in openllm.CONFIG_MAPPING.items()}
+
+sys.path.insert(0, (_ROOT/"openllm-python"/"src").__fspath__())
+from openllm._configuration import LiteralRuntime
+from openllm.models import auto
+from openllm import CONFIG_MAPPING
+
+if t.TYPE_CHECKING: from collections import OrderedDict
+
+config_requirements = {k:[_.replace("-", "_") for _ in v.__openllm_requirements__] if v.__openllm_requirements__ else None for k,v in CONFIG_MAPPING.items()}
 _dependencies: dict[LiteralRuntime,str] = {k:v for k,v in zip(LiteralRuntime.__args__, ("torch", "tensorflow", "flax", "vllm"))}
 _auto: dict[str,str] = {k:v for k,v in zip(LiteralRuntime.__args__, ("AutoLLM", "AutoTFLLM", "AutoFlaxLLM", "AutoVLLM"))}
-def get_target_dummy_file(framework: LiteralRuntime) -> Path: return _ROOT.joinpath("src","openllm","utils",f"dummy_{framework}_objects.py")
+def get_target_dummy_file(framework: LiteralRuntime) -> Path: return _ROOT/"openllm-python"/"src"/"openllm"/"utils"/f"dummy_{framework}_objects.py"
 def mapping_names(framework: LiteralRuntime): return "MODEL_MAPPING_NAMES" if framework == "pt" else f"MODEL_{framework.upper()}_MAPPING_NAMES"
-def get_mapping(framework: LiteralRuntime) -> OrderedDict[t.Any, t.Any]: return getattr(openllm.models.auto, mapping_names(framework))
+def get_mapping(framework: LiteralRuntime) -> OrderedDict[t.Any, t.Any]: return getattr(auto, mapping_names(framework))

 def make_class_stub(model_name: str, framework: LiteralRuntime, indentation: int = 2, auto: bool = False) -> list[str]:
   _dep_list: list[str] = [f'"{v}"' for v in [_dependencies[framework], *(t.cast(t.List[str], config_requirements[model_name]) if model_name != "__default__" and config_requirements[model_name] else [])]]
diff --git a/tools/update-models-import.py b/tools/update-models-import.py
index f1b9ef5a..32022db2 100755
--- a/tools/update-models-import.py
+++ b/tools/update-models-import.py
@@ -3,7 +3,7 @@ from __future__ import annotations
 import os
 from pathlib import Path
-_TARGET_FILE = Path(__file__).parent.parent/"src"/"openllm"/"models"/"__init__.py"
+_TARGET_FILE = Path(__file__).parent.parent/"openllm-python"/"src"/"openllm"/"models"/"__init__.py"

 def create_module_import() -> str:
   r = [f'"{p.name}"' for p in _TARGET_FILE.parent.glob('*/') if p.name not in ['__pycache__', '__init__.py', '.DS_Store']]
diff --git a/tools/update-readme.py b/tools/update-readme.py
index 36904a27..a5f2f5ba 100755
--- a/tools/update-readme.py
+++ b/tools/update-readme.py
@@ -1,23 +1,18 @@
 #!/usr/bin/env python3
 from __future__ import annotations
-import os
+import os, inflection, tomlkit, sys
 import typing as t
-import inflection
-import tomlkit
-
-import openllm
-
 START_COMMENT = f"\n"
 END_COMMENT = f"\n"
 ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+sys.path.insert(0, os.path.join(ROOT,"openllm-python","src"))
+import openllm

 def main() -> int:
-  with open(os.path.join(ROOT, "pyproject.toml"), "r") as f:
-    deps = tomlkit.parse(f.read()).value["project"]["optional-dependencies"]
-  with open(os.path.join(ROOT, "README.md"), "r") as f:
-    readme = f.readlines()
+  with open(os.path.join(ROOT, "openllm-python", "pyproject.toml"), "r") as f: deps = tomlkit.parse(f.read()).value["project"]["optional-dependencies"]
+  with open(os.path.join(ROOT, "README.md"), "r") as f: readme = f.readlines()
   start_index, stop_index = readme.index(START_COMMENT), readme.index(END_COMMENT)
   formatted: dict[t.Literal["Model", "Architecture", "URL", "Installation", "Model Ids"], list[str | list[str]]] = {"Model": [], "Architecture": [], "URL": [], "Model Ids": [], "Installation": [],}
@@ -28,14 +23,10 @@ def main() -> int:
   formatted["Architecture"].append(config_cls.__openllm_architecture__)
   formatted["URL"].append(config_cls.__openllm_url__)
   formatted["Model Ids"].append(config_cls.__openllm_model_ids__)
-  if dashed in deps:
-    instruction = f'```bash\npip install "openllm[{dashed}]"\n```'
-  else:
-    instruction = "```bash\npip install openllm\n```"
-  if len(instruction) > max_install_len_div:
-    max_install_len_div = len(instruction)
+  if dashed in deps: instruction = f'```bash\npip install "openllm[{dashed}]"\n```'
+  else: instruction = "```bash\npip install openllm\n```"
+  if len(instruction) > max_install_len_div: max_install_len_div = len(instruction)
   formatted["Installation"].append(instruction)
-
   meta: list[str] = ["\n", "\n"]
   # NOTE: headers
@@ -47,23 +38,18 @@ def main() -> int:
   meta += "\n"
   # configure architecture URL
   cfg_cls = openllm.CONFIG_MAPPING[name]
-  if cfg_cls.__openllm_trust_remote_code__:
-    arch = f"\n"
-  else:
-    model_name = {"dolly_v2": "gpt_neox", "stablelm": "gpt_neox", "starcoder": "gpt_bigcode", "flan_t5": "t5",}.get(cfg_cls.__openllm_model_name__, cfg_cls.__openllm_model_name__)
-    arch = f"\n"
-  meta.extend([f"\n\n", arch,])
+  if cfg_cls.__openllm_trust_remote_code__: arch = f"\n"
+  else: arch = f"\n"
+  meta.extend([f"\n\n", arch])
   format_with_links: list[str] = []
-  for lid in model_ids:
-    format_with_links.append(f"• {lid}• ")
+  for lid in model_ids: format_with_links.append(f"• {lid}• ")
   meta.append("\n")
   meta.append(f"\n")
   meta += "\n"
   meta.extend(["{architecture}{architecture}{name}{architecture}{architecture}{name}\n\n" + "\n".join(format_with_links) + "\n\n\n\n{installation}\n\n\n", "\n"])
   readme = readme[:start_index] + [START_COMMENT] + meta + [END_COMMENT] + readme[stop_index + 1:]
-  with open(os.path.join(ROOT, "README.md"), "w") as f:
-    f.writelines(readme)
+  with open(os.path.join(ROOT, "README.md"), "w") as f: f.writelines(readme)
   return 0

 if __name__ == "__main__": raise SystemExit(main())
diff --git a/tools/write-coverage-report.py b/tools/write-coverage-report.py
index 2b9064e6..c3af337d 100755
--- a/tools/write-coverage-report.py
+++ b/tools/write-coverage-report.py
@@ -2,7 +2,6 @@ from __future__ import annotations
 from decimal import ROUND_DOWN, Decimal
 from pathlib import Path
-
 import orjson

 PRECISION = Decimal(".01")
@@ -10,7 +9,7 @@ PRECISION = Decimal(".01")
 ROOT = Path(__file__).resolve().parent.parent

 def main() -> int:
-  coverage_summary = ROOT / "coverage-summary.json"
+  coverage_summary = ROOT/"coverage-summary.json"
   coverage_data = orjson.loads(coverage_summary.read_text(encoding="utf-8"))
   total_data = coverage_data.pop("total")
@@ -32,13 +31,10 @@ def main() -> int:
   color = "ok" if float(total_rate) >= 95 else "critical"
   lines.insert(0, f"![Code Coverage](https://img.shields.io/badge/coverage-{total_rate}%25-{color}?style=flat)\n")
-  lines.append(f"**Summary** | {100 if total_rate == 100 else total_rate}% "
-               f"({total_statements_covered} / {total_statements})\n")
+  lines.append(f"**Summary** | {100 if total_rate == 100 else total_rate}% ({total_statements_covered} / {total_statements})\n")
-  coverage_report = ROOT / "coverage-report.md"
-  with coverage_report.open("w", encoding="utf-8") as f:
-    f.write("".join(lines))
+  coverage_report = ROOT/"coverage-report.md"
+  with coverage_report.open("w", encoding="utf-8") as f: f.write("".join(lines))
   return 0

-if __name__ == "__main__":
-  raise SystemExit(main())
+if __name__ == "__main__": raise SystemExit(main())