diff --git a/.git_archival.txt b/.git_archival.txt
new file mode 100644
index 00000000..8fb235d7
--- /dev/null
+++ b/.git_archival.txt
@@ -0,0 +1,4 @@
+node: $Format:%H$
+node-date: $Format:%cI$
+describe-name: $Format:%(describe:tags=true,match=*[0-9]*)$
+ref-names: $Format:%D$
diff --git a/.gitattributes b/.gitattributes
index 535c916b..19b87f7b 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -3,3 +3,5 @@ nightly-requirements-gpu.txt linguist-generated=true
tests/models/__snapshots__/* linguist-generated=true
typings/**/*.pyi linguist-generated=true
* text=auto eol=lf
+# Needed for setuptools-scm-git-archive
+.git_archival.txt export-subst
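
For context, `export-subst` makes `git archive` expand the `$Format:...$` placeholders when GitHub builds a source tarball, so hatch-vcs/setuptools-scm can derive a version even without a `.git` directory. With hypothetical values, the substituted `.git_archival.txt` inside such an archive would look roughly like:

```
node: 1a2b3c4d5e6f7a8b9c0d1e2f3a4b5c6d7e8f9a0b
node-date: 2023-07-20T10:00:00-04:00
describe-name: v0.2.10-3-g1a2b3c4
ref-names: HEAD -> main, tag: v0.2.10
```
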
diff --git a/.github/actions/release.sh b/.github/actions/release.sh
index 38ff6f53..2635898c 100755
--- a/.github/actions/release.sh
+++ b/.github/actions/release.sh
@@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-set -ex
+set -e
# Function to print script usage
print_usage() {
@@ -61,41 +61,43 @@ fi
release_package() {
local version="$1"
echo "Releasing version ${version}..."
-
jq --arg release_version "${version}" '.version = $release_version' < package.json > package.json.tmp && mv package.json.tmp package.json
-
- if [[ $release == 'patch' ]]; then
- hatch version "${version}"
- fi
-
towncrier build --yes --version "${version}"
- git add CHANGELOG.md changelog.d src/openllm/__about__.py package.json
+ git add CHANGELOG.md changelog.d package.json
git commit -S -sm "infra: prepare for release ${version} [generated] [skip ci]"
git push origin main
-
echo "Releasing tag ${version}..." && git tag -a "v${version}" -sm "Release ${version} [generated by GitHub Actions]"
git push origin "v${version}"
-
echo "Finish releasing version ${version}"
}
-echo "Cleaning previously built artifacts..." && hatch clean
+# Get the highest tag across all branches, not just the current branch
+version="$(git describe --tags "$(git rev-list --tags --max-count=1)")"
+VERSION="${version#v}"
+# Save the current value of IFS to restore it later
+OLD_IFS=$IFS
+IFS='.'
+# split into array
+read -ra VERSION_BITS <<< "$VERSION"
+# Restore the original value of IFS
+IFS=$OLD_IFS
+VNUM1=${VERSION_BITS[0]}
+VNUM2=${VERSION_BITS[1]}
+VNUM3=${VERSION_BITS[2]}
if [[ $release == 'major' ]]; then
- hatch version major
- CURRENT_VERSION=$(hatch version)
- release_package "${CURRENT_VERSION}"
+ VNUM1=$((VNUM1+1))
+ VNUM2=0
+ VNUM3=0
elif [[ $release == 'minor' ]]; then
- hatch version minor
- CURRENT_VERSION="$(hatch version)"
- release_package "${CURRENT_VERSION}"
+ VNUM2=$((VNUM2+1))
+ VNUM3=0
else
- CURRENT_VERSION=$(hatch version)
-
- if [[ "$CURRENT_VERSION" =~ \.dev ]]; then
- release_package "${CURRENT_VERSION%%.dev*}"
- else
- echo "Current version is not properly setup as dev version. Aborting..."
- exit 1
- fi
+ VNUM3=$((VNUM3+1))
fi
+
+echo "Commit count: $(git rev-list --count HEAD)"
+
+# Create the new release tag
+RELEASE_VERSION="$VNUM1.$VNUM2.$VNUM3"
+release_package "${RELEASE_VERSION}"
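
The tag-based bump above amounts to the following, sketched here in Python purely for illustration (the `bump` helper is hypothetical, not part of the patch):

```python
def bump(latest_tag: str, release: str) -> str:
    # Mirror release.sh: strip the leading "v", split on ".", bump one component.
    major, minor, patch = (int(x) for x in latest_tag.lstrip("v").split("."))
    if release == "major":
        return f"{major + 1}.0.0"
    if release == "minor":
        return f"{major}.{minor + 1}.0"
    return f"{major}.{minor}.{patch + 1}"

assert bump("v0.2.10", "patch") == "0.2.11"
assert bump("v0.2.10", "minor") == "0.3.0"
assert bump("v0.2.10", "major") == "1.0.0"
```
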
diff --git a/.github/workflows/create-releases.yml b/.github/workflows/create-releases.yml
index 82138c37..01d3171b 100644
--- a/.github/workflows/create-releases.yml
+++ b/.github/workflows/create-releases.yml
@@ -90,8 +90,11 @@ jobs:
run: python -m build
- name: Publish to PyPI
uses: pypa/gh-action-pypi-publish@release/v1
+ with:
+ print-hash: true
prepare-next-dev-cycle:
needs:
+ - release
- publish-python
- binary-distribution
runs-on: ubuntu-latest
@@ -126,9 +129,19 @@ jobs:
GIT_COMMITTER_EMAIL: ${{ steps.import-gpg-key.outputs.email }}
run: |
git pull --autostash --no-edit --gpg-sign --ff origin main
- echo "Bumping version to dev..." && hatch version patch && hatch version dev
- jq --arg release_version "$(hatch version)" '.version = $release_version' < package.json > package.json.tmp && mv package.json.tmp package.json
- git add src/openllm/__about__.py package.json && git commit -S -sm "infra: bump to dev version of $(hatch version) [generated] [skip ci]"
+ SEMVER="${{ needs.release.outputs.version }}"
+ OLD_IFS=$IFS
+ IFS='.'
+ read -ra VERSION_BITS <<< "$SEMVER"
+ IFS=$OLD_IFS
+ VNUM1=${VERSION_BITS[0]}
+ VNUM2=${VERSION_BITS[1]}
+ VNUM3=${VERSION_BITS[2]}
+ VNUM3=$((VNUM3+1))
+ DEV_VERSION="$VNUM1.$VNUM2.$VNUM3.dev0"
+ echo "Bumping version to ${DEV_VERSION}..."
+ jq --arg release_version "${DEV_VERSION}" '.version = $release_version' < package.json > package.json.tmp && mv package.json.tmp package.json
+ git add package.json && git commit -S -sm "infra: bump to dev version of ${DEV_VERSION} [generated] [skip ci]"
git push origin HEAD:main
binary-distribution:
if: github.repository_owner == 'bentoml'
@@ -136,6 +149,7 @@ jobs:
name: Create binary/wheels distribution
uses: bentoml/OpenLLM/.github/workflows/binary-releases.yml@main
release-notes:
+ if: github.repository_owner == 'bentoml'
needs:
- release
- publish-python
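
The dev-cycle step above now derives the next `.dev0` version from the `release` job's `version` output instead of from hatch. A minimal Python sketch of that computation (the `next_dev_version` helper is hypothetical):

```python
def next_dev_version(released: str) -> str:
    # Bump the patch component of the just-released version and append .dev0,
    # matching the shell arithmetic in the workflow step above.
    major, minor, patch = (int(x) for x in released.split("."))
    return f"{major}.{minor}.{patch + 1}.dev0"

assert next_dev_version("0.2.11") == "0.2.12.dev0"
```
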
diff --git a/.gitignore b/.gitignore
index 8af78aa2..45dff7a8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -141,3 +141,4 @@ pyapp
/target
.pdm-python
+/src/openllm/_version.py
diff --git a/README.md b/README.md
index 9ed24697..a5b6bff6 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,7 @@
🦾 OpenLLM
-
+
@@ -11,6 +11,14 @@
+
+
+
+
+
+
+
+
An open platform for operating large language models (LLMs) in production.
Fine-tune, serve, deploy, and monitor any LLMs with ease.
@@ -39,10 +47,14 @@ Images or deploy as serverless endpoint via
🤖️ **Bring your own LLM**: Fine-tune any LLM to suit your needs with
`LLM.tuning()`. (Coming soon)
+
+

-## 🏃 Getting Started
+
+
+## 🏃 Getting Started
To use OpenLLM, you need to have Python 3.8 (or newer) and `pip` installed on
your system. We highly recommend using a Virtual Environment to prevent package
@@ -105,6 +117,7 @@ openllm query 'Explain to me the difference between "further" and "farther"'
Visit `http://localhost:3000/docs.json` for OpenLLM's API specification.
+OpenLLM seamlessly supports many models and their variants.
Users can also specify different variants of the model to be served, by
providing the `--model-id` argument, e.g.:
@@ -112,6 +125,10 @@ providing the `--model-id` argument, e.g.:
openllm start flan-t5 --model-id google/flan-t5-large
```
+> **Note** that `openllm` also supports all variants of fine-tuned weights, custom model paths,
+> and quantized weights for any of the supported models, as long as they can be loaded with
+> the model architecture. Refer to the [supported models](https://github.com/bentoml/OpenLLM/tree/main#-supported-models) section for each model's architecture.
+
Use the `openllm models` command to see the list of models and their variants
supported in OpenLLM.
@@ -127,8 +144,6 @@ dependencies can be installed with the instructions below:
| Model |
Architecture |
-CPU |
-GPU |
Model Ids |
Installation |
@@ -136,8 +151,6 @@ dependencies can be installed with the instructions below:
chatglm |
ChatGLMForConditionalGeneration |
-
❌ |
-
✅ |
thudm/chatglm-6b
@@ -159,8 +172,6 @@ pip install "openllm[chatglm]"
dolly-v2 |
GPTNeoXForCausalLM |
-✅ |
-✅ |
databricks/dolly-v2-3b
@@ -180,8 +191,6 @@ pip install openllm
falcon |
FalconForCausalLM |
-❌ |
-✅ |
tiiuae/falcon-7b
@@ -202,8 +211,6 @@ pip install "openllm[falcon]"
flan-t5 |
T5ForConditionalGeneration |
-✅ |
-✅ |
google/flan-t5-small
@@ -225,8 +232,6 @@ pip install "openllm[flan-t5]"
gpt-neox |
GPTNeoXForCausalLM |
-❌ |
-✅ |
@@ -244,8 +249,6 @@ pip install openllm
| llama |
LlamaForCausalLM |
-✅ |
-✅ |
meta-llama/llama-2-70b-chat-hf
@@ -275,8 +278,6 @@ pip install "openllm[llama]"
mpt |
MPTForCausalLM |
-✅ |
-✅ |
mosaicml/mpt-7b
@@ -300,8 +301,6 @@ pip install "openllm[mpt]"
opt |
OPTForCausalLM |
-✅ |
-✅ |
facebook/opt-125m
@@ -324,8 +323,6 @@ pip install "openllm[opt]"
stablelm |
GPTNeoXForCausalLM |
-✅ |
-✅ |
stabilityai/stablelm-tuned-alpha-3b
@@ -346,8 +343,6 @@ pip install openllm
starcoder |
GPTBigCodeForCausalLM |
-❌ |
-✅ |
bigcode/starcoder
@@ -366,8 +361,6 @@ pip install "openllm[starcoder]"
baichuan |
BaiChuanForCausalLM |
-❌ |
-✅ |
baichuan-inc/baichuan-7b
@@ -596,9 +589,12 @@ client.ask_agent(
)
```
+
+

+
## 🚀 Deploying to Production
@@ -664,7 +660,6 @@ the serverless cloud for shipping and scaling AI applications.
[deployment instructions](https://docs.bentoml.com/en/latest/reference/cli.html#bentoml-deployment-create).
-
## 👥 Community
Engage with like-minded individuals passionate about LLMs, AI, and more on our
diff --git a/changelog.d/143.feature.md b/changelog.d/143.feature.md
new file mode 100644
index 00000000..7bcefd94
--- /dev/null
+++ b/changelog.d/143.feature.md
@@ -0,0 +1,5 @@
+Added support for installing from git archives (via `.git_archival.txt`)
+
+```bash
+pip install "https://github.com/bentoml/openllm/archive/main.tar.gz"
+```
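
A quick way to check that such an archive-based install resolved a real version (rather than the `0.0.0` fallback configured in `hatch.toml`) is to query the installed metadata:

```python
import importlib.metadata

# Prints the version recorded at build time, e.g. something like "0.2.11.dev3" for a snapshot install.
print(importlib.metadata.version("openllm"))
```
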
diff --git a/hatch.toml b/hatch.toml
index d200c3d3..b6b0fda3 100644
--- a/hatch.toml
+++ b/hatch.toml
@@ -1,5 +1,63 @@
+[metadata.hooks.fancy-pypi-readme]
+content-type = "text/markdown"
+# PyPI doesn't support the <picture> tag.
+[[metadata.hooks.fancy-pypi-readme.fragments]]
+text = """
+
+
+
+
+
+
+"""
+[[metadata.hooks.fancy-pypi-readme.fragments]]
+path = "README.md"
+end-before = "\n"
+[[metadata.hooks.fancy-pypi-readme.fragments]]
+text = """
+
+
+
+
+"""
+[[metadata.hooks.fancy-pypi-readme.fragments]]
+path = "README.md"
+start-after = "\n"
+end-before = "\n"
+[[metadata.hooks.fancy-pypi-readme.fragments]]
+text = """
+
+
+
+
+"""
+[[metadata.hooks.fancy-pypi-readme.fragments]]
+path = "README.md"
+start-after = "\n"
+[[metadata.hooks.fancy-pypi-readme.fragments]]
+text = """
+
+## Release Information
+
+"""
+[[metadata.hooks.fancy-pypi-readme.fragments]]
+path = "CHANGELOG.md"
+pattern = "\n(###.+?\n)## "
+[[metadata.hooks.fancy-pypi-readme.fragments]]
+text = """
+
+---
+
+[Click me for full changelog](https://github.com/bentoml/openllm/blob/main/CHANGELOG.md)
+"""
[version]
-path = "src/openllm/__about__.py"
+fallback-version = "0.0.0"
+source = "vcs"
+[build.hooks.vcs]
+version-file = "src/openllm/_version.py"
+[version.raw-options]
+git_describe_command = ["git", "describe", "--dirty", "--tags", "--long", "--first-parent"]
+local_scheme = "no-local-version"
[metadata]
allow-direct-references = true
[build.targets.wheel]
@@ -16,6 +74,8 @@ dependencies = [
"tomlkit",
# NOTE: Using under ./tools/update-readme.py
"markdown-it-py",
+ # NOTE: For fancy PyPI readme
+ "hatch-fancy-pypi-readme",
]
[envs.default.scripts]
changelog = "towncrier build --version main --draft"
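
With `source = "vcs"` the version is now derived from `git describe` at build time and written to the generated `src/openllm/_version.py` (hence the new `.gitignore` entry). A minimal sketch of a runtime lookup, assuming the layout hatch-vcs/setuptools-scm generates:

```python
# Sketch only: the generated file's exact contents come from hatch-vcs/setuptools-scm;
# "0.0.0" mirrors the fallback-version configured above.
try:
    from openllm._version import __version__
except ImportError:
    import importlib.metadata

    try:
        __version__ = importlib.metadata.version("openllm")
    except importlib.metadata.PackageNotFoundError:
        __version__ = "0.0.0"
```
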
diff --git a/pyproject.toml b/pyproject.toml
index ce41555d..6868fbcd 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -2,7 +2,7 @@
# project.classifiers, project.dependencies, project.optional-dependencies, project.urls
[build-system]
build-backend = "hatchling.build"
-requires = ["hatchling"]
+requires = ["hatchling", "hatch-vcs", "hatch-fancy-pypi-readme"]
[project]
authors = [{ name = "Aaron Pham", email = "aarnphm@bentoml.com" }]
@@ -48,7 +48,7 @@ dependencies = [
"bitsandbytes<0.42",
]
description = 'OpenLLM: Operating LLMs in production'
-dynamic = ["version"]
+dynamic = ["version", "readme"]
keywords = [
"MLOps",
"AI",
@@ -65,7 +65,6 @@ keywords = [
]
license = "Apache-2.0"
name = "openllm"
-readme = "README.md"
requires-python = ">=3.8"
[project.scripts]
@@ -258,7 +257,6 @@ omit = [
"__pypackages__/*",
"src/openllm/playground/",
"src/openllm/__init__.py",
- "src/openllm/__about__.py",
"src/openllm/__main__.py",
"src/openllm/utils/dummy_*.py",
]
@@ -281,7 +279,6 @@ omit = [
"__pypackages__/*",
"src/openllm/playground/",
"src/openllm/__init__.py",
- "src/openllm/__about__.py",
"src/openllm/__main__.py",
"src/openllm/utils/dummy_*.py",
]
@@ -294,7 +291,6 @@ exclude = [
"__pypackages__/*",
"src/openllm/playground/",
"src/openllm/__init__.py",
- "src/openllm/__about__.py",
"src/openllm/__main__.py",
"src/openllm/utils/dummy_*.py",
]
diff --git a/src/openllm/__about__.py b/src/openllm/__about__.py
deleted file mode 100644
index 78db6420..00000000
--- a/src/openllm/__about__.py
+++ /dev/null
@@ -1,14 +0,0 @@
-# Copyright 2023 BentoML Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-__version__ = "0.2.10.dev0"
diff --git a/src/openllm/__init__.py b/src/openllm/__init__.py
index c3867f5d..9885cfa3 100644
--- a/src/openllm/__init__.py
+++ b/src/openllm/__init__.py
@@ -16,7 +16,7 @@
An open platform for operating large language models in production. Fine-tune, serve,
deploy, and monitor any LLMs with ease.
-* Built-in support for StableLM, Llama, Dolly, Flan-T5, Vicuna
+* Built-in support for StableLM, Llama 2, Dolly, Flan-T5, Vicuna
* Option to bring your own fine-tuned LLMs
* Online Serving with HTTP, gRPC, SSE(coming soon) or custom API
* Native integration with BentoML and LangChain for custom LLM apps
@@ -24,37 +24,26 @@ deploy, and monitor any LLMs with ease.
from __future__ import annotations
import logging
import os
+import sys
import typing as t
import warnings
from . import utils as utils
-from .__about__ import __version__ as __version__
from .exceptions import MissingDependencyError
if utils.DEBUG:
utils.set_debug_mode(True)
utils.set_quiet_mode(False)
-
logging.basicConfig(level=logging.NOTSET)
else:
# configuration for bitsandbytes before import
os.environ["BITSANDBYTES_NOWELCOME"] = os.environ.get("BITSANDBYTES_NOWELCOME", "1")
# The following warnings from bitsandbytes, and probably not that important
# for users to see when DEBUG is False
- warnings.filterwarnings(
- "ignore", message="MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization"
- )
- warnings.filterwarnings(
- "ignore", message="MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization"
- )
- warnings.filterwarnings(
- "ignore",
- message=(
- "The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers and GPU quantization"
- " are unavailable."
- ),
- )
+ warnings.filterwarnings("ignore", message="MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization")
+ warnings.filterwarnings("ignore", message="MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization")
+ warnings.filterwarnings("ignore", message="The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers and GPU quantization are unavailable.")
_import_structure: dict[str, list[str]] = {
@@ -73,14 +62,7 @@ _import_structure: dict[str, list[str]] = {
"serialisation": ["ggml", "transformers"],
"cli": ["start", "start_grpc", "build", "import_model", "list_models"],
# NOTE: models
- "models.auto": [
- "AutoConfig",
- "CONFIG_MAPPING",
- "MODEL_MAPPING_NAMES",
- "MODEL_FLAX_MAPPING_NAMES",
- "MODEL_TF_MAPPING_NAMES",
- "MODEL_VLLM_MAPPING_NAMES",
- ],
+ "models.auto": ["AutoConfig", "CONFIG_MAPPING", "MODEL_MAPPING_NAMES", "MODEL_FLAX_MAPPING_NAMES", "MODEL_TF_MAPPING_NAMES", "MODEL_VLLM_MAPPING_NAMES", ],
"models.chatglm": ["ChatGLMConfig"],
"models.baichuan": ["BaichuanConfig"],
"models.dolly_v2": ["DollyV2Config"],
@@ -96,50 +78,34 @@ _import_structure: dict[str, list[str]] = {
# NOTE: torch and cpm_kernels
try:
- if not (utils.is_torch_available() and utils.is_cpm_kernels_available()):
- raise MissingDependencyError
+ if not (utils.is_torch_available() and utils.is_cpm_kernels_available()): raise MissingDependencyError
except MissingDependencyError:
from .utils import dummy_pt_and_cpm_kernels_objects
-
- _import_structure["utils.dummy_pt_and_cpm_kernels_objects"] = [
- name for name in dir(dummy_pt_and_cpm_kernels_objects) if not name.startswith("_")
- ]
+ _import_structure["utils.dummy_pt_and_cpm_kernels_objects"] = [name for name in dir(dummy_pt_and_cpm_kernels_objects) if not name.startswith("_")]
else:
_import_structure["models.chatglm"].extend(["ChatGLM"])
_import_structure["models.baichuan"].extend(["Baichuan"])
-
try:
- if not (utils.is_torch_available() and utils.is_einops_available()):
- raise MissingDependencyError
+ if not (utils.is_torch_available() and utils.is_einops_available()): raise MissingDependencyError
except MissingDependencyError:
from .utils import dummy_pt_and_einops_objects
-
- _import_structure["utils.dummy_pt_and_einops_objects"] = [
- name for name in dir(dummy_pt_and_einops_objects) if not name.startswith("_")
- ]
+ _import_structure["utils.dummy_pt_and_einops_objects"] = [name for name in dir(dummy_pt_and_einops_objects) if not name.startswith("_")]
else:
_import_structure["models.falcon"].extend(["Falcon"])
-
try:
- if not (utils.is_torch_available() and utils.is_triton_available()):
- raise MissingDependencyError
+ if not (utils.is_torch_available() and utils.is_triton_available()): raise MissingDependencyError
except MissingDependencyError:
from .utils import dummy_pt_and_triton_objects
-
- _import_structure["utils.dummy_pt_and_triton_objects"] = [
- name for name in dir(dummy_pt_and_triton_objects) if not name.startswith("_")
- ]
+ _import_structure["utils.dummy_pt_and_triton_objects"] = [name for name in dir(dummy_pt_and_triton_objects) if not name.startswith("_")]
else:
_import_structure["models.mpt"].extend(["MPT"])
try:
- if not utils.is_torch_available():
- raise MissingDependencyError
+ if not utils.is_torch_available(): raise MissingDependencyError
except MissingDependencyError:
from .utils import dummy_pt_objects
-
_import_structure["utils.dummy_pt_objects"] = [name for name in dir(dummy_pt_objects) if not name.startswith("_")]
else:
_import_structure["models.flan_t5"].extend(["FlanT5"])
@@ -152,45 +118,34 @@ else:
_import_structure["models.auto"].extend(["AutoLLM", "MODEL_MAPPING"])
try:
- if not utils.is_vllm_available():
- raise MissingDependencyError
+ if not utils.is_vllm_available(): raise MissingDependencyError
except MissingDependencyError:
from .utils import dummy_vllm_objects
-
- _import_structure["utils.dummy_vllm_objects"] = [
- name for name in dir(dummy_vllm_objects) if not name.startswith("_")
- ]
+ _import_structure["utils.dummy_vllm_objects"] = [name for name in dir(dummy_vllm_objects) if not name.startswith("_")]
else:
_import_structure["models.llama"].extend(["VLLMLlaMA"])
_import_structure["models.auto"].extend(["AutoVLLM", "MODEL_VLLM_MAPPING"])
try:
- if not utils.is_flax_available():
- raise MissingDependencyError
+ if not utils.is_flax_available(): raise MissingDependencyError
except MissingDependencyError:
from .utils import dummy_flax_objects
-
- _import_structure["utils.dummy_flax_objects"] = [
- name for name in dir(dummy_flax_objects) if not name.startswith("_")
- ]
+ _import_structure["utils.dummy_flax_objects"] = [name for name in dir(dummy_flax_objects) if not name.startswith("_")]
else:
_import_structure["models.flan_t5"].extend(["FlaxFlanT5"])
_import_structure["models.opt"].extend(["FlaxOPT"])
_import_structure["models.auto"].extend(["AutoFlaxLLM", "MODEL_FLAX_MAPPING"])
try:
- if not utils.is_tf_available():
- raise MissingDependencyError
+ if not utils.is_tf_available(): raise MissingDependencyError
except MissingDependencyError:
from .utils import dummy_tf_objects
-
_import_structure["utils.dummy_tf_objects"] = [name for name in dir(dummy_tf_objects) if not name.startswith("_")]
else:
_import_structure["models.flan_t5"].extend(["TFFlanT5"])
_import_structure["models.opt"].extend(["TFOPT"])
_import_structure["models.auto"].extend(["AutoTFLLM", "MODEL_TF_MAPPING"])
-
# declaration for OpenLLM-related modules
if t.TYPE_CHECKING:
from . import bundle as bundle
@@ -244,8 +199,7 @@ if t.TYPE_CHECKING:
# NOTE: torch and cpm_kernels
try:
- if not (utils.is_torch_available() and utils.is_cpm_kernels_available()):
- raise MissingDependencyError
+ if not (utils.is_torch_available() and utils.is_cpm_kernels_available()): raise MissingDependencyError
except MissingDependencyError:
from .utils.dummy_pt_and_cpm_kernels_objects import *
else:
@@ -254,8 +208,7 @@ if t.TYPE_CHECKING:
# NOTE: torch and einops
try:
- if not (utils.is_torch_available() and utils.is_einops_available()):
- raise MissingDependencyError
+ if not (utils.is_torch_available() and utils.is_einops_available()): raise MissingDependencyError
except MissingDependencyError:
from .utils.dummy_pt_and_einops_objects import *
else:
@@ -263,16 +216,14 @@ if t.TYPE_CHECKING:
# NOTE: torch and triton
try:
- if not (utils.is_torch_available() and utils.is_triton_available()):
- raise MissingDependencyError
+ if not (utils.is_torch_available() and utils.is_triton_available()): raise MissingDependencyError
except MissingDependencyError:
from .utils.dummy_pt_and_triton_objects import *
else:
from .models.mpt import MPT as MPT
try:
- if not utils.is_torch_available():
- raise MissingDependencyError
+ if not utils.is_torch_available(): raise MissingDependencyError
except MissingDependencyError:
from .utils.dummy_pt_objects import *
else:
@@ -287,8 +238,7 @@ if t.TYPE_CHECKING:
from .models.starcoder import StarCoder as StarCoder
try:
- if not utils.is_vllm_available():
- raise MissingDependencyError
+ if not utils.is_vllm_available(): raise MissingDependencyError
except MissingDependencyError:
from .utils.dummy_vllm_objects import *
else:
@@ -297,8 +247,7 @@ if t.TYPE_CHECKING:
from .models.llama import VLLMLlaMA as VLLMLlaMA
try:
- if not utils.is_flax_available():
- raise MissingDependencyError
+ if not utils.is_flax_available(): raise MissingDependencyError
except MissingDependencyError:
from .utils.dummy_flax_objects import *
else:
@@ -308,8 +257,7 @@ if t.TYPE_CHECKING:
from .models.opt import FlaxOPT as FlaxOPT
try:
- if not utils.is_tf_available():
- raise MissingDependencyError
+ if not utils.is_tf_available(): raise MissingDependencyError
except MissingDependencyError:
from .utils.dummy_tf_objects import *
else:
@@ -318,20 +266,10 @@ if t.TYPE_CHECKING:
from .models.flan_t5 import TFFlanT5 as TFFlanT5
from .models.opt import TFOPT as TFOPT
-else:
- import sys
-
- sys.modules[__name__] = utils.LazyModule(
- __name__,
- globals()["__file__"],
- _import_structure,
- module_spec=__spec__,
- doc=__doc__,
- extra_objects={
- "__version__": __version__,
- # The below is a special mapping that allows openllm to be used as a dictionary.
- # This is purely for convenience sake, and should not be used in performance critcal
- # code. This is also not considered as a public API.
- "__openllm_special__": {"flax": "AutoFlaxLLM", "tf": "AutoTFLLM", "pt": "AutoLLM", "vllm": "AutoVLLM"},
- },
- )
+else: sys.modules[__name__] = utils.LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__, doc=__doc__,
+ extra_objects={
+ # The below is a special mapping that allows openllm to be used as a dictionary.
+  # This is purely for convenience's sake, and should not be used in performance-critical
+  # code. This is also not considered a public API.
+ "__openllm_special__": {"flax": "AutoFlaxLLM", "tf": "AutoTFLLM", "pt": "AutoLLM", "vllm": "AutoVLLM"},
+ })
diff --git a/src/openllm/cli.py b/src/openllm/cli.py
index 119cc0b2..9655c429 100644
--- a/src/openllm/cli.py
+++ b/src/openllm/cli.py
@@ -69,7 +69,6 @@ import openllm
from bentoml._internal.configuration.containers import BentoMLContainer
from bentoml._internal.models.model import ModelStore
-from .__about__ import __version__
from .exceptions import OpenLLMException
from .utils import DEBUG
from .utils import ENV_VARS_TRUE_VALUES
@@ -403,7 +402,7 @@ class OpenLLMCommandGroup(BentoMLCommandGroup):
return wrapper
@click.group(cls=OpenLLMCommandGroup, context_settings=_CONTEXT_SETTINGS, name="openllm")
-@click.version_option(__version__, "--version", "-v")
+@click.version_option(None, "--version", "-v")
def cli() -> None:
"""\b
██████╗ ██████╗ ███████╗███╗ ██╗██╗ ██╗ ███╗ ███╗
diff --git a/src/openllm/utils/lazy.py b/src/openllm/utils/lazy.py
index 6549e891..0c9c940e 100644
--- a/src/openllm/utils/lazy.py
+++ b/src/openllm/utils/lazy.py
@@ -13,30 +13,85 @@
# limitations under the License.
from __future__ import annotations
+import functools
import importlib
import importlib.machinery
+import importlib.metadata
import itertools
import os
+import time
import types
import typing as t
import warnings
+import attr
+
from ..exceptions import ForbiddenAttributeError
from ..exceptions import OpenLLMException
class UsageNotAllowedError(OpenLLMException):
"""Raised when LazyModule.__getitem__ is forbidden."""
-
-
class MissingAttributesError(OpenLLMException):
"""Raised when given keys is not available in LazyModule special mapping."""
+@functools.total_ordering
+@attr.attrs(eq=False, order=False, slots=True, frozen=True)
+class VersionInfo:
+ """A version object that can be compared to a tuple of length 1--4.
-_sentinel = object()
+ ```python
+ >>> VersionInfo(19, 1, 0, "final") <= (19, 2)
+ True
+ >>> VersionInfo(19, 1, 0, "final") < (19, 1, 1)
+ True
+ >>> vi = VersionInfo(19, 2, 0, "final")
+ >>> vi < (19, 1, 1)
+ False
+ >>> vi < (19,)
+ False
+ >>> vi == (19, 2,)
+ True
+ >>> vi == (19, 2, 1)
+ False
+ ```
+ Vendored from attrs.
+ """
+ major: int = attr.field()
+ minor: int = attr.field()
+ micro: int = attr.field()
+ releaselevel: str = attr.field()
-_reserved_namespace = {"__openllm_special__", "__openllm_migration__"}
+ @classmethod
+ def from_version_string(cls, s: str) -> VersionInfo:
+ """Parse *s* and return a VersionInfo."""
+ v = s.split(".")
+ if len(v) == 3: v.append("final")
+ return cls(major=int(v[0]), minor=int(v[1]), micro=int(v[2]), releaselevel=v[3])
+ def _ensure_tuple(self, other: VersionInfo | tuple[t.Any, ...]) -> tuple[tuple[int, int, int, str], tuple[int, int, int, str]]:
+ """Ensure *other* is a tuple of a valid length.
+ Returns a possibly transformed *other* and ourselves as a tuple of
+ the same length as *other*.
+ """
+ if self.__class__ is other.__class__: other = attr.astuple(other)
+ if not isinstance(other, tuple): raise NotImplementedError
+ if not (1 <= len(other) <= 4): raise NotImplementedError
+ return attr.astuple(self)[: len(other)], other
+
+ def __eq__(self, other: t.Any) -> bool:
+ try: us, them = self._ensure_tuple(other)
+ except NotImplementedError: return NotImplemented
+ return us == them
+
+ def __lt__(self, other: t.Any) -> bool:
+ try: us, them = self._ensure_tuple(other)
+ except NotImplementedError: return NotImplemented
+ # Since alphabetically "dev0" < "final" < "post1" < "post2", we don't
+ # have to do anything special with releaselevel for now.
+ return us < them
+
+_sentinel, _reserved_namespace = object(), {"__openllm_special__", "__openllm_migration__"}
class LazyModule(types.ModuleType):
"""Module class that surfaces all objects but only performs associated imports when the objects are requested.
@@ -86,7 +141,6 @@ class LazyModule(types.ModuleType):
self._objects = _extra_objects
self._name = name
self._import_structure = import_structure
-
def __dir__(self) -> list[str]:
"""Needed for autocompletion in an IDE."""
result = t.cast("list[str]", super().__dir__())
@@ -95,61 +149,62 @@ class LazyModule(types.ModuleType):
# they have been accessed or not. So we only add the
# elements of self.__all__ that are not already in the dir.
return result + [i for i in self.__all__ if i not in result]
-
def __getitem__(self, key: str) -> t.Any:
"""This is reserved to only internal uses and users shouldn't use this."""
- if self._objects.get("__openllm_special__") is None:
- raise UsageNotAllowedError(f"'{self._name}' is not allowed to be used as a dict.")
+ if self._objects.get("__openllm_special__") is None: raise UsageNotAllowedError(f"'{self._name}' is not allowed to be used as a dict.")
_special_mapping = self._objects.get("__openllm_special__", {})
try:
- if key in _special_mapping:
- return getattr(self, _special_mapping.__getitem__(key))
+ if key in _special_mapping: return getattr(self, _special_mapping.__getitem__(key))
raise MissingAttributesError(f"Requested '{key}' is not available in given mapping.")
- except AttributeError as e:
- raise KeyError(f"'{self._name}' has no attribute {_special_mapping[key]}") from e
- except Exception as e:
- raise KeyError(f"Failed to lookup '{key}' in '{self._name}'") from e
-
+ except AttributeError as e: raise KeyError(f"'{self._name}' has no attribute {_special_mapping[key]}") from e
+ except Exception as e: raise KeyError(f"Failed to lookup '{key}' in '{self._name}'") from e
def __getattr__(self, name: str) -> t.Any:
"""Equivocal __getattr__ implementation.
It checks from _objects > _modules and does it recursively.
+
+ It also contains a special case for all of the metadata information, such as __version__ and __version_info__.
"""
- if name in _reserved_namespace:
- raise ForbiddenAttributeError(
- f"'{name}' is a reserved namespace for {self._name} and should not be access nor modified."
- )
+    if name in _reserved_namespace: raise ForbiddenAttributeError(f"'{name}' is a reserved namespace for {self._name} and should not be accessed nor modified.")
+ dunder_to_metadata = {
+ "__title__": "Name",
+ "__copyright__": "",
+ "__version__": "version",
+ "__version_info__": "version",
+ "__description__": "summary",
+ "__uri__": "",
+ "__url__": "",
+ "__author__": "",
+ "__email__": "",
+ "__license__": "license",
+ "__homepage__": "",
+ }
+ if name in dunder_to_metadata:
+ if name not in {"__version_info__", "__copyright__", "__version__"}: warnings.warn(f"Accessing '{self._name}.{name}' is deprecated. Please consider using 'importlib.metadata' directly to query for openllm packaging metadata.", DeprecationWarning, stacklevel=2)
+ meta = importlib.metadata.metadata("openllm")
+ project_url = dict(url.split(", ") for url in meta.get_all("Project-URL"))
+ if name == "__license__": return "Apache-2.0"
+ elif name == "__copyright__": return f"Copyright (c) 2023-{time.strftime('%Y')}, Aaron Pham et al."
+ elif name in ("__uri__", "__url__"): return project_url["GitHub"]
+ elif name == "__homepage__": return project_url["Homepage"]
+ elif name == "__version_info__": return VersionInfo.from_version_string(meta["version"]) # similar to how attrs handle __version_info__
+ elif name == "__author__": return meta["Author-email"].rsplit(" ", 1)[0]
+ elif name == "__email__": return meta["Author-email"].rsplit("<", 1)[1][:-1]
+ return meta[dunder_to_metadata[name]]
if "__openllm_migration__" in self._objects:
cur_value = self._objects["__openllm_migration__"].get(name, _sentinel)
if cur_value is not _sentinel:
- warnings.warn(
- f"'{name}' is deprecated and will be removed in future version. Make sure to use '{cur_value}' instead",
- DeprecationWarning,
- stacklevel=3,
- )
+      warnings.warn(f"'{name}' is deprecated and will be removed in a future version. Make sure to use '{cur_value}' instead", DeprecationWarning, stacklevel=3)
return getattr(self, cur_value)
- if name in self._objects:
- return self._objects.__getitem__(name)
- if name in self._modules:
- value = self._get_module(name)
- elif name in self._class_to_module.keys():
- module = self._get_module(self._class_to_module.__getitem__(name))
- value = getattr(module, name)
- else:
- raise AttributeError(f"module {self.__name__} has no attribute {name}")
-
+ if name in self._objects: return self._objects.__getitem__(name)
+ if name in self._modules: value = self._get_module(name)
+ elif name in self._class_to_module.keys(): value = getattr(self._get_module(self._class_to_module.__getitem__(name)), name)
+ else: raise AttributeError(f"module {self.__name__} has no attribute {name}")
setattr(self, name, value)
return value
-
def _get_module(self, module_name: str) -> types.ModuleType:
- try:
- return importlib.import_module("." + module_name, self.__name__)
- except Exception as e:
- raise RuntimeError(
- f"Failed to import {self.__name__}.{module_name} because of the following error (look up to see its"
- f" traceback):\n{e}"
- ) from e
-
+ try: return importlib.import_module("." + module_name, self.__name__)
+ except Exception as e: raise RuntimeError(f"Failed to import {self.__name__}.{module_name} because of the following error (look up to see its traceback):\n{e}") from e
def __reduce__(self) -> tuple[type[LazyModule], tuple[str, str | None, dict[str, list[str]]]]:
"""This is to ensure any given module is pickle-able."""
return (self.__class__, (self._name, self.__file__, self._import_structure))
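
Taken together, the lazy-module changes mean the version dunders are resolved from installed package metadata on first access. Assuming an installed `openllm` distribution, usage looks roughly like:

```python
import openllm

# Resolved lazily from importlib.metadata; no __about__.py involved anymore.
print(openllm.__version__)

# The vendored VersionInfo compares against plain tuples of length 1-4.
if openllm.__version_info__ >= (0, 2):
    print("0.2 or newer")

# Other metadata dunders (e.g. __description__) still resolve, but emit a DeprecationWarning.
print(openllm.__description__)
```
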
diff --git a/tools/update-readme.py b/tools/update-readme.py
index 29ee132c..6c819f82 100755
--- a/tools/update-readme.py
+++ b/tools/update-readme.py
@@ -30,23 +30,15 @@ END_COMMENT = f"\n"
ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-
def main() -> int:
- with open(os.path.join(ROOT, "pyproject.toml"), "r") as f:
- deps = tomlkit.parse(f.read()).value["project"]["optional-dependencies"]
-
- with open(os.path.join(ROOT, "README.md"), "r") as f:
- readme = f.readlines()
+ with open(os.path.join(ROOT, "pyproject.toml"), "r") as f: deps = tomlkit.parse(f.read()).value["project"]["optional-dependencies"]
+ with open(os.path.join(ROOT, "README.md"), "r") as f: readme = f.readlines()
start_index, stop_index = readme.index(START_COMMENT), readme.index(END_COMMENT)
- formatted: dict[
- t.Literal["Model", "Architecture", "CPU", "GPU", "URL", "Installation", "Model Ids"], list[str | list[str]]
- ] = {
+ formatted: dict[t.Literal["Model", "Architecture", "URL", "Installation", "Model Ids"], list[str | list[str]]] = {
"Model": [],
"Architecture": [],
"URL": [],
- "CPU": [],
- "GPU": [],
"Model Ids": [],
"Installation": [],
}
@@ -56,8 +48,6 @@ def main() -> int:
formatted["Model"].append(dashed)
formatted["Architecture"].append(config_cls.__openllm_architecture__)
formatted["URL"].append(config_cls.__openllm_url__)
- formatted["GPU"].append("✅")
- formatted["CPU"].append("✅" if not config_cls.__openllm_requires_gpu__ else "❌")
formatted["Model Ids"].append(config_cls.__openllm_model_ids__)
if dashed in deps:
instruction = f'```bash\npip install "openllm[{dashed}]"\n```'
@@ -74,9 +64,7 @@ def main() -> int:
meta.extend([f"{header} | \n" for header in formatted.keys() if header not in ("URL",)])
meta += ["\n"]
# NOTE: rows
- for name, architecture, url, cpu, gpu, model_ids, installation in t.cast(
- t.Iterable[t.Tuple[str, str, str, str, str, t.List[str], str]], zip(*formatted.values())
- ):
+ for name, architecture, url, model_ids, installation in t.cast(t.Iterable[t.Tuple[str, str, str, t.List[str], str]], zip(*formatted.values())):
meta += "\n"
# configure architecture URL
cfg_cls = openllm.CONFIG_MAPPING[name]
@@ -94,8 +82,6 @@ def main() -> int:
[
f"\n| {name} | \n",
arch,
- f"{cpu} | \n",
- f"{gpu} | \n",
]
)
format_with_links: list[str] = []
@@ -107,12 +93,7 @@ def main() -> int:
meta.extend(["\n", "\n"])
readme = readme[:start_index] + [START_COMMENT] + meta + [END_COMMENT] + readme[stop_index + 1 :]
-
- with open(os.path.join(ROOT, "README.md"), "w") as f:
- f.writelines(readme)
-
+ with open(os.path.join(ROOT, "README.md"), "w") as f: f.writelines(readme)
return 0
-
-if __name__ == "__main__":
- raise SystemExit(main())
+if __name__ == "__main__": raise SystemExit(main())