From ded8a9f809681ba70c27722211bf230355dc052f Mon Sep 17 00:00:00 2001
From: Aaron Pham <29749331+aarnphm@users.noreply.github.com>
Date: Fri, 16 Jun 2023 18:10:50 -0400
Subject: [PATCH] feat: quantization (#27)

---
 .github/SECURITY.md                           |  22 ++
 .github/actions/create_release_and_archive.sh |   3 +-
 .github/actions/release.sh                    |   1 +
 .github/actions/setup-repo/action.yml         |   2 +-
 .github/workflows/ci.yml                      |   9 +-
 .pre-commit-config.yaml                       |  32 ++-
 CHANGELOG.md                                  |  19 ++
 DEVELOPMENT.md                                |  64 ++++-
 changelog.d/27.feature.md                     |  14 ++
 changelog.d/template.md.jinja                 |  29 +++
 examples/langchain-chains-demo/README.md      |   6 -
 examples/langchain-tools-demo/README.md       |   6 -
 pyproject.toml                                |  78 +++---
 src/openllm/_configuration.py                 | 228 +++++++++++++-----
 src/openllm/_llm.py                           | 185 +++++++++-----
 src/openllm/_package.py                       |  40 ++-
 src/openllm/_schema.py                        |   2 +-
 src/openllm/_service.py                       |   8 +-
 src/openllm/cli.py                            | 125 +++++++---
 .../models/chatglm/configuration_chatglm.py   |   2 +-
 .../models/dolly_v2/configuration_dolly_v2.py |   1 +
 .../models/falcon/configuration_falcon.py     |   1 +
 .../models/flan_t5/configuration_flan_t5.py   |   1 +
 .../models/stablelm/modeling_stablelm.py      |   3 +-
 .../models/starcoder/modeling_starcoder.py    |   6 +-
 src/openllm/utils/__init__.py                 |  99 ++++----
 src/openllm/utils/analytics.py                |   2 +-
 src/openllm/utils/import_utils.py             |   5 +
 tests/_strategies/_configuration.py           |   3 +
 tests/test_configuration.py                   |   5 +-
 tools/assert-model-table-latest               |   5 +-
 tools/update-optional-dependencies.py         |   6 +-
 tools/update-readme.py                        |   8 +-
 33 files changed, 711 insertions(+), 309 deletions(-)
 create mode 100644 .github/SECURITY.md
 create mode 100644 CHANGELOG.md
 create mode 100644 changelog.d/27.feature.md
 create mode 100644 changelog.d/template.md.jinja

diff --git a/.github/SECURITY.md b/.github/SECURITY.md
new file mode 100644
index 00000000..9c8aa6bf
--- /dev/null
+++ b/.github/SECURITY.md
@@ -0,0 +1,22 @@
+# Security Policy
+
+## Supported Versions
+
+We are following [semantic versioning](https://semver.org/) with strict
+backward-compatibility policy. We can ensure that all minor and major versions
+are backward compatible. We are more lenient with patch as the development can
+move quickly.
+
+If you are just using public API, then feel free to always upgrade. Whenever
+there is a breaking change, it will first emit a `DeprecationWarning` for a
+period of 12 months before the old behavior is removed.
+
+> **Warning:** Every package under `openllm` that has an underscore prefix
+> is exempt from this. They are considered private API and can change
+> at any time. However, you can be sure that all public API, classes and
+> functions will be backward-compatible.
+
+## Reporting a Vulnerability
+
+To report a security vulnerability, please send us an
+[email](mailto:contact@bentoml.com).
diff --git a/.github/actions/create_release_and_archive.sh b/.github/actions/create_release_and_archive.sh
index 92f89f2b..58003c2f 100755
--- a/.github/actions/create_release_and_archive.sh
+++ b/.github/actions/create_release_and_archive.sh
@@ -13,7 +13,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-
 set -o errexit -o nounset -o pipefail
 
 # Set by GH actions, see
@@ -41,4 +40,6 @@ All available models: \`\`\`python -m openllm.models\`\`\`
 
 To start a LLM: \`\`\`python -m openllm start dolly-v2\`\`\`
 
+Find more information about this release in the [CHANGELOG.md](https://github.com/bentoml/OpenLLM/blob/main/CHANGELOG.md)
+
 EOF
diff --git a/.github/actions/release.sh b/.github/actions/release.sh
index 15b2a176..ba2311bb 100755
--- a/.github/actions/release.sh
+++ b/.github/actions/release.sh
@@ -35,6 +35,7 @@ echo "Releasing version $RELEASE_VERSION..." && hatch version "${RELEASE_VERSION
 
 jq --arg release_version "${RELEASE_VERSION}" '.version = $release_version' < package.json > package.json.tmp && mv package.json.tmp package.json
 
+towncrier build --yes --version "${RELEASE_VERSION}" && git add CHANGELOG.md changelog.d
 git add src/openllm/__about__.py package.json && git commit -sm "infra: prepare for release ${RELEASE_VERSION} [generated]"
 git push origin main
 
diff --git a/.github/actions/setup-repo/action.yml b/.github/actions/setup-repo/action.yml
index 93caca2c..ddfd0a27 100644
--- a/.github/actions/setup-repo/action.yml
+++ b/.github/actions/setup-repo/action.yml
@@ -53,7 +53,7 @@ runs:
           ${{ steps.get-cache-key-prefix.outputs.prefix }}-pypi-
     - name: Install dependencies
       shell: bash
-      run: pip install -e ".[all]" hatch -vv
+      run: pip install -e ".[all]" hatch towncrier -vv
     - name: Install pyright
       shell: bash
       run: npm install -g npm@^7 pyright
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 9d762793..a4df22a1 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -22,8 +22,7 @@ on:
 env:
   LINES: 120
   COLUMNS: 120
-  BENTOML_DO_NOT_TRACK: True
-  PYTEST_PLUGINS: bentoml.testing.pytest.plugin
+  OPENLLM_DO_NOT_TRACK: True
 # https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#defaultsrun
 defaults:
   run:
@@ -38,8 +37,10 @@ jobs:
           fetch-depth: 0
       - name: Setup CI
         uses: ./.github/actions/setup-repo
-      - name: Format check
-        run: hatch run dev:style
+      - name: Running changelog check
+        run: hatch run changelog
+      - name: Format and lint check
+        run: hatch run fmt
       - name: Type check
         if: ${{ github.event_name == 'pull_request' }}
         run: git diff --name-only --diff-filter=AM "origin/$GITHUB_BASE_REF" -z -- '*.py{,i}' | xargs -0 --no-run-if-empty hatch run dev:typing
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 97fc3b23..6b63fa2a 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -13,20 +13,31 @@
 # limitations under the License.
 
 ci:
-  autoupdate_schedule: monthly
+  autoupdate_schedule: weekly
+  skip: [check-models-table-update, check-models-table-update]
+exclude: '.*\.(css|js|svg)$'
 repos:
-  - repo: local
+  - repo: https://github.com/charliermarsh/ruff-pre-commit
+    rev: 'v0.0.272'
     hooks:
-      - id: format-check
-        name: format-check
-        language: system
-        entry: hatch run dev:style
-        always_run: true
-        pass_filenames: false
+      - id: ruff
+        args: [--fix, --exit-non-zero-on-fix, --show-fixes]
+  - repo: https://github.com/psf/black
+    rev: 23.3.0
+    hooks:
+      - id: black-jupyter
+        files: '/(src|tests|docs|examples|typings)/'
+  - repo: https://github.com/econchick/interrogate
+    rev: 1.5.0
+    hooks:
+      - id: interrogate
+        types: [python]
+        exclude: ^(docs|tools|tests)
+        args: [--config=pyproject.toml]
   - repo: local
     hooks:
       - id: check-license-header
-        name: license-header-check
+        name: check for license headers
         entry: ./tools/assert-license-headers
         language: script
         exclude_types:
@@ -36,13 +47,14 @@ repos:
         exclude: |
           (?x)^(
               tools/.*|
+              changelog.d/.*|
               typings/.*|
               .github/.*
           )$
   - repo: local
     hooks:
       - id: check-models-table-update
-        name: check-models-table-update
+        name: check if table in README.md is up-to-date
         entry: ./tools/assert-model-table-latest
         language: script
         files: README.md
diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 00000000..17f03705
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,19 @@
+# Changelog
+
+We are following [semantic versioning](https://semver.org/) with strict
+backward-compatibility policy.
+
+You can find our backwards-compatibility policy
+[here](https://github.com/bentoml/openllm/blob/main/.github/SECURITY.md).
+
+Changes for the upcoming release can be found in the
+['changelog.d' directory](https://github.com/bentoml/openllm/tree/main/changelog.d)
+in our repository.
+
+<!--
+Do *NOT* add changelog entries here!
+
+This changelog is managed by towncrier and is compiled at release time.
+-->
+
+<!-- towncrier release notes start -->
diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md
index cec32e84..9c94eb1b 100644
--- a/DEVELOPMENT.md
+++ b/DEVELOPMENT.md
@@ -100,7 +100,7 @@ After setting up your environment, here's how you can start contributing:
 3. Run all formatter and linter with `hatch`:
 
    ```bash
-   hatch run dev:fmt
+   hatch run fmt
    ```
 4. Write tests that verify your feature or fix (see
    [Writing Tests](#writing-tests) below).
@@ -127,8 +127,8 @@ After setting up your environment, here's how you can start contributing:
 ## Using a custom fork
 
 If you wish to use a modified version of OpenLLM, install your fork from source
-with `pip install -e` and set `OPENLLM_DEV_BUILD=True`, so that Bentos built will
-include the generated wheels for OpenLLM in the bundle.
+with `pip install -e` and set `OPENLLM_DEV_BUILD=True`, so that Bentos built
+will include the generated wheels for OpenLLM in the bundle.
 
 ## Writing Tests
 
@@ -154,3 +154,61 @@ To release a new version, use `./tools/run-release-action`. It requires `gh`,
 ```
 
 > Note that currently this workflow can only be run by the BentoML team.
+
+## Changelog
+
+_modeled after the [attrs](https://github.com/python-attrs/attrs) workflow_
+
+If the change is noteworthy, there needs to be a changelog entry so users can
+learn about it!
+
+To avoid merge conflicts, we use the
+[_Towncrier_](https://pypi.org/project/towncrier) package to manage our
+changelog. _towncrier_ uses independent _Markdown_ files for each pull request –
+so-called _news fragments_ – instead of one monolithic changelog file. On
+release, those news fragments are compiled into
+[`CHANGELOG.md`](https://github.com/bentoml/openllm/blob/main/CHANGELOG.md).
+
+You don't need to install _Towncrier_ yourself; you just have to abide by a few
+simple rules:
+
+- For each pull request, add a new file into `changelog.d` with a filename
+  adhering to the `<pr#>.(change|deprecation|breaking|feature).md` schema: For
+  example, `changelog.d/42.change.md` for a non-breaking change that is proposed
+  in pull request #42.
+- As with other docs, please use [semantic newlines] within news fragments.
+- Wrap symbols like modules, functions, or classes into backticks so they are
+  rendered in a `monospace font`.
+- Wrap arguments into asterisks like in docstrings:
+  `Added new argument *an_argument*.`
+- If you mention functions or other callables, add parentheses at the end of
+  their names: `openllm.func()` or `openllm.LLMClass.method()`. This makes the
+  changelog a lot more readable.
+- Prefer simple past tense or constructions with "now". For example:
+
+  - Added `LLM.func()`.
+  - `LLM.func()` now doesn't do X.Y.Z anymore when passed the _foobar_ argument.
+- If you want to reference multiple issues, copy the news fragment to another
+  filename. _Towncrier_ will merge all news fragments with identical contents
+  into one entry with multiple links to the respective pull requests.
+
+Example entries:
+
+```md
+Added `LLM.func()`.
+The feature really _is_ awesome.
+```
+
+or:
+
+```md
+`openllm.utils.func()` now doesn't X.Y.Z anymore when passed the _foobar_ argument.
+The bug really _was_ nasty.
+```
+
+---
+
+`hatch run changelog` will render the current changelog to the terminal if you have
+any doubts.
+
+[semantic newlines]: https://rhodesmill.org/brandon/2012/one-sentence-per-line/
diff --git a/changelog.d/27.feature.md b/changelog.d/27.feature.md
new file mode 100644
index 00000000..8ebb70cb
--- /dev/null
+++ b/changelog.d/27.feature.md
@@ -0,0 +1,14 @@
+Added support for quantization during serving time.
+`openllm start` now supports `--quantize 8bit` and `--quantize 4bit`.
+`GPTQ` quantization support is on the roadmap and currently
+being worked on.
+`openllm start` now also supports `--bettertransformer` to use
+`BetterTransformer` for serving.
+Refactored `openllm.LLMConfig` to be able to use with `__getitem__`
+to access the config value: `openllm.DollyV2Config()['requirements']`,
+the order being: `__openllm_*__ > self.<key> > __openllm_generation_class__ > __openllm_extras__`.
+Added `towncrier` workflow to easily generate changelog entries.
+Added `use_pipeline`, `bettertransformer` flag into ModelSettings.
+`LLMConfig` now supports the `__dataclass_transform__` protocol to help
+with type-checking.
+Changed `openllm download-models` to `openllm download`.
diff --git a/changelog.d/template.md.jinja b/changelog.d/template.md.jinja
new file mode 100644
index 00000000..a915699e
--- /dev/null
+++ b/changelog.d/template.md.jinja
@@ -0,0 +1,29 @@
+{%- if versiondata["version"] == "main" -%}
+## Changes for the Upcoming Release
+
+:::{warning}
+These changes reflect the current [development progress](https://github.com/bentoml/openllm/tree/main) and have **not** been part of an official PyPI release yet.
+To try out the latest change, one can do: `pip install -U git+https://github.com/bentoml/openllm.git@main`
+:::
+{% else -%}
+## [{{ versiondata["version"] }}](https://github.com/bentoml/openllm/tree/{{ versiondata["version"] }})
+{%- endif %}
+
+{% for section, _ in sections.items() %}
+{% if sections[section] %}
+{% for category, val in definitions.items() if category in sections[section] %}
+
+### {{ definitions[category]['name'] }}
+
+{% for text, values in sections[section][category].items() %}
+- {{ text }}
+  {{ values|join(',\n  ') }}
+{% endfor %}
+
+{% endfor %}
+{% else %}
+No significant changes.
+
+
+{% endif %}
+{% endfor %}
diff --git a/examples/langchain-chains-demo/README.md b/examples/langchain-chains-demo/README.md
index b626d086..d0020d41 100644
--- a/examples/langchain-chains-demo/README.md
+++ b/examples/langchain-chains-demo/README.md
@@ -23,9 +23,3 @@ docker run \
   ..image_name
 
 ```
-
-
-
-
-
-
diff --git a/examples/langchain-tools-demo/README.md b/examples/langchain-tools-demo/README.md
index b6562938..b76f9a13 100644
--- a/examples/langchain-tools-demo/README.md
+++ b/examples/langchain-tools-demo/README.md
@@ -24,9 +24,3 @@ docker run \
   ..image_name
 
 ```
-
-
-
-
-
-
diff --git a/pyproject.toml b/pyproject.toml
index f5019bb0..b5cec136 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -104,6 +104,7 @@ packages = ["src/openllm", "src/openllm_client"]
 [tool.hatch.envs.default]
 dependencies = [
     "coverage[toml]>=6.5",
+    # NOTE: Tests strategies with Hypothesis and pytest, and snapshot testing with syrupy
     "pytest",
     "pytest-asyncio>=0.21.0",
     "pytest-xdist[psutil]",
@@ -111,42 +112,69 @@ dependencies = [
     "pytest-mock",
     "pytest-randomly",
     "pytest-rerunfailures",
+    "hypothesis",
+    "syrupy",
     # NOTE: To run all hooks
     "pre-commit",
     # NOTE: Using under ./tools/update-optional-dependencies.py
     "tomlkit",
     # NOTE: Using under ./tools/update-readme.py
     "markdown-it-py",
-    # NOTE: Tests strategies with Hypothesis
-    "hypothesis",
-    # NOTE: snapshot testing
-    "syrupy",
+    # NOTE: pyright for type
+    "pyright",
+    # NOTE: towncrier for changelog
+    "towncrier",
 ]
 [tool.hatch.envs.default.scripts]
-cov = ["test-cov", "cov-report"]
+changelog = "towncrier build --version main --draft"
+cov = ["cov-test", "cov-report"]
 cov-report = ["- coverage combine", "coverage report"]
+cov-test = "coverage run -m pytest {args:tests}"
+fmt = "pre-commit run --all-files"
 setup = "pre-commit install"
 test = "pytest {args:tests}"
-test-cov = "coverage run -m pytest {args:tests}"
+typing = "pyright {args:src/openllm tests}"
+
+[tool.towncrier]
+directory = "changelog.d"
+filename = "CHANGELOG.md"
+issue_format = "[#{issue}](https://github.com/bentoml/openllm/issues/{issue})"
+name = "openllm"
+start_string = "<!-- towncrier release notes start -->\n"
+template = "changelog.d/template.md.jinja"
+title_format = ""
+underlines = ["", "", ""]
+
+[[tool.towncrier.section]]
+path = ""
+
+[[tool.towncrier.type]]
+directory = "breaking"
+name = "Backwards-incompatible Changes"
+showcontent = true
+
+[[tool.towncrier.type]]
+directory = "deprecation"
+name = "Deprecations"
+showcontent = true
+
+[[tool.towncrier.type]]
+directory = "change"
+name = "Changes"
+showcontent = true
+
+[[tool.towncrier.type]]
+directory = "feature"
+name = "Features"
+showcontent = true
 
 [[tool.hatch.envs.all.matrix]]
 python = ["3.8", "3.9", "3.10", "3.11"]
 
-[tool.hatch.envs.dev]
-dependencies = [
-    "ruff",
-    "pyright",
-    "hatch",
-    # NOTE: black for generating service file.
-    "black[jupyter]==23.3.0",
-]
-detached = true
-
-[tool.hatch.envs.dev.scripts]
-all = ["fmt", "typing"]
-fmt = ["black {args:.}", "black --pyi {args:typings/}", "ruff --fix {args:.}", "style"]
-style = ["ruff {args:.}", "black --check --diff {args:.}"]
-typing = "pyright {args:src/openllm tests}"
+[tool.interrogate]
+fail-under = 100
+verbose = 2
+whitelist-regex = ["test_.*"]
 
 [tool.pytest.ini_options]
 addopts = ["-rfEX", "-pno:warnings"]
@@ -206,12 +234,6 @@ force-single-line = true
 known-first-party = ["openllm", "bentoml", 'transformers']
 lines-after-imports = 2
 
-[tool.ruff.flake8-quotes]
-inline-quotes = "single"
-
-[tool.ruff.flake8-tidy-imports]
-ban-relative-imports = "all"
-
 [tool.ruff.per-file-ignores]
 # Tests can use magic values, assertions, and relative imports
 "__init__.py" = ["E402", "F401", "F403", "F811"]
@@ -222,7 +244,7 @@ ban-relative-imports = "all"
 [tool.pyright]
 analysis.useLibraryCodeForTypes = true
 enableTypeIgnoreComments = true
-include = ["src/", "tests/"]
+include = ["src/", "tests/", "tools/", "examples/"]
 pythonVersion = "3.11"
 reportMissingImports = "none"
 reportMissingModuleSource = "warning"
diff --git a/src/openllm/_configuration.py b/src/openllm/_configuration.py
index 437bd443..279e64b8 100644
--- a/src/openllm/_configuration.py
+++ b/src/openllm/_configuration.py
@@ -65,8 +65,11 @@ from deepmerge.merger import Merger
 
 import openllm
 
-from .exceptions import ForbiddenAttributeError, GpuNotAvailableError, OpenLLMException
-from .utils import DEBUG, LazyType, bentoml_cattr, codegen, dantic, first_not_none, lenient_issubclass
+from .exceptions import (ForbiddenAttributeError, GpuNotAvailableError,
+                         OpenLLMException)
+from .utils import (DEBUG, ENV_VARS_TRUE_VALUES, LazyType, bentoml_cattr,
+                    codegen, dantic, first_not_none, lenient_issubclass,
+                    non_intrusive_setattr)
 
 if hasattr(t, "Required"):
     from typing import Required
@@ -78,6 +81,11 @@ if hasattr(t, "NotRequired"):
 else:
     from typing_extensions import NotRequired
 
+if hasattr(t, "dataclass_transform"):
+    from typing import dataclass_transform
+else:
+    from typing_extensions import dataclass_transform
+
 _T = t.TypeVar("_T")
 
 
@@ -85,7 +93,8 @@ if t.TYPE_CHECKING:
     import tensorflow as tf
     import torch
     import transformers
-    from attr import _CountingAttr, _make_init, _make_repr, _transform_attrs  # type: ignore
+    from attr import (_CountingAttr, _make_init, _make_repr,  # type: ignore
+                      _transform_attrs)
     from transformers.generation.beam_constraints import Constraint
 
     from ._types import ClickFunctionWrapper, F, O_co, P
@@ -103,7 +112,8 @@ else:
     ItemgetterAny = itemgetter
     # NOTE: Using internal API from attr here, since we are actually
     # allowing subclass of openllm.LLMConfig to become 'attrs'-ish
-    from attr._make import _CountingAttr, _make_init, _make_repr, _transform_attrs
+    from attr._make import (_CountingAttr, _make_init, _make_repr,
+                            _transform_attrs)
 
     transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
     torch = openllm.utils.LazyLoader("torch", globals(), "torch")
@@ -369,6 +379,11 @@ class GenerationConfig:
             )
         self.__attrs_init__(**attrs)
 
+    def __getitem__(self, item: str) -> t.Any:
+        if hasattr(self, item):
+            return getattr(self, item)
+        raise KeyError(f"GenerationConfig has no attribute {item}")
+
 
 bentoml_cattr.register_unstructure_hook_factory(
     lambda cls: attr.has(cls) and lenient_issubclass(cls, GenerationConfig),
@@ -411,6 +426,11 @@ class ModelSettings(t.TypedDict, total=False):
     requires_gpu: bool
     trust_remote_code: bool
     requirements: t.Optional[ListStr]
+
+    # llm implementation specifics
+    use_pipeline: bool
+    bettertransformer: bool
+    model_type: t.Literal["causal_lm", "seq2seq_lm"]
     runtime: t.Literal["transformers", "cpp"]
 
     # naming convention, only name_type is needed to infer from the class
@@ -458,19 +478,19 @@ _ModelSettings: type[attr.AttrsInstance] = codegen.add_method_dunders(
 def structure_settings(cl_: type[LLMConfig], cls: type[t.Any]):
     if not lenient_issubclass(cl_, LLMConfig):
         raise RuntimeError(f"Given LLMConfig must be a subclass type of 'LLMConfig', got '{cl_}' instead.")
-    settings = cl_.__config__
 
-    if settings is None:
-        raise RuntimeError("Given LLMConfig must have '__config__' defined.")
+    if not hasattr(cl_, "__config__") or getattr(cl_, "__config__") is None:
+        raise RuntimeError("Given LLMConfig must have '__config__' that is not None defined.")
+
+    settings = cl_.__config__
+    assert settings
 
     required = [i.name for i in attr.fields(cls) if i.metadata.get("required", False)]
-    if any(k not in settings for k in required):
-        raise ValueError(f"The following keys are required under '__config__': {required}")
-    if not settings["default_id"] or not settings["model_ids"]:
-        raise ValueError("Make sure that either 'default_id', 'model_ids' are not emptied under '__config__'.")
 
-    if any(k in settings for k in ("env", "start_name", "model_name")):
-        raise ValueError("The following keys are not allowed under '__config__': env, start_name, model_name")
+    missing = set(required) - set(settings.keys())
+
+    if len(missing) > 0:
+        raise ValueError(f"The following keys are required under '__config__': {required} (missing: {missing})")
 
     if "generation_class" in settings:
         raise ValueError(
@@ -478,10 +498,16 @@ def structure_settings(cl_: type[LLMConfig], cls: type[t.Any]):
             f"all required attributes under '{cl_}.GenerationConfig' when defining the class."
         )
 
+    if not settings["default_id"] or not settings["model_ids"]:
+        raise ValueError("Either 'default_id' or 'model_ids' are emptied under '__config__' (required fields).")
+
+    # NOTE: value in __config__ can be None, hence we use setdefault
+    # to update in-place
     _cl_name = cl_.__name__.replace("Config", "")
-    name_type = first_not_none(settings.get("name_type"), "dasherize")
-    model_name = inflection.underscore(_cl_name) if name_type == "dasherize" else _cl_name.lower()
-    start_name = inflection.dasherize(model_name) if name_type == "dasherize" else model_name
+    name_type = settings.setdefault("name_type", "dasherize")
+    model_name = settings.setdefault(
+        "model_name", inflection.underscore(_cl_name) if name_type == "dasherize" else _cl_name.lower()
+    )
     partialed = functools.partial(_field_env_key, model_name=model_name, suffix="generation")
 
     def auto_env_transformers(_: t.Any, fields: list[attr.Attribute[t.Any]]) -> list[attr.Attribute[t.Any]]:
@@ -498,21 +524,9 @@ def structure_settings(cl_: type[LLMConfig], cls: type[t.Any]):
             for f in fields
         ]
 
-    return cls(
-        default_id=settings["default_id"],
-        model_ids=settings["model_ids"],
-        url=settings.get("url", ""),
-        requires_gpu=settings.get("requires_gpu", False),
-        trust_remote_code=settings.get("trust_remote_code", False),
-        requirements=settings.get("requirements", None),
-        name_type=name_type,
-        model_name=model_name,
-        start_name=start_name,
-        runtime=settings.get("runtime", "transformers"),
-        env=openllm.utils.ModelEnv(model_name),
-        timeout=settings.get("timeout", 3600),
-        workers_per_resource=settings.get("workers_per_resource", 1),
-        generation_class=attr.make_class(
+    settings.setdefault(
+        "generation_class",
+        attr.make_class(
             f"{_cl_name}GenerationConfig",
             [],
             bases=(GenerationConfig,),
@@ -520,10 +534,40 @@ def structure_settings(cl_: type[LLMConfig], cls: type[t.Any]):
             weakref_slot=True,
             frozen=False,
             repr=True,
+            collect_by_mro=True,
             field_transformer=auto_env_transformers,
         ),
     )
 
+    env = settings.setdefault("env", openllm.utils.ModelEnv(model_name))
+    requires_gpu = settings.setdefault("requires_gpu", False)
+
+    # bettertransformer support
+    bettertransformer = settings.setdefault(
+        "bettertransformer",
+        os.environ.get(env.bettertransformer, str(False)).upper() in ENV_VARS_TRUE_VALUES,
+    )
+    if requires_gpu:
+        # For all models that require a GPU, no need to offload it to BetterTransformer
+        # use bitsandbytes or gptq instead for latency improvement
+        if bettertransformer:
+            logger.debug("Model requires GPU by default, disabling bettertransformer.")
+        bettertransformer = False
+    settings["bettertransformer"] = bettertransformer
+
+    # default value
+    settings.setdefault("url", "")
+    settings.setdefault("use_pipeline", False)
+    settings.setdefault("model_type", "causal_lm")
+    settings.setdefault("trust_remote_code", False)
+    settings.setdefault("requirements", None)
+    settings.setdefault("timeout", 3600)
+    settings.setdefault("workers_per_resource", 1)
+    settings.setdefault("runtime", "transformers")
+    settings.setdefault("start_name", inflection.dasherize(model_name) if name_type == "dasherize" else model_name)
+
+    return cls(**settings)
+
 
 bentoml_cattr.register_structure_hook(_ModelSettings, structure_settings)
 
@@ -534,15 +578,16 @@ def _setattr_class(attr_name: str, value_var: t.Any, add_dunder: bool = False):
     We can't use the cached object.__setattr__ since we are setting
     attributes to a class.
     """
-    if add_dunder:
-        return f"setattr(cls, '{attr_name}', __add_dunder(cls, {value_var}))"
-    return f"setattr(cls, '{attr_name}', {value_var})"
+    val = f"__add_dunder(cls, {value_var})" if add_dunder else value_var
+    return f"setattr(cls, '{attr_name}', {val})"
 
 
 _dunder_add = {"generation_class"}
 
 
-def _make_assignment_script(cls: type[LLMConfig], attributes: attr.AttrsInstance) -> t.Callable[..., None]:
+def _make_assignment_script(
+    cls: type[LLMConfig], attributes: attr.AttrsInstance, _prefix: t.LiteralString = "openllm"
+) -> t.Callable[..., None]:
     """Generate the assignment script with prefix attributes __openllm_<value>__"""
     args: ListStr = []
     globs: DictStrAny = {
@@ -555,7 +600,7 @@ def _make_assignment_script(cls: type[LLMConfig], attributes: attr.AttrsInstance
 
     lines: ListStr = ["_getattr = _cached_getattribute_get(_cached_attribute)"]
     for attr_name, field in attr.fields_dict(attributes.__class__).items():
-        arg_name = field.metadata.get("target", f"__openllm_{inflection.underscore(attr_name)}__")
+        arg_name = field.metadata.get("target", f"__{_prefix}_{inflection.underscore(attr_name)}__")
         args.append(f"{attr_name}=getattr(_cached_attribute, '{attr_name}')")
         lines.append(_setattr_class(arg_name, attr_name, add_dunder=attr_name in _dunder_add))
         annotations[attr_name] = field.type
@@ -568,6 +613,23 @@ def _make_assignment_script(cls: type[LLMConfig], attributes: attr.AttrsInstance
 _reserved_namespace = {"__config__", "GenerationConfig"}
 
 
+@dataclass_transform(order_default=True, field_specifiers=(attr.field, dantic.Field))
+def __llm_config_transform__(cls: type[LLMConfig]) -> type[LLMConfig]:
+    kwargs: dict[str, t.Any] = {}
+    if hasattr(cls, "GenerationConfig"):
+        kwargs = {k: v for k, v in vars(cls.GenerationConfig).items() if not k.startswith("_")}
+    non_intrusive_setattr(
+        cls,
+        "__dataclass_transform__",
+        {
+            "order_default": True,
+            "field_specifiers": (attr.field, dantic.Field),
+            "kwargs": kwargs,
+        },
+    )
+    return cls
+
+
 @attr.define(slots=True)
 class LLMConfig:
     """
@@ -640,11 +702,11 @@ class LLMConfig:
     # NOTE: The following is handled via __init_subclass__, and is only used for TYPE_CHECKING
     if t.TYPE_CHECKING:
         # NOTE: public attributes to override
-        __config__: ModelSettings | None = None
+        __config__: ModelSettings | None = Field(None)
         """Internal configuration for this LLM model. Each of the field in here will be populated
         and prefixed with __openllm_<value>__"""
 
-        GenerationConfig: type = type
+        GenerationConfig: type = Field(None)
         """Users can override this subclass of any given LLMConfig to provide GenerationConfig
         default value. For example:
 
@@ -663,7 +725,7 @@ class LLMConfig:
         def __attrs_init__(self, **attrs: t.Any):
             """Generated __attrs_init__ for LLMConfig subclass that follows the attrs contract."""
 
-        __attrs_attrs__: tuple[attr.Attribute[t.Any], ...] = tuple()
+        __attrs_attrs__: tuple[attr.Attribute[t.Any], ...] = Field(None, init=False)
         """Since we are writing our own __init_subclass__, which is an alternative way for __prepare__,
         we want openllm.LLMConfig to be attrs-like dataclass that has pydantic-like interface.
         __attrs_attrs__ will be handled dynamically by __init_subclass__.
@@ -683,33 +745,38 @@ class LLMConfig:
         __openllm_url__: str = Field(None, init=False)
         """The resolved url for this LLMConfig."""
 
-        __openllm_requires_gpu__: bool = False
+        __openllm_requires_gpu__: bool = Field(None, init=False)
         """Determines if this model is only available on GPU. By default it supports GPU and fallback to CPU."""
 
-        __openllm_trust_remote_code__: bool = False
+        __openllm_trust_remote_code__: bool = Field(False)
         """Whether to always trust remote code"""
 
-        __openllm_requirements__: ListStr | None = None
+        __openllm_requirements__: ListStr | None = Field(None)
         """The default PyPI requirements needed to run this given LLM. By default, we will depend on
         bentoml, torch, transformers."""
 
         __openllm_env__: openllm.utils.ModelEnv = Field(None, init=False)
         """A ModelEnv instance for this LLMConfig."""
 
-        __openllm_model_name__: str = ""
+        __openllm_model_name__: str = Field("")
         """The normalized version of __openllm_start_name__, determined by __openllm_name_type__"""
 
-        __openllm_start_name__: str = ""
+        __openllm_model_type__: t.Literal["causal_lm", "seq2seq_lm"] = Field("causal_lm")
+        """The model type for this given LLM. By default, it should be causal language modeling.
+        Currently supported 'causal_lm' or 'seq2seq_lm'
+        """
+
+        __openllm_start_name__: str = Field("")
         """Default name to be used with `openllm start`"""
 
-        __openllm_name_type__: t.Literal["dasherize", "lowercase"] = "dasherize"
+        __openllm_name_type__: t.Literal["dasherize", "lowercase"] = Field("dasherize")
         """the default name typed for this model. "dasherize" will convert the name to lowercase and
         replace spaces with dashes. "lowercase" will convert the name to lowercase."""
 
-        __openllm_timeout__: int = 3600
+        __openllm_timeout__: int = Field(36000)
         """The default timeout to be set for this given LLM."""
 
-        __openllm_workers_per_resource__: int | float = 1
+        __openllm_workers_per_resource__: int | float = Field(1)
         """The number of workers per resource. This is used to determine the number of workers to use for this model.
         For example, if this is set to 0.5, then OpenLLM will use 1 worker per 2 resources. If this is set to 1, then
         OpenLLM will use 1 worker per resource. If this is set to 2, then OpenLLM will use 2 workers per resource.
@@ -720,10 +787,23 @@ class LLMConfig:
         By default, it is set to 1.
         """
 
-        __openllm_runtime__: t.Literal["transformers", "cpp"] = "transformers"
+        __openllm_runtime__: t.Literal["transformers", "cpp"] = Field("transformers")
         """The runtime to use for this model. Possible values are `transformers` or `cpp`. See
         LlaMA for more information."""
 
+        __openllm_use_pipeline__: bool = Field(False)
+        """Whether this LLM will use HuggingFace Pipeline API. By default, this is set to False.
+        The reason for this to be here is because we want to access this object before loading
+        the _bentomodel. This is because we will actually download the model weights when accessing
+        _bentomodel.
+        """
+
+        __openllm_bettertransformer__: bool = Field(False)
+        """Whether to use BetterTransformer for this given LLM. This depends per model
+        architecture. By default, we will use BetterTransformer for T5 and StableLM models,
+        and set to False for every other models.
+        """
+
         __openllm_default_id__: str = Field(None)
         """Return the default model to use when using 'openllm start <model_id>'.
         This could be one of the keys in 'self.model_ids' or custom users model."""
@@ -804,6 +884,7 @@ class LLMConfig:
         these["generation_config"] = cls.Field(
             default=cls.__openllm_generation_class__(),
             description=inspect.cleandoc(cls.__openllm_generation_class__.__doc__ or ""),
+            type=GenerationConfig,
         )
 
         # Generate the base __attrs_attrs__ transformation here.
@@ -884,6 +965,7 @@ class LLMConfig:
         cls.__openllm_hints__ = {
             f.name: f.type for ite in map(attr.fields, (cls, cls.__openllm_generation_class__)) for f in ite
         }
+        cls = __llm_config_transform__(cls)
 
     def __setattr__(self, attr: str, value: t.Any):
         if attr in _reserved_namespace:
@@ -909,14 +991,7 @@ class LLMConfig:
         if generation_config is None:
             generation_config = {k: v for k, v in attrs.items() if k in _generation_cl_dict}
         else:
-            generation_keys = {k for k in attrs if k in _generation_cl_dict}
-            if len(generation_keys) > 0:
-                logger.warning(
-                    "Both 'generation_config' and keys for 'generation_config' are passed."
-                    " The following keys in 'generation_config' will be overriden be keywords argument: %s",
-                    ", ".join(generation_keys),
-                )
-                config_merger.merge(generation_config, {k: v for k, v in attrs.items() if k in generation_keys})
+            config_merger.merge(generation_config, {k: v for k, v in attrs.items() if k in _generation_cl_dict})
 
         for k in _cached_keys:
             if k in generation_config or attrs.get(k) is None:
@@ -942,7 +1017,32 @@ class LLMConfig:
             )
 
         # The rest of attrs should only be the attributes to be passed to __attrs_init__
-        self.__attrs_init__(generation_config=self.__openllm_generation_class__(**generation_config), **attrs)
+        self.__attrs_init__(generation_config=self["generation_class"](**generation_config), **attrs)
+
+    def __getitem__(self, item: str | t.Any) -> t.Any:
+        """Look up a configuration value with dictionary-style access.
+
+        Resolution order: __openllm_<item>__ > self.<item> > __openllm_generation_class__ > __openllm_extras__.
+        Raises TypeError for non-string keys, ForbiddenAttributeError for reserved names, KeyError if unmatched.
+        Convenience only; avoid in performance critical code.
+        """
+        if not isinstance(item, str):
+            raise TypeError(f"LLM only supports string indexing, not {item.__class__.__name__}")
+        if item in _reserved_namespace:
+            message = f"'{item}' is a reserved namespace for {self.__class__} and should not be access nor modified."
+            raise ForbiddenAttributeError(message)
+        dunder_key = f"__openllm_{item}__"
+        if hasattr(self, dunder_key):
+            return getattr(self, dunder_key)
+        if hasattr(self, item):
+            return getattr(self, item)
+        generation_cls = self.__openllm_generation_class__
+        if hasattr(generation_cls, item):
+            return getattr(generation_cls, item)
+        extras = self.__openllm_extras__
+        if item in extras:
+            return extras[item]
+        raise KeyError(item)
 
     def __getattribute__(self, item: str) -> t.Any:
         if item in _reserved_namespace:
@@ -976,10 +1076,8 @@ class LLMConfig:
 
     def model_dump(self, flatten: bool = False, **_: t.Any):
         dumped = bentoml_cattr.unstructure(self)
-        generation_config = bentoml_cattr.unstructure(self.generation_config)
-        if not flatten:
-            dumped["generation_config"] = generation_config
-        else:
+        if flatten:
+            generation_config = dumped.pop("generation_config")
             dumped.update(generation_config)
         return dumped
 
@@ -1028,11 +1126,11 @@ class LLMConfig:
         key_to_remove: ListStr = []
 
         for k, v in attrs.items():
-            if k.startswith(f"{self.__openllm_model_name__}_generation_"):
-                llm_config_attrs["generation_config"][k[len(self.__openllm_model_name__ + "_generation_") :]] = v
+            if k.startswith(f"{self['model_name']}_generation_"):
+                llm_config_attrs["generation_config"][k[len(self["model_name"] + "_generation_") :]] = v
                 key_to_remove.append(k)
-            elif k.startswith(f"{self.__openllm_model_name__}_"):
-                llm_config_attrs[k[len(self.__openllm_model_name__ + "_") :]] = v
+            elif k.startswith(f"{self['model_name']}_"):
+                llm_config_attrs[k[len(self["model_name"] + "_") :]] = v
                 key_to_remove.append(k)
 
         return self.model_construct_env(**llm_config_attrs), {k: v for k, v in attrs.items() if k not in key_to_remove}
diff --git a/src/openllm/_llm.py b/src/openllm/_llm.py
index cc8da735..75f76929 100644
--- a/src/openllm/_llm.py
+++ b/src/openllm/_llm.py
@@ -35,7 +35,8 @@ from bentoml._internal.types import ModelSignatureDict
 import openllm
 
 from .exceptions import ForbiddenAttributeError, OpenLLMException
-from .utils import ENV_VARS_TRUE_VALUES, LazyLoader, bentoml_cattr
+from .utils import (LazyLoader, bentoml_cattr, is_bitsandbytes_available,
+                    non_intrusive_setattr)
 
 if t.TYPE_CHECKING:
     import torch
@@ -60,7 +61,6 @@ else:
 
 logger = logging.getLogger(__name__)
 
-# NOTE: `1-2` -> text-generation and text2text-generation
 FRAMEWORK_TO_AUTOCLASS_MAPPING = {
     "pt": ("AutoModelForCausalLM", "AutoModelForSeq2SeqLM"),
     "tf": ("TFAutoModelForCausalLM", "TFAutoModelForSeq2SeqLM"),
@@ -132,6 +132,7 @@ def import_model(
             ),
         )
 
+    # NOTE: `1-2` -> text-generation and text2text-generation
     if type(config) in transformers.MODEL_FOR_CAUSAL_LM_MAPPING:
         idx = 0
     elif type(config) in transformers.MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING:
@@ -243,25 +244,6 @@ class LLMInterface(ABC):
         raise NotImplementedError
 
 
-def _default_post_init(self: LLM[t.Any, t.Any]):
-    # load_in_mha: Whether to apply BetterTransformer (or Torch MultiHeadAttention) during inference load.
-    #              See https://pytorch.org/blog/a-better-transformer-for-fast-transformer-encoder-inference/
-    #              for more information.
-    # NOTE: set a default variable to transform to BetterTransformer by default for inference
-    if self.config.__openllm_runtime__ == "cpp":
-        self.load_in_mha = False
-    else:
-        self.load_in_mha = (
-            os.environ.get(self.config_class.__openllm_env__.bettertransformer, str(False)).upper()
-            in ENV_VARS_TRUE_VALUES
-        )
-        if self.config_class.__openllm_requires_gpu__:
-            # For all models that requires GPU, no need to offload it to BetterTransformer
-            # use bitsandbytes instead
-
-            self.load_in_mha = False
-
-
 _M = t.TypeVar("_M")
 _T = t.TypeVar("_T")
 
@@ -285,6 +267,8 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
         _model_attrs: dict[str, t.Any]
         _tokenizer_attrs: dict[str, t.Any]
 
+        bettertransformer: bool
+
     def __init_subclass__(cls):
         cd = cls.__dict__
         prefix_class_name_config = cls.__name__
@@ -310,20 +294,6 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
                     "Missing required key 'config_class'. Make sure to define it within the LLM subclass."
                 )
 
-        if cls.llm_post_init is not LLMInterface.llm_post_init:
-            original_llm_post_init = cd["llm_post_init"]
-
-            def wrapped_llm_post_init(self: t.Self) -> None:
-                """We need to both initialize private attributes and call the user-defined model_post_init
-                method.
-                """
-                _default_post_init(self)
-                original_llm_post_init(self)
-
-            cls.llm_post_init = wrapped_llm_post_init
-        else:
-            setattr(cls, "llm_post_init", _default_post_init)
-
         if cls.import_model is LLMInterface.import_model:
             # using the default import model
             setattr(cls, "import_model", functools.partial(import_model, _model_framework=implementation))
@@ -353,6 +323,8 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
         model_id: str | None = None,
         llm_config: openllm.LLMConfig | None = None,
         *args: t.Any,
+        quantize: t.Literal["8bit", "4bit", "gptq"] | None = None,
+        bettertransformer: bool | None = None,
         **attrs: t.Any,
     ):
         """Initialize the LLM with given pretrained model.
@@ -429,6 +401,9 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
             model_id: The pretrained model to use. Defaults to None. If None, 'self.default_id' will be used.
             llm_config: The config to use for this LLM. Defaults to None. If not passed, OpenLLM
                         will use `config_class` to construct default configuration.
+            quantize: The quantization to use for this LLM. Defaults to None. Possible values
+                      include 8bit, 4bit and gptq.
+            bettertransformer: Whether to use BetterTransformer with this model. Defaults to False.
             *args: The args to be passed to the model.
             **attrs: The kwargs to be passed to the model.
 
@@ -438,16 +413,102 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
                                    However, if `model_id` is a path, this argument is recomended to include.
         """
 
-        load_in_mha = attrs.pop("load_in_mha", False)
         openllm_model_version = attrs.pop("openllm_model_version", None)
 
+        # low_cpu_mem_usage is only available for model
+        # this is helpful on system with low memory to avoid OOM
+        low_cpu_mem_usage = attrs.pop("low_cpu_mem_usage", True)
+
+        # quantization setup
+        quantization_config = attrs.pop("quantization_config", None)
+        # 8 bit configuration ('llm_int8_threshhold' is a misspelt alias kept for backward compatibility)
+        int8_threshold = attrs.pop("llm_int8_threshold", attrs.pop("llm_int8_threshhold", 6.0))
+        cpu_offloading = attrs.pop("llm_int8_enable_fp32_cpu_offload", False)
+        int8_skip_modules: list[str] | None = attrs.pop("llm_int8_skip_modules", None)
+        int8_has_fp16_weight = attrs.pop("llm_int8_has_fp16_weight", False)
+        # 4 bit configuration
+        int4_compute_dtype = attrs.pop("llm_bnb_4bit_compute_dtype", torch.bfloat16)
+        int4_quant_type = attrs.pop("llm_bnb_4bit_quant_type", "nf4")
+        int4_use_double_quant = attrs.pop("llm_bnb_4bit_use_double_quant", True)
+
+        if quantization_config and quantize:
+            raise ValueError(
+                "'quantization_config' and 'quantize' are mutually exclusive. Either customise "
+                "your quantization_config or use the quantize argument."
+            )
+        if quantization_config is None:
+            # quantize is a openllm.LLM feature, where we can quantize the model
+            # with bitsandbytes or quantization aware training.
+            if quantize is not None:
+                if not is_bitsandbytes_available():
+                    raise RuntimeError(
+                        "Quantization requires bitsandbytes to be installed. Make "
+                        "sure to install OpenLLM with 'pip install \"openllm[fine-tune]\"'"
+                    )
+                logger.debug(
+                    "'quantize' is not None. %s will use a default 'quantization_config' for %s. "
+                    "If you want to customise the quantization config, make sure to pass your "
+                    "own 'quantization_config'",
+                    self,
+                    quantize,
+                )
+                if quantize == "8bit":
+                    # Copy so that a list supplied by the caller is never mutated in place.
+                    int8_skip_modules = list(int8_skip_modules or [])
+                    # 'self.config' is first assigned below this section, so consult 'config_class' here.
+                    if "lm_head" not in int8_skip_modules and self.config_class.__openllm_model_type__ == "causal_lm":
+                        logger.debug("Skipping 'lm_head' for quantization for %s", self)
+                        int8_skip_modules.append("lm_head")
+                    quantization_config = transformers.BitsAndBytesConfig(
+                        load_in_8bit=True,
+                        llm_int8_enable_fp32_cpu_offload=cpu_offloading,
+                        llm_int8_threshold=int8_threshold,
+                        llm_int8_skip_modules=int8_skip_modules,
+                        llm_int8_has_fp16_weight=int8_has_fp16_weight,
+                    )
+                elif quantize == "4bit":
+                    trf_versions = openllm.utils.pkg.pkg_version_info("transformers")
+                    supports_kbits = trf_versions[:2] >= (4, 30)
+                    if supports_kbits:
+                        # BitsAndBytesConfig names its 4-bit options 'bnb_4bit_*' (no 'llm_' prefix).
+                        quantization_config = transformers.BitsAndBytesConfig(
+                            load_in_4bit=True,
+                            bnb_4bit_compute_dtype=int4_compute_dtype,
+                            bnb_4bit_quant_type=int4_quant_type,
+                            bnb_4bit_use_double_quant=int4_use_double_quant,
+                        )
+                    else:
+                        logger.warning(
+                            "'quantize' is set to 4bit, while the current transformers version %s does not support "
+                            "k-bit quantization. k-bit quantization is supported since transformers 4.30, therefore "
+                            "make sure to install the latest version of transformers either via PyPI or "
+                            "from git source: 'pip install git+https://github.com/huggingface/transformers'.",
+                            trf_versions,
+                        )
+                elif quantize == "gptq":
+                    # TODO: support GPTQ loading quantization
+                    if model_id is None:
+                        raise RuntimeError(
+                            f"'quantize={quantize}' requires passing custom path to quantized weights as we are "
+                            "unable to load the model on the fly. See https://github.com/qwopqwop200/GPTQ-for-LLaMa "
+                            f"for instruction on how to quantize '{self}' with GPTQ."
+                        )
+                    raise NotImplementedError("GPTQ is not supported yet.")
+                else:
+                    raise ValueError(f"'quantize' must be one of ['8bit', '4bit', 'gptq'], got {quantize} instead.")
+
+        attrs.update({"quantization_config": quantization_config})
+
         if llm_config is not None:
             logger.debug("Using given 'llm_config=(%s)' to initialize LLM", llm_config)
             self.config = llm_config
         else:
             self.config = self.config_class.model_construct_env(**attrs)
             # The rests of the kwargs that is not used by the config class should be stored into __openllm_extras__.
-            attrs = self.config.__openllm_extras__
+            attrs = self.config["extras"]
+
+        if not self.config["use_pipeline"]:
+            attrs["low_cpu_mem_usage"] = low_cpu_mem_usage
 
         model_kwds, tokenizer_kwds = {}, {}
         if self.__llm_init_kwargs__:
@@ -463,10 +524,11 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
             )
 
         if model_id is None:
-            model_id = os.environ.get(self.config.__openllm_env__.model_id, self.config.__openllm_default_id__)
+            model_id = os.environ.get(self.config["env"].model_id, self.config["default_id"])
 
         # NOTE: This is the actual given path or pretrained weight for this LLM.
-        assert model_id is not None
+        if t.TYPE_CHECKING:
+            assert model_id is not None
         self._model_id = model_id
 
         # parsing tokenizer and model kwargs
@@ -476,23 +538,24 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
         model_kwds.update({k: v for k, v in attrs.items() if not k.startswith(TOKENIZER_PREFIX)})
 
         # handle trust_remote_code
-        self.__llm_trust_remote_code__ = model_kwds.pop("trust_remote_code", self.config.__openllm_trust_remote_code__)
+        self.__llm_trust_remote_code__ = model_kwds.pop("trust_remote_code", self.config["trust_remote_code"])
 
         # NOTE: Save the args and kwargs for latter load
         self._model_args = args
         self._model_attrs = model_kwds
         self._tokenizer_attrs = tokenizer_kwds
-
-        # we allow users to overwrite the load_in_mha defined by the LLM subclass.
-        if load_in_mha:
-            logger.debug("Overwriting 'load_in_mha=%s' (base load_in_mha=%s)", load_in_mha, self.load_in_mha)
-            self.load_in_mha = load_in_mha
-
         self._openllm_model_version = openllm_model_version
 
         if self.__llm_post_init__:
             self.llm_post_init()
 
+        # we set it here so that we allow subclass to overwrite bettertransformer in llm_post_init
+        if bettertransformer:
+            logger.debug("Using %r with BetterTransformer", self)
+            self.bettertransformer = bettertransformer
+        else:
+            non_intrusive_setattr(self, "bettertransformer", self.config["bettertransformer"])
+
     def __setattr__(self, attr: str, value: t.Any):
         if attr in _reserved_namespace:
             raise ForbiddenAttributeError(
@@ -513,7 +576,7 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
 
     @property
     def runner_name(self) -> str:
-        return f"llm-{self.config.__openllm_start_name__}-runner"
+        return f"llm-{self.config['start_name']}-runner"
 
     # NOTE: The section below defines a loose contract with langchain's LLM interface.
     @property
@@ -524,7 +587,7 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
     def identifying_params(self) -> dict[str, t.Any]:
         return {
             "configuration": self.config.model_dump_json().decode(),
-            "model_ids": orjson.dumps(self.config.__openllm_model_ids__).decode(),
+            "model_ids": orjson.dumps(self.config["model_ids"]).decode(),
         }
 
     @staticmethod
@@ -580,8 +643,8 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
                 sys.executable,
                 "-m",
                 "openllm",
-                "download-models",
-                self.config.__openllm_start_name__,
+                "download",
+                self.config["start_name"],
                 "--model-id",
                 self.model_id,
                 "--output",
@@ -625,7 +688,11 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
         kwds = self._model_attrs
         kwds["trust_remote_code"] = self.__llm_trust_remote_code__
 
-        if self.load_in_mha and "_pretrained_class" not in self._bentomodel.info.metadata:
+        is_pipeline = "_pretrained_class" in self._bentomodel.info.metadata
+        # differentiate when saving tokenizer or other pretrained type.
+        is_pretrained_model = is_pipeline and "_framework" in self._bentomodel.info.metadata
+
+        if self.bettertransformer and is_pipeline and self.config["use_pipeline"]:
             # This is a pipeline, provide a accelerator args
             kwds["accelerator"] = "bettertransformer"
 
@@ -636,10 +703,10 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
                 self.__llm_model__ = self._bentomodel.load_model(*self._model_args, **kwds)
 
             if (
-                self.load_in_mha
-                and all(i in self._bentomodel.info.metadata for i in ("_framework", "_pretrained_class"))
+                self.bettertransformer
+                and is_pretrained_model
                 and self._bentomodel.info.metadata["_framework"] == "torch"
-                and self.config.__openllm_runtime__ == "transformers"
+                and self.config["runtime"] == "transformers"
             ):
                 # BetterTransformer is currently only supported on PyTorch.
                 from optimum.bettertransformer import BetterTransformer
@@ -767,7 +834,7 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
 
         # NOTE: returning the two langchain API's to the runner
         return types.new_class(
-            inflection.camelize(self.config.__openllm_model_name__) + "Runner",
+            inflection.camelize(self.config["model_name"]) + "Runner",
             (bentoml.Runner,),
             exec_body=lambda ns: ns.update(
                 {
@@ -776,17 +843,17 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
                     "llm": self,  # NOTE: self reference to LLM
                     "config": self.config,
                     "__call__": _wrapped_generate_run,
-                    "__module__": f"openllm.models.{self.config.__openllm_model_name__}",
-                    "__doc__": self.config.__openllm_env__.start_docstring,
+                    "__module__": f"openllm.models.{self.config['model_name']}",
+                    "__doc__": self.config["env"].start_docstring,
                 }
             ),
         )(
             types.new_class(
-                inflection.camelize(self.config.__openllm_model_name__) + "Runnable",
+                inflection.camelize(self.config["model_name"]) + "Runnable",
                 (_Runnable,),
                 {
                     "SUPPORTED_RESOURCES": ("nvidia.com/gpu", "cpu")
-                    if self.config.__openllm_requires_gpu__
+                    if self.config["requires_gpu"]
                     else ("nvidia.com/gpu",),
                     "llm_type": self.llm_type,
                     "identifying_params": self.identifying_params,
diff --git a/src/openllm/_package.py b/src/openllm/_package.py
index 2b438299..624f741e 100644
--- a/src/openllm/_package.py
+++ b/src/openllm/_package.py
@@ -76,17 +76,16 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any], llm_fs: FS) -> Pyth
     # first, then proceed to install everything inside the wheels/ folder.
     packages: list[str] = ["openllm"]
 
-    if llm.config.__openllm_requirements__ is not None:
-        packages.extend(llm.config.__openllm_requirements__)
+    if llm.config["requirements"] is not None:
+        packages.extend(llm.config["requirements"])
 
     if not (str(os.environ.get("BENTOML_BUNDLE_LOCAL_BUILD", False)).lower() == "false"):
         packages.append(f"bentoml>={'.'.join([str(i) for i in pkg.pkg_version_info('bentoml')])}")
 
-    to_use_framework = llm.config.__openllm_env__.get_framework_env()
+    env = llm.config["env"]
+    to_use_framework = env.get_framework_env()
     if to_use_framework == "flax":
-        assert (
-            utils.is_flax_available()
-        ), f"Flax is not available, while {llm.config.__openllm_env__.framework} is set to 'flax'"
+        assert utils.is_flax_available(), f"Flax is not available, while {env.framework} is set to 'flax'"
         packages.extend(
             [
                 f"flax>={importlib.metadata.version('flax')}",
@@ -95,9 +94,7 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any], llm_fs: FS) -> Pyth
             ]
         )
     elif to_use_framework == "tf":
-        assert (
-            utils.is_tf_available()
-        ), f"TensorFlow is not available, while {llm.config.__openllm_env__.framework} is set to 'tf'"
+        assert utils.is_tf_available(), f"TensorFlow is not available, while {env.framework} is set to 'tf'"
         candidates = (
             "tensorflow",
             "tensorflow-cpu",
@@ -133,16 +130,17 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any], llm_fs: FS) -> Pyth
 def construct_docker_options(llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_resource: int | float) -> DockerOptions:
     _bentoml_config_options = os.environ.pop("BENTOML_CONFIG_OPTIONS", "")
     _bentoml_config_options_opts = [
-        "api_server.traffic.timeout=3600",  # NOTE: Currently we hardcode this value
-        f'runners."llm-{llm.config.__openllm_start_name__}-runner".traffic.timeout={llm.config.__openllm_timeout__}',
-        f'runners."llm-{llm.config.__openllm_start_name__}-runner".workers_per_resource={workers_per_resource}',
+        "api_server.traffic.timeout=36000",  # NOTE: Currently we hardcode this value
+        f'runners."llm-{llm.config["start_name"]}-runner".traffic.timeout={llm.config["timeout"]}',
+        f'runners."llm-{llm.config["start_name"]}-runner".workers_per_resource={workers_per_resource}',
     ]
     _bentoml_config_options += " " if _bentoml_config_options else "" + " ".join(_bentoml_config_options_opts)
+    env = llm.config["env"]
     return DockerOptions(
         cuda_version="11.6",  # NOTE: Torch 2.0 currently only support 11.6 as the latest CUDA version
         env={
-            llm.config.__openllm_env__.framework: llm.config.__openllm_env__.get_framework_env(),
-            "OPENLLM_MODEL": llm.config.__openllm_model_name__,
+            env.framework: env.get_framework_env(),
+            "OPENLLM_MODEL": llm.config["model_name"],
             "OPENLLM_MODEL_ID": llm.model_id,
             "BENTOML_DEBUG": str(get_debug_mode()),
             "BENTOML_CONFIG_OPTIONS": _bentoml_config_options,
@@ -180,7 +178,7 @@ def build(model_name: str, *, __cli__: bool = False, **attrs: t.Any) -> tuple[be
     try:
         os.environ["OPENLLM_MODEL"] = inflection.underscore(model_name)
 
-        to_use_framework = llm_config.__openllm_env__.get_framework_env()
+        to_use_framework = llm_config["env"].get_framework_env()
         if to_use_framework == "flax":
             llm = openllm.AutoFlaxLLM.for_model(model_name, model_id=model_id, llm_config=llm_config, **attrs)
         elif to_use_framework == "tf":
@@ -192,12 +190,10 @@ def build(model_name: str, *, __cli__: bool = False, **attrs: t.Any) -> tuple[be
 
         labels = dict(llm.identifying_params)
         labels.update({"_type": llm.llm_type, "_framework": to_use_framework})
-        service_name = f"generated_{llm.config.__openllm_model_name__}_service.py"
-        workers_per_resource = utils.first_not_none(
-            workers_per_resource, default=llm.config.__openllm_workers_per_resource__
-        )
+        service_name = f"generated_{llm_config['model_name']}_service.py"
+        workers_per_resource = utils.first_not_none(workers_per_resource, default=llm_config["workers_per_resource"])
 
-        with fs.open_fs(f"temp://llm_{llm.config.__openllm_model_name__}") as llm_fs:
+        with fs.open_fs(f"temp://llm_{llm_config['model_name']}") as llm_fs:
             # add service.py definition to this temporary folder
             utils.codegen.write_service(model_name, llm.model_id, service_name, llm_fs)
 
@@ -209,12 +205,12 @@ def build(model_name: str, *, __cli__: bool = False, **attrs: t.Any) -> tuple[be
                     raise bentoml.exceptions.NotFound("Overwriting previously saved Bento.")
                 _previously_built = True
             except bentoml.exceptions.NotFound:
-                logger.info("Building Bento for LLM '%s'", llm.config.__openllm_start_name__)
+                logger.info("Building Bento for LLM '%s'", llm_config["start_name"])
                 bento = bentoml.bentos.build(
                     f"{service_name}:svc",
                     name=bento_tag.name,
                     labels=labels,
-                    description=f"OpenLLM service for {llm.config.__openllm_start_name__}",
+                    description=f"OpenLLM service for {llm_config['start_name']}",
                     include=[
                         f for f in llm_fs.walk.files(filter=["*.py"])
                     ],  # NOTE: By default, we are using _service.py as the default service, for now.
diff --git a/src/openllm/_schema.py b/src/openllm/_schema.py
index 096d42d6..bc7587dc 100644
--- a/src/openllm/_schema.py
+++ b/src/openllm/_schema.py
@@ -55,7 +55,7 @@ class GenerationInput:
     def for_model(cls, model_name: str, **attrs: t.Any) -> type[GenerationInput]:
         llm_config = openllm.AutoConfig.for_model(model_name, **attrs)
         return attr.make_class(
-            inflection.camelize(llm_config.__openllm_model_name__) + "GenerationInput",
+            inflection.camelize(llm_config["model_name"]) + "GenerationInput",
             attrs={
                 "prompt": attr.field(type=str),
                 "llm_config": attr.field(
diff --git a/src/openllm/_service.py b/src/openllm/_service.py
index 47ca759b..6127d646 100644
--- a/src/openllm/_service.py
+++ b/src/openllm/_service.py
@@ -36,7 +36,7 @@ model_id = os.environ.get("OPENLLM_MODEL_ID", "{__model_id__}")  # openllm: mode
 llm_config = openllm.AutoConfig.for_model(model)
 runner = openllm.Runner(model, model_id=model_id, llm_config=llm_config)
 
-svc = bentoml.Service(name=f"llm-{llm_config.__openllm_start_name__}-service", runners=[runner])
+svc = bentoml.Service(name=f"llm-{llm_config['start_name']}-service", runners=[runner])
 
 
 @svc.api(
@@ -55,8 +55,8 @@ async def generate_v1(input_dict: dict[str, t.Any]) -> openllm.GenerationOutput:
 def metadata_v1(_: str) -> openllm.MetadataOutput:
     return openllm.MetadataOutput(
         model_id=model_id,
-        timeout=llm_config.__openllm_timeout__,
-        model_name=llm_config.__openllm_model_name__,
-        framework=llm_config.__openllm_env__.get_framework_env(),
+        timeout=llm_config["timeout"],
+        model_name=llm_config["model_name"],
+        framework=llm_config["env"].get_framework_env(),
         configuration=llm_config.model_dump_json().decode(),
     )
diff --git a/src/openllm/cli.py b/src/openllm/cli.py
index deaa06da..1b6aba31 100644
--- a/src/openllm/cli.py
+++ b/src/openllm/cli.py
@@ -83,6 +83,29 @@ def _echo(text: t.Any, fg: str = "green", _with_style: bool = True, **attrs: t.A
     call(text, **attrs)
 
 
+def quantize_option(factory: t.Any):
+    """Return ``factory.option`` configured as the ``--quantize`` choice (8bit, 4bit, gptq; default None)."""
+    return factory.option(
+        "--quantize",
+        type=click.Choice(["8bit", "4bit", "gptq"]),
+        default=None,
+        help="""Running this model in quantized mode.
+    Note that GPTQ is currently working in progress and will be available soon.
+
+    NOTE: Quantization is only available for PyTorch models.
+    """,
+    )
+
+
+def bettertransformer_option(factory: t.Any):
+    """Return ``factory.option`` configured as the boolean ``--bettertransformer`` flag."""
+    # NOTE: default is None (not False), matching LLM's 'bettertransformer: bool | None' parameter.
+    flag_name = "--bettertransformer"
+    help_text = "Use BetterTransformer wrapper to serve model"
+    option = factory.option(flag_name, is_flag=True, default=None, help=help_text)
+    return option
+
+
 def start_model_command(
     model_name: str,
     group: click.Group,
@@ -108,29 +131,30 @@ def start_model_command(
     openllm.utils.configure_logging()
 
     llm_config = openllm.AutoConfig.for_model(model_name)
+    env = llm_config["env"]
 
     docstring = f"""\
-{llm_config.__openllm_env__.start_docstring}
+{env.start_docstring}
 \b
-Available model_id(s): {llm_config.__openllm_model_ids__} [default: {llm_config.__openllm_default_id__}]
+Available model_id(s): {llm_config['model_ids']} [default: {llm_config['default_id']}]
 """
     command_attrs: dict[str, t.Any] = {
-        "name": llm_config.__openllm_model_name__,
+        "name": llm_config["model_name"],
         "context_settings": _context_settings or {},
         "short_help": f"Start a LLMServer for '{model_name}' ('--help' for more details)",
         "help": docstring,
     }
 
     aliases: list[str] = []
-    if llm_config.__openllm_name_type__ == "dasherize":
-        aliases.append(llm_config.__openllm_start_name__)
+    if llm_config["name_type"] == "dasherize":
+        aliases.append(llm_config["start_name"])
 
     command_attrs["aliases"] = aliases if len(aliases) > 0 else None
 
     serve_decorator = _http_server_args if not _serve_grpc else _grpc_server_args
 
     available_gpu = openllm.utils.gpu_count()
-    if llm_config.__openllm_requires_gpu__ and len(available_gpu) < 1:
+    if llm_config["requires_gpu"] and len(available_gpu) < 1:
         # NOTE: The model requires GPU, therefore we will return a dummy command
         command_attrs.update(
             {
@@ -152,8 +176,13 @@ Available model_id(s): {llm_config.__openllm_model_ids__} [default: {llm_config.
     @llm_config.to_click_options
     @serve_decorator
     @cog.optgroup.group("General LLM Options")
-    @cog.optgroup.option("--server-timeout", type=int, default=None, help="Server timeout in seconds")
-    @model_id_option(cog.optgroup, model_env=llm_config.__openllm_env__)
+    @cog.optgroup.option(
+        "--server-timeout",
+        type=int,
+        default=None,
+        help="Server timeout in seconds",
+    )
+    @model_id_option(cog.optgroup, model_env=env)
     @cog.optgroup.option(
         "--device",
         type=tuple,
@@ -165,34 +194,47 @@ Available model_id(s): {llm_config.__openllm_model_ids__} [default: {llm_config.
         show_envvar=True,
     )
     @workers_per_resource_option(cog.optgroup)
-    @click.pass_context
+    @quantize_option(cog.optgroup)
+    @bettertransformer_option(cog.optgroup)
     def model_start(
-        ctx: click.Context,
         server_timeout: int | None,
         model_id: str | None,
         workers_per_resource: float | None,
         device: tuple[str, ...] | None,
+        quantize: t.Literal["8bit", "4bit", "gptq"] | None,
+        bettertransformer: bool | None,
         **attrs: t.Any,
     ) -> openllm.LLMConfig:
         config, server_attrs = llm_config.model_validate_click(**attrs)
 
-        if llm_config.__openllm_env__.get_framework_env() == "flax":
+        if quantize and env.get_framework_env() != "pt":
+            _echo("Quantization is only available for PyTorch models.", fg="yellow")
+
+        if env.get_framework_env() == "flax":
             llm = openllm.AutoFlaxLLM.for_model(model_name, model_id=model_id, llm_config=config, ensure_available=True)
-        elif llm_config.__openllm_env__.get_framework_env() == "tf":
+        elif env.get_framework_env() == "tf":
             llm = openllm.AutoTFLLM.for_model(model_name, model_id=model_id, llm_config=config, ensure_available=True)
         else:
-            llm = openllm.AutoLLM.for_model(model_name, model_id=model_id, llm_config=config, ensure_available=True)
+            llm = openllm.AutoLLM.for_model(
+                model_name,
+                model_id=model_id,
+                llm_config=config,
+                quantize=quantize,
+                bettertransformer=bettertransformer,
+                ensure_available=True,
+            )
 
-        if llm.config.__openllm_requirements__ is not None and len(llm.config.__openllm_requirements__) > 0:
+        requirements = config["requirements"]
+        if requirements is not None and len(requirements) > 0:
             _echo(
-                f"Make sure to have the following dependencies available: {llm.config.__openllm_requirements__}",
+                f"Make sure to have the following dependencies available: {requirements}",
                 fg="yellow",
             )
 
         workers_per_resource = openllm.utils.first_not_none(
-            workers_per_resource, default=llm.config.__openllm_workers_per_resource__
+            workers_per_resource, default=config["workers_per_resource"]
         )
-        server_timeout = openllm.utils.first_not_none(server_timeout, default=llm.config.__openllm_timeout__)
+        server_timeout = openllm.utils.first_not_none(server_timeout, default=config["timeout"])
 
         num_workers = int(1 / workers_per_resource)
         if num_workers > 1:
@@ -216,26 +258,26 @@ Available model_id(s): {llm_config.__openllm_model_ids__} [default: {llm_config.
         _bentoml_config_options_opts = [
             "tracing.sample_rate=1.0",
             f"api_server.traffic.timeout={server_timeout}",
-            f'runners."llm-{llm.config.__openllm_start_name__}-runner".traffic.timeout={llm.config.__openllm_timeout__}',
-            f'runners."llm-{llm.config.__openllm_start_name__}-runner".workers_per_resource={workers_per_resource}',
+            f'runners."llm-{config["start_name"]}-runner".traffic.timeout={config["timeout"]}',
+            f'runners."llm-{config["start_name"]}-runner".workers_per_resource={workers_per_resource}',
         ]
         if device:
             if len(device) > 1:
                 for idx, dev in enumerate(device):
                     _bentoml_config_options_opts.append(
-                        f'runners."llm-{llm.config.__openllm_start_name__}-runner".resources."nvidia.com/gpu"[{idx}]={dev}'
+                        f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"[{idx}]={dev}'
                     )
             else:
                 _bentoml_config_options_opts.append(
-                    f'runners."llm-{llm.config.__openllm_start_name__}-runner".resources."nvidia.com/gpu"=[{device[0]}]'
+                    f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"=[{device[0]}]'
                 )
 
         _bentoml_config_options += " " if _bentoml_config_options else "" + " ".join(_bentoml_config_options_opts)
 
         start_env.update(
             {
-                llm.config.__openllm_env__.framework: llm.config.__openllm_env__.get_framework_env(),
-                llm.config.__openllm_env__.model_config: llm.config.model_dump_json().decode(),
+                env.framework: env.get_framework_env(),
+                env.model_config: llm.config.model_dump_json().decode(),
                 "OPENLLM_MODEL": model_name,
                 "OPENLLM_MODEL_ID": llm.model_id,
                 "BENTOML_DEBUG": str(openllm.utils.get_debug_mode()),
@@ -280,7 +322,8 @@ class OpenLLMCommandGroup(BentoMLCommandGroup):
         """
         # The following logics is similar to one of BentoMLCommandGroup
 
-        from bentoml._internal.configuration import DEBUG_ENV_VAR, QUIET_ENV_VAR
+        from bentoml._internal.configuration import (DEBUG_ENV_VAR,
+                                                     QUIET_ENV_VAR)
 
         @click.option("-q", "--quiet", envvar=QUIET_ENV_VAR, is_flag=True, default=False, help="Suppress all output.")
         @click.option(
@@ -668,11 +711,15 @@ def start_grpc_cli():
 @output_option
 @click.option("--overwrite", is_flag=True, help="Overwrite existing Bento for given LLM if it already exists.")
 @workers_per_resource_option(click, build=True)
+@quantize_option(click)
+@bettertransformer_option(click)
 def build(
     model_name: str,
     model_id: str | None,
     overwrite: bool,
     output: OutputLiteral,
+    quantize: t.Literal["8bit", "4bit", "gptq"] | None,
+    bettertransformer: bool | None,
     workers_per_resource: float | None,
 ):
     """Package a given models into a Bento.
@@ -695,6 +742,8 @@ def build(
         model_name,
         __cli__=True,
         model_id=model_id,
+        quantize=quantize,
+        bettertransformer=bettertransformer,
         _workers_per_resource=workers_per_resource,
         _overwrite_existing_bento=overwrite,
     )
@@ -764,20 +813,20 @@ def models(output: OutputLiteral, show_available: bool):
         for m in models:
             config = openllm.AutoConfig.for_model(m)
             runtime_impl: tuple[t.Literal["pt", "flax", "tf"], ...] = tuple()
-            if config.__openllm_model_name__ in openllm.MODEL_MAPPING_NAMES:
+            if config["model_name"] in openllm.MODEL_MAPPING_NAMES:
                 runtime_impl += ("pt",)
-            if config.__openllm_model_name__ in openllm.MODEL_FLAX_MAPPING_NAMES:
+            if config["model_name"] in openllm.MODEL_FLAX_MAPPING_NAMES:
                 runtime_impl += ("flax",)
-            if config.__openllm_model_name__ in openllm.MODEL_TF_MAPPING_NAMES:
+            if config["model_name"] in openllm.MODEL_TF_MAPPING_NAMES:
                 runtime_impl += ("tf",)
             json_data[m] = {
-                "model_id": config.__openllm_model_ids__,
-                "url": config.__openllm_url__,
-                "requires_gpu": config.__openllm_requires_gpu__,
+                "model_id": config["model_ids"],
+                "url": config["url"],
+                "requires_gpu": config["requires_gpu"],
                 "runtime_impl": runtime_impl,
                 "installation": "pip install openllm" if m not in extras else f'pip install "openllm[{m}]"',
             }
-            converted.extend([convert_transformers_model_name(i) for i in config.__openllm_model_ids__])
+            converted.extend([convert_transformers_model_name(i) for i in config["model_ids"]])
             if openllm.utils.DEBUG:
                 try:
                     openllm.AutoLLM.for_model(m, llm_config=config)
@@ -950,7 +999,7 @@ def query_(
         _echo(res["responses"], fg="white")
 
 
-@cli.command()
+@cli.command(name="download")
 @click.argument(
     "model_name",
     type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING.keys()]),
@@ -967,10 +1016,10 @@ def download_models(model_name: str, model_id: str | None, output: OutputLiteral
         openllm.utils.configure_logging()
 
     config = openllm.AutoConfig.for_model(model_name)
-    env = config.__openllm_env__.get_framework_env()
-    if env == "flax":
+    envvar = config["env"].get_framework_env()
+    if envvar == "flax":
         model = openllm.AutoFlaxLLM.for_model(model_name, model_id=model_id, llm_config=config)
-    elif env == "tf":
+    elif envvar == "tf":
         model = openllm.AutoTFLLM.for_model(model_name, model_id=model_id, llm_config=config)
     else:
         model = openllm.AutoLLM.for_model(model_name, model_id=model_id, llm_config=config)
@@ -978,11 +1027,11 @@ def download_models(model_name: str, model_id: str | None, output: OutputLiteral
     try:
         _ref = bentoml.transformers.get(model.tag)
         if output == "pretty":
-            _echo(f"{model_name} is already setup for framework '{env}': {str(_ref.tag)}", nl=True, fg="yellow")
+            _echo(f"{model_name} is already setup for framework '{envvar}': {str(_ref.tag)}", nl=True, fg="yellow")
         elif output == "json":
             _echo(
                 orjson.dumps(
-                    {"previously_setup": True, "framework": env, "model": str(_ref.tag)}, option=orjson.OPT_INDENT_2
+                    {"previously_setup": True, "framework": envvar, "model": str(_ref.tag)}, option=orjson.OPT_INDENT_2
                 ).decode(),
                 fg="white",
             )
@@ -1016,7 +1065,7 @@ def download_models(model_name: str, model_id: str | None, output: OutputLiteral
         elif output == "json":
             _echo(
                 orjson.dumps(
-                    {"previously_setup": False, "framework": env, "tag": str(_ref.tag)},
+                    {"previously_setup": False, "framework": envvar, "tag": str(_ref.tag)},
                     option=orjson.OPT_INDENT_2,
                 ).decode()
             )
diff --git a/src/openllm/models/chatglm/configuration_chatglm.py b/src/openllm/models/chatglm/configuration_chatglm.py
index d330a161..c1b40f8b 100644
--- a/src/openllm/models/chatglm/configuration_chatglm.py
+++ b/src/openllm/models/chatglm/configuration_chatglm.py
@@ -46,7 +46,7 @@ class ChatGLMConfig(openllm.LLMConfig):
 
     retain_history: bool = openllm.LLMConfig.Field(
         False,
-        description="""Whether to retain history given to the model. 
+        description="""Whether to retain history given to the model.
         If set to True, then the model will retain given history.""",
     )
 
diff --git a/src/openllm/models/dolly_v2/configuration_dolly_v2.py b/src/openllm/models/dolly_v2/configuration_dolly_v2.py
index 49584ee0..1b98c638 100644
--- a/src/openllm/models/dolly_v2/configuration_dolly_v2.py
+++ b/src/openllm/models/dolly_v2/configuration_dolly_v2.py
@@ -38,6 +38,7 @@ class DollyV2Config(openllm.LLMConfig):
         "timeout": 3600000,
         "trust_remote_code": True,
         "url": "https://github.com/databrickslabs/dolly",
+        "use_pipeline": True,
         "default_id": "databricks/dolly-v2-3b",
         "model_ids": ["databricks/dolly-v2-3b", "databricks/dolly-v2-7b", "databricks/dolly-v2-12b"],
     }
diff --git a/src/openllm/models/falcon/configuration_falcon.py b/src/openllm/models/falcon/configuration_falcon.py
index c046d783..4689d81a 100644
--- a/src/openllm/models/falcon/configuration_falcon.py
+++ b/src/openllm/models/falcon/configuration_falcon.py
@@ -29,6 +29,7 @@ class FalconConfig(openllm.LLMConfig):
         "trust_remote_code": True,
         "requires_gpu": True,
         "timeout": int(36e6),
+        "use_pipeline": True,
         "url": "https://falconllm.tii.ae/",
         "requirements": ["einops", "xformers", "safetensors"],
         "default_id": "tiiuae/falcon-7b",
diff --git a/src/openllm/models/flan_t5/configuration_flan_t5.py b/src/openllm/models/flan_t5/configuration_flan_t5.py
index a23861bd..0b7379a7 100644
--- a/src/openllm/models/flan_t5/configuration_flan_t5.py
+++ b/src/openllm/models/flan_t5/configuration_flan_t5.py
@@ -61,6 +61,7 @@ class FlanT5Config(openllm.LLMConfig):
             "google/flan-t5-xl",
             "google/flan-t5-xxl",
         ],
+        "model_type": "seq2seq_lm",
     }
 
     class GenerationConfig:
diff --git a/src/openllm/models/stablelm/modeling_stablelm.py b/src/openllm/models/stablelm/modeling_stablelm.py
index c1b5c085..15914924 100644
--- a/src/openllm/models/stablelm/modeling_stablelm.py
+++ b/src/openllm/models/stablelm/modeling_stablelm.py
@@ -47,13 +47,12 @@ class StableLM(openllm.LLM["transformers.GPTNeoXForCausalLM", "transformers.GPTN
 
     def llm_post_init(self):
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        self.load_in_mha = True if not torch.cuda.is_available() else False
+        self.bettertransformer = True if not torch.cuda.is_available() else False
 
     @property
     def import_kwargs(self):
         model_kwds = {
             "torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32,
-            "load_in_8bit": False,
             "device_map": "auto" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None,
         }
         tokenizer_kwds: dict[str, t.Any] = {}
diff --git a/src/openllm/models/starcoder/modeling_starcoder.py b/src/openllm/models/starcoder/modeling_starcoder.py
index 7160e022..5c7e459a 100644
--- a/src/openllm/models/starcoder/modeling_starcoder.py
+++ b/src/openllm/models/starcoder/modeling_starcoder.py
@@ -47,8 +47,7 @@ class StarCoder(openllm.LLM["transformers.GPTBigCodeForCausalLM", "transformers.
     def import_kwargs(self):
         model_kwds = {
             "device_map": "auto" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None,
-            "load_in_8bit": True if torch.cuda.device_count() > 1 else False,
-            "torch_dtype": torch.float16,
+            "torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32,
         }
         tokenizer_kwds = {"padding_side": "left"}
         return model_kwds, tokenizer_kwds
@@ -62,7 +61,6 @@ class StarCoder(openllm.LLM["transformers.GPTBigCodeForCausalLM", "transformers.
         **attrs: t.Any,
     ) -> bentoml.Model:
         torch_dtype = attrs.pop("torch_dtype", torch.float16)
-        load_in_8bit = attrs.pop("load_in_8bit", True)
         device_map = attrs.pop("device_map", "auto")
 
         tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, **tokenizer_kwds)
@@ -74,7 +72,7 @@ class StarCoder(openllm.LLM["transformers.GPTBigCodeForCausalLM", "transformers.
         )
 
         model = transformers.AutoModelForCausalLM.from_pretrained(
-            model_id, torch_dtype=torch_dtype, load_in_8bit=load_in_8bit, device_map=device_map, **attrs
+            model_id, torch_dtype=torch_dtype, device_map=device_map, **attrs
         )
         try:
             return bentoml.transformers.save_model(tag, model, custom_objects={"tokenizer": tokenizer})
diff --git a/src/openllm/utils/__init__.py b/src/openllm/utils/__init__.py
index 3324a2a0..cde27c03 100644
--- a/src/openllm/utils/__init__.py
+++ b/src/openllm/utils/__init__.py
@@ -15,42 +15,38 @@
 Utilities function for OpenLLM. User can import these function for convenience, but
 we won't ensure backward compatibility for these functions. So use with caution.
 """
-from __future__ import annotations
+from __future__ import annotations as _annotations
 
+import functools
+import logging
 import os
 import sys
 import types
 import typing as t
 
-from bentoml._internal.configuration import get_debug_mode as get_debug_mode
-from bentoml._internal.configuration import get_quiet_mode as get_quiet_mode
-from bentoml._internal.configuration import set_debug_mode as set_debug_mode
-from bentoml._internal.configuration import set_quiet_mode as set_quiet_mode
-from bentoml._internal.log import configure_logging as configure_logging
-from bentoml._internal.log import configure_server_logging as configure_server_logging
+from bentoml._internal.configuration import (get_debug_mode, get_quiet_mode,
+                                             set_debug_mode, set_quiet_mode)
+from bentoml._internal.log import configure_logging, configure_server_logging
 from bentoml._internal.types import LazyType
+from bentoml._internal.utils import (LazyLoader, bentoml_cattr,
+                                     copy_file_to_fs_folder, first_not_none,
+                                     pkg, reserve_free_port,
+                                     resolve_user_filepath)
 
-# NOTE: The following exports useful utils from bentoml
-from bentoml._internal.utils import LazyLoader as LazyLoader
-from bentoml._internal.utils import bentoml_cattr as bentoml_cattr
-from bentoml._internal.utils import copy_file_to_fs_folder as copy_file_to_fs_folder
-from bentoml._internal.utils import first_not_none as first_not_none
-from bentoml._internal.utils import pkg as pkg
-from bentoml._internal.utils import reserve_free_port as reserve_free_port
-from bentoml._internal.utils import resolve_user_filepath as resolve_user_filepath
+from .lazy import LazyModule
 
-from .lazy import LazyModule as LazyModule
+logger = logging.getLogger(__name__)
 
 try:
-    from typing import GenericAlias as TypingGenericAlias  # type: ignore
+    from typing import GenericAlias as _TypingGenericAlias  # type: ignore
 except ImportError:
     # python < 3.9 does not have GenericAlias (list[int], tuple[str, ...] and so on)
-    TypingGenericAlias = ()
+    _TypingGenericAlias = ()
 
 if sys.version_info < (3, 10):
-    WithArgsTypes = (TypingGenericAlias,)
+    _WithArgsTypes = (_TypingGenericAlias,)
 else:
-    WithArgsTypes: t.Any = (
+    _WithArgsTypes: t.Any = (
         t._GenericAlias,  # type: ignore (_GenericAlias is the actual GenericAlias implementation)
         types.GenericAlias,
         types.UnionType,
@@ -61,7 +57,7 @@ def lenient_issubclass(cls: t.Any, class_or_tuple: type[t.Any] | tuple[type[t.An
     try:
         return isinstance(cls, type) and issubclass(cls, class_or_tuple)  # type: ignore[arg-type]
     except TypeError:
-        if isinstance(cls, WithArgsTypes):
+        if isinstance(cls, _WithArgsTypes):
             return False
         raise
 
@@ -72,27 +68,25 @@ def gpu_count() -> tuple[int, ...]:
     return tuple(NvidiaGpuResource.from_system())
 
 
+# Cache object.__setattr__ once to save an attribute lookup on every assignment
+_object_setattr = object.__setattr__
+
+
+def non_intrusive_setattr(obj: t.Any, name: str, value: t.Any) -> None:
+    """This makes sure that we don't overwrite any existing attributes on the object"""
+    _setattr = functools.partial(setattr, obj) if isinstance(obj, type) else _object_setattr.__get__(obj)
+
+    if not hasattr(obj, name):
+        _setattr(name, value)
+
+
 DEBUG = sys.flags.dev_mode or (not sys.flags.ignore_environment and bool(os.environ.get("OPENLLMDEVDEBUG")))
 
-_extras = {
-    "get_debug_mode": get_debug_mode,
-    "get_quiet_mode": get_quiet_mode,
-    "set_debug_mode": set_debug_mode,
-    "set_quiet_mode": set_quiet_mode,
-    "configure_logging": configure_logging,
-    "configure_server_logging": configure_server_logging,
-    "LazyType": LazyType,
-    "LazyLoader": LazyLoader,
-    "LazyModule": LazyModule,
-    "bentoml_cattr": bentoml_cattr,
-    "copy_file_to_fs_folder": copy_file_to_fs_folder,
-    "first_not_none": first_not_none,
-    "pkg": pkg,
-    "reserve_free_port": reserve_free_port,
-    "resolve_user_filepath": resolve_user_filepath,
-    "lenient_issubclass": lenient_issubclass,
-    "gpu_count": gpu_count,
-    "DEBUG": DEBUG,
+
+# XXX: Define or import every public class and function above this line,
+# since _extras is built from the locals() of this module at this point.
+_extras: dict[str, t.Any] = {
+    k: v for k, v in locals().items() if not isinstance(v, types.ModuleType) and not k.startswith("_")
 }
 
 _import_structure = {
@@ -108,23 +102,46 @@ _import_structure = {
         "is_flax_available",
         "is_tf_available",
         "is_torch_available",
+        "is_bitsandbytes_available",
         "require_backends",
     ],
 }
 
 if t.TYPE_CHECKING:
+    # NOTE: The following exports useful utils from bentoml
+    from . import LazyLoader as LazyLoader
+    from . import LazyType as LazyType
     from . import analytics as analytics
+    from . import bentoml_cattr as bentoml_cattr
     from . import codegen as codegen
+    from . import configure_logging as configure_logging
+    from . import configure_server_logging as configure_server_logging
+    from . import copy_file_to_fs_folder as copy_file_to_fs_folder
     from . import dantic as dantic
+    from . import first_not_none as first_not_none
+    from . import get_debug_mode as get_debug_mode
+    from . import get_quiet_mode as get_quiet_mode
+    from . import gpu_count as gpu_count
+    from . import lenient_issubclass as lenient_issubclass
+    from . import non_intrusive_setattr as non_intrusive_setattr
+    from . import pkg as pkg
+    from . import reserve_free_port as reserve_free_port
+    from . import resolve_user_filepath as resolve_user_filepath
+    from . import set_debug_mode as set_debug_mode
+    from . import set_quiet_mode as set_quiet_mode
     from .import_utils import ENV_VARS_TRUE_VALUES as ENV_VARS_TRUE_VALUES
     from .import_utils import DummyMetaclass as DummyMetaclass
     from .import_utils import ModelEnv as ModelEnv
-    from .import_utils import is_cpm_kernels_available as is_cpm_kernels_available
+    from .import_utils import \
+        is_bitsandbytes_available as is_bitsandbytes_available
+    from .import_utils import \
+        is_cpm_kernels_available as is_cpm_kernels_available
     from .import_utils import is_einops_available as is_einops_available
     from .import_utils import is_flax_available as is_flax_available
     from .import_utils import is_tf_available as is_tf_available
     from .import_utils import is_torch_available as is_torch_available
     from .import_utils import require_backends as require_backends
+    from .lazy import LazyModule as LazyModule
 else:
     import sys
 
diff --git a/src/openllm/utils/analytics.py b/src/openllm/utils/analytics.py
index af4bc2a3..36a5d59a 100644
--- a/src/openllm/utils/analytics.py
+++ b/src/openllm/utils/analytics.py
@@ -81,7 +81,7 @@ class StartInitEvent(_internal_analytics.schemas.EventMeta):
 
     @staticmethod
     def handler(llm_config: openllm.LLMConfig) -> StartInitEvent:
-        return StartInitEvent(model_name=llm_config.__openllm_model_name__, llm_config=llm_config.model_dump())
+        return StartInitEvent(model_name=llm_config["model_name"], llm_config=llm_config.model_dump())
 
 
 def track_start_init(
diff --git a/src/openllm/utils/import_utils.py b/src/openllm/utils/import_utils.py
index b7197ba2..324710b0 100644
--- a/src/openllm/utils/import_utils.py
+++ b/src/openllm/utils/import_utils.py
@@ -61,6 +61,7 @@ _tf_available = importlib.util.find_spec("tensorflow") is not None
 _flax_available = importlib.util.find_spec("jax") is not None and importlib.util.find_spec("flax") is not None
 _einops_available = _is_package_available("einops")
 _cpm_kernel_available = _is_package_available("cpm_kernels")
+_bitsandbytes_available = _is_package_available("bitsandbytes")
 
 
 def is_einops_available():
@@ -71,6 +72,10 @@ def is_cpm_kernels_available():
     return _cpm_kernel_available
 
 
+def is_bitsandbytes_available():
+    return _bitsandbytes_available
+
+
 def is_torch_available():
     global _torch_available
     if USE_TORCH in ENV_VARS_TRUE_AND_AUTO_VALUES and USE_TF not in ENV_VARS_TRUE_VALUES:
diff --git a/tests/_strategies/_configuration.py b/tests/_strategies/_configuration.py
index a8b6c02a..f6d6081b 100644
--- a/tests/_strategies/_configuration.py
+++ b/tests/_strategies/_configuration.py
@@ -37,6 +37,9 @@ def model_settings(draw: st.DrawFn):
         requires_gpu=st.booleans(),
         trust_remote_code=st.booleans(),
         requirements=st.none() | st.lists(st.text(), min_size=1),
+        use_pipeline=st.booleans(),
+        model_type=st.sampled_from(["causal_lm", "seq2seq_lm"]),
+        runtime=st.sampled_from(["transformers", "cpp"]),
         name_type=st.sampled_from(["dasherize", "lowercase"]),
         timeout=st.integers(min_value=3600),
         workers_per_resource=st.one_of(st.integers(min_value=1), st.floats(min_value=0.1, max_value=1.0)),
diff --git a/tests/test_configuration.py b/tests/test_configuration.py
index 39af2549..c18c6f54 100644
--- a/tests/test_configuration.py
+++ b/tests/test_configuration.py
@@ -23,7 +23,8 @@ from hypothesis import assume, given
 from hypothesis import strategies as st
 
 import openllm
-from openllm._configuration import GenerationConfig, ModelSettings, _field_env_key
+from openllm._configuration import (GenerationConfig, ModelSettings,
+                                    _field_env_key)
 from openllm.utils import DEBUG
 
 from ._strategies._configuration import make_llm_config, model_settings
@@ -67,7 +68,7 @@ def test_forbidden_access():
 
 @given(model_settings())
 def test_class_normal_gen(gen_settings: ModelSettings):
-    assume(gen_settings["default_id"] and gen_settings["model_ids"])
+    assume(gen_settings["default_id"] and all(i for i in gen_settings["model_ids"]))
     cl_: type[openllm.LLMConfig] = make_llm_config("NotFullLLM", gen_settings)
     assert issubclass(cl_, openllm.LLMConfig)
     for key in gen_settings:
diff --git a/tools/assert-model-table-latest b/tools/assert-model-table-latest
index 6e39843b..bc9f438e 100755
--- a/tools/assert-model-table-latest
+++ b/tools/assert-model-table-latest
@@ -3,11 +3,10 @@
 from __future__ import annotations
 
 import os
+import subprocess
 
 from markdown_it import MarkdownIt
 
-import openllm
-
 md = MarkdownIt()
 
 ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
@@ -17,7 +16,7 @@ with open(os.path.join(ROOT, "README.md"), "r") as f:
 # NOTE: Currently, we only have one table in README, which is the Model readme.
 table = [r for r in readme if r.type == "html_block" and r.content.startswith("<td><a")]
 
-available = len(openllm.CONFIG_MAPPING.keys())
+available = subprocess.check_output(["openllm", "models", "-o", "porcelain"]).strip().decode("utf-8").count("\n") + 1
 
 on_table = len(table)  # NOTE: minus the header
 
diff --git a/tools/update-optional-dependencies.py b/tools/update-optional-dependencies.py
index 07c6ffc4..439b314d 100755
--- a/tools/update-optional-dependencies.py
+++ b/tools/update-optional-dependencies.py
@@ -31,9 +31,9 @@ FLAN_T5_DEPS = ["flax", "jax", "jaxlib", "tensorflow", "keras"]
 OPENAI_DEPS = ["openai", "tiktoken"]
 
 _base_requirements = {
-    inflection.dasherize(name): config.__openllm_requirements__
-    for name, config in openllm.CONFIG_MAPPING.items()
-    if config.__openllm_requirements__
+    inflection.dasherize(name): config_cls.__openllm_requirements__
+    for name, config_cls in openllm.CONFIG_MAPPING.items()
+    if config_cls.__openllm_requirements__
 }
 
 # NOTE: update this table when adding new external dependencies
diff --git a/tools/update-readme.py b/tools/update-readme.py
index 3a77a5b6..24c5e2f7 100755
--- a/tools/update-readme.py
+++ b/tools/update-readme.py
@@ -47,13 +47,13 @@ def main() -> int:
         "Model Ids": [],
     }
     max_install_len_div = 0
-    for name, config in openllm.CONFIG_MAPPING.items():
+    for name, config_cls in openllm.CONFIG_MAPPING.items():
         dashed = inflection.dasherize(name)
         formatted["Model"].append(dashed)
-        formatted["URL"].append(config.__openllm_url__)
+        formatted["URL"].append(config_cls.__openllm_url__)
         formatted["GPU"].append("✅")
-        formatted["CPU"].append("✅" if not config.__openllm_requires_gpu__ else "❌")
-        formatted["Model Ids"].append(config.__openllm_model_ids__)
+        formatted["CPU"].append("✅" if not config_cls.__openllm_requires_gpu__ else "❌")
+        formatted["Model Ids"].append(config_cls.__openllm_model_ids__)
         if dashed in deps:
             instruction = f'```bash\npip install "openllm[{dashed}]"\n```'
         else: