feat: quantization (#27)

2026-05-24 08:34:37 -04:00 · 2023-06-16 18:10:50 -04:00
parent 19bc7e3116
commit ded8a9f809
33 changed files with 711 additions and 309 deletions
--- a/.github/SECURITY.md
+++ b/.github/SECURITY.md
@@ -0,0 +1,22 @@
+# Security Policy
+
+## Supported Versions
+
+We are following [semantic versioning](https://semver.org/) with a strict
+backward-compatibility policy. We can ensure that all minor and major versions
+are backward compatible. We are more lenient with patch releases, as development
+can move quickly.
+
+If you are just using the public API, then feel free to always upgrade. Whenever
+there is a breaking change, it will first be announced with a `DeprecationWarning`
+for a period of 12 months before the old behavior is removed.
+
+> **Warning:** Every package under `openllm` that has an underscore
+> prefix is exempt from this. They are considered private API and can change
+> at any time. However, you can be sure that all public APIs, classes and
+> functions will be backward-compatible.
+
+## Reporting a Vulnerability
+
+To report a security vulnerability, please send us an
+[email](mailto:contact@bentoml.com).
--- a/.github/actions/create_release_and_archive.sh
+++ b/.github/actions/create_release_and_archive.sh
@@ -13,7 +13,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-
 set -o errexit -o nounset -o pipefail

 # Set by GH actions, see
@@ -41,4 +40,6 @@ All available models: \`\`\`python -m openllm.models\`\`\`

 To start a LLM: \`\`\`python -m openllm start dolly-v2\`\`\`

+Find more information about this release in the [CHANGELOG.md](https://github.com/bentoml/OpenLLM/blob/main/CHANGELOG.md)
+
 EOF
--- a/.github/actions/release.sh
+++ b/.github/actions/release.sh
@@ -35,6 +35,7 @@ echo "Releasing version $RELEASE_VERSION..." && hatch version "${RELEASE_VERSION

 jq --arg release_version "${RELEASE_VERSION}" '.version = $release_version' < package.json > package.json.tmp && mv package.json.tmp package.json

+towncrier build --yes --version "${RELEASE_VERSION}" && git add CHANGELOG.md changelog.d
 git add src/openllm/__about__.py package.json && git commit -sm "infra: prepare for release ${RELEASE_VERSION} [generated]"
 git push origin main

--- a/.github/actions/setup-repo/action.yml
+++ b/.github/actions/setup-repo/action.yml
@@ -53,7 +53,7 @@ runs:
          ${{ steps.get-cache-key-prefix.outputs.prefix }}-pypi-
    - name: Install dependencies
      shell: bash
-      run: pip install -e ".[all]" hatch -vv
+      run: pip install -e ".[all]" hatch towncrier -vv
    - name: Install pyright
      shell: bash
      run: npm install -g npm@^7 pyright
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -22,8 +22,7 @@ on:
 env:
  LINES: 120
  COLUMNS: 120
-  BENTOML_DO_NOT_TRACK: True
-  PYTEST_PLUGINS: bentoml.testing.pytest.plugin
+  OPENLLM_DO_NOT_TRACK: True
 # https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#defaultsrun
 defaults:
  run:
@@ -38,8 +37,10 @@ jobs:
          fetch-depth: 0
      - name: Setup CI
        uses: ./.github/actions/setup-repo
-      - name: Format check
-        run: hatch run dev:style
+      - name: Running changelog check
+        run: hatch run changelog
+      - name: Format and lint check
+        run: hatch run fmt
      - name: Type check
        if: ${{ github.event_name == 'pull_request' }}
        run: git diff --name-only --diff-filter=AM "origin/$GITHUB_BASE_REF" -z -- '*.py{,i}' | xargs -0 --no-run-if-empty hatch run dev:typing
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -13,20 +13,31 @@
 # limitations under the License.

 ci:
-  autoupdate_schedule: monthly
+  autoupdate_schedule: weekly
+  skip: [check-models-table-update, check-models-table-update]
+exclude: '.*\.(css|js|svg)$'
 repos:
-  - repo: local
+  - repo: https://github.com/charliermarsh/ruff-pre-commit
+    rev: 'v0.0.272'
    hooks:
-      - id: format-check
-        name: format-check
-        language: system
-        entry: hatch run dev:style
-        always_run: true
-        pass_filenames: false
+      - id: ruff
+        args: [--fix, --exit-non-zero-on-fix, --show-fixes]
+  - repo: https://github.com/psf/black
+    rev: 23.3.0
+    hooks:
+      - id: black-jupyter
+        files: '/(src|tests|docs|examples|typings)/'
+  - repo: https://github.com/econchick/interrogate
+    rev: 1.5.0
+    hooks:
+      - id: interrogate
+        types: [python]
+        exclude: ^(docs|tools|tests)
+        args: [--config=pyproject.toml]
  - repo: local
    hooks:
      - id: check-license-header
-        name: license-header-check
+        name: check for license headers
        entry: ./tools/assert-license-headers
        language: script
        exclude_types:
@@ -36,13 +47,14 @@ repos:
        exclude: |
          (?x)^(
              tools/.*|
+              changelog.d/.*|
              typings/.*|
              .github/.*
          )$
  - repo: local
    hooks:
      - id: check-models-table-update
-        name: check-models-table-update
+        name: check if table in README.md is up-to-date
        entry: ./tools/assert-model-table-latest
        language: script
        files: README.md
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -0,0 +1,19 @@
+# Changelog
+
+We are following [semantic versioning](https://semver.org/) with strict
+backward-compatibility policy.
+
+You can find our backwards-compatibility policy
+[here](https://github.com/bentoml/openllm/blob/main/.github/SECURITY.md).
+
+Changes for the upcoming release can be found in the
+['changelog.d' directory](https://github.com/bentoml/openllm/tree/main/changelog.d)
+in our repository.
+
+<!--
+Do *NOT* add changelog entries here!
+
+This changelog is managed by towncrier and is compiled at release time.
+-->
+
+<!-- towncrier release notes start -->
--- a/DEVELOPMENT.md
+++ b/DEVELOPMENT.md
@@ -100,7 +100,7 @@ After setting up your environment, here's how you can start contributing:
 3. Run all formatter and linter with `hatch`:

   ```bash
-   hatch run dev:fmt
+   hatch run fmt
   ```
 4. Write tests that verify your feature or fix (see
   [Writing Tests](#writing-tests) below).
@@ -127,8 +127,8 @@ After setting up your environment, here's how you can start contributing:
 ## Using a custom fork

 If you wish to use a modified version of OpenLLM, install your fork from source
-with `pip install -e` and set `OPENLLM_DEV_BUILD=True`, so that Bentos built will
-include the generated wheels for OpenLLM in the bundle.
+with `pip install -e` and set `OPENLLM_DEV_BUILD=True`, so that Bentos built
+will include the generated wheels for OpenLLM in the bundle.

 ## Writing Tests

@@ -154,3 +154,61 @@ To release a new version, use `./tools/run-release-action`. It requires `gh`,
 ```

 > Note that currently this workflow can only be run by the BentoML team.
+
+## Changelog
+
+_modeled after the [attrs](https://github.com/python-attrs/attrs) workflow_
+
+If the change is noteworthy, there needs to be a changelog entry so users can
+learn about it!
+
+To avoid merge conflicts, we use the
+[_Towncrier_](https://pypi.org/project/towncrier) package to manage our
+changelog. _Towncrier_ uses independent _Markdown_ files for each pull request –
+so-called _news fragments_ – instead of one monolithic changelog file. On
+release, those news fragments are compiled into
+[`CHANGELOG.md`](https://github.com/bentoml/openllm/blob/main/CHANGELOG.md).
+
+You don't need to install _Towncrier_ yourself, you just have to abide by a few
+simple rules:
+
+- For each pull request, add a new file into `changelog.d` with a filename
+  adhering to the `<pr#>.(change|deprecation|breaking|feature).md` schema: For
+  example, `changelog.d/42.change.md` for a non-breaking change that is proposed
+  in pull request #42.
+- As with other docs, please use [semantic newlines] within news fragments.
+- Wrap symbols like modules, functions, or classes into backticks so they are
+  rendered in a `monospace font`.
+- Wrap arguments into asterisks like in docstrings:
+  `Added new argument *an_argument*.`
+- If you mention functions or other callables, add parentheses at the end of
+  their names: `openllm.func()` or `openllm.LLMClass.method()`. This makes the
+  changelog a lot more readable.
+- Prefer simple past tense or constructions with "now". For example:
+
+  - Added `LLM.func()`.
+  - `LLM.func()` now doesn't do X.Y.Z anymore when passed the _foobar_ argument.
+- If you want to reference multiple issues, copy the news fragment to another
+  filename. _Towncrier_ will merge all news fragments with identical contents
+  into one entry with multiple links to the respective pull requests.
+
+Example entries:
+
+```md
+Added `LLM.func()`.
+The feature really _is_ awesome.
+```
+
+or:
+
+```md
+`openllm.utils.func()` now doesn't X.Y.Z anymore when passed the _foobar_ argument.
+The bug really _was_ nasty.
+```
+
+---
+
+`hatch run changelog` will render the current changelog to the terminal if you have
+any doubts.
+
+[semantic newlines]: https://rhodesmill.org/brandon/2012/one-sentence-per-line/
--- a/changelog.d/27.feature.md
+++ b/changelog.d/27.feature.md
@@ -0,0 +1,14 @@
+Added support for quantization during serving time.
+`openllm start` now supports `--quantize 8bit` and `--quantize 4bit`.
+`GPTQ` quantization support is on the roadmap and currently
+being worked on.
+`openllm start` now also supports `--bettertransformer` to use
+`BetterTransformer` for serving.
+Refactored `openllm.LLMConfig` so that it can be used with `__getitem__`
+to access the config value: `openllm.DollyV2Config()['requirements']`,
+the order being: `__openllm_*__ > self.<key> > __openllm_generation_class__ > __openllm_extras__`.
+Added `towncrier` workflow to easily generate changelog entries.
+Added `use_pipeline`, `bettertransformer` flags into `ModelSettings`.
+`LLMConfig` now supports the `__dataclass_transform__` protocol to help
+with type-checking.
+Changed `openllm download-models` to `openllm download`.
--- a/changelog.d/template.md.jinja
+++ b/changelog.d/template.md.jinja
@@ -0,0 +1,29 @@
+{%- if versiondata["version"] == "main" -%}
+## Changes for the Upcoming Release
+
+:::{warning}
+These changes reflect the current [development progress](https://github.com/bentoml/openllm/tree/main) and have **not** been part of an official PyPI release yet.
+To try out the latest change, one can do: `pip install -U git+https://github.com/bentoml/openllm.git@main`
+:::
+{% else -%}
+## [{{ versiondata["version"] }}](https://github.com/bentoml/openllm/tree/{{ versiondata["version"] }})
+{%- endif %}
+
+{% for section, _ in sections.items() %}
+{% if sections[section] %}
+{% for category, val in definitions.items() if category in sections[section] %}
+
+### {{ definitions[category]['name'] }}
+
+{% for text, values in sections[section][category].items() %}
+- {{ text }}
+  {{ values|join(',\n  ') }}
+{% endfor %}
+
+{% endfor %}
+{% else %}
+No significant changes.
+
+
+{% endif %}
+{% endfor %}
--- a/examples/langchain-chains-demo/README.md
+++ b/examples/langchain-chains-demo/README.md
@@ -23,9 +23,3 @@ docker run \
  ..image_name

 ```
-
-
-
-
-
-
--- a/examples/langchain-tools-demo/README.md
+++ b/examples/langchain-tools-demo/README.md
@@ -24,9 +24,3 @@ docker run \
  ..image_name

 ```
-
-
-
-
-
-
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -104,6 +104,7 @@ packages = ["src/openllm", "src/openllm_client"]
 [tool.hatch.envs.default]
 dependencies = [
    "coverage[toml]>=6.5",
+    # NOTE: Tests strategies with Hypothesis and pytest, and snapshot testing with syrupy
    "pytest",
    "pytest-asyncio>=0.21.0",
    "pytest-xdist[psutil]",
@@ -111,42 +112,69 @@ dependencies = [
    "pytest-mock",
    "pytest-randomly",
    "pytest-rerunfailures",
+    "hypothesis",
+    "syrupy",
    # NOTE: To run all hooks
    "pre-commit",
    # NOTE: Using under ./tools/update-optional-dependencies.py
    "tomlkit",
    # NOTE: Using under ./tools/update-readme.py
    "markdown-it-py",
-    # NOTE: Tests strategies with Hypothesis
-    "hypothesis",
-    # NOTE: snapshot testing
-    "syrupy",
+    # NOTE: pyright for type
+    "pyright",
+    # NOTE: towncrier for changelog
+    "towncrier",
 ]
 [tool.hatch.envs.default.scripts]
-cov = ["test-cov", "cov-report"]
+changelog = "towncrier build --version main --draft"
+cov = ["cov-test", "cov-report"]
 cov-report = ["- coverage combine", "coverage report"]
+cov-test = "coverage run -m pytest {args:tests}"
+fmt = "pre-commit run --all-files"
 setup = "pre-commit install"
 test = "pytest {args:tests}"
-test-cov = "coverage run -m pytest {args:tests}"
+typing = "pyright {args:src/openllm tests}"
+
+[tool.towncrier]
+directory = "changelog.d"
+filename = "CHANGELOG.md"
+issue_format = "[#{issue}](https://github.com/bentoml/openllm/issues/{issue})"
+name = "openllm"
+start_string = "<!-- towncrier release notes start -->\n"
+template = "changelog.d/template.md.jinja"
+title_format = ""
+underlines = ["", "", ""]
+
+[[tool.towncrier.section]]
+path = ""
+
+[[tool.towncrier.type]]
+directory = "breaking"
+name = "Backwards-incompatible Changes"
+showcontent = true
+
+[[tool.towncrier.type]]
+directory = "deprecation"
+name = "Deprecations"
+showcontent = true
+
+[[tool.towncrier.type]]
+directory = "change"
+name = "Changes"
+showcontent = true
+
+[[tool.towncrier.type]]
+directory = "feature"
+name = "Features"
+showcontent = true

 [[tool.hatch.envs.all.matrix]]
 python = ["3.8", "3.9", "3.10", "3.11"]

-[tool.hatch.envs.dev]
-dependencies = [
-    "ruff",
-    "pyright",
-    "hatch",
-    # NOTE: black for generating service file.
-    "black[jupyter]==23.3.0",
-]
-detached = true
-
-[tool.hatch.envs.dev.scripts]
-all = ["fmt", "typing"]
-fmt = ["black {args:.}", "black --pyi {args:typings/}", "ruff --fix {args:.}", "style"]
-style = ["ruff {args:.}", "black --check --diff {args:.}"]
-typing = "pyright {args:src/openllm tests}"
+[tool.interrogate]
+fail-under = 100
+verbose = 2
+whitelist-regex = ["test_.*"]

 [tool.pytest.ini_options]
 addopts = ["-rfEX", "-pno:warnings"]
@@ -206,12 +234,6 @@ force-single-line = true
 known-first-party = ["openllm", "bentoml", 'transformers']
 lines-after-imports = 2

-[tool.ruff.flake8-quotes]
-inline-quotes = "single"
-
-[tool.ruff.flake8-tidy-imports]
-ban-relative-imports = "all"
-
 [tool.ruff.per-file-ignores]
 # Tests can use magic values, assertions, and relative imports
 "__init__.py" = ["E402", "F401", "F403", "F811"]
@@ -222,7 +244,7 @@ ban-relative-imports = "all"
 [tool.pyright]
 analysis.useLibraryCodeForTypes = true
 enableTypeIgnoreComments = true
-include = ["src/", "tests/"]
+include = ["src/", "tests/", "tools/", "examples/"]
 pythonVersion = "3.11"
 reportMissingImports = "none"
 reportMissingModuleSource = "warning"
--- a/src/openllm/_configuration.py
+++ b/src/openllm/_configuration.py
@@ -65,8 +65,11 @@ from deepmerge.merger import Merger

 import openllm

-from .exceptions import ForbiddenAttributeError, GpuNotAvailableError, OpenLLMException
-from .utils import DEBUG, LazyType, bentoml_cattr, codegen, dantic, first_not_none, lenient_issubclass
+from .exceptions import (ForbiddenAttributeError, GpuNotAvailableError,
+                         OpenLLMException)
+from .utils import (DEBUG, ENV_VARS_TRUE_VALUES, LazyType, bentoml_cattr,
+                    codegen, dantic, first_not_none, lenient_issubclass,
+                    non_intrusive_setattr)

 if hasattr(t, "Required"):
    from typing import Required
@@ -78,6 +81,11 @@ if hasattr(t, "NotRequired"):
 else:
    from typing_extensions import NotRequired

+if hasattr(t, "dataclass_transform"):
+    from typing import dataclass_transform
+else:
+    from typing_extensions import dataclass_transform
+
 _T = t.TypeVar("_T")


@@ -85,7 +93,8 @@ if t.TYPE_CHECKING:
    import tensorflow as tf
    import torch
    import transformers
-    from attr import _CountingAttr, _make_init, _make_repr, _transform_attrs  # type: ignore
+    from attr import (_CountingAttr, _make_init, _make_repr,  # type: ignore
+                      _transform_attrs)
    from transformers.generation.beam_constraints import Constraint

    from ._types import ClickFunctionWrapper, F, O_co, P
@@ -103,7 +112,8 @@ else:
    ItemgetterAny = itemgetter
    # NOTE: Using internal API from attr here, since we are actually
    # allowing subclass of openllm.LLMConfig to become 'attrs'-ish
-    from attr._make import _CountingAttr, _make_init, _make_repr, _transform_attrs
+    from attr._make import (_CountingAttr, _make_init, _make_repr,
+                            _transform_attrs)

    transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
    torch = openllm.utils.LazyLoader("torch", globals(), "torch")
@@ -369,6 +379,11 @@ class GenerationConfig:
            )
        self.__attrs_init__(**attrs)

+    def __getitem__(self, item: str) -> t.Any:
+        if hasattr(self, item):
+            return getattr(self, item)
+        raise KeyError(f"GenerationConfig has no attribute {item}")
+

 bentoml_cattr.register_unstructure_hook_factory(
    lambda cls: attr.has(cls) and lenient_issubclass(cls, GenerationConfig),
@@ -411,6 +426,11 @@ class ModelSettings(t.TypedDict, total=False):
    requires_gpu: bool
    trust_remote_code: bool
    requirements: t.Optional[ListStr]
+
+    # llm implementation specifics
+    use_pipeline: bool
+    bettertransformer: bool
+    model_type: t.Literal["causal_lm", "seq2seq_lm"]
    runtime: t.Literal["transformers", "cpp"]

    # naming convention, only name_type is needed to infer from the class
@@ -458,19 +478,19 @@ _ModelSettings: type[attr.AttrsInstance] = codegen.add_method_dunders(
 def structure_settings(cl_: type[LLMConfig], cls: type[t.Any]):
    if not lenient_issubclass(cl_, LLMConfig):
        raise RuntimeError(f"Given LLMConfig must be a subclass type of 'LLMConfig', got '{cl_}' instead.")
-    settings = cl_.__config__

-    if settings is None:
-        raise RuntimeError("Given LLMConfig must have '__config__' defined.")
+    if not hasattr(cl_, "__config__") or getattr(cl_, "__config__") is None:
+        raise RuntimeError("Given LLMConfig must have '__config__' that is not None defined.")
+
+    settings = cl_.__config__
+    assert settings

    required = [i.name for i in attr.fields(cls) if i.metadata.get("required", False)]
-    if any(k not in settings for k in required):
-        raise ValueError(f"The following keys are required under '__config__': {required}")
-    if not settings["default_id"] or not settings["model_ids"]:
-        raise ValueError("Make sure that either 'default_id', 'model_ids' are not emptied under '__config__'.")

-    if any(k in settings for k in ("env", "start_name", "model_name")):
-        raise ValueError("The following keys are not allowed under '__config__': env, start_name, model_name")
+    missing = set(required) - set(settings.keys())
+
+    if len(missing) > 0:
+        raise ValueError(f"The following keys are required under '__config__': {required} (missing: {missing})")

    if "generation_class" in settings:
        raise ValueError(
@@ -478,10 +498,16 @@ def structure_settings(cl_: type[LLMConfig], cls: type[t.Any]):
            f"all required attributes under '{cl_}.GenerationConfig' when defining the class."
        )

+    if not settings["default_id"] or not settings["model_ids"]:
+        raise ValueError("Either 'default_id' or 'model_ids' are emptied under '__config__' (required fields).")
+
+    # NOTE: value in __config__ can be None, hence we use setdefault
+    # to update in-place
    _cl_name = cl_.__name__.replace("Config", "")
-    name_type = first_not_none(settings.get("name_type"), "dasherize")
-    model_name = inflection.underscore(_cl_name) if name_type == "dasherize" else _cl_name.lower()
-    start_name = inflection.dasherize(model_name) if name_type == "dasherize" else model_name
+    name_type = settings.setdefault("name_type", "dasherize")
+    model_name = settings.setdefault(
+        "model_name", inflection.underscore(_cl_name) if name_type == "dasherize" else _cl_name.lower()
+    )
    partialed = functools.partial(_field_env_key, model_name=model_name, suffix="generation")

    def auto_env_transformers(_: t.Any, fields: list[attr.Attribute[t.Any]]) -> list[attr.Attribute[t.Any]]:
@@ -498,21 +524,9 @@ def structure_settings(cl_: type[LLMConfig], cls: type[t.Any]):
            for f in fields
        ]

-    return cls(
-        default_id=settings["default_id"],
-        model_ids=settings["model_ids"],
-        url=settings.get("url", ""),
-        requires_gpu=settings.get("requires_gpu", False),
-        trust_remote_code=settings.get("trust_remote_code", False),
-        requirements=settings.get("requirements", None),
-        name_type=name_type,
-        model_name=model_name,
-        start_name=start_name,
-        runtime=settings.get("runtime", "transformers"),
-        env=openllm.utils.ModelEnv(model_name),
-        timeout=settings.get("timeout", 3600),
-        workers_per_resource=settings.get("workers_per_resource", 1),
-        generation_class=attr.make_class(
+    settings.setdefault(
+        "generation_class",
+        attr.make_class(
            f"{_cl_name}GenerationConfig",
            [],
            bases=(GenerationConfig,),
@@ -520,10 +534,40 @@ def structure_settings(cl_: type[LLMConfig], cls: type[t.Any]):
            weakref_slot=True,
            frozen=False,
            repr=True,
+            collect_by_mro=True,
            field_transformer=auto_env_transformers,
        ),
    )

+    env = settings.setdefault("env", openllm.utils.ModelEnv(model_name))
+    requires_gpu = settings.setdefault("requires_gpu", False)
+
+    # bettertransformer support
+    bettertransformer = settings.setdefault(
+        "bettertransformer",
+        os.environ.get(env.bettertransformer, str(False)).upper() in ENV_VARS_TRUE_VALUES,
+    )
+    if requires_gpu:
+        # For all models that requires GPU, no need to offload it to BetterTransformer
+        # use bitsandbytes or gptq instead for latency improvement
+        if bettertransformer:
+            logger.debug("Model requires GPU by default, disabling bettertransformer.")
+        bettertransformer = False
+    settings["bettertransformer"] = bettertransformer
+
+    # default value
+    settings.setdefault("url", "")
+    settings.setdefault("use_pipeline", False)
+    settings.setdefault("model_type", "causal_lm")
+    settings.setdefault("trust_remote_code", False)
+    settings.setdefault("requirements", None)
+    settings.setdefault("timeout", 3600)
+    settings.setdefault("workers_per_resource", 1)
+    settings.setdefault("runtime", "transformers")
+    settings.setdefault("start_name", inflection.dasherize(model_name) if name_type == "dasherize" else model_name)
+
+    return cls(**settings)
+

 bentoml_cattr.register_structure_hook(_ModelSettings, structure_settings)

@@ -534,15 +578,16 @@ def _setattr_class(attr_name: str, value_var: t.Any, add_dunder: bool = False):
    We can't use the cached object.__setattr__ since we are setting
    attributes to a class.
    """
-    if add_dunder:
-        return f"setattr(cls, '{attr_name}', __add_dunder(cls, {value_var}))"
-    return f"setattr(cls, '{attr_name}', {value_var})"
+    val = f"__add_dunder(cls, {value_var})" if add_dunder else value_var
+    return f"setattr(cls, '{attr_name}', {val})"


 _dunder_add = {"generation_class"}


-def _make_assignment_script(cls: type[LLMConfig], attributes: attr.AttrsInstance) -> t.Callable[..., None]:
+def _make_assignment_script(
+    cls: type[LLMConfig], attributes: attr.AttrsInstance, _prefix: t.LiteralString = "openllm"
+) -> t.Callable[..., None]:
    """Generate the assignment script with prefix attributes __openllm_<value>__"""
    args: ListStr = []
    globs: DictStrAny = {
@@ -555,7 +600,7 @@ def _make_assignment_script(cls: type[LLMConfig], attributes: attr.AttrsInstance

    lines: ListStr = ["_getattr = _cached_getattribute_get(_cached_attribute)"]
    for attr_name, field in attr.fields_dict(attributes.__class__).items():
-        arg_name = field.metadata.get("target", f"__openllm_{inflection.underscore(attr_name)}__")
+        arg_name = field.metadata.get("target", f"__{_prefix}_{inflection.underscore(attr_name)}__")
        args.append(f"{attr_name}=getattr(_cached_attribute, '{attr_name}')")
        lines.append(_setattr_class(arg_name, attr_name, add_dunder=attr_name in _dunder_add))
        annotations[attr_name] = field.type
@@ -568,6 +613,23 @@ def _make_assignment_script(cls: type[LLMConfig], attributes: attr.AttrsInstance
 _reserved_namespace = {"__config__", "GenerationConfig"}


+@dataclass_transform(order_default=True, field_specifiers=(attr.field, dantic.Field))
+def __llm_config_transform__(cls: type[LLMConfig]) -> type[LLMConfig]:
+    kwargs: dict[str, t.Any] = {}
+    if hasattr(cls, "GenerationConfig"):
+        kwargs = {k: v for k, v in vars(cls.GenerationConfig).items() if not k.startswith("_")}
+    non_intrusive_setattr(
+        cls,
+        "__dataclass_transform__",
+        {
+            "order_default": True,
+            "field_specifiers": (attr.field, dantic.Field),
+            "kwargs": kwargs,
+        },
+    )
+    return cls
+
+
@attr.define(slots=True)
 class LLMConfig:
    """
@@ -640,11 +702,11 @@ class LLMConfig:
    # NOTE: The following is handled via __init_subclass__, and is only used for TYPE_CHECKING
    if t.TYPE_CHECKING:
        # NOTE: public attributes to override
-        __config__: ModelSettings | None = None
+        __config__: ModelSettings | None = Field(None)
        """Internal configuration for this LLM model. Each of the field in here will be populated
        and prefixed with __openllm_<value>__"""

-        GenerationConfig: type = type
+        GenerationConfig: type = Field(None)
        """Users can override this subclass of any given LLMConfig to provide GenerationConfig
        default value. For example:

@@ -663,7 +725,7 @@ class LLMConfig:
        def __attrs_init__(self, **attrs: t.Any):
            """Generated __attrs_init__ for LLMConfig subclass that follows the attrs contract."""

-        __attrs_attrs__: tuple[attr.Attribute[t.Any], ...] = tuple()
+        __attrs_attrs__: tuple[attr.Attribute[t.Any], ...] = Field(None, init=False)
        """Since we are writing our own __init_subclass__, which is an alternative way for __prepare__,
        we want openllm.LLMConfig to be attrs-like dataclass that has pydantic-like interface.
        __attrs_attrs__ will be handled dynamically by __init_subclass__.
@@ -683,33 +745,38 @@ class LLMConfig:
        __openllm_url__: str = Field(None, init=False)
        """The resolved url for this LLMConfig."""

-        __openllm_requires_gpu__: bool = False
+        __openllm_requires_gpu__: bool = Field(None, init=False)
        """Determines if this model is only available on GPU. By default it supports GPU and fallback to CPU."""

-        __openllm_trust_remote_code__: bool = False
+        __openllm_trust_remote_code__: bool = Field(False)
        """Whether to always trust remote code"""

-        __openllm_requirements__: ListStr | None = None
+        __openllm_requirements__: ListStr | None = Field(None)
        """The default PyPI requirements needed to run this given LLM. By default, we will depend on
        bentoml, torch, transformers."""

        __openllm_env__: openllm.utils.ModelEnv = Field(None, init=False)
        """A ModelEnv instance for this LLMConfig."""

-        __openllm_model_name__: str = ""
+        __openllm_model_name__: str = Field("")
        """The normalized version of __openllm_start_name__, determined by __openllm_name_type__"""

-        __openllm_start_name__: str = ""
+        __openllm_model_type__: t.Literal["causal_lm", "seq2seq_lm"] = Field("causal_lm")
+        """The model type for this given LLM. By default, it should be causal language modeling.
+        Currently supported 'causal_lm' or 'seq2seq_lm'
+        """
+
+        __openllm_start_name__: str = Field("")
        """Default name to be used with `openllm start`"""

-        __openllm_name_type__: t.Literal["dasherize", "lowercase"] = "dasherize"
+        __openllm_name_type__: t.Literal["dasherize", "lowercase"] = Field("dasherize")
        """the default name typed for this model. "dasherize" will convert the name to lowercase and
        replace spaces with dashes. "lowercase" will convert the name to lowercase."""

-        __openllm_timeout__: int = 3600
+        __openllm_timeout__: int = Field(36000)
        """The default timeout to be set for this given LLM."""

-        __openllm_workers_per_resource__: int | float = 1
+        __openllm_workers_per_resource__: int | float = Field(1)
        """The number of workers per resource. This is used to determine the number of workers to use for this model.
        For example, if this is set to 0.5, then OpenLLM will use 1 worker per 2 resources. If this is set to 1, then
        OpenLLM will use 1 worker per resource. If this is set to 2, then OpenLLM will use 2 workers per resource.
@@ -720,10 +787,23 @@ class LLMConfig:
        By default, it is set to 1.
        """

-        __openllm_runtime__: t.Literal["transformers", "cpp"] = "transformers"
+        __openllm_runtime__: t.Literal["transformers", "cpp"] = Field("transformers")
        """The runtime to use for this model. Possible values are `transformers` or `cpp`. See
        LlaMA for more information."""

+        __openllm_use_pipeline__: bool = Field(False)
+        """Whether this LLM will use HuggingFace Pipeline API. By default, this is set to False.
+        The reason for this to be here is because we want to access this object before loading
+        the _bentomodel. This is because we will actually download the model weights when accessing
+        _bentomodel.
+        """
+
+        __openllm_bettertransformer__: bool = Field(False)
+        """Whether to use BetterTransformer for this given LLM. This depends per model
+        architecture. By default, we will use BetterTransformer for T5 and StableLM models,
+        and set to False for every other models.
+        """
+
        __openllm_default_id__: str = Field(None)
        """Return the default model to use when using 'openllm start <model_id>'.
        This could be one of the keys in 'self.model_ids' or custom users model."""
@@ -804,6 +884,7 @@ class LLMConfig:
        these["generation_config"] = cls.Field(
            default=cls.__openllm_generation_class__(),
            description=inspect.cleandoc(cls.__openllm_generation_class__.__doc__ or ""),
+            type=GenerationConfig,
        )

        # Generate the base __attrs_attrs__ transformation here.
@@ -884,6 +965,7 @@ class LLMConfig:
        cls.__openllm_hints__ = {
            f.name: f.type for ite in map(attr.fields, (cls, cls.__openllm_generation_class__)) for f in ite
        }
+        cls = __llm_config_transform__(cls)

    def __setattr__(self, attr: str, value: t.Any):
        if attr in _reserved_namespace:
@@ -909,14 +991,7 @@ class LLMConfig:
        if generation_config is None:
            generation_config = {k: v for k, v in attrs.items() if k in _generation_cl_dict}
        else:
-            generation_keys = {k for k in attrs if k in _generation_cl_dict}
-            if len(generation_keys) > 0:
-                logger.warning(
-                    "Both 'generation_config' and keys for 'generation_config' are passed."
-                    " The following keys in 'generation_config' will be overriden be keywords argument: %s",
-                    ", ".join(generation_keys),
-                )
-                config_merger.merge(generation_config, {k: v for k, v in attrs.items() if k in generation_keys})
+            config_merger.merge(generation_config, {k: v for k, v in attrs.items() if k in _generation_cl_dict})

        for k in _cached_keys:
            if k in generation_config or attrs.get(k) is None:
@@ -942,7 +1017,32 @@ class LLMConfig:
            )

        # The rest of attrs should only be the attributes to be passed to __attrs_init__
-        self.__attrs_init__(generation_config=self.__openllm_generation_class__(**generation_config), **attrs)
+        self.__attrs_init__(generation_config=self["generation_class"](**generation_config), **attrs)
+
+    def __getitem__(self, item: str | t.Any) -> t.Any:
+        """Allow accessing LLMConfig values with dictionary syntax. Lookup order is always
+
+        __openllm_*__ > self.key > generation_config > __openllm_extras__
+
+        This method is purely for convenience, and should not be used for performance critical code.
+        """
+        if not isinstance(item, str):
+            raise TypeError(f"LLM only supports string indexing, not {item.__class__.__name__}")
+        if item in _reserved_namespace:
+            raise ForbiddenAttributeError(
+                f"'{item}' is a reserved namespace for {self.__class__} and should not be access nor modified."
+            )
+        internal_attributes = f"__openllm_{item}__"
+        if hasattr(self, internal_attributes):
+            return getattr(self, internal_attributes)
+        elif hasattr(self, item):
+            return getattr(self, item)
+        elif hasattr(self.__openllm_generation_class__, item):
+            # Read from the instance so the configured value is returned, not the class-level attribute.
+            return getattr(self.generation_config, item)
+        elif item in self.__openllm_extras__:
+            return self.__openllm_extras__[item]
+        raise KeyError(item)

    def __getattribute__(self, item: str) -> t.Any:
        if item in _reserved_namespace:
@@ -976,10 +1076,8 @@ class LLMConfig:

    def model_dump(self, flatten: bool = False, **_: t.Any):
        dumped = bentoml_cattr.unstructure(self)
-        generation_config = bentoml_cattr.unstructure(self.generation_config)
-        if not flatten:
-            dumped["generation_config"] = generation_config
-        else:
+        if flatten:
+            generation_config = dumped.pop("generation_config")
            dumped.update(generation_config)
        return dumped

@@ -1028,11 +1126,11 @@ class LLMConfig:
        key_to_remove: ListStr = []

        for k, v in attrs.items():
-            if k.startswith(f"{self.__openllm_model_name__}_generation_"):
-                llm_config_attrs["generation_config"][k[len(self.__openllm_model_name__ + "_generation_") :]] = v
+            if k.startswith(f"{self['model_name']}_generation_"):
+                llm_config_attrs["generation_config"][k[len(self["model_name"] + "_generation_") :]] = v
                key_to_remove.append(k)
-            elif k.startswith(f"{self.__openllm_model_name__}_"):
-                llm_config_attrs[k[len(self.__openllm_model_name__ + "_") :]] = v
+            elif k.startswith(f"{self['model_name']}_"):
+                llm_config_attrs[k[len(self["model_name"] + "_") :]] = v
                key_to_remove.append(k)

        return self.model_construct_env(**llm_config_attrs), {k: v for k, v in attrs.items() if k not in key_to_remove}
--- a/src/openllm/_llm.py
+++ b/src/openllm/_llm.py
@@ -35,7 +35,8 @@ from bentoml._internal.types import ModelSignatureDict
 import openllm

 from .exceptions import ForbiddenAttributeError, OpenLLMException
-from .utils import ENV_VARS_TRUE_VALUES, LazyLoader, bentoml_cattr
+from .utils import (LazyLoader, bentoml_cattr, is_bitsandbytes_available,
+                    non_intrusive_setattr)

 if t.TYPE_CHECKING:
    import torch
@@ -60,7 +61,6 @@ else:

 logger = logging.getLogger(__name__)

-# NOTE: `1-2` -> text-generation and text2text-generation
 FRAMEWORK_TO_AUTOCLASS_MAPPING = {
    "pt": ("AutoModelForCausalLM", "AutoModelForSeq2SeqLM"),
    "tf": ("TFAutoModelForCausalLM", "TFAutoModelForSeq2SeqLM"),
@@ -132,6 +132,7 @@ def import_model(
            ),
        )

+    # NOTE: `1-2` -> text-generation and text2text-generation
    if type(config) in transformers.MODEL_FOR_CAUSAL_LM_MAPPING:
        idx = 0
    elif type(config) in transformers.MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING:
@@ -243,25 +244,6 @@ class LLMInterface(ABC):
        raise NotImplementedError


-def _default_post_init(self: LLM[t.Any, t.Any]):
-    # load_in_mha: Whether to apply BetterTransformer (or Torch MultiHeadAttention) during inference load.
-    #              See https://pytorch.org/blog/a-better-transformer-for-fast-transformer-encoder-inference/
-    #              for more information.
-    # NOTE: set a default variable to transform to BetterTransformer by default for inference
-    if self.config.__openllm_runtime__ == "cpp":
-        self.load_in_mha = False
-    else:
-        self.load_in_mha = (
-            os.environ.get(self.config_class.__openllm_env__.bettertransformer, str(False)).upper()
-            in ENV_VARS_TRUE_VALUES
-        )
-        if self.config_class.__openllm_requires_gpu__:
-            # For all models that requires GPU, no need to offload it to BetterTransformer
-            # use bitsandbytes instead
-
-            self.load_in_mha = False
-
-
 _M = t.TypeVar("_M")
 _T = t.TypeVar("_T")

@@ -285,6 +267,8 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
        _model_attrs: dict[str, t.Any]
        _tokenizer_attrs: dict[str, t.Any]

+        bettertransformer: bool
+
    def __init_subclass__(cls):
        cd = cls.__dict__
        prefix_class_name_config = cls.__name__
@@ -310,20 +294,6 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
                    "Missing required key 'config_class'. Make sure to define it within the LLM subclass."
                )

-        if cls.llm_post_init is not LLMInterface.llm_post_init:
-            original_llm_post_init = cd["llm_post_init"]
-
-            def wrapped_llm_post_init(self: t.Self) -> None:
-                """We need to both initialize private attributes and call the user-defined model_post_init
-                method.
-                """
-                _default_post_init(self)
-                original_llm_post_init(self)
-
-            cls.llm_post_init = wrapped_llm_post_init
-        else:
-            setattr(cls, "llm_post_init", _default_post_init)
-
        if cls.import_model is LLMInterface.import_model:
            # using the default import model
            setattr(cls, "import_model", functools.partial(import_model, _model_framework=implementation))
@@ -353,6 +323,8 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
        model_id: str | None = None,
        llm_config: openllm.LLMConfig | None = None,
        *args: t.Any,
+        quantize: t.Literal["8bit", "4bit", "gptq"] | None = None,
+        bettertransformer: bool | None = None,
        **attrs: t.Any,
    ):
        """Initialize the LLM with given pretrained model.
@@ -429,6 +401,9 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
            model_id: The pretrained model to use. Defaults to None. If None, 'self.default_id' will be used.
            llm_config: The config to use for this LLM. Defaults to None. If not passed, OpenLLM
                        will use `config_class` to construct default configuration.
+            quantize: The quantization to use for this LLM. Defaults to None. Possible values
+                      include 8bit, 4bit and gptq.
+            bettertransformer: Whether to use BetterTransformer with this model. Defaults to None (use the config value).
            *args: The args to be passed to the model.
            **attrs: The kwargs to be passed to the model.

@@ -438,16 +413,102 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
                                   However, if `model_id` is a path, this argument is recomended to include.
        """

-        load_in_mha = attrs.pop("load_in_mha", False)
        openllm_model_version = attrs.pop("openllm_model_version", None)

+        # low_cpu_mem_usage is only available for model
+        # this is helpful on system with low memory to avoid OOM
+        low_cpu_mem_usage = attrs.pop("low_cpu_mem_usage", True)
+
+        # quantization setup
+        quantization_config = attrs.pop("quantization_config", None)
+        # 8 bit configuration (the misspelled 'llm_int8_threshhold' key is still accepted)
+        int8_threshold = attrs.pop("llm_int8_threshold", attrs.pop("llm_int8_threshhold", 6.0))
+        cpu_offloading = attrs.pop("llm_int8_enable_fp32_cpu_offload", False)
+        int8_skip_modules: list[str] | None = attrs.pop("llm_int8_skip_modules", None)
+        int8_has_fp16_weight = attrs.pop("llm_int8_has_fp16_weight", False)
+        # 4 bit configuration
+        int4_compute_dtype = attrs.pop("llm_bnb_4bit_compute_dtype", torch.bfloat16)
+        int4_quant_type = attrs.pop("llm_bnb_4bit_quant_type", "nf4")
+        int4_use_double_quant = attrs.pop("llm_bnb_4bit_use_double_quant", True)
+
+        if quantization_config and quantize:
+            raise ValueError(
+                "'quantization_config' and 'quantize' are mutually exclusive. Either customise "
+                "your quantization_config or use the quantize argument."
+            )
+        if quantization_config is None:
+            # quantize is an openllm.LLM feature, where we can quantize the model
+            # with bitsandbytes or quantization aware training.
+            if quantize is not None:
+                if not is_bitsandbytes_available():
+                    raise RuntimeError(
+                        "Quantization requires bitsandbytes to be installed. Make "
+                        "sure to install OpenLLM with 'pip install \"openllm[fine-tune]\"'"
+                    )
+                logger.debug(
+                    "'quantize' is not None. %s will use a default 'quantization_config' for %s. "
+                    "If you want to customise the quantization config, make sure to pass your "
+                    "own 'quantization_config'",
+                    self,
+                    quantize,
+                )
+                if quantize == "8bit":
+                    if int8_skip_modules is None:
+                        int8_skip_modules = []
+                    # NOTE(review): 'self.config' is only assigned after this block - confirm it resolves here.
+                    if "lm_head" not in int8_skip_modules and self.config["model_type"] == "causal_lm":
+                        logger.debug("Skipping 'lm_head' for quantization for %s", self)
+                        int8_skip_modules.append("lm_head")
+                    quantization_config = transformers.BitsAndBytesConfig(
+                        load_in_8bit=True,
+                        llm_int8_enable_fp32_cpu_offload=cpu_offloading,
+                        llm_int8_threshold=int8_threshold,
+                        llm_int8_skip_modules=int8_skip_modules,
+                        llm_int8_has_fp16_weight=int8_has_fp16_weight,
+                    )
+                elif quantize == "4bit":
+                    trf_versions = openllm.utils.pkg.pkg_version_info("transformers")
+                    supports_kbits = trf_versions[:2] >= (4, 30)
+                    if supports_kbits:
+                        quantization_config = transformers.BitsAndBytesConfig(
+                            load_in_4bit=True,
+                            bnb_4bit_compute_dtype=int4_compute_dtype,
+                            bnb_4bit_quant_type=int4_quant_type,
+                            bnb_4bit_use_double_quant=int4_use_double_quant,
+                        )
+                    else:
+                        logger.warning(
+                            "'quantize' is set to 4bit, while the current transformers version %s does not support "
+                            "k-bit quantization. k-bit quantization is supported since transformers 4.30, therefore "
+                            "make sure to install the latest version of transformers either via PyPI or "
+                            "from git source: 'pip install git+https://github.com/huggingface/transformers'.",
+                            trf_versions,
+                        )
+                elif quantize == "gptq":
+                    # TODO: support GPTQ loading quantization
+                    if model_id is None:
+                        # Exceptions do not interpolate printf-style args, so the message is formatted eagerly.
+                        raise RuntimeError(
+                            f"'quantize={quantize}' requires passing custom path to quantized weights as we are "
+                            "unable to load the model on the fly. See https://github.com/qwopqwop200/GPTQ-for-LLaMa "
+                            f"for instruction on how to quantize '{type(self).__name__}' with GPTQ."
+                        )
+                    raise NotImplementedError("GPTQ is not supported yet.")
+                else:
+                    raise ValueError(f"'quantize' must be one of ['8bit', '4bit', 'gptq'], got {quantize} instead.")
+
+        attrs.update({"quantization_config": quantization_config})
+
        if llm_config is not None:
            logger.debug("Using given 'llm_config=(%s)' to initialize LLM", llm_config)
            self.config = llm_config
        else:
            self.config = self.config_class.model_construct_env(**attrs)
            # The rests of the kwargs that is not used by the config class should be stored into __openllm_extras__.
-            attrs = self.config.__openllm_extras__
+            attrs = self.config["extras"]
+
+        if not self.config["use_pipeline"]:
+            attrs["low_cpu_mem_usage"] = low_cpu_mem_usage

        model_kwds, tokenizer_kwds = {}, {}
        if self.__llm_init_kwargs__:
@@ -463,10 +524,11 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
            )

        if model_id is None:
-            model_id = os.environ.get(self.config.__openllm_env__.model_id, self.config.__openllm_default_id__)
+            model_id = os.environ.get(self.config["env"].model_id, self.config["default_id"])

        # NOTE: This is the actual given path or pretrained weight for this LLM.
-        assert model_id is not None
+        if t.TYPE_CHECKING:
+            assert model_id is not None
        self._model_id = model_id

        # parsing tokenizer and model kwargs
@@ -476,23 +538,24 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
        model_kwds.update({k: v for k, v in attrs.items() if not k.startswith(TOKENIZER_PREFIX)})

        # handle trust_remote_code
-        self.__llm_trust_remote_code__ = model_kwds.pop("trust_remote_code", self.config.__openllm_trust_remote_code__)
+        self.__llm_trust_remote_code__ = model_kwds.pop("trust_remote_code", self.config["trust_remote_code"])

        # NOTE: Save the args and kwargs for latter load
        self._model_args = args
        self._model_attrs = model_kwds
        self._tokenizer_attrs = tokenizer_kwds
-
-        # we allow users to overwrite the load_in_mha defined by the LLM subclass.
-        if load_in_mha:
-            logger.debug("Overwriting 'load_in_mha=%s' (base load_in_mha=%s)", load_in_mha, self.load_in_mha)
-            self.load_in_mha = load_in_mha
-
        self._openllm_model_version = openllm_model_version

        if self.__llm_post_init__:
            self.llm_post_init()

+        # we set it here so that we allow subclass to overwrite bettertransformer in llm_post_init
+        if bettertransformer:
+            logger.debug("Using %r with BetterTransformer", self)
+            self.bettertransformer = bettertransformer
+        else:
+            non_intrusive_setattr(self, "bettertransformer", self.config["bettertransformer"])
+
    def __setattr__(self, attr: str, value: t.Any):
        if attr in _reserved_namespace:
            raise ForbiddenAttributeError(
@@ -513,7 +576,7 @@ class LLM(LLMInterface, t.Generic[_M, _T]):

    @property
    def runner_name(self) -> str:
-        return f"llm-{self.config.__openllm_start_name__}-runner"
+        return f"llm-{self.config['start_name']}-runner"

    # NOTE: The section below defines a loose contract with langchain's LLM interface.
    @property
@@ -524,7 +587,7 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
    def identifying_params(self) -> dict[str, t.Any]:
        return {
            "configuration": self.config.model_dump_json().decode(),
-            "model_ids": orjson.dumps(self.config.__openllm_model_ids__).decode(),
+            "model_ids": orjson.dumps(self.config["model_ids"]).decode(),
        }

    @staticmethod
@@ -580,8 +643,8 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
                sys.executable,
                "-m",
                "openllm",
-                "download-models",
-                self.config.__openllm_start_name__,
+                "download",
+                self.config["start_name"],
                "--model-id",
                self.model_id,
                "--output",
@@ -625,7 +688,11 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
        kwds = self._model_attrs
        kwds["trust_remote_code"] = self.__llm_trust_remote_code__

-        if self.load_in_mha and "_pretrained_class" not in self._bentomodel.info.metadata:
+        is_pipeline = "_pretrained_class" in self._bentomodel.info.metadata
+        # differentiate when saving tokenizer or other pretrained type.
+        is_pretrained_model = is_pipeline and "_framework" in self._bentomodel.info.metadata
+
+        if self.bettertransformer and is_pipeline and self.config["use_pipeline"]:
            # This is a pipeline, provide a accelerator args
            kwds["accelerator"] = "bettertransformer"

@@ -636,10 +703,10 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
                self.__llm_model__ = self._bentomodel.load_model(*self._model_args, **kwds)

            if (
-                self.load_in_mha
-                and all(i in self._bentomodel.info.metadata for i in ("_framework", "_pretrained_class"))
+                self.bettertransformer
+                and is_pretrained_model
                and self._bentomodel.info.metadata["_framework"] == "torch"
-                and self.config.__openllm_runtime__ == "transformers"
+                and self.config["runtime"] == "transformers"
            ):
                # BetterTransformer is currently only supported on PyTorch.
                from optimum.bettertransformer import BetterTransformer
@@ -767,7 +834,7 @@ class LLM(LLMInterface, t.Generic[_M, _T]):

        # NOTE: returning the two langchain API's to the runner
        return types.new_class(
-            inflection.camelize(self.config.__openllm_model_name__) + "Runner",
+            inflection.camelize(self.config["model_name"]) + "Runner",
            (bentoml.Runner,),
            exec_body=lambda ns: ns.update(
                {
@@ -776,17 +843,17 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
                    "llm": self,  # NOTE: self reference to LLM
                    "config": self.config,
                    "__call__": _wrapped_generate_run,
-                    "__module__": f"openllm.models.{self.config.__openllm_model_name__}",
-                    "__doc__": self.config.__openllm_env__.start_docstring,
+                    "__module__": f"openllm.models.{self.config['model_name']}",
+                    "__doc__": self.config["env"].start_docstring,
                }
            ),
        )(
            types.new_class(
-                inflection.camelize(self.config.__openllm_model_name__) + "Runnable",
+                inflection.camelize(self.config["model_name"]) + "Runnable",
                (_Runnable,),
                {
                    "SUPPORTED_RESOURCES": ("nvidia.com/gpu", "cpu")
-                    if self.config.__openllm_requires_gpu__
+                    if self.config["requires_gpu"]
                    else ("nvidia.com/gpu",),
                    "llm_type": self.llm_type,
                    "identifying_params": self.identifying_params,
--- a/src/openllm/_package.py
+++ b/src/openllm/_package.py
@@ -76,17 +76,16 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any], llm_fs: FS) -> Pyth
    # first, then proceed to install everything inside the wheels/ folder.
    packages: list[str] = ["openllm"]

-    if llm.config.__openllm_requirements__ is not None:
-        packages.extend(llm.config.__openllm_requirements__)
+    if llm.config["requirements"] is not None:
+        packages.extend(llm.config["requirements"])

    if not (str(os.environ.get("BENTOML_BUNDLE_LOCAL_BUILD", False)).lower() == "false"):
        packages.append(f"bentoml>={'.'.join([str(i) for i in pkg.pkg_version_info('bentoml')])}")

-    to_use_framework = llm.config.__openllm_env__.get_framework_env()
+    env = llm.config["env"]
+    to_use_framework = env.get_framework_env()
    if to_use_framework == "flax":
-        assert (
-            utils.is_flax_available()
-        ), f"Flax is not available, while {llm.config.__openllm_env__.framework} is set to 'flax'"
+        assert utils.is_flax_available(), f"Flax is not available, while {env.framework} is set to 'flax'"
        packages.extend(
            [
                f"flax>={importlib.metadata.version('flax')}",
@@ -95,9 +94,7 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any], llm_fs: FS) -> Pyth
            ]
        )
    elif to_use_framework == "tf":
-        assert (
-            utils.is_tf_available()
-        ), f"TensorFlow is not available, while {llm.config.__openllm_env__.framework} is set to 'tf'"
+        assert utils.is_tf_available(), f"TensorFlow is not available, while {env.framework} is set to 'tf'"
        candidates = (
            "tensorflow",
            "tensorflow-cpu",
@@ -133,16 +130,17 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any], llm_fs: FS) -> Pyth
 def construct_docker_options(llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_resource: int | float) -> DockerOptions:
    _bentoml_config_options = os.environ.pop("BENTOML_CONFIG_OPTIONS", "")
    _bentoml_config_options_opts = [
-        "api_server.traffic.timeout=3600",  # NOTE: Currently we hardcode this value
-        f'runners."llm-{llm.config.__openllm_start_name__}-runner".traffic.timeout={llm.config.__openllm_timeout__}',
-        f'runners."llm-{llm.config.__openllm_start_name__}-runner".workers_per_resource={workers_per_resource}',
+        "api_server.traffic.timeout=36000",  # NOTE: Currently we hardcode this value
+        f'runners."llm-{llm.config["start_name"]}-runner".traffic.timeout={llm.config["timeout"]}',
+        f'runners."llm-{llm.config["start_name"]}-runner".workers_per_resource={workers_per_resource}',
    ]
    _bentoml_config_options += " " if _bentoml_config_options else "" + " ".join(_bentoml_config_options_opts)
+    env = llm.config["env"]
    return DockerOptions(
        cuda_version="11.6",  # NOTE: Torch 2.0 currently only support 11.6 as the latest CUDA version
        env={
-            llm.config.__openllm_env__.framework: llm.config.__openllm_env__.get_framework_env(),
-            "OPENLLM_MODEL": llm.config.__openllm_model_name__,
+            env.framework: env.get_framework_env(),
+            "OPENLLM_MODEL": llm.config["model_name"],
            "OPENLLM_MODEL_ID": llm.model_id,
            "BENTOML_DEBUG": str(get_debug_mode()),
            "BENTOML_CONFIG_OPTIONS": _bentoml_config_options,
@@ -180,7 +178,7 @@ def build(model_name: str, *, __cli__: bool = False, **attrs: t.Any) -> tuple[be
    try:
        os.environ["OPENLLM_MODEL"] = inflection.underscore(model_name)

-        to_use_framework = llm_config.__openllm_env__.get_framework_env()
+        to_use_framework = llm_config["env"].get_framework_env()
        if to_use_framework == "flax":
            llm = openllm.AutoFlaxLLM.for_model(model_name, model_id=model_id, llm_config=llm_config, **attrs)
        elif to_use_framework == "tf":
@@ -192,12 +190,10 @@ def build(model_name: str, *, __cli__: bool = False, **attrs: t.Any) -> tuple[be

        labels = dict(llm.identifying_params)
        labels.update({"_type": llm.llm_type, "_framework": to_use_framework})
-        service_name = f"generated_{llm.config.__openllm_model_name__}_service.py"
-        workers_per_resource = utils.first_not_none(
-            workers_per_resource, default=llm.config.__openllm_workers_per_resource__
-        )
+        service_name = f"generated_{llm_config['model_name']}_service.py"
+        workers_per_resource = utils.first_not_none(workers_per_resource, default=llm_config["workers_per_resource"])

-        with fs.open_fs(f"temp://llm_{llm.config.__openllm_model_name__}") as llm_fs:
+        with fs.open_fs(f"temp://llm_{llm_config['model_name']}") as llm_fs:
            # add service.py definition to this temporary folder
            utils.codegen.write_service(model_name, llm.model_id, service_name, llm_fs)

@@ -209,12 +205,12 @@ def build(model_name: str, *, __cli__: bool = False, **attrs: t.Any) -> tuple[be
                    raise bentoml.exceptions.NotFound("Overwriting previously saved Bento.")
                _previously_built = True
            except bentoml.exceptions.NotFound:
-                logger.info("Building Bento for LLM '%s'", llm.config.__openllm_start_name__)
+                logger.info("Building Bento for LLM '%s'", llm_config["start_name"])
                bento = bentoml.bentos.build(
                    f"{service_name}:svc",
                    name=bento_tag.name,
                    labels=labels,
-                    description=f"OpenLLM service for {llm.config.__openllm_start_name__}",
+                    description=f"OpenLLM service for {llm_config['start_name']}",
                    include=[
                        f for f in llm_fs.walk.files(filter=["*.py"])
                    ],  # NOTE: By default, we are using _service.py as the default service, for now.
--- a/src/openllm/_schema.py
+++ b/src/openllm/_schema.py
@@ -55,7 +55,7 @@ class GenerationInput:
    def for_model(cls, model_name: str, **attrs: t.Any) -> type[GenerationInput]:
        llm_config = openllm.AutoConfig.for_model(model_name, **attrs)
        return attr.make_class(
-            inflection.camelize(llm_config.__openllm_model_name__) + "GenerationInput",
+            inflection.camelize(llm_config["model_name"]) + "GenerationInput",
            attrs={
                "prompt": attr.field(type=str),
                "llm_config": attr.field(
--- a/src/openllm/_service.py
+++ b/src/openllm/_service.py
@@ -36,7 +36,7 @@ model_id = os.environ.get("OPENLLM_MODEL_ID", "{__model_id__}")  # openllm: mode
 llm_config = openllm.AutoConfig.for_model(model)
 runner = openllm.Runner(model, model_id=model_id, llm_config=llm_config)

-svc = bentoml.Service(name=f"llm-{llm_config.__openllm_start_name__}-service", runners=[runner])
+svc = bentoml.Service(name=f"llm-{llm_config['start_name']}-service", runners=[runner])


@svc.api(
@@ -55,8 +55,8 @@ async def generate_v1(input_dict: dict[str, t.Any]) -> openllm.GenerationOutput:
 def metadata_v1(_: str) -> openllm.MetadataOutput:
    return openllm.MetadataOutput(
        model_id=model_id,
-        timeout=llm_config.__openllm_timeout__,
-        model_name=llm_config.__openllm_model_name__,
-        framework=llm_config.__openllm_env__.get_framework_env(),
+        timeout=llm_config["timeout"],
+        model_name=llm_config["model_name"],
+        framework=llm_config["env"].get_framework_env(),
        configuration=llm_config.model_dump_json().decode(),
    )
--- a/src/openllm/cli.py
+++ b/src/openllm/cli.py
@@ -83,6 +83,29 @@ def _echo(text: t.Any, fg: str = "green", _with_style: bool = True, **attrs: t.A
    call(text, **attrs)


+def quantize_option(factory: t.Any):
+    help_str = """Running this model in quantized mode.
+    Note that GPTQ is currently working in progress and will be available soon.
+
+    NOTE: Quantization is only available for PyTorch models.
+    """
+    return factory.option(
+        "--quantize",
+        type=click.Choice(["8bit", "4bit", "gptq"]),
+        default=None,
+        help=help_str,
+    )
+
+
+def bettertransformer_option(factory: t.Any):
+    return factory.option(
+        "--bettertransformer",
+        is_flag=True,
+        default=None,
+        help="Use BetterTransformer wrapper to serve model",
+    )
+
+
 def start_model_command(
    model_name: str,
    group: click.Group,
@@ -108,29 +131,30 @@ def start_model_command(
    openllm.utils.configure_logging()

    llm_config = openllm.AutoConfig.for_model(model_name)
+    env = llm_config["env"]

    docstring = f"""\
-{llm_config.__openllm_env__.start_docstring}
+{env.start_docstring}
 \b
-Available model_id(s): {llm_config.__openllm_model_ids__} [default: {llm_config.__openllm_default_id__}]
+Available model_id(s): {llm_config['model_ids']} [default: {llm_config['default_id']}]
 """
    command_attrs: dict[str, t.Any] = {
-        "name": llm_config.__openllm_model_name__,
+        "name": llm_config["model_name"],
        "context_settings": _context_settings or {},
        "short_help": f"Start a LLMServer for '{model_name}' ('--help' for more details)",
        "help": docstring,
    }

    aliases: list[str] = []
-    if llm_config.__openllm_name_type__ == "dasherize":
-        aliases.append(llm_config.__openllm_start_name__)
+    if llm_config["name_type"] == "dasherize":
+        aliases.append(llm_config["start_name"])

    command_attrs["aliases"] = aliases if len(aliases) > 0 else None

    serve_decorator = _http_server_args if not _serve_grpc else _grpc_server_args

    available_gpu = openllm.utils.gpu_count()
-    if llm_config.__openllm_requires_gpu__ and len(available_gpu) < 1:
+    if llm_config["requires_gpu"] and len(available_gpu) < 1:
        # NOTE: The model requires GPU, therefore we will return a dummy command
        command_attrs.update(
            {
@@ -152,8 +176,13 @@ Available model_id(s): {llm_config.__openllm_model_ids__} [default: {llm_config.
    @llm_config.to_click_options
    @serve_decorator
    @cog.optgroup.group("General LLM Options")
-    @cog.optgroup.option("--server-timeout", type=int, default=None, help="Server timeout in seconds")
-    @model_id_option(cog.optgroup, model_env=llm_config.__openllm_env__)
+    @cog.optgroup.option(
+        "--server-timeout",
+        type=int,
+        default=None,
+        help="Server timeout in seconds",
+    )
+    @model_id_option(cog.optgroup, model_env=env)
    @cog.optgroup.option(
        "--device",
        type=tuple,
@@ -165,34 +194,47 @@ Available model_id(s): {llm_config.__openllm_model_ids__} [default: {llm_config.
        show_envvar=True,
    )
    @workers_per_resource_option(cog.optgroup)
-    @click.pass_context
+    @quantize_option(cog.optgroup)
+    @bettertransformer_option(cog.optgroup)
    def model_start(
-        ctx: click.Context,
        server_timeout: int | None,
        model_id: str | None,
        workers_per_resource: float | None,
        device: tuple[str, ...] | None,
+        quantize: t.Literal["8bit", "4bit", "gptq"] | None,
+        bettertransformer: bool | None,
        **attrs: t.Any,
    ) -> openllm.LLMConfig:
        config, server_attrs = llm_config.model_validate_click(**attrs)

-        if llm_config.__openllm_env__.get_framework_env() == "flax":
+        if quantize and env.get_framework_env() != "pt":
+            _echo("Quantization is only available for PyTorch models.", fg="yellow")
+
+        if env.get_framework_env() == "flax":
            llm = openllm.AutoFlaxLLM.for_model(model_name, model_id=model_id, llm_config=config, ensure_available=True)
-        elif llm_config.__openllm_env__.get_framework_env() == "tf":
+        elif env.get_framework_env() == "tf":
            llm = openllm.AutoTFLLM.for_model(model_name, model_id=model_id, llm_config=config, ensure_available=True)
        else:
-            llm = openllm.AutoLLM.for_model(model_name, model_id=model_id, llm_config=config, ensure_available=True)
+            llm = openllm.AutoLLM.for_model(
+                model_name,
+                model_id=model_id,
+                llm_config=config,
+                quantize=quantize,
+                bettertransformer=bettertransformer,
+                ensure_available=True,
+            )

-        if llm.config.__openllm_requirements__ is not None and len(llm.config.__openllm_requirements__) > 0:
+        requirements = config["requirements"]
+        if requirements is not None and len(requirements) > 0:
            _echo(
-                f"Make sure to have the following dependencies available: {llm.config.__openllm_requirements__}",
+                f"Make sure to have the following dependencies available: {requirements}",
                fg="yellow",
            )

        workers_per_resource = openllm.utils.first_not_none(
-            workers_per_resource, default=llm.config.__openllm_workers_per_resource__
+            workers_per_resource, default=config["workers_per_resource"]
        )
-        server_timeout = openllm.utils.first_not_none(server_timeout, default=llm.config.__openllm_timeout__)
+        server_timeout = openllm.utils.first_not_none(server_timeout, default=config["timeout"])

        num_workers = int(1 / workers_per_resource)
        if num_workers > 1:
@@ -216,26 +258,26 @@ Available model_id(s): {llm_config.__openllm_model_ids__} [default: {llm_config.
        _bentoml_config_options_opts = [
            "tracing.sample_rate=1.0",
            f"api_server.traffic.timeout={server_timeout}",
-            f'runners."llm-{llm.config.__openllm_start_name__}-runner".traffic.timeout={llm.config.__openllm_timeout__}',
-            f'runners."llm-{llm.config.__openllm_start_name__}-runner".workers_per_resource={workers_per_resource}',
+            f'runners."llm-{config["start_name"]}-runner".traffic.timeout={config["timeout"]}',
+            f'runners."llm-{config["start_name"]}-runner".workers_per_resource={workers_per_resource}',
        ]
        if device:
            if len(device) > 1:
                for idx, dev in enumerate(device):
                    _bentoml_config_options_opts.append(
-                        f'runners."llm-{llm.config.__openllm_start_name__}-runner".resources."nvidia.com/gpu"[{idx}]={dev}'
+                        f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"[{idx}]={dev}'
                    )
            else:
                _bentoml_config_options_opts.append(
-                    f'runners."llm-{llm.config.__openllm_start_name__}-runner".resources."nvidia.com/gpu"=[{device[0]}]'
+                    f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"=[{device[0]}]'
                )

        _bentoml_config_options += " " if _bentoml_config_options else "" + " ".join(_bentoml_config_options_opts)

        start_env.update(
            {
-                llm.config.__openllm_env__.framework: llm.config.__openllm_env__.get_framework_env(),
-                llm.config.__openllm_env__.model_config: llm.config.model_dump_json().decode(),
+                env.framework: env.get_framework_env(),
+                env.model_config: llm.config.model_dump_json().decode(),
                "OPENLLM_MODEL": model_name,
                "OPENLLM_MODEL_ID": llm.model_id,
                "BENTOML_DEBUG": str(openllm.utils.get_debug_mode()),
@@ -280,7 +322,8 @@ class OpenLLMCommandGroup(BentoMLCommandGroup):
        """
        # The following logics is similar to one of BentoMLCommandGroup

-        from bentoml._internal.configuration import DEBUG_ENV_VAR, QUIET_ENV_VAR
+        from bentoml._internal.configuration import (DEBUG_ENV_VAR,
+                                                     QUIET_ENV_VAR)

        @click.option("-q", "--quiet", envvar=QUIET_ENV_VAR, is_flag=True, default=False, help="Suppress all output.")
        @click.option(
@@ -668,11 +711,15 @@ def start_grpc_cli():
@output_option
@click.option("--overwrite", is_flag=True, help="Overwrite existing Bento for given LLM if it already exists.")
@workers_per_resource_option(click, build=True)
+@quantize_option(click)
+@bettertransformer_option(click)
 def build(
    model_name: str,
    model_id: str | None,
    overwrite: bool,
    output: OutputLiteral,
+    quantize: t.Literal["8bit", "4bit", "gptq"] | None,
+    bettertransformer: bool | None,
    workers_per_resource: float | None,
 ):
    """Package a given models into a Bento.
@@ -695,6 +742,8 @@ def build(
        model_name,
        __cli__=True,
        model_id=model_id,
+        quantize=quantize,
+        bettertransformer=bettertransformer,
        _workers_per_resource=workers_per_resource,
        _overwrite_existing_bento=overwrite,
    )
@@ -764,20 +813,20 @@ def models(output: OutputLiteral, show_available: bool):
        for m in models:
            config = openllm.AutoConfig.for_model(m)
            runtime_impl: tuple[t.Literal["pt", "flax", "tf"], ...] = tuple()
-            if config.__openllm_model_name__ in openllm.MODEL_MAPPING_NAMES:
+            if config["model_name"] in openllm.MODEL_MAPPING_NAMES:
                runtime_impl += ("pt",)
-            if config.__openllm_model_name__ in openllm.MODEL_FLAX_MAPPING_NAMES:
+            if config["model_name"] in openllm.MODEL_FLAX_MAPPING_NAMES:
                runtime_impl += ("flax",)
-            if config.__openllm_model_name__ in openllm.MODEL_TF_MAPPING_NAMES:
+            if config["model_name"] in openllm.MODEL_TF_MAPPING_NAMES:
                runtime_impl += ("tf",)
            json_data[m] = {
-                "model_id": config.__openllm_model_ids__,
-                "url": config.__openllm_url__,
-                "requires_gpu": config.__openllm_requires_gpu__,
+                "model_id": config["model_ids"],
+                "url": config["url"],
+                "requires_gpu": config["requires_gpu"],
                "runtime_impl": runtime_impl,
                "installation": "pip install openllm" if m not in extras else f'pip install "openllm[{m}]"',
            }
-            converted.extend([convert_transformers_model_name(i) for i in config.__openllm_model_ids__])
+            converted.extend([convert_transformers_model_name(i) for i in config["model_ids"]])
            if openllm.utils.DEBUG:
                try:
                    openllm.AutoLLM.for_model(m, llm_config=config)
@@ -950,7 +999,7 @@ def query_(
        _echo(res["responses"], fg="white")


-@cli.command()
+@cli.command(name="download")
@click.argument(
    "model_name",
    type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING.keys()]),
@@ -967,10 +1016,10 @@ def download_models(model_name: str, model_id: str | None, output: OutputLiteral
        openllm.utils.configure_logging()

    config = openllm.AutoConfig.for_model(model_name)
-    env = config.__openllm_env__.get_framework_env()
-    if env == "flax":
+    envvar = config["env"].get_framework_env()
+    if envvar == "flax":
        model = openllm.AutoFlaxLLM.for_model(model_name, model_id=model_id, llm_config=config)
-    elif env == "tf":
+    elif envvar == "tf":
        model = openllm.AutoTFLLM.for_model(model_name, model_id=model_id, llm_config=config)
    else:
        model = openllm.AutoLLM.for_model(model_name, model_id=model_id, llm_config=config)
@@ -978,11 +1027,11 @@ def download_models(model_name: str, model_id: str | None, output: OutputLiteral
    try:
        _ref = bentoml.transformers.get(model.tag)
        if output == "pretty":
-            _echo(f"{model_name} is already setup for framework '{env}': {str(_ref.tag)}", nl=True, fg="yellow")
+            _echo(f"{model_name} is already setup for framework '{envvar}': {str(_ref.tag)}", nl=True, fg="yellow")
        elif output == "json":
            _echo(
                orjson.dumps(
-                    {"previously_setup": True, "framework": env, "model": str(_ref.tag)}, option=orjson.OPT_INDENT_2
+                    {"previously_setup": True, "framework": envvar, "model": str(_ref.tag)}, option=orjson.OPT_INDENT_2
                ).decode(),
                fg="white",
            )
@@ -1016,7 +1065,7 @@ def download_models(model_name: str, model_id: str | None, output: OutputLiteral
        elif output == "json":
            _echo(
                orjson.dumps(
-                    {"previously_setup": False, "framework": env, "tag": str(_ref.tag)},
+                    {"previously_setup": False, "framework": envvar, "tag": str(_ref.tag)},
                    option=orjson.OPT_INDENT_2,
                ).decode()
            )
--- a/src/openllm/models/chatglm/configuration_chatglm.py
+++ b/src/openllm/models/chatglm/configuration_chatglm.py
@@ -46,7 +46,7 @@ class ChatGLMConfig(openllm.LLMConfig):

    retain_history: bool = openllm.LLMConfig.Field(
        False,
-        description="""Whether to retain history given to the model. 
+        description="""Whether to retain history given to the model.
        If set to True, then the model will retain given history.""",
    )

--- a/src/openllm/models/dolly_v2/configuration_dolly_v2.py
+++ b/src/openllm/models/dolly_v2/configuration_dolly_v2.py
@@ -38,6 +38,7 @@ class DollyV2Config(openllm.LLMConfig):
        "timeout": 3600000,
        "trust_remote_code": True,
        "url": "https://github.com/databrickslabs/dolly",
+        "use_pipeline": True,
        "default_id": "databricks/dolly-v2-3b",
        "model_ids": ["databricks/dolly-v2-3b", "databricks/dolly-v2-7b", "databricks/dolly-v2-12b"],
    }
--- a/src/openllm/models/falcon/configuration_falcon.py
+++ b/src/openllm/models/falcon/configuration_falcon.py
@@ -29,6 +29,7 @@ class FalconConfig(openllm.LLMConfig):
        "trust_remote_code": True,
        "requires_gpu": True,
        "timeout": int(36e6),
+        "use_pipeline": True,
        "url": "https://falconllm.tii.ae/",
        "requirements": ["einops", "xformers", "safetensors"],
        "default_id": "tiiuae/falcon-7b",
--- a/src/openllm/models/flan_t5/configuration_flan_t5.py
+++ b/src/openllm/models/flan_t5/configuration_flan_t5.py
@@ -61,6 +61,7 @@ class FlanT5Config(openllm.LLMConfig):
            "google/flan-t5-xl",
            "google/flan-t5-xxl",
        ],
+        "model_type": "seq2seq_lm",
    }

    class GenerationConfig:
--- a/src/openllm/models/stablelm/modeling_stablelm.py
+++ b/src/openllm/models/stablelm/modeling_stablelm.py
@@ -47,13 +47,12 @@ class StableLM(openllm.LLM["transformers.GPTNeoXForCausalLM", "transformers.GPTN

    def llm_post_init(self):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        self.load_in_mha = True if not torch.cuda.is_available() else False
+        self.bettertransformer = True if not torch.cuda.is_available() else False

    @property
    def import_kwargs(self):
        model_kwds = {
            "torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32,
-            "load_in_8bit": False,
            "device_map": "auto" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None,
        }
        tokenizer_kwds: dict[str, t.Any] = {}
--- a/src/openllm/models/starcoder/modeling_starcoder.py
+++ b/src/openllm/models/starcoder/modeling_starcoder.py
@@ -47,8 +47,7 @@ class StarCoder(openllm.LLM["transformers.GPTBigCodeForCausalLM", "transformers.
    def import_kwargs(self):
        model_kwds = {
            "device_map": "auto" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None,
-            "load_in_8bit": True if torch.cuda.device_count() > 1 else False,
-            "torch_dtype": torch.float16,
+            "torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32,
        }
        tokenizer_kwds = {"padding_side": "left"}
        return model_kwds, tokenizer_kwds
@@ -62,7 +61,6 @@ class StarCoder(openllm.LLM["transformers.GPTBigCodeForCausalLM", "transformers.
        **attrs: t.Any,
    ) -> bentoml.Model:
        torch_dtype = attrs.pop("torch_dtype", torch.float16)
-        load_in_8bit = attrs.pop("load_in_8bit", True)
        device_map = attrs.pop("device_map", "auto")

        tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, **tokenizer_kwds)
@@ -74,7 +72,7 @@ class StarCoder(openllm.LLM["transformers.GPTBigCodeForCausalLM", "transformers.
        )

        model = transformers.AutoModelForCausalLM.from_pretrained(
-            model_id, torch_dtype=torch_dtype, load_in_8bit=load_in_8bit, device_map=device_map, **attrs
+            model_id, torch_dtype=torch_dtype, device_map=device_map, **attrs
        )
        try:
            return bentoml.transformers.save_model(tag, model, custom_objects={"tokenizer": tokenizer})
--- a/src/openllm/utils/init.py
+++ b/src/openllm/utils/init.py
@@ -15,42 +15,38 @@
 Utilities function for OpenLLM. User can import these function for convenience, but
 we won't ensure backward compatibility for these functions. So use with caution.
 """
-from __future__ import annotations
+from __future__ import annotations as _annotations

+import functools
+import logging
 import os
 import sys
 import types
 import typing as t

-from bentoml._internal.configuration import get_debug_mode as get_debug_mode
-from bentoml._internal.configuration import get_quiet_mode as get_quiet_mode
-from bentoml._internal.configuration import set_debug_mode as set_debug_mode
-from bentoml._internal.configuration import set_quiet_mode as set_quiet_mode
-from bentoml._internal.log import configure_logging as configure_logging
-from bentoml._internal.log import configure_server_logging as configure_server_logging
+from bentoml._internal.configuration import (get_debug_mode, get_quiet_mode,
+                                             set_debug_mode, set_quiet_mode)
+from bentoml._internal.log import configure_logging, configure_server_logging
 from bentoml._internal.types import LazyType
+from bentoml._internal.utils import (LazyLoader, bentoml_cattr,
+                                     copy_file_to_fs_folder, first_not_none,
+                                     pkg, reserve_free_port,
+                                     resolve_user_filepath)

-# NOTE: The following exports useful utils from bentoml
-from bentoml._internal.utils import LazyLoader as LazyLoader
-from bentoml._internal.utils import bentoml_cattr as bentoml_cattr
-from bentoml._internal.utils import copy_file_to_fs_folder as copy_file_to_fs_folder
-from bentoml._internal.utils import first_not_none as first_not_none
-from bentoml._internal.utils import pkg as pkg
-from bentoml._internal.utils import reserve_free_port as reserve_free_port
-from bentoml._internal.utils import resolve_user_filepath as resolve_user_filepath
+from .lazy import LazyModule

-from .lazy import LazyModule as LazyModule
+logger = logging.getLogger(__name__)

 try:
-    from typing import GenericAlias as TypingGenericAlias  # type: ignore
+    from typing import GenericAlias as _TypingGenericAlias  # type: ignore
 except ImportError:
    # python < 3.9 does not have GenericAlias (list[int], tuple[str, ...] and so on)
-    TypingGenericAlias = ()
+    _TypingGenericAlias = ()

 if sys.version_info < (3, 10):
-    WithArgsTypes = (TypingGenericAlias,)
+    _WithArgsTypes = (_TypingGenericAlias,)
 else:
-    WithArgsTypes: t.Any = (
+    _WithArgsTypes: t.Any = (
        t._GenericAlias,  # type: ignore (_GenericAlias is the actual GenericAlias implementation)
        types.GenericAlias,
        types.UnionType,
@@ -61,7 +57,7 @@ def lenient_issubclass(cls: t.Any, class_or_tuple: type[t.Any] | tuple[type[t.An
    try:
        return isinstance(cls, type) and issubclass(cls, class_or_tuple)  # type: ignore[arg-type]
    except TypeError:
-        if isinstance(cls, WithArgsTypes):
+        if isinstance(cls, _WithArgsTypes):
            return False
        raise

@@ -72,27 +68,25 @@ def gpu_count() -> tuple[int, ...]:
    return tuple(NvidiaGpuResource.from_system())


+# cache object.__setattr__ in a module-level name to save one attribute lookup per assignment
+_object_setattr = object.__setattr__
+
+
+def non_intrusive_setattr(obj: t.Any, name: str, value: t.Any) -> None:
+    """Set attribute ``name`` on ``obj`` to ``value`` unless ``obj`` already has an attribute of that name."""
+    if hasattr(obj, name):
+        return  # never clobber an attribute that is already present
+    setter = functools.partial(setattr, obj) if isinstance(obj, type) else _object_setattr.__get__(obj)
+    setter(name, value)
+
+
 DEBUG = sys.flags.dev_mode or (not sys.flags.ignore_environment and bool(os.environ.get("OPENLLMDEVDEBUG")))

-_extras = {
-    "get_debug_mode": get_debug_mode,
-    "get_quiet_mode": get_quiet_mode,
-    "set_debug_mode": set_debug_mode,
-    "set_quiet_mode": set_quiet_mode,
-    "configure_logging": configure_logging,
-    "configure_server_logging": configure_server_logging,
-    "LazyType": LazyType,
-    "LazyLoader": LazyLoader,
-    "LazyModule": LazyModule,
-    "bentoml_cattr": bentoml_cattr,
-    "copy_file_to_fs_folder": copy_file_to_fs_folder,
-    "first_not_none": first_not_none,
-    "pkg": pkg,
-    "reserve_free_port": reserve_free_port,
-    "resolve_user_filepath": resolve_user_filepath,
-    "lenient_issubclass": lenient_issubclass,
-    "gpu_count": gpu_count,
-    "DEBUG": DEBUG,
+
+# XXX: define all classes, functions and imports above this line,
+# since _extras is built from the locals() of this file.
+_extras: dict[str, t.Any] = {
+    k: v for k, v in locals().items() if not isinstance(v, types.ModuleType) and not k.startswith("_")
 }

 _import_structure = {
@@ -108,23 +102,46 @@ _import_structure = {
        "is_flax_available",
        "is_tf_available",
        "is_torch_available",
+        "is_bitsandbytes_available",
        "require_backends",
    ],
 }

 if t.TYPE_CHECKING:
+    # NOTE: The following exports useful utils from bentoml
+    from . import LazyLoader as LazyLoader
+    from . import LazyType as LazyType
    from . import analytics as analytics
+    from . import bentoml_cattr as bentoml_cattr
    from . import codegen as codegen
+    from . import configure_logging as configure_logging
+    from . import configure_server_logging as configure_server_logging
+    from . import copy_file_to_fs_folder as copy_file_to_fs_folder
    from . import dantic as dantic
+    from . import first_not_none as first_not_none
+    from . import get_debug_mode as get_debug_mode
+    from . import get_quiet_mode as get_quiet_mode
+    from . import gpu_count as gpu_count
+    from . import lenient_issubclass as lenient_issubclass
+    from . import non_intrusive_setattr as non_intrusive_setattr
+    from . import pkg as pkg
+    from . import reserve_free_port as reserve_free_port
+    from . import resolve_user_filepath as resolve_user_filepath
+    from . import set_debug_mode as set_debug_mode
+    from . import set_quiet_mode as set_quiet_mode
    from .import_utils import ENV_VARS_TRUE_VALUES as ENV_VARS_TRUE_VALUES
    from .import_utils import DummyMetaclass as DummyMetaclass
    from .import_utils import ModelEnv as ModelEnv
-    from .import_utils import is_cpm_kernels_available as is_cpm_kernels_available
+    from .import_utils import \
+        is_bitsandbytes_available as is_bitsandbytes_available
+    from .import_utils import \
+        is_cpm_kernels_available as is_cpm_kernels_available
    from .import_utils import is_einops_available as is_einops_available
    from .import_utils import is_flax_available as is_flax_available
    from .import_utils import is_tf_available as is_tf_available
    from .import_utils import is_torch_available as is_torch_available
    from .import_utils import require_backends as require_backends
+    from .lazy import LazyModule as LazyModule
 else:
    import sys

--- a/src/openllm/utils/analytics.py
+++ b/src/openllm/utils/analytics.py
@@ -81,7 +81,7 @@ class StartInitEvent(_internal_analytics.schemas.EventMeta):

    @staticmethod
    def handler(llm_config: openllm.LLMConfig) -> StartInitEvent:
-        return StartInitEvent(model_name=llm_config.__openllm_model_name__, llm_config=llm_config.model_dump())
+        return StartInitEvent(model_name=llm_config["model_name"], llm_config=llm_config.model_dump())


 def track_start_init(
--- a/src/openllm/utils/import_utils.py
+++ b/src/openllm/utils/import_utils.py
@@ -61,6 +61,7 @@ _tf_available = importlib.util.find_spec("tensorflow") is not None
 _flax_available = importlib.util.find_spec("jax") is not None and importlib.util.find_spec("flax") is not None
 _einops_available = _is_package_available("einops")
 _cpm_kernel_available = _is_package_available("cpm_kernels")
+_bitsandbytes_available = _is_package_available("bitsandbytes")


 def is_einops_available():
@@ -71,6 +72,10 @@ def is_cpm_kernels_available():
    return _cpm_kernel_available


+def is_bitsandbytes_available():
+    return _bitsandbytes_available
+
+
 def is_torch_available():
    global _torch_available
    if USE_TORCH in ENV_VARS_TRUE_AND_AUTO_VALUES and USE_TF not in ENV_VARS_TRUE_VALUES:
--- a/tests/_strategies/_configuration.py
+++ b/tests/_strategies/_configuration.py
@@ -37,6 +37,9 @@ def model_settings(draw: st.DrawFn):
        requires_gpu=st.booleans(),
        trust_remote_code=st.booleans(),
        requirements=st.none() | st.lists(st.text(), min_size=1),
+        use_pipeline=st.booleans(),
+        model_type=st.sampled_from(["causal_lm", "seq2seq_lm"]),
+        runtime=st.sampled_from(["transformers", "cpp"]),
        name_type=st.sampled_from(["dasherize", "lowercase"]),
        timeout=st.integers(min_value=3600),
        workers_per_resource=st.one_of(st.integers(min_value=1), st.floats(min_value=0.1, max_value=1.0)),
--- a/tests/test_configuration.py
+++ b/tests/test_configuration.py
@@ -23,7 +23,8 @@ from hypothesis import assume, given
 from hypothesis import strategies as st

 import openllm
-from openllm._configuration import GenerationConfig, ModelSettings, _field_env_key
+from openllm._configuration import (GenerationConfig, ModelSettings,
+                                    _field_env_key)
 from openllm.utils import DEBUG

 from ._strategies._configuration import make_llm_config, model_settings
@@ -67,7 +68,7 @@ def test_forbidden_access():

@given(model_settings())
 def test_class_normal_gen(gen_settings: ModelSettings):
-    assume(gen_settings["default_id"] and gen_settings["model_ids"])
+    assume(gen_settings["default_id"] and all(i for i in gen_settings["model_ids"]))
    cl_: type[openllm.LLMConfig] = make_llm_config("NotFullLLM", gen_settings)
    assert issubclass(cl_, openllm.LLMConfig)
    for key in gen_settings:
--- a/tools/assert-model-table-latest
+++ b/tools/assert-model-table-latest
@@ -3,11 +3,10 @@
 from __future__ import annotations

 import os
+import subprocess

 from markdown_it import MarkdownIt

-import openllm
-
 md = MarkdownIt()

 ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
@@ -17,7 +16,7 @@ with open(os.path.join(ROOT, "README.md"), "r") as f:
 # NOTE: Currently, we only have one table in README, which is the Model readme.
 table = [r for r in readme if r.type == "html_block" and r.content.startswith("<td><a")]

-available = len(openllm.CONFIG_MAPPING.keys())
+available = subprocess.check_output(["openllm", "models", "-o", "porcelain"]).strip().decode("utf-8").count("\n") + 1

 on_table = len(table)  # NOTE: minus the header

--- a/tools/update-optional-dependencies.py
+++ b/tools/update-optional-dependencies.py
@@ -31,9 +31,9 @@ FLAN_T5_DEPS = ["flax", "jax", "jaxlib", "tensorflow", "keras"]
 OPENAI_DEPS = ["openai", "tiktoken"]

 _base_requirements = {
-    inflection.dasherize(name): config.__openllm_requirements__
-    for name, config in openllm.CONFIG_MAPPING.items()
-    if config.__openllm_requirements__
+    inflection.dasherize(name): config_cls.__openllm_requirements__
+    for name, config_cls in openllm.CONFIG_MAPPING.items()
+    if config_cls.__openllm_requirements__
 }

 # NOTE: update this table when adding new external dependencies
--- a/tools/update-readme.py
+++ b/tools/update-readme.py
@@ -47,13 +47,13 @@ def main() -> int:
        "Model Ids": [],
    }
    max_install_len_div = 0
-    for name, config in openllm.CONFIG_MAPPING.items():
+    for name, config_cls in openllm.CONFIG_MAPPING.items():
        dashed = inflection.dasherize(name)
        formatted["Model"].append(dashed)
-        formatted["URL"].append(config.__openllm_url__)
+        formatted["URL"].append(config_cls.__openllm_url__)
        formatted["GPU"].append("✅")
-        formatted["CPU"].append("✅" if not config.__openllm_requires_gpu__ else "❌")
-        formatted["Model Ids"].append(config.__openllm_model_ids__)
+        formatted["CPU"].append("✅" if not config_cls.__openllm_requires_gpu__ else "❌")
+        formatted["Model Ids"].append(config_cls.__openllm_model_ids__)
        if dashed in deps:
            instruction = f'```bash\npip install "openllm[{dashed}]"\n```'
        else: