commit dd8b6050b2742138eaf1c1d2eba7e73cb2378a8b
Author: Chaoyu
Date:   Tue Apr 18 17:27:53 2023 -0700

    feat: FLAN-T5 support

    - add infrastructure; still to be implemented: cache, chat history
    - base Runnable implementation that fits the LangChain API
    - add a Prompt descriptor and utils

    feat: license headers, auto factory implementation, and CLI

    Auto-construct CLI args from the pydantic config.
    Add an auto factory for ease of use.
    Only provide `/generate` to streamline the UX.
    Configuration precedence contract: CLI > env var > input.

    fix: serve from a thread
    fix: CLI args
    chore: clean up names and refactor imports

    Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

diff --git a/.bazelignore b/.bazelignore
new file mode 100644
index 00000000..e3fbd983
--- /dev/null
+++ b/.bazelignore
@@ -0,0 +1,2 @@
+build
+node_modules
diff --git a/.bazelrc b/.bazelrc
new file mode 100644
index 00000000..625f1e43
--- /dev/null
+++ b/.bazelrc
@@ -0,0 +1,3 @@
+# load bazelrc from the legacy location as recommended
+# in https://github.com/bazelbuild/bazel/issues/6319
+import %workspace%/tools/bazel.rc
diff --git a/.bazelversion b/.bazelversion
new file mode 100644
index 00000000..09b254e9
--- /dev/null
+++ b/.bazelversion
@@ -0,0 +1 @@
+6.0.0
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
new file mode 100644
index 00000000..aa8a63dc
--- /dev/null
+++ b/.github/CODEOWNERS
@@ -0,0 +1 @@
+* @aarnphm @parano @ssheng
diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml
new file mode 100644
index 00000000..96a6be27
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/bug_report.yml
@@ -0,0 +1,57 @@
+name: 🐛 Bug Report
+description: Create a bug report for OpenLLM.
+title: ''
+labels: ['']
+body:
+  - type: markdown
+    id: exists
+    attributes:
+      value: |
+        Please search to see if an issue already exists for the bug you encountered.
+        See [Searching Issues and Pull Requests](https://docs.github.com/en/search-github/searching-on-github/searching-issues-and-pull-requests) for how to use the GitHub search bar and filters.
+  - type: textarea
+    id: describe-the-bug
+    validations:
+      required: true
+    attributes:
+      label: Describe the bug
+      description: |
+        Please provide a clear and concise description of the problem you ran into.
+      placeholder: This happened when I...
+  - type: textarea
+    id: to-reproduce
+    validations:
+      required: false
+    attributes:
+      label: To reproduce
+      description: |
+        Please provide a code sample or snippet that reproduces the problem. If you have code snippets, error messages, or stack traces, please provide them here as well.

+        **IMPORTANT**: make sure to use [code tags](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/creating-and-highlighting-code-blocks#syntax-highlighting) to correctly format your code. Screenshots are helpful, but don't use them for code snippets, as they don't allow others to copy-and-paste your code.

+        To give us more information for diagnosing the issue, it would be great if you could provide a minimal reproducible example!
+      placeholder: |
+        Steps to reproduce the bug:

+        1. Provide '...'
+        2. Run '...'
+        3. See error
+  - type: textarea
+    id: logs
+    attributes:
+      label: Logs
+      description: 'Please include the Python logs if you can.'
+      render: shell
+  - type: textarea
+    id: environment-info
+    attributes:
+      label: Environment
+      description: |
+        Please share your environment with us. Run `bentoml env` and `transformers-cli env`, and paste the results here.
+      placeholder: |
+        bentoml: ...
+        transformers: ...
+        python: ...
+ platform: ... + validations: + required: true diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 00000000..1980b167 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1,9 @@ +blank_issues_enabled: true +version: 2.1 +contact_links: + - name: Blank issues + url: https://github.com/bentoml/open-llm-server/issues/new + about: To create a blank issue + - name: BentoML Discussions + url: https://github.com/bentoml/BentoML/discussions + about: Please ask general questions here. diff --git a/.github/ISSUE_TEMPLATE/feature_request.yml b/.github/ISSUE_TEMPLATE/feature_request.yml new file mode 100644 index 00000000..a542e870 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.yml @@ -0,0 +1,35 @@ +name: 🚀 Feature Request +description: Submit a proposal/request for new OpenLLM features. +title: '' +labels: [''] +body: + - type: textarea + id: feature-request + validations: + required: true + attributes: + label: Feature request + description: | + A clear and concise description of the feature request. + placeholder: | + I would like it if... + - type: textarea + id: motivation + validations: + required: false + attributes: + label: Motivation + description: | + Please outline the motivation for this feature request. Is your feature request related to a problem? e.g., I'm always frustrated when [...]. + If this is related to another issue, please link here too. + If you have a current workaround, please also provide it here. + placeholder: | + This feature would solve ... + - type: textarea + id: other + attributes: + label: Other + description: | + Is there any way that you could help, e.g. by submitting a PR? + placeholder: | + I would love to contribute ... diff --git a/.github/actions/setup-repo/action.yml b/.github/actions/setup-repo/action.yml new file mode 100644 index 00000000..625651c2 --- /dev/null +++ b/.github/actions/setup-repo/action.yml @@ -0,0 +1,46 @@ +name: Setup repo +description: Setup repo with all features on CI +inputs: + python-version: + description: 'Python version' + required: true + default: '3.8' + architecture: + description: 'Which architecture to run on' + required: true + default: x64 +runs: + using: composite + steps: + - name: Fetch base reference. 
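+      # Fetch the base ref up front so later CI steps can diff changed files
+      # against origin/$GITHUB_BASE_REF (e.g., for incremental type checks).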
+ shell: bash + run: git fetch origin $GITHUB_BASE_REF + - name: Setup Python + uses: actions/setup-python@v4 + with: + python-version: ${{ inputs.python-version }} + architecture: ${{ inputs.architecture }} + - name: Setup node + uses: actions/setup-node@v3 + with: + node-version: '17' + - name: Get cache key prefix + id: get-cache-key-prefix + shell: bash + run: echo "prefix=${{ runner.os }}-${{ inputs.python-version }}" >> $GITHUB_OUTPUT + - name: Get pip cache dir + id: pip-cache-dir + shell: bash + run: echo "dir=$(pip cache dir)" >> $GITHUB_OUTPUT + - name: Cache pip dependencies + uses: actions/cache@v3 + id: cache-pip + with: + path: ${{ steps.pip-cache-dir.outputs.dir }} + key: ${{ steps.get-cache-key-prefix.outputs.prefix }}-pypi + restore-keys: | + ${{ steps.get-cache-key-prefix.outputs.prefix }}-pypi- + # TODO: setup hatch actions instead + - name: Install dependencies + shell: bash + run: pip install -r requirements/tests.txt diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 00000000..d036e3ef --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,15 @@ +version: 2 +updates: + - package-ecosystem: github-actions + directory: '/' + schedule: + interval: 'weekly' + day: 'monday' + time: '09:00' + # maintain required dependencies + - package-ecosystem: pip + directory: '/' + schedule: + interval: 'daily' + open-pull-requests-limit: 5 + versioning-strategy: increase-if-necessary diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 00000000..b96f3e9c --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,211 @@ +name: ci +on: + push: + branches: [main] + pull_request: + schedule: + - cron: '0 0 * * 1/2' +env: + LINES: 120 + COLUMNS: 120 + BENTOML_DO_NOT_TRACK: True + PYTEST_PLUGINS: bentoml.testing.pytest.plugin +# https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#defaultsrun +defaults: + run: + shell: bash --noprofile --norc -exo pipefail {0} +jobs: + codestyle_check: + runs-on: ubuntu-latest + needs: + - diff + if: ${{ (github.event_name == 'pull_request' && needs.diff.outputs.bentoml == 'true') || github.event_name == 'push' }} + steps: + - uses: actions/checkout@v3 + - name: Setup python + uses: actions/setup-python@v4 + with: + python-version: '3.10.6' + architecture: x64 + - name: Get pip cache dir + id: cache-dir + run: | + echo ::set-output name=dir::$(pip cache dir) + - name: Fetch origin + run: git fetch origin "$GITHUB_BASE_REF" + - name: Setup node + uses: actions/setup-node@v3 + with: + node-version: '17' + - name: Install pyright + run: | + npm install -g npm@^7 pyright + - name: Setup bufbuild/buf + uses: bufbuild/buf-setup-action@v1.17.0 + with: + github_token: ${{ github.token }} + - name: Cache pip dependencies + uses: actions/cache@v3 + id: cache-pip + with: + path: ${{ steps.cache-dir.outputs.dir }} + key: codestyle-${{ hashFiles('requirements/dev-requirements.txt') }} + - name: Install dependencies + run: | + pip install . + pip install -r requirements/dev-requirements.txt + - name: Format check + run: | + black --check src examples tests + black --check --pyi typings + isort --check . 
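+      # Lint rules for ruff are configured under [tool.ruff] in pyproject.toml.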
+ - name: Lint check + run: ruff check src tests examples + - name: Type check + if: ${{ github.event_name == 'pull_request' }} + run: git diff --name-only --diff-filter=AM "origin/$GITHUB_BASE_REF" -z -- '*.py{,i}' | xargs -0 --no-run-if-empty pyright + - name: Proto check + if: ${{ (github.event_name == 'pull_request' && needs.diff.outputs.protos == 'true') || github.event_name == 'push' }} + run: | + buf lint --config "src/bentoml/grpc/buf.yaml" --error-format msvs src + documentation_spelling_check: + runs-on: ubuntu-latest + needs: + - diff + if: ${{ (github.event_name == 'pull_request' && needs.diff.outputs.docs == 'true') || github.event_name == 'push' }} + steps: + - uses: actions/checkout@v3 + with: + fetch-depth: 0 # fetch all tags and branches + - name: Setup python + uses: actions/setup-python@v4 + with: + python-version: 3.9 + - name: Get pip cache dir + id: cache-dir + run: | + echo ::set-output name=dir::$(pip cache dir) + - name: Cache pip dependencies + uses: actions/cache@v3 + id: cache-pip + with: + path: ${{ steps.cache-dir.outputs.dir }} + key: ${{ runner.os }}-docs-${{ hashFiles('requirements/docs-requirements.txt') }} + - name: Install dependencies + run: | + pip install . + pip install -r requirements/docs-requirements.txt + - name: Install libenchant + run: | + sudo apt-get update && sudo apt-get install -y libenchant-2-dev + - name: Run spellcheck script + run: make spellcheck-docs + unit_tests: + needs: + - diff + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest, macos-latest, windows-latest] + python-version: ['3.7', '3.8', '3.9', '3.10'] + if: ${{ (github.event_name == 'pull_request' && needs.diff.outputs.bentoml == 'true') || github.event_name == 'push' }} + name: python${{ matrix.python-version }}_unit_tests (${{ matrix.os }}) + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v3 + with: + fetch-depth: 0 # fetch all tags and branches + - name: Setup python + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + architecture: x64 + - name: Get pip cache dir + id: cache-dir + run: | + echo ::set-output name=dir::$(pip cache dir) + - name: Cache pip dependencies + uses: actions/cache@v3 + id: cache-pip + with: + path: ${{ steps.cache-dir.outputs.dir }} + key: ${{ runner.os }}-tests-${{ hashFiles('requirements/tests-requirements.txt') }} + - name: Install dependencies + run: | + pip install ".[grpc]" + pip install -r requirements/tests-requirements.txt + - name: Run unit tests + run: | + OPTS=(--cov-config pyproject.toml --cov=src/bentoml --cov-append) + if [ "${{ matrix.os }}" != 'windows-latest' ]; then + # we will use pytest-xdist to improve tests run-time. 
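+            # --dist loadfile keeps all tests from one file on the same worker,
+            # and -n auto sizes the worker pool to the number of CPUs.
+            # (--run-grpc-tests comes from the BentoML pytest plugin loaded via PYTEST_PLUGINS.)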
+ OPTS=(${OPTS[@]} --dist loadfile -n auto --run-grpc-tests) + fi + # Now run the unit tests + coverage run -m pytest tests/unit "${OPTS[@]}" + bento_server_e2e_tests: + needs: + - diff + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest, macos-latest, windows-latest] + python-version: ['3.7', '3.8', '3.9', '3.10'] + server_type: ['http', 'grpc'] + exclude: + - os: windows-latest + server_type: 'grpc' + - os: macos-latest + server_type: 'grpc' + python-version: '3.10' + if: ${{ (github.event_name == 'pull_request' && needs.diff.outputs.bentoml == 'true') || github.event_name == 'push' }} + name: python${{ matrix.python-version }}_${{ matrix.server_type }}_e2e_tests (${{ matrix.os }}) + runs-on: ${{ matrix.os }} + timeout-minutes: 20 + env: + SETUPTOOLS_USE_DISTUTILS: stdlib + BENTOML_BUNDLE_LOCAL_BUILD: True + steps: + - uses: actions/checkout@v3 + with: + fetch-depth: 0 # fetch all tags and branches + - name: Setup python + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + architecture: x64 + - name: Set up QEMU + uses: docker/setup-qemu-action@v2 + if: ${{ matrix.os == 'ubuntu-latest' }} + - name: Set up Docker Buildx + if: ${{ matrix.os == 'ubuntu-latest' }} + id: buildx + uses: docker/setup-buildx-action@v2 + - name: Get pip cache dir + id: cache-dir + run: | + echo ::set-output name=dir::$(pip cache dir) + - name: Cache pip dependencies + uses: actions/cache@v3 + id: cache-pip + with: + path: ${{ steps.cache-dir.outputs.dir }} + key: ${{ runner.os }}-tests-${{ hashFiles('requirements/tests-requirements.txt') }} + - name: Install dependencies for ${{ matrix.server_type }}-based tests. + run: | + pip install -r requirements/tests-requirements.txt + if [ "${{ matrix.server_type }}" == 'grpc' ]; then + pip install -e ".[grpc]" + else + pip install -e . + fi + if [ -f "tests/e2e/bento_server_${{ matrix.server_type }}/requirements.txt" ]; then + pip install -r tests/e2e/bento_server_${{ matrix.server_type }}/requirements.txt + fi + - name: Run ${{ matrix.server_type }} tests and generate coverage report + run: | + OPTS=(--cov-config pyproject.toml --cov=src/bentoml --cov-append) + coverage run -m pytest tests/e2e/bento_server_${{ matrix.server_type }} "${OPTS[@]}" +concurrency: + group: ci-${{ github.event.pull_request.number || github.sha }} + cancel-in-progress: true diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml new file mode 100644 index 00000000..ec55784a --- /dev/null +++ b/.github/workflows/codeql-analysis.yml @@ -0,0 +1,33 @@ +name: 'codeql' +on: + push: + branches: [main] + pull_request: + # The branches below must be a subset of the branches above + branches: [main] + schedule: + - cron: '37 15 * * 2' +jobs: + analyze: + name: Analyze + runs-on: ubuntu-latest + permissions: + actions: read + contents: read + security-events: write + strategy: + fail-fast: false + matrix: + language: ['python'] + steps: + - name: Checkout repository + uses: actions/checkout@v3 + # Initializes the CodeQL tools for scanning. 
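+      # For Python, the Autobuild step below is effectively a no-op, but it
+      # keeps the job uniform if compiled languages are added to the matrix.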
+ - name: Initialize CodeQL + uses: github/codeql-action/init@v2 + with: + languages: ${{ matrix.language }} + - name: Autobuild + uses: github/codeql-action/autobuild@v2 + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v2 diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..28c87b44 --- /dev/null +++ b/.gitignore @@ -0,0 +1,130 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ +bazel-* diff --git a/BUILD.bazel b/BUILD.bazel new file mode 100644 index 00000000..365e7de4 --- /dev/null +++ b/BUILD.bazel @@ -0,0 +1,185 @@ +load("@bazel_skylib//rules:write_file.bzl", "write_file") + +# rules_python +load("@com_github_bazelbuild_buildtools//buildifier:def.bzl", "buildifier", "buildifier_test") +load("//rules/py/vendorred:pypi.bzl", "entry_point") + +package(default_visibility = ["//:__subpackages__"]) + +exports_files([ + "README.md", + "package.json", + "yarn.lock", +]) + +# export default pyproject.toml config +filegroup( + name = "pyproject", + srcs = ["pyproject.toml"], +) + +genrule( + name = "make-tests-requirements", + srcs = ["@tests//:requirements.bzl"], + outs = ["tests.clean.bzl"], + cmd = " | ".join([ + "cat $<", + "cat >$@", + ]), +) + +write_file( + name = "gen-tests-starlark", + out = "update-tests.sh", + content = [ + # This depends on bash, which is not going to work on Windows. 
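+        # `bazel run //:vendor-tests` executes this script from the workspace
+        # root, copying the cleaned .bzl file out of bazel-bin back into the
+        # source tree (see the sh_binary below).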
+ "#!/usr/bin/env bash", + "cd $BUILD_WORKSPACE_DIRECTORY", + "cp -fv bazel-bin/tests.clean.bzl rules/py/vendorred/tests.bzl", + ], +) + +sh_binary( + name = "vendor-tests", + srcs = ["update-tests.sh"], + data = [":make-tests-requirements"], +) + +genrule( + name = "make-tensorflow-requirements", + srcs = ["@tensorflow//:requirements.bzl"], + outs = ["tensorflow.clean.bzl"], + cmd = " | ".join([ + "cat $<", + "cat >$@", + ]), +) + +write_file( + name = "gen-tensorflow-starlark", + out = "update-tensorflow.sh", + content = [ + # This depends on bash, which is not going to work on Windows. + "#!/usr/bin/env bash", + "cd $BUILD_WORKSPACE_DIRECTORY", + "cp -fv bazel-bin/tensorflow.clean.bzl rules/py/vendorred/tensorflow.bzl", + ], +) + +sh_binary( + name = "vendor-tensorflow", + srcs = ["update-tensorflow.sh"], + data = [":make-tensorflow-requirements"], +) + +genrule( + name = "make-pypi-requirements", + srcs = ["//rules/py/vendorred:pypi.bzl"], + outs = ["pypi.clean.bzl"], + cmd = " | ".join([ + "cat $<", + "cat >$@", + ]), +) + +write_file( + name = "gen-pypi-starlark", + out = "update-pypi.sh", + content = [ + # This depends on bash, which is not going to work on Windows. + "#!/usr/bin/env bash", + "cd $BUILD_WORKSPACE_DIRECTORY", + "cp -fv bazel-bin/pypi.clean.bzl rules/py/vendorred/pypi.bzl", + ], +) + +sh_binary( + name = "vendor-pypi", + srcs = ["update-pypi.sh"], + data = [":make-pypi-requirements"], +) + +[ + alias( + name = "{}_proto".format(version), + actual = "//src/bentoml/grpc/{}:service_proto".format(version), + visibility = ["//visibility:public"], + ) + for version in [ + "v1alpha1", + "v1", + ] +] + +[ + alias( + name = "{}_proto_lint".format(version), + actual = "//src/bentoml/grpc/{}:service_proto_lint".format(version), + visibility = ["//visibility:public"], + ) + for version in [ + "v1alpha1", + "v1", + ] +] + +## Expose public entrypoint for Bazel target. +alias( + name = "cli", + actual = "//src/bentoml_cli:cli", + visibility = ["//visibility:public"], +) + +alias( + name = "sdk", + actual = "//src/bentoml:bentoml", + visibility = ["//visibility:public"], +) + +# thirdparty alias +alias( + name = "buildozer", + actual = "@com_github_bazelbuild_buildtools//buildozer", +) + +buildifier( + name = "buildfmt", +) + +buildifier_test( + name = "buildcheck", + srcs = glob([ + "**/*.bzl", + "**/*.bazel", + ]), +) + +alias( + name = "pyright", + actual = "@npm//:node_modules/pyright/index.js", +) + +alias( + name = "sphinx-build", + actual = entry_point( + "sphinx", + script = "sphinx-build", + ), +) + +alias( + name = "sphinx-autobuild", + actual = entry_point("sphinx-autobuild"), +) + +[ + alias( + name = tool, + actual = entry_point(tool), + ) + for tool in [ + "black", + "isort", + "pylint", + ] +] diff --git a/LICENSE b/LICENSE new file mode 100644 index 00000000..ce3d54b2 --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2023 Aaron Pham and BentoML Team. All rights reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/README.md b/README.md
new file mode 100644
index 00000000..3ebca145
--- /dev/null
+++ b/README.md
@@ -0,0 +1,7 @@
+<div align="center">
+    <h1 align="center">OpenLLM</h1>
+    <p align="center">
+        REST/gRPC API server for running any Open Large-Language Model - StableLM, Llama, Alpaca, Dolly, Flan-T5, and more<br/>
+        Powered by BentoML 🍱 + HuggingFace 🤗
+    </p>
+</div>
diff --git a/WORKSPACE b/WORKSPACE
new file mode 100644
index 00000000..a49cdbaf
--- /dev/null
+++ b/WORKSPACE
@@ -0,0 +1,53 @@
+# TODO: Migrate to bzlmod once 6.0.0 is released.
+workspace(name = "com_github_bentoml_bentoml")
+
+load("//rules:deps.bzl", "bentoml_dependencies")
+
+bentoml_dependencies()
+
+load("@com_github_bentoml_plugins//rules:deps.bzl", "plugins_dependencies")
+
+plugins_dependencies()
+
+# NOTE: external users who wish to reuse the BentoML workspace setup
+# should always load these in this order.
+load("@com_github_bentoml_plugins//rules:workspace0.bzl", "workspace0")
+
+workspace0()
+
+load("@com_github_bentoml_plugins//rules:workspace1.bzl", "workspace1")
+
+workspace1()
+
+load("@com_github_bentoml_plugins//rules:workspace2.bzl", "workspace2")
+
+workspace2()
+
+load("@rules_python//python:pip.bzl", "pip_parse")
+
+pip_parse(
+    name = "pypi",
+    requirements = "//requirements:bazel-requirements.lock.txt",
+)
+
+pip_parse(
+    name = "tensorflow",
+    requirements = "//requirements:bazel-tensorflow-requirements.lock.txt",
+)
+
+pip_parse(
+    name = "tests",
+    requirements = "//requirements:bazel-tests-requirements.lock.txt",
+)
+
+load("//rules/py/vendorred:pypi.bzl", pypi_deps = "install_deps")
+
+pypi_deps()
+
+load("//rules/py/vendorred:tests.bzl", tests_deps = "install_deps")
+
+tests_deps()
+
+load("//rules/py/vendorred:tensorflow.bzl", tensorflow_deps = "install_deps")
+
+tensorflow_deps()
diff --git a/examples/interaction.py b/examples/interaction.py
new file mode 100644
index 00000000..e69de29b
diff --git a/package.json b/package.json
new file mode 100644
index 00000000..3f1f3259
--- /dev/null
+++ b/package.json
@@ -0,0 +1,14 @@
+{
+  "name": "openllm-tools",
+  "version": "0.0.0",
+  "description": "JS tooling for OpenLLM",
+  "author": "Aaron Pham",
+  "license": "Apache-2.0",
+  "dependencies": {
+    "pyright": "^1.1.279",
+    "@grpc/grpc-js": "^1.7.1",
+    "google-protobuf": "^3.21.0",
+    "grpc-tools": "^1.11.2",
+    "ts-protoc-gen": "^0.15.0"
+  }
+}
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 00000000..29a6234c
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,188 @@
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[project]
+name = "openllm"
+dynamic = ["version"]
+description = 'OpenLLM: REST/gRPC API server for running any open Large-Language Model - StableLM, Llama, Alpaca, Dolly, Flan-T5, Custom'
+readme = "README.md"
+requires-python = ">=3.8"
+license = "Apache-2.0"
+keywords = [
+    "MLOps",
+    "AI",
+    "BentoML",
+    "Model Serving",
+    "Model Deployment",
+    "LLMOps",
+    "Large Language Model",
+    "Generative AI",
+    "Stable Diffusion",
+    "StableLM",
+    "Alpaca",
+    "PyTorch",
+    "Transformers",
+]
+authors = [
+    { name = "Aaron Pham", email = "aarnphm@bentoml.com" },
+    { name = "BentoML Team", email = "contact@bentoml.com" },
+]
+classifiers = [
+    "Development Status :: 4 - Beta",
+    "License :: OSI Approved :: Apache Software License",
+    "Topic :: Scientific/Engineering :: Artificial Intelligence",
+    "Topic :: Software Development :: Libraries",
+    "Operating System :: OS Independent",
+    "Programming Language :: Python",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3 :: Only",
+    "Programming Language :: Python :: 3.8",
+    "Programming Language :: Python :: 3.9",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: Implementation :: CPython",
+    "Programming Language :: Python :: Implementation :: PyPy",
+]
+dependencies = [
+    # 
bentoml[io] includes pydantic, PIL, filetype, pandas and numpy + # bentoml[grpc,grpc-reflection] include grpcio, grpcio-reflection + "bentoml[io,grpc,grpc-reflection]>=1.0.19", + # bentoml[torch] includes torch and transformers + "transformers[torch]>=4.28.1", + # Super fast JSON serialization + "orjson", + # clidantic + "clidantic", +] + +[project.urls] +Documentation = "https://github.com/bentoml/open-llm-server#readme" +Issues = "https://github.com/bentoml/open-llm-server/issues" +Source = "https://github.com/bentoml/open-llm-server" + +[project.scripts] +openllm = "openllm.__main__:cli" + +[tool.hatch.version] +path = "src/openllm/__about__.py" + +[tool.hatch.envs.default] +dependencies = [ + "coverage[toml]>=6.5", + "pytest", + "pytest-asyncio>=0.21.0", + "pytest-xdist[psutil]", + "pytest-cov", + "pytest-mock", + "pytest-randomly", + "pytest-rerunfailures", +] +[tool.hatch.envs.default.scripts] +test = "pytest {args:tests}" +test-cov = "coverage run -m pytest {args:tests}" +cov-report = ["- coverage combine", "coverage report"] +cov = ["test-cov", "cov-report"] + +[[tool.hatch.envs.all.matrix]] +python = ["3.8", "3.9", "3.10", "3.11"] + +[tool.hatch.envs.lint] +detached = true +dependencies = ["black>=23.1.0", "mypy>=1.0.0", "ruff>=0.0.243", "pyright"] +[tool.hatch.envs.lint.scripts] +typing = "pyright {args:src/openllm tests}" +style = ["ruff {args:.}", "black --check --diff {args:.}"] +fmt = ["black {args:.}", "ruff --fix {args:.}", "style"] +all = ["style", "typing"] + +[tool.pytest.ini_options] +addopts = ["-rfEX", "-pno:warnings"] +python_files = ["test_*.py", "*_test.py"] +testpaths = ["tests"] + +[tool.black] +target-version = ["py311"] +line-length = 120 +exclude = ''' +( + /( + \.eggs + | \.git + | \.tox + | \.venv + | _build + | .build + | bazel-* + | build + | venv + | lib + | dist + )/ + | src/openllm/__about__.py +) +''' + +[tool.ruff] +target-version = "py311" +line-length = 120 +ignore = [ + # Allow non-abstract empty methods in abstract base classes + "B027", + # Allow boolean positional values in function calls, like `dict.get(... True)` + "FBT003", + # Ignore checks for possible passwords + "S105", + "S106", + "S107", + # Ignore complexity + "C901", + "PLR0911", + "PLR0912", + "PLR0913", + "PLR0915", +] +unfixable = [ + "F401", # Don't touch unused imports, just warn about it. 
+] + +[tool.ruff.pydocstyle] +convention = "google" + +[tool.ruff.isort] +known-first-party = ["openllm", "bentoml"] + +[tool.ruff.flake8-quotes] +inline-quotes = "single" + +[tool.ruff.flake8-tidy-imports] +ban-relative-imports = "all" + +[tool.ruff.per-file-ignores] +# Tests can use magic values, assertions, and relative imports +"tests/**/*" = ["PLR2004", "S101", "TID252"] + +[tool.pyright] +pythonVersion = "3.11" +include = ["src/", "tests/"] +analysis.useLibraryCodeForTypes = true +typeCheckingMode = "strict" +strictListInference = true +strictDictionaryInference = true +strictSetInference = true +strictParameterNoneValue = true +enableTypeIgnoreComments = true + + +[tool.coverage.run] +source_pkgs = ["openllm", "tests"] +branch = true +parallel = true +omit = ["src/openllm/__about__.py"] + +[tool.coverage.paths] +openllm = ["src/openllm", "*/openllm/src/openllm"] +tests = ["tests", "*/openllm/tests"] + +[tool.coverage.report] +exclude_lines = ["no cov", "if __name__ == .__main__.:", "if TYPE_CHECKING:"] diff --git a/rules/BUILD b/rules/BUILD new file mode 100644 index 00000000..c8b1eabb --- /dev/null +++ b/rules/BUILD @@ -0,0 +1,11 @@ +load("@bazel_skylib//:bzl_library.bzl", "bzl_library") + +package(default_visibility = ["//:__pkg__"]) + +[ + bzl_library( + name = "{}.bzl".format(src), + srcs = ["{}.bzl".format(src)], + ) + for src in ["deps"] +] diff --git a/rules/deps.bzl b/rules/deps.bzl new file mode 100644 index 00000000..a1dbfa28 --- /dev/null +++ b/rules/deps.bzl @@ -0,0 +1,190 @@ +load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") +load("@bazel_tools//tools/build_defs/repo:utils.bzl", "maybe") +load("@bazel_tools//tools/build_defs/repo:git.bzl", "git_repository", "new_git_repository") + +# NOTE: sync with pyproject.toml +GRPC_VERSION = "1.51.1" +GRPC_SHA256 = "b55696fb249669744de3e71acc54a9382bea0dce7cd5ba379b356b12b82d4229" +PROTOBUF_VERSION = "21.11" +PROTOBUF_SHA256 = "b1d6dd2cbb5d87e17af41cadb720322ce7e13af826268707bd8db47e5654770b" + +def bentoml_dependencies(): + # bentoml/plugins + maybe( + git_repository, + name = "com_github_bentoml_plugins", + remote = "https://github.com/bentoml/plugins.git", + branch = "main", + ) + + maybe( + http_archive, + name = "bazel_skylib", + sha256 = "74d544d96f4a5bb630d465ca8bbcfe231e3594e5aae57e1edbf17a6eb3ca2506", + urls = [ + "https://mirror.bazel.build/github.com/bazelbuild/bazel-skylib/releases/download/1.3.0/bazel-skylib-1.3.0.tar.gz", + "https://github.com/bazelbuild/bazel-skylib/releases/download/1.3.0/bazel-skylib-1.3.0.tar.gz", + ], + ) + + maybe( + http_archive, + name = "io_bazel_rules_go", + sha256 = "d6b2513456fe2229811da7eb67a444be7785f5323c6708b38d851d2b51e54d83", + urls = [ + "https://mirror.bazel.build/github.com/bazelbuild/rules_go/releases/download/v0.30.0/rules_go-v0.30.0.zip", + "https://github.com/bazelbuild/rules_go/releases/download/v0.30.0/rules_go-v0.30.0.zip", + ], + ) + + maybe( + http_archive, + name = "io_bazel_rules_docker", + sha256 = "b1e80761a8a8243d03ebca8845e9cc1ba6c82ce7c5179ce2b295cd36f7e394bf", + urls = ["https://github.com/bazelbuild/rules_docker/releases/download/v0.25.0/rules_docker-v0.25.0.tar.gz"], + ) + + maybe( + http_archive, + name = "bazel_gazelle", + sha256 = "de69a09dc70417580aabf20a28619bb3ef60d038470c7cf8442fafcf627c21cb", + urls = [ + "https://mirror.bazel.build/github.com/bazelbuild/bazel-gazelle/releases/download/v0.24.0/bazel-gazelle-v0.24.0.tar.gz", + "https://github.com/bazelbuild/bazel-gazelle/releases/download/v0.24.0/bazel-gazelle-v0.24.0.tar.gz", + ], + ) 
+ + maybe( + http_archive, + name = "rules_proto", + sha256 = "80d3a4ec17354cccc898bfe32118edd934f851b03029d63ef3fc7c8663a7415c", + strip_prefix = "rules_proto-5.3.0-21.5", + urls = [ + "https://github.com/bazelbuild/rules_proto/archive/refs/tags/5.3.0-21.5.tar.gz", + ], + ) + + maybe( + http_archive, + name = "rules_proto_grpc", + strip_prefix = "rules_proto_grpc-4.2.0", + sha256 = "bbe4db93499f5c9414926e46f9e35016999a4e9f6e3522482d3760dc61011070", + urls = ["https://github.com/rules-proto-grpc/rules_proto_grpc/archive/4.2.0.tar.gz"], + ) + + maybe( + http_archive, + name = "com_google_protobuf", + strip_prefix = "protobuf-{}".format(PROTOBUF_VERSION), + sha256 = PROTOBUF_SHA256, + urls = [ + "https://github.com/protocolbuffers/protobuf/archive/v{}.tar.gz".format(PROTOBUF_VERSION), + ], + ) + + maybe( + http_archive, + name = "com_github_grpc_grpc", + strip_prefix = "grpc-{}".format(GRPC_VERSION), + sha256 = GRPC_SHA256, + urls = [ + "https://github.com/grpc/grpc/archive/v{}.tar.gz".format(GRPC_VERSION), + ], + ) + + maybe( + http_archive, + name = "rules_foreign_cc", + sha256 = "2a4d07cd64b0719b39a7c12218a3e507672b82a97b98c6a89d38565894cf7c51", + strip_prefix = "rules_foreign_cc-0.9.0", + url = "https://github.com/bazelbuild/rules_foreign_cc/archive/refs/tags/0.9.0.tar.gz", + ) + + # buildifier + maybe( + http_archive, + name = "com_github_bazelbuild_buildtools", + sha256 = "ae34c344514e08c23e90da0e2d6cb700fcd28e80c02e23e4d5715dddcb42f7b3", + strip_prefix = "buildtools-4.2.2", + urls = [ + "https://github.com/bazelbuild/buildtools/archive/refs/tags/4.2.2.tar.gz", + ], + ) + + # buf rules + maybe( + http_archive, + name = "rules_buf", + sha256 = "523a4e06f0746661e092d083757263a249fedca535bd6dd819a8c50de074731a", + strip_prefix = "rules_buf-0.1.1", + urls = [ + "https://github.com/bufbuild/rules_buf/archive/refs/tags/v0.1.1.zip", + ], + ) + + # python rules + maybe( + http_archive, + name = "rules_python", + sha256 = "8c15896f6686beb5c631a4459a3aa8392daccaab805ea899c9d14215074b60ef", + strip_prefix = "rules_python-0.17.3", + url = "https://github.com/bazelbuild/rules_python/archive/refs/tags/0.17.3.tar.gz", + ) + + # The following library will need to be built from source. + maybe( + new_git_repository, + name = "com_github_microsoft_lightgbm", + init_submodules = True, + recursive_init_submodules = True, + commit = "f1d3181ced9fd01f4b2899054abd99be6773e939", + build_file = Label("//third_party:BUILD.lightgbm"), + remote = "https://github.com/microsoft/LightGBM.git", + shallow_since = "1667710116 -0500", + ) + + # io_grpc_grpc_java is for java_grpc_library and related dependencies. 
+ # Using commit 0cda133c52ed937f9b0a19bcbfc36bf2892c7aa8 + maybe( + http_archive, + name = "rules_jvm_external", + sha256 = "c21ce8b8c4ccac87c809c317def87644cdc3a9dd650c74f41698d761c95175f3", + strip_prefix = "rules_jvm_external-1498ac6ccd3ea9cdb84afed65aa257c57abf3e0a", + url = "https://github.com/bazelbuild/rules_jvm_external/archive/1498ac6ccd3ea9cdb84afed65aa257c57abf3e0a.zip", + ) + maybe( + http_archive, + name = "io_grpc_grpc_java", + sha256 = "35189faf484096c9eb2928c43b39f2457d1ca39046704ba8c65a69482f8ceed5", + strip_prefix = "grpc-java-0cda133c52ed937f9b0a19bcbfc36bf2892c7aa8", + urls = ["https://github.com/grpc/grpc-java/archive/0cda133c52ed937f9b0a19bcbfc36bf2892c7aa8.tar.gz"], + ) + + # rules_kotlin + maybe( + http_archive, + name = "io_bazel_rules_kotlin", + sha256 = "a57591404423a52bd6b18ebba7979e8cd2243534736c5c94d35c89718ea38f94", + urls = ["https://github.com/bazelbuild/rules_kotlin/releases/download/v1.6.0/rules_kotlin_release.tgz"], + ) + maybe( + http_archive, + name = "com_github_grpc_grpc_kotlin", + sha256 = "b1ec1caa5d81f4fa4dca0662f8112711c82d7db6ba89c928ca7baa4de50afbb2", + strip_prefix = "grpc-kotlin-a1659c1b3fb665e01a6854224c7fdcafc8e54d56", + urls = ["https://github.com/grpc/grpc-kotlin/archive/a1659c1b3fb665e01a6854224c7fdcafc8e54d56.tar.gz"], + ) + + # rules_swift and rules_apple + maybe( + http_archive, + name = "build_bazel_rules_swift", + sha256 = "51efdaf85e04e51174de76ef563f255451d5a5cd24c61ad902feeadafc7046d9", + url = "https://github.com/bazelbuild/rules_swift/releases/download/1.2.0/rules_swift.1.2.0.tar.gz", + ) + maybe( + http_archive, + name = "build_bazel_apple_support", + sha256 = "2e3dc4d0000e8c2f5782ea7bb53162f37c485b5d8dc62bb3d7d7fc7c276f0d00", + url = "https://github.com/bazelbuild/apple_support/releases/download/1.3.2/apple_support.1.3.2.tar.gz", + ) diff --git a/src/openllm/__about__.py b/src/openllm/__about__.py new file mode 100644 index 00000000..3ab2e17e --- /dev/null +++ b/src/openllm/__about__.py @@ -0,0 +1,14 @@ +# Copyright 2023 BentoML Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +__version__ = "0.0.2" diff --git a/src/openllm/__init__.py b/src/openllm/__init__.py new file mode 100644 index 00000000..dcaae940 --- /dev/null +++ b/src/openllm/__init__.py @@ -0,0 +1,228 @@ +# Copyright 2023 BentoML Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" +OpenLLM +======= + +OpenLLM: Your one stop-and-go-solution for serving any Open Large-Language Model + +- StableLM, Llama, Alpaca, Dolly, Flan-T5, and more +- Powered by BentoML 🍱 + HuggingFace 🤗 +""" +from __future__ import annotations + +import typing as t + +from .__about__ import __version__ as __version__ +from .exceptions import MissingDependencyError +from .utils import LazyModule as _LazyModule +from .utils import import_utils_shim as imports + +_import_structure = { + "build_utils": [], + # TODO: implement + # "cache": [], + "cli": [], + "configuration_utils": ["LLMConfig"], + "exceptions": [], + "prompts": ["Prompt"], + "runner_utils": ["LLMRunner", "LLMRunnable", "generate_tokenizer_runner"], + "schema": ["PromptTemplate"], + "server_utils": ["start"], + "types": [], + "utils": [ + "get_pretrained_env", + "get_working_dir", + "FRAMEWORK_ENV_VAR", + "generate_service_name", + "generate_tag_from_model_name", + ], + "utils.logging": [], + "models": [], + # NOTE: models + "models.auto": [ + "Config", + "CONFIG_MAPPING", + "Tokenizer", + "TOKENIZER_MAPPING", + "TOKENIZER_MAPPING_NAMES", + ], + "models.flan_t5": ["FlanT5Config", "START_FLAN_T5_COMMAND_DOCSTRING"], +} + +try: + if not imports.is_torch_available(): + raise MissingDependencyError +except MissingDependencyError: + pass +else: + _import_structure["models.flan_t5"].extend(["FlanT5", "FlanT5WithTokenizer", "FlanT5Tokenizer"]) + _import_structure["models.auto"].extend( + [ + "LLM", + "LLMWithTokenizer", + "MODEL_MAPPING_NAMES", + "MODEL_WITH_TOKENIZER_MAPPING_NAMES", + "MODEL_MAPPING", + "MODEL_WITH_TOKENIZER_MAPPING", + ] + ) + +try: + if not imports.is_flax_available(): + raise MissingDependencyError +except MissingDependencyError: + pass +else: + _import_structure["models.flan_t5"].extend(["FlaxFlanT5", "FlaxFlanT5WithTokenizer"]) + _import_structure["models.auto"].extend( + [ + "FlaxLLM", + "FlaxLLMWithTokenizer", + "MODEL_FLAX_MAPPING_NAMES", + "MODEL_FLAX_WITH_TOKENIZER_MAPPING_NAMES", + "MODEL_FLAX_MAPPING", + "MODEL_FLAX_WITH_TOKENIZER_MAPPING", + ] + ) + +try: + if not imports.is_tf_available(): + raise MissingDependencyError +except MissingDependencyError: + pass +else: + _import_structure["models.flan_t5"].extend(["TFFlanT5", "TFFlanT5WithTokenizer"]) + _import_structure["models.auto"].extend( + [ + "TFLLM", + "TFLLMWithTokenizer", + "MODEL_TF_MAPPING_NAMES", + "MODEL_TF_WITH_TOKENIZER_MAPPING_NAMES", + "MODEL_TF_MAPPING", + "MODEL_TF_WITH_TOKENIZER_MAPPING", + ] + ) + + +# declaration for OpenLLM-related modules +if t.TYPE_CHECKING: + from . import build_utils as build_utils + from . import cli as cli + from . import configuration_utils as configuration_utils + from . import exceptions as exceptions + from . import models as models + from . import prompts as prompts + from . import runner_utils as runner_utils + from . import schema as schema + from . import server_utils as server_utils + from . import types as types + from . 
import utils as utils + # Specific types import + from .configuration_utils import LLMConfig as LLMConfig + from .models.auto import CONFIG_MAPPING as CONFIG_MAPPING + from .models.auto import TOKENIZER_MAPPING as TOKENIZER_MAPPING + from .models.auto import TOKENIZER_MAPPING_NAMES as TOKENIZER_MAPPING_NAMES + from .models.auto import Config as Config + from .models.auto import Tokenizer as Tokenizer + from .models.flan_t5 import \ + START_FLAN_T5_COMMAND_DOCSTRING as START_FLAN_T5_COMMAND_DOCSTRING + from .models.flan_t5 import FlanT5Config as FlanT5Config + from .prompts import Prompt as Prompt + from .runner_utils import LLMRunnable as LLMRunnable + from .runner_utils import LLMRunner as LLMRunner + from .runner_utils import \ + generate_tokenizer_runner as generate_tokenizer_runner + from .schema import PromptTemplate as PromptTemplate + from .server_utils import start as start + from .utils import FRAMEWORK_ENV_VAR as FRAMEWORK_ENV_VAR + from .utils import generate_service_name as generate_service_name + from .utils import \ + generate_tag_from_model_name as generate_tag_from_model_name + from .utils import get_pretrained_env as get_pretrained_env + from .utils import get_working_dir as get_working_dir + + try: + if not imports.is_torch_available(): + raise MissingDependencyError + except MissingDependencyError: + pass + else: + from .models.auto import LLM as LLM + from .models.auto import MODEL_MAPPING as MODEL_MAPPING + from .models.auto import MODEL_MAPPING_NAMES as MODEL_MAPPING_NAMES + from .models.auto import \ + MODEL_WITH_TOKENIZER_MAPPING as MODEL_WITH_TOKENIZER_MAPPING + from .models.auto import \ + MODEL_WITH_TOKENIZER_MAPPING_NAMES as \ + MODEL_WITH_TOKENIZER_MAPPING_NAMES + from .models.auto import LLMWithTokenizer as LLMWithTokenizer + from .models.flan_t5 import FlanT5 as FlanT5 + from .models.flan_t5 import FlanT5Tokenizer as FlanT5Tokenizer + from .models.flan_t5 import FlanT5WithTokenizer as FlanT5WithTokenizer + + try: + if not imports.is_flax_available(): + raise MissingDependencyError + except MissingDependencyError: + pass + else: + from .models.auto import MODEL_FLAX_MAPPING as MODEL_FLAX_MAPPING + from .models.auto import \ + MODEL_FLAX_MAPPING_NAMES as MODEL_FLAX_MAPPING_NAMES + from .models.auto import \ + MODEL_FLAX_WITH_TOKENIZER_MAPPING as \ + MODEL_FLAX_WITH_TOKENIZER_MAPPING + from .models.auto import \ + MODEL_FLAX_WITH_TOKENIZER_MAPPING_NAMES as \ + MODEL_FLAX_WITH_TOKENIZER_MAPPING_NAMES + from .models.auto import FlaxLLM as FlaxLLM + from .models.auto import FlaxLLMWithTokenizer as FlaxLLMWithTokenizer + from .models.flan_t5 import FlaxFlanT5 as FlaxFlanT5 + from .models.flan_t5 import \ + FlaxFlanT5WithTokenizer as FlaxFlanT5WithTokenizer + + try: + if not imports.is_tf_available(): + raise MissingDependencyError + except MissingDependencyError: + pass + else: + from .models.auto import MODEL_TF_MAPPING as MODEL_TF_MAPPING + from .models.auto import \ + MODEL_TF_MAPPING_NAMES as MODEL_TF_MAPPING_NAMES + from .models.auto import \ + MODEL_TF_WITH_TOKENIZER_MAPPING as MODEL_TF_WITH_TOKENIZER_MAPPING + from .models.auto import \ + MODEL_TF_WITH_TOKENIZER_MAPPING_NAMES as \ + MODEL_TF_WITH_TOKENIZER_MAPPING_NAMES + from .models.auto import TFLLM as TFLLM + from .models.auto import TFLLMWithTokenizer as TFLLMWithTokenizer + from .models.flan_t5 import TFFlanT5 as TFFlanT5 + from .models.flan_t5 import \ + TFFlanT5WithTokenizer as TFFlanT5WithTokenizer + +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + 
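+        # mapping of submodule -> exported names built above; the actual
+        # imports only happen on first attribute access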
_import_structure,
+        module_spec=__spec__,
+        extra_objects={"__version__": __version__},
+    )
+    del sys, _LazyModule
+
+del imports, t, _import_structure, MissingDependencyError
diff --git a/src/openllm/__main__.py b/src/openllm/__main__.py
new file mode 100644
index 00000000..e9c17f26
--- /dev/null
+++ b/src/openllm/__main__.py
@@ -0,0 +1,104 @@
+# Copyright 2023 BentoML Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+CLI entrypoint for OpenLLM.
+
+Usage:
+    openllm --help
+
+To start any OpenLLM model:
+    openllm start <model_name> --options ...
+"""
+from __future__ import annotations
+
+import typing as t
+
+import click
+
+import openllm
+
+_CONTEXT_SETTINGS = {"help_option_names": ["-h", "--help"]}
+
+
+@click.group(cls=openllm.cli.OpenLLMCommandGroup, context_settings=_CONTEXT_SETTINGS)
+@click.version_option(openllm.__version__, "-v", "--version")
+def cli():
+    """
+    \b
+    ██████╗ ██████╗ ███████╗███╗   ██╗██╗     ██╗     ███╗   ███╗
+    ██╔═══██╗██╔══██╗██╔════╝████╗  ██║██║     ██║     ████╗ ████║
+    ██║   ██║██████╔╝█████╗  ██╔██╗ ██║██║     ██║     ██╔████╔██║
+    ██║   ██║██╔═══╝ ██╔══╝  ██║╚██╗██║██║     ██║     ██║╚██╔╝██║
+    ╚██████╔╝██║     ███████╗██║ ╚████║███████╗███████╗██║ ╚═╝ ██║
+     ╚═════╝ ╚═╝     ╚══════╝╚═╝  ╚═══╝╚══════╝╚══════╝╚═╝     ╚═╝
+
+    \b
+    OpenLLM: Your one stop-and-go-solution for serving any Open Large-Language Model
+
+    - StableLM, Llama, Alpaca, Dolly, Flan-T5, and more
+
+    \b
+    - Powered by BentoML 🍱 + HuggingFace 🤗
+    """
+
+
+@cli.group(cls=openllm.cli.StartCommand, context_settings=_CONTEXT_SETTINGS)
+def start():
+    """
+    Start any LLM as a REST server.
+
+    $ openllm start <model_name> -- ...
+    """
+
+
+@cli.group(cls=openllm.cli.StartCommand, context_settings=_CONTEXT_SETTINGS, _serve_grpc=True, name="start-grpc")
+def start_grpc():
+    """
+    Start any LLM as a gRPC server.
+
+    $ openllm start-grpc <model_name> -- ...
+    """
+
+
+@cli.command(aliases=["bundle"])
+def build():
+    """
+    Package a given model.
+
+    If the given format is `container`, also package the bundle into a container.
+    """
+
+
+@cli.command(hidden=True)
+def deploy():
+    """
+    Deploy a model to a target platform.
+
+    Deployment options:
+    - BentoCloud
+    - Self-hosted Yatai
+    - SageMaker, ECR, EC2
+    """
+
+
+@cli.command(name="supported-models")
+def supported_models():
+    """
+    List all supported models.
+    """
+    click.secho(f"\nSupported LLM: {', '.join(openllm.CONFIG_MAPPING.keys())}", fg="blue")
+
+
+if __name__ == "__main__":
+    cli()
diff --git a/src/openllm/build_utils.py b/src/openllm/build_utils.py
new file mode 100644
index 00000000..a06ae784
--- /dev/null
+++ b/src/openllm/build_utils.py
@@ -0,0 +1,17 @@
+# Copyright 2023 BentoML Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Any build-related utilities. This is used for CI.
+"""
+from __future__ import annotations
diff --git a/src/openllm/cache/__init__.py b/src/openllm/cache/__init__.py
new file mode 100644
index 00000000..aa6319c7
--- /dev/null
+++ b/src/openllm/cache/__init__.py
@@ -0,0 +1,17 @@
+# Copyright 2023 BentoML Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Cache utilities for OpenLLM client.
+"""
diff --git a/src/openllm/cache/inmemory.py b/src/openllm/cache/inmemory.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/openllm/cli.py b/src/openllm/cli.py
new file mode 100644
index 00000000..aa9f6607
--- /dev/null
+++ b/src/openllm/cli.py
@@ -0,0 +1,244 @@
+# Copyright 2023 BentoML Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+CLI utilities for OpenLLM.
+
+This extends clidantic and BentoML's internal CLI CommandGroup.
+"""
+from __future__ import annotations
+
+import difflib
+import functools
+import inspect
+import logging
+import typing as t
+
+import click
+from click_option_group import optgroup
+
+import openllm
+
+if t.TYPE_CHECKING:
+    from openllm.types import F, P
+
+    class ClickFunctionProtocol(t.Protocol[P]):
+        __name__: str
+        __click_params__: list[click.Option]
+
+        def __call__(self, *args: P.args, **kwargs: P.kwargs) -> t.Any:
+            ...
+
+    ServeCommand = t.Literal["serve", "serve-grpc", "start-http-server", "start-grpc-server", "start-runner-server"]
+
+
+logger = logging.getLogger(__name__)
+
+
+class OpenLLMCommandGroup(click.Group):
+    NUM_COMMON_PARAMS = 2
+
+    @staticmethod
+    def common_params(f: F[P]) -> ClickFunctionProtocol[t.Any]:
+        # The following logic is similar to that of BentoMLCommandGroup
+
+        from bentoml._internal.configuration import (DEBUG_ENV_VAR,
+                                                     QUIET_ENV_VAR,
+                                                     set_debug_mode,
+                                                     set_quiet_mode)
+        from bentoml._internal.log import configure_logging
+
+        @click.option("-q", "--quiet", envvar=QUIET_ENV_VAR, is_flag=True, default=False, help="Suppress all output.")
+        @click.option(
+            "--debug", "--verbose", envvar=DEBUG_ENV_VAR, is_flag=True, default=False, help="Print out debug logs."
+        )
+        @functools.wraps(f)
+        def wrapper(quiet: bool, debug: bool, *args: P.args, **kwargs: P.kwargs) -> t.Any:
+            if quiet:
+                set_quiet_mode(True)
+                if debug:
+                    logger.warning("'--quiet' passed; ignoring '--verbose/--debug'")
+            elif debug:
+                set_debug_mode(True)
+
+            configure_logging()
+
+            return f(*args, **kwargs)
+
+        return wrapper
+
+    def __init__(self, *args: t.Any, **kwargs: t.Any) -> None:
+        super(OpenLLMCommandGroup, self).__init__(*args, **kwargs)
+        # these two dictionaries will store known aliases for commands and groups
+        self._commands: dict[str, list[str]] = {}
+        self._aliases: dict[str, str] = {}
+
+    # ported from bentoml_cli.utils.BentoMLCommandGroup to handle aliases and command suggestions via difflib.
+    def resolve_alias(self, cmd_name: str):
+        return self._aliases[cmd_name] if cmd_name in self._aliases else cmd_name
+
+    def get_command(self, ctx: click.Context, cmd_name: str) -> click.Command | None:
+        cmd_name = self.resolve_alias(cmd_name)
+        return super(OpenLLMCommandGroup, self).get_command(ctx, cmd_name)
+
+    @staticmethod
+    def common_chain(f: F[P]) -> ClickFunctionProtocol[t.Any]:
+        # Wrap implementation with common parameters
+        wrapped = OpenLLMCommandGroup.common_params(f)
+        # TODO: Tracking
+        # TODO: Handling exception, using ExceptionGroup and Rich
+
+        # move common parameters to the end of the parameters list
+        wrapped.__click_params__ = (
+            wrapped.__click_params__[-OpenLLMCommandGroup.NUM_COMMON_PARAMS :]
+            + wrapped.__click_params__[: -OpenLLMCommandGroup.NUM_COMMON_PARAMS]
+        )
+        return wrapped
+
+    def command(self, *args: t.Any, **kwargs: t.Any) -> t.Callable[[F[P]], click.Command]:
+        if "context_settings" not in kwargs:
+            kwargs["context_settings"] = {}
+        kwargs["context_settings"]["max_content_width"] = 119
+        aliases = kwargs.pop("aliases", None)
+
+        def wrapper(f: F[P]) -> click.Command:
+            name = f.__name__.lower().replace("_", "-")
+            kwargs.setdefault("help", inspect.getdoc(f))
+            kwargs.setdefault("name", name)
+
+            cmd = super(OpenLLMCommandGroup, self).command(*args, **kwargs)(OpenLLMCommandGroup.common_chain(f))
+            # add aliases to a given command if specified.
+            if aliases is not None:
+                assert cmd.name
+                self._commands[cmd.name] = aliases
+                self._aliases.update({alias: cmd.name for alias in aliases})
+
+            return cmd
+
+        return wrapper
+
+    def format_commands(self, ctx: click.Context, formatter: click.HelpFormatter) -> None:
+        rows: list[tuple[str, str]] = []
+        sub_commands = self.list_commands(ctx)
+
+        max_len = max(len(cmd) for cmd in sub_commands)
+        limit = formatter.width - 6 - max_len
+
+        for sub_command in sub_commands:
+            cmd = self.get_command(ctx, sub_command)
+            if cmd is None:
+                continue
+            # If the command is hidden, then we skip it.
+            if hasattr(cmd, "hidden") and cmd.hidden:
+                continue
+            if sub_command in self._commands:
+                aliases = ",".join(sorted(self._commands[sub_command]))
+                sub_command = "%s (%s)" % (sub_command, aliases)
+            # this cmd_help is available since click>=7;
+            # BentoML requires click>=7.
+            cmd_help = cmd.get_short_help_str(limit)
+            rows.append((sub_command, cmd_help))
+        if rows:
+            with formatter.section("Commands"):
+                formatter.write_dl(rows)
+
+    def resolve_command(
+        self, ctx: click.Context, args: list[str]
+    ) -> tuple[str | None, click.Command | None, list[str]]:
+        try:
+            return super(OpenLLMCommandGroup, self).resolve_command(ctx, args)
+        except click.exceptions.UsageError as e:
+            error_msg = str(e)
+            original_cmd_name = click.utils.make_str(args[0])
+            matches = difflib.get_close_matches(original_cmd_name, self.list_commands(ctx), 3, 0.5)
+            if matches:
+                fmt_matches = "\n ".join(matches)
+                error_msg += "\n\n"
+                error_msg += f"Did you mean?\n {fmt_matches}"
+            raise click.exceptions.UsageError(error_msg, e.ctx)
+
+
+def start_model_command(
+    model_name: str, _context_settings: dict[str, t.Any] | None = None, _serve_grpc: bool = False
+) -> click.Command:
+    _context_settings = _context_settings or {}
+    config = openllm.Config.for_model(model_name)
+
+    def decorator(f: F[P]) -> click.Command:
+        f = openllm.configuration_utils.LLMConfig.generate_click_options(config)(f)
+        f = parse_serve_args(_serve_grpc)(f)
+        return click.command(
+            model_name,
+            short_help=f"Start an LLMServer for '{model_name}' ('--help' for more details)",
+            context_settings=_context_settings,
+            help=getattr(openllm, f"START_{openllm.utils.kebab_to_snake_case(model_name).upper()}_COMMAND_DOCSTRING"),
+        )(openllm.cli.OpenLLMCommandGroup.common_chain(f))
+
+    # The actual `start <model_name>` implementation
+    def model_start(**attrs: t.Any):
+        llm_config_args = {k: attrs[k] for k in config.__fields__ if k in attrs}
+        # The rest should be server-related args
+        server_args = {k: v for k, v in attrs.items() if k not in list(llm_config_args.keys())}
+
+        openllm.start(model_name, server_args=server_args, serve_grpc=_serve_grpc, **llm_config_args)
+
+    return decorator(model_start)
+
+
+class StartCommand(click.MultiCommand):
+    def __init__(self, *args: t.Any, **kwargs: t.Any):
+        self._serve_grpc = kwargs.pop("_serve_grpc", False)
+        super(StartCommand, self).__init__(*args, **kwargs)
+        self._cached_command: dict[str, click.Command] = {}
+
+    def list_commands(self, ctx: click.Context):
+        return openllm.CONFIG_MAPPING.keys()
+
+    def get_command(self, ctx: click.Context, cmd_name: str) -> click.Command:
+        if cmd_name not in self._cached_command:
+            self._cached_command[cmd_name] = start_model_command(cmd_name, _serve_grpc=self._serve_grpc)
+        return self._cached_command[cmd_name]
+
+
+def parse_serve_args(serve_grpc: bool = False) -> F[P]:
+    """Parse the options of `bentoml serve|serve-grpc` so they can be reused by `openllm start`."""
+    from bentoml_cli.cli import cli
+
+    command = "serve-http" if not serve_grpc else "serve-grpc"
+    group = optgroup.group(
+        f"{'HTTP' if not serve_grpc else 'gRPC'} server options",
+        help=f"Related to serving the model [synonymous to `bentoml {command}`]",
+    )
+
+    def decorator(f: F[P]) -> F[P]:
+        _, serve_command, _ = cli.resolve_command(click.get_current_context(), [command])
+        # The first variable is the argument bento
+        # and the last three are shared defaults, which we don't need.
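+        # Hence the ``[1:-3]`` slice below keeps only the serve options worth re-exposing.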
+        serve_options = serve_command.params[1:-3]
+        for options in reversed(serve_options):
+            attrs = options.to_info_dict()
+            # we don't need param_type_name, since it should all be options
+            attrs.pop("param_type_name")
+            # name is not a valid arg
+            name = attrs.pop("name")
+            # type can be determined from the default value
+            attrs.pop("type")
+            param_decls = (*attrs.pop("opts"), *attrs.pop("secondary_opts"))
+            # NOTE: User shouldn't set '--working-dir', as OpenLLM will set this up.
+            # NOTE: production is also deprecated
+            if name not in ("working_dir", "production"):
+                f = optgroup.option(*param_decls, **attrs)(f)
+
+        return group(f)
+
+    return decorator
diff --git a/src/openllm/configuration_utils.py b/src/openllm/configuration_utils.py
new file mode 100644
index 00000000..8f675e10
--- /dev/null
+++ b/src/openllm/configuration_utils.py
@@ -0,0 +1,101 @@
+# Copyright 2023 BentoML Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Configuration utilities for OpenLLM. All model configurations inherit from ``openllm.configuration_utils.LLMConfig``.
+
+Note that ``openllm.configuration_utils.LLMConfig`` is a subclass of ``pydantic.BaseModel``. It also provides
+``generate_click_options``, which returns a list of Click-compatible options for the model. Such options will then be
+passed to ``openllm.__main__.cli``.
+"""
+from __future__ import annotations
+
+import typing as t
+from abc import ABC
+
+import click
+import pydantic
+import yaml
+from bentoml._internal.models.model import ModelSignature
+from click_option_group import optgroup
+from clidantic.click import (allows_multiple, parse_default, parse_type,
+                             should_show_default)
+from clidantic.convert import param_from_field
+from pydantic.utils import lenient_issubclass
+
+import openllm
+
+if t.TYPE_CHECKING:
+    from openllm.types import F, P
+
+
+__all__ = ["LLMConfig", "ModelSignature"]
+
+delimiter = "."
+internal_delimiter = "__"
+
+
+def models_to_options(
+    model: type[LLMConfig], parent_path: tuple[str, ...]
= tuple()
+) -> t.Generator[t.Callable[[F[P]], F[P]], None, None]:
+    # The following logic is adapted from clidantic.convert
+    for field in model.__fields__.values():
+        kebab_name = field.name.replace("_", "-")
+        assert internal_delimiter not in kebab_name, f"Field name {kebab_name} contains internal delimiter"
+        if lenient_issubclass(field.outer_type_, pydantic.BaseModel):
+            yield from models_to_options(field.outer_type_, parent_path=parent_path + (kebab_name,))
+            continue
+
+        params = param_from_field(field, kebab_name, delimiter, internal_delimiter, parent_path)
+        yield optgroup.option(
+            *params,
+            type=parse_type(field.outer_type_),
+            required=field.required,
+            default=parse_default(field.default, field.outer_type_),
+            show_default=should_show_default(field.default, field.outer_type_),
+            multiple=allows_multiple(field.outer_type_),
+            help=field.field_info.description,
+            show_envvar=True,
+            envvar=f"OPENLLM_{model.__name__.partition('Config')[0].upper()}_{field.name.upper()}",
+        )
+
+
+class BaseConfig(pydantic.BaseModel, ABC):
+    class Config:
+        extra = "forbid"
+        underscore_attrs_are_private = True
+
+    def with_options(self, **kwargs: t.Any) -> BaseConfig:
+        return self.copy(update=kwargs)
+
+    @classmethod
+    def from_yaml(cls, path: str, ctx: str | None = None) -> BaseConfig:
+        with open(openllm.utils.resolve_user_filepath(path, ctx=ctx), "rb") as f:
+            content = yaml.safe_load(f)
+        return cls(**content)
+
+    def to_yaml(self):
+        return yaml.safe_dump(self.dict(), sort_keys=False)
+
+
+class LLMConfig(BaseConfig):
+    @staticmethod
+    def generate_click_options(config: LLMConfig) -> t.Callable[[t.Callable[..., t.Any]], click.Command]:
+        klass = config.__class__
+        group = optgroup.group(f"{klass.__name__} options", help=f"[Auto-generated from '{klass}']")
+
+        def wrapper(f: t.Callable[..., t.Any]) -> click.Command:
+            for option in reversed(list(models_to_options(klass))):
+                f = option(f)
+            return group(f)
+
+        return wrapper
diff --git a/src/openllm/exceptions.py b/src/openllm/exceptions.py
new file mode 100644
index 00000000..2a27e631
--- /dev/null
+++ b/src/openllm/exceptions.py
@@ -0,0 +1,31 @@
+# Copyright 2023 BentoML Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Base exceptions for OpenLLM. This extends BentoML exceptions.
+"""
+from __future__ import annotations
+
+import bentoml
+
+
+class OpenLLMException(bentoml.exceptions.BentoMLException):
+    """Base class for all OpenLLM exceptions. This extends BentoMLException."""
+
+
+class ForbiddenAttributeError(OpenLLMException):
+    """Raised when using an _internal field."""
+
+
+class MissingDependencyError(BaseException):
+    """Raised when a dependency is missing."""
diff --git a/src/openllm/models/__init__.py b/src/openllm/models/__init__.py
new file mode 100644
index 00000000..b32b98e0
--- /dev/null
+++ b/src/openllm/models/__init__.py
@@ -0,0 +1,16 @@
+# Copyright 2023 BentoML Team. All rights reserved.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import auto as auto +from . import flan_t5 as flan_t5 diff --git a/src/openllm/models/auto/__init__.py b/src/openllm/models/auto/__init__.py new file mode 100644 index 00000000..b41f1f5f --- /dev/null +++ b/src/openllm/models/auto/__init__.py @@ -0,0 +1,141 @@ +# Copyright 2023 BentoML Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""This module is derived from HuggingFace's AutoConfig, Tokenizer, AutoModel, etc.""" + +from __future__ import annotations + +import typing as t + +import openllm + +from ...utils import LazyModule +from ...utils import import_utils_shim as imports + +_import_structure = { + "configuration_auto": ["Config", "CONFIG_MAPPING", "CONFIG_MAPPING_NAMES"], + "tokenization_auto": ["Tokenizer", "TOKENIZER_MAPPING", "TOKENIZER_MAPPING_NAMES"], +} + +try: + if not imports.is_torch_available(): + raise openllm.exceptions.MissingDependencyError +except openllm.exceptions.MissingDependencyError: + pass +else: + _import_structure["modeling_auto"] = [ + "LLM", + "LLMWithTokenizer", + "MODEL_MAPPING_NAMES", + "MODEL_WITH_TOKENIZER_MAPPING_NAMES", + "MODEL_MAPPING", + "MODEL_WITH_TOKENIZER_MAPPING", + ] + +try: + if not imports.is_flax_available(): + raise openllm.exceptions.MissingDependencyError +except openllm.exceptions.MissingDependencyError: + pass +else: + _import_structure["modeling_flax_auto"] = [ + "FlaxLLM", + "FlaxLLMWithTokenizer", + "MODEL_FLAX_MAPPING_NAMES", + "MODEL_FLAX_WITH_TOKENIZER_MAPPING_NAMES", + "MODEL_FLAX_MAPPING", + "MODEL_FLAX_WITH_TOKENIZER_MAPPING", + ] + +try: + if not imports.is_tf_available(): + raise openllm.exceptions.MissingDependencyError +except openllm.exceptions.MissingDependencyError: + pass +else: + _import_structure["modeling_tf_auto"] = [ + "TFLLM", + "TFLLMWithTokenizer", + "MODEL_TF_MAPPING_NAMES", + "MODEL_TF_WITH_TOKENIZER_MAPPING_NAMES", + "MODEL_TF_MAPPING", + "MODEL_TF_WITH_TOKENIZER_MAPPING", + ] + +if t.TYPE_CHECKING: + from .configuration_auto import CONFIG_MAPPING as CONFIG_MAPPING + from .configuration_auto import \ + CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES + from .configuration_auto import Config as Config + from .tokenization_auto import TOKENIZER_MAPPING as TOKENIZER_MAPPING + from .tokenization_auto import \ + TOKENIZER_MAPPING_NAMES as TOKENIZER_MAPPING_NAMES + from .tokenization_auto import Tokenizer as Tokenizer + + try: + if not imports.is_torch_available(): + raise openllm.exceptions.MissingDependencyError + except 
openllm.exceptions.MissingDependencyError: + pass + else: + from .modeling_auto import LLM as LLM + from .modeling_auto import MODEL_MAPPING as MODEL_MAPPING + from .modeling_auto import MODEL_MAPPING_NAMES as MODEL_MAPPING_NAMES + from .modeling_auto import \ + MODEL_WITH_TOKENIZER_MAPPING as MODEL_WITH_TOKENIZER_MAPPING + from .modeling_auto import \ + MODEL_WITH_TOKENIZER_MAPPING_NAMES as \ + MODEL_WITH_TOKENIZER_MAPPING_NAMES + from .modeling_auto import LLMWithTokenizer as LLMWithTokenizer + + try: + if not imports.is_flax_available(): + raise openllm.exceptions.MissingDependencyError + except openllm.exceptions.MissingDependencyError: + pass + else: + from .modeling_flax_auto import \ + MODEL_FLAX_MAPPING as MODEL_FLAX_MAPPING + from .modeling_flax_auto import \ + MODEL_FLAX_MAPPING_NAMES as MODEL_FLAX_MAPPING_NAMES + from .modeling_flax_auto import \ + MODEL_FLAX_WITH_TOKENIZER_MAPPING as \ + MODEL_FLAX_WITH_TOKENIZER_MAPPING + from .modeling_flax_auto import \ + MODEL_FLAX_WITH_TOKENIZER_MAPPING_NAMES as \ + MODEL_FLAX_WITH_TOKENIZER_MAPPING_NAMES + from .modeling_flax_auto import FlaxLLM as FlaxLLM + from .modeling_flax_auto import \ + FlaxLLMWithTokenizer as FlaxLLMWithTokenizer + + try: + if not imports.is_tf_available(): + raise openllm.exceptions.MissingDependencyError + except openllm.exceptions.MissingDependencyError: + pass + else: + from .modeling_tf_auto import MODEL_TF_MAPPING as MODEL_TF_MAPPING + from .modeling_tf_auto import \ + MODEL_TF_MAPPING_NAMES as MODEL_TF_MAPPING_NAMES + from .modeling_tf_auto import \ + MODEL_TF_WITH_TOKENIZER_MAPPING as MODEL_TF_WITH_TOKENIZER_MAPPING + from .modeling_tf_auto import \ + MODEL_TF_WITH_TOKENIZER_MAPPING_NAMES as \ + MODEL_TF_WITH_TOKENIZER_MAPPING_NAMES + from .modeling_tf_auto import TFLLM as TFLLM + from .modeling_tf_auto import TFLLMWithTokenizer as TFLLMWithTokenizer +else: + import sys + + sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/src/openllm/models/auto/configuration_auto.py b/src/openllm/models/auto/configuration_auto.py new file mode 100644 index 00000000..5e366137 --- /dev/null +++ b/src/openllm/models/auto/configuration_auto.py @@ -0,0 +1,88 @@ +# Copyright 2023 BentoML Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
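+
+# NOTE: a hedged usage sketch of the mapping machinery defined below; illustrative
+# only. ``Config.for_model`` resolves a model name to its config class through
+# ``CONFIG_MAPPING`` and forwards any overrides to the pydantic constructor:
+#
+#   >>> import openllm
+#   >>> config = openllm.Config.for_model("flan-t5", temperature=0.5)
+#   >>> type(config).__name__
+#   'FlanT5Config'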
+
+from __future__ import annotations
+
+import types
+import typing as t
+from collections import OrderedDict
+
+import openllm
+
+# NOTE: This is the entrypoint when adding new model config
+CONFIG_MAPPING_NAMES = OrderedDict([("flan_t5", "FlanT5Config")])
+
+
+class _LazyConfigMapping(OrderedDict[str, str]):
+    def __init__(self, mapping: OrderedDict[str, str]):
+        self._mapping = mapping
+        self._extra_content: dict[str, t.Any] = {}
+        self._modules: dict[str, types.ModuleType] = {}
+
+    def __getitem__(self, key: str):
+        if key in self._extra_content:
+            return self._extra_content[key]
+        if key not in self._mapping:
+            raise KeyError(key)
+        value = self._mapping[key]
+        module_name = openllm.utils.kebab_to_snake_case(key)
+        if module_name not in self._modules:
+            self._modules[module_name] = openllm.utils.get_lazy_module(module_name)
+        if hasattr(self._modules[module_name], value):
+            return getattr(self._modules[module_name], value)
+
+        # Some of the mappings have entries model_type -> config of another model type. In that case we try to grab the
+        # object at the top level.
+        return getattr(openllm, value)
+
+    def keys(self):
+        return list(self._mapping.keys()) + list(self._extra_content.keys())
+
+    def values(self):
+        return [self[k] for k in self._mapping.keys()] + list(self._extra_content.values())
+
+    def items(self):
+        return [(k, self[k]) for k in self._mapping.keys()] + list(self._extra_content.items())
+
+    def __iter__(self):
+        return iter(list(self._mapping.keys()) + list(self._extra_content.keys()))
+
+    def __contains__(self, item: t.Any):
+        return item in self._mapping or item in self._extra_content
+
+    def register(self, key: str, value: t.Any):
+        """
+        Register a new configuration in this mapping.
+        """
+        if key in self._mapping.keys():
+            raise ValueError(f"'{key}' is already used by an OpenLLM config, pick another name.")
+        self._extra_content[key] = value
+
+
+CONFIG_MAPPING = _LazyConfigMapping(CONFIG_MAPPING_NAMES)
+
+
+class Config:
+    def __init__(self, *args: t.Any, **kwargs: t.Any):
+        raise EnvironmentError("Cannot instantiate Config. Please use `Config.for_model(model_name)` instead.")
+
+    @classmethod
+    def for_model(cls, model_name: str, *args: t.Any, **kwargs: t.Any):
+        model_name = openllm.utils.kebab_to_snake_case(model_name)
+        if model_name in CONFIG_MAPPING:
+            return CONFIG_MAPPING[model_name](*args, **kwargs)
+        raise ValueError(
+            f"Unrecognized configuration class for {model_name}. "
+            f"Model name should be one of {', '.join(CONFIG_MAPPING.keys())}."
+        )
diff --git a/src/openllm/models/auto/factory.py b/src/openllm/models/auto/factory.py
new file mode 100644
index 00000000..29362af0
--- /dev/null
+++ b/src/openllm/models/auto/factory.py
@@ -0,0 +1,211 @@
+# Copyright 2023 BentoML Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
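+
+# NOTE: a minimal usage sketch of the auto-factories built on top of this module;
+# illustrative only, assuming the PyTorch FLAN-T5 runnable is importable:
+#
+#   >>> import openllm
+#   >>> runner = openllm.LLM.create_runner("flan-t5")
+#
+# Third-party models can hook into the same machinery by registering a pair of
+# classes, e.g. ``openllm.LLM.register(MyConfig, MyRunnable)``, where ``MyConfig``
+# and ``MyRunnable`` are hypothetical user-defined subclasses.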
+
+from __future__ import annotations
+
+import importlib
+import types
+import typing as t
+from collections import OrderedDict
+
+import openllm
+
+from .configuration_auto import Config
+
+M = t.TypeVar("M")
+K = t.TypeVar("K")
+V = t.TypeVar("V")
+
+
+def _get_runnable_class(
+    config: openllm.LLMConfig, runnable_mapping: _LazyAutoMapping[t.Any, t.Any]
+) -> type[openllm.LLMRunnable[t.Any, t.Any]]:
+    supported_runnables = runnable_mapping[type(config)]
+    if not isinstance(supported_runnables, (list, tuple)):
+        return supported_runnables
+    return supported_runnables[0]
+
+
+class _BaseAutoRunnerFactory(t.Generic[M]):
+    _model_mapping: _LazyAutoMapping[type[openllm.LLMConfig], type[openllm.LLMRunnable[M, t.Any]]]
+
+    def __init__(self, *args: t.Any, **kwargs: t.Any):
+        raise EnvironmentError(
+            f"Cannot instantiate {self.__class__.__name__} directly. "
+            f"Please use '{self.__class__.__name__}.create_runner(model_name)' instead."
+        )
+
+    @classmethod
+    def create_runner(cls, model_name: str, pretrained_or_path: str | None = None, **kwargs: t.Any):
+        config = kwargs.pop("config", None)
+
+        runner_kwargs_name = [
+            "runner_name",
+            "models",
+            "max_batch_size",
+            "max_latency_ms",
+            "method_configs",
+            "embedded",
+            "import_model_kwargs",
+            "import_tokenizer_kwargs",
+            "import_config_kwargs",
+        ]
+        create_runner_kwargs = {k: kwargs.pop(k) for k in runner_kwargs_name if k in kwargs}
+
+        if not isinstance(config, openllm.LLMConfig):
+            # The rest of kwargs is now passed to config
+            config = Config.for_model(model_name, **kwargs)
+        if type(config) in cls._model_mapping.keys():
+            runnable_class = _get_runnable_class(config, cls._model_mapping)
+            if pretrained_or_path is None:
+                pretrained_or_path = openllm.utils.get_pretrained_env(runnable_class.start_model_name)
+            return runnable_class.create_runner(
+                pretrained_or_path=pretrained_or_path, config=config, **create_runner_kwargs
+            )
+        raise ValueError(
+            f"Unrecognized configuration class {config.__class__} for this kind of AutoRunner: {cls.__name__}.\n"
+            f"Runnable type should be one of {', '.join(c.__name__ for c in cls._model_mapping.keys())}."
+        )
+
+    @classmethod
+    def register(cls, config_class: type[openllm.LLMConfig], runnable_class: type[openllm.LLMRunnable[t.Any, t.Any]]):
+        """
+        Register a new model for this class.
+
+        Args:
+            config_class: The configuration corresponding to the model to register.
+            runnable_class: The runnable to register.
+        """
+        if hasattr(runnable_class, "config_class") and runnable_class.config_class != config_class:
+            raise ValueError(
+                "The model class you are passing has a `config_class` attribute that is not consistent with the "
+                f"config class you passed (model has {runnable_class.config_class} and you passed {config_class}). "
+                "Fix one of those so they match!"
+            )
+        cls._model_mapping.register(config_class, runnable_class)
+
+
+def getattribute_from_module(module: types.ModuleType, attr: t.Any) -> t.Any:
+    if attr is None:
+        return None
+    if isinstance(attr, tuple):
+        return tuple(getattribute_from_module(module, a) for a in attr)
+    if hasattr(module, attr):
+        return getattr(module, attr)
+    # Some of the mappings have entries model_type -> object of another model type. In that case we try to grab the
+    # object at the top level.
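+    # For instance, a mapping may name "FlanT5Config" while the module being
+    # inspected is ``openllm.models.flan_t5.modeling_flan_t5``; the lookup then
+    # falls back to the top-level ``openllm`` package below.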
+    openllm_module = importlib.import_module("openllm")
+
+    if module != openllm_module:
+        try:
+            return getattribute_from_module(openllm_module, attr)
+        except ValueError:
+            raise ValueError(f"Could not find {attr} in either {module} or {openllm_module}!")
+    else:
+        raise ValueError(f"Could not find {attr} in {openllm_module}!")
+
+
+class _LazyAutoMapping(OrderedDict[K, V]):
+    """Based on transformers.models.auto.configuration_auto._LazyAutoMapping"""
+
+    def __init__(self, config_mapping: OrderedDict[str, str], model_mapping: OrderedDict[str, str]):
+        self._config_mapping = config_mapping
+        self._reverse_config_mapping = {v: k for k, v in config_mapping.items()}
+        self._model_mapping = model_mapping
+        self._extra_content: dict[K, V] = {}
+        self._modules: dict[str, types.ModuleType] = {}
+
+    def __len__(self):
+        common_keys = set(self._config_mapping.keys()).intersection(self._model_mapping.keys())
+        return len(common_keys) + len(self._extra_content)
+
+    def __getitem__(self, key: K) -> V:
+        if key in self._extra_content:
+            return self._extra_content[key]
+        model_type = self._reverse_config_mapping[key.__name__]
+        if model_type in self._model_mapping:
+            model_name = self._model_mapping[model_type]
+            return self._load_attr_from_module(model_type, model_name)
+
+        # Maybe there were several model types associated with this config.
+        model_types = [k for k, v in self._config_mapping.items() if v == key.__name__]
+        for mtype in model_types:
+            if mtype in self._model_mapping:
+                model_name = self._model_mapping[mtype]
+                return self._load_attr_from_module(mtype, model_name)
+        raise KeyError(key)
+
+    def _load_attr_from_module(self, model_type: str, attr: str) -> t.Any:
+        module_name = openllm.utils.kebab_to_snake_case(model_type)
+        if module_name not in self._modules:
+            self._modules[module_name] = openllm.utils.get_lazy_module(module_name)
+        return getattribute_from_module(self._modules[module_name], attr)
+
+    def keys(self):
+        mapping_keys = [
+            self._load_attr_from_module(key, name)
+            for key, name in self._config_mapping.items()
+            if key in self._model_mapping.keys()
+        ]
+        return mapping_keys + list(self._extra_content.keys())
+
+    def get(self, key: K, default: t.Any):
+        try:
+            return self.__getitem__(key)
+        except KeyError:
+            return default
+
+    def __bool__(self):
+        return bool(self.keys())
+
+    def values(self):
+        mapping_values = [
+            self._load_attr_from_module(key, name)
+            for key, name in self._model_mapping.items()
+            if key in self._config_mapping.keys()
+        ]
+        return mapping_values + list(self._extra_content.values())
+
+    def items(self):
+        mapping_items = [
+            (
+                self._load_attr_from_module(key, self._config_mapping[key]),
+                self._load_attr_from_module(key, self._model_mapping[key]),
+            )
+            for key in self._model_mapping.keys()
+            if key in self._config_mapping.keys()
+        ]
+        return mapping_items + list(self._extra_content.items())
+
+    def __iter__(self):
+        return iter(self.keys())
+
+    def __contains__(self, item: t.Any):
+        if item in self._extra_content:
+            return True
+        if not hasattr(item, "__name__") or item.__name__ not in self._reverse_config_mapping:
+            return False
+        model_type = self._reverse_config_mapping[item.__name__]
+        return model_type in self._model_mapping
+
+    def register(self, key: K, value: V):
+        """
+        Register a new model in this mapping.
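+
+        Example (illustrative only; ``MyConfig`` and ``MyRunnable`` are
+        hypothetical user-defined classes):
+
+        >>> MODEL_MAPPING.register(MyConfig, MyRunnable)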
+ """ + if hasattr(key, "__name__") and key.__name__ in self._reverse_config_mapping: + model_type = self._reverse_config_mapping[key.__name__] + if model_type in self._model_mapping.keys(): + raise ValueError(f"'{key}' is already used by a OpenLLM model.") + + self._extra_content[key] = value diff --git a/src/openllm/models/auto/modeling_auto.py b/src/openllm/models/auto/modeling_auto.py new file mode 100644 index 00000000..a555630c --- /dev/null +++ b/src/openllm/models/auto/modeling_auto.py @@ -0,0 +1,43 @@ +# Copyright 2023 BentoML Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import typing as t +from collections import OrderedDict + +import openllm + +from .configuration_auto import CONFIG_MAPPING_NAMES +from .factory import _BaseAutoRunnerFactory, _LazyAutoMapping + +MODEL_MAPPING_NAMES = OrderedDict([("flan_t5", "FlanT5")]) + +MODEL_WITH_TOKENIZER_MAPPING_NAMES = OrderedDict([("flan_t5", "FlanT5WithTokenizer")]) + +MODEL_MAPPING = _LazyAutoMapping[type[openllm.LLMConfig], type[openllm.LLMRunnable[t.Any, t.Any]]]( + CONFIG_MAPPING_NAMES, MODEL_MAPPING_NAMES +) + +MODEL_WITH_TOKENIZER_MAPPING = _LazyAutoMapping[type[openllm.LLMConfig], type[openllm.LLMRunnable[t.Any, t.Any]]]( + CONFIG_MAPPING_NAMES, MODEL_WITH_TOKENIZER_MAPPING_NAMES +) + + +class LLM(_BaseAutoRunnerFactory[type[openllm.LLMRunnable[t.Any, t.Any]]]): + _model_mapping = MODEL_MAPPING + + +class LLMWithTokenizer(_BaseAutoRunnerFactory[type[openllm.LLMRunnable[t.Any, t.Any]]]): + _model_mapping = MODEL_WITH_TOKENIZER_MAPPING diff --git a/src/openllm/models/auto/modeling_flax_auto.py b/src/openllm/models/auto/modeling_flax_auto.py new file mode 100644 index 00000000..4f3fac7a --- /dev/null +++ b/src/openllm/models/auto/modeling_flax_auto.py @@ -0,0 +1,43 @@ +# Copyright 2023 BentoML Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+from __future__ import annotations
+
+import typing as t
+from collections import OrderedDict
+
+import openllm
+
+from .configuration_auto import CONFIG_MAPPING_NAMES
+from .factory import _BaseAutoRunnerFactory, _LazyAutoMapping
+
+MODEL_FLAX_MAPPING_NAMES = OrderedDict([("flan_t5", "FlaxFlanT5")])
+
+MODEL_FLAX_WITH_TOKENIZER_MAPPING_NAMES = OrderedDict([("flan_t5", "FlaxFlanT5WithTokenizer")])
+
+MODEL_FLAX_MAPPING = _LazyAutoMapping[type[openllm.LLMConfig], type[openllm.LLMRunnable[t.Any, t.Any]]](
+    CONFIG_MAPPING_NAMES, MODEL_FLAX_MAPPING_NAMES
+)
+
+MODEL_FLAX_WITH_TOKENIZER_MAPPING = _LazyAutoMapping[type[openllm.LLMConfig], type[openllm.LLMRunnable[t.Any, t.Any]]](
+    CONFIG_MAPPING_NAMES, MODEL_FLAX_WITH_TOKENIZER_MAPPING_NAMES
+)
+
+
+class FlaxLLM(_BaseAutoRunnerFactory[type[openllm.LLMRunnable[t.Any, t.Any]]]):
+    _model_mapping = MODEL_FLAX_MAPPING
+
+
+class FlaxLLMWithTokenizer(_BaseAutoRunnerFactory[type[openllm.LLMRunnable[t.Any, t.Any]]]):
+    _model_mapping = MODEL_FLAX_WITH_TOKENIZER_MAPPING
diff --git a/src/openllm/models/auto/modeling_tf_auto.py b/src/openllm/models/auto/modeling_tf_auto.py
new file mode 100644
index 00000000..14599f90
--- /dev/null
+++ b/src/openllm/models/auto/modeling_tf_auto.py
@@ -0,0 +1,43 @@
+# Copyright 2023 BentoML Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import typing as t
+from collections import OrderedDict
+
+import openllm
+
+from .configuration_auto import CONFIG_MAPPING_NAMES
+from .factory import _BaseAutoRunnerFactory, _LazyAutoMapping
+
+MODEL_TF_MAPPING_NAMES = OrderedDict([("flan_t5", "TFFlanT5")])
+
+MODEL_TF_WITH_TOKENIZER_MAPPING_NAMES = OrderedDict([("flan_t5", "TFFlanT5WithTokenizer")])
+
+MODEL_TF_MAPPING = _LazyAutoMapping[type[openllm.LLMConfig], type[openllm.LLMRunnable[t.Any, t.Any]]](
+    CONFIG_MAPPING_NAMES, MODEL_TF_MAPPING_NAMES
+)
+
+MODEL_TF_WITH_TOKENIZER_MAPPING = _LazyAutoMapping[type[openllm.LLMConfig], type[openllm.LLMRunnable[t.Any, t.Any]]](
+    CONFIG_MAPPING_NAMES, MODEL_TF_WITH_TOKENIZER_MAPPING_NAMES
+)
+
+
+class TFLLM(_BaseAutoRunnerFactory[type[openllm.LLMRunnable[t.Any, t.Any]]]):
+    _model_mapping = MODEL_TF_MAPPING
+
+
+class TFLLMWithTokenizer(_BaseAutoRunnerFactory[type[openllm.LLMRunnable[t.Any, t.Any]]]):
+    _model_mapping = MODEL_TF_WITH_TOKENIZER_MAPPING
diff --git a/src/openllm/models/auto/tokenization_auto.py b/src/openllm/models/auto/tokenization_auto.py
new file mode 100644
index 00000000..7e3917f6
--- /dev/null
+++ b/src/openllm/models/auto/tokenization_auto.py
@@ -0,0 +1,46 @@
+# Copyright 2023 BentoML Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import typing as t
+from collections import OrderedDict
+
+import openllm
+
+from .configuration_auto import _LazyConfigMapping
+
+TOKENIZER_MAPPING_NAMES = OrderedDict([("flan_t5", "FlanT5Tokenizer")])
+
+TOKENIZER_MAPPING = _LazyConfigMapping(TOKENIZER_MAPPING_NAMES)
+
+
+class Tokenizer:
+    def __init__(self):
+        raise EnvironmentError(
+            "This class should not be instantiated directly. Use 'Tokenizer.create_runner' instead."
+        )
+
+    @classmethod
+    def create_runner(cls, model_name: str, pretrained_or_path: str | None = None, **kwargs: t.Any):
+        model_name = openllm.utils.kebab_to_snake_case(model_name)
+        if model_name in TOKENIZER_MAPPING:
+            tokenizer_class = TOKENIZER_MAPPING[model_name]
+            if pretrained_or_path is None:
+                pretrained_or_path = openllm.utils.get_pretrained_env(tokenizer_class.start_model_name)
+            return tokenizer_class(pretrained_or_path, **kwargs)
+        raise ValueError(
+            f"Unrecognized model {model_name} to build a Tokenizer.\n"
+            f"Model type should be one of {', '.join(TOKENIZER_MAPPING.keys())}."
+        )
diff --git a/src/openllm/models/flan_t5/__init__.py b/src/openllm/models/flan_t5/__init__.py
new file mode 100644
index 00000000..eaebaaaf
--- /dev/null
+++ b/src/openllm/models/flan_t5/__init__.py
@@ -0,0 +1,95 @@
+# Copyright 2023 BentoML Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
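+
+# NOTE: a hedged sketch of how the lazy layout below behaves at runtime;
+# illustrative only, assuming torch is installed so the PyTorch branch of
+# ``_import_structure`` is active:
+#
+#   >>> from openllm.models import flan_t5
+#   >>> flan_t5.FlanT5Config(top_p=0.9)  # resolved lazily on first attribute access
+#   >>> flan_t5.DEFAULT_PROMPT_TEMPLATE.format(context="...", question="...")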
+
+from __future__ import annotations
+
+import typing as t
+
+import openllm
+from openllm.utils import import_utils_shim as imports
+
+_import_structure = {
+    "prompts_flan_t5": ["DEFAULT_PROMPT_TEMPLATE"],
+    "configuration_flan_t5": ["FlanT5Config", "START_FLAN_T5_COMMAND_DOCSTRING"],
+    "service_flan_t5": ["svc", "model_runner", "tokenizer_runner", "generate"],
+}
+
+try:
+    if not imports.is_torch_available():
+        raise openllm.exceptions.MissingDependencyError
+except openllm.exceptions.MissingDependencyError:
+    pass
+else:
+    _import_structure["modeling_flan_t5"] = ["FlanT5", "FlanT5WithTokenizer", "FlanT5Tokenizer"]
+
+try:
+    if not imports.is_flax_available():
+        raise openllm.exceptions.MissingDependencyError
+except openllm.exceptions.MissingDependencyError:
+    pass
+else:
+    _import_structure["modeling_flax_flan_t5"] = ["FlaxFlanT5", "FlaxFlanT5WithTokenizer"]
+
+try:
+    if not imports.is_tf_available():
+        raise openllm.exceptions.MissingDependencyError
+except openllm.exceptions.MissingDependencyError:
+    pass
+else:
+    _import_structure["modeling_tf_flan_t5"] = ["TFFlanT5", "TFFlanT5WithTokenizer"]
+
+
+if t.TYPE_CHECKING:
+    from .configuration_flan_t5 import \
+        START_FLAN_T5_COMMAND_DOCSTRING as START_FLAN_T5_COMMAND_DOCSTRING
+    from .configuration_flan_t5 import FlanT5Config as FlanT5Config
+    from .prompts_flan_t5 import \
+        DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE
+
+    try:
+        if not imports.is_torch_available():
+            raise openllm.exceptions.MissingDependencyError
+    except openllm.exceptions.MissingDependencyError:
+        pass
+    else:
+        from .modeling_flan_t5 import FlanT5 as FlanT5
+        from .modeling_flan_t5 import FlanT5Tokenizer as FlanT5Tokenizer
+        from .modeling_flan_t5 import \
+            FlanT5WithTokenizer as FlanT5WithTokenizer
+
+    try:
+        if not imports.is_flax_available():
+            raise openllm.exceptions.MissingDependencyError
+    except openllm.exceptions.MissingDependencyError:
+        pass
+    else:
+        from .modeling_flax_flan_t5 import FlaxFlanT5 as FlaxFlanT5
+        from .modeling_flax_flan_t5 import \
+            FlaxFlanT5WithTokenizer as FlaxFlanT5WithTokenizer
+
+    try:
+        if not imports.is_tf_available():
+            raise openllm.exceptions.MissingDependencyError
+    except openllm.exceptions.MissingDependencyError:
+        pass
+    else:
+        from .modeling_tf_flan_t5 import TFFlanT5 as TFFlanT5
+        from .modeling_tf_flan_t5 import \
+            TFFlanT5WithTokenizer as TFFlanT5WithTokenizer
+else:
+    import sys
+
+    sys.modules[__name__] = openllm.utils.LazyModule(
+        __name__, globals()["__file__"], _import_structure, module_spec=__spec__
+    )
diff --git a/src/openllm/models/flan_t5/configuration_flan_t5.py b/src/openllm/models/flan_t5/configuration_flan_t5.py
new file mode 100644
index 00000000..b98ab793
--- /dev/null
+++ b/src/openllm/models/flan_t5/configuration_flan_t5.py
@@ -0,0 +1,52 @@
+# Copyright 2023 BentoML Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
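+
+# NOTE: an illustrative sketch of how this config is meant to be used; the YAML
+# round-trip comes from ``BaseConfig`` in ``configuration_utils``:
+#
+#   >>> from openllm.models.flan_t5 import FlanT5Config
+#   >>> config = FlanT5Config(temperature=0.9, max_length=256)
+#   >>> print(config.to_yaml())
+#
+# Each field below is also auto-generated as a ``--temperature``-style option on
+# ``openllm start flan-t5`` and as an ``OPENLLM_<MODEL>_<FIELD>`` env var (see
+# ``models_to_options`` in ``configuration_utils``).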
+from __future__ import annotations
+
+import pydantic
+
+from ...configuration_utils import LLMConfig
+
+START_FLAN_T5_COMMAND_DOCSTRING = """\
+Run an LLMServer for FLAN-T5 models.
+
+\b
+> See more information about FLAN-T5 at [huggingface/transformers](https://huggingface.co/docs/transformers/model_doc/flan-t5)
+
+\b
+## Usage
+
+By default, this model will use the PyTorch model for inference. However, this model supports both Flax and TensorFlow.
+
+\b
+- To use Flax, set the environment variable ``OPENLLM_FLAN_T5_FRAMEWORK="flax"``
+
+\b
+- To use TensorFlow, set the environment variable ``OPENLLM_FLAN_T5_FRAMEWORK="tf"``
+
+\b
+The FLAN-T5 Runner will use google/flan-t5-large as the default model. To use any other pretrained or
+fine-tuned FLAN-T5 checkpoint, set ``OPENLLM_FLAN_T5_PRETRAINED='google/flan-t5-xxl'``
+"""
+
+
+class FlanT5Config(LLMConfig):
+    """Configuration for the FLAN-T5 model."""
+
+    temperature: float = pydantic.Field(0.75, ge=0.01, le=5, description="Determines how random the generation should be.")
+    max_length: int = pydantic.Field(
+        3000, ge=1, description="Maximum number of tokens to generate. A word is around 2-3 tokens."
+    )
+    top_k: int = pydantic.Field(1, description="Number of highest-probability tokens to consider at each step.")
+    top_p: float = pydantic.Field(0.25, description="Total probability mass of tokens to consider at each step.")
+    repetition_penalty: float = pydantic.Field(1.2, description="Penalizes repeated tokens according to frequency.")
diff --git a/src/openllm/models/flan_t5/modeling_flan_t5.py b/src/openllm/models/flan_t5/modeling_flan_t5.py
new file mode 100644
index 00000000..c7229974
--- /dev/null
+++ b/src/openllm/models/flan_t5/modeling_flan_t5.py
@@ -0,0 +1,145 @@
+# Copyright 2023 BentoML Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import annotations
+
+import typing as t
+
+import openllm
+
+from ...runner_utils import (LLMRunnable, assign_start_model_name,
+                             generate_tokenizer_runner)
+from .configuration_flan_t5 import FlanT5Config
+
+if t.TYPE_CHECKING:
+    import bentoml
+    import torch
+    import transformers
+else:
+    torch = openllm.utils.LazyLoader("torch", globals(), "torch")
+    transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
+    bentoml = openllm.utils.LazyLoader("bentoml", globals(), "bentoml")
+
+
+def import_model(
+    pretrained_or_path: str,
+    model_kwargs: dict[str, t.Any] | None = None,
+    tokenizer_kwargs: dict[str, t.Any] | None = None,
+    config_kwargs: dict[str, t.Any] | None = None,
+) -> bentoml.Model:
+    """Import any PyTorch Flan-T5 pretrained model weights into the model store.
+
+    Args:
+        pretrained_or_path: The name or path of the pretrained model to import.
+        model_kwargs: Additional kwargs to pass to the ``transformers.AutoModelForSeq2SeqLM`` constructor.
+        tokenizer_kwargs: Additional kwargs to pass to the ``transformers.AutoTokenizer`` constructor.
+        config_kwargs: Additional kwargs to pass to the ``transformers.AutoConfig`` constructor to determine the model tag.
+
+    Returns:
+        a ``bentoml.Model`` instance.
+    """
+    model_kwargs = model_kwargs or {}
+    tokenizer_kwargs = tokenizer_kwargs or {}
+    config_kwargs = config_kwargs or {}
+
+    tag = openllm.utils.generate_tag_from_model_name(pretrained_or_path, **config_kwargs)
+
+    try:
+        return bentoml.transformers.get(tag)
+    except bentoml.exceptions.NotFound:
+        model = transformers.AutoModelForSeq2SeqLM.from_pretrained(pretrained_or_path, **model_kwargs)
+        tokenizer = transformers.AutoTokenizer.from_pretrained(pretrained_or_path, **tokenizer_kwargs)
+        return bentoml.transformers.save_model(str(tag), model, custom_objects={"tokenizer": tokenizer})
+
+
+def _FlanT5Tokenizer(
+    pretrained_or_path: str | None = None, embedded: bool = True, **kwargs: t.Any
+) -> openllm.types.TokenizerRunner:
+    """Get the runner for the tokenizer.
+
+    Args:
+        pretrained_or_path: The name or path of the FLAN-T5 model to import.
+        embedded: Whether to use the embedded runner or not.
+        **kwargs: Additional kwargs to pass to the ``transformers.AutoTokenizer`` constructor.
+
+    Returns:
+        The runner for the tokenizer.
+    """
+    if pretrained_or_path is None:
+        pretrained_or_path = FlanT5.default_model
+
+    return generate_tokenizer_runner(
+        import_model(pretrained_or_path, **kwargs).custom_objects["tokenizer"], embedded=embedded
+    )
+
+
+FlanT5Tokenizer = assign_start_model_name("flan-t5")(_FlanT5Tokenizer)
+
+
+class FlanT5(
+    LLMRunnable[transformers.T5ForConditionalGeneration, transformers.T5TokenizerFast], start_model_name="flan-t5"
+):
+    default_model: str = "google/flan-t5-large"
+    config_class = FlanT5Config
+
+    ATTACH_TOKENIZER = False
+
+    _llm_config: FlanT5Config
+
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+    @staticmethod
+    def pretrained_models() -> list[str]:
+        return [
+            "google/flan-t5-small",
+            "google/flan-t5-base",
+            "google/flan-t5-large",
+            "google/flan-t5-xl",
+            "google/flan-t5-xxl",
+        ]
+
+    def _generate(
+        self,
+        input_ids: torch.Tensor,
+        max_length: int | None = None,
+        do_sample: bool = True,
+        temperature: float | None = None,
+        top_k: int | None = None,
+        top_p: float | None = None,
+        repetition_penalty: float | None = None,
+        **kwargs: t.Any,
+    ) -> torch.Tensor:
+        return self.model.generate(
+            input_ids,
+            max_length=max_length if max_length is not None else self._llm_config.max_length,
+            do_sample=do_sample,
+            temperature=temperature if temperature is not None else self._llm_config.temperature,
+            top_k=top_k if top_k is not None else self._llm_config.top_k,
+            top_p=top_p if top_p is not None else self._llm_config.top_p,
+            repetition_penalty=repetition_penalty
+            if repetition_penalty is not None
+            else self._llm_config.repetition_penalty,
+            **kwargs,
+        )
+
+
+class FlanT5WithTokenizer(FlanT5, start_model_name="flan-t5"):
+    default_model: str = "google/flan-t5-large"
+
+    ATTACH_TOKENIZER = True
+
+    def _generate(self, prompt: str, **kwargs: t.Any) -> list[str]:
+        input_ids: torch.Tensor = self.tokenizer(prompt, return_tensors="pt").input_ids
+        input_ids = input_ids.to(self.device)
+        outputs = super()._generate(input_ids, **kwargs)
+        return self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
diff --git a/src/openllm/models/flan_t5/modeling_flax_flan_t5.py b/src/openllm/models/flan_t5/modeling_flax_flan_t5.py
new file mode 100644
index 00000000..ddd10091
--- /dev/null
+++ b/src/openllm/models/flan_t5/modeling_flax_flan_t5.py
@@ -0,0 +1,119 @@
+# Copyright 2023 BentoML Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import annotations
+
+import typing as t
+
+import openllm
+
+from ...runner_utils import LLMRunnable
+from .configuration_flan_t5 import FlanT5Config as FlanT5Config
+
+if t.TYPE_CHECKING:
+    import bentoml
+    import jax.numpy as jnp
+    import transformers
+
+else:
+    jnp = openllm.utils.LazyLoader("jnp", globals(), "jax.numpy")
+    bentoml = openllm.utils.LazyLoader("bentoml", globals(), "bentoml")
+    transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
+
+
+def import_model(
+    pretrained_or_path: str,
+    model_kwargs: dict[str, t.Any] | None = None,
+    tokenizer_kwargs: dict[str, t.Any] | None = None,
+    config_kwargs: dict[str, t.Any] | None = None,
+) -> bentoml.Model:
+    """Import any Flax Flan-T5 pretrained model weights into the model store.
+
+    Args:
+        pretrained_or_path: The name or path of the pretrained model to import.
+        model_kwargs: Additional kwargs to pass to the ``transformers.AutoModelForSeq2SeqLM`` constructor.
+        tokenizer_kwargs: Additional kwargs to pass to the ``transformers.AutoTokenizer`` constructor.
+        config_kwargs: Additional kwargs to pass to the ``transformers.AutoConfig`` constructor to determine the model tag.
+
+    Returns:
+        a ``bentoml.Model`` instance.
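+
+    Example (illustrative only; downloads the weights on first call):
+
+        >>> import_model("google/flan-t5-small")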
+ """ + model_kwargs = model_kwargs or {} + tokenizer_kwargs = tokenizer_kwargs or {} + config_kwargs = config_kwargs or {} + + tag = openllm.utils.generate_tag_from_model_name(pretrained_or_path, prefix="flax", **config_kwargs) + try: + return bentoml.transformers.get(tag) + except bentoml.exceptions.NotFound: + model = transformers.FlaxT5ForConditionalGeneration.from_pretrained(pretrained_or_path, **model_kwargs) + tokenizer = transformers.AutoTokenizer.from_pretrained(pretrained_or_path, **tokenizer_kwargs) + return bentoml.transformers.save_model(str(tag), model, custom_objects={"tokenizer": tokenizer}) + + +class FlaxFlanT5( + LLMRunnable[transformers.FlaxT5ForConditionalGeneration, transformers.T5TokenizerFast], start_model_name="flan-t5" +): + default_model: str = "google/flan-t5-large" + config_class = FlanT5Config + + ATTACH_TOKENIZER = False + + _llm_config: FlanT5Config + + @staticmethod + def pretrained_models() -> list[str]: + return [ + "google/flan-t5-small", + "google/flan-t5-base", + "google/flan-t5-large", + "google/flan-t5-xl", + "google/flan-t5-xxl", + ] + + def _generate( + self, + input_ids: jnp.ndarray, + max_length: int | None = None, + do_sample: bool = True, + temperature: float | None = None, + top_k: float | None = None, + top_p: float | None = None, + repetition_penalty: float | None = None, + **kwargs: t.Any, + ) -> jnp.ndarray: + return self.model.generate( + input_ids, + max_length=max_length if max_length is not None else self._llm_config.max_length, + do_sample=do_sample, + temperature=temperature if temperature is not None else self._llm_config.temperature, + top_k=top_k if top_k is not None else self._llm_config.top_k, + top_p=top_p if top_p is not None else self._llm_config.top_p, + repetition_penalty=repetition_penalty + if repetition_penalty is not None + else self._llm_config.repetition_penalty, + **kwargs, + ) + + +class FlaxFlanT5WithTokenizer(FlaxFlanT5, start_model_name="flan-t5"): + default_model: str = "google/flan-t5-large" + + ATTACH_TOKENIZER = True + + def _generate(self, prompt: str, **kwargs: t.Any) -> list[str]: + input_ids = self.tokenizer(prompt, return_tensors="np")["input_ids"] + outputs = super()._generate(input_ids, **kwargs) + return self.tokenizer.batch_decode( + outputs.sequences, skip_special_tokens=True, clean_up_tokenization_spaces=True + ) diff --git a/src/openllm/models/flan_t5/modeling_tf_flan_t5.py b/src/openllm/models/flan_t5/modeling_tf_flan_t5.py new file mode 100644 index 00000000..9f489f7a --- /dev/null +++ b/src/openllm/models/flan_t5/modeling_tf_flan_t5.py @@ -0,0 +1,116 @@ +# Copyright 2023 BentoML Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from __future__ import annotations
+
+import typing as t
+
+import openllm
+
+from ...runner_utils import LLMRunnable
+from .configuration_flan_t5 import FlanT5Config
+
+if t.TYPE_CHECKING:
+    import bentoml
+    import tensorflow as tf
+    import transformers
+else:
+    tf = openllm.utils.LazyLoader("tf", globals(), "tensorflow")
+    transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
+    bentoml = openllm.utils.LazyLoader("bentoml", globals(), "bentoml")
+
+
+def import_model(
+    pretrained_or_path: str,
+    model_kwargs: dict[str, t.Any] | None = None,
+    tokenizer_kwargs: dict[str, t.Any] | None = None,
+    config_kwargs: dict[str, t.Any] | None = None,
+) -> bentoml.Model:
+    """Import any TensorFlow Flan-T5 pretrained model weights into the model store.
+
+    Args:
+        pretrained_or_path: The name or path of the pretrained model to import.
+        model_kwargs: Additional kwargs to pass to the ``transformers.AutoModelForSeq2SeqLM`` constructor.
+        tokenizer_kwargs: Additional kwargs to pass to the ``transformers.AutoTokenizer`` constructor.
+        config_kwargs: Additional kwargs to pass to the ``transformers.AutoConfig`` constructor to determine the model tag.
+
+    Returns:
+        a ``bentoml.Model`` instance.
+    """
+    model_kwargs = model_kwargs or {}
+    tokenizer_kwargs = tokenizer_kwargs or {}
+    config_kwargs = config_kwargs or {}
+
+    tag = openllm.utils.generate_tag_from_model_name(pretrained_or_path, prefix="tf", **config_kwargs)
+    try:
+        return bentoml.transformers.get(tag)
+    except bentoml.exceptions.NotFound:
+        model = transformers.TFT5ForConditionalGeneration.from_pretrained(pretrained_or_path, **model_kwargs)
+        tokenizer = transformers.AutoTokenizer.from_pretrained(pretrained_or_path, **tokenizer_kwargs)
+        return bentoml.transformers.save_model(str(tag), model, custom_objects={"tokenizer": tokenizer})
+
+
+class TFFlanT5(
+    LLMRunnable[transformers.TFT5ForConditionalGeneration, transformers.T5TokenizerFast], start_model_name="flan-t5"
+):
+    default_model: str = "google/flan-t5-large"
+    config_class = FlanT5Config
+
+    ATTACH_TOKENIZER = False
+
+    _llm_config: FlanT5Config
+
+    @staticmethod
+    def pretrained_models() -> list[str]:
+        return [
+            "google/flan-t5-small",
+            "google/flan-t5-base",
+            "google/flan-t5-large",
+            "google/flan-t5-xl",
+            "google/flan-t5-xxl",
+        ]
+
+    def _generate(
+        self,
+        input_ids: tf.Tensor,
+        max_length: int | None = None,
+        do_sample: bool = True,
+        temperature: float | None = None,
+        top_k: int | None = None,
+        top_p: float | None = None,
+        repetition_penalty: float | None = None,
+        **kwargs: t.Any,
+    ) -> tf.Tensor:
+        return self.model.generate(
+            input_ids,
+            max_length=max_length if max_length is not None else self._llm_config.max_length,
+            do_sample=do_sample,
+            temperature=temperature if temperature is not None else self._llm_config.temperature,
+            top_k=top_k if top_k is not None else self._llm_config.top_k,
+            top_p=top_p if top_p is not None else self._llm_config.top_p,
+            repetition_penalty=repetition_penalty
+            if repetition_penalty is not None
+            else self._llm_config.repetition_penalty,
+            **kwargs,
+        )
+
+
+class TFFlanT5WithTokenizer(TFFlanT5, start_model_name="flan-t5"):
+    default_model: str = "google/flan-t5-large"
+
+    ATTACH_TOKENIZER = True
+
+    def _generate(self, prompt: str, **kwargs: t.Any) -> list[str]:
+        input_ids: tf.Tensor = self.tokenizer(prompt, return_tensors="tf").input_ids
+        outputs = super()._generate(input_ids, **kwargs)
+        return self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
diff --git a/src/openllm/models/flan_t5/prompts_flan_t5.py b/src/openllm/models/flan_t5/prompts_flan_t5.py
b/src/openllm/models/flan_t5/prompts_flan_t5.py new file mode 100644 index 00000000..90193956 --- /dev/null +++ b/src/openllm/models/flan_t5/prompts_flan_t5.py @@ -0,0 +1,21 @@ +# Copyright 2023 BentoML Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +DEFAULT_PROMPT_TEMPLATE = """\ +Please use the following piece of context to answer the question at the end. + +{context} + +Question: {question} +Answer:""" diff --git a/src/openllm/models/flan_t5/service_flan_t5.py b/src/openllm/models/flan_t5/service_flan_t5.py new file mode 100644 index 00000000..3e4d66c7 --- /dev/null +++ b/src/openllm/models/flan_t5/service_flan_t5.py @@ -0,0 +1,59 @@ +# Copyright 2023 BentoML Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
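(Aside: the DEFAULT_PROMPT_TEMPLATE defined above plugs into the PromptFormatter introduced later in this diff; a minimal sketch of the intended flow, with made-up context and question values:)

from openllm.models.flan_t5.prompts_flan_t5 import DEFAULT_PROMPT_TEMPLATE
from openllm.prompts import default_formatter

# The formatter can report which `{}`-style variables a template requires...
assert default_formatter.extract_template_variables(DEFAULT_PROMPT_TEMPLATE) == ["context", "question"]

# ...and render the final prompt once the caller supplies them.
prompt = default_formatter.format(
    DEFAULT_PROMPT_TEMPLATE,
    context="BentoML is a framework for building ML-powered services.",
    question="What is BentoML?",
)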
+
+from __future__ import annotations
+
+import typing as t
+
+import bentoml
+
+import openllm
+
+framework = openllm.utils.get_framework_env("flan-t5")
+if framework == "flax":
+    klass = openllm.FlaxLLM
+elif framework == "pt":
+    klass = openllm.LLM
+elif framework == "tf":
+    klass = openllm.TFLLM
+else:
+    raise ValueError(f"Invalid framework {framework}")
+
+model_runner = klass.create_runner("flan-t5")
+tokenizer_runner = openllm.Tokenizer.create_runner("flan-t5")
+
+svc = bentoml.Service(name=openllm.utils.generate_service_name(model_runner), runners=[model_runner, tokenizer_runner])
+
+
+@svc.api(input=openllm.Prompt(default="flan-t5"), output=openllm.Prompt(default="flan-t5"))
+async def generate(qa: openllm.schema.PromptInput) -> openllm.schema.PromptOutput:
+    """Returns the generated text from the given prompt."""
+    llm_config = model_runner.llm_config.dict()
+
+    return_tensors = "np" if framework == "flax" else framework
+    input_tensor = await tokenizer_runner.async_run(qa.prompt, return_tensors=return_tensors)
+    if framework == "flax":
+        outputs = await model_runner.generate.async_run(input_tensor["input_ids"], **llm_config)
+        responses = await tokenizer_runner.batch_decode.async_run(
+            outputs.sequences, skip_special_tokens=True, clean_up_tokenization_spaces=True
+        )
+    else:
+        outputs = await model_runner.generate.async_run(input_tensor.input_ids, **llm_config)
+        responses = await tokenizer_runner.batch_decode.async_run(outputs, skip_special_tokens=True)
+    return openllm.schema.PromptOutput(responses=responses, configuration=llm_config)
+
+
+@svc.api(input=bentoml.io.JSON(pydantic_model=openllm.FlanT5Config), output=bentoml.io.JSON())
+def update_llm_config(llm_config: openllm.FlanT5Config) -> dict[str, t.Any]:
+    """Update the LLM configuration and return the applied values."""
+    object.__setattr__(model_runner, "llm_config", llm_config)
+    return model_runner.llm_config.dict()
diff --git a/src/openllm/prompts/__init__.py b/src/openllm/prompts/__init__.py
new file mode 100644
index 00000000..1e41c2fc
--- /dev/null
+++ b/src/openllm/prompts/__init__.py
@@ -0,0 +1,26 @@
+# Copyright 2023 BentoML Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Prompt interface for OpenLLM.
+
+This module exposes the `Prompt` descriptor, which is used to create prompts as a ``bentoml.io.IODescriptor``.
+This ``Prompt`` can also be used to interact with the client and provides an interface similar to ``langchain.PromptTemplate``.
+
+Example:
+    >>> from openllm.prompts import Prompt
+    >>> prompt = Prompt.from_template("Use the following as context: {context}!")
+"""
+
+from .descriptors import Prompt as Prompt
+from .formatter import default_formatter as default_formatter
diff --git a/src/openllm/prompts/descriptors.py b/src/openllm/prompts/descriptors.py
new file mode 100644
index 00000000..f01bbc39
--- /dev/null
+++ b/src/openllm/prompts/descriptors.py
@@ -0,0 +1,286 @@
+# Copyright 2023 BentoML Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Descriptor definition for OpenLLM. ``Prompt`` essentially extends the functionality of ``bentoml.io.JSON``.
+
+However, it uses ``orjson`` instead of ``json`` for faster serialization and deserialization.
+
+One distinct difference is that ``Prompt`` is meant to be initialized via ``Prompt.from_template``.
+
+Example:
+    >>> from openllm.prompts import Prompt
+    >>> prompt = Prompt.from_template("Use the following as context: {context}!")
+    >>> prompt = Prompt("Use the following as context: {context}\nQuestion: {question}\nAnswer:")
+
+It also adds ``from_http`` and ``to_http`` methods, which wrap around ``from_http_request`` and ``to_http_response``.
+
+TODO:
+- Add support for ``langchain`` PromptTemplate under ``template``.
+
+The prompt template can also be set via the ``OPENLLM_PROMPT_TEMPLATE`` environment variable.
+"""
+from __future__ import annotations
+
+import logging
+import os
+import typing as t
+
+import bentoml
+import orjson
+import pydantic
+import pydantic.schema
+from bentoml._internal.io_descriptors.json import parse_dict_to_proto
+from bentoml._internal.utils.http import set_cookies
+from starlette.requests import Request
+from starlette.responses import Response
+
+import openllm
+
+if t.TYPE_CHECKING:
+    from google.protobuf import struct_pb2
+
+    from openllm.types import OpenAPIResponse
+else:
+    struct_pb2 = openllm.utils.LazyLoader("struct_pb2", globals(), "google.protobuf.struct_pb2")
+
+logger = logging.getLogger(__name__)
+
+
+class Prompt(
+    bentoml.io.IODescriptor[pydantic.BaseModel],
+    descriptor_id="openllm.prompts.descriptors.Prompt",
+    proto_fields=("json",),
+):
+    _mime_type = "application/json"
+
+    # TODO: support langchain PromptTemplate under template
+    prompt_template: openllm.schema.PromptTemplate
+
+    input_schema = openllm.schema.PromptInput
+    output_schema = openllm.schema.PromptOutput
+
+    def __setattr__(self, attr_name: str, value: t.Any) -> None:
+        if attr_name in ("input_schema", "output_schema"):
+            raise openllm.exceptions.OpenLLMException(f"{attr_name} is frozen as custom schema is not yet supported.")
+        super().__setattr__(attr_name, value)
+
+    def __init__(
+        self,
+        template: openllm.schema.PromptTemplate | openllm.schema.PromptInput | str | None = None,
+        default: str | None = None,
+    ) -> None:
+        if default:
+            default = openllm.utils.kebab_to_snake_case(default)
+        template_from_env = os.environ.get("OPENLLM_PROMPT_TEMPLATE", None)
+        if template is None and template_from_env is None:
+            if default:
+                if default not in openllm.CONFIG_MAPPING.keys():
+                    raise ValueError(
+                        "Invalid default prompt template. Please choose from: "
+                        + ", ".join(openllm.CONFIG_MAPPING.keys())
+                    )
+                self.prompt_template = openllm.schema.PromptTemplate.from_default(default)
+                return
+            else:
+                raise ValueError(
+                    "Prompt template is not set. Please set it via the 'OPENLLM_PROMPT_TEMPLATE' environment variable or pass it to 'Prompt'."
+                )
+
+        # This logic ensures that we always respect the template from the environment if set, even if `template` is passed
+        template = template_from_env if template_from_env is not None else template
+
+        if isinstance(template, str):
+            self.prompt_template = openllm.schema.PromptTemplate.from_template(template)
+        elif isinstance(template, openllm.schema.PromptInput):
+            self.prompt_template = openllm.schema.PromptTemplate.from_template(template.prompt)
+        elif isinstance(template, openllm.schema.PromptTemplate):
+            self.prompt_template = template
+        else:
+            raise openllm.exceptions.OpenLLMException(
+                f"Invalid sample type: {type(template)}. Sample must be one of: {openllm.schema.PromptInput}, {openllm.schema.PromptTemplate} or str."
+            )
+
+    @classmethod
+    def from_template(
+        cls,
+        template: openllm.schema.PromptTemplate | openllm.schema.PromptInput | str | None = None,
+        default: str | None = None,
+    ) -> Prompt:
+        return cls(template=template, default=default)
+
+    @property
+    def template(self) -> str:
+        assert self.prompt_template, "Make sure to initialize Prompt with 'from_template' first."
+        return self.prompt_template.template
+
+    @property
+    def input_variables(self) -> t.Sequence[str]:
+        assert self.prompt_template, "Make sure to initialize Prompt with 'from_template' first."
+        return self.prompt_template.input_variables
+
+    async def from_http(self, request: Request) -> openllm.schema.PromptInput:
+        json_str = await request.body()
+        try:
+            parsed = orjson.loads(json_str)
+        except orjson.JSONDecodeError as e:
+            raise bentoml.exceptions.InvalidArgument(f"Invalid JSON: {e}")
+
+        if "prompt" in parsed:
+            # In this branch, the user passed in a full prompt.
+            # We need to parse it and extract the input variables.
+            prompt = parsed.pop("prompt")
+            prompt_template = openllm.schema.PromptTemplate.from_template(prompt)
+            return self.input_schema(prompt=prompt_template.to_str(**parsed), inputs=parsed)
+        # In this branch, users only pass in the input variables for the configured prompt
+        # (either the default or one set via environment variables)
+        return self.input_schema(prompt=self.prompt_template.to_str(**parsed), inputs=parsed)
+
+    async def from_proto(self, field: struct_pb2.Value | bytes) -> openllm.schema.PromptInput:
+        from google.protobuf.json_format import MessageToDict
+
+        if isinstance(field, bytes):
+            content = field
+            try:
+                parsed = orjson.loads(content)
+            except orjson.JSONDecodeError as e:
+                raise bentoml.exceptions.BadInput(f"Invalid JSON: {e}")
+        else:
+            assert isinstance(field, struct_pb2.Value)
+            parsed = MessageToDict(field, preserving_proto_field_name=True)
+
+        if "prompt" in parsed:
+            # In this branch, the user passed in a full prompt.
+            # We need to parse it and extract the input variables.
+            prompt = parsed.pop("prompt")
+            prompt_template = openllm.schema.PromptTemplate.from_template(prompt)
+            return self.input_schema(prompt=prompt_template.to_str(**parsed), inputs=parsed)
+        # In this branch, users only pass in the input variables for the configured prompt
+        # (either the default or one set via environment variables)
+        return self.input_schema(prompt=self.prompt_template.to_str(**parsed), inputs=parsed)
+
+    async def to_http(self, obj: pydantic.BaseModel, ctx: bentoml.Context | None = None):
+        if not isinstance(obj, openllm.schema.PromptOutput):
+            raise bentoml.exceptions.BadInput(f"Expected output of type {openllm.schema.PromptOutput}, got {type(obj)}")
+        logger.debug("Converting '%s' to '%s'", obj, self.output_schema)
+
+        json_str = orjson.dumps(self.output_schema(**obj.dict()).dict())
+        if ctx is not None:
+            res = Response(
+                json_str,
+                media_type=self._mime_type,
+                headers=ctx.response.metadata,
+                status_code=ctx.response.status_code,
+            )
+            set_cookies(res, ctx.response.cookies)
+        else:
+            res = Response(json_str, media_type=self._mime_type)
+
+        return res
+
+    async def to_proto(self, obj: pydantic.BaseModel) -> struct_pb2.Value:
+        if not isinstance(obj, openllm.schema.PromptOutput):
+            raise bentoml.exceptions.BadInput(f"Expected output of type {openllm.schema.PromptOutput}, got {type(obj)}")
+        logger.debug("Converting '%s' to '%s'", obj, self.output_schema)
+
+        json_ = self.output_schema(**obj.dict()).dict()
+
+        msg = struct_pb2.Value()
+        return parse_dict_to_proto(json_, msg)
+
+    # NOTE: OpenAPI specification for Prompt as a bentoml.io.IODescriptor
+    def input_type(self) -> type[pydantic.BaseModel]:
+        return pydantic.BaseModel
+
+    def to_spec(self) -> dict[str, t.Any]:
+        return {
+            "id": self.descriptor_id,
+            "args": {"template": self.prompt_template.template},
+        }
+
+    @classmethod
+    def from_spec(cls, spec: dict[str, t.Any]) -> t.Self:
+        if "args" not in spec:
+            raise bentoml.exceptions.InvalidArgument(f"Missing 'args' key in JSON spec: {spec}")
+        return cls.from_template(**spec["args"])
+
+    def openapi_schema(self) -> t.Any:
+        # NOTE: not used
+        return {"type": "object"}
+
+    @property
+    def _model_name_map(self) -> dict[pydantic.schema.TypeModelOrEnum, str]:
+        return pydantic.schema.get_model_name_map(
+            pydantic.schema.get_flat_models_from_models({self.input_schema, self.output_schema})
+        )
+
+    def openapi_input_schema(self) -> dict[str, t.Any]:
+        return pydantic.schema.model_process_schema(
+            self.input_schema, model_name_map=self._model_name_map, ref_prefix="#/components/schemas/"
+        )[0]
+
+    def openapi_output_schema(self) -> dict[str, t.Any]:
+        return pydantic.schema.model_process_schema(
+            self.output_schema, model_name_map=self._model_name_map, ref_prefix="#/components/schemas/"
+        )[0]
+
+    def openapi_components(self) -> dict[str, t.Any] | None:
+        # TODO: Support custom input and output schema in BentoML OpenAPI components parsing.
+        return
+
+    def openapi_example(self) -> dict[str, t.Any]:
+        return {k: "" for k in self.input_variables}
+
+    def openapi_input_example(self) -> dict[str, t.Any]:
+        return self.openapi_example()
+
+    def openapi_output_example(self) -> dict[str, t.Any]:
+        return {}
+
+    def openapi_request_body(self) -> dict[str, t.Any]:
+        return {
+            "content": {
+                self._mime_type: {
+                    "schema": self.openapi_input_schema(),
+                    "example": self.openapi_input_example(),
+                }
+            },
+            "required": True,
+            "x-bentoml-io-descriptor": self.to_spec(),
+        }
+
+    def openapi_responses(self) -> OpenAPIResponse:
+        return {
+            "description": "Prompt received successfully!",
+            "content": {
+                self._mime_type: {
+                    "schema": self.openapi_output_schema(),
+                    "example": self.openapi_output_example(),  # TODO: Support output example
+                }
+            },
+            "x-bentoml-io-descriptor": self.to_spec(),
+        }
+
+    # NOTE: The overrides below fulfill the loose contract from bentoml.io.IODescriptor
+    @classmethod
+    def from_sample(cls, sample: openllm.schema.PromptInput | str) -> Prompt:
+        return cls.from_template(sample)
+
+    def _from_sample(self, sample: t.Any) -> pydantic.BaseModel:
+        return sample
+
+    async def from_http_request(self, request: Request) -> openllm.schema.PromptInput:
+        return await self.from_http(request)
+
+    async def to_http_response(self, obj: pydantic.BaseModel, ctx: bentoml.Context | None = None):
+        return await self.to_http(obj, ctx)
diff --git a/src/openllm/prompts/formatter.py b/src/openllm/prompts/formatter.py
new file mode 100644
index 00000000..1a4a5919
--- /dev/null
+++ b/src/openllm/prompts/formatter.py
@@ -0,0 +1,47 @@
+# Copyright 2023 BentoML Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Formatter for prompts. Currently a strict wrapper around ``string.Formatter`` that processes f-string-style templates.
+
+Inspired by langchain.formatting
+
+TODO:
+- Support jinja2 templates, Go templates, and possibly other prompt template engines.
+""" +from __future__ import annotations + +import string +import typing as t + + +class PromptFormatter(string.Formatter): + def vformat(self, format_string: str, args: t.Sequence[t.Any], kwargs: t.Mapping[str, t.Any]) -> str: + if len(args) > 0: + raise ValueError("Positional arguments are not supported") + return super().vformat(format_string, args, kwargs) + + def check_unused_args( + self, used_args: set[int | str], args: t.Sequence[t.Any], kwargs: t.Mapping[str, t.Any] + ) -> None: + """Check if extra params is passed.""" + extras = set(kwargs).difference(used_args) + if extras: + raise KeyError(f"Extra params passed: {extras}") + + def extract_template_variables(self, template: str) -> t.Sequence[str]: + """Extract template variables from a template string.""" + return [field[1] for field in self.parse(template) if field[1] is not None] + + +default_formatter = PromptFormatter() diff --git a/src/openllm/py.typed b/src/openllm/py.typed new file mode 100644 index 00000000..e69de29b diff --git a/src/openllm/runner_utils.py b/src/openllm/runner_utils.py new file mode 100644 index 00000000..dd8cd560 --- /dev/null +++ b/src/openllm/runner_utils.py @@ -0,0 +1,381 @@ +# Copyright 2023 BentoML Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +OpenLLM Runnable interface. This define a loose contract for LLMRunnable, which can then be implemented per any model. + +LLMRunnable also follow a loose API definition from LangChain's LLM, therefore it can also be used in conjunction with LangChain. +""" +from __future__ import annotations + +import logging +import typing as t +from abc import ABC, abstractmethod + +import attr +import bentoml + +import openllm + +from .configuration_utils import LLMConfig, ModelSignature +from .utils import _object_setattr + +if t.TYPE_CHECKING: + import transformers + + from openllm.types import (InferenceConfig, LLMModuleType, + ModelSignatureDict, ModelSignaturesType, + TokenizerRunner) + +else: + InferenceConfig = ModelSignatureDict = dict + + transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers") + +logger = logging.getLogger(__name__) + +M = t.TypeVar("M") +T = t.TypeVar("T") + + +def assign_start_model_name(start_model_name: str) -> t.Callable[..., t.Any]: + def wrapper(fn: t.Callable[..., t.Any]) -> t.Callable[..., t.Any]: + _object_setattr(fn, "start_model_name", start_model_name) + return fn + + return wrapper + + +def generate_tokenizer_runner( + tokenizer: transformers.PreTrainedTokenizer + | transformers.PreTrainedTokenizerBase + | transformers.PreTrainedTokenizerFast, + embedded: bool = False, +) -> TokenizerRunner: + """Generate a runner from any given transformers.AutoTokenizer. + + Args: + tokenizer: The tokenizer to generate the runner from. + """ + + # NOTE: I'm going to maintain this function from bentoml side, so internal imports here. 
+    from bentoml._internal.frameworks.transformers import make_default_signatures
+
+    signatures: ModelSignaturesType = make_default_signatures(tokenizer)
+
+    def __init_runnable__(self: bentoml.Runnable):
+        # keep a reference object to the tokenizer
+        self._tokenizer = tokenizer
+
+        self._predict_fns = {}
+        for method_name in signatures:
+            self._predict_fns[method_name] = getattr(tokenizer, method_name)
+
+    def add_method(cls: type[bentoml.Runnable], method_name: str, options: ModelSignature):
+        def fn(self: bentoml.Runnable, *args: t.Any, **kwargs: t.Any) -> t.Any:
+            try:
+                return self._predict_fns[method_name](*args, **kwargs)
+            except KeyError:
+                raise bentoml.exceptions.BadInput(f"Method {method_name} is not supported by this tokenizer.")
+
+        cls.add_method(
+            fn,
+            method_name,
+            batchable=options.batchable,
+            batch_dim=options.batch_dim,
+            input_spec=options.input_spec,
+            output_spec=options.output_spec,
+        )
+
+    RunnableCls: type[bentoml.Runnable] = type(
+        f"{tokenizer.__class__.__qualname__}Runnable",
+        (bentoml.Runnable,),
+        {
+            "SUPPORTED_RESOURCES": ("cpu",),
+            "SUPPORTS_CPU_MULTI_THREADING": True,
+            "__init__": __init_runnable__,
+        },
+    )
+
+    for method_name, options in signatures.items():
+        add_method(RunnableCls, method_name, options)
+
+    return t.cast(
+        "TokenizerRunner",
+        bentoml.Runner(RunnableCls, name=f"{tokenizer.__class__.__qualname__.lower()}-runner", embedded=embedded),
+    )
+
+
+class BaseLLMRunnable(bentoml.Runnable, ABC):
+    SUPPORTED_RESOURCES = ("nvidia.com/gpu", "cpu")
+    SUPPORTS_CPU_MULTI_THREADING = True
+
+    # NOTE: A loose contract for LLMRunnable implementations.
+    ATTACH_TOKENIZER: bool = True
+    """This boolean determines whether to attach a tokenizer to this runnable.
+    By default, it is True for _generate."""
+
+    start_model_name: str
+    """The default model to use when using ``openllm start``."""
+
+    default_model: str | None = None
+    """Return the default model to use when using ``openllm start``.
+    This could be one of the keys in self.pretrained_models() or a custom user model."""
+
+    inference_config: InferenceConfig = InferenceConfig(generate=ModelSignatureDict(batchable=False))
+    """The inference config for the two endpoints of this model.
+    This is the equivalent of args that are passed into ``bentoml.Runnable.method``.
+    """
+
+    config_class: type[LLMConfig] = LLMConfig
+    """The config class for any given LLMRunnable implementation."""
+
+    @staticmethod
+    @abstractmethod
+    def pretrained_models() -> list[str]:
+        """A list of supported pretrained model tags for this given runnable.
+
+        For example:
+            For the FLAN-T5 impl, this would be ["google/flan-t5-small", "google/flan-t5-base",
+            "google/flan-t5-large", "google/flan-t5-xl", "google/flan-t5-xxl"]
+        """
+
+        raise NotImplementedError
+
+    @abstractmethod
+    def _generate(self, *args: t.Any, **kwargs: t.Any) -> t.Any:
+        """This method should be implemented to provide a generation interface for the given model."""
+        raise NotImplementedError
+
+
+# TODO: Add support for model validation
+class LLMRunnable(BaseLLMRunnable, t.Generic[M, T]):
+    # The section below defines a loose contract with langchain's LLM interface.
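+    # (For reference: LangChain's LLM exposes `_llm_type` and `_identifying_params`
+    # with the same shapes, so an implementation of this contract can be adapted
+    # into a LangChain LLM by delegating LangChain's `_call` to `_generate`; a
+    # loose mapping, since the exact LangChain surface varies across versions.)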
+    @property
+    def _llm_type(self) -> str:
+        assert self.default_model is not None
+        return openllm.utils.convert_transformers_model_name(self.default_model)
+
+    @property
+    def _identifying_params(self) -> dict[str, t.Any]:
+        return {
+            "configuration": self._llm_config.dict(),
+            "inference_config": self.inference_config,
+            "pretrained": self.pretrained_models(),
+        }
+
+    # XXX: INTERNAL
+    _module: LLMModuleType
+    _model: M | None = None
+    _tokenizer: T | None = None
+
+    def __setattr__(self, attr_name: str, value: t.Any) -> None:
+        if attr_name in ("ATTACH_TOKENIZER",):
+            raise openllm.exceptions.OpenLLMException(
+                f"{attr_name} should not be set at runtime, as it determines how the runner is created. "
+                f"Please create a separate Runnable that extends '{self}' instead."
+            )
+        super().__setattr__(attr_name, value)
+
+    def __init_subclass__(cls, *, start_model_name: str, module: str | None = None):
+        cls.start_model_name = start_model_name
+
+        if module is None:
+            module = cls.__module__
+        cls._module = t.cast("LLMModuleType", openllm.utils.LazyLoader(module, globals(), module))
+
+    def __init__(
+        self,
+        /,
+        *,
+        model_name: str | None = None,
+        _bentomodel: bentoml.Model | None = None,
+        _llm_config: LLMConfig | None = None,
+        _internal: bool = False,
+        _dummy: bool = False,
+        **kwargs: t.Any,
+    ):
+        self._dummy = _dummy
+        if _dummy:
+            if not _internal:
+                raise openllm.exceptions.ForbiddenAttributeError(
+                    "'_dummy' should only be used internally, not in public-facing code."
+                )
+            return
+
+        if _bentomodel is not None:
+            if not _internal:
+                raise openllm.exceptions.ForbiddenAttributeError(
+                    "'_bentomodel' should only be used internally, not in public-facing code."
+                )
+            self._bentomodel = _bentomodel
+        else:
+            import_model_kwargs = kwargs.pop("import_model_kwargs", {})
+            import_tokenizer_kwargs = kwargs.pop("import_tokenizer_kwargs", {})
+            import_config_kwargs = kwargs.pop("import_config_kwargs", {})
+
+            assert self.default_model is not None
+            model_name = self.default_model if model_name is None else model_name
+
+            self._bentomodel = self._module.import_model(
+                model_name,
+                model_kwargs=import_model_kwargs,
+                tokenizer_kwargs=import_tokenizer_kwargs,
+                config_kwargs=import_config_kwargs,
+            )
+
+        if _llm_config is not None:
+            if not _internal:
+                raise openllm.exceptions.ForbiddenAttributeError(
+                    "'_llm_config' should only be used internally, not in public-facing code."
+                )
+            self._llm_config = _llm_config
+        else:
+            assert self.config_class is not None, "'config_class' is required."
+            self._llm_config = self.config_class(**kwargs)
+
+    def __getattribute__(self, item: t.Any):
+        is_dummy = object.__getattribute__(self, "_dummy")
+        if is_dummy and item in ("model", "tokenizer", "create_runner", "bentoml_runnable_methods__", "add_method"):
+            logger.warning("Accessing '%s' on a dummy object. Returning None.", item)
+            return
+        return super().__getattribute__(item)
+
+    @classmethod
+    def dummy_object(cls) -> LLMRunnable[M, T]:
+        return cls(_dummy=True, _internal=True)
+
+    @property
+    def model(self) -> M:
+        # NOTE: should we have support for nested runners here?
+        if self._model is None:
+            self._model = self._bentomodel.load_model()
+        return self._model
+
+    @property
+    def tokenizer(self) -> T:
+        # The tokenizer attached to the bento model's custom objects. This can
+        # then be used for the implementation of _generate.
+        if self._tokenizer is None:
+            if not self.ATTACH_TOKENIZER:
+                raise openllm.exceptions.OpenLLMException(
+                    "Tokenizer is not attached. Please set ATTACH_TOKENIZER=True when creating this runnable."
+                )
+            self._tokenizer = self._bentomodel.custom_objects["tokenizer"]
+        return self._tokenizer
+
+    @classmethod
+    def create_runner(
+        cls,
+        pretrained_or_path: str | None = None,
+        config: LLMConfig | None = None,
+        runner_name: str | None = None,
+        models: list[bentoml.Model] | None = None,
+        max_batch_size: int | None = None,
+        max_latency_ms: int | None = None,
+        method_configs: ModelSignaturesType | None = None,
+        embedded: bool = False,
+        import_model_kwargs: dict[str, t.Any] | None = None,
+        import_tokenizer_kwargs: dict[str, t.Any] | None = None,
+        import_config_kwargs: dict[str, t.Any] | None = None,
+        **kwargs: t.Any,
+    ) -> LLMRunner:
+        """Convert this LLMRunnable to a bentoml.Runner.
+
+        Args:
+            pretrained_or_path: The name of or path to the pretrained model to generate the runner from.
+            config: An optional ``LLMConfig`` to use for this runner.
+            runner_name: The name of the runner to generate. Optional, as this will be generated based on the model name.
+            models: Any additional ``bentoml.Model`` to include with this runner. By default, this will be determined from the model name.
+            max_batch_size: The maximum batch size for the runner.
+            max_latency_ms: The maximum latency for the runner.
+            method_configs: The method configs for the runner.
+            embedded: Whether to run this runner in embedded mode.
+            import_model_kwargs: To pass to model_kwargs in ``import_model``.
+            import_tokenizer_kwargs: To pass to tokenizer_kwargs in ``import_model``.
+            import_config_kwargs: To pass to config_kwargs in ``import_model``.
+            The rest of the kwargs will then be passed into ``self.config_class`` (which is of type ``LLMConfig``).
+
+        Returns:
+            A bentoml.Runner instance.
+        """
+        if pretrained_or_path is None:
+            if cls.default_model is None:
+                raise openllm.exceptions.OpenLLMException(
+                    "A default model is required for any LLMRunnable. Make sure to specify a default_model or pass in a model name."
+                )
+            pretrained_or_path = cls.default_model
+        if pretrained_or_path not in cls.pretrained_models():
+            logger.debug("Creating runner for custom model '%s'", pretrained_or_path)
+
+        models = models or []
+        bentomodel = cls._module.import_model(
+            pretrained_or_path,
+            model_kwargs=import_model_kwargs,
+            tokenizer_kwargs=import_tokenizer_kwargs,
+            config_kwargs=import_config_kwargs,
+        )
+        models.append(bentomodel)
+
+        if runner_name is None:
+            runner_name = f"llm-{cls.start_model_name}-runner"
+
+        _bento_runnable_methods_map = {"generate": cls._generate}
+
+        for method_name, method_config in cls.inference_config.items():
+            signature = ModelSignature.from_dict(t.cast(ModelSignatureDict, method_config))
+            cls.add_method(
+                _bento_runnable_methods_map[method_name],
+                method_name,
+                batchable=signature.batchable,
+                batch_dim=signature.batch_dim,
+                input_spec=signature.input_spec,
+                output_spec=signature.output_spec,
+            )
+
+        # The rest of the kwargs can then be passed to LLMConfig
+        if config is not None:
+            llm_config = config.with_options(**kwargs)
+        else:
+            llm_config = cls.config_class(**kwargs)
+
+        return LLMRunner(
+            cls,
+            llm_config=llm_config,
+            runnable_init_params={
+                "model_name": pretrained_or_path,
+                "_bentomodel": bentomodel,
+                "_llm_config": llm_config,
+                "_internal": True,
+            },
+            name=runner_name,
+            models=models,
+            max_batch_size=max_batch_size,
+            max_latency_ms=max_latency_ms,
+            method_configs=method_configs,
+            embedded=embedded,
+        )
+
+
+class LLMRunner(bentoml.Runner):
+    llm_config: LLMConfig = attr.field(factory=lambda: LLMConfig())
+
+    def __init__(
+        self,
+        runnable_class: type[LLMRunnable[t.Any, t.Any]],
+        llm_config: LLMConfig,
+        **kwargs: t.Any,
+    ):
+        super().__init__(runnable_class, **kwargs)
+        # A hack around frozen attributes.
+        _object_setattr(self, "llm_config", llm_config)
diff --git a/src/openllm/schema.py b/src/openllm/schema.py
new file mode 100644
index 00000000..57ade728
--- /dev/null
+++ b/src/openllm/schema.py
@@ -0,0 +1,73 @@
+# Copyright 2023 BentoML Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Schema definition for OpenLLM. This can be used for client interaction.
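+
+Example (an illustrative exchange with the ``/generate`` endpoint; the values shown are made up):
+
+    request:  {"prompt": "Question: {q}\nAnswer:", "q": "What is BentoML?"}
+    response: {"responses": ["..."], "configuration": {"max_length": ...}}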
+""" +from __future__ import annotations + +import typing as t +from abc import ABC + +import pydantic + +import openllm + + +class PromptTemplate(pydantic.BaseModel): + template: str + input_variables: t.Sequence[str] + + class Config: + extra = "forbid" + + def to_str(self, **kwargs: str) -> str: + """Generate a prompt from the template and input variables""" + if not kwargs: + raise ValueError("Keyword arguments are required") + if not all(k in kwargs for k in self.input_variables): + raise ValueError(f"Missing required input variables: {self.input_variables}") + return openllm.prompts.default_formatter.format(self.template, **kwargs) + + @classmethod + def from_template(cls, template: str) -> PromptTemplate: + input_variables = openllm.prompts.default_formatter.extract_template_variables(template) + return cls(template=template, input_variables=input_variables) + + @classmethod + def from_default(cls, model: str) -> PromptTemplate: + template = getattr(openllm.utils.get_lazy_module(model), "DEFAULT_PROMPT_TEMPLATE") + if template is None: + raise ValueError(f"Model {model} does not have a default prompt template.") + return cls.from_template(template) + + +class BaseIO(pydantic.BaseModel, ABC): + class Config: + extra = "forbid" + + +class PromptInput(BaseIO): + prompt: str + """The prompt to be sent to system.""" + + inputs: t.Dict[str, str] + """A mapping of given inputs value. Should be use for cache.""" + + +class PromptOutput(BaseIO): + responses: t.List[str] + """A list of responses from the system.""" + + configuration: t.Dict[str, t.Any] + """A mapping of configuration values for given system.""" diff --git a/src/openllm/server_utils.py b/src/openllm/server_utils.py new file mode 100644 index 00000000..183ce407 --- /dev/null +++ b/src/openllm/server_utils.py @@ -0,0 +1,85 @@ +# Copyright 2023 BentoML Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Server utilities for OpenLLM. This extends bentoml.Server. + +It independently manage processes and threads for runners and servers separately. +This is an experimental feature and can also be merged to upstream BentoML. +""" +from __future__ import annotations + +import logging +import subprocess +import typing as t +from io import StringIO + +import bentoml + +import openllm + +logger = logging.getLogger(__name__) + + +def start( + model_name: str, + framework: t.Literal["flax", "tf", "pt"] | None = None, + server_args: dict[str, t.Any] | None = None, + serve_grpc: bool = False, + **llm_config_args: t.Any, +): + # NOTE: We need the below imports so that the client can use the custom IO Descriptor. 
+    from openllm.prompts import Prompt as Prompt
+
+    if framework is None:
+        framework = openllm.utils.get_framework_env(model_name)
+
+    start_env = {
+        openllm.utils.FRAMEWORK_ENV_VAR(model_name): framework,
+    }
+
+    server_args = server_args or {}
+    server_args.update(
+        {
+            "working_dir": openllm.utils.get_working_dir(model_name),
+            "bento": f'service_{model_name.replace("-", "_")}:svc',
+        }
+    )
+    # NOTE: currently, there are no development args in bentoml.Server. To be fixed upstream.
+    development = server_args.pop("development", False)
+    server_args.setdefault("production", not development)
+    server = getattr(bentoml, "HTTPServer" if not serve_grpc else "GrpcServer")(**server_args)
+    server.timeout = 90
+
+    server.start(env=start_env, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+    assert server.process is not None
+    client = server.get_client()
+    if llm_config_args:
+        res = client.update_llm_config(llm_config_args)
+        assert res
+
+    logger.info("Server for running '%s' can now be accessed at %s", model_name, client.server_url)
+    # TODO: Add generated instructions for using the client in JS, Python and Go here.
+
+    def log_output(pipe: t.TextIO):
+        for line in iter(pipe.readline, ""):  # newline-separated lines; the pipe is decoded text
+            logger.info(line)
+
+    try:
+        stdout, _ = server.process.communicate()
+        log_output(StringIO(stdout))
+    except Exception as err:
+        logger.error("Exception occurred while running '%s':\n", model_name)
+        logger.error(err)
+        raise
diff --git a/src/openllm/types.py b/src/openllm/types.py
new file mode 100644
index 00000000..999ed5ec
--- /dev/null
+++ b/src/openllm/types.py
@@ -0,0 +1,68 @@
+# Copyright 2023 BentoML Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Type definitions for OpenLLM.
+
+Note that this module SHOULD NOT BE IMPORTED AT RUNTIME, as it serves only typing purposes.
+"""
+from __future__ import annotations
+
+import typing as t
+
+if not t.TYPE_CHECKING:
+    raise RuntimeError(f"{__name__} should not be imported at runtime")
+
+import bentoml
+from bentoml._internal.io_descriptors.base import OpenAPIResponse
+from bentoml._internal.models.model import ModelSignaturesType as ModelSignaturesType
+from bentoml.types import ModelSignatureDict
+
+from openllm.configuration_utils import LLMConfig
+from openllm.runner_utils import LLMRunnable
+from openllm.utils import LazyLoader
+
+P = t.ParamSpec("P")
+
+F = t.Callable[P, t.Any]
+
+
+class InferenceConfig(t.TypedDict):
+    generate: ModelSignatureDict
+
+
+class LLMModuleType(LazyLoader):
+    @staticmethod
+    def import_model(
+        model_name: str,
+        model_kwargs: dict[str, t.Any] | None = None,
+        tokenizer_kwargs: dict[str, t.Any] | None = None,
+        config_kwargs: dict[str, t.Any] | None = None,
+    ) -> bentoml.Model:
+        ...
+
+    class LLMConfigImpl(LLMConfig):
+        ...
+
+    class LLMRunnableImpl(LLMRunnable[t.Any, t.Any], start_model_name="dummy"):
+        ...
+
+    @staticmethod
+    def RunnableNameTokenizer(model_name: str | None = None, embedded: bool = True, **kwargs: t.Any) -> TokenizerRunner:
+        ...
+
+
+# The following type definitions are extensions of bentoml.Runner
+class TokenizerRunner(bentoml.Runner):
+    ...
diff --git a/src/openllm/utils/__init__.py b/src/openllm/utils/__init__.py
new file mode 100644
index 00000000..25b4648a
--- /dev/null
+++ b/src/openllm/utils/__init__.py
@@ -0,0 +1,198 @@
+# Copyright 2023 BentoML Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Utility functions for OpenLLM. Users can import these functions for convenience,
+but we won't ensure backward compatibility for them, so use with caution.
+"""
+from __future__ import annotations
+
+import importlib
+import importlib.machinery
+import itertools
+import logging
+import os
+import re
+import types
+import typing as t
+
+import bentoml
+
+# NOTE: The following exports useful utils from bentoml
+from bentoml._internal.utils import LazyLoader as LazyLoader
+from bentoml._internal.utils import pkg as packaging_utils
+from bentoml._internal.utils import reserve_free_port as reserve_free_port
+from bentoml._internal.utils import resolve_user_filepath as resolve_user_filepath
+
+if t.TYPE_CHECKING:
+    import transformers
+    import transformers.utils as import_utils_shim
+
+    from openllm.runner_utils import LLMRunner
+
+else:
+    transformers = LazyLoader("transformers", globals(), "transformers")
+
+    if packaging_utils.pkg_version_info("transformers")[:2] >= (4, 18):
+        import_utils_shim = LazyLoader("import_utils_shim", globals(), "transformers.utils")
+    else:
+        # NOTE: The logic below handles pre-4.18 compatibility for checking tf, jax, and torch availability.
+        import_utils_shim = LazyLoader(
+            "import_utils_shim", globals(), "bentoml._internal.frameworks.utils.transformers"
+        )
+
+logger = logging.getLogger(__name__)
+
+_object_setattr = object.__setattr__
+
+
+def kebab_to_snake_case(name: str) -> str:
+    """Convert a given kebab-case name to snake_case."""
+    return re.sub(r"-", "_", name)
+
+
+def get_pretrained_env(model_name: str) -> str | None:
+    """Convert a given runnable start model name (kebab-case) to its snake_case ENV variable
+    (e.g. OPENLLM_FLAN_T5_PRETRAINED) and return that variable's value, if set."""
+    return os.environ.get(f"OPENLLM_{kebab_to_snake_case(model_name.upper())}_PRETRAINED", None)
+
+
+def get_lazy_module(model_name: str) -> LazyLoader:
+    snaked_model_name = kebab_to_snake_case(model_name)
+    return LazyLoader(snaked_model_name, globals(), f"openllm.models.{snaked_model_name}")
+
+
+def get_working_dir(model_name: str) -> str:
+    """Get the working directory for a given model name."""
+    return os.path.dirname(importlib.import_module(f".{kebab_to_snake_case(model_name)}", "openllm.models").__file__)
+
+
+def FRAMEWORK_ENV_VAR(model_name: str) -> str:
+    return f"OPENLLM_{kebab_to_snake_case(model_name.upper())}_FRAMEWORK"
+
+
+def get_framework_env(model_name: str) -> str:
+    envvar = os.environ.get(FRAMEWORK_ENV_VAR(model_name), "pt")
+    if envvar not in ("pt", "tf", "flax"):
+        raise ValueError(f"Invalid framework implementation {envvar}, must be one of 'pt', 'tf', 'flax'")
+    return envvar
+
+
+def generate_service_name(runner: LLMRunner) -> str:
+    dummy_object = runner.runnable_class.dummy_object()
+    return f"llm-{dummy_object.start_model_name}-service"
+
+
+def convert_transformers_model_name(name: str) -> str:
+    return re.sub("[^a-zA-Z0-9]+", "-", name)
+
+
+def generate_tag_from_model_name(model_name: str, prefix: str | None = None, **kwargs: t.Any) -> bentoml.Tag:
+    """Generate a ``bentoml.Tag`` from a given transformers model name.
+
+    Note that this depends on your model having a config class available.
+
+    Args:
+        model_name: The transformers model name.
+        prefix: An optional prefix to prepend to the generated tag.
+        **kwargs: Additional kwargs to pass to the ``transformers.AutoConfig`` constructor.
+            If you pass ``return_unused_kwargs=True``, it will be ignored.
+    """
+    if "return_unused_kwargs" in kwargs:
+        logger.debug("Ignoring 'return_unused_kwargs' in 'generate_tag_from_model_name'.")
+        kwargs.pop("return_unused_kwargs")
+    config = transformers.AutoConfig.from_pretrained(model_name, **kwargs)
+    commit_hash = getattr(config, "_commit_hash", None)
+    if commit_hash is None:
+        logger.warning(
+            "Given %s from '%s' doesn't contain a commit hash. We will generate the tag without a specific version.",
+            config.__class__,
+            model_name,
+        )
+    tag_str = (
+        convert_transformers_model_name(model_name)
+        if commit_hash is None
+        else f"{convert_transformers_model_name(model_name)}:{commit_hash}"
+    )
+    return bentoml.Tag.from_taglike((f"{prefix}-" if prefix is not None else "") + tag_str)
+
+
+class LazyModule(types.ModuleType):
+    """
+    Module class that surfaces all objects but only performs associated imports when the objects are requested.
+    This is a direct port of transformers.utils.import_utils._LazyModule for backwards compatibility with transformers <4.18.
+
+    This is an extension of the more powerful LazyLoader.
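+
+    Example (illustrative; the import structure shown is hypothetical):
+
+        import sys
+        _import_structure = {"schema": ["PromptTemplate", "PromptInput", "PromptOutput"]}
+        sys.modules[__name__] = LazyModule(
+            __name__, globals()["__file__"], _import_structure, module_spec=__spec__
+        )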
+ """ + + # Very heavily inspired by optuna.integration._IntegrationModule + # https://github.com/optuna/optuna/blob/master/optuna/integration/__init__.py + def __init__( + self, + name: str, + module_file: str, + import_structure: dict[str, list[str]], + module_spec: importlib.machinery.ModuleSpec | None = None, + extra_objects: dict[str, t.Any] | None = None, + ): + super().__init__(name) + self._modules = set(import_structure.keys()) + self._class_to_module: dict[str, str] = {} + for key, values in import_structure.items(): + for value in values: + self._class_to_module[value] = key + # Needed for autocompletion in an IDE + self.__all__ = list(import_structure.keys()) + list(itertools.chain(*import_structure.values())) + self.__file__ = module_file + self.__spec__ = module_spec + self.__path__ = [os.path.dirname(module_file)] + self._objects = {} if extra_objects is None else extra_objects + self._name = name + self._import_structure = import_structure + + # Needed for autocompletion in an IDE + def __dir__(self): + result = t.cast("list[str]", super().__dir__()) + # The elements of self.__all__ that are submodules may or + # may not be in the dir already, depending on whether + # they have been accessed or not. So we only add the + # elements of self.__all__ that are not already in the dir. + for attr in self.__all__: + if attr not in result: + result.append(attr) + return result + + def __getattr__(self, name: str) -> t.Any: + if name in self._objects: + return self._objects[name] + if name in self._modules: + value = self._get_module(name) + elif name in self._class_to_module.keys(): + module = self._get_module(self._class_to_module[name]) + value = getattr(module, name) + else: + raise AttributeError(f"module {self.__name__} has no attribute {name}") + + setattr(self, name, value) + return value + + def _get_module(self, module_name: str): + try: + return importlib.import_module("." + module_name, self.__name__) + except Exception as e: + raise RuntimeError( + f"Failed to import {self.__name__}.{module_name} because of the following error (look up to see its" + f" traceback):\n{e}" + ) from e + + def __reduce__(self): + return (self.__class__, (self._name, self.__file__, self._import_structure)) diff --git a/src/openllm/utils/logging.py b/src/openllm/utils/logging.py new file mode 100644 index 00000000..e69de29b diff --git a/src/openllm_client/__init__.py b/src/openllm_client/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 00000000..e69de29b diff --git a/tools/bazel b/tools/bazel new file mode 100755 index 00000000..4295d6c8 --- /dev/null +++ b/tools/bazel @@ -0,0 +1,65 @@ +#!/bin/bash + +set -e + +# courtesy of https://github.com/grpc/grpc +GIT_ROOT="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &> /dev/null && pwd)/.." 
+ +# DISABLE_BAZEL_WRAPPER can be set to eliminate the wrapper logic +if [ "${DISABLE_BAZEL_WRAPPER}" != "" ] && [ "${OVERRIDE_BAZEL_VERSION}" == "" ]; then + if [ "${BAZEL_REAL}" != "" ]; then + # use BAZEL_REAL as set by + # https://github.com/bazelbuild/bazel/blob/master/scripts/packages/bazel.sh + # that originally invoked this script (this is what happens when you + # run "bazel" in our workspace) + exec -a "$0" "${BAZEL_REAL}" "$@" + else + # if BAZEL_REAL is not set, just invoke the default system bazel + exec bazel "$@" + fi +fi + +VERSION=${OVERRIDE_BAZEL_VERSION:-$(< "$GIT_ROOT/.bazelversion")} +echo "INFO: Running bazel wrapper (see //tools/bazel for details), bazel version $VERSION will be used instead of system-wide bazel installation." >&2 + +BASEURL_MIRROR="https://storage.googleapis.com/grpc-bazel-mirror/github.com/bazelbuild/bazel/releases/download" +BASEURL="https://github.com/bazelbuild/bazel/releases/download" +pushd "$(dirname "$0")" > /dev/null +# bazel binary will be downloaded to GIT_ROOT/tools directory by default +DOWNLOAD_DIR=${OVERRIDE_BAZEL_WRAPPER_DOWNLOAD_DIR:-$GIT_ROOT/tools} + +case $(uname -sm) in + "Linux x86_64") + suffix=linux-x86_64 + ;; + "Linux aarch64") + suffix=linux-arm64 + ;; + "Darwin x86_64") + suffix=darwin-x86_64 + ;; + "Darwin arm64") + suffix=darwin-arm64 + ;; + "MINGW"* | "MSYS_NT"*) + suffix=windows-x86_64.exe + ;; + *) + echo "Unsupported architecture: $(uname -sm)" >&2 + exit 1 + ;; +esac + +filename="bazel-$VERSION-$suffix" +filename_abs="${DOWNLOAD_DIR}/${filename}" + +if [ ! -x "${filename_abs}" ]; then + # first try to download using mirror, fallback to download from github + echo "Downloading bazel, will try URLs: ${BASEURL_MIRROR}/${VERSION}/${filename} ${BASEURL}/${VERSION}/${filename}" >&2 + curl --fail -L --output "${filename_abs}" "${BASEURL_MIRROR}/${VERSION}/${filename}" || curl --fail -L --output "${filename_abs}" "${BASEURL}/${VERSION}/${filename}" + chmod a+x "${filename_abs}" +fi + +popd > /dev/null + +exec "${filename_abs}" "$@" diff --git a/tools/bazel.rc b/tools/bazel.rc new file mode 100644 index 00000000..98348ae8 --- /dev/null +++ b/tools/bazel.rc @@ -0,0 +1,58 @@ +# Coloring for error messages. +common --color=yes + +# Work around https://github.com/bazelbuild/bazel/issues/6293 by setting a dummy lcov. +coverage --combined_report=lcov --coverage_report_generator=@bazel_tools//tools/test:lcov_merger + +# prevent creation of empty __init__.py +# see: https://github.com/bazelbuild/bazel/issues/10076, https://github.com/bazelbuild/bazel/issues/7386 +build --incompatible_default_to_explicit_init_py +test --incompatible_default_to_explicit_init_py + +# Windows requires enable_runfiles +build --enable_runfiles +startup --windows_enable_symlinks + +## Build ## + +# Turn off legacy external runfiles +# This prevents accidentally depending on this feature, which Bazel will remove. 
+build --nolegacy_external_runfiles
+# Use clang as our compiler instead of gcc
+build --client_env=CC=clang
+build --cxxopt=-std=c++14 --host_cxxopt=-std=c++14
+# specify fast_cpp_protos for protobuf
+build --define=use_fast_cpp_protos=true
+# don't zip, see: https://github.com/bazelbuild/bazel/issues/8981
+build --build_python_zip=false
+
+build --verbose_failures
+build --worker_verbose
+
+# --config=opt
+build:opt --compilation_mode=opt
+build:opt --copt=-Wframe-larger-than=16384
+
+# --config=dbg
+build:dbg --compilation_mode=dbg
+build:dbg --copt=-Werror=return-stack-address
+
+# Dynamic linking causes issues like: `dyld: malformed mach-o: load commands size (59272) > 32768`
+# https://github.com/bazelbuild/bazel/issues/9190
+build:macos --dynamic_mode=off
+# Address https://github.com/bazelbuild/rules_swift/issues/776
+build:macos --host_swiftcopt=-wmo --swiftcopt=-wmo
+# The default strategy is worker, which has sandboxing disabled by default
+# and can hide non-hermetic build bugs.
+build:macos --strategy=SwiftCompile=sandboxed

## Test ##

# --test_output=errors: print out test errors
test --test_output=errors

# with dbg mode we will stream the output
test:dbg --test_output=streamed

# nix config
build:nix --copt="-I$(nix eval --impure --raw --expr 'let pkgs = import { }; in pkgs.llvmPackages.openmp.out')/include -L$(nix eval --impure --raw --expr 'let pkgs = import { }; in pkgs.llvmPackages.openmp.out')/lib"
diff --git a/tools/dependencies b/tools/dependencies
new file mode 100755
index 00000000..d2c4af24
--- /dev/null
+++ b/tools/dependencies
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+set -e
+
+GIT_ROOT=$(git rev-parse --show-toplevel)
+
+cd "$GIT_ROOT" || exit 1
+
+# NOTE that we are using local bazel instead of system bazel.
+bazel run //requirements:pypi.update
+bazel run //:vendor-pypi
+bazel run //requirements:tensorflow.update
+bazel run //:vendor-tensorflow
+bazel run //requirements:tests.update
+bazel run //:vendor-tests
diff --git a/tools/dev.Dockerfile b/tools/dev.Dockerfile
new file mode 100644
index 00000000..bbb09747
--- /dev/null
+++ b/tools/dev.Dockerfile
@@ -0,0 +1,187 @@
+# syntax=docker/dockerfile-upstream:master
+
+FROM python:3.10-slim as base
+
+ENV DEBIAN_FRONTEND=noninteractive
+
+WORKDIR /workspace
+
+RUN --mount=type=cache,target=/var/lib/apt \
+    --mount=type=cache,target=/var/cache/apt \
+    apt-get update && \
+    apt-get install -q -y --no-install-recommends --allow-remove-essential \
+    bash build-essential ca-certificates git tree
+
+FROM base as protobuf-3
+
+COPY <<-EOT requirements.txt
+    protobuf>=3.5.0,<4.0dev
+    grpcio-tools
+    mypy-protobuf
+EOT
+
+RUN --mount=type=cache,target=/root/.cache/pip pip install -r requirements.txt
+
+FROM base as protobuf-4
+
+COPY <<-EOT requirements.txt
+    protobuf>=4.0,<5.0dev
+    grpcio-tools
+    mypy-protobuf
+EOT
+
+RUN --mount=type=cache,target=/root/.cache/pip pip install -r requirements.txt
+
+############################################
+
+# BentoML gRPC protobuf 3 generation
+
+FROM protobuf-3 as run-grpcio-tools-3
+
+ARG PROTOCOL_VERSION
+ARG GENERATED_PB3_DIR
+
+RUN mkdir -p /result/${GENERATED_PB3_DIR}
+
+RUN --mount=type=bind,target=.,rw <
+BASEDIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]:-$0}")" &> /dev/null && pwd 2> /dev/null)"
+GIT_ROOT=$(git rev-parse --show-toplevel)
+
+cd "$GIT_ROOT" || exit 1
+
+# NOTE that we are using local bazel instead of system bazel.
+bazel run //:buildcheck +bazel run //:black -- --check "$GIT_ROOT/src" +bazel run //:isort -- --check "$GIT_ROOT/src" +# linting protos +bazel run //:v1_proto_lint +bazel run //:v1alpha1_proto_lint + +# Running ruff for whole codebase. +ruff src examples docs tests diff --git a/tools/style b/tools/style new file mode 100755 index 00000000..4e0b953b --- /dev/null +++ b/tools/style @@ -0,0 +1,25 @@ +#!/bin/bash + +set -e + +BASEDIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]:-$0}")" &> /dev/null && pwd 2> /dev/null)" + +GIT_ROOT=$(git rev-parse --show-toplevel) + +cd "$GIT_ROOT" || exit 1 + +bazel run //:buildfmt +bazel run //:black -- "$GIT_ROOT/src" +bazel run //:black -- --pyi "$GIT_ROOT/typings" "$GIT_ROOT/src/bentoml/metrics.pyi" +bazel run //:isort -- "$GIT_ROOT/src" + +# Running ruff for whole codebase. +ruff --fix src examples docs tests + +if command -v buf > /dev/null 2>&1; then + buf format --config "$GIT_ROOT/src/bentoml/grpc/buf.yaml" -w src/bentoml/grpc +else + if command -v docker > /dev/null 2>&1; then + docker run --init --rm --volume "$GIT_ROOT/src":/workspace --workdir /workspace bufbuild/buf format --config "/workspace/bentoml/grpc/buf.yaml" -w bentoml/grpc + fi +fi
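(Usage note: both helper scripts above resolve the repository root via git rev-parse, so they can be invoked from any subdirectory; the check-only script runs the same formatters in verification mode for CI, while tools/style rewrites files in place, e.g.:)

./tools/style   # buildfmt, black, isort, ruff --fix, and buf format in one pass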