From 1539c3f7dc35bf60c2431316a64f11ac7aba9988 Mon Sep 17 00:00:00 2001
From: Aaron Pham <29749331+aarnphm@users.noreply.github.com>
Date: Thu, 12 Oct 2023 17:21:54 -0400
Subject: [PATCH] feat(client): simple implementation and streaming (#256)

---
 .editorconfig                                 |   2 +
 .gitattributes                                |   4 +
 cz.py                                         |   9 +-
 hatch.toml                                    |   1 +
 openllm-client/README.md                      |   7 +-
 openllm-client/dev.Dockerfile                 |  98 +++
 openllm-client/generate-grpc-stubs            |  66 ++
 openllm-client/protos/service.proto           | 297 +++++++++
 openllm-client/pyproject.toml                 |  12 +-
 openllm-client/src/openllm_client/__init__.py |  12 +-
 openllm-client/src/openllm_client/_base.py    | 298 ---------
 openllm-client/src/openllm_client/_http.py    | 137 ++++
 openllm-client/src/openllm_client/_schemas.py |  24 +
 .../src/openllm_client/benmin/__init__.py     | 124 ----
 .../src/openllm_client/benmin/_grpc.py        | 291 --------
 .../src/openllm_client/benmin/_http.py        | 201 ------
 openllm-client/src/openllm_client/client.py   |  36 -
 .../src/openllm_client/pb/__init__.py         |   2 +
 .../{_adapters.py => pb/v1/__init__.py}       |   0
 .../pb/v1/_generated_pb3/__init__.py          |   0
 .../pb/v1/_generated_pb3/service_pb2.py       | 205 ++++++
 .../pb/v1/_generated_pb3/service_pb2.pyi      | 611 +++++++++++++++++
 .../pb/v1/_generated_pb3/service_pb2_grpc.py  | 104 +++
 .../pb/v1/_generated_pb3/service_pb2_grpc.pyi |  42 ++
 .../pb/v1/_generated_pb4/__init__.py          |   0
 .../pb/v1/_generated_pb4/service_pb2.py       |  84 +++
 .../pb/v1/_generated_pb4/service_pb2.pyi      | 624 ++++++++++++++++++
 .../pb/v1/_generated_pb4/service_pb2_grpc.py  | 104 +++
 .../pb/v1/_generated_pb4/service_pb2_grpc.pyi |  67 ++
 .../src/openllm_client/pb/v1/service_pb2.py   |   6 +
 .../src/openllm_client/pb/v1/service_pb2.pyi  |   6 +
 .../openllm_client/pb/v1/service_pb2_grpc.py  |   6 +
 .../openllm_client/pb/v1/service_pb2_grpc.pyi |   6 +
 openllm-core/src/openllm_core/_schema.py      |   2 +
 .../src/openllm_core/_typing_compat.py        |   2 +
 .../config/configuration_baichuan.py          |   8 +
 .../config/configuration_llama.py             |  10 +-
 openllm-python/src/openllm/_llm.py            |  12 +-
 openllm-python/src/openllm/_service.py        |  19 +-
 openllm-python/src/openllm/cli/entrypoint.py  |  32 +-
 openllm-python/src/openllm/client.py          |   6 +-
 pyproject.toml                                |   1 +
 42 files changed, 2581 insertions(+), 997 deletions(-)
 create mode 100644 openllm-client/dev.Dockerfile
 create mode 100755 openllm-client/generate-grpc-stubs
 create mode 100644 openllm-client/protos/service.proto
 delete mode 100644 openllm-client/src/openllm_client/_base.py
 create mode 100644 openllm-client/src/openllm_client/_http.py
 create mode 100644 openllm-client/src/openllm_client/_schemas.py
 delete mode 100644 openllm-client/src/openllm_client/benmin/__init__.py
 delete mode 100644 openllm-client/src/openllm_client/benmin/_grpc.py
 delete mode 100644 openllm-client/src/openllm_client/benmin/_http.py
 delete mode 100644 openllm-client/src/openllm_client/client.py
 create mode 100644 openllm-client/src/openllm_client/pb/__init__.py
 rename openllm-client/src/openllm_client/{_adapters.py => pb/v1/__init__.py} (100%)
 create mode 100644 openllm-client/src/openllm_client/pb/v1/_generated_pb3/__init__.py
 create mode 100644 openllm-client/src/openllm_client/pb/v1/_generated_pb3/service_pb2.py
 create mode 100644 openllm-client/src/openllm_client/pb/v1/_generated_pb3/service_pb2.pyi
 create mode 100644 openllm-client/src/openllm_client/pb/v1/_generated_pb3/service_pb2_grpc.py
 create mode 100644 openllm-client/src/openllm_client/pb/v1/_generated_pb3/service_pb2_grpc.pyi
 create mode 100644 openllm-client/src/openllm_client/pb/v1/_generated_pb4/__init__.py
 create mode 100644 openllm-client/src/openllm_client/pb/v1/_generated_pb4/service_pb2.py
 create mode 100644 openllm-client/src/openllm_client/pb/v1/_generated_pb4/service_pb2.pyi
 create mode 100644 openllm-client/src/openllm_client/pb/v1/_generated_pb4/service_pb2_grpc.py
 create mode 100644 openllm-client/src/openllm_client/pb/v1/_generated_pb4/service_pb2_grpc.pyi
 create mode 100644 openllm-client/src/openllm_client/pb/v1/service_pb2.py
 create mode 100644 openllm-client/src/openllm_client/pb/v1/service_pb2.pyi
 create mode 100644 openllm-client/src/openllm_client/pb/v1/service_pb2_grpc.py
 create mode 100644 openllm-client/src/openllm_client/pb/v1/service_pb2_grpc.pyi

diff --git a/.editorconfig b/.editorconfig
index 3fcb4be2..0d53e419 100644
--- a/.editorconfig
+++ b/.editorconfig
@@ -11,3 +11,5 @@ indent_size = 2
 
 [openllm-python/src/openllm/cli/entrypoint.py]
 indent_size = unset
+[openllm-client/src/openllm_client/pb/v1/*]
+indent_size = unset
diff --git a/.gitattributes b/.gitattributes
index a6e8db87..f25ee99a 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -2,6 +2,10 @@
 contrib/clojure/pnpm-lock.yaml linguist-generated=true
 contrib/clojure/src/generated/** linguist-generated=true
 
+# Python Client
+*_pb2*.py linguist-generated=true
+*_pb2*.pyi linguist-generated=true
+
 # Python sdk
 openllm-python/tests/models/__snapshots__/* linguist-generated=true
 openllm-python/src/openllm/utils/dummy_*.py linguist-generated=true
diff --git a/cz.py b/cz.py
index a1fd3cb4..ac604cfb 100755
--- a/cz.py
+++ b/cz.py
@@ -20,10 +20,13 @@ def run_cz(dir: str, package: str):
         tokens = [t for t in tokenize.generate_tokens(file_.readline) if t.type in TOKEN_WHITELIST]
         token_count, line_count = len(tokens), len(set([t.start[0] for t in tokens]))
         table.append([filepath.replace(os.path.join(dir, 'src'), ''), line_count, token_count / line_count if line_count != 0 else 0])
+  print(f'\n{"=" * 80}\n')
   print(tabulate([headers, *sorted(table, key=lambda x: -x[1])], headers='firstrow', floatfmt='.1f') + '\n')
-  for dir_name, group in itertools.groupby(sorted([(x[0].rsplit('/', 1)[0], x[1]) for x in table]), key=lambda x: x[0]):
-    print(f'{dir_name:35s} : {sum([x[1] for x in group]):6d}')
-  print(f'\ntotal line count: {sum([x[1] for x in table])}')
+  print(
+      tabulate([(dir_name, sum([x[1] for x in group])) for dir_name, group in itertools.groupby(sorted([(x[0].rsplit('/', 1)[0], x[1]) for x in table]), key=lambda x: x[0])],
+               headers=['Directory', 'LOC'],
+               floatfmt='.1f'))
+  print(f'total line count for {package}: {sum([x[1] for x in table])}\n')
 
 def main() -> int:
   run_cz('openllm-python', 'openllm')
diff --git a/hatch.toml b/hatch.toml
index 87bdaf16..10162e77 100644
--- a/hatch.toml
+++ b/hatch.toml
@@ -95,6 +95,7 @@ clojure = ["bash openllm-contrib/clojure/run-clojure-ui.sh"]
 [envs.ci]
 detached = true
 [envs.ci.scripts]
+client-stubs = "bash openllm-client/generate-grpc-stubs"
 compile = "bash ./compile.sh {args}"
 recompile = ["bash ./clean.sh", "compile"]
 edi = "bash local.sh"
diff --git a/openllm-client/README.md b/openllm-client/README.md
index 417dc2e7..c20d4db7 100644
--- a/openllm-client/README.md
+++ b/openllm-client/README.md
@@ -45,12 +45,13 @@ This package holds the underlying client implementation for OpenLLM. If you are
 coming from OpenLLM, the client can be accessed via `openllm.client`.
 
 It provides somewhat of a "similar" APIs to [`bentoml.Client`](https://docs.bentoml.com/en/latest/guides/client.html)
-(via `openllm_client.benmin`) for interacting with OpenLLM server. This can also be extended to use with general
+(via `openllm_client.min`) for interacting with OpenLLM server. This can also be extended to use with general
 BentoML server as well.
 
 > [!NOTE]
-> The component of interop with generic BentoML server will be considered as experimental that will/can be merged back to BentoML.
-> If you are just using this package for interacting with OpenLLM server, nothing should change from `openllm.client` namespace.
+> The interop component for generic BentoML servers is considered _EXPERIMENTAL_ and
+> will be refactored into the new client implementation soon!
+> If you are only using this package to interact with an OpenLLM server, the API should be the same as the `openllm.client` namespace.
 
 ```python
 import openllm
diff --git a/openllm-client/dev.Dockerfile b/openllm-client/dev.Dockerfile
new file mode 100644
index 00000000..211e1fc0
--- /dev/null
+++ b/openllm-client/dev.Dockerfile
@@ -0,0 +1,98 @@
+# syntax=docker/dockerfile-upstream:master
+
+FROM python:3.10-slim as base
+
+ENV DEBIAN_FRONTEND=noninteractive
+
+WORKDIR /workspace
+
+RUN --mount=type=cache,target=/var/lib/apt \
+    --mount=type=cache,target=/var/cache/apt \
+    apt-get update && \
+    apt-get install -q -y --no-install-recommends --allow-remove-essential \
+        bash build-essential ca-certificates git tree
+
+FROM base as protobuf-3
+
+COPY <<-EOT requirements.txt
+    protobuf>=3.5.0,<4.0dev
+    grpcio-tools
+    mypy-protobuf
+EOT
+
+RUN --mount=type=cache,target=/root/.cache/pip pip install -r requirements.txt
+
+FROM base as protobuf-4
+
+COPY <<-EOT requirements.txt
+    protobuf>=4.0,<5.0dev
+    grpcio-tools
+    mypy-protobuf
+EOT
+
+RUN --mount=type=cache,target=/root/.cache/pip pip install -r requirements.txt
+
+############################################
+
+# BentoML gRPC protobuf 3 generation
+
+FROM protobuf-3 as run-grpcio-tools-3
+
+ARG PROTOCOL_VERSION
+ARG GENERATED_PB3_DIR
+
+RUN mkdir -p /result/${GENERATED_PB3_DIR}
+
+RUN --mount=type=bind,target=.,rw <<EOT
+set -ex
+
+mkdir -p ${GENERATED_PB3_DIR}
+
+python -m grpc_tools.protoc \
+    -Iprotos  --grpc_python_out=${GENERATED_PB3_DIR} --python_out=${GENERATED_PB3_DIR} \
+    --mypy_out=${GENERATED_PB3_DIR} --mypy_grpc_out=${GENERATED_PB3_DIR} \
+    protos/service.proto
+
+mv ${GENERATED_PB3_DIR}/* /result/${GENERATED_PB3_DIR}
+touch /result/${GENERATED_PB3_DIR}/__init__.py
+rm -rf /result/${GENERATED_PB3_DIR}/${PROTOCOL_VERSION}
+
+EOT
+
+FROM scratch as protobuf-3-output
+
+ARG GENERATED_PB3_DIR
+
+COPY --from=run-grpcio-tools-3 /result/${GENERATED_PB3_DIR} /
+
+############################################
+
+# BentoML gRPC protobuf 4 generation
+
+FROM protobuf-4 as run-grpcio-tools-4
+
+ARG PROTOCOL_VERSION
+ARG GENERATED_PB4_DIR
+
+RUN mkdir -p /result/${GENERATED_PB4_DIR}
+
+RUN --mount=type=bind,target=.,rw <<EOT
+set -ex
+
+mkdir -p ${GENERATED_PB4_DIR}
+
+python -m grpc_tools.protoc \
+    -Iprotos --grpc_python_out=${GENERATED_PB4_DIR} --python_out=${GENERATED_PB4_DIR} \
+    --mypy_out=${GENERATED_PB4_DIR} --mypy_grpc_out=${GENERATED_PB4_DIR} \
+    protos/service.proto
+
+mv ${GENERATED_PB4_DIR}/* /result/${GENERATED_PB4_DIR}
+touch /result/${GENERATED_PB4_DIR}/__init__.py
+rm -rf /result/${GENERATED_PB4_DIR}/${PROTOCOL_VERSION}
+EOT
+
+FROM scratch as protobuf-4-output
+
+ARG GENERATED_PB4_DIR
+
+COPY --from=run-grpcio-tools-4 /result/${GENERATED_PB4_DIR} /
diff --git a/openllm-client/generate-grpc-stubs b/openllm-client/generate-grpc-stubs
new file mode 100755
index 00000000..57a2617c
--- /dev/null
+++ b/openllm-client/generate-grpc-stubs
@@ -0,0 +1,66 @@
+#!/usr/bin/env bash
+#
+export BUILDX_NO_DEFAULT_LOAD=true
+
+: ${DEBUG=}
+
+progress=""
+if [ "$DEBUG" = "true" ]; then
+    progress="--progress=plain"
+fi
+
+buildxCmd() { # Run buildx with the given arguments, via the docker plugin or a standalone `buildx` binary.
+    if docker buildx version > /dev/null 2>&1; then # probe for the `docker buildx` plugin, discarding its output
+        set -x # turn on command tracing; it is never switched off, so the rest of the script is traced too
+        docker buildx "$@" $progress # $progress is unquoted on purpose: an empty value adds no argument
+    elif buildx version > /dev/null 2>&1; then # otherwise fall back to a standalone `buildx` executable
+        buildx "$@" $progress
+    else
+        echo "Make sure to have Docker Buildx installed."
+        exit 1 # neither probe succeeded; this terminates the whole script, not just the function
+    fi
+}
+
+GIT_ROOT=$(git rev-parse --show-toplevel)
+
+cd "$GIT_ROOT/openllm-client" || exit 1
+
+main() { # Regenerate the protobuf 3 and protobuf 4 stubs for one protocol version, then write the dispatch shims.
+    VERSION="${1:-v1}" # protocol version; defaults to v1 when no argument is passed
+    echo "Generating gRPC stubs for $VERSION..."
+
+    mkdir -p "src/openllm_client/pb/${VERSION}"
+
+    GENERATED_PB3_DIR="src/openllm_client/pb/${VERSION}/_generated_pb3"
+    \rm -rf "$GENERATED_PB3_DIR" # leading backslash bypasses any `rm` alias; removes the previous output first
+    buildxCmd build --build-arg PROTOCOL_VERSION="$VERSION" \
+        --build-arg BUILDKIT_CONTEXT_KEEP_GIT_DIR=1 \
+        --build-arg GENERATED_PB3_DIR="${GENERATED_PB3_DIR}" \
+        --target "protobuf-3-output" --output "type=local,dest=${GENERATED_PB3_DIR}" --file "dev.Dockerfile" . # local output into the pb3 directory
+
+    GENERATED_PB4_DIR="src/openllm_client/pb/${VERSION}/_generated_pb4"
+    \rm -rf "$GENERATED_PB4_DIR" # same clean-then-build sequence as the protobuf 3 step above
+    buildxCmd build --build-arg PROTOCOL_VERSION="$VERSION" \
+        --build-arg BUILDKIT_CONTEXT_KEEP_GIT_DIR=1 \
+        --build-arg GENERATED_PB4_DIR="${GENERATED_PB4_DIR}" \
+        --target "protobuf-4-output" --output "type=local,dest=${GENERATED_PB4_DIR}" --file "dev.Dockerfile" . # local output into the pb4 directory
+    touch "src/openllm_client/pb/${VERSION}/__init__.py" # create the package marker if it does not exist yet
+    files=("service_pb2.py" "service_pb2.pyi" "service_pb2_grpc.py" "service_pb2_grpc.pyi")
+    for file in "${files[@]}";do # write one shim file per generated module/stub name
+      module="${file%.*}" # strip the final extension (.py or .pyi) to get the module name
+      cat <<EOF> "src/openllm_client/pb/${VERSION}/${file}" # unquoted EOF: the shell expands $module in the heredoc
+from __future__ import annotations
+from google.protobuf import __version__
+if __version__.startswith("4"):
+    from ._generated_pb4.$module import *
+else:
+    from ._generated_pb3.$module import *
+EOF
+    done
+}
+
+if [ "${#}" -gt 1 ]; then
+    echo "$0 takes one optional argument. Usage: $0 [v1]"
+    exit 1
+fi
+main "$@"
diff --git a/openllm-client/protos/service.proto b/openllm-client/protos/service.proto
new file mode 100644
index 00000000..be24c8ec
--- /dev/null
+++ b/openllm-client/protos/service.proto
@@ -0,0 +1,297 @@
+// Vendored from: https://github.com/bentoml/BentoML/blob/main/src/bentoml/grpc/v1/service.proto
+syntax = "proto3";
+
+package bentoml.grpc.v1;
+
+import "google/protobuf/struct.proto";
+import "google/protobuf/wrappers.proto";
+
+// cc_enable_arenas pre-allocate memory for given message to improve speed. (C++ only)
+option cc_enable_arenas = true;
+option go_package = "github.com/bentoml/bentoml/grpc/v1;service";
+option java_multiple_files = true;
+option java_outer_classname = "ServiceProto";
+option java_package = "com.bentoml.grpc.v1";
+option objc_class_prefix = "SVC";
+option py_generic_services = true;
+
+// a gRPC BentoServer.
+service BentoService {
+  // Call handles methodcaller of given API entrypoint.
+  rpc Call(Request) returns (Response) {}
+  // ServiceMetadata returns metadata of bentoml.Service.
+  rpc ServiceMetadata(ServiceMetadataRequest) returns (ServiceMetadataResponse) {}
+}
+
+// ServiceMetadataRequest message doesn't take any arguments.
+message ServiceMetadataRequest {}
+
+// ServiceMetadataResponse returns metadata of bentoml.Service.
+// Currently it includes name, apis, and docs.
+message ServiceMetadataResponse {
+  // DescriptorMetadata is a metadata of any given IODescriptor.
+  message DescriptorMetadata {
+    // descriptor_id describes the given ID of the descriptor, which matches with our OpenAPI definition.
+    optional string descriptor_id = 1;
+
+    // attributes is the kwargs of the given descriptor.
+    google.protobuf.Struct attributes = 2;
+  }
+  // InferenceAPI is bentoml._internal.service.inference_api.InferenceAPI
+  // that is exposed to gRPC client.
+  // There is no way for reflection to get information of given @svc.api.
+  message InferenceAPI {
+    // name is the name of the API.
+    string name = 1;
+    // input is the input descriptor of the API.
+    optional DescriptorMetadata input = 2;
+    // output is the output descriptor of the API.
+    optional DescriptorMetadata output = 3;
+    // docs is the optional documentation of the API.
+    optional string docs = 4;
+  }
+  // name is the service name.
+  string name = 1;
+  // apis holds a list of InferenceAPI of the service.
+  repeated InferenceAPI apis = 2;
+  // docs is the documentation of the service.
+  string docs = 3;
+}
+
+// Request message for incoming Call.
+message Request {
+  // api_name defines the API entrypoint to call.
+  // api_name is the name of the function defined in bentoml.Service.
+  // Example:
+  //
+  //     @svc.api(input=NumpyNdarray(), output=File())
+  //     def predict(input: NDArray[float]) -> bytes:
+  //         ...
+  //
+  //     api_name is "predict" in this case.
+  string api_name = 1;
+
+  oneof content {
+    // NDArray represents a n-dimensional array of arbitrary type.
+    NDArray ndarray = 3;
+
+    // DataFrame represents any tabular data type. We are using
+    // DataFrame as a trivial representation for tabular type.
+    DataFrame dataframe = 5;
+
+    // Series portrays a series of values. This can be used for
+    // representing Series types in tabular data.
+    Series series = 6;
+
+    // File represents any arbitrary file type. This can be
+    // plaintext, image, video, audio, etc.
+    File file = 7;
+
+    // Text represents a string input.
+    google.protobuf.StringValue text = 8;
+
+    // JSON is represented by using google.protobuf.Value.
+    // see https://github.com/protocolbuffers/protobuf/blob/main/src/google/protobuf/struct.proto
+    google.protobuf.Value json = 9;
+
+    // Multipart represents a multipart message.
+    // It comprises a mapping from given type name to a subset of aforementioned types.
+    Multipart multipart = 10;
+
+    // serialized_bytes is for data serialized in BentoML's internal serialization format.
+    bytes serialized_bytes = 2;
+  }
+
+  // Tensor is similar to ndarray but with a name.
+  // We are reserving it for now for future use.
+  // repeated Tensor tensors = 4;
+  reserved 4, 11 to 13;
+}
+
+// Response message returned from Call.
+message Response {
+  oneof content {
+    // NDArray represents a n-dimensional array of arbitrary type.
+    NDArray ndarray = 1;
+
+    // DataFrame represents any tabular data type. We are using
+    // DataFrame as a trivial representation for tabular type.
+    DataFrame dataframe = 3;
+
+    // Series portrays a series of values. This can be used for
+    // representing Series types in tabular data.
+    Series series = 5;
+
+    // File represents any arbitrary file type. This can be
+    // plaintext, image, video, audio, etc.
+    File file = 6;
+
+    // Text represents a string value.
+    google.protobuf.StringValue text = 7;
+
+    // JSON is represented by using google.protobuf.Value.
+    // see https://github.com/protocolbuffers/protobuf/blob/main/src/google/protobuf/struct.proto
+    google.protobuf.Value json = 8;
+
+    // Multipart represents a multipart message.
+    // It comprises a mapping from given type name to a subset of aforementioned types.
+    Multipart multipart = 9;
+
+    // serialized_bytes is for data serialized in BentoML's internal serialization format.
+    bytes serialized_bytes = 2;
+  }
+  // Tensor is similar to ndarray but with a name.
+  // We are reserving it for now for future use.
+  // repeated Tensor tensors = 4;
+  reserved 4, 10 to 13;
+}
+
+// Part represents possible value types for multipart message.
+// These are the same as the types in Request message.
+message Part {
+  oneof representation {
+    // NDArray represents a n-dimensional array of arbitrary type.
+    NDArray ndarray = 1;
+
+    // DataFrame represents any tabular data type. We are using
+    // DataFrame as a trivial representation for tabular type.
+    DataFrame dataframe = 3;
+
+    // Series portrays a series of values. This can be used for
+    // representing Series types in tabular data.
+    Series series = 5;
+
+    // File represents any arbitrary file type. This can be
+    // plaintext, image, video, audio, etc.
+    File file = 6;
+
+    // Text represents a string value.
+    google.protobuf.StringValue text = 7;
+
+    // JSON is represented by using google.protobuf.Value.
+    // see https://github.com/protocolbuffers/protobuf/blob/main/src/google/protobuf/struct.proto
+    google.protobuf.Value json = 8;
+
+    // serialized_bytes is for data serialized in BentoML's internal serialization format.
+    bytes serialized_bytes = 4;
+  }
+
+  // Tensor is similar to ndarray but with a name.
+  // We are reserving it for now for future use.
+  // Tensor tensors = 4;
+  reserved 2, 9 to 13;
+}
+
+// Multipart represents a multipart message.
+// It comprises a mapping from given type name to a subset of aforementioned types.
+message Multipart {
+  map<string, Part> fields = 1;
+}
+
+// File represents any arbitrary file type. This can be
+// plaintext, image, video, audio, etc.
+message File {
+  // optional file type, let it be csv, text, parquet, etc.
+  // v1alpha1 uses 1 as FileType enum.
+  optional string kind = 3;
+  // contents of file as bytes.
+  bytes content = 2;
+}
+
+// DataFrame represents any tabular data type. We are using
+// DataFrame as a trivial representation for tabular type.
+// This message carries given implementation of tabular data based on given orientation.
+// TODO: support index, records, etc.
+message DataFrame {
+  // columns name
+  repeated string column_names = 1;
+
+  // columns orient.
+  // { column ↠ { index ↠ value } }
+  repeated Series columns = 2;
+}
+
+// Series portrays a series of values. This can be used for
+// representing Series types in tabular data.
+message Series {
+  // A bool parameter value
+  repeated bool bool_values = 1 [packed = true];
+
+  // A float parameter value
+  repeated float float_values = 2 [packed = true];
+
+  // An int32 parameter value
+  repeated int32 int32_values = 3 [packed = true];
+
+  // An int64 parameter value
+  repeated int64 int64_values = 6 [packed = true];
+
+  // A string parameter value
+  repeated string string_values = 5;
+
+  // represents a double parameter value.
+  repeated double double_values = 4 [packed = true];
+}
+
+// NDArray represents a n-dimensional array of arbitrary type.
+message NDArray {
+  // Represents data type of a given array.
+  enum DType {
+    // Represents a None type.
+    DTYPE_UNSPECIFIED = 0;
+
+    // Represents a float type.
+    DTYPE_FLOAT = 1;
+
+    // Represents a double type.
+    DTYPE_DOUBLE = 2;
+
+    // Represents a bool type.
+    DTYPE_BOOL = 3;
+
+    // Represents an int32 type.
+    DTYPE_INT32 = 4;
+
+    // Represents an int64 type.
+    DTYPE_INT64 = 5;
+
+    // Represents a uint32 type.
+    DTYPE_UINT32 = 6;
+
+    // Represents a uint64 type.
+    DTYPE_UINT64 = 7;
+
+    // Represents a string type.
+    DTYPE_STRING = 8;
+  }
+
+  // DTYPE is the data type of given array
+  DType dtype = 1;
+
+  // shape is the shape of given array.
+  repeated int32 shape = 2;
+
+  // represents a string parameter value.
+  repeated string string_values = 5;
+
+  // represents a float parameter value.
+  repeated float float_values = 3 [packed = true];
+
+  // represents a double parameter value.
+  repeated double double_values = 4 [packed = true];
+
+  // represents a bool parameter value.
+  repeated bool bool_values = 6 [packed = true];
+
+  // represents an int32 parameter value.
+  repeated int32 int32_values = 7 [packed = true];
+
+  // represents an int64 parameter value.
+  repeated int64 int64_values = 8 [packed = true];
+
+  // represents a uint32 parameter value.
+  repeated uint32 uint32_values = 9 [packed = true];
+
+  // represents a uint64 parameter value.
+  repeated uint64 uint64_values = 10 [packed = true];
+}
diff --git a/openllm-client/pyproject.toml b/openllm-client/pyproject.toml
index 573f73cf..988573ec 100644
--- a/openllm-client/pyproject.toml
+++ b/openllm-client/pyproject.toml
@@ -57,7 +57,7 @@ keywords = [
   "PyTorch",
   "Transformers",
 ]
-dependencies = ["openllm-core", "httpx"]
+dependencies = ["orjson", "httpx", "attrs>=23.1.0", "cattrs>=23.1.0"]
 license = "Apache-2.0"
 name = "openllm-client"
 requires-python = ">=3.8"
@@ -72,7 +72,7 @@ Tracker = "https://github.com/bentoml/OpenLLM/issues"
 Twitter = "https://twitter.com/bentomlai"
 [project.optional-dependencies]
 full = ["openllm-client[grpc,agents]"]
-grpc = ["bentoml[grpc]>=1.0.25"]
+grpc = ["bentoml[grpc]>=1.1.6"]
 agents = ["transformers[agents]>=4.30", "diffusers", "soundfile"]
 
 [tool.hatch.version]
@@ -97,7 +97,13 @@ allow-direct-references = true
 only-include = ["src/openllm_client"]
 sources = ["src"]
 [tool.hatch.build.targets.sdist]
-exclude = ["/.git_archival.txt", "tests", "/.python-version-default"]
+exclude = [
+  "/.git_archival.txt",
+  "tests",
+  "/.python-version-default",
+  "/generate-grpc-stubs",
+  "/dev.Dockerfile",
+]
 [tool.hatch.build.targets.wheel.hooks.mypyc]
 dependencies = [
   "hatch-mypyc==0.16.0",
diff --git a/openllm-client/src/openllm_client/__init__.py b/openllm-client/src/openllm_client/__init__.py
index 1bb51e3d..1147e6e7 100644
--- a/openllm-client/src/openllm_client/__init__.py
+++ b/openllm-client/src/openllm_client/__init__.py
@@ -1,9 +1,7 @@
 from __future__ import annotations
 
-from . import benmin as benmin
-from ._base import BaseAsyncClient as BaseAsyncClient
-from ._base import BaseClient as BaseClient
-from .client import AsyncGrpcClient as AsyncGrpcClient
-from .client import AsyncHTTPClient as AsyncHTTPClient
-from .client import GrpcClient as GrpcClient
-from .client import HTTPClient as HTTPClient
+from ._http import AsyncHTTPClient as AsyncHTTPClient
+from ._http import HTTPClient as HTTPClient
+
+# from ._grpc import GrpcClient as GrpcClient
+# from ._grpc import AsyncGrpcClient as AsyncGrpcClient
diff --git a/openllm-client/src/openllm_client/_base.py b/openllm-client/src/openllm_client/_base.py
deleted file mode 100644
index 268c0176..00000000
--- a/openllm-client/src/openllm_client/_base.py
+++ /dev/null
@@ -1,298 +0,0 @@
-# mypy: disable-error-code="override,no-redef"
-from __future__ import annotations
-import abc
-import functools
-import logging
-import typing as t
-
-from http import HTTPStatus
-from urllib.parse import urljoin
-
-import attr
-import httpx
-import orjson
-
-import openllm_core
-
-from openllm_core._typing_compat import LiteralString
-from openllm_core._typing_compat import overload
-from openllm_core.utils import bentoml_cattr
-from openllm_core.utils import ensure_exec_coro
-from openllm_core.utils import is_transformers_available
-
-from .benmin import AsyncClient as AsyncBentoClient
-from .benmin import Client as BentoClient
-
-if t.TYPE_CHECKING:
-  import transformers
-
-  from openllm_core._typing_compat import DictStrAny
-  from openllm_core._typing_compat import LiteralBackend
-
-logger = logging.getLogger(__name__)
-
-@attr.define(slots=False, init=False)
-class _ClientAttr:
-  _address: str
-  _timeout: float = attr.field(default=30)
-  _api_version: str = attr.field(default='v1')
-
-  def __init__(self, address: str, timeout: float = 30, api_version: str = 'v1'):
-    self.__attrs_init__(address, timeout, api_version)
-
-  @abc.abstractmethod
-  def call(self, api_name: str, *args: t.Any, **attrs: t.Any) -> t.Any:
-    raise NotImplementedError
-
-  @abc.abstractmethod
-  def _run_hf_agent(self, *args: t.Any, **kwargs: t.Any) -> t.Any:
-    raise NotImplementedError
-
-  @overload
-  @abc.abstractmethod
-  def query(self, prompt: str, *, return_response: t.Literal['processed'], **attrs: t.Any) -> str:
-    ...
-
-  @overload
-  @abc.abstractmethod
-  def query(self, prompt: str, *, return_response: t.Literal['raw'], **attrs: t.Any) -> DictStrAny:
-    ...
-
-  @overload
-  @abc.abstractmethod
-  def query(self, prompt: str, *, return_response: t.Literal['attrs'], **attrs: t.Any) -> openllm_core.GenerationOutput:
-    ...
-
-  @abc.abstractmethod
-  def query(self, prompt: str, return_response: t.Literal['attrs', 'raw', 'processed'] = 'processed', **attrs: t.Any) -> t.Any:
-    raise NotImplementedError
-
-  # NOTE: Scikit interface
-  @overload
-  @abc.abstractmethod
-  def predict(self, prompt: str, *, return_response: t.Literal['processed'], **attrs: t.Any) -> str:
-    ...
-
-  @overload
-  @abc.abstractmethod
-  def predict(self, prompt: str, *, return_response: t.Literal['raw'], **attrs: t.Any) -> DictStrAny:
-    ...
-
-  @overload
-  @abc.abstractmethod
-  def predict(self, prompt: str, *, return_response: t.Literal['attrs'], **attrs: t.Any) -> openllm_core.GenerationOutput:
-    ...
-
-  @abc.abstractmethod
-  def predict(self, prompt: str, **attrs: t.Any) -> t.Any:
-    raise NotImplementedError
-
-  @functools.cached_property
-  def _hf_agent(self) -> transformers.HfAgent:
-    if not is_transformers_available():
-      raise RuntimeError("transformers is required to use HF agent. Install with 'pip install \"openllm-client[agents]\"'.")
-    if not self.supports_hf_agent:
-      raise RuntimeError(f'{self.model_name} ({self.backend}) does not support running HF agent.')
-    import transformers
-    return transformers.HfAgent(urljoin(self._address, '/hf/agent'))
-
-  @property
-  def _metadata(self) -> t.Any:
-    return self.call('metadata')
-
-  @property
-  def model_name(self) -> str:
-    try:
-      return self._metadata['model_name']
-    except KeyError:
-      raise RuntimeError('Malformed service endpoint. (Possible malicious)') from None
-
-  @property
-  def model_id(self) -> str:
-    try:
-      return self._metadata['model_id']
-    except KeyError:
-      raise RuntimeError('Malformed service endpoint. (Possible malicious)') from None
-
-  @property
-  def backend(self) -> LiteralBackend:
-    try:
-      return self._metadata['backend']
-    except KeyError:
-      raise RuntimeError('Malformed service endpoint. (Possible malicious)') from None
-
-  @property
-  def timeout(self) -> int:
-    try:
-      return self._metadata['timeout']
-    except KeyError:
-      raise RuntimeError('Malformed service endpoint. (Possible malicious)') from None
-
-  @property
-  def configuration(self) -> dict[str, t.Any]:
-    try:
-      return orjson.loads(self._metadata['configuration'])
-    except KeyError:
-      raise RuntimeError('Malformed service endpoint. (Possible malicious)') from None
-
-  @property
-  def supports_embeddings(self) -> bool:
-    try:
-      return self._metadata.get('supports_embeddings', False)
-    except KeyError:
-      raise RuntimeError('Malformed service endpoint. (Possible malicious)') from None
-
-  @property
-  def supports_hf_agent(self) -> bool:
-    try:
-      return self._metadata.get('supports_hf_agent', False)
-    except KeyError:
-      raise RuntimeError('Malformed service endpoint. (Possible malicious)') from None
-
-  @property
-  def config(self) -> openllm_core.LLMConfig:
-    return openllm_core.AutoConfig.for_model(self.model_name).model_construct_env(**self.configuration)
-
-  @functools.cached_property
-  def inner(self) -> t.Any:
-    raise NotImplementedError("'inner' client is not implemented.")
-
-class _Client(_ClientAttr):
-  _host: str
-  _port: str
-
-  def call(self, api_name: str, *args: t.Any, **attrs: t.Any) -> t.Any:
-    return self.inner.call(f'{api_name}_{self._api_version}', *args, **attrs)
-
-  def health(self) -> t.Any:
-    return self.inner.health()
-
-  @functools.cached_property
-  def inner(self) -> BentoClient:
-    BentoClient.wait_until_server_ready(self._address, timeout=self._timeout)
-    return BentoClient.from_url(self._address)
-
-  # Agent integration
-  def ask_agent(self, task: str, *, return_code: bool = False, remote: bool = False, agent_type: LiteralString = 'hf', **attrs: t.Any) -> t.Any:
-    if agent_type == 'hf': return self._run_hf_agent(task, return_code=return_code, remote=remote, **attrs)
-    else: raise RuntimeError(f"Unknown 'agent_type={agent_type}'")
-
-  def _run_hf_agent(self, *args: t.Any, **kwargs: t.Any) -> t.Any:
-    if len(args) > 1: raise ValueError("'args' should only take one positional argument.")
-    task = kwargs.pop('task', args[0])
-    return_code = kwargs.pop('return_code', False)
-    remote = kwargs.pop('remote', False)
-    try:
-      return self._hf_agent.run(task, return_code=return_code, remote=remote, **kwargs)
-    except Exception as err:
-      logger.error('Exception caught while sending instruction to HF agent: %s', err, exc_info=err)
-      logger.info("Tip: LLMServer at '%s' might not support 'generate_one'.", self._address)
-
-class _AsyncClient(_ClientAttr):
-  _host: str
-  _port: str
-
-  def __init__(self, address: str, timeout: float = 30):
-    self._address, self._timeout = address, timeout
-
-  async def call(self, api_name: str, *args: t.Any, **attrs: t.Any) -> t.Any:
-    return await self.inner.call(f'{api_name}_{self._api_version}', *args, **attrs)
-
-  async def health(self) -> t.Any:
-    return await self.inner.health()
-
-  @functools.cached_property
-  def inner(self) -> AsyncBentoClient:
-    ensure_exec_coro(AsyncBentoClient.wait_until_server_ready(self._address, timeout=self._timeout))
-    return ensure_exec_coro(AsyncBentoClient.from_url(self._address))
-
-  # Agent integration
-  async def ask_agent(self, task: str, *, return_code: bool = False, remote: bool = False, agent_type: LiteralString = 'hf', **attrs: t.Any) -> t.Any:
-    if agent_type == 'hf': return await self._run_hf_agent(task, return_code=return_code, remote=remote, **attrs)
-    else: raise RuntimeError(f"Unknown 'agent_type={agent_type}'")
-
-  async def _run_hf_agent(self, *args: t.Any, **kwargs: t.Any) -> t.Any:
-    if len(args) > 1: raise ValueError("'args' should only take one positional argument.")
-    from transformers.tools.agents import clean_code_for_run
-    from transformers.tools.agents import get_tool_creation_code
-    from transformers.tools.agents import resolve_tools
-    from transformers.tools.python_interpreter import evaluate
-
-    task = kwargs.pop('task', args[0])
-    return_code = kwargs.pop('return_code', False)
-    remote = kwargs.pop('remote', False)
-    stop = ['Task:']
-    prompt = t.cast(str, self._hf_agent.format_prompt(task))
-    async with httpx.AsyncClient(timeout=httpx.Timeout(self.timeout)) as client:
-      response = await client.post(self._hf_agent.url_endpoint, json={'inputs': prompt, 'parameters': {'max_new_tokens': 200, 'return_full_text': False, 'stop': stop}})
-      if response.status_code != HTTPStatus.OK: raise ValueError(f'Error {response.status_code}: {response.json()}')
-
-    result = response.json()[0]['generated_text']
-    # Inference API returns the stop sequence
-    for stop_seq in stop:
-      if result.endswith(stop_seq):
-        result = result[:-len(stop_seq)]
-        break
-    # the below have the same logic as agent.run API
-    explanation, code = clean_code_for_run(result)
-    self._hf_agent.log(f'==Explanation from the agent==\n{explanation}')
-    self._hf_agent.log(f'\n\n==Code generated by the agent==\n{code}')
-    if not return_code:
-      self._hf_agent.log('\n\n==Result==')
-      self._hf_agent.cached_tools = resolve_tools(code, self._hf_agent.toolbox, remote=remote, cached_tools=self._hf_agent.cached_tools)
-      return evaluate(code, self._hf_agent.cached_tools, state=kwargs.copy())
-    else:
-      tool_code = get_tool_creation_code(code, self._hf_agent.toolbox, remote=remote)
-      return f'{tool_code}\n{code}'
-
-class BaseClient(_Client):
-  def chat(self, prompt: str, history: list[str], **attrs: t.Any) -> str:
-    raise NotImplementedError
-
-  def embed(self, prompt: t.Sequence[str] | str) -> openllm_core.EmbeddingsOutput:
-    return openllm_core.EmbeddingsOutput(**self.call('embeddings', list([prompt] if isinstance(prompt, str) else prompt)))
-
-  def predict(self, prompt: str, **attrs: t.Any) -> openllm_core.GenerationOutput | DictStrAny | str:
-    return self.query(prompt, **attrs)
-
-  def query(self, prompt: str, return_response: t.Literal['attrs', 'raw', 'processed'] = 'processed', **attrs: t.Any) -> t.Any:
-    return_raw_response = attrs.pop('return_raw_response', None)
-    if return_raw_response is not None:
-      logger.warning("'return_raw_response' is now deprecated. Please use 'return_response=\"raw\"' instead.")
-      if return_raw_response is True: return_response = 'raw'
-    return_attrs = attrs.pop('return_attrs', None)
-    if return_attrs is not None:
-      logger.warning("'return_attrs' is now deprecated. Please use 'return_response=\"attrs\"' instead.")
-      if return_attrs is True: return_response = 'attrs'
-    use_default_prompt_template = attrs.pop('use_default_prompt_template', False)
-    prompt, generate_kwargs, postprocess_kwargs = self.config.sanitize_parameters(prompt, use_default_prompt_template=use_default_prompt_template, **attrs)
-    r = openllm_core.GenerationOutput(**self.call('generate', openllm_core.GenerationInput(prompt=prompt, llm_config=self.config.model_construct_env(**generate_kwargs)).model_dump()))
-    if return_response == 'attrs': return r
-    elif return_response == 'raw': return bentoml_cattr.unstructure(r)
-    else: return self.config.postprocess_generate(prompt, r.responses, **postprocess_kwargs)
-
-class BaseAsyncClient(_AsyncClient):
-  async def chat(self, prompt: str, history: list[str], **attrs: t.Any) -> str:
-    raise NotImplementedError
-
-  async def embed(self, prompt: t.Sequence[str] | str) -> openllm_core.EmbeddingsOutput:
-    return openllm_core.EmbeddingsOutput(**(await self.call('embeddings', list([prompt] if isinstance(prompt, str) else prompt))))
-
-  async def predict(self, prompt: str, **attrs: t.Any) -> t.Any:
-    return await self.query(prompt, **attrs)
-
-  async def query(self, prompt: str, return_response: t.Literal['attrs', 'raw', 'processed'] = 'processed', **attrs: t.Any) -> t.Any:
-    return_raw_response = attrs.pop('return_raw_response', None)
-    if return_raw_response is not None:
-      logger.warning("'return_raw_response' is now deprecated. Please use 'return_response=\"raw\"' instead.")
-      if return_raw_response is True: return_response = 'raw'
-    return_attrs = attrs.pop('return_attrs', None)
-    if return_attrs is not None:
-      logger.warning("'return_attrs' is now deprecated. Please use 'return_response=\"attrs\"' instead.")
-      if return_attrs is True: return_response = 'attrs'
-    use_default_prompt_template = attrs.pop('use_default_prompt_template', False)
-    prompt, generate_kwargs, postprocess_kwargs = self.config.sanitize_parameters(prompt, use_default_prompt_template=use_default_prompt_template, **attrs)
-    r = openllm_core.GenerationOutput(**(await self.call('generate', openllm_core.GenerationInput(prompt=prompt, llm_config=self.config.model_construct_env(**generate_kwargs)).model_dump())))
-    if return_response == 'attrs': return r
-    elif return_response == 'raw': return bentoml_cattr.unstructure(r)
-    else: return self.config.postprocess_generate(prompt, r.responses, **postprocess_kwargs)
diff --git a/openllm-client/src/openllm_client/_http.py b/openllm-client/src/openllm_client/_http.py
new file mode 100644
index 00000000..4e6f425a
--- /dev/null
+++ b/openllm-client/src/openllm_client/_http.py
@@ -0,0 +1,137 @@
+from __future__ import annotations
+import typing as t
+
+from urllib.parse import urlparse
+
+import attr
+import httpx
+import orjson
+
+from ._schemas import Request
+from ._schemas import Response
+from ._schemas import StreamResponse
+
+def _address_validator(_: t.Any, attr: attr.Attribute[t.Any], value: str) -> None:  # attrs validator hook: (instance, attribute, value)
+  if not isinstance(value, str): raise TypeError(f'{attr.name} must be a string')  # 'attr' is the attribute argument here, not the attr module
+  if not urlparse(value).netloc: raise ValueError(f'{attr.name} must be a valid URL')  # urlparse found no network-location part
+
+@attr.define
+class HTTPClient:
+  """Blocking HTTP client for an OpenLLM server; server metadata and config are fetched on first use and cached."""
+  address: str = attr.field(validator=_address_validator, converter=lambda addr: addr if '://' in addr else 'http://' + addr)
+  api_version: str = 'v1'
+  timeout: int = 30
+  client_args: t.Dict[str, t.Any] = attr.field(factory=dict)
+  __metadata: dict[str, t.Any] = attr.field(default=None)
+  __config: dict[str, t.Any] = attr.field(default=None)
+  _inner: httpx.Client = attr.field(init=False, repr=False)
+
+  def __attrs_post_init__(self) -> None:
+    # 'client_args' configure the shared client once here; they are not repeated on individual requests.
+    self._inner = httpx.Client(base_url=self.address, timeout=self.timeout, **self.client_args)
+
+  def _metadata(self) -> dict[str, t.Any]:
+    """Return the JSON body of the 'metadata' endpoint, requesting it (POST) only on the first call."""
+    if self.__metadata is None: self.__metadata = self._inner.post(self._build_endpoint('metadata')).json()
+    return self.__metadata
+
+  def _config(self) -> dict[str, t.Any]:
+    """Return the parsed 'configuration' metadata with its 'generation_config' entries merged into the top level."""
+    if self.__config is None:
+      config = orjson.loads(self._metadata()['configuration'])
+      generation_config = config.pop('generation_config')
+      self.__config = {**config, **generation_config}
+    return self.__config
+
+  def health(self):
+    return self._inner.get('/readyz')
+
+  def _build_endpoint(self, endpoint: str):
+    return f'/{self.api_version}/{endpoint}'
+
+  def query(self, prompt: str, **attrs: t.Any) -> Response:
+    """Send one generation request ('attrs' override config values); raise ValueError unless the server answers 200."""
+    req = Request(prompt=self._metadata()['prompt_template'].format(system_message=self._metadata()['system_message'], instruction=prompt), llm_config={**self._config(), **attrs})
+    r = self._inner.post(self._build_endpoint('generate'), json=req.json())
+    # Check the status before decoding: an error response is not guaranteed to carry a JSON body.
+    if r.status_code != 200: raise ValueError(f"Failed to get generation from '{self._build_endpoint('generate')}'. Check server logs for more details.")
+    return Response(**r.json())
+
+  def generate(self, prompt: str, **attrs: t.Any) -> Response:
+    return self.query(prompt, **attrs)
+
+  def generate_stream(self, prompt: str, **attrs: t.Any) -> t.Iterator[StreamResponse]:
+    """Yield the generated text piece by piece as the server streams it; raise ValueError unless the server answers 200."""
+    req = Request(prompt=self._metadata()['prompt_template'].format(system_message=self._metadata()['system_message'], instruction=prompt), llm_config={**self._config(), **attrs})
+    with self._inner.stream('POST', self._build_endpoint('generate_stream'), json=req.json()) as r:
+      if r.status_code != 200: raise ValueError(f"Failed to get generation from '{self._build_endpoint('generate_stream')}'. Check server logs for more details.")
+      # iter_text() decodes incrementally, so a multi-byte character split across two network chunks stays intact.
+      # TODO: make it SSE correct for streaming
+      for payload in r.iter_text():
+        yield StreamResponse(text=payload)
+
+  def __del__(self) -> None:
+    # '_inner' is never assigned when __init__ raised (e.g. address validation) before __attrs_post_init__ ran.
+    inner = getattr(self, '_inner', None)
+    if inner is not None: inner.close()
+
+@attr.define
+class AsyncHTTPClient:
+  """Asyncio HTTP client for an OpenLLM server; server metadata and config are fetched on first use and cached."""
+  address: str = attr.field(validator=_address_validator, converter=lambda addr: addr if '://' in addr else 'http://' + addr)
+  api_version: str = 'v1'
+  timeout: int = 30
+  client_args: t.Dict[str, t.Any] = attr.field(factory=dict)
+  __metadata: dict[str, t.Any] = attr.field(default=None)
+  __config: dict[str, t.Any] = attr.field(default=None)
+  _inner: httpx.AsyncClient = attr.field(init=False, repr=False)
+
+  def __attrs_post_init__(self) -> None:
+    self._inner = httpx.AsyncClient(base_url=self.address, timeout=self.timeout, **self.client_args)
+
+  async def _metadata(self) -> dict[str, t.Any]:
+    """Return the JSON body of the 'metadata' endpoint, requesting it (POST) only on the first call."""
+    if self.__metadata is None: self.__metadata = (await self._inner.post(self._build_endpoint('metadata'))).json()
+    return self.__metadata
+
+  async def _config(self) -> dict[str, t.Any]:
+    """Return the parsed 'configuration' metadata with its 'generation_config' entries merged into the top level."""
+    if self.__config is None:
+      config = orjson.loads((await self._metadata())['configuration'])
+      generation_config = config.pop('generation_config')
+      self.__config = {**config, **generation_config}
+    return self.__config
+
+  async def health(self):
+    return await self._inner.get('/readyz')
+
+  def _build_endpoint(self, endpoint: str):
+    return f'/{self.api_version}/{endpoint}'
+
+  async def query(self, prompt: str, **attrs: t.Any) -> Response:
+    """Send one generation request; raise ValueError unless the server answers 200 (checked before the body is decoded)."""
+    _meta, _config = await self._metadata(), await self._config()
+    req = Request(prompt=_meta['prompt_template'].format(system_message=_meta['system_message'], instruction=prompt), llm_config={**_config, **attrs})
+    # The per-call client is closed on exit so its connection pool is not leaked; 'client_args' are applied once, at construction.
+    async with httpx.AsyncClient(base_url=self.address, timeout=self.timeout, **self.client_args) as client:
+      r = await client.post(self._build_endpoint('generate'), json=req.json())
+    if r.status_code != 200: raise ValueError(f"Failed to get generation from '{self._build_endpoint('generate')}'. Check server logs for more details.")
+    return Response(**r.json())
+
+  async def generate(self, prompt: str, **attrs: t.Any) -> Response:
+    return await self.query(prompt, **attrs)
+
+  async def generate_stream(self, prompt: str, **attrs: t.Any) -> t.AsyncGenerator[StreamResponse, t.Any]:
+    """Yield the generated text piece by piece as the server streams it; raise ValueError unless the server answers 200."""
+    _meta, _config = await self._metadata(), await self._config()
+    req = Request(prompt=_meta['prompt_template'].format(system_message=_meta['system_message'], instruction=prompt), llm_config={**_config, **attrs})
+    async with httpx.AsyncClient(base_url=self.address, timeout=self.timeout, **self.client_args) as client, client.stream('POST', self._build_endpoint('generate_stream'), json=req.json()) as r:
+      if r.status_code != 200: raise ValueError(f"Failed to get generation from '{self._build_endpoint('generate_stream')}'. Check server logs for more details.")
+      # aiter_text() decodes incrementally, so a multi-byte character split across two network chunks stays intact.
+      # TODO: make it SSE correct for streaming
+      async for payload in r.aiter_text():
+        yield StreamResponse(text=payload)
+
+  async def aclose(self) -> None:
+    """Close the shared client that serves the metadata and health requests."""
+    await self._inner.aclose()
diff --git a/openllm-client/src/openllm_client/_schemas.py b/openllm-client/src/openllm_client/_schemas.py
new file mode 100644
index 00000000..32594956
--- /dev/null
+++ b/openllm-client/src/openllm_client/_schemas.py
@@ -0,0 +1,24 @@
+from __future__ import annotations
+import typing as t
+
+import attr
+import cattr
+
+class _Mixin:
+  def json(self) -> dict[str, t.Any]:  # unstructured (plain-dict) form of an attrs instance, as produced by cattr
+    if attr.has(self.__class__): return cattr.unstructure(self)
+    raise TypeError(f'Class {self.__class__} must be attr class')
+
+@attr.define
+class Request(_Mixin):  # body the HTTP clients send to the generate endpoints (serialised with .json())
+  prompt: str  # the clients pass the server's prompt template already formatted with the user instruction
+  llm_config: t.Dict[str, t.Any]  # the clients pass the server config merged with per-call keyword overrides
+
+@attr.define
+class Response(_Mixin):  # built by the HTTP clients from the JSON body returned by the 'generate' endpoint
+  responses: t.List[str]  # presumably the generated texts - confirm against the server schema
+  configuration: t.Dict[str, t.Any]  # presumably the config the server used for this generation - confirm
+
+@attr.define
+class StreamResponse(_Mixin):  # one item yielded by the HTTP clients' generate_stream
+  text: str  # decoded text of one streamed chunk
diff --git a/openllm-client/src/openllm_client/benmin/__init__.py b/openllm-client/src/openllm_client/benmin/__init__.py
deleted file mode 100644
index a731696f..00000000
--- a/openllm-client/src/openllm_client/benmin/__init__.py
+++ /dev/null
@@ -1,124 +0,0 @@
-"""This holds a simple client implementation, somewhat similar to `bentoml.client`.
-
-This module is subjected to change and to be merged upstream to BentoML.
-
-```python
-import openllm_client
-
-client = openllm_client.benmin.Client.from_url("http://localhost:3000")
-```
-
-The client implementation won't include a dynamic assignment of the service endpoints, rather this should be called
-via `client.call` or `await client.call`.
-"""
-from __future__ import annotations
-import typing as t
-
-from abc import abstractmethod
-
-import attr
-import httpx
-
-import bentoml
-
-if t.TYPE_CHECKING:
-  from bentoml._internal.service.inference_api import InferenceAPI
-
-__all__ = ['Client', 'AsyncClient']
-
-@attr.define(init=False)
-class Client:
-  server_url: str
-  endpoints: t.List[str]
-  svc: bentoml.Service
-  timeout: int = attr.field(default=30)
-
-  def __init__(self, server_url: str, svc: bentoml.Service, **kwargs: t.Any) -> None:
-    if len(svc.apis) == 0: raise bentoml.exceptions.BentoMLException('No APIs was found while constructing clients.')
-    self.__attrs_init__(server_url=server_url, endpoints=list(svc.apis), svc=svc)
-    for it, val in kwargs.items():
-      object.__setattr__(self, it, val)
-
-  def call(self, bentoml_api_name: str, data: t.Any = None, **kwargs: t.Any) -> t.Any:
-    return self._call(data, _inference_api=self.svc.apis[bentoml_api_name], **kwargs)
-
-  @abstractmethod
-  def _call(self, data: t.Any, /, *, _inference_api: InferenceAPI[t.Any], **kwargs: t.Any) -> t.Any:
-    raise NotImplementedError
-
-  @abstractmethod
-  def health(self) -> t.Any:
-    raise NotImplementedError
-
-  @classmethod
-  def from_url(cls, url: str, **kwargs: t.Any) -> Client:
-    try:
-      from ._http import HttpClient
-      return HttpClient.from_url(url, **kwargs)
-    except httpx.RemoteProtocolError:
-      from ._grpc import GrpcClient
-      return GrpcClient.from_url(url, **kwargs)
-    except Exception as err:
-      raise bentoml.exceptions.BentoMLException('Failed to create client from url: %s' % url) from err
-
-  @staticmethod
-  def wait_until_server_ready(server: str, port: int | None = None, timeout: float = 30, **kwargs: t.Any) -> None:
-    try:
-      from ._http import HttpClient
-      return HttpClient.wait_until_server_ready(server, port, timeout, **kwargs)
-    except httpx.RemoteProtocolError:
-      if port is None:
-        raise
-      from ._grpc import GrpcClient
-      return GrpcClient.wait_until_server_ready(server, port, timeout, **kwargs)
-    except Exception as err:
-      if port is not None:
-        raise bentoml.exceptions.BentoMLException('Failed to wait until server ready: %s:%d' % (server, port)) from err
-      else:
-        raise bentoml.exceptions.BentoMLException('Failed to wait until server ready: %s' % (server)) from err
-
-@attr.define(init=False)
-class AsyncClient:
-  server_url: str
-  endpoints: t.List[str]
-  svc: bentoml.Service
-  timeout: int = attr.field(default=30)
-
-  def __init__(self, server_url: str, svc: bentoml.Service, **kwargs: t.Any) -> None:
-    if len(svc.apis) == 0: raise bentoml.exceptions.BentoMLException('No APIs was found while constructing clients.')
-    self.__attrs_init__(server_url=server_url, endpoints=list(svc.apis), svc=svc)
-    for it, val in kwargs.items():
-      object.__setattr__(self, it, val)
-
-  async def call(self, bentoml_api_name: str, data: t.Any = None, **kwargs: t.Any) -> t.Any:
-    return await self._call(data, _inference_api=self.svc.apis[bentoml_api_name], **kwargs)
-
-  @abstractmethod
-  async def _call(self, data: t.Any, /, *, _inference_api: InferenceAPI[t.Any], **kwargs: t.Any) -> t.Any:
-    raise NotImplementedError
-
-  @abstractmethod
-  async def health(self) -> t.Any:
-    raise NotImplementedError
-
-  @classmethod
-  async def from_url(cls, url: str, **kwargs: t.Any) -> AsyncClient:
-    try:
-      from ._http import AsyncHttpClient
-      return await AsyncHttpClient.from_url(url, **kwargs)
-    except httpx.RemoteProtocolError:
-      from ._grpc import AsyncGrpcClient
-      return await AsyncGrpcClient.from_url(url, **kwargs)
-    except Exception as err:
-      raise bentoml.exceptions.BentoMLException('Failed to create client from url: %s' % url) from err
-
-  @staticmethod
-  async def wait_until_server_ready(host: str, port: int, timeout: float = 30, **kwargs: t.Any) -> None:
-    try:
-      from ._http import AsyncHttpClient
-      await AsyncHttpClient.wait_until_server_ready(host, port, timeout, **kwargs)
-    except httpx.RemoteProtocolError:
-      from ._grpc import AsyncGrpcClient
-      await AsyncGrpcClient.wait_until_server_ready(host, port, timeout, **kwargs)
-    except Exception as err:
-      raise bentoml.exceptions.BentoMLException('Failed to wait until server ready: %s:%d' % (host, port)) from err
diff --git a/openllm-client/src/openllm_client/benmin/_grpc.py b/openllm-client/src/openllm_client/benmin/_grpc.py
deleted file mode 100644
index 9d381d4a..00000000
--- a/openllm-client/src/openllm_client/benmin/_grpc.py
+++ /dev/null
@@ -1,291 +0,0 @@
-# mypy: disable-error-code="no-redef"
-from __future__ import annotations
-import functools
-import logging
-import time
-import typing as t
-
-import bentoml
-
-from bentoml._internal.service.inference_api import InferenceAPI
-from bentoml.grpc.utils import import_generated_stubs
-from bentoml.grpc.utils import load_from_file
-from openllm_client.benmin import AsyncClient
-from openllm_client.benmin import Client
-from openllm_core._typing_compat import NotRequired
-from openllm_core._typing_compat import overload
-from openllm_core.utils import ensure_exec_coro
-from openllm_core.utils import is_grpc_available
-from openllm_core.utils import is_grpc_health_available
-
-if not is_grpc_available() or not is_grpc_health_available():
-  raise ImportError("gRPC is required to use gRPC client. Install with 'pip install \"openllm-client[grpc]\"'.")
-import grpc
-import grpc_health.v1.health_pb2 as pb_health
-import grpc_health.v1.health_pb2_grpc as services_health
-
-from google.protobuf import json_format
-from grpc import aio
-
-pb, services = import_generated_stubs('v1')
-
-if t.TYPE_CHECKING:
-  from bentoml.grpc.v1.service_pb2 import ServiceMetadataResponse
-
-logger = logging.getLogger(__name__)
-
-class ClientCredentials(t.TypedDict):
-  root_certificates: NotRequired[t.Union[bytes, str]]
-  private_key: NotRequired[t.Union[bytes, str]]
-  certificate_chain: NotRequired[t.Union[bytes, str]]
-
-@overload
-def dispatch_channel(server_url: str,
-                     typ: t.Literal['async'],
-                     ssl: bool = ...,
-                     ssl_client_credentials: ClientCredentials | None = ...,
-                     options: t.Any | None = ...,
-                     compression: grpc.Compression | None = ...,
-                     interceptors: t.Sequence[aio.ClientInterceptor] | None = ...) -> aio.Channel:
-  ...
-
-@overload
-def dispatch_channel(server_url: str,
-                     typ: t.Literal['sync'],
-                     ssl: bool = ...,
-                     ssl_client_credentials: ClientCredentials | None = ...,
-                     options: t.Any | None = ...,
-                     compression: grpc.Compression | None = ...,
-                     interceptors: t.Sequence[aio.ClientInterceptor] | None = None) -> grpc.Channel:
-  ...
-
-def dispatch_channel(server_url: str,
-                     typ: t.Literal['async', 'sync'] = 'sync',
-                     ssl: bool = False,
-                     ssl_client_credentials: ClientCredentials | None = None,
-                     options: t.Any | None = None,
-                     compression: grpc.Compression | None = None,
-                     interceptors: t.Sequence[aio.ClientInterceptor] | None = None) -> aio.Channel | grpc.Channel:
-  credentials = None
-  if ssl:
-    if ssl_client_credentials is None: raise RuntimeError("'ssl=True' requires 'ssl_client_credentials'")
-    credentials = grpc.ssl_channel_credentials(**{k: load_from_file(v) if isinstance(v, str) else v for k, v in ssl_client_credentials.items()})
-
-  if typ == 'async' and ssl:
-    return aio.secure_channel(server_url, credentials=credentials, options=options, compression=compression, interceptors=interceptors)
-  elif typ == 'async':
-    return aio.insecure_channel(server_url, options=options, compression=compression, interceptors=interceptors)
-  elif typ == 'sync' and ssl:
-    return grpc.secure_channel(server_url, credentials=credentials, options=options, compression=compression)
-  elif typ == 'sync':
-    return grpc.insecure_channel(server_url, options=options, compression=compression)
-  else:
-    raise ValueError(f'Unknown type: {typ}')
-
-class GrpcClient(Client):
-  ssl: bool
-  ssl_client_credentials: t.Optional[ClientCredentials]
-  options: t.Any
-  compression: t.Optional[grpc.Compression]
-
-  def __init__(self,
-               server_url: str,
-               svc: bentoml.Service,  # gRPC specific options
-               ssl: bool = False,
-               options: t.Any | None = None,
-               compression: grpc.Compression | None = None,
-               ssl_client_credentials: ClientCredentials | None = None,
-               **kwargs: t.Any) -> None:
-    self.ssl, self.ssl_client_credentials, self.options, self.compression = ssl, ssl_client_credentials, options, compression
-    super().__init__(server_url, svc, **kwargs)
-
-  @functools.cached_property
-  def inner(self) -> grpc.Channel:
-    if self.ssl:
-      if self.ssl_client_credentials is None: raise RuntimeError("'ssl=True' requires 'ssl_client_credentials'")
-      credentials = grpc.ssl_channel_credentials(**{k: load_from_file(v) if isinstance(v, str) else v for k, v in self.ssl_client_credentials.items()})
-      return grpc.secure_channel(self.server_url, credentials=credentials, options=self.options, compression=self.compression)
-    return grpc.insecure_channel(self.server_url, options=self.options, compression=self.compression)
-
-  @staticmethod
-  def wait_until_server_ready(host: str, port: int, timeout: float = 30, check_interval: int = 1, **kwargs: t.Any) -> None:
-    with dispatch_channel(f"{host.replace(r'localhost', '0.0.0.0')}:{port}",
-                          typ='sync',
-                          options=kwargs.get('options', None),
-                          compression=kwargs.get('compression', None),
-                          ssl=kwargs.get('ssl', False),
-                          ssl_client_credentials=kwargs.get('ssl_client_credentials', None)) as channel:
-      req = pb_health.HealthCheckRequest()
-      req.service = 'bentoml.grpc.v1.BentoService'
-      health_stub = services_health.HealthStub(channel)
-      start_time = time.time()
-      while time.time() - start_time < timeout:
-        try:
-          resp = health_stub.Check(req)
-          if resp.status == pb_health.HealthCheckResponse.SERVING: break
-          else: time.sleep(check_interval)
-        except grpc.RpcError:
-          logger.debug('Waiting for server to be ready...')
-          time.sleep(check_interval)
-      try:
-        resp = health_stub.Check(req)
-        if resp.status != pb_health.HealthCheckResponse.SERVING:
-          raise TimeoutError(f"Timed out waiting {timeout} seconds for server at '{host}:{port}' to be ready.")
-      except grpc.RpcError as err:
-        logger.error('Caught RpcError while connecting to %s:%s:\n', host, port)
-        logger.error(err)
-        raise
-
-  @classmethod
-  def from_url(cls, url: str, **kwargs: t.Any) -> GrpcClient:
-    with dispatch_channel(url.replace(r'localhost', '0.0.0.0'),
-                          typ='sync',
-                          options=kwargs.get('options', None),
-                          compression=kwargs.get('compression', None),
-                          ssl=kwargs.get('ssl', False),
-                          ssl_client_credentials=kwargs.get('ssl_client_credentials', None)) as channel:
-      metadata = t.cast(
-          'ServiceMetadataResponse',
-          channel.unary_unary('/bentoml.grpc.v1.BentoService/ServiceMetadata',
-                              request_serializer=pb.ServiceMetadataRequest.SerializeToString,
-                              response_deserializer=pb.ServiceMetadataResponse.FromString)(pb.ServiceMetadataRequest()))
-    reflection = bentoml.Service(metadata.name)
-    for api in metadata.apis:
-      try:
-        reflection.apis[api.name] = InferenceAPI[t.Any](None,
-                                                        bentoml.io.from_spec({
-                                                            'id': api.input.descriptor_id,
-                                                            'args': json_format.MessageToDict(api.input.attributes).get('args', None)
-                                                        }),
-                                                        bentoml.io.from_spec({
-                                                            'id': api.output.descriptor_id,
-                                                            'args': json_format.MessageToDict(api.output.attributes).get('args', None)
-                                                        }),
-                                                        name=api.name,
-                                                        doc=api.docs)
-      except Exception as e:
-        logger.error('Failed to instantiate client for API %s: ', api.name, e)
-    return cls(url, reflection, **kwargs)
-
-  def health(self) -> t.Any:
-    return services_health.HealthStub(self.inner).Check(pb_health.HealthCheckRequest(service=''))
-
-  def _call(self, data: t.Any, /, *, _inference_api: InferenceAPI[t.Any], **kwargs: t.Any) -> t.Any:
-    channel_kwargs = {k: kwargs.pop(f'_grpc_channel_{k}', None) for k in {'timeout', 'metadata', 'credentials', 'wait_for_ready', 'compression'}}
-    if _inference_api.multi_input:
-      if data is not None:
-        raise ValueError(f"'{_inference_api.name}' takes multiple inputs, and thus required to pass as keyword arguments.")
-      fake_resp = ensure_exec_coro(_inference_api.input.to_proto(kwargs))
-    else:
-      fake_resp = ensure_exec_coro(_inference_api.input.to_proto(data))
-    api_fn = {v: k for k, v in self.svc.apis.items()}
-    stubs = services.BentoServiceStub(self.inner)
-    proto = stubs.Call(pb.Request(**{'api_name': api_fn[_inference_api], _inference_api.input.proto_fields[0]: fake_resp}), **channel_kwargs)
-    return ensure_exec_coro(_inference_api.output.from_proto(getattr(proto, proto.WhichOneof('content'))))
-
-class AsyncGrpcClient(AsyncClient):
-  ssl: bool
-  ssl_client_credentials: t.Optional[ClientCredentials]
-  options: aio.ChannelArgumentType
-  interceptors: t.Optional[t.Sequence[aio.ClientInterceptor]]
-  compression: t.Optional[grpc.Compression]
-
-  def __init__(self,
-               server_url: str,
-               svc: bentoml.Service,  # gRPC specific options
-               ssl: bool = False,
-               options: aio.ChannelArgumentType | None = None,
-               interceptors: t.Sequence[aio.ClientInterceptor] | None = None,
-               compression: grpc.Compression | None = None,
-               ssl_client_credentials: ClientCredentials | None = None,
-               **kwargs: t.Any) -> None:
-    self.ssl, self.ssl_client_credentials, self.options, self.interceptors, self.compression = ssl, ssl_client_credentials, options, interceptors, compression
-    super().__init__(server_url, svc, **kwargs)
-
-  @functools.cached_property
-  def inner(self) -> aio.Channel:
-    if self.ssl:
-      if self.ssl_client_credentials is None: raise RuntimeError("'ssl=True' requires 'ssl_client_credentials'")
-      credentials = grpc.ssl_channel_credentials(**{k: load_from_file(v) if isinstance(v, str) else v for k, v in self.ssl_client_credentials.items()})
-      return aio.secure_channel(self.server_url, credentials=credentials, options=self.options, compression=self.compression, interceptors=self.interceptors)
-    return aio.insecure_channel(self.server_url, options=self.options, compression=self.compression, interceptors=self.interceptors)
-
-  @staticmethod
-  async def wait_until_server_ready(host: str, port: int, timeout: float = 30, check_interval: int = 1, **kwargs: t.Any) -> None:
-    async with dispatch_channel(f"{host.replace(r'localhost', '0.0.0.0')}:{port}",
-                                typ='async',
-                                options=kwargs.get('options', None),
-                                compression=kwargs.get('compression', None),
-                                ssl=kwargs.get('ssl', False),
-                                ssl_client_credentials=kwargs.get('ssl_client_credentials', None)) as channel:
-      req = pb_health.HealthCheckRequest()
-      req.service = 'bentoml.grpc.v1.BentoService'
-      health_stub = services_health.HealthStub(channel)
-      start_time = time.time()
-      while time.time() - start_time < timeout:
-        try:
-          resp = health_stub.Check(req)
-          if resp.status == pb_health.HealthCheckResponse.SERVING: break
-          else: time.sleep(check_interval)
-        except grpc.RpcError:
-          logger.debug('Waiting for server to be ready...')
-          time.sleep(check_interval)
-      try:
-        resp = health_stub.Check(req)
-        if resp.status != pb_health.HealthCheckResponse.SERVING:
-          raise TimeoutError(f"Timed out waiting {timeout} seconds for server at '{host}:{port}' to be ready.")
-      except grpc.RpcError as err:
-        logger.error('Caught RpcError while connecting to %s:%s:\n', host, port)
-        logger.error(err)
-        raise
-
-  @classmethod
-  async def from_url(cls, url: str, **kwargs: t.Any) -> AsyncGrpcClient:
-    async with dispatch_channel(url.replace(r'localhost', '0.0.0.0'),
-                                typ='async',
-                                options=kwargs.get('options', None),
-                                compression=kwargs.get('compression', None),
-                                ssl=kwargs.get('ssl', False),
-                                ssl_client_credentials=kwargs.get('ssl_client_credentials', None),
-                                interceptors=kwargs.get('interceptors', None)) as channel:
-      metadata = t.cast(
-          'ServiceMetadataResponse',
-          channel.unary_unary('/bentoml.grpc.v1.BentoService/ServiceMetadata',
-                              request_serializer=pb.ServiceMetadataRequest.SerializeToString,
-                              response_deserializer=pb.ServiceMetadataResponse.FromString)(pb.ServiceMetadataRequest()))
-    reflection = bentoml.Service(metadata.name)
-    for api in metadata.apis:
-      try:
-        reflection.apis[api.name] = InferenceAPI[t.Any](None,
-                                                        bentoml.io.from_spec({
-                                                            'id': api.input.descriptor_id,
-                                                            'args': json_format.MessageToDict(api.input.attributes).get('args', None)
-                                                        }),
-                                                        bentoml.io.from_spec({
-                                                            'id': api.output.descriptor_id,
-                                                            'args': json_format.MessageToDict(api.output.attributes).get('args', None)
-                                                        }),
-                                                        name=api.name,
-                                                        doc=api.docs)
-      except Exception as e:
-        logger.error('Failed to instantiate client for API %s: ', api.name, e)
-    return cls(url, reflection, **kwargs)
-
-  async def health(self) -> t.Any:
-    return await services_health.HealthStub(self.inner).Check(pb_health.HealthCheckRequest(service=''))
-
-  async def _call(self, data: t.Any, /, *, _inference_api: InferenceAPI[t.Any], **kwargs: t.Any) -> t.Any:
-    channel_kwargs = {k: kwargs.pop(f'_grpc_channel_{k}', None) for k in {'timeout', 'metadata', 'credentials', 'wait_for_ready', 'compression'}}
-    state = self.inner.get_state(try_to_connect=True)
-    if state != grpc.ChannelConnectivity.READY: await self.inner.channel_ready()
-    if _inference_api.multi_input:
-      if data is not None:
-        raise ValueError(f"'{_inference_api.name}' takes multiple inputs, and thus required to pass as keyword arguments.")
-      fake_resp = await _inference_api.input.to_proto(kwargs)
-    else:
-      fake_resp = await _inference_api.input.to_proto(data)
-    api_fn = {v: k for k, v in self.svc.apis.items()}
-    async with self.inner:
-      stubs = services.BentoServiceStub(self.inner)
-      proto = await stubs.Call(pb.Request(**{'api_name': api_fn[_inference_api], _inference_api.input.proto_fields[0]: fake_resp}), **channel_kwargs)
-    return await _inference_api.output.from_proto(getattr(proto, proto.WhichOneof('content')))
diff --git a/openllm-client/src/openllm_client/benmin/_http.py b/openllm-client/src/openllm_client/benmin/_http.py
deleted file mode 100644
index 86477e5a..00000000
--- a/openllm-client/src/openllm_client/benmin/_http.py
+++ /dev/null
@@ -1,201 +0,0 @@
-from __future__ import annotations
-import asyncio
-import functools
-import logging
-import time
-import typing as t
-import urllib.error
-
-from urllib.parse import urlparse
-
-import httpx
-import orjson
-import starlette.datastructures
-import starlette.requests
-import starlette.responses
-
-import bentoml
-
-from bentoml._internal.service.inference_api import InferenceAPI
-from openllm_client.benmin import AsyncClient
-from openllm_client.benmin import Client
-from openllm_core.utils import ensure_exec_coro
-
-logger = logging.getLogger(__name__)
-
-class HttpClient(Client):
-  @functools.cached_property
-  def inner(self) -> httpx.Client:
-    if not urlparse(self.server_url).netloc: raise ValueError(f'Invalid server url: {self.server_url}')
-    return httpx.Client(base_url=self.server_url)
-
-  @staticmethod
-  def wait_until_server_ready(host: str, port: int | None = None, timeout: float = 30, check_interval: int = 1, **kwargs: t.Any) -> None:
-    host = host if '://' in host else 'http://' + host
-    server = host if port is None else f'{host}:{port}'
-    logger.debug("Waiting for server @ '%s' to be ready...", server)
-    start = time.time()
-    while time.time() - start < timeout:
-      try:
-        with httpx.Client(base_url=server) as sess:
-          status = sess.get('/readyz').status_code
-        if status == 200: break
-        else: time.sleep(check_interval)
-      except (httpx.ConnectError, urllib.error.URLError, ConnectionError):
-        logger.debug('Server is not ready yet, retrying in %d seconds...', check_interval)
-        time.sleep(check_interval)
-    # Try once more and raise for exception
-    try:
-      with httpx.Client(base_url=server) as sess:
-        status = sess.get('/readyz').status_code
-    except httpx.HTTPStatusError as err:
-      logger.error('Failed to wait until server ready: %s', server)
-      logger.error(err)
-      raise
-
-  def health(self) -> httpx.Response:
-    return self.inner.get('/readyz')
-
-  @classmethod
-  def from_url(cls, url: str, **kwargs: t.Any) -> HttpClient:
-    url = url if '://' in url else 'http://' + url
-    with httpx.Client(base_url=url) as sess:
-      resp = sess.get('/docs.json')
-      if resp.status_code != 200:
-        raise ValueError(f'Failed to get OpenAPI schema from the server: {resp.status_code} {resp.reason_phrase}:\n{resp.content.decode()}')
-      _spec = orjson.loads(resp.content)
-
-    reflection = bentoml.Service(_spec['info']['title'])
-
-    for route, spec in _spec['paths'].items():
-      for meth_spec in spec.values():
-        if 'tags' in meth_spec and 'Service APIs' in meth_spec['tags']:
-          if 'x-bentoml-io-descriptor' not in meth_spec['requestBody']:
-            raise ValueError(f'Malformed BentoML spec received from BentoML server {url}')
-          if 'x-bentoml-io-descriptor' not in meth_spec['responses']['200']:
-            raise ValueError(f'Malformed BentoML spec received from BentoML server {url}')
-          if 'x-bentoml-name' not in meth_spec:
-            raise ValueError(f'Malformed BentoML spec received from BentoML server {url}')
-          try:
-            reflection.apis[meth_spec['x-bentoml-name']] = InferenceAPI[t.Any](None,
-                                                                               bentoml.io.from_spec(meth_spec['requestBody']['x-bentoml-io-descriptor']),
-                                                                               bentoml.io.from_spec(meth_spec['responses']['200']['x-bentoml-io-descriptor']),
-                                                                               name=meth_spec['x-bentoml-name'],
-                                                                               doc=meth_spec['description'],
-                                                                               route=route.lstrip('/'))
-          except Exception as e:
-            logger.error('Failed to instantiate client for API %s: ', meth_spec['x-bentoml-name'], e)
-    return cls(url, reflection)
-
-  def _call(self, data: t.Any, /, *, _inference_api: InferenceAPI[t.Any], **kwargs: t.Any) -> t.Any:
-    # All gRPC kwargs should be popped out.
-    kwargs = {k: v for k, v in kwargs.items() if not k.startswith('_grpc_')}
-    if _inference_api.multi_input:
-      if data is not None:
-        raise ValueError(f"'{_inference_api.name}' takes multiple inputs, and thus required to pass as keyword arguments.")
-      fake_resp = ensure_exec_coro(_inference_api.input.to_http_response(kwargs, None))
-    else:
-      fake_resp = ensure_exec_coro(_inference_api.input.to_http_response(data, None))
-
-    # XXX: hack around StreamingResponse, since now we only have Text, for metadata so it is fine to do this.
-    if isinstance(fake_resp, starlette.responses.StreamingResponse): body = None
-    else: body = fake_resp.body
-
-    resp = self.inner.post('/' + _inference_api.route if not _inference_api.route.startswith('/') else _inference_api.route,
-                           data=body,
-                           headers={'content-type': fake_resp.headers['content-type']},
-                           timeout=self.timeout)
-    if resp.status_code != 200: raise ValueError(f'Error while making request: {resp.status_code}: {resp.content!s}')
-    fake_req = starlette.requests.Request(scope={'type': 'http'})
-    headers = starlette.datastructures.Headers(headers=resp.headers)
-    fake_req._body = resp.content
-    # Request.headers sets a _headers variable. We will need to set this value to our fake request object.
-    fake_req._headers = headers
-    return ensure_exec_coro(_inference_api.output.from_http_request(fake_req))
-
-class AsyncHttpClient(AsyncClient):
-  @functools.cached_property
-  def inner(self) -> httpx.AsyncClient:
-    if not urlparse(self.server_url).netloc: raise ValueError(f'Invalid server url: {self.server_url}')
-    return httpx.AsyncClient(base_url=self.server_url)
-
-  @staticmethod
-  async def wait_until_server_ready(host: str, port: int, timeout: float = 30, check_interval: int = 1, **kwargs: t.Any) -> None:
-    host = host if '://' in host else 'http://' + host
-    server = host if port is None else f'{host}:{port}'
-    logger.debug("Waiting for server @ '%s' to be ready...", server)
-    start = time.time()
-    while time.time() - start < timeout:
-      try:
-        async with httpx.AsyncClient(base_url=server) as sess:
-          resp = await sess.get('/readyz')
-          if resp.status_code == 200: break
-          else: await asyncio.sleep(check_interval)
-      except (httpx.ConnectError, urllib.error.URLError, ConnectionError):
-        logger.debug('Server is not ready yet, retrying in %d seconds...', check_interval)
-        await asyncio.sleep(check_interval)
-    # Try once more and raise for exception
-    async with httpx.AsyncClient(base_url=server) as sess:
-      resp = await sess.get('/readyz')
-      if resp.status_code != 200:
-        raise TimeoutError(f"Timeout while waiting for server @ '{server}' to be ready: {resp.status_code}: {resp.content!s}")
-
-  async def health(self) -> httpx.Response:
-    return await self.inner.get('/readyz')
-
-  @classmethod
-  async def from_url(cls, url: str, **kwargs: t.Any) -> AsyncHttpClient:
-    url = url if '://' in url else 'http://' + url
-    async with httpx.AsyncClient(base_url=url) as sess:
-      resp = await sess.get('/docs.json')
-      if resp.status_code != 200:
-        raise ValueError(f'Failed to get OpenAPI schema from the server: {resp.status_code} {resp.reason_phrase}:\n{(await resp.aread()).decode()}')
-      _spec = orjson.loads(await resp.aread())
-
-    reflection = bentoml.Service(_spec['info']['title'])
-
-    for route, spec in _spec['paths'].items():
-      for meth_spec in spec.values():
-        if 'tags' in meth_spec and 'Service APIs' in meth_spec['tags']:
-          if 'x-bentoml-io-descriptor' not in meth_spec['requestBody']:
-            raise ValueError(f'Malformed BentoML spec received from BentoML server {url}')
-          if 'x-bentoml-io-descriptor' not in meth_spec['responses']['200']:
-            raise ValueError(f'Malformed BentoML spec received from BentoML server {url}')
-          if 'x-bentoml-name' not in meth_spec:
-            raise ValueError(f'Malformed BentoML spec received from BentoML server {url}')
-          try:
-            reflection.apis[meth_spec['x-bentoml-name']] = InferenceAPI[t.Any](None,
-                                                                               bentoml.io.from_spec(meth_spec['requestBody']['x-bentoml-io-descriptor']),
-                                                                               bentoml.io.from_spec(meth_spec['responses']['200']['x-bentoml-io-descriptor']),
-                                                                               name=meth_spec['x-bentoml-name'],
-                                                                               doc=meth_spec['description'],
-                                                                               route=route.lstrip('/'))
-          except ValueError as e:
-            logger.error('Failed to instantiate client for API %s: ', meth_spec['x-bentoml-name'], e)
-    return cls(url, reflection)
-
-  async def _call(self, data: t.Any, /, *, _inference_api: InferenceAPI[t.Any], **kwargs: t.Any) -> t.Any:
-    # All gRPC kwargs should be popped out.
-    kwargs = {k: v for k, v in kwargs.items() if not k.startswith('_grpc_')}
-    if _inference_api.multi_input:
-      if data is not None:
-        raise ValueError(f"'{_inference_api.name}' takes multiple inputs, and thus required to pass as keyword arguments.")
-      fake_resp = await _inference_api.input.to_http_response(kwargs, None)
-    else:
-      fake_resp = await _inference_api.input.to_http_response(data, None)
-
-    # XXX: hack around StreamingResponse, since now we only have Text, for metadata so it is fine to do this.
-    if isinstance(fake_resp, starlette.responses.StreamingResponse): body = None
-    else: body = t.cast(t.Any, fake_resp.body)
-
-    resp = await self.inner.post('/' + _inference_api.route if not _inference_api.route.startswith('/') else _inference_api.route,
-                                 data=body,
-                                 headers={'content-type': fake_resp.headers['content-type']},
-                                 timeout=self.timeout)
-    if resp.status_code != 200: raise ValueError(f'Error making request: {resp.status_code}: {(await resp.aread())!s}')
-    fake_req = starlette.requests.Request(scope={'type': 'http'})
-    headers = starlette.datastructures.Headers(headers=resp.headers)
-    fake_req._body = resp.content
-    # Request.headers sets a _headers variable. We will need to set this value to our fake request object.
-    fake_req._headers = headers
-    return await _inference_api.output.from_http_request(fake_req)
diff --git a/openllm-client/src/openllm_client/client.py b/openllm-client/src/openllm_client/client.py
deleted file mode 100644
index 4bee4248..00000000
--- a/openllm-client/src/openllm_client/client.py
+++ /dev/null
@@ -1,36 +0,0 @@
-from __future__ import annotations
-import logging
-
-from urllib.parse import urlparse
-
-from ._base import BaseAsyncClient
-from ._base import BaseClient
-
-logger = logging.getLogger(__name__)
-
-def process_http_address(self: AsyncHTTPClient | HTTPClient, address: str) -> None:
-  address = address if '://' in address else 'http://' + address
-  parsed = urlparse(address)
-  self._host, *_port = parsed.netloc.split(':')
-  if len(_port) == 0: self._port = '80' if parsed.scheme == 'http' else '443'
-  else: self._port = next(iter(_port))
-
-class HTTPClient(BaseClient):
-  def __init__(self, address: str, timeout: int = 30):
-    process_http_address(self, address)
-    super().__init__(address, timeout)
-
-class AsyncHTTPClient(BaseAsyncClient):
-  def __init__(self, address: str, timeout: int = 30):
-    process_http_address(self, address)
-    super().__init__(address, timeout)
-
-class GrpcClient(BaseClient):
-  def __init__(self, address: str, timeout: int = 30):
-    self._host, self._port = address.split(':')
-    super().__init__(address, timeout)
-
-class AsyncGrpcClient(BaseAsyncClient):
-  def __init__(self, address: str, timeout: int = 30):
-    self._host, self._port = address.split(':')
-    super().__init__(address, timeout)
diff --git a/openllm-client/src/openllm_client/pb/__init__.py b/openllm-client/src/openllm_client/pb/__init__.py
new file mode 100644
index 00000000..0e8df59b
--- /dev/null
+++ b/openllm-client/src/openllm_client/pb/__init__.py
@@ -0,0 +1,2 @@
+# NOTE: This whole module is generated with `generate-grpc-stubs`
+# and shouldn't be modified manually.
diff --git a/openllm-client/src/openllm_client/_adapters.py b/openllm-client/src/openllm_client/pb/v1/__init__.py
similarity index 100%
rename from openllm-client/src/openllm_client/_adapters.py
rename to openllm-client/src/openllm_client/pb/v1/__init__.py
diff --git a/openllm-client/src/openllm_client/pb/v1/_generated_pb3/__init__.py b/openllm-client/src/openllm_client/pb/v1/_generated_pb3/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/openllm-client/src/openllm_client/pb/v1/_generated_pb3/service_pb2.py b/openllm-client/src/openllm_client/pb/v1/_generated_pb3/service_pb2.py
new file mode 100644
index 00000000..4c97b9e2
--- /dev/null
+++ b/openllm-client/src/openllm_client/pb/v1/_generated_pb3/service_pb2.py
@@ -0,0 +1,205 @@
+# -*- coding: utf-8 -*-
+# Generated by the protocol buffer compiler.  DO NOT EDIT!
+# source: service.proto
+"""Generated protocol buffer code."""
+from google.protobuf import descriptor as _descriptor
+from google.protobuf import descriptor_pool as _descriptor_pool
+from google.protobuf import message as _message
+from google.protobuf import reflection as _reflection
+from google.protobuf import symbol_database as _symbol_database
+from google.protobuf import service as _service
+from google.protobuf import service_reflection
+# @@protoc_insertion_point(imports)
+
+_sym_db = _symbol_database.Default()
+
+
+from google.protobuf import struct_pb2 as google_dot_protobuf_dot_struct__pb2
+from google.protobuf import wrappers_pb2 as google_dot_protobuf_dot_wrappers__pb2
+
+
+DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\rservice.proto\x12\x0f\x62\x65ntoml.grpc.v1\x1a\x1cgoogle/protobuf/struct.proto\x1a\x1egoogle/protobuf/wrappers.proto\"\x18\n\x16ServiceMetadataRequest\"\xde\x03\n\x17ServiceMetadataResponse\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x43\n\x04\x61pis\x18\x02 \x03(\x0b\x32\x35.bentoml.grpc.v1.ServiceMetadataResponse.InferenceAPI\x12\x0c\n\x04\x64ocs\x18\x03 \x01(\t\x1ao\n\x12\x44\x65scriptorMetadata\x12\x1a\n\rdescriptor_id\x18\x01 \x01(\tH\x00\x88\x01\x01\x12+\n\nattributes\x18\x02 \x01(\x0b\x32\x17.google.protobuf.StructB\x10\n\x0e_descriptor_id\x1a\xf0\x01\n\x0cInferenceAPI\x12\x0c\n\x04name\x18\x01 \x01(\t\x12O\n\x05input\x18\x02 \x01(\x0b\x32;.bentoml.grpc.v1.ServiceMetadataResponse.DescriptorMetadataH\x00\x88\x01\x01\x12P\n\x06output\x18\x03 \x01(\x0b\x32;.bentoml.grpc.v1.ServiceMetadataResponse.DescriptorMetadataH\x01\x88\x01\x01\x12\x11\n\x04\x64ocs\x18\x04 \x01(\tH\x02\x88\x01\x01\x42\x08\n\x06_inputB\t\n\x07_outputB\x07\n\x05_docs\"\x85\x03\n\x07Request\x12\x10\n\x08\x61pi_name\x18\x01 \x01(\t\x12+\n\x07ndarray\x18\x03 \x01(\x0b\x32\x18.bentoml.grpc.v1.NDArrayH\x00\x12/\n\tdataframe\x18\x05 \x01(\x0b\x32\x1a.bentoml.grpc.v1.DataFrameH\x00\x12)\n\x06series\x18\x06 \x01(\x0b\x32\x17.bentoml.grpc.v1.SeriesH\x00\x12%\n\x04\x66ile\x18\x07 \x01(\x0b\x32\x15.bentoml.grpc.v1.FileH\x00\x12,\n\x04text\x18\x08 \x01(\x0b\x32\x1c.google.protobuf.StringValueH\x00\x12&\n\x04json\x18\t \x01(\x0b\x32\x16.google.protobuf.ValueH\x00\x12/\n\tmultipart\x18\n \x01(\x0b\x32\x1a.bentoml.grpc.v1.MultipartH\x00\x12\x1a\n\x10serialized_bytes\x18\x02 \x01(\x0cH\x00\x42\t\n\x07\x63ontentJ\x04\x08\x04\x10\x05J\x04\x08\x0b\x10\x0e\"\xf4\x02\n\x08Response\x12+\n\x07ndarray\x18\x01 \x01(\x0b\x32\x18.bentoml.grpc.v1.NDArrayH\x00\x12/\n\tdataframe\x18\x03 \x01(\x0b\x32\x1a.bentoml.grpc.v1.DataFrameH\x00\x12)\n\x06series\x18\x05 \x01(\x0b\x32\x17.bentoml.grpc.v1.SeriesH\x00\x12%\n\x04\x66ile\x18\x06 
\x01(\x0b\x32\x15.bentoml.grpc.v1.FileH\x00\x12,\n\x04text\x18\x07 \x01(\x0b\x32\x1c.google.protobuf.StringValueH\x00\x12&\n\x04json\x18\x08 \x01(\x0b\x32\x16.google.protobuf.ValueH\x00\x12/\n\tmultipart\x18\t \x01(\x0b\x32\x1a.bentoml.grpc.v1.MultipartH\x00\x12\x1a\n\x10serialized_bytes\x18\x02 \x01(\x0cH\x00\x42\t\n\x07\x63ontentJ\x04\x08\x04\x10\x05J\x04\x08\n\x10\x0e\"\xc6\x02\n\x04Part\x12+\n\x07ndarray\x18\x01 \x01(\x0b\x32\x18.bentoml.grpc.v1.NDArrayH\x00\x12/\n\tdataframe\x18\x03 \x01(\x0b\x32\x1a.bentoml.grpc.v1.DataFrameH\x00\x12)\n\x06series\x18\x05 \x01(\x0b\x32\x17.bentoml.grpc.v1.SeriesH\x00\x12%\n\x04\x66ile\x18\x06 \x01(\x0b\x32\x15.bentoml.grpc.v1.FileH\x00\x12,\n\x04text\x18\x07 \x01(\x0b\x32\x1c.google.protobuf.StringValueH\x00\x12&\n\x04json\x18\x08 \x01(\x0b\x32\x16.google.protobuf.ValueH\x00\x12\x1a\n\x10serialized_bytes\x18\x04 \x01(\x0cH\x00\x42\x10\n\x0erepresentationJ\x04\x08\x02\x10\x03J\x04\x08\t\x10\x0e\"\x89\x01\n\tMultipart\x12\x36\n\x06\x66ields\x18\x01 \x03(\x0b\x32&.bentoml.grpc.v1.Multipart.FieldsEntry\x1a\x44\n\x0b\x46ieldsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12$\n\x05value\x18\x02 \x01(\x0b\x32\x15.bentoml.grpc.v1.Part:\x02\x38\x01\"3\n\x04\x46ile\x12\x11\n\x04kind\x18\x03 \x01(\tH\x00\x88\x01\x01\x12\x0f\n\x07\x63ontent\x18\x02 \x01(\x0c\x42\x07\n\x05_kind\"K\n\tDataFrame\x12\x14\n\x0c\x63olumn_names\x18\x01 \x03(\t\x12(\n\x07\x63olumns\x18\x02 \x03(\x0b\x32\x17.bentoml.grpc.v1.Series\"\xa1\x01\n\x06Series\x12\x17\n\x0b\x62ool_values\x18\x01 \x03(\x08\x42\x02\x10\x01\x12\x18\n\x0c\x66loat_values\x18\x02 \x03(\x02\x42\x02\x10\x01\x12\x18\n\x0cint32_values\x18\x03 \x03(\x05\x42\x02\x10\x01\x12\x18\n\x0cint64_values\x18\x06 \x03(\x03\x42\x02\x10\x01\x12\x15\n\rstring_values\x18\x05 \x03(\t\x12\x19\n\rdouble_values\x18\x04 \x03(\x01\x42\x02\x10\x01\"\xc2\x03\n\x07NDArray\x12-\n\x05\x64type\x18\x01 \x01(\x0e\x32\x1e.bentoml.grpc.v1.NDArray.DType\x12\r\n\x05shape\x18\x02 \x03(\x05\x12\x15\n\rstring_values\x18\x05 
\x03(\t\x12\x18\n\x0c\x66loat_values\x18\x03 \x03(\x02\x42\x02\x10\x01\x12\x19\n\rdouble_values\x18\x04 \x03(\x01\x42\x02\x10\x01\x12\x17\n\x0b\x62ool_values\x18\x06 \x03(\x08\x42\x02\x10\x01\x12\x18\n\x0cint32_values\x18\x07 \x03(\x05\x42\x02\x10\x01\x12\x18\n\x0cint64_values\x18\x08 \x03(\x03\x42\x02\x10\x01\x12\x19\n\ruint32_values\x18\t \x03(\rB\x02\x10\x01\x12\x19\n\ruint64_values\x18\n \x03(\x04\x42\x02\x10\x01\"\xa9\x01\n\x05\x44Type\x12\x15\n\x11\x44TYPE_UNSPECIFIED\x10\x00\x12\x0f\n\x0b\x44TYPE_FLOAT\x10\x01\x12\x10\n\x0c\x44TYPE_DOUBLE\x10\x02\x12\x0e\n\nDTYPE_BOOL\x10\x03\x12\x0f\n\x0b\x44TYPE_INT32\x10\x04\x12\x0f\n\x0b\x44TYPE_INT64\x10\x05\x12\x10\n\x0c\x44TYPE_UINT32\x10\x06\x12\x10\n\x0c\x44TYPE_UINT64\x10\x07\x12\x10\n\x0c\x44TYPE_STRING\x10\x08\x32\xb5\x01\n\x0c\x42\x65ntoService\x12=\n\x04\x43\x61ll\x12\x18.bentoml.grpc.v1.Request\x1a\x19.bentoml.grpc.v1.Response\"\x00\x12\x66\n\x0fServiceMetadata\x12\'.bentoml.grpc.v1.ServiceMetadataRequest\x1a(.bentoml.grpc.v1.ServiceMetadataResponse\"\x00\x42]\n\x13\x63om.bentoml.grpc.v1B\x0cServiceProtoP\x01Z*github.com/bentoml/bentoml/grpc/v1;service\x90\x01\x01\xf8\x01\x01\xa2\x02\x03SVCb\x06proto3')
+
+
+
+_SERVICEMETADATAREQUEST = DESCRIPTOR.message_types_by_name['ServiceMetadataRequest']
+_SERVICEMETADATARESPONSE = DESCRIPTOR.message_types_by_name['ServiceMetadataResponse']
+_SERVICEMETADATARESPONSE_DESCRIPTORMETADATA = _SERVICEMETADATARESPONSE.nested_types_by_name['DescriptorMetadata']
+_SERVICEMETADATARESPONSE_INFERENCEAPI = _SERVICEMETADATARESPONSE.nested_types_by_name['InferenceAPI']
+_REQUEST = DESCRIPTOR.message_types_by_name['Request']
+_RESPONSE = DESCRIPTOR.message_types_by_name['Response']
+_PART = DESCRIPTOR.message_types_by_name['Part']
+_MULTIPART = DESCRIPTOR.message_types_by_name['Multipart']
+_MULTIPART_FIELDSENTRY = _MULTIPART.nested_types_by_name['FieldsEntry']
+_FILE = DESCRIPTOR.message_types_by_name['File']
+_DATAFRAME = DESCRIPTOR.message_types_by_name['DataFrame']
+_SERIES = DESCRIPTOR.message_types_by_name['Series']
+_NDARRAY = DESCRIPTOR.message_types_by_name['NDArray']
+_NDARRAY_DTYPE = _NDARRAY.enum_types_by_name['DType']
+ServiceMetadataRequest = _reflection.GeneratedProtocolMessageType('ServiceMetadataRequest', (_message.Message,), {
+  'DESCRIPTOR' : _SERVICEMETADATAREQUEST,
+  '__module__' : 'service_pb2'
+  # @@protoc_insertion_point(class_scope:bentoml.grpc.v1.ServiceMetadataRequest)
+  })
+_sym_db.RegisterMessage(ServiceMetadataRequest)
+
+ServiceMetadataResponse = _reflection.GeneratedProtocolMessageType('ServiceMetadataResponse', (_message.Message,), {
+
+  'DescriptorMetadata' : _reflection.GeneratedProtocolMessageType('DescriptorMetadata', (_message.Message,), {
+    'DESCRIPTOR' : _SERVICEMETADATARESPONSE_DESCRIPTORMETADATA,
+    '__module__' : 'service_pb2'
+    # @@protoc_insertion_point(class_scope:bentoml.grpc.v1.ServiceMetadataResponse.DescriptorMetadata)
+    })
+  ,
+
+  'InferenceAPI' : _reflection.GeneratedProtocolMessageType('InferenceAPI', (_message.Message,), {
+    'DESCRIPTOR' : _SERVICEMETADATARESPONSE_INFERENCEAPI,
+    '__module__' : 'service_pb2'
+    # @@protoc_insertion_point(class_scope:bentoml.grpc.v1.ServiceMetadataResponse.InferenceAPI)
+    })
+  ,
+  'DESCRIPTOR' : _SERVICEMETADATARESPONSE,
+  '__module__' : 'service_pb2'
+  # @@protoc_insertion_point(class_scope:bentoml.grpc.v1.ServiceMetadataResponse)
+  })
+_sym_db.RegisterMessage(ServiceMetadataResponse)
+_sym_db.RegisterMessage(ServiceMetadataResponse.DescriptorMetadata)
+_sym_db.RegisterMessage(ServiceMetadataResponse.InferenceAPI)
+
+Request = _reflection.GeneratedProtocolMessageType('Request', (_message.Message,), {
+  'DESCRIPTOR' : _REQUEST,
+  '__module__' : 'service_pb2'
+  # @@protoc_insertion_point(class_scope:bentoml.grpc.v1.Request)
+  })
+_sym_db.RegisterMessage(Request)
+
+Response = _reflection.GeneratedProtocolMessageType('Response', (_message.Message,), {
+  'DESCRIPTOR' : _RESPONSE,
+  '__module__' : 'service_pb2'
+  # @@protoc_insertion_point(class_scope:bentoml.grpc.v1.Response)
+  })
+_sym_db.RegisterMessage(Response)
+
+Part = _reflection.GeneratedProtocolMessageType('Part', (_message.Message,), {
+  'DESCRIPTOR' : _PART,
+  '__module__' : 'service_pb2'
+  # @@protoc_insertion_point(class_scope:bentoml.grpc.v1.Part)
+  })
+_sym_db.RegisterMessage(Part)
+
+Multipart = _reflection.GeneratedProtocolMessageType('Multipart', (_message.Message,), {
+
+  'FieldsEntry' : _reflection.GeneratedProtocolMessageType('FieldsEntry', (_message.Message,), {
+    'DESCRIPTOR' : _MULTIPART_FIELDSENTRY,
+    '__module__' : 'service_pb2'
+    # @@protoc_insertion_point(class_scope:bentoml.grpc.v1.Multipart.FieldsEntry)
+    })
+  ,
+  'DESCRIPTOR' : _MULTIPART,
+  '__module__' : 'service_pb2'
+  # @@protoc_insertion_point(class_scope:bentoml.grpc.v1.Multipart)
+  })
+_sym_db.RegisterMessage(Multipart)
+_sym_db.RegisterMessage(Multipart.FieldsEntry)
+
+File = _reflection.GeneratedProtocolMessageType('File', (_message.Message,), {
+  'DESCRIPTOR' : _FILE,
+  '__module__' : 'service_pb2'
+  # @@protoc_insertion_point(class_scope:bentoml.grpc.v1.File)
+  })
+_sym_db.RegisterMessage(File)
+
+DataFrame = _reflection.GeneratedProtocolMessageType('DataFrame', (_message.Message,), {
+  'DESCRIPTOR' : _DATAFRAME,
+  '__module__' : 'service_pb2'
+  # @@protoc_insertion_point(class_scope:bentoml.grpc.v1.DataFrame)
+  })
+_sym_db.RegisterMessage(DataFrame)
+
+Series = _reflection.GeneratedProtocolMessageType('Series', (_message.Message,), {
+  'DESCRIPTOR' : _SERIES,
+  '__module__' : 'service_pb2'
+  # @@protoc_insertion_point(class_scope:bentoml.grpc.v1.Series)
+  })
+_sym_db.RegisterMessage(Series)
+
+NDArray = _reflection.GeneratedProtocolMessageType('NDArray', (_message.Message,), {
+  'DESCRIPTOR' : _NDARRAY,
+  '__module__' : 'service_pb2'
+  # @@protoc_insertion_point(class_scope:bentoml.grpc.v1.NDArray)
+  })
+_sym_db.RegisterMessage(NDArray)
+
+_BENTOSERVICE = DESCRIPTOR.services_by_name['BentoService']
+if _descriptor._USE_C_DESCRIPTORS == False:
+
+  DESCRIPTOR._options = None
+  DESCRIPTOR._serialized_options = b'\n\023com.bentoml.grpc.v1B\014ServiceProtoP\001Z*github.com/bentoml/bentoml/grpc/v1;service\220\001\001\370\001\001\242\002\003SVC'
+  _MULTIPART_FIELDSENTRY._options = None
+  _MULTIPART_FIELDSENTRY._serialized_options = b'8\001'
+  _SERIES.fields_by_name['bool_values']._options = None
+  _SERIES.fields_by_name['bool_values']._serialized_options = b'\020\001'
+  _SERIES.fields_by_name['float_values']._options = None
+  _SERIES.fields_by_name['float_values']._serialized_options = b'\020\001'
+  _SERIES.fields_by_name['int32_values']._options = None
+  _SERIES.fields_by_name['int32_values']._serialized_options = b'\020\001'
+  _SERIES.fields_by_name['int64_values']._options = None
+  _SERIES.fields_by_name['int64_values']._serialized_options = b'\020\001'
+  _SERIES.fields_by_name['double_values']._options = None
+  _SERIES.fields_by_name['double_values']._serialized_options = b'\020\001'
+  _NDARRAY.fields_by_name['float_values']._options = None
+  _NDARRAY.fields_by_name['float_values']._serialized_options = b'\020\001'
+  _NDARRAY.fields_by_name['double_values']._options = None
+  _NDARRAY.fields_by_name['double_values']._serialized_options = b'\020\001'
+  _NDARRAY.fields_by_name['bool_values']._options = None
+  _NDARRAY.fields_by_name['bool_values']._serialized_options = b'\020\001'
+  _NDARRAY.fields_by_name['int32_values']._options = None
+  _NDARRAY.fields_by_name['int32_values']._serialized_options = b'\020\001'
+  _NDARRAY.fields_by_name['int64_values']._options = None
+  _NDARRAY.fields_by_name['int64_values']._serialized_options = b'\020\001'
+  _NDARRAY.fields_by_name['uint32_values']._options = None
+  _NDARRAY.fields_by_name['uint32_values']._serialized_options = b'\020\001'
+  _NDARRAY.fields_by_name['uint64_values']._options = None
+  _NDARRAY.fields_by_name['uint64_values']._serialized_options = b'\020\001'
+  _SERVICEMETADATAREQUEST._serialized_start=96
+  _SERVICEMETADATAREQUEST._serialized_end=120
+  _SERVICEMETADATARESPONSE._serialized_start=123
+  _SERVICEMETADATARESPONSE._serialized_end=601
+  _SERVICEMETADATARESPONSE_DESCRIPTORMETADATA._serialized_start=247
+  _SERVICEMETADATARESPONSE_DESCRIPTORMETADATA._serialized_end=358
+  _SERVICEMETADATARESPONSE_INFERENCEAPI._serialized_start=361
+  _SERVICEMETADATARESPONSE_INFERENCEAPI._serialized_end=601
+  _REQUEST._serialized_start=604
+  _REQUEST._serialized_end=993
+  _RESPONSE._serialized_start=996
+  _RESPONSE._serialized_end=1368
+  _PART._serialized_start=1371
+  _PART._serialized_end=1697
+  _MULTIPART._serialized_start=1700
+  _MULTIPART._serialized_end=1837
+  _MULTIPART_FIELDSENTRY._serialized_start=1769
+  _MULTIPART_FIELDSENTRY._serialized_end=1837
+  _FILE._serialized_start=1839
+  _FILE._serialized_end=1890
+  _DATAFRAME._serialized_start=1892
+  _DATAFRAME._serialized_end=1967
+  _SERIES._serialized_start=1970
+  _SERIES._serialized_end=2131
+  _NDARRAY._serialized_start=2134
+  _NDARRAY._serialized_end=2584
+  _NDARRAY_DTYPE._serialized_start=2415
+  _NDARRAY_DTYPE._serialized_end=2584
+  _BENTOSERVICE._serialized_start=2587
+  _BENTOSERVICE._serialized_end=2768
+BentoService = service_reflection.GeneratedServiceType('BentoService', (_service.Service,), dict(
+  DESCRIPTOR = _BENTOSERVICE,
+  __module__ = 'service_pb2'
+  ))
+
+BentoService_Stub = service_reflection.GeneratedServiceStubType('BentoService_Stub', (BentoService,), dict(
+  DESCRIPTOR = _BENTOSERVICE,
+  __module__ = 'service_pb2'
+  ))
+
+
+# @@protoc_insertion_point(module_scope)
diff --git a/openllm-client/src/openllm_client/pb/v1/_generated_pb3/service_pb2.pyi b/openllm-client/src/openllm_client/pb/v1/_generated_pb3/service_pb2.pyi
new file mode 100644
index 00000000..ea497f31
--- /dev/null
+++ b/openllm-client/src/openllm_client/pb/v1/_generated_pb3/service_pb2.pyi
@@ -0,0 +1,611 @@
+"""
+@generated by mypy-protobuf.  Do not edit manually!
+isort:skip_file
+Vendored from: https://github.com/bentoml/BentoML/blob/main/src/bentoml/grpc/v1/service.proto"""
+import abc
+import builtins
+import collections.abc
+import concurrent.futures
+import google.protobuf.descriptor
+import google.protobuf.internal.containers
+import google.protobuf.internal.enum_type_wrapper
+import google.protobuf.message
+import google.protobuf.service
+import google.protobuf.struct_pb2
+import google.protobuf.wrappers_pb2
+import sys
+import typing
+
+if sys.version_info >= (3, 10):
+    import typing as typing_extensions
+else:
+    import typing_extensions
+
+DESCRIPTOR: google.protobuf.descriptor.FileDescriptor
+
+class ServiceMetadataRequest(google.protobuf.message.Message):
+    """ServiceMetadataRequest message doesn't take any arguments."""
+
+    DESCRIPTOR: google.protobuf.descriptor.Descriptor
+
+    def __init__(
+        self,
+    ) -> None: ...
+
+global___ServiceMetadataRequest = ServiceMetadataRequest
+
+class ServiceMetadataResponse(google.protobuf.message.Message):
+    """ServiceMetadataResponse returns metadata of bentoml.Service.
+    Currently it includes name, version, apis, and docs.
+    """
+
+    DESCRIPTOR: google.protobuf.descriptor.Descriptor
+
+    class DescriptorMetadata(google.protobuf.message.Message):
+        """DescriptorMetadata is a metadata of any given IODescriptor."""
+
+        DESCRIPTOR: google.protobuf.descriptor.Descriptor
+
+        DESCRIPTOR_ID_FIELD_NUMBER: builtins.int
+        ATTRIBUTES_FIELD_NUMBER: builtins.int
+        descriptor_id: builtins.str
+        """descriptor_id describes the given ID of the descriptor, which matches with our OpenAPI definition."""
+        @property
+        def attributes(self) -> google.protobuf.struct_pb2.Struct:
+            """attributes is the kwargs of the given descriptor."""
+        def __init__(
+            self,
+            *,
+            descriptor_id: builtins.str | None = ...,
+            attributes: google.protobuf.struct_pb2.Struct | None = ...,
+        ) -> None: ...
+        def HasField(self, field_name: typing_extensions.Literal["_descriptor_id", b"_descriptor_id", "attributes", b"attributes", "descriptor_id", b"descriptor_id"]) -> builtins.bool: ...
+        def ClearField(self, field_name: typing_extensions.Literal["_descriptor_id", b"_descriptor_id", "attributes", b"attributes", "descriptor_id", b"descriptor_id"]) -> None: ...
+        def WhichOneof(self, oneof_group: typing_extensions.Literal["_descriptor_id", b"_descriptor_id"]) -> typing_extensions.Literal["descriptor_id"] | None: ...
+
+    class InferenceAPI(google.protobuf.message.Message):
+        """InferenceAPI is bentoml._internal.service.inference_api.InferenceAPI
+        that is exposed to gRPC client.
+        There is no way for reflection to get information of given @svc.api.
+        """
+
+        DESCRIPTOR: google.protobuf.descriptor.Descriptor
+
+        NAME_FIELD_NUMBER: builtins.int
+        INPUT_FIELD_NUMBER: builtins.int
+        OUTPUT_FIELD_NUMBER: builtins.int
+        DOCS_FIELD_NUMBER: builtins.int
+        name: builtins.str
+        """name is the name of the API."""
+        @property
+        def input(self) -> global___ServiceMetadataResponse.DescriptorMetadata:
+            """input is the input descriptor of the API."""
+        @property
+        def output(self) -> global___ServiceMetadataResponse.DescriptorMetadata:
+            """output is the output descriptor of the API."""
+        docs: builtins.str
+        """docs is the optional documentation of the API."""
+        def __init__(
+            self,
+            *,
+            name: builtins.str = ...,
+            input: global___ServiceMetadataResponse.DescriptorMetadata | None = ...,
+            output: global___ServiceMetadataResponse.DescriptorMetadata | None = ...,
+            docs: builtins.str | None = ...,
+        ) -> None: ...
+        def HasField(self, field_name: typing_extensions.Literal["_docs", b"_docs", "_input", b"_input", "_output", b"_output", "docs", b"docs", "input", b"input", "output", b"output"]) -> builtins.bool: ...
+        def ClearField(self, field_name: typing_extensions.Literal["_docs", b"_docs", "_input", b"_input", "_output", b"_output", "docs", b"docs", "input", b"input", "name", b"name", "output", b"output"]) -> None: ...
+        @typing.overload
+        def WhichOneof(self, oneof_group: typing_extensions.Literal["_docs", b"_docs"]) -> typing_extensions.Literal["docs"] | None: ...
+        @typing.overload
+        def WhichOneof(self, oneof_group: typing_extensions.Literal["_input", b"_input"]) -> typing_extensions.Literal["input"] | None: ...
+        @typing.overload
+        def WhichOneof(self, oneof_group: typing_extensions.Literal["_output", b"_output"]) -> typing_extensions.Literal["output"] | None: ...
+
+    NAME_FIELD_NUMBER: builtins.int
+    APIS_FIELD_NUMBER: builtins.int
+    DOCS_FIELD_NUMBER: builtins.int
+    name: builtins.str
+    """name is the service name."""
+    @property
+    def apis(self) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[global___ServiceMetadataResponse.InferenceAPI]:
+        """apis holds a list of InferenceAPI of the service."""
+    docs: builtins.str
+    """docs is the documentation of the service."""
+    def __init__(
+        self,
+        *,
+        name: builtins.str = ...,
+        apis: collections.abc.Iterable[global___ServiceMetadataResponse.InferenceAPI] | None = ...,
+        docs: builtins.str = ...,
+    ) -> None: ...
+    def ClearField(self, field_name: typing_extensions.Literal["apis", b"apis", "docs", b"docs", "name", b"name"]) -> None: ...
+
+global___ServiceMetadataResponse = ServiceMetadataResponse
+
+class Request(google.protobuf.message.Message):
+    """Request message for incoming Call."""
+
+    DESCRIPTOR: google.protobuf.descriptor.Descriptor
+
+    API_NAME_FIELD_NUMBER: builtins.int
+    NDARRAY_FIELD_NUMBER: builtins.int
+    DATAFRAME_FIELD_NUMBER: builtins.int
+    SERIES_FIELD_NUMBER: builtins.int
+    FILE_FIELD_NUMBER: builtins.int
+    TEXT_FIELD_NUMBER: builtins.int
+    JSON_FIELD_NUMBER: builtins.int
+    MULTIPART_FIELD_NUMBER: builtins.int
+    SERIALIZED_BYTES_FIELD_NUMBER: builtins.int
+    api_name: builtins.str
+    """api_name defines the API entrypoint to call.
+    api_name is the name of the function defined in bentoml.Service.
+    Example:
+
+        @svc.api(input=NumpyNdarray(), output=File())
+        def predict(input: NDArray[float]) -> bytes:
+            ...
+
+        api_name is "predict" in this case.
+    """
+    @property
+    def ndarray(self) -> global___NDArray:
+        """NDArray represents an n-dimensional array of arbitrary type."""
+    @property
+    def dataframe(self) -> global___DataFrame:
+        """DataFrame represents any tabular data type. We are using
+        DataFrame as a trivial representation for tabular type.
+        """
+    @property
+    def series(self) -> global___Series:
+        """Series portrays a series of values. This can be used for
+        representing Series types in tabular data.
+        """
+    @property
+    def file(self) -> global___File:
+        """File represents any arbitrary file type. This can be
+        plaintext, image, video, audio, etc.
+        """
+    @property
+    def text(self) -> google.protobuf.wrappers_pb2.StringValue:
+        """Text represents a string input."""
+    @property
+    def json(self) -> google.protobuf.struct_pb2.Value:
+        """JSON is represented by using google.protobuf.Value.
+        see https://github.com/protocolbuffers/protobuf/blob/main/src/google/protobuf/struct.proto
+        """
+    @property
+    def multipart(self) -> global___Multipart:
+        """Multipart represents a multipart message.
+        It comprises a mapping from given type name to a subset of aforementioned types.
+        """
+    serialized_bytes: builtins.bytes
+    """serialized_bytes is for data serialized in BentoML's internal serialization format."""
+    def __init__(
+        self,
+        *,
+        api_name: builtins.str = ...,
+        ndarray: global___NDArray | None = ...,
+        dataframe: global___DataFrame | None = ...,
+        series: global___Series | None = ...,
+        file: global___File | None = ...,
+        text: google.protobuf.wrappers_pb2.StringValue | None = ...,
+        json: google.protobuf.struct_pb2.Value | None = ...,
+        multipart: global___Multipart | None = ...,
+        serialized_bytes: builtins.bytes = ...,
+    ) -> None: ...
+    def HasField(self, field_name: typing_extensions.Literal["content", b"content", "dataframe", b"dataframe", "file", b"file", "json", b"json", "multipart", b"multipart", "ndarray", b"ndarray", "serialized_bytes", b"serialized_bytes", "series", b"series", "text", b"text"]) -> builtins.bool: ...
+    def ClearField(self, field_name: typing_extensions.Literal["api_name", b"api_name", "content", b"content", "dataframe", b"dataframe", "file", b"file", "json", b"json", "multipart", b"multipart", "ndarray", b"ndarray", "serialized_bytes", b"serialized_bytes", "series", b"series", "text", b"text"]) -> None: ...
+    def WhichOneof(self, oneof_group: typing_extensions.Literal["content", b"content"]) -> typing_extensions.Literal["ndarray", "dataframe", "series", "file", "text", "json", "multipart", "serialized_bytes"] | None: ...
+
+global___Request = Request
+
+class Response(google.protobuf.message.Message):
+    """Response message returned by Call."""
+
+    DESCRIPTOR: google.protobuf.descriptor.Descriptor
+
+    NDARRAY_FIELD_NUMBER: builtins.int
+    DATAFRAME_FIELD_NUMBER: builtins.int
+    SERIES_FIELD_NUMBER: builtins.int
+    FILE_FIELD_NUMBER: builtins.int
+    TEXT_FIELD_NUMBER: builtins.int
+    JSON_FIELD_NUMBER: builtins.int
+    MULTIPART_FIELD_NUMBER: builtins.int
+    SERIALIZED_BYTES_FIELD_NUMBER: builtins.int
+    @property
+    def ndarray(self) -> global___NDArray:
+        """NDArray represents an n-dimensional array of arbitrary type."""
+    @property
+    def dataframe(self) -> global___DataFrame:
+        """DataFrame represents any tabular data type. We are using
+        DataFrame as a trivial representation for tabular type.
+        """
+    @property
+    def series(self) -> global___Series:
+        """Series portrays a series of values. This can be used for
+        representing Series types in tabular data.
+        """
+    @property
+    def file(self) -> global___File:
+        """File represents any arbitrary file type. This can be
+        plaintext, image, video, audio, etc.
+        """
+    @property
+    def text(self) -> google.protobuf.wrappers_pb2.StringValue:
+        """Text represents a string input."""
+    @property
+    def json(self) -> google.protobuf.struct_pb2.Value:
+        """JSON is represented by using google.protobuf.Value.
+        see https://github.com/protocolbuffers/protobuf/blob/main/src/google/protobuf/struct.proto
+        """
+    @property
+    def multipart(self) -> global___Multipart:
+        """Multipart represents a multipart message.
+        It comprises a mapping from given type name to a subset of aforementioned types.
+        """
+    serialized_bytes: builtins.bytes
+    """serialized_bytes is for data serialized in BentoML's internal serialization format."""
+    def __init__(
+        self,
+        *,
+        ndarray: global___NDArray | None = ...,
+        dataframe: global___DataFrame | None = ...,
+        series: global___Series | None = ...,
+        file: global___File | None = ...,
+        text: google.protobuf.wrappers_pb2.StringValue | None = ...,
+        json: google.protobuf.struct_pb2.Value | None = ...,
+        multipart: global___Multipart | None = ...,
+        serialized_bytes: builtins.bytes = ...,
+    ) -> None: ...
+    def HasField(self, field_name: typing_extensions.Literal["content", b"content", "dataframe", b"dataframe", "file", b"file", "json", b"json", "multipart", b"multipart", "ndarray", b"ndarray", "serialized_bytes", b"serialized_bytes", "series", b"series", "text", b"text"]) -> builtins.bool: ...
+    def ClearField(self, field_name: typing_extensions.Literal["content", b"content", "dataframe", b"dataframe", "file", b"file", "json", b"json", "multipart", b"multipart", "ndarray", b"ndarray", "serialized_bytes", b"serialized_bytes", "series", b"series", "text", b"text"]) -> None: ...
+    def WhichOneof(self, oneof_group: typing_extensions.Literal["content", b"content"]) -> typing_extensions.Literal["ndarray", "dataframe", "series", "file", "text", "json", "multipart", "serialized_bytes"] | None: ...
+
+global___Response = Response
+
+class Part(google.protobuf.message.Message):
+    """Part represents possible value types for multipart message.
+    These are the same as the types in Request message.
+    """
+
+    DESCRIPTOR: google.protobuf.descriptor.Descriptor
+
+    NDARRAY_FIELD_NUMBER: builtins.int
+    DATAFRAME_FIELD_NUMBER: builtins.int
+    SERIES_FIELD_NUMBER: builtins.int
+    FILE_FIELD_NUMBER: builtins.int
+    TEXT_FIELD_NUMBER: builtins.int
+    JSON_FIELD_NUMBER: builtins.int
+    SERIALIZED_BYTES_FIELD_NUMBER: builtins.int
+    @property
+    def ndarray(self) -> global___NDArray:
+        """NDArray represents an n-dimensional array of arbitrary type."""
+    @property
+    def dataframe(self) -> global___DataFrame:
+        """DataFrame represents any tabular data type. We are using
+        DataFrame as a trivial representation for tabular type.
+        """
+    @property
+    def series(self) -> global___Series:
+        """Series portrays a series of values. This can be used for
+        representing Series types in tabular data.
+        """
+    @property
+    def file(self) -> global___File:
+        """File represents any arbitrary file type. This can be
+        plaintext, image, video, audio, etc.
+        """
+    @property
+    def text(self) -> google.protobuf.wrappers_pb2.StringValue:
+        """Text represents a string input."""
+    @property
+    def json(self) -> google.protobuf.struct_pb2.Value:
+        """JSON is represented by using google.protobuf.Value.
+        see https://github.com/protocolbuffers/protobuf/blob/main/src/google/protobuf/struct.proto
+        """
+    serialized_bytes: builtins.bytes
+    """serialized_bytes is for data serialized in BentoML's internal serialization format."""
+    def __init__(
+        self,
+        *,
+        ndarray: global___NDArray | None = ...,
+        dataframe: global___DataFrame | None = ...,
+        series: global___Series | None = ...,
+        file: global___File | None = ...,
+        text: google.protobuf.wrappers_pb2.StringValue | None = ...,
+        json: google.protobuf.struct_pb2.Value | None = ...,
+        serialized_bytes: builtins.bytes = ...,
+    ) -> None: ...
+    def HasField(self, field_name: typing_extensions.Literal["dataframe", b"dataframe", "file", b"file", "json", b"json", "ndarray", b"ndarray", "representation", b"representation", "serialized_bytes", b"serialized_bytes", "series", b"series", "text", b"text"]) -> builtins.bool: ...
+    def ClearField(self, field_name: typing_extensions.Literal["dataframe", b"dataframe", "file", b"file", "json", b"json", "ndarray", b"ndarray", "representation", b"representation", "serialized_bytes", b"serialized_bytes", "series", b"series", "text", b"text"]) -> None: ...
+    def WhichOneof(self, oneof_group: typing_extensions.Literal["representation", b"representation"]) -> typing_extensions.Literal["ndarray", "dataframe", "series", "file", "text", "json", "serialized_bytes"] | None: ...
+
+global___Part = Part
+
+class Multipart(google.protobuf.message.Message):
+    """Multipart represents a multipart message.
+    It comprises a mapping from given type name to a subset of aforementioned types.
+    """
+
+    DESCRIPTOR: google.protobuf.descriptor.Descriptor
+
+    class FieldsEntry(google.protobuf.message.Message):
+        DESCRIPTOR: google.protobuf.descriptor.Descriptor
+
+        KEY_FIELD_NUMBER: builtins.int
+        VALUE_FIELD_NUMBER: builtins.int
+        key: builtins.str
+        @property
+        def value(self) -> global___Part: ...
+        def __init__(
+            self,
+            *,
+            key: builtins.str = ...,
+            value: global___Part | None = ...,
+        ) -> None: ...
+        def HasField(self, field_name: typing_extensions.Literal["value", b"value"]) -> builtins.bool: ...
+        def ClearField(self, field_name: typing_extensions.Literal["key", b"key", "value", b"value"]) -> None: ...
+
+    FIELDS_FIELD_NUMBER: builtins.int
+    @property
+    def fields(self) -> google.protobuf.internal.containers.MessageMap[builtins.str, global___Part]: ...
+    def __init__(
+        self,
+        *,
+        fields: collections.abc.Mapping[builtins.str, global___Part] | None = ...,
+    ) -> None: ...
+    def ClearField(self, field_name: typing_extensions.Literal["fields", b"fields"]) -> None: ...
+
+global___Multipart = Multipart
+
+class File(google.protobuf.message.Message):
+    """File represents any arbitrary file type. This can be
+    plaintext, image, video, audio, etc.
+    """
+
+    DESCRIPTOR: google.protobuf.descriptor.Descriptor
+
+    KIND_FIELD_NUMBER: builtins.int
+    CONTENT_FIELD_NUMBER: builtins.int
+    kind: builtins.str
+    """optional file type, let it be csv, text, parquet, etc.
+    v1alpha1 uses 1 as FileType enum.
+    """
+    content: builtins.bytes
+    """contents of file as bytes."""
+    def __init__(
+        self,
+        *,
+        kind: builtins.str | None = ...,
+        content: builtins.bytes = ...,
+    ) -> None: ...
+    def HasField(self, field_name: typing_extensions.Literal["_kind", b"_kind", "kind", b"kind"]) -> builtins.bool: ...
+    def ClearField(self, field_name: typing_extensions.Literal["_kind", b"_kind", "content", b"content", "kind", b"kind"]) -> None: ...
+    def WhichOneof(self, oneof_group: typing_extensions.Literal["_kind", b"_kind"]) -> typing_extensions.Literal["kind"] | None: ...
+
+global___File = File
+
+class DataFrame(google.protobuf.message.Message):
+    """DataFrame represents any tabular data type. We are using
+    DataFrame as a trivial representation for tabular type.
+    This message carries given implementation of tabular data based on given orientation.
+    TODO: support index, records, etc.
+    """
+
+    DESCRIPTOR: google.protobuf.descriptor.Descriptor
+
+    COLUMN_NAMES_FIELD_NUMBER: builtins.int
+    COLUMNS_FIELD_NUMBER: builtins.int
+    @property
+    def column_names(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.str]:
+        """columns name"""
+    @property
+    def columns(self) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[global___Series]:
+        """columns orient.
+        { column ↠ { index ↠ value } }
+        """
+    def __init__(
+        self,
+        *,
+        column_names: collections.abc.Iterable[builtins.str] | None = ...,
+        columns: collections.abc.Iterable[global___Series] | None = ...,
+    ) -> None: ...
+    def ClearField(self, field_name: typing_extensions.Literal["column_names", b"column_names", "columns", b"columns"]) -> None: ...
+
+global___DataFrame = DataFrame
+
+class Series(google.protobuf.message.Message):
+    """Series portrays a series of values. This can be used for
+    representing Series types in tabular data.
+    """
+
+    DESCRIPTOR: google.protobuf.descriptor.Descriptor
+
+    BOOL_VALUES_FIELD_NUMBER: builtins.int
+    FLOAT_VALUES_FIELD_NUMBER: builtins.int
+    INT32_VALUES_FIELD_NUMBER: builtins.int
+    INT64_VALUES_FIELD_NUMBER: builtins.int
+    STRING_VALUES_FIELD_NUMBER: builtins.int
+    DOUBLE_VALUES_FIELD_NUMBER: builtins.int
+    @property
+    def bool_values(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.bool]:
+        """A bool parameter value"""
+    @property
+    def float_values(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.float]:
+        """A float parameter value"""
+    @property
+    def int32_values(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.int]:
+        """An int32 parameter value"""
+    @property
+    def int64_values(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.int]:
+        """An int64 parameter value"""
+    @property
+    def string_values(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.str]:
+        """A string parameter value"""
+    @property
+    def double_values(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.float]:
+        """represents a double parameter value."""
+    def __init__(
+        self,
+        *,
+        bool_values: collections.abc.Iterable[builtins.bool] | None = ...,
+        float_values: collections.abc.Iterable[builtins.float] | None = ...,
+        int32_values: collections.abc.Iterable[builtins.int] | None = ...,
+        int64_values: collections.abc.Iterable[builtins.int] | None = ...,
+        string_values: collections.abc.Iterable[builtins.str] | None = ...,
+        double_values: collections.abc.Iterable[builtins.float] | None = ...,
+    ) -> None: ...
+    def ClearField(self, field_name: typing_extensions.Literal["bool_values", b"bool_values", "double_values", b"double_values", "float_values", b"float_values", "int32_values", b"int32_values", "int64_values", b"int64_values", "string_values", b"string_values"]) -> None: ...
+
+global___Series = Series
+
+class NDArray(google.protobuf.message.Message):
+    """NDArray represents an n-dimensional array of arbitrary type."""
+
+    DESCRIPTOR: google.protobuf.descriptor.Descriptor
+
+    class _DType:
+        ValueType = typing.NewType("ValueType", builtins.int)
+        V: typing_extensions.TypeAlias = ValueType
+
+    class _DTypeEnumTypeWrapper(google.protobuf.internal.enum_type_wrapper._EnumTypeWrapper[NDArray._DType.ValueType], builtins.type):  # noqa: F821
+        DESCRIPTOR: google.protobuf.descriptor.EnumDescriptor
+        DTYPE_UNSPECIFIED: NDArray._DType.ValueType  # 0
+        """Represents a None type."""
+        DTYPE_FLOAT: NDArray._DType.ValueType  # 1
+        """Represents a float type."""
+        DTYPE_DOUBLE: NDArray._DType.ValueType  # 2
+        """Represents a double type."""
+        DTYPE_BOOL: NDArray._DType.ValueType  # 3
+        """Represents a bool type."""
+        DTYPE_INT32: NDArray._DType.ValueType  # 4
+        """Represents an int32 type."""
+        DTYPE_INT64: NDArray._DType.ValueType  # 5
+        """Represents an int64 type."""
+        DTYPE_UINT32: NDArray._DType.ValueType  # 6
+        """Represents a uint32 type."""
+        DTYPE_UINT64: NDArray._DType.ValueType  # 7
+        """Represents a uint64 type."""
+        DTYPE_STRING: NDArray._DType.ValueType  # 8
+        """Represents a string type."""
+
+    class DType(_DType, metaclass=_DTypeEnumTypeWrapper):
+        """Represents data type of a given array."""
+
+    DTYPE_UNSPECIFIED: NDArray.DType.ValueType  # 0
+    """Represents a None type."""
+    DTYPE_FLOAT: NDArray.DType.ValueType  # 1
+    """Represents a float type."""
+    DTYPE_DOUBLE: NDArray.DType.ValueType  # 2
+    """Represents a double type."""
+    DTYPE_BOOL: NDArray.DType.ValueType  # 3
+    """Represents a bool type."""
+    DTYPE_INT32: NDArray.DType.ValueType  # 4
+    """Represents an int32 type."""
+    DTYPE_INT64: NDArray.DType.ValueType  # 5
+    """Represents an int64 type."""
+    DTYPE_UINT32: NDArray.DType.ValueType  # 6
+    """Represents a uint32 type."""
+    DTYPE_UINT64: NDArray.DType.ValueType  # 7
+    """Represents a uint64 type."""
+    DTYPE_STRING: NDArray.DType.ValueType  # 8
+    """Represents a string type."""
+
+    DTYPE_FIELD_NUMBER: builtins.int
+    SHAPE_FIELD_NUMBER: builtins.int
+    STRING_VALUES_FIELD_NUMBER: builtins.int
+    FLOAT_VALUES_FIELD_NUMBER: builtins.int
+    DOUBLE_VALUES_FIELD_NUMBER: builtins.int
+    BOOL_VALUES_FIELD_NUMBER: builtins.int
+    INT32_VALUES_FIELD_NUMBER: builtins.int
+    INT64_VALUES_FIELD_NUMBER: builtins.int
+    UINT32_VALUES_FIELD_NUMBER: builtins.int
+    UINT64_VALUES_FIELD_NUMBER: builtins.int
+    dtype: global___NDArray.DType.ValueType
+    """DTYPE is the data type of given array"""
+    @property
+    def shape(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.int]:
+        """shape is the shape of given array."""
+    @property
+    def string_values(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.str]:
+        """represents a string parameter value."""
+    @property
+    def float_values(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.float]:
+        """represents a float parameter value."""
+    @property
+    def double_values(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.float]:
+        """represents a double parameter value."""
+    @property
+    def bool_values(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.bool]:
+        """represents a bool parameter value."""
+    @property
+    def int32_values(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.int]:
+        """represents an int32 parameter value."""
+    @property
+    def int64_values(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.int]:
+        """represents an int64 parameter value."""
+    @property
+    def uint32_values(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.int]:
+        """represents a uint32 parameter value."""
+    @property
+    def uint64_values(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.int]:
+        """represents a uint64 parameter value."""
+    def __init__(
+        self,
+        *,
+        dtype: global___NDArray.DType.ValueType = ...,
+        shape: collections.abc.Iterable[builtins.int] | None = ...,
+        string_values: collections.abc.Iterable[builtins.str] | None = ...,
+        float_values: collections.abc.Iterable[builtins.float] | None = ...,
+        double_values: collections.abc.Iterable[builtins.float] | None = ...,
+        bool_values: collections.abc.Iterable[builtins.bool] | None = ...,
+        int32_values: collections.abc.Iterable[builtins.int] | None = ...,
+        int64_values: collections.abc.Iterable[builtins.int] | None = ...,
+        uint32_values: collections.abc.Iterable[builtins.int] | None = ...,
+        uint64_values: collections.abc.Iterable[builtins.int] | None = ...,
+    ) -> None: ...
+    def ClearField(self, field_name: typing_extensions.Literal["bool_values", b"bool_values", "double_values", b"double_values", "dtype", b"dtype", "float_values", b"float_values", "int32_values", b"int32_values", "int64_values", b"int64_values", "shape", b"shape", "string_values", b"string_values", "uint32_values", b"uint32_values", "uint64_values", b"uint64_values"]) -> None: ...
+
+global___NDArray = NDArray
+
+class BentoService(google.protobuf.service.Service, metaclass=abc.ABCMeta):
+    """a gRPC BentoServer."""
+
+    DESCRIPTOR: google.protobuf.descriptor.ServiceDescriptor
+    @abc.abstractmethod
+    def Call(
+        inst: BentoService,
+        rpc_controller: google.protobuf.service.RpcController,
+        request: global___Request,
+        callback: collections.abc.Callable[[global___Response], None] | None,
+    ) -> concurrent.futures.Future[global___Response]:
+        """Call handles methodcaller of given API entrypoint."""
+    @abc.abstractmethod
+    def ServiceMetadata(
+        inst: BentoService,
+        rpc_controller: google.protobuf.service.RpcController,
+        request: global___ServiceMetadataRequest,
+        callback: collections.abc.Callable[[global___ServiceMetadataResponse], None] | None,
+    ) -> concurrent.futures.Future[global___ServiceMetadataResponse]:
+        """ServiceMetadata returns metadata of bentoml.Service."""
+
+class BentoService_Stub(BentoService):
+    """a gRPC BentoServer."""
+
+    def __init__(self, rpc_channel: google.protobuf.service.RpcChannel) -> None: ...
+    DESCRIPTOR: google.protobuf.descriptor.ServiceDescriptor
+    def Call(
+        inst: BentoService_Stub,
+        rpc_controller: google.protobuf.service.RpcController,
+        request: global___Request,
+        callback: collections.abc.Callable[[global___Response], None] | None = ...,
+    ) -> concurrent.futures.Future[global___Response]:
+        """Call handles methodcaller of given API entrypoint."""
+    def ServiceMetadata(
+        inst: BentoService_Stub,
+        rpc_controller: google.protobuf.service.RpcController,
+        request: global___ServiceMetadataRequest,
+        callback: collections.abc.Callable[[global___ServiceMetadataResponse], None] | None = ...,
+    ) -> concurrent.futures.Future[global___ServiceMetadataResponse]:
+        """ServiceMetadata returns metadata of bentoml.Service."""
diff --git a/openllm-client/src/openllm_client/pb/v1/_generated_pb3/service_pb2_grpc.py b/openllm-client/src/openllm_client/pb/v1/_generated_pb3/service_pb2_grpc.py
new file mode 100644
index 00000000..cff4c957
--- /dev/null
+++ b/openllm-client/src/openllm_client/pb/v1/_generated_pb3/service_pb2_grpc.py
@@ -0,0 +1,104 @@
+# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
+"""Client and server classes corresponding to protobuf-defined services."""
+import grpc
+
+import service_pb2 as service__pb2
+
+
+class BentoServiceStub(object):
+    """a gRPC BentoServer.
+    """
+
+    def __init__(self, channel):
+        """Constructor.
+
+        Args:
+            channel: A grpc.Channel.
+        """
+        self.Call = channel.unary_unary(
+                '/bentoml.grpc.v1.BentoService/Call',
+                request_serializer=service__pb2.Request.SerializeToString,
+                response_deserializer=service__pb2.Response.FromString,
+                )
+        self.ServiceMetadata = channel.unary_unary(
+                '/bentoml.grpc.v1.BentoService/ServiceMetadata',
+                request_serializer=service__pb2.ServiceMetadataRequest.SerializeToString,
+                response_deserializer=service__pb2.ServiceMetadataResponse.FromString,
+                )
+
+
+class BentoServiceServicer(object):
+    """a gRPC BentoServer.
+    """
+
+    def Call(self, request, context):
+        """Call handles methodcaller of given API entrypoint.
+        """
+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+        context.set_details('Method not implemented!')
+        raise NotImplementedError('Method not implemented!')
+
+    def ServiceMetadata(self, request, context):
+        """ServiceMetadata returns metadata of bentoml.Service.
+        """
+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+        context.set_details('Method not implemented!')
+        raise NotImplementedError('Method not implemented!')
+
+
+def add_BentoServiceServicer_to_server(servicer, server):
+    rpc_method_handlers = {
+            'Call': grpc.unary_unary_rpc_method_handler(
+                    servicer.Call,
+                    request_deserializer=service__pb2.Request.FromString,
+                    response_serializer=service__pb2.Response.SerializeToString,
+            ),
+            'ServiceMetadata': grpc.unary_unary_rpc_method_handler(
+                    servicer.ServiceMetadata,
+                    request_deserializer=service__pb2.ServiceMetadataRequest.FromString,
+                    response_serializer=service__pb2.ServiceMetadataResponse.SerializeToString,
+            ),
+    }
+    generic_handler = grpc.method_handlers_generic_handler(
+            'bentoml.grpc.v1.BentoService', rpc_method_handlers)
+    server.add_generic_rpc_handlers((generic_handler,))
+
+
+ # This class is part of an EXPERIMENTAL API.
+class BentoService(object):
+    """a gRPC BentoServer.
+    """
+
+    @staticmethod
+    def Call(request,
+            target,
+            options=(),
+            channel_credentials=None,
+            call_credentials=None,
+            insecure=False,
+            compression=None,
+            wait_for_ready=None,
+            timeout=None,
+            metadata=None):
+        return grpc.experimental.unary_unary(request, target, '/bentoml.grpc.v1.BentoService/Call',
+            service__pb2.Request.SerializeToString,
+            service__pb2.Response.FromString,
+            options, channel_credentials,
+            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
+
+    @staticmethod
+    def ServiceMetadata(request,
+            target,
+            options=(),
+            channel_credentials=None,
+            call_credentials=None,
+            insecure=False,
+            compression=None,
+            wait_for_ready=None,
+            timeout=None,
+            metadata=None):
+        return grpc.experimental.unary_unary(request, target, '/bentoml.grpc.v1.BentoService/ServiceMetadata',
+            service__pb2.ServiceMetadataRequest.SerializeToString,
+            service__pb2.ServiceMetadataResponse.FromString,
+            options, channel_credentials,
+            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
diff --git a/openllm-client/src/openllm_client/pb/v1/_generated_pb3/service_pb2_grpc.pyi b/openllm-client/src/openllm_client/pb/v1/_generated_pb3/service_pb2_grpc.pyi
new file mode 100644
index 00000000..423ed190
--- /dev/null
+++ b/openllm-client/src/openllm_client/pb/v1/_generated_pb3/service_pb2_grpc.pyi
@@ -0,0 +1,42 @@
+"""
+@generated by mypy-protobuf.  Do not edit manually!
+isort:skip_file
+Vendored from: https://github.com/bentoml/BentoML/blob/main/src/bentoml/grpc/v1/service.proto"""
+import abc
+import grpc
+import service_pb2
+
+class BentoServiceStub:
+    """a gRPC BentoServer."""
+
+    def __init__(self, channel: grpc.Channel) -> None: ...
+    Call: grpc.UnaryUnaryMultiCallable[
+        service_pb2.Request,
+        service_pb2.Response,
+    ]
+    """Call handles methodcaller of given API entrypoint."""
+    ServiceMetadata: grpc.UnaryUnaryMultiCallable[
+        service_pb2.ServiceMetadataRequest,
+        service_pb2.ServiceMetadataResponse,
+    ]
+    """ServiceMetadata returns metadata of bentoml.Service."""
+
+class BentoServiceServicer(metaclass=abc.ABCMeta):
+    """a gRPC BentoServer."""
+
+    @abc.abstractmethod
+    def Call(
+        self,
+        request: service_pb2.Request,
+        context: grpc.ServicerContext,
+    ) -> service_pb2.Response:
+        """Call handles methodcaller of given API entrypoint."""
+    @abc.abstractmethod
+    def ServiceMetadata(
+        self,
+        request: service_pb2.ServiceMetadataRequest,
+        context: grpc.ServicerContext,
+    ) -> service_pb2.ServiceMetadataResponse:
+        """ServiceMetadata returns metadata of bentoml.Service."""
+
+def add_BentoServiceServicer_to_server(servicer: BentoServiceServicer, server: grpc.Server) -> None: ...
diff --git a/openllm-client/src/openllm_client/pb/v1/_generated_pb4/__init__.py b/openllm-client/src/openllm_client/pb/v1/_generated_pb4/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/openllm-client/src/openllm_client/pb/v1/_generated_pb4/service_pb2.py b/openllm-client/src/openllm_client/pb/v1/_generated_pb4/service_pb2.py
new file mode 100644
index 00000000..21fd2d2f
--- /dev/null
+++ b/openllm-client/src/openllm_client/pb/v1/_generated_pb4/service_pb2.py
@@ -0,0 +1,84 @@
+# -*- coding: utf-8 -*-
+# Generated by the protocol buffer compiler.  DO NOT EDIT!
+# source: service.proto
+"""Generated protocol buffer code."""
+from google.protobuf import descriptor as _descriptor
+from google.protobuf import descriptor_pool as _descriptor_pool
+from google.protobuf import symbol_database as _symbol_database
+from google.protobuf.internal import builder as _builder
+# @@protoc_insertion_point(imports)
+
+_sym_db = _symbol_database.Default()
+
+
+from google.protobuf import struct_pb2 as google_dot_protobuf_dot_struct__pb2
+from google.protobuf import wrappers_pb2 as google_dot_protobuf_dot_wrappers__pb2
+
+
+DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\rservice.proto\x12\x0f\x62\x65ntoml.grpc.v1\x1a\x1cgoogle/protobuf/struct.proto\x1a\x1egoogle/protobuf/wrappers.proto\"\x18\n\x16ServiceMetadataRequest\"\xde\x03\n\x17ServiceMetadataResponse\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x43\n\x04\x61pis\x18\x02 \x03(\x0b\x32\x35.bentoml.grpc.v1.ServiceMetadataResponse.InferenceAPI\x12\x0c\n\x04\x64ocs\x18\x03 \x01(\t\x1ao\n\x12\x44\x65scriptorMetadata\x12\x1a\n\rdescriptor_id\x18\x01 \x01(\tH\x00\x88\x01\x01\x12+\n\nattributes\x18\x02 \x01(\x0b\x32\x17.google.protobuf.StructB\x10\n\x0e_descriptor_id\x1a\xf0\x01\n\x0cInferenceAPI\x12\x0c\n\x04name\x18\x01 \x01(\t\x12O\n\x05input\x18\x02 \x01(\x0b\x32;.bentoml.grpc.v1.ServiceMetadataResponse.DescriptorMetadataH\x00\x88\x01\x01\x12P\n\x06output\x18\x03 \x01(\x0b\x32;.bentoml.grpc.v1.ServiceMetadataResponse.DescriptorMetadataH\x01\x88\x01\x01\x12\x11\n\x04\x64ocs\x18\x04 \x01(\tH\x02\x88\x01\x01\x42\x08\n\x06_inputB\t\n\x07_outputB\x07\n\x05_docs\"\x85\x03\n\x07Request\x12\x10\n\x08\x61pi_name\x18\x01 \x01(\t\x12+\n\x07ndarray\x18\x03 \x01(\x0b\x32\x18.bentoml.grpc.v1.NDArrayH\x00\x12/\n\tdataframe\x18\x05 \x01(\x0b\x32\x1a.bentoml.grpc.v1.DataFrameH\x00\x12)\n\x06series\x18\x06 \x01(\x0b\x32\x17.bentoml.grpc.v1.SeriesH\x00\x12%\n\x04\x66ile\x18\x07 \x01(\x0b\x32\x15.bentoml.grpc.v1.FileH\x00\x12,\n\x04text\x18\x08 \x01(\x0b\x32\x1c.google.protobuf.StringValueH\x00\x12&\n\x04json\x18\t \x01(\x0b\x32\x16.google.protobuf.ValueH\x00\x12/\n\tmultipart\x18\n \x01(\x0b\x32\x1a.bentoml.grpc.v1.MultipartH\x00\x12\x1a\n\x10serialized_bytes\x18\x02 \x01(\x0cH\x00\x42\t\n\x07\x63ontentJ\x04\x08\x04\x10\x05J\x04\x08\x0b\x10\x0e\"\xf4\x02\n\x08Response\x12+\n\x07ndarray\x18\x01 \x01(\x0b\x32\x18.bentoml.grpc.v1.NDArrayH\x00\x12/\n\tdataframe\x18\x03 \x01(\x0b\x32\x1a.bentoml.grpc.v1.DataFrameH\x00\x12)\n\x06series\x18\x05 \x01(\x0b\x32\x17.bentoml.grpc.v1.SeriesH\x00\x12%\n\x04\x66ile\x18\x06 
\x01(\x0b\x32\x15.bentoml.grpc.v1.FileH\x00\x12,\n\x04text\x18\x07 \x01(\x0b\x32\x1c.google.protobuf.StringValueH\x00\x12&\n\x04json\x18\x08 \x01(\x0b\x32\x16.google.protobuf.ValueH\x00\x12/\n\tmultipart\x18\t \x01(\x0b\x32\x1a.bentoml.grpc.v1.MultipartH\x00\x12\x1a\n\x10serialized_bytes\x18\x02 \x01(\x0cH\x00\x42\t\n\x07\x63ontentJ\x04\x08\x04\x10\x05J\x04\x08\n\x10\x0e\"\xc6\x02\n\x04Part\x12+\n\x07ndarray\x18\x01 \x01(\x0b\x32\x18.bentoml.grpc.v1.NDArrayH\x00\x12/\n\tdataframe\x18\x03 \x01(\x0b\x32\x1a.bentoml.grpc.v1.DataFrameH\x00\x12)\n\x06series\x18\x05 \x01(\x0b\x32\x17.bentoml.grpc.v1.SeriesH\x00\x12%\n\x04\x66ile\x18\x06 \x01(\x0b\x32\x15.bentoml.grpc.v1.FileH\x00\x12,\n\x04text\x18\x07 \x01(\x0b\x32\x1c.google.protobuf.StringValueH\x00\x12&\n\x04json\x18\x08 \x01(\x0b\x32\x16.google.protobuf.ValueH\x00\x12\x1a\n\x10serialized_bytes\x18\x04 \x01(\x0cH\x00\x42\x10\n\x0erepresentationJ\x04\x08\x02\x10\x03J\x04\x08\t\x10\x0e\"\x89\x01\n\tMultipart\x12\x36\n\x06\x66ields\x18\x01 \x03(\x0b\x32&.bentoml.grpc.v1.Multipart.FieldsEntry\x1a\x44\n\x0b\x46ieldsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12$\n\x05value\x18\x02 \x01(\x0b\x32\x15.bentoml.grpc.v1.Part:\x02\x38\x01\"3\n\x04\x46ile\x12\x11\n\x04kind\x18\x03 \x01(\tH\x00\x88\x01\x01\x12\x0f\n\x07\x63ontent\x18\x02 \x01(\x0c\x42\x07\n\x05_kind\"K\n\tDataFrame\x12\x14\n\x0c\x63olumn_names\x18\x01 \x03(\t\x12(\n\x07\x63olumns\x18\x02 \x03(\x0b\x32\x17.bentoml.grpc.v1.Series\"\xa1\x01\n\x06Series\x12\x17\n\x0b\x62ool_values\x18\x01 \x03(\x08\x42\x02\x10\x01\x12\x18\n\x0c\x66loat_values\x18\x02 \x03(\x02\x42\x02\x10\x01\x12\x18\n\x0cint32_values\x18\x03 \x03(\x05\x42\x02\x10\x01\x12\x18\n\x0cint64_values\x18\x06 \x03(\x03\x42\x02\x10\x01\x12\x15\n\rstring_values\x18\x05 \x03(\t\x12\x19\n\rdouble_values\x18\x04 \x03(\x01\x42\x02\x10\x01\"\xc2\x03\n\x07NDArray\x12-\n\x05\x64type\x18\x01 \x01(\x0e\x32\x1e.bentoml.grpc.v1.NDArray.DType\x12\r\n\x05shape\x18\x02 \x03(\x05\x12\x15\n\rstring_values\x18\x05 
\x03(\t\x12\x18\n\x0c\x66loat_values\x18\x03 \x03(\x02\x42\x02\x10\x01\x12\x19\n\rdouble_values\x18\x04 \x03(\x01\x42\x02\x10\x01\x12\x17\n\x0b\x62ool_values\x18\x06 \x03(\x08\x42\x02\x10\x01\x12\x18\n\x0cint32_values\x18\x07 \x03(\x05\x42\x02\x10\x01\x12\x18\n\x0cint64_values\x18\x08 \x03(\x03\x42\x02\x10\x01\x12\x19\n\ruint32_values\x18\t \x03(\rB\x02\x10\x01\x12\x19\n\ruint64_values\x18\n \x03(\x04\x42\x02\x10\x01\"\xa9\x01\n\x05\x44Type\x12\x15\n\x11\x44TYPE_UNSPECIFIED\x10\x00\x12\x0f\n\x0b\x44TYPE_FLOAT\x10\x01\x12\x10\n\x0c\x44TYPE_DOUBLE\x10\x02\x12\x0e\n\nDTYPE_BOOL\x10\x03\x12\x0f\n\x0b\x44TYPE_INT32\x10\x04\x12\x0f\n\x0b\x44TYPE_INT64\x10\x05\x12\x10\n\x0c\x44TYPE_UINT32\x10\x06\x12\x10\n\x0c\x44TYPE_UINT64\x10\x07\x12\x10\n\x0c\x44TYPE_STRING\x10\x08\x32\xb5\x01\n\x0c\x42\x65ntoService\x12=\n\x04\x43\x61ll\x12\x18.bentoml.grpc.v1.Request\x1a\x19.bentoml.grpc.v1.Response\"\x00\x12\x66\n\x0fServiceMetadata\x12\'.bentoml.grpc.v1.ServiceMetadataRequest\x1a(.bentoml.grpc.v1.ServiceMetadataResponse\"\x00\x42]\n\x13\x63om.bentoml.grpc.v1B\x0cServiceProtoP\x01Z*github.com/bentoml/bentoml/grpc/v1;service\x90\x01\x01\xf8\x01\x01\xa2\x02\x03SVCb\x06proto3')
+
+_globals = globals()
+_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
+_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'service_pb2', _globals)
+if _descriptor._USE_C_DESCRIPTORS == False:
+
+  DESCRIPTOR._options = None
+  DESCRIPTOR._serialized_options = b'\n\023com.bentoml.grpc.v1B\014ServiceProtoP\001Z*github.com/bentoml/bentoml/grpc/v1;service\220\001\001\370\001\001\242\002\003SVC'
+  _MULTIPART_FIELDSENTRY._options = None
+  _MULTIPART_FIELDSENTRY._serialized_options = b'8\001'
+  _SERIES.fields_by_name['bool_values']._options = None
+  _SERIES.fields_by_name['bool_values']._serialized_options = b'\020\001'
+  _SERIES.fields_by_name['float_values']._options = None
+  _SERIES.fields_by_name['float_values']._serialized_options = b'\020\001'
+  _SERIES.fields_by_name['int32_values']._options = None
+  _SERIES.fields_by_name['int32_values']._serialized_options = b'\020\001'
+  _SERIES.fields_by_name['int64_values']._options = None
+  _SERIES.fields_by_name['int64_values']._serialized_options = b'\020\001'
+  _SERIES.fields_by_name['double_values']._options = None
+  _SERIES.fields_by_name['double_values']._serialized_options = b'\020\001'
+  _NDARRAY.fields_by_name['float_values']._options = None
+  _NDARRAY.fields_by_name['float_values']._serialized_options = b'\020\001'
+  _NDARRAY.fields_by_name['double_values']._options = None
+  _NDARRAY.fields_by_name['double_values']._serialized_options = b'\020\001'
+  _NDARRAY.fields_by_name['bool_values']._options = None
+  _NDARRAY.fields_by_name['bool_values']._serialized_options = b'\020\001'
+  _NDARRAY.fields_by_name['int32_values']._options = None
+  _NDARRAY.fields_by_name['int32_values']._serialized_options = b'\020\001'
+  _NDARRAY.fields_by_name['int64_values']._options = None
+  _NDARRAY.fields_by_name['int64_values']._serialized_options = b'\020\001'
+  _NDARRAY.fields_by_name['uint32_values']._options = None
+  _NDARRAY.fields_by_name['uint32_values']._serialized_options = b'\020\001'
+  _NDARRAY.fields_by_name['uint64_values']._options = None
+  _NDARRAY.fields_by_name['uint64_values']._serialized_options = b'\020\001'
+  _globals['_SERVICEMETADATAREQUEST']._serialized_start=96
+  _globals['_SERVICEMETADATAREQUEST']._serialized_end=120
+  _globals['_SERVICEMETADATARESPONSE']._serialized_start=123
+  _globals['_SERVICEMETADATARESPONSE']._serialized_end=601
+  _globals['_SERVICEMETADATARESPONSE_DESCRIPTORMETADATA']._serialized_start=247
+  _globals['_SERVICEMETADATARESPONSE_DESCRIPTORMETADATA']._serialized_end=358
+  _globals['_SERVICEMETADATARESPONSE_INFERENCEAPI']._serialized_start=361
+  _globals['_SERVICEMETADATARESPONSE_INFERENCEAPI']._serialized_end=601
+  _globals['_REQUEST']._serialized_start=604
+  _globals['_REQUEST']._serialized_end=993
+  _globals['_RESPONSE']._serialized_start=996
+  _globals['_RESPONSE']._serialized_end=1368
+  _globals['_PART']._serialized_start=1371
+  _globals['_PART']._serialized_end=1697
+  _globals['_MULTIPART']._serialized_start=1700
+  _globals['_MULTIPART']._serialized_end=1837
+  _globals['_MULTIPART_FIELDSENTRY']._serialized_start=1769
+  _globals['_MULTIPART_FIELDSENTRY']._serialized_end=1837
+  _globals['_FILE']._serialized_start=1839
+  _globals['_FILE']._serialized_end=1890
+  _globals['_DATAFRAME']._serialized_start=1892
+  _globals['_DATAFRAME']._serialized_end=1967
+  _globals['_SERIES']._serialized_start=1970
+  _globals['_SERIES']._serialized_end=2131
+  _globals['_NDARRAY']._serialized_start=2134
+  _globals['_NDARRAY']._serialized_end=2584
+  _globals['_NDARRAY_DTYPE']._serialized_start=2415
+  _globals['_NDARRAY_DTYPE']._serialized_end=2584
+  _globals['_BENTOSERVICE']._serialized_start=2587
+  _globals['_BENTOSERVICE']._serialized_end=2768
+_builder.BuildServices(DESCRIPTOR, 'service_pb2', _globals)
+# @@protoc_insertion_point(module_scope)
diff --git a/openllm-client/src/openllm_client/pb/v1/_generated_pb4/service_pb2.pyi b/openllm-client/src/openllm_client/pb/v1/_generated_pb4/service_pb2.pyi
new file mode 100644
index 00000000..3290dc12
--- /dev/null
+++ b/openllm-client/src/openllm_client/pb/v1/_generated_pb4/service_pb2.pyi
@@ -0,0 +1,624 @@
+"""
+@generated by mypy-protobuf.  Do not edit manually!
+isort:skip_file
+Vendored from: https://github.com/bentoml/BentoML/blob/main/src/bentoml/grpc/v1/service.proto"""
+import abc
+import builtins
+import collections.abc
+import concurrent.futures
+import google.protobuf.descriptor
+import google.protobuf.internal.containers
+import google.protobuf.internal.enum_type_wrapper
+import google.protobuf.message
+import google.protobuf.service
+import google.protobuf.struct_pb2
+import google.protobuf.wrappers_pb2
+import sys
+import typing
+
+if sys.version_info >= (3, 10):
+    import typing as typing_extensions
+else:
+    import typing_extensions
+
+DESCRIPTOR: google.protobuf.descriptor.FileDescriptor
+
+@typing_extensions.final
+class ServiceMetadataRequest(google.protobuf.message.Message):
+    """ServiceMetadataRequest message doesn't take any arguments."""
+
+    DESCRIPTOR: google.protobuf.descriptor.Descriptor
+
+    def __init__(
+        self,
+    ) -> None: ...
+
+global___ServiceMetadataRequest = ServiceMetadataRequest
+
+@typing_extensions.final
+class ServiceMetadataResponse(google.protobuf.message.Message):
+    """ServiceMetadataResponse returns metadata of bentoml.Service.
+    Currently it includes name, apis, and docs.
+    """
+
+    DESCRIPTOR: google.protobuf.descriptor.Descriptor
+
+    @typing_extensions.final
+    class DescriptorMetadata(google.protobuf.message.Message):
+        """DescriptorMetadata is a metadata of any given IODescriptor."""
+
+        DESCRIPTOR: google.protobuf.descriptor.Descriptor
+
+        DESCRIPTOR_ID_FIELD_NUMBER: builtins.int
+        ATTRIBUTES_FIELD_NUMBER: builtins.int
+        descriptor_id: builtins.str
+        """descriptor_id describes the given ID of the descriptor, which matches with our OpenAPI definition."""
+        @property
+        def attributes(self) -> google.protobuf.struct_pb2.Struct:
+            """attributes is the kwargs of the given descriptor."""
+        def __init__(
+            self,
+            *,
+            descriptor_id: builtins.str | None = ...,
+            attributes: google.protobuf.struct_pb2.Struct | None = ...,
+        ) -> None: ...
+        def HasField(self, field_name: typing_extensions.Literal["_descriptor_id", b"_descriptor_id", "attributes", b"attributes", "descriptor_id", b"descriptor_id"]) -> builtins.bool: ...
+        def ClearField(self, field_name: typing_extensions.Literal["_descriptor_id", b"_descriptor_id", "attributes", b"attributes", "descriptor_id", b"descriptor_id"]) -> None: ...
+        def WhichOneof(self, oneof_group: typing_extensions.Literal["_descriptor_id", b"_descriptor_id"]) -> typing_extensions.Literal["descriptor_id"] | None: ...
+
+    @typing_extensions.final
+    class InferenceAPI(google.protobuf.message.Message):
+        """InferenceAPI is bentoml._internal.service.inference_api.InferenceAPI
+        that is exposed to gRPC client.
+        There is no way for reflection to get information of given @svc.api.
+        """
+
+        DESCRIPTOR: google.protobuf.descriptor.Descriptor
+
+        NAME_FIELD_NUMBER: builtins.int
+        INPUT_FIELD_NUMBER: builtins.int
+        OUTPUT_FIELD_NUMBER: builtins.int
+        DOCS_FIELD_NUMBER: builtins.int
+        name: builtins.str
+        """name is the name of the API."""
+        @property
+        def input(self) -> global___ServiceMetadataResponse.DescriptorMetadata:
+            """input is the input descriptor of the API."""
+        @property
+        def output(self) -> global___ServiceMetadataResponse.DescriptorMetadata:
+            """output is the output descriptor of the API."""
+        docs: builtins.str
+        """docs is the optional documentation of the API."""
+        def __init__(
+            self,
+            *,
+            name: builtins.str = ...,
+            input: global___ServiceMetadataResponse.DescriptorMetadata | None = ...,
+            output: global___ServiceMetadataResponse.DescriptorMetadata | None = ...,
+            docs: builtins.str | None = ...,
+        ) -> None: ...
+        def HasField(self, field_name: typing_extensions.Literal["_docs", b"_docs", "_input", b"_input", "_output", b"_output", "docs", b"docs", "input", b"input", "output", b"output"]) -> builtins.bool: ...
+        def ClearField(self, field_name: typing_extensions.Literal["_docs", b"_docs", "_input", b"_input", "_output", b"_output", "docs", b"docs", "input", b"input", "name", b"name", "output", b"output"]) -> None: ...
+        @typing.overload
+        def WhichOneof(self, oneof_group: typing_extensions.Literal["_docs", b"_docs"]) -> typing_extensions.Literal["docs"] | None: ...
+        @typing.overload
+        def WhichOneof(self, oneof_group: typing_extensions.Literal["_input", b"_input"]) -> typing_extensions.Literal["input"] | None: ...
+        @typing.overload
+        def WhichOneof(self, oneof_group: typing_extensions.Literal["_output", b"_output"]) -> typing_extensions.Literal["output"] | None: ...
+
+    NAME_FIELD_NUMBER: builtins.int
+    APIS_FIELD_NUMBER: builtins.int
+    DOCS_FIELD_NUMBER: builtins.int
+    name: builtins.str
+    """name is the service name."""
+    @property
+    def apis(self) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[global___ServiceMetadataResponse.InferenceAPI]:
+        """apis holds a list of InferenceAPI of the service."""
+    docs: builtins.str
+    """docs is the documentation of the service."""
+    def __init__(
+        self,
+        *,
+        name: builtins.str = ...,
+        apis: collections.abc.Iterable[global___ServiceMetadataResponse.InferenceAPI] | None = ...,
+        docs: builtins.str = ...,
+    ) -> None: ...
+    def ClearField(self, field_name: typing_extensions.Literal["apis", b"apis", "docs", b"docs", "name", b"name"]) -> None: ...
+
+global___ServiceMetadataResponse = ServiceMetadataResponse
+
+@typing_extensions.final
+class Request(google.protobuf.message.Message):
+    """Request message for incoming Call."""
+
+    DESCRIPTOR: google.protobuf.descriptor.Descriptor
+
+    API_NAME_FIELD_NUMBER: builtins.int
+    NDARRAY_FIELD_NUMBER: builtins.int
+    DATAFRAME_FIELD_NUMBER: builtins.int
+    SERIES_FIELD_NUMBER: builtins.int
+    FILE_FIELD_NUMBER: builtins.int
+    TEXT_FIELD_NUMBER: builtins.int
+    JSON_FIELD_NUMBER: builtins.int
+    MULTIPART_FIELD_NUMBER: builtins.int
+    SERIALIZED_BYTES_FIELD_NUMBER: builtins.int
+    api_name: builtins.str
+    """api_name defines the API entrypoint to call.
+    api_name is the name of the function defined in bentoml.Service.
+    Example:
+
+        @svc.api(input=NumpyNdarray(), output=File())
+        def predict(input: NDArray[float]) -> bytes:
+            ...
+
+        api_name is "predict" in this case.
+    """
+    @property
+    def ndarray(self) -> global___NDArray:
+        """NDArray represents a n-dimensional array of arbitrary type."""
+    @property
+    def dataframe(self) -> global___DataFrame:
+        """DataFrame represents any tabular data type. We are using
+        DataFrame as a trivial representation for tabular type.
+        """
+    @property
+    def series(self) -> global___Series:
+        """Series portrays a series of values. This can be used for
+        representing Series types in tabular data.
+        """
+    @property
+    def file(self) -> global___File:
+        """File represents any arbitrary file type. This can be
+        plaintext, image, video, audio, etc.
+        """
+    @property
+    def text(self) -> google.protobuf.wrappers_pb2.StringValue:
+        """Text represents a string value."""
+    @property
+    def json(self) -> google.protobuf.struct_pb2.Value:
+        """JSON is represented by using google.protobuf.Value.
+        see https://github.com/protocolbuffers/protobuf/blob/main/src/google/protobuf/struct.proto
+        """
+    @property
+    def multipart(self) -> global___Multipart:
+        """Multipart represents a multipart message.
+        It comprises a mapping from given type name to a subset of aforementioned types.
+        """
+    serialized_bytes: builtins.bytes
+    """serialized_bytes is for data serialized in BentoML's internal serialization format."""
+    def __init__(
+        self,
+        *,
+        api_name: builtins.str = ...,
+        ndarray: global___NDArray | None = ...,
+        dataframe: global___DataFrame | None = ...,
+        series: global___Series | None = ...,
+        file: global___File | None = ...,
+        text: google.protobuf.wrappers_pb2.StringValue | None = ...,
+        json: google.protobuf.struct_pb2.Value | None = ...,
+        multipart: global___Multipart | None = ...,
+        serialized_bytes: builtins.bytes = ...,
+    ) -> None: ...
+    def HasField(self, field_name: typing_extensions.Literal["content", b"content", "dataframe", b"dataframe", "file", b"file", "json", b"json", "multipart", b"multipart", "ndarray", b"ndarray", "serialized_bytes", b"serialized_bytes", "series", b"series", "text", b"text"]) -> builtins.bool: ...
+    def ClearField(self, field_name: typing_extensions.Literal["api_name", b"api_name", "content", b"content", "dataframe", b"dataframe", "file", b"file", "json", b"json", "multipart", b"multipart", "ndarray", b"ndarray", "serialized_bytes", b"serialized_bytes", "series", b"series", "text", b"text"]) -> None: ...
+    def WhichOneof(self, oneof_group: typing_extensions.Literal["content", b"content"]) -> typing_extensions.Literal["ndarray", "dataframe", "series", "file", "text", "json", "multipart", "serialized_bytes"] | None: ...
+
+global___Request = Request
+
+@typing_extensions.final
+class Response(google.protobuf.message.Message):
+    """Response message returned by Call."""
+
+    DESCRIPTOR: google.protobuf.descriptor.Descriptor
+
+    NDARRAY_FIELD_NUMBER: builtins.int
+    DATAFRAME_FIELD_NUMBER: builtins.int
+    SERIES_FIELD_NUMBER: builtins.int
+    FILE_FIELD_NUMBER: builtins.int
+    TEXT_FIELD_NUMBER: builtins.int
+    JSON_FIELD_NUMBER: builtins.int
+    MULTIPART_FIELD_NUMBER: builtins.int
+    SERIALIZED_BYTES_FIELD_NUMBER: builtins.int
+    @property
+    def ndarray(self) -> global___NDArray:
+        """NDArray represents a n-dimensional array of arbitrary type."""
+    @property
+    def dataframe(self) -> global___DataFrame:
+        """DataFrame represents any tabular data type. We are using
+        DataFrame as a trivial representation for tabular type.
+        """
+    @property
+    def series(self) -> global___Series:
+        """Series portrays a series of values. This can be used for
+        representing Series types in tabular data.
+        """
+    @property
+    def file(self) -> global___File:
+        """File represents any arbitrary file type. This can be
+        plaintext, image, video, audio, etc.
+        """
+    @property
+    def text(self) -> google.protobuf.wrappers_pb2.StringValue:
+        """Text represents a string value."""
+    @property
+    def json(self) -> google.protobuf.struct_pb2.Value:
+        """JSON is represented by using google.protobuf.Value.
+        see https://github.com/protocolbuffers/protobuf/blob/main/src/google/protobuf/struct.proto
+        """
+    @property
+    def multipart(self) -> global___Multipart:
+        """Multipart represents a multipart message.
+        It comprises a mapping from given type name to a subset of aforementioned types.
+        """
+    serialized_bytes: builtins.bytes
+    """serialized_bytes is for data serialized in BentoML's internal serialization format."""
+    def __init__(
+        self,
+        *,
+        ndarray: global___NDArray | None = ...,
+        dataframe: global___DataFrame | None = ...,
+        series: global___Series | None = ...,
+        file: global___File | None = ...,
+        text: google.protobuf.wrappers_pb2.StringValue | None = ...,
+        json: google.protobuf.struct_pb2.Value | None = ...,
+        multipart: global___Multipart | None = ...,
+        serialized_bytes: builtins.bytes = ...,
+    ) -> None: ...
+    def HasField(self, field_name: typing_extensions.Literal["content", b"content", "dataframe", b"dataframe", "file", b"file", "json", b"json", "multipart", b"multipart", "ndarray", b"ndarray", "serialized_bytes", b"serialized_bytes", "series", b"series", "text", b"text"]) -> builtins.bool: ...
+    def ClearField(self, field_name: typing_extensions.Literal["content", b"content", "dataframe", b"dataframe", "file", b"file", "json", b"json", "multipart", b"multipart", "ndarray", b"ndarray", "serialized_bytes", b"serialized_bytes", "series", b"series", "text", b"text"]) -> None: ...
+    def WhichOneof(self, oneof_group: typing_extensions.Literal["content", b"content"]) -> typing_extensions.Literal["ndarray", "dataframe", "series", "file", "text", "json", "multipart", "serialized_bytes"] | None: ...
+
+global___Response = Response
+
+@typing_extensions.final
+class Part(google.protobuf.message.Message):
+    """Part represents possible value types for multipart message.
+    These are the same as the types in Request message.
+    """
+
+    DESCRIPTOR: google.protobuf.descriptor.Descriptor
+
+    NDARRAY_FIELD_NUMBER: builtins.int
+    DATAFRAME_FIELD_NUMBER: builtins.int
+    SERIES_FIELD_NUMBER: builtins.int
+    FILE_FIELD_NUMBER: builtins.int
+    TEXT_FIELD_NUMBER: builtins.int
+    JSON_FIELD_NUMBER: builtins.int
+    SERIALIZED_BYTES_FIELD_NUMBER: builtins.int
+    @property
+    def ndarray(self) -> global___NDArray:
+        """NDArray represents a n-dimensional array of arbitrary type."""
+    @property
+    def dataframe(self) -> global___DataFrame:
+        """DataFrame represents any tabular data type. We are using
+        DataFrame as a trivial representation for tabular type.
+        """
+    @property
+    def series(self) -> global___Series:
+        """Series portrays a series of values. This can be used for
+        representing Series types in tabular data.
+        """
+    @property
+    def file(self) -> global___File:
+        """File represents any arbitrary file type. This can be
+        plaintext, image, video, audio, etc.
+        """
+    @property
+    def text(self) -> google.protobuf.wrappers_pb2.StringValue:
+        """Text represents a string value."""
+    @property
+    def json(self) -> google.protobuf.struct_pb2.Value:
+        """JSON is represented by using google.protobuf.Value.
+        see https://github.com/protocolbuffers/protobuf/blob/main/src/google/protobuf/struct.proto
+        """
+    serialized_bytes: builtins.bytes
+    """serialized_bytes is for data serialized in BentoML's internal serialization format."""
+    def __init__(
+        self,
+        *,
+        ndarray: global___NDArray | None = ...,
+        dataframe: global___DataFrame | None = ...,
+        series: global___Series | None = ...,
+        file: global___File | None = ...,
+        text: google.protobuf.wrappers_pb2.StringValue | None = ...,
+        json: google.protobuf.struct_pb2.Value | None = ...,
+        serialized_bytes: builtins.bytes = ...,
+    ) -> None: ...
+    def HasField(self, field_name: typing_extensions.Literal["dataframe", b"dataframe", "file", b"file", "json", b"json", "ndarray", b"ndarray", "representation", b"representation", "serialized_bytes", b"serialized_bytes", "series", b"series", "text", b"text"]) -> builtins.bool: ...
+    def ClearField(self, field_name: typing_extensions.Literal["dataframe", b"dataframe", "file", b"file", "json", b"json", "ndarray", b"ndarray", "representation", b"representation", "serialized_bytes", b"serialized_bytes", "series", b"series", "text", b"text"]) -> None: ...
+    def WhichOneof(self, oneof_group: typing_extensions.Literal["representation", b"representation"]) -> typing_extensions.Literal["ndarray", "dataframe", "series", "file", "text", "json", "serialized_bytes"] | None: ...
+
+global___Part = Part
+
+@typing_extensions.final
+class Multipart(google.protobuf.message.Message):
+    """Multipart represents a multipart message.
+    It comprises a mapping from given type name to a subset of aforementioned types.
+    """
+
+    DESCRIPTOR: google.protobuf.descriptor.Descriptor
+
+    @typing_extensions.final
+    class FieldsEntry(google.protobuf.message.Message):
+        DESCRIPTOR: google.protobuf.descriptor.Descriptor
+
+        KEY_FIELD_NUMBER: builtins.int
+        VALUE_FIELD_NUMBER: builtins.int
+        key: builtins.str
+        @property
+        def value(self) -> global___Part: ...
+        def __init__(
+            self,
+            *,
+            key: builtins.str = ...,
+            value: global___Part | None = ...,
+        ) -> None: ...
+        def HasField(self, field_name: typing_extensions.Literal["value", b"value"]) -> builtins.bool: ...
+        def ClearField(self, field_name: typing_extensions.Literal["key", b"key", "value", b"value"]) -> None: ...
+
+    FIELDS_FIELD_NUMBER: builtins.int
+    @property
+    def fields(self) -> google.protobuf.internal.containers.MessageMap[builtins.str, global___Part]: ...
+    def __init__(
+        self,
+        *,
+        fields: collections.abc.Mapping[builtins.str, global___Part] | None = ...,
+    ) -> None: ...
+    def ClearField(self, field_name: typing_extensions.Literal["fields", b"fields"]) -> None: ...
+
+global___Multipart = Multipart
+
+@typing_extensions.final
+class File(google.protobuf.message.Message):
+    """File represents any arbitrary file type. This can be
+    plaintext, image, video, audio, etc.
+    """
+
+    DESCRIPTOR: google.protobuf.descriptor.Descriptor
+
+    KIND_FIELD_NUMBER: builtins.int
+    CONTENT_FIELD_NUMBER: builtins.int
+    kind: builtins.str
+    """optional file type, let it be csv, text, parquet, etc.
+    v1alpha1 uses 1 as FileType enum.
+    """
+    content: builtins.bytes
+    """contents of file as bytes."""
+    def __init__(
+        self,
+        *,
+        kind: builtins.str | None = ...,
+        content: builtins.bytes = ...,
+    ) -> None: ...
+    def HasField(self, field_name: typing_extensions.Literal["_kind", b"_kind", "kind", b"kind"]) -> builtins.bool: ...
+    def ClearField(self, field_name: typing_extensions.Literal["_kind", b"_kind", "content", b"content", "kind", b"kind"]) -> None: ...
+    def WhichOneof(self, oneof_group: typing_extensions.Literal["_kind", b"_kind"]) -> typing_extensions.Literal["kind"] | None: ...
+
+global___File = File
+
+@typing_extensions.final
+class DataFrame(google.protobuf.message.Message):
+    """DataFrame represents any tabular data type. We are using
+    DataFrame as a trivial representation for tabular type.
+    This message carries given implementation of tabular data based on given orientation.
+    TODO: support index, records, etc.
+    """
+
+    DESCRIPTOR: google.protobuf.descriptor.Descriptor
+
+    COLUMN_NAMES_FIELD_NUMBER: builtins.int
+    COLUMNS_FIELD_NUMBER: builtins.int
+    @property
+    def column_names(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.str]:
+        """column names"""
+    @property
+    def columns(self) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[global___Series]:
+        """columns orient.
+        { column ↠ { index ↠ value } }
+        """
+    def __init__(
+        self,
+        *,
+        column_names: collections.abc.Iterable[builtins.str] | None = ...,
+        columns: collections.abc.Iterable[global___Series] | None = ...,
+    ) -> None: ...
+    def ClearField(self, field_name: typing_extensions.Literal["column_names", b"column_names", "columns", b"columns"]) -> None: ...
+
+global___DataFrame = DataFrame
+
+@typing_extensions.final
+class Series(google.protobuf.message.Message):
+    """Series portrays a series of values. This can be used for
+    representing Series types in tabular data.
+    """
+
+    DESCRIPTOR: google.protobuf.descriptor.Descriptor
+
+    BOOL_VALUES_FIELD_NUMBER: builtins.int
+    FLOAT_VALUES_FIELD_NUMBER: builtins.int
+    INT32_VALUES_FIELD_NUMBER: builtins.int
+    INT64_VALUES_FIELD_NUMBER: builtins.int
+    STRING_VALUES_FIELD_NUMBER: builtins.int
+    DOUBLE_VALUES_FIELD_NUMBER: builtins.int
+    @property
+    def bool_values(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.bool]:
+        """A bool parameter value"""
+    @property
+    def float_values(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.float]:
+        """A float parameter value"""
+    @property
+    def int32_values(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.int]:
+        """An int32 parameter value"""
+    @property
+    def int64_values(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.int]:
+        """An int64 parameter value"""
+    @property
+    def string_values(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.str]:
+        """A string parameter value"""
+    @property
+    def double_values(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.float]:
+        """represents a double parameter value."""
+    def __init__(
+        self,
+        *,
+        bool_values: collections.abc.Iterable[builtins.bool] | None = ...,
+        float_values: collections.abc.Iterable[builtins.float] | None = ...,
+        int32_values: collections.abc.Iterable[builtins.int] | None = ...,
+        int64_values: collections.abc.Iterable[builtins.int] | None = ...,
+        string_values: collections.abc.Iterable[builtins.str] | None = ...,
+        double_values: collections.abc.Iterable[builtins.float] | None = ...,
+    ) -> None: ...
+    def ClearField(self, field_name: typing_extensions.Literal["bool_values", b"bool_values", "double_values", b"double_values", "float_values", b"float_values", "int32_values", b"int32_values", "int64_values", b"int64_values", "string_values", b"string_values"]) -> None: ...
+
+global___Series = Series
+
+@typing_extensions.final
+class NDArray(google.protobuf.message.Message):
+    """NDArray represents an n-dimensional array of arbitrary type."""
+
+    DESCRIPTOR: google.protobuf.descriptor.Descriptor
+
+    class _DType:
+        ValueType = typing.NewType("ValueType", builtins.int)
+        V: typing_extensions.TypeAlias = ValueType
+
+    class _DTypeEnumTypeWrapper(google.protobuf.internal.enum_type_wrapper._EnumTypeWrapper[NDArray._DType.ValueType], builtins.type):
+        DESCRIPTOR: google.protobuf.descriptor.EnumDescriptor
+        DTYPE_UNSPECIFIED: NDArray._DType.ValueType  # 0
+        """Represents a None type."""
+        DTYPE_FLOAT: NDArray._DType.ValueType  # 1
+        """Represents a float type."""
+        DTYPE_DOUBLE: NDArray._DType.ValueType  # 2
+        """Represents a double type."""
+        DTYPE_BOOL: NDArray._DType.ValueType  # 3
+        """Represents a bool type."""
+        DTYPE_INT32: NDArray._DType.ValueType  # 4
+        """Represents an int32 type."""
+        DTYPE_INT64: NDArray._DType.ValueType  # 5
+        """Represents an int64 type."""
+        DTYPE_UINT32: NDArray._DType.ValueType  # 6
+        """Represents a uint32 type."""
+        DTYPE_UINT64: NDArray._DType.ValueType  # 7
+        """Represents a uint64 type."""
+        DTYPE_STRING: NDArray._DType.ValueType  # 8
+        """Represents a string type."""
+
+    class DType(_DType, metaclass=_DTypeEnumTypeWrapper):
+        """Represents data type of a given array."""
+
+    DTYPE_UNSPECIFIED: NDArray.DType.ValueType  # 0
+    """Represents a None type."""
+    DTYPE_FLOAT: NDArray.DType.ValueType  # 1
+    """Represents a float type."""
+    DTYPE_DOUBLE: NDArray.DType.ValueType  # 2
+    """Represents a double type."""
+    DTYPE_BOOL: NDArray.DType.ValueType  # 3
+    """Represents a bool type."""
+    DTYPE_INT32: NDArray.DType.ValueType  # 4
+    """Represents an int32 type."""
+    DTYPE_INT64: NDArray.DType.ValueType  # 5
+    """Represents an int64 type."""
+    DTYPE_UINT32: NDArray.DType.ValueType  # 6
+    """Represents a uint32 type."""
+    DTYPE_UINT64: NDArray.DType.ValueType  # 7
+    """Represents a uint64 type."""
+    DTYPE_STRING: NDArray.DType.ValueType  # 8
+    """Represents a string type."""
+
+    DTYPE_FIELD_NUMBER: builtins.int
+    SHAPE_FIELD_NUMBER: builtins.int
+    STRING_VALUES_FIELD_NUMBER: builtins.int
+    FLOAT_VALUES_FIELD_NUMBER: builtins.int
+    DOUBLE_VALUES_FIELD_NUMBER: builtins.int
+    BOOL_VALUES_FIELD_NUMBER: builtins.int
+    INT32_VALUES_FIELD_NUMBER: builtins.int
+    INT64_VALUES_FIELD_NUMBER: builtins.int
+    UINT32_VALUES_FIELD_NUMBER: builtins.int
+    UINT64_VALUES_FIELD_NUMBER: builtins.int
+    dtype: global___NDArray.DType.ValueType
+    """DTYPE is the data type of given array"""
+    @property
+    def shape(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.int]:
+        """shape is the shape of given array."""
+    @property
+    def string_values(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.str]:
+        """represents a string parameter value."""
+    @property
+    def float_values(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.float]:
+        """represents a float parameter value."""
+    @property
+    def double_values(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.float]:
+        """represents a double parameter value."""
+    @property
+    def bool_values(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.bool]:
+        """represents a bool parameter value."""
+    @property
+    def int32_values(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.int]:
+        """represents an int32 parameter value."""
+    @property
+    def int64_values(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.int]:
+        """represents an int64 parameter value."""
+    @property
+    def uint32_values(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.int]:
+        """represents a uint32 parameter value."""
+    @property
+    def uint64_values(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.int]:
+        """represents a uint64 parameter value."""
+    def __init__(
+        self,
+        *,
+        dtype: global___NDArray.DType.ValueType = ...,
+        shape: collections.abc.Iterable[builtins.int] | None = ...,
+        string_values: collections.abc.Iterable[builtins.str] | None = ...,
+        float_values: collections.abc.Iterable[builtins.float] | None = ...,
+        double_values: collections.abc.Iterable[builtins.float] | None = ...,
+        bool_values: collections.abc.Iterable[builtins.bool] | None = ...,
+        int32_values: collections.abc.Iterable[builtins.int] | None = ...,
+        int64_values: collections.abc.Iterable[builtins.int] | None = ...,
+        uint32_values: collections.abc.Iterable[builtins.int] | None = ...,
+        uint64_values: collections.abc.Iterable[builtins.int] | None = ...,
+    ) -> None: ...
+    def ClearField(self, field_name: typing_extensions.Literal["bool_values", b"bool_values", "double_values", b"double_values", "dtype", b"dtype", "float_values", b"float_values", "int32_values", b"int32_values", "int64_values", b"int64_values", "shape", b"shape", "string_values", b"string_values", "uint32_values", b"uint32_values", "uint64_values", b"uint64_values"]) -> None: ...
+
+global___NDArray = NDArray
+
+class BentoService(google.protobuf.service.Service, metaclass=abc.ABCMeta):
+    """a gRPC BentoServer."""
+
+    DESCRIPTOR: google.protobuf.descriptor.ServiceDescriptor
+    @abc.abstractmethod
+    def Call(
+        inst: BentoService,  # pyright: ignore[reportSelfClsParameterName]
+        rpc_controller: google.protobuf.service.RpcController,
+        request: global___Request,
+        callback: collections.abc.Callable[[global___Response], None] | None,
+    ) -> concurrent.futures.Future[global___Response]:
+        """Call handles methodcaller of given API entrypoint."""
+    @abc.abstractmethod
+    def ServiceMetadata(
+        inst: BentoService,  # pyright: ignore[reportSelfClsParameterName]
+        rpc_controller: google.protobuf.service.RpcController,
+        request: global___ServiceMetadataRequest,
+        callback: collections.abc.Callable[[global___ServiceMetadataResponse], None] | None,
+    ) -> concurrent.futures.Future[global___ServiceMetadataResponse]:
+        """ServiceMetadata returns metadata of bentoml.Service."""
+
+class BentoService_Stub(BentoService):
+    """a gRPC BentoServer."""
+
+    def __init__(self, rpc_channel: google.protobuf.service.RpcChannel) -> None: ...
+    DESCRIPTOR: google.protobuf.descriptor.ServiceDescriptor
+    def Call(
+        inst: BentoService_Stub,  # pyright: ignore[reportSelfClsParameterName]
+        rpc_controller: google.protobuf.service.RpcController,
+        request: global___Request,
+        callback: collections.abc.Callable[[global___Response], None] | None = ...,
+    ) -> concurrent.futures.Future[global___Response]:
+        """Call handles methodcaller of given API entrypoint."""
+    def ServiceMetadata(
+        inst: BentoService_Stub,  # pyright: ignore[reportSelfClsParameterName]
+        rpc_controller: google.protobuf.service.RpcController,
+        request: global___ServiceMetadataRequest,
+        callback: collections.abc.Callable[[global___ServiceMetadataResponse], None] | None = ...,
+    ) -> concurrent.futures.Future[global___ServiceMetadataResponse]:
+        """ServiceMetadata returns metadata of bentoml.Service."""
diff --git a/openllm-client/src/openllm_client/pb/v1/_generated_pb4/service_pb2_grpc.py b/openllm-client/src/openllm_client/pb/v1/_generated_pb4/service_pb2_grpc.py
new file mode 100644
index 00000000..cff4c957
--- /dev/null
+++ b/openllm-client/src/openllm_client/pb/v1/_generated_pb4/service_pb2_grpc.py
@@ -0,0 +1,104 @@
+# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
+"""Client and server classes corresponding to protobuf-defined services."""
+import grpc
+
+import service_pb2 as service__pb2
+
+
+class BentoServiceStub(object):
+    """a gRPC BentoServer.
+    """
+
+    def __init__(self, channel):
+        """Constructor.
+
+        Args:
+            channel: A grpc.Channel.
+        """
+        self.Call = channel.unary_unary(
+                '/bentoml.grpc.v1.BentoService/Call',
+                request_serializer=service__pb2.Request.SerializeToString,
+                response_deserializer=service__pb2.Response.FromString,
+                )
+        self.ServiceMetadata = channel.unary_unary(
+                '/bentoml.grpc.v1.BentoService/ServiceMetadata',
+                request_serializer=service__pb2.ServiceMetadataRequest.SerializeToString,
+                response_deserializer=service__pb2.ServiceMetadataResponse.FromString,
+                )
+
+
+class BentoServiceServicer(object):
+    """a gRPC BentoServer.
+    """
+
+    def Call(self, request, context):
+        """Call handles methodcaller of given API entrypoint.
+        """
+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+        context.set_details('Method not implemented!')
+        raise NotImplementedError('Method not implemented!')
+
+    def ServiceMetadata(self, request, context):
+        """ServiceMetadata returns metadata of bentoml.Service.
+        """
+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+        context.set_details('Method not implemented!')
+        raise NotImplementedError('Method not implemented!')
+
+
+def add_BentoServiceServicer_to_server(servicer, server):
+    rpc_method_handlers = {
+            'Call': grpc.unary_unary_rpc_method_handler(
+                    servicer.Call,
+                    request_deserializer=service__pb2.Request.FromString,
+                    response_serializer=service__pb2.Response.SerializeToString,
+            ),
+            'ServiceMetadata': grpc.unary_unary_rpc_method_handler(
+                    servicer.ServiceMetadata,
+                    request_deserializer=service__pb2.ServiceMetadataRequest.FromString,
+                    response_serializer=service__pb2.ServiceMetadataResponse.SerializeToString,
+            ),
+    }
+    generic_handler = grpc.method_handlers_generic_handler(
+            'bentoml.grpc.v1.BentoService', rpc_method_handlers)
+    server.add_generic_rpc_handlers((generic_handler,))
+
+
+ # This class is part of an EXPERIMENTAL API.
+class BentoService(object):
+    """a gRPC BentoServer.
+    """
+
+    @staticmethod
+    def Call(request,
+            target,
+            options=(),
+            channel_credentials=None,
+            call_credentials=None,
+            insecure=False,
+            compression=None,
+            wait_for_ready=None,
+            timeout=None,
+            metadata=None):
+        return grpc.experimental.unary_unary(request, target, '/bentoml.grpc.v1.BentoService/Call',
+            service__pb2.Request.SerializeToString,
+            service__pb2.Response.FromString,
+            options, channel_credentials,
+            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
+
+    @staticmethod
+    def ServiceMetadata(request,
+            target,
+            options=(),
+            channel_credentials=None,
+            call_credentials=None,
+            insecure=False,
+            compression=None,
+            wait_for_ready=None,
+            timeout=None,
+            metadata=None):
+        return grpc.experimental.unary_unary(request, target, '/bentoml.grpc.v1.BentoService/ServiceMetadata',
+            service__pb2.ServiceMetadataRequest.SerializeToString,
+            service__pb2.ServiceMetadataResponse.FromString,
+            options, channel_credentials,
+            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
diff --git a/openllm-client/src/openllm_client/pb/v1/_generated_pb4/service_pb2_grpc.pyi b/openllm-client/src/openllm_client/pb/v1/_generated_pb4/service_pb2_grpc.pyi
new file mode 100644
index 00000000..11f7b401
--- /dev/null
+++ b/openllm-client/src/openllm_client/pb/v1/_generated_pb4/service_pb2_grpc.pyi
@@ -0,0 +1,67 @@
+"""
+@generated by mypy-protobuf.  Do not edit manually!
+isort:skip_file
+Vendored from: https://github.com/bentoml/BentoML/blob/main/src/bentoml/grpc/v1/service.proto"""
+import abc
+import collections.abc
+import grpc
+import grpc.aio
+import service_pb2
+import typing
+
+_T = typing.TypeVar('_T')
+
+class _MaybeAsyncIterator(collections.abc.AsyncIterator[_T], collections.abc.Iterator[_T], metaclass=abc.ABCMeta):
+    ...
+
+class _ServicerContext(grpc.ServicerContext, grpc.aio.ServicerContext):  # type: ignore
+    ...
+
+class BentoServiceStub:
+    """a gRPC BentoServer."""
+
+    def __init__(self, channel: typing.Union[grpc.Channel, grpc.aio.Channel]) -> None: ...
+    Call: grpc.UnaryUnaryMultiCallable[
+        service_pb2.Request,
+        service_pb2.Response,
+    ]
+    """Call handles methodcaller of given API entrypoint."""
+    ServiceMetadata: grpc.UnaryUnaryMultiCallable[
+        service_pb2.ServiceMetadataRequest,
+        service_pb2.ServiceMetadataResponse,
+    ]
+    """ServiceMetadata returns metadata of bentoml.Service."""
+
+class BentoServiceAsyncStub:
+    """a gRPC BentoServer."""
+
+    Call: grpc.aio.UnaryUnaryMultiCallable[
+        service_pb2.Request,
+        service_pb2.Response,
+    ]
+    """Call handles methodcaller of given API entrypoint."""
+    ServiceMetadata: grpc.aio.UnaryUnaryMultiCallable[
+        service_pb2.ServiceMetadataRequest,
+        service_pb2.ServiceMetadataResponse,
+    ]
+    """ServiceMetadata returns metadata of bentoml.Service."""
+
+class BentoServiceServicer(metaclass=abc.ABCMeta):
+    """a gRPC BentoServer."""
+
+    @abc.abstractmethod
+    def Call(
+        self,
+        request: service_pb2.Request,
+        context: _ServicerContext,
+    ) -> typing.Union[service_pb2.Response, collections.abc.Awaitable[service_pb2.Response]]:
+        """Call handles methodcaller of given API entrypoint."""
+    @abc.abstractmethod
+    def ServiceMetadata(
+        self,
+        request: service_pb2.ServiceMetadataRequest,
+        context: _ServicerContext,
+    ) -> typing.Union[service_pb2.ServiceMetadataResponse, collections.abc.Awaitable[service_pb2.ServiceMetadataResponse]]:
+        """ServiceMetadata returns metadata of bentoml.Service."""
+
+def add_BentoServiceServicer_to_server(servicer: BentoServiceServicer, server: typing.Union[grpc.Server, grpc.aio.Server]) -> None: ...
diff --git a/openllm-client/src/openllm_client/pb/v1/service_pb2.py b/openllm-client/src/openllm_client/pb/v1/service_pb2.py
new file mode 100644
index 00000000..61998b01
--- /dev/null
+++ b/openllm-client/src/openllm_client/pb/v1/service_pb2.py
@@ -0,0 +1,6 @@
+from __future__ import annotations  # shim: re-export the generated messages matching the installed protobuf runtime
+from google.protobuf import __version__
+if int(__version__.split(".", 1)[0]) >= 4:  # compare the major version numerically so runtimes newer than 4.x are not sent to the pb3 stubs
+    from ._generated_pb4.service_pb2 import *
+else:  # protobuf 3.x runtime
+    from ._generated_pb3.service_pb2 import *
diff --git a/openllm-client/src/openllm_client/pb/v1/service_pb2.pyi b/openllm-client/src/openllm_client/pb/v1/service_pb2.pyi
new file mode 100644
index 00000000..61998b01
--- /dev/null
+++ b/openllm-client/src/openllm_client/pb/v1/service_pb2.pyi
@@ -0,0 +1,6 @@
+from __future__ import annotations  # stub shim: mirrors the runtime module's choice of generated stubs
+from google.protobuf import __version__
+if int(__version__.split(".", 1)[0]) >= 4:  # compare the major version numerically so runtimes newer than 4.x are not sent to the pb3 stubs
+    from ._generated_pb4.service_pb2 import *
+else:  # protobuf 3.x runtime
+    from ._generated_pb3.service_pb2 import *
diff --git a/openllm-client/src/openllm_client/pb/v1/service_pb2_grpc.py b/openllm-client/src/openllm_client/pb/v1/service_pb2_grpc.py
new file mode 100644
index 00000000..8b9707fd
--- /dev/null
+++ b/openllm-client/src/openllm_client/pb/v1/service_pb2_grpc.py
@@ -0,0 +1,6 @@
+from __future__ import annotations  # shim: re-export the generated gRPC classes matching the installed protobuf runtime
+from google.protobuf import __version__
+if int(__version__.split(".", 1)[0]) >= 4:  # compare the major version numerically so runtimes newer than 4.x are not sent to the pb3 stubs
+    from ._generated_pb4.service_pb2_grpc import *
+else:  # protobuf 3.x runtime
+    from ._generated_pb3.service_pb2_grpc import *
diff --git a/openllm-client/src/openllm_client/pb/v1/service_pb2_grpc.pyi b/openllm-client/src/openllm_client/pb/v1/service_pb2_grpc.pyi
new file mode 100644
index 00000000..8b9707fd
--- /dev/null
+++ b/openllm-client/src/openllm_client/pb/v1/service_pb2_grpc.pyi
@@ -0,0 +1,6 @@
+from __future__ import annotations  # stub shim: mirrors the runtime module's choice of generated gRPC stubs
+from google.protobuf import __version__
+if int(__version__.split(".", 1)[0]) >= 4:  # compare the major version numerically so runtimes newer than 4.x are not sent to the pb3 stubs
+    from ._generated_pb4.service_pb2_grpc import *
+else:  # protobuf 3.x runtime
+    from ._generated_pb3.service_pb2_grpc import *
diff --git a/openllm-core/src/openllm_core/_schema.py b/openllm-core/src/openllm_core/_schema.py
index 424c30ea..418b3668 100644
--- a/openllm-core/src/openllm_core/_schema.py
+++ b/openllm-core/src/openllm_core/_schema.py
@@ -71,6 +71,8 @@ class MetadataOutput:
   configuration: str
   supports_embeddings: bool
   supports_hf_agent: bool
+  prompt_template: str
+  system_message: str
 
 @attr.frozen(slots=True)
 class EmbeddingsOutput:
diff --git a/openllm-core/src/openllm_core/_typing_compat.py b/openllm-core/src/openllm_core/_typing_compat.py
index 6c25e1f1..e20064fd 100644
--- a/openllm-core/src/openllm_core/_typing_compat.py
+++ b/openllm-core/src/openllm_core/_typing_compat.py
@@ -111,6 +111,8 @@ class LLMRunner(bentoml.Runner, t.Generic[M, T]):
   supports_embeddings: bool
   supports_hf_agent: bool
   has_adapters: bool
+  system_message: str | None
+  prompt_template: str | None
   embeddings: RunnerMethod[LLMRunnable[M, T], [list[str]], t.Sequence[EmbeddingsOutput]]
   generate: RunnerMethod[LLMRunnable[M, T], [str], list[t.Any]]
   generate_one: RunnerMethod[LLMRunnable[M, T], [str, list[str]], t.Sequence[dict[t.Literal['generated_text'], str]]]
diff --git a/openllm-core/src/openllm_core/config/configuration_baichuan.py b/openllm-core/src/openllm_core/config/configuration_baichuan.py
index ba3be4e8..06a36fd8 100644
--- a/openllm-core/src/openllm_core/config/configuration_baichuan.py
+++ b/openllm-core/src/openllm_core/config/configuration_baichuan.py
@@ -56,6 +56,14 @@ class BaichuanConfig(openllm_core.LLMConfig):
     top_p: float = 0.7
     temperature: float = 0.95
 
+  @property
+  def default_prompt_template(self) -> str:
+    return DEFAULT_PROMPT_TEMPLATE.to_string()
+
+  @property
+  def default_system_message(self) -> str:
+    return DEFAULT_SYSTEM_MESSAGE
+
   def sanitize_parameters(self,
                           prompt: str,
                           prompt_template: PromptTemplate | str | None = None,
diff --git a/openllm-core/src/openllm_core/config/configuration_llama.py b/openllm-core/src/openllm_core/config/configuration_llama.py
index 66c580dd..cd13bf21 100644
--- a/openllm-core/src/openllm_core/config/configuration_llama.py
+++ b/openllm-core/src/openllm_core/config/configuration_llama.py
@@ -69,7 +69,7 @@ class LlamaConfig(openllm_core.LLMConfig):
   Refer to [Llama's model card](https://huggingface.co/docs/transformers/main/model_doc/llama)
   for more information.
   """
-  use_llama2_prompt: bool = dantic.Field(False, description='Whether to use the prompt format for Llama 2. Disable this when working with Llama 1.')
+  use_llama2_prompt: bool = dantic.Field(True, description='Whether to use the prompt format for Llama 2. Disable this when working with Llama 1.')
   __config__ = {
       'name_type': 'lowercase',
       'url': 'https://github.com/facebookresearch/llama',
@@ -106,6 +106,14 @@ class LlamaConfig(openllm_core.LLMConfig):
     best_of: int = 1
     presence_penalty: float = 0.5
 
+  @property
+  def default_prompt_template(self) -> str:  # a property getter only receives ``self``; take the Llama 1/2 switch from the config field
+    return DEFAULT_PROMPT_TEMPLATE('v2' if self.use_llama2_prompt else 'v1').to_string()
+
+  @property
+  def default_system_message(self) -> str:
+    return DEFAULT_SYSTEM_MESSAGE
+
   def sanitize_parameters(self,
                           prompt: str,
                           prompt_template: PromptTemplate | str | None = None,
diff --git a/openllm-python/src/openllm/_llm.py b/openllm-python/src/openllm/_llm.py
index 4e16f9c1..3f303c91 100644
--- a/openllm-python/src/openllm/_llm.py
+++ b/openllm-python/src/openllm/_llm.py
@@ -39,7 +39,6 @@ from openllm_core._typing_compat import T
 from openllm_core._typing_compat import TupleAny
 from openllm_core._typing_compat import overload
 from openllm_core.prompts import PromptTemplate
-from openllm_core.prompts import process_prompt
 from openllm_core.utils import DEBUG
 from openllm_core.utils import MYPY
 from openllm_core.utils import EnvVarMixin
@@ -620,7 +619,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
     # set default tokenizer kwargs
     tokenizer_kwds.update({'padding_side': 'left', 'truncation_side': 'left'})
 
-    # parsing tokenizer and model kwargs, as the hierachy is param pass > default
+    # parsing tokenizer and model kwargs, as the hierarchy is param pass > default
     normalized_model_kwds, normalized_tokenizer_kwds = normalize_attrs_to_model_tokenizer_pair(**attrs)
     # NOTE: Save the args and kwargs for latter load
     self.__attrs_init__(llm_config, quantization_config, _quantize, model_id, args, {
@@ -1211,6 +1210,8 @@ def llm_runnable_class(self: LLM[M, T], embeddings_sig: ModelSignature, generate
       if adapter_name is not None: __self.set_adapter(adapter_name)
       request_id: str | None = attrs.pop('request_id', None)
       if request_id is None: raise ValueError('request_id must not be None.')
+      prompt, *_ = self.sanitize_parameters(prompt, **attrs)
+      if openllm_core.utils.DEBUG: logger.debug('Prompt:\n%s', prompt)
 
       if stop_token_ids is None: stop_token_ids = []
       stop_token_ids.append(self.tokenizer.eos_token_id)
@@ -1237,7 +1238,6 @@ def llm_runnable_class(self: LLM[M, T], embeddings_sig: ModelSignature, generate
     async def vllm_generate_iterator(__self: _Runnable, prompt: str, **attrs: t.Any) -> t.AsyncGenerator[str, None]:
       # TODO: System prompt support
       pre = 0
-      prompt = process_prompt(prompt, None, False)
       echo = attrs.pop('echo', False)
       stop: str | t.Iterable[str] | None = attrs.pop('stop', None)
       stop_token_ids: list[int] | None = attrs.pop('stop_token_ids', None)
@@ -1247,6 +1247,8 @@ def llm_runnable_class(self: LLM[M, T], embeddings_sig: ModelSignature, generate
       if adapter_name is not None: __self.set_adapter(adapter_name)
       request_id: str | None = attrs.pop('request_id', None)
       if request_id is None: raise ValueError('request_id must not be None.')
+      prompt, *_ = self.sanitize_parameters(prompt, **attrs)
+      if openllm_core.utils.DEBUG: logger.debug('Prompt:\n%s', prompt)
 
       if stop_token_ids is None: stop_token_ids = []
       stop_token_ids.append(self.tokenizer.eos_token_id)
@@ -1342,7 +1344,9 @@ def llm_runner_class(self: LLM[M, T]) -> type[LLMRunner[M, T]]:
                              '__repr_args__': _wrapped_repr_args,
                              'supports_embeddings': self['supports_embeddings'],
                              'supports_hf_agent': self['supports_generate_one'],
-                             'has_adapters': self._adapters_mapping is not None
+                             'has_adapters': self._adapters_mapping is not None,
+                             'prompt_template': self._prompt_template.to_string() if self._prompt_template else self.config.default_prompt_template,
+                             'system_message': self._system_message if self._system_message else self.config.default_system_message,
                          }))
 
 __all__ = ['LLMRunner', 'LLMRunnable', 'Runner', 'LLM', 'llm_runner_class', 'llm_runnable_class', 'EmbeddingsOutput']
diff --git a/openllm-python/src/openllm/_service.py b/openllm-python/src/openllm/_service.py
index e5aeea27..47084aa9 100644
--- a/openllm-python/src/openllm/_service.py
+++ b/openllm-python/src/openllm/_service.py
@@ -32,13 +32,7 @@ model = svars.model
 model_id = svars.model_id
 adapter_map = svars.adapter_map
 llm_config = openllm.AutoConfig.for_model(model)
-runner = openllm.Runner(
-  model,
-  llm_config=llm_config,
-  model_id=model_id,
-  ensure_available=False,
-  adapter_map=orjson.loads(adapter_map)
-)
+runner = openllm.Runner(model, llm_config=llm_config, model_id=model_id, ensure_available=False, adapter_map=orjson.loads(adapter_map))
 generic_embedding_runner = bentoml.Runner(openllm.GenericEmbeddingRunnable,  # XXX: remove arg-type once bentoml.Runner is correct set with type
                                           name='llm-generic-embedding',
                                           scheduling_strategy=openllm_core.CascadingResourceStrategy,
@@ -189,9 +183,11 @@ async def chat_completion_v1(input_dict: dict[str, t.Any], ctx: bentoml.Context)
              'timeout': 3600,
              'model_name': llm_config['model_name'],
              'backend': runner.backend,
-             'configuration': '',
+             'configuration': llm_config.model_dump(flatten=True),
              'supports_embeddings': runner.supports_embeddings,
-             'supports_hf_agent': runner.supports_hf_agent
+             'supports_hf_agent': runner.supports_hf_agent,
+             'prompt_template': runner.prompt_template,
+             'system_message': runner.system_message,
          }))
 def metadata_v1(_: str) -> openllm.MetadataOutput:
   return openllm.MetadataOutput(timeout=llm_config['timeout'],
@@ -200,7 +196,10 @@ def metadata_v1(_: str) -> openllm.MetadataOutput:
                                 model_id=runner.llm.model_id,
                                 configuration=llm_config.model_dump_json().decode(),
                                 supports_embeddings=runner.supports_embeddings,
-                                supports_hf_agent=runner.supports_hf_agent)
+                                supports_hf_agent=runner.supports_hf_agent,
+                                prompt_template=runner.prompt_template,
+                                system_message=runner.system_message,
+                                )
 
 @svc.api(route='/v1/embeddings',
          input=bentoml.io.JSON.from_sample(['Hey Jude, welcome to the jungle!', 'What is the meaning of life?']),
diff --git a/openllm-python/src/openllm/cli/entrypoint.py b/openllm-python/src/openllm/cli/entrypoint.py
index 2367e364..00b47bdd 100644
--- a/openllm-python/src/openllm/cli/entrypoint.py
+++ b/openllm-python/src/openllm/cli/entrypoint.py
@@ -27,7 +27,6 @@ import itertools
 import logging
 import os
 import platform
-import re
 import subprocess
 import sys
 import time
@@ -795,6 +794,7 @@ def instruct_command(endpoint: str, timeout: int, agent: LiteralString, output:
         --text "¡Este es un API muy agradable!"
   ```
   '''
+  raise click.ClickException("'instruct' is currently disabled")
   client = openllm.client.HTTPClient(endpoint, timeout=timeout)
 
   try:
@@ -844,15 +844,16 @@ def embed_command(
     termui.echo(gen_embed.embeddings, fg='white')
   ctx.exit(0)
 @cli.command()
-@shared_client_options
+@shared_client_options(output_value='porcelain')
 @click.option('--server-type', type=click.Choice(['grpc', 'http']), help='Server type', default='http', show_default=True)
+@click.option('--stream/--no-stream', type=click.BOOL, is_flag=True, default=True, help='Whether to stream the response.')
 @click.argument('prompt', type=click.STRING)
 @click.option(
     '--sampling-params', help='Define query options. (format: ``--opt temperature=0.8 --opt=top_k:12)', required=False, multiple=True, callback=opt_callback, metavar='ARG=VALUE[,ARG=VALUE]'
 )
 @click.pass_context
 def query_command(
-    ctx: click.Context, /, prompt: str, endpoint: str, timeout: int, server_type: t.Literal['http', 'grpc'], output: LiteralOutput, _memoized: DictStrAny, **attrs: t.Any
+    ctx: click.Context, /, prompt: str, endpoint: str, timeout: int, stream: bool, server_type: t.Literal['http', 'grpc'], output: LiteralOutput, _memoized: DictStrAny, **attrs: t.Any
 ) -> None:
   '''Ask a LLM interactively, from a terminal.
 
@@ -862,23 +863,30 @@ def query_command(
   ```
   '''
   _memoized = {k: orjson.loads(v[0]) for k, v in _memoized.items() if v}
-  if server_type == 'grpc': endpoint = re.sub(r'http://', '', endpoint)
-  client = openllm.client.HTTPClient(endpoint, timeout=timeout) if server_type == 'http' else openllm.client.GrpcClient(endpoint, timeout=timeout)
+  if server_type == 'grpc': raise click.ClickException("'grpc' is currently disabled.")
+  # TODO: grpc support
+  client = openllm.client.HTTPClient(address=endpoint, timeout=timeout)
   input_fg, generated_fg = 'magenta', 'cyan'
   if output != 'porcelain':
     termui.echo('==Input==\n', fg='white')
     termui.echo(f'{prompt}', fg=input_fg)
-  res = client.query(prompt, return_response='raw', **{**client.configuration, **_memoized})
+  fn = client.generate_stream if stream else client.generate
+  res = fn(prompt, **{**client._config(), **_memoized})
   if output == 'pretty':
-    response = client.config.postprocess_generate(prompt, res['responses'])
-    if isinstance(response, dict) and 'text' in response: response = response['text']
     termui.echo('\n\n==Responses==\n', fg='white')
-    termui.echo(response, fg=generated_fg)
+    if stream:
+      for it in res: termui.echo(it.text, fg=generated_fg, nl=False)
+    else: termui.echo(res.responses[0], fg=generated_fg)
   elif output == 'json':
-    termui.echo(orjson.dumps(res, option=orjson.OPT_INDENT_2).decode(), fg='white')
-  else:
-    termui.echo(res['responses'], fg='white')
+    if stream:
+      for it in res: termui.echo(orjson.dumps(bentoml_cattr.unstructure(it), option=orjson.OPT_INDENT_2).decode(), fg='white')
+    else: termui.echo(orjson.dumps(bentoml_cattr.unstructure(res), option=orjson.OPT_INDENT_2).decode(), fg='white')
+  else:  # noqa: PLR5501
+    if stream:
+      for it in res: termui.echo(it.text, fg=generated_fg, nl=False)
+    else: termui.echo(res.responses, fg='white')
   ctx.exit(0)
+
 @cli.group(cls=Extensions, hidden=True, name='extension')
 def extension_command() -> None:
   '''Extension for OpenLLM CLI.'''
diff --git a/openllm-python/src/openllm/client.py b/openllm-python/src/openllm/client.py
index 9c2553b3..6a90bd60 100644
--- a/openllm-python/src/openllm/client.py
+++ b/openllm-python/src/openllm/client.py
@@ -16,12 +16,10 @@ import typing as t
 import openllm_client
 
 if t.TYPE_CHECKING:
-  from openllm_client import AsyncGrpcClient as AsyncGrpcClient
   from openllm_client import AsyncHTTPClient as AsyncHTTPClient
-  from openllm_client import BaseAsyncClient as BaseAsyncClient
-  from openllm_client import BaseClient as BaseClient
-  from openllm_client import GrpcClient as GrpcClient
   from openllm_client import HTTPClient as HTTPClient
+  # from openllm_client import AsyncGrpcClient as AsyncGrpcClient
+  # from openllm_client import GrpcClient as GrpcClient
 
 def __dir__() -> t.Sequence[str]:
   return sorted(dir(openllm_client))
diff --git a/pyproject.toml b/pyproject.toml
index b680d202..eafdc972 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -156,6 +156,7 @@ whitelist-regex = ["test_.*"]
 toplevel = ["openllm"]
 
 [tool.ruff]
+tab-size = 2
 extend-exclude = [
   "tools",
   "examples",