diff --git a/src/openllm/__init__.py b/src/openllm/__init__.py index 78241eba..b029a9e3 100644 --- a/src/openllm/__init__.py +++ b/src/openllm/__init__.py @@ -42,6 +42,7 @@ _import_structure = { "models.auto": ["AutoConfig", "CONFIG_MAPPING"], "models.flan_t5": ["FlanT5Config"], "models.dolly_v2": ["DollyV2Config"], + "models.chatglm": ["ChatGLMConfig"], } try: @@ -54,6 +55,7 @@ except MissingDependencyError: else: _import_structure["models.flan_t5"].extend(["FlanT5"]) _import_structure["models.dolly_v2"].extend(["DollyV2"]) + _import_structure["models.chatglm"].extend(["ChatGLM"]) _import_structure["models.auto"].extend(["AutoLLM", "MODEL_MAPPING_NAMES", "MODEL_MAPPING"]) try: @@ -99,6 +101,7 @@ if t.TYPE_CHECKING: from .cli import start_grpc as start_grpc from .models.auto import CONFIG_MAPPING as CONFIG_MAPPING from .models.auto import AutoConfig as AutoConfig + from .models.chatglm import ChatGLMConfig as ChatGLMConfig from .models.dolly_v2 import DollyV2Config as DollyV2Config from .models.flan_t5 import FlanT5Config as FlanT5Config @@ -111,6 +114,7 @@ if t.TYPE_CHECKING: from .models.auto import MODEL_MAPPING as MODEL_MAPPING from .models.auto import MODEL_MAPPING_NAMES as MODEL_MAPPING_NAMES from .models.auto import AutoLLM as AutoLLM + from .models.chatglm import ChatGLM as ChatGLM from .models.dolly_v2 import DollyV2 as DollyV2 from .models.flan_t5 import FlanT5 as FlanT5 diff --git a/src/openllm/_configuration.py b/src/openllm/_configuration.py index 2bb9e60c..8c765ad0 100644 --- a/src/openllm/_configuration.py +++ b/src/openllm/_configuration.py @@ -430,29 +430,43 @@ class LLMConfig(pydantic.BaseModel, ABC): __openllm_model_name__: str = "" __openllm_start_name__: str = "" __openllm_timeout__: int = 0 + __openllm_name_type__: t.Literal["dasherize", "lowercase"] = "dasherize" GenerationConfig: type[t.Any] = GenerationConfig - def __init_subclass__(cls, *, default_timeout: int | None = None, **kwargs: t.Any): + def __init_subclass__( + cls, + *, + 
default_timeout: int | None = None, + name_type: t.Literal["dasherize", "lowercase"] = "dasherize", + **kwargs: t.Any, + ): if default_timeout is None: default_timeout = 3600 cls.__openllm_timeout__ = default_timeout + if name_type not in ("dasherize", "lowercase"): + raise RuntimeError(f"Unknown name_type {name_type}. Only allowed are 'dasherize' and 'lowercase'.") + cls.__openllm_name_type__ = name_type super(LLMConfig, cls).__init_subclass__(**kwargs) @classmethod - def __pydantic_init_subclass__(cls, **kwargs: t.Any): - cls.__openllm_model_name__ = inflection.underscore(cls.__name__.replace("Config", "")) - cls.__openllm_start_name__ = inflection.dasherize(cls.__openllm_model_name__) + def __pydantic_init_subclass__(cls, **_: t.Any): + if cls.__openllm_name_type__ == "dasherize": + cls.__openllm_model_name__ = inflection.underscore(cls.__name__.replace("Config", "")) + cls.__openllm_start_name__ = inflection.dasherize(cls.__openllm_model_name__) + else: + cls.__openllm_model_name__ = cls.__name__.replace("Config", "").lower() + cls.__openllm_start_name__ = cls.__openllm_model_name__ + if hasattr(cls, "GenerationConfig"): - generation_class = t.cast( + cls.generation_config = t.cast( "type[GenerationConfig]", types.new_class( cls.__name__.replace("Config", "") + "GenerationConfig", (GenerationConfig,), {"model_name": cls.__openllm_model_name__, "_internal": True}, ), - ) - cls.generation_config = generation_class.construct_from_llm_config(cls) + ).construct_from_llm_config(cls) delattr(cls, "GenerationConfig") for key, field in cls.model_fields.items(): diff --git a/src/openllm/_llm.py b/src/openllm/_llm.py index 0df5160b..5caae2ba 100644 --- a/src/openllm/_llm.py +++ b/src/openllm/_llm.py @@ -70,7 +70,14 @@ class TaskType(enum.Enum, metaclass=TypeMeta): TEXT2TEXT_GENERATION = enum.auto() -def import_model(model_name: str, tag: bentoml.Tag, __openllm_framework__: str, *model_args: t.Any, **kwds: t.Any): +def import_model( + model_name: str, + tag: bentoml.Tag, + 
__openllm_framework__: str, + *model_args: t.Any, + tokenizer_kwds: dict[str, t.Any], + **kwds: t.Any, +): """Auto detect model type from given model_name and import it to bentoml's model store. For all kwargs, it will be parsed into `transformers.AutoConfig.from_pretrained` first, returning all of the unused kwargs. @@ -91,10 +98,6 @@ def import_model(model_name: str, tag: bentoml.Tag, __openllm_framework__: str, config: transformers.PretrainedConfig = kwds.pop("config", None) trust_remote_code = kwds.pop("trust_remote_code", False) - tokenizer_kwds = {k[len("_tokenizer_") :]: v for k, v in kwds.items() if k.startswith("_tokenizer_")} - - kwds = {k: v for k, v in kwds.items() if not k.startswith("_tokenizer_")} - # this logic below is synonymous to handling `from_pretrained` kwds. hub_kwds_names = [ "cache_dir", @@ -117,6 +120,7 @@ def import_model(model_name: str, tag: bentoml.Tag, __openllm_framework__: str, model_name, return_unused_kwargs=True, trust_remote_code=trust_remote_code, **hub_kwds, **copied_kwds ), ) + if type(config) in transformers.MODEL_FOR_CAUSAL_LM_MAPPING: task_type = "text-generation" elif type(config) in transformers.MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING: @@ -186,14 +190,6 @@ class LLMInterface(ABC): ) -if t.TYPE_CHECKING: - - class LLMRunnable(bentoml.Runnable): - @abstractmethod - def generate(self, prompt: str, **kwargs: t.Any) -> t.Any: - ... - - class LLM(LLMInterface): _implementation: t.Literal["pt", "tf", "flax"] @@ -204,7 +200,14 @@ class LLM(LLMInterface): if t.TYPE_CHECKING: - def import_model(self, pretrained: str, tag: bentoml.Tag, *model_args: t.Any, **kwds: t.Any) -> bentoml.Model: + def import_model( + self, + pretrained: str, + tag: bentoml.Tag, + *args: t.Any, + tokenizer_kwds: dict[str, t.Any], + **kwds: t.Any, + ) -> bentoml.Model: ... 
def __init_subclass__(cls, *, implementation: t.Literal["pt", "tf", "flax"] = "pt", _internal: bool = False): @@ -216,7 +219,7 @@ class LLM(LLMInterface): if implementation == "tf": cls.config_class = getattr(openllm, f"{cls.__name__[2:]}Config") elif implementation == "flax": - cls.config_class = getattr(openllm, f"{cls.__name__[len('flax'):]}Config") + cls.config_class = getattr(openllm, f"{cls.__name__[4:]}Config") else: cls.config_class = getattr(openllm, f"{cls.__name__}Config") else: @@ -245,7 +248,11 @@ class LLM(LLMInterface): return {"configuration": self.config.model_dump(), "variants": self.variants} def __init__( - self, pretrained: str | None = None, llm_config: openllm.LLMConfig | None = None, *args: t.Any, **kwargs: t.Any + self, + pretrained: str | None = None, + llm_config: openllm.LLMConfig | None = None, + *args: t.Any, + **kwargs: t.Any, ): """Initialize the LLM with given pretrained model. @@ -261,16 +268,24 @@ class LLM(LLMInterface): If you need to overwrite the default ``import_model``, implement the following in your subclass: ```python - def import_model(self, pretrained: str, tag: bentoml.Tag, *args: t.Any, **kwargs: t.Any): - tokenizer_kwargs = {k[len('_tokenizer_'):]: v for k, v in kwargs.items() if k.startswith('_tokenizer_')]} - kwargs = {k: v for k, v in kwargs.items() if not k.startswith('_tokenizer_')} + def import_model( + self, + pretrained: str, + tag: bentoml.Tag, + *args: t.Any, + tokenizer_kwds: dict[str, t.Any], + **kwargs: t.Any, + ): return bentoml.transformers.save_model( str(tag), transformers.AutoModelForCausalLM.from_pretrained( pretrained, device_map="auto", torch_dtype=torch.bfloat16, **kwargs ), - custom_objects={"tokenizer": transformers.AutoTokenizer.from_pretrained(pretrained, padding_size="left", - **tokenizer_kwargs)}, + custom_objects={ + "tokenizer": transformers.AutoTokenizer.from_pretrained( + pretrained, padding_size="left", **tokenizer_kwds + ) + }, ) ``` @@ -295,7 +310,7 @@ class LLM(LLMInterface): """ 
if llm_config is not None: - logger.debug("Using given 'llm_config=%s' to initialize LLM", llm_config) + logger.debug("Using given 'llm_config=(%s)' to initialize LLM", llm_config) self.config = llm_config else: self.config = self.config_class(**kwargs) @@ -319,7 +334,11 @@ class LLM(LLMInterface): @property def _bentomodel(self) -> bentoml.Model: if self.__bentomodel__ is None: - tag, kwargs = openllm.utils.generate_tags(self._pretrained, prefix=self._implementation, **self._kwargs) + tag, kwds = openllm.utils.generate_tags(self._pretrained, prefix=self._implementation, **self._kwargs) + + tokenizer_kwds = {k[len("_tokenizer_") :]: v for k, v in kwds.items() if k.startswith("_tokenizer_")} + kwds = {k: v for k, v in kwds.items() if not k.startswith("_tokenizer_")} + try: self.__bentomodel__ = bentoml.transformers.get(tag) except bentoml.exceptions.BentoMLException: @@ -328,13 +347,20 @@ class LLM(LLMInterface): ) if hasattr(self, "import_model"): logger.debug("Using custom 'import_model' defined in subclass.") - self.__bentomodel__ = self.import_model(self._pretrained, tag, *self._args, **kwargs) + self.__bentomodel__ = self.import_model( + self._pretrained, tag, *self._args, tokenizer_kwds=tokenizer_kwds, **kwds + ) else: if self.import_kwargs: - kwargs = {**self.import_kwargs, **kwargs} + kwds = {**self.import_kwargs, **kwds} # NOTE: In this branch, we just use the default implementation. self.__bentomodel__ = import_model( - self._pretrained, tag, __openllm_framework__=self._implementation, *self._args, **kwargs + self._pretrained, + tag, + *self._args, + tokenizer_kwds=tokenizer_kwds, + __openllm_framework__=self._implementation, + **kwds, ) return self.__bentomodel__ @@ -360,7 +386,8 @@ class LLM(LLMInterface): # This could happen if users implement their own import_model raise openllm.exceptions.OpenLLMException( "Model does not have tokenizer. Make sure to save \ - the tokenizer within the model via 'custom_objects'." 
+ the tokenizer within the model via 'custom_objects'.\ + For example: bentoml.transformers.save_model(..., custom_objects={'tokenizer': tokenizer})" ) return self.__llm_tokenizer__ @@ -406,7 +433,9 @@ class LLM(LLMInterface): method_configs = {"generate": generate_sig, "generate_iterator": generate_iterator_sig} else: generate_sig = ModelSignature.convert_signatures_dict(method_configs).get("generate", generate_sig) - ModelSignature.convert_signatures_dict(method_configs).get("generate_iterator", generate_iterator_sig) + generate_iterator_sig = ModelSignature.convert_signatures_dict(method_configs).get( + "generate_iterator", generate_iterator_sig + ) class _Runnable(bentoml.Runnable): SUPPORTED_RESOURCES = ("nvidia.com/gpu", "cpu") @@ -418,7 +447,7 @@ class LLM(LLMInterface): input_spec=generate_sig.input_spec, output_spec=generate_sig.output_spec, ) - def generate(__self, prompt: str, **kwds: t.Any) -> list[str]: + def generate(__self, prompt: str, **kwds: t.Any) -> list[t.Any]: return self.generate(prompt, **kwds) @bentoml.Runnable.method( @@ -427,14 +456,11 @@ class LLM(LLMInterface): input_spec=generate_iterator_sig.input_spec, output_spec=generate_iterator_sig.output_spec, ) - def generate_iterator(__self, prompt: str, **kwds: t.Any) -> t.Iterator[str]: - return self.generate_iterator(prompt, **kwds) + def generate_iterator(__self, prompt: str, **kwds: t.Any) -> t.Iterator[t.Any]: + yield from self.generate_iterator(prompt, **kwds) return bentoml.Runner( - t.cast( - "type[LLMRunnable]", - types.new_class(inflection.camelize(self.config.__openllm_model_name__) + "Runnable", (_Runnable,)), - ), + types.new_class(inflection.camelize(self.config.__openllm_model_name__) + "Runnable", (_Runnable,)), runnable_init_params=kwargs, name=name, models=models, diff --git a/src/openllm/models/__init__.py b/src/openllm/models/__init__.py index b32b98e0..92f09876 100644 --- a/src/openllm/models/__init__.py +++ b/src/openllm/models/__init__.py @@ -13,4 +13,6 @@ # limitations 
under the License. from . import auto as auto +from . import chatglm as chatglm +from . import dolly_v2 as dolly_v2 from . import flan_t5 as flan_t5 diff --git a/src/openllm/models/auto/configuration_auto.py b/src/openllm/models/auto/configuration_auto.py index 5c6b109c..0f4556fb 100644 --- a/src/openllm/models/auto/configuration_auto.py +++ b/src/openllm/models/auto/configuration_auto.py @@ -28,7 +28,13 @@ else: ConfigOrderedDict = OrderedDict # NOTE: This is the entrypoint when adding new model config -CONFIG_MAPPING_NAMES = OrderedDict([("flan_t5", "FlanT5Config"), ("dolly_v2", "DollyV2Config")]) +CONFIG_MAPPING_NAMES = OrderedDict( + [ + ("flan_t5", "FlanT5Config"), + ("dolly_v2", "DollyV2Config"), + ("chatglm", "ChatGLMConfig"), + ] +) class _LazyConfigMapping(ConfigOrderedDict): diff --git a/src/openllm/models/auto/modeling_auto.py b/src/openllm/models/auto/modeling_auto.py index 82af2b80..44e8131b 100644 --- a/src/openllm/models/auto/modeling_auto.py +++ b/src/openllm/models/auto/modeling_auto.py @@ -19,7 +19,7 @@ from collections import OrderedDict from .configuration_auto import CONFIG_MAPPING_NAMES from .factory import _BaseAutoLLMClass, _LazyAutoMapping -MODEL_MAPPING_NAMES = OrderedDict([("flan_t5", "FlanT5"), ("dolly_v2", "DollyV2")]) +MODEL_MAPPING_NAMES = OrderedDict([("flan_t5", "FlanT5"), ("dolly_v2", "DollyV2"), ("chatglm", "ChatGLM")]) MODEL_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_MAPPING_NAMES) diff --git a/src/openllm/models/chatglm/__init__.py b/src/openllm/models/chatglm/__init__.py index 5e1ed3d7..deeba09b 100644 --- a/src/openllm/models/chatglm/__init__.py +++ b/src/openllm/models/chatglm/__init__.py @@ -14,4 +14,39 @@ from __future__ import annotations -raise NotImplementedError("This module is not implemented yet.") +import typing as t + +import openllm + +_import_structure = { + "configuration_chatglm": ["ChatGLMConfig", "START_CHATGLM_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"], +} + +try: + if not 
openllm.utils.is_torch_available(): + raise openllm.exceptions.MissingDependencyError +except openllm.exceptions.MissingDependencyError: + pass +else: + _import_structure["modeling_chatglm"] = ["ChatGLM"] + +if t.TYPE_CHECKING: + from .configuration_chatglm import \ + DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE + from .configuration_chatglm import \ + START_CHATGLM_COMMAND_DOCSTRING as START_CHATGLM_COMMAND_DOCSTRING + from .configuration_chatglm import ChatGLMConfig as ChatGLMConfig + + try: + if not openllm.utils.is_torch_available(): + raise openllm.exceptions.MissingDependencyError + except openllm.exceptions.MissingDependencyError: + pass + else: + from .modeling_chatglm import ChatGLM as ChatGLM +else: + import sys + + sys.modules[__name__] = openllm.utils.LazyModule( + __name__, globals()["__file__"], _import_structure, module_spec=__spec__ + ) diff --git a/src/openllm/models/chatglm/configuration_chatglm.py b/src/openllm/models/chatglm/configuration_chatglm.py index e69de29b..f3711533 100644 --- a/src/openllm/models/chatglm/configuration_chatglm.py +++ b/src/openllm/models/chatglm/configuration_chatglm.py @@ -0,0 +1,49 @@ +# Copyright 2023 BentoML Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from __future__ import annotations + +import openllm + + +class ChatGLMConfig(openllm.LLMConfig, name_type="lowercase"): + """Configuration for the ChatGLM model.""" + + retain_history: bool = True + """Whether to retain history given to the model. If set to True, then the model will retain given history.""" + + use_half_precision: bool = True + """Whether to use half precision for model.""" + + class GenerationConfig: + max_length: int = 2048 + num_beams: int = 1 + top_p: float = 0.7 + temperature: float = 0.95 + + +START_CHATGLM_COMMAND_DOCSTRING = """\ +Run a LLMServer for ChatGLM model and variants. + +\b +> See more information about ChatGLM at [THUDM/ChatGLM-6b](https://huggingface.co/thudm/chatglm-6b) + +\b +## Usage + +Currently, ChatGLM only supports PyTorch. Make sure ``torch`` is available in your system. + +\b +ChatGLM Runner will use THUDM/ChatGLM-6b as the default model. To change any to any other ChatGLM +saved pretrained, or a fine-tune ChatGLM, provide ``OPENLLM_CHATGLM_PRETRAINED='thudm/chatglm-6b-int8'`` +""" diff --git a/src/openllm/models/chatglm/modeling_chatglm.py b/src/openllm/models/chatglm/modeling_chatglm.py new file mode 100644 index 00000000..db58a551 --- /dev/null +++ b/src/openllm/models/chatglm/modeling_chatglm.py @@ -0,0 +1,104 @@ +# Copyright 2023 BentoML Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from __future__ import annotations + +import typing as t + +import bentoml +import transformers +from transformers.generation.logits_process import LogitsProcessor +from transformers.generation.utils import LogitsProcessorList + +import openllm + +if t.TYPE_CHECKING: + import torch +else: + torch = openllm.utils.LazyLoader("torch", globals(), "torch") + + +class InvalidScoreLogitsProcessor(LogitsProcessor): + """Ported from modeling_chatglm.py""" + + def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: + if torch.isnan(scores).any() or torch.isinf(scores).any(): + scores.zero_() + scores[..., 5] = 5e4 + return scores + + +class ChatGLM(openllm.LLM, _internal=True): + default_model = "THUDM/chatglm-6b" + + variants = ["THUDM/chatglm-6b", "THUDM/chatglm-6b-int8", "THUDM/chatglm-6b-int4"] + + def model_post_init(self, _: t.Any): + self.history: list[tuple[str, str]] = [] + + def import_model( + self, pretrained: str, tag: bentoml.Tag, *model_args: t.Any, tokenizer_kwds: dict[str, t.Any], **kwds: t.Any + ) -> bentoml.Model: + return bentoml.transformers.save_model( + str(tag), + transformers.AutoModel.from_pretrained(pretrained, trust_remote_code=True, **kwds), + custom_objects={ + "tokenizer": transformers.AutoTokenizer.from_pretrained( + pretrained, trust_remote_code=True, **tokenizer_kwds + ) + }, + ) + + @torch.inference_mode() + def generate( + self, + prompt: str, + max_length: int | None = None, + num_beams: int | None = None, + top_p: float | None = None, + temperature: float | None = None, + **kwargs: t.Any, + ) -> t.Any: + if torch.cuda.is_available(): + self.model = self.model.cuda() + if self.config.use_half_precision: + self.model = self.model.half() + self.model.eval() + + logit_processor = LogitsProcessorList() + logit_processor.append(InvalidScoreLogitsProcessor()) + + prompt_text = "" + for i, (old_query, response) in enumerate(self.history): + prompt_text += f"[Round {i}]\n问:{old_query}\n答:{response}\n" + 
prompt_text += f"[Round {len(self.history)}]\n问:{prompt}\n答:" + inputs = self.tokenizer([prompt_text], return_tensors="pt").to(self.model.device) + outputs = self.model.generate( + **inputs, + generation_config=self.config.with_options( + max_length=max_length, + num_beams=num_beams, + top_p=top_p, + temperature=temperature, + do_sample=True, + **kwargs, + ).to_generation_config(), + logits_processor=logit_processor, + ) + outputs = outputs.tolist()[0][len(inputs["input_ids"][0]) :] + response = self.tokenizer.decode(outputs) + response = self.model.process_response(response) + if self.config.retain_history: + self.history.append((prompt, response)) + return self.history if self.config.retain_history else [(prompt, response)] diff --git a/src/openllm/models/dolly_v2/configuration_dolly_v2.py b/src/openllm/models/dolly_v2/configuration_dolly_v2.py index 0c041181..1cf14021 100644 --- a/src/openllm/models/dolly_v2/configuration_dolly_v2.py +++ b/src/openllm/models/dolly_v2/configuration_dolly_v2.py @@ -11,14 +11,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""The following includes OpenLLM configuration and excerpt from -[instruct_pipeline.py](https://huggingface.co/databricks/dolly-v2-3b/blob/main/instruct_pipeline.py)""" - from __future__ import annotations import openllm +# NOTE: The following includes OpenLLM configuration and excerpt from [instruct_pipeline.py](https://huggingface.co/databricks/dolly-v2-3b/blob/main/instruct_pipeline.py) class DollyV2Config(openllm.LLMConfig, default_timeout=3600000): """Configuration for the dolly-v2 model.""" @@ -55,7 +53,7 @@ INTRO_BLURB = ( "Below is an instruction that describes a task. Write a response that appropriately completes the request." ) -# NOTE: This is the prompt that is used for generating responses using an already trained model. 
It ends with the response +# NOTE: This is the prompt that is used for generating responses using an already trained model. It ends with the response # key, where the job of the model is to provide the completion that follows it (i.e. the response itself). DEFAULT_PROMPT_TEMPLATE = """{intro} {instruction_key} diff --git a/src/openllm/utils/dummy_pt_objects.py b/src/openllm/utils/dummy_pt_objects.py index 5822287d..2949e5fb 100644 --- a/src/openllm/utils/dummy_pt_objects.py +++ b/src/openllm/utils/dummy_pt_objects.py @@ -19,6 +19,13 @@ class DollyV2(metaclass=DummyMetaclass): require_backends(self, ["torch"]) +class ChatGLM(metaclass=DummyMetaclass): + _backends = ["torch"] + + def __init__(self, *args: t.Any, **kwargs: t.Any): + require_backends(self, ["torch"]) + + class AutoLLM(metaclass=DummyMetaclass): _backends = ["torch"]