From c73732db6f4e4bfec08c9f32dcae71b149187e51 Mon Sep 17 00:00:00 2001
From: Aaron <29749331+aarnphm@users.noreply.github.com>
Date: Sat, 27 May 2023 00:57:33 -0700
Subject: [PATCH] fix(configuration): Make sure GenerationInput dumped the
 correct dictionary for llm_config

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
---
 src/openllm/_schema.py                              | 8 ++++++++
 src/openllm/_service.py                             | 1 -
 src/openllm/models/dolly_v2/modeling_dolly_v2.py    | 1 -
 src/openllm/models/flan_t5/modeling_flan_t5.py      | 3 +--
 src/openllm/models/flan_t5/modeling_flax_flan_t5.py | 3 +--
 src/openllm/models/flan_t5/modeling_tf_flan_t5.py   | 3 +--
 src/openllm/models/starcoder/modeling_starcoder.py  | 3 +--
 7 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/src/openllm/_schema.py b/src/openllm/_schema.py
index 2934cc6a..7f1bb10b 100644
--- a/src/openllm/_schema.py
+++ b/src/openllm/_schema.py
@@ -44,6 +44,14 @@ class GenerationInput(pydantic.BaseModel):
             llm_config=(llm_config.__class__, ...),
         )
 
+    # XXX: Need more investigation why llm_config.model_dump is not invoked
+    # recursively when GenerationInput.model_dump is called
+    def model_dump(self, **kwargs: t.Any):
+        """Override the default model_dump to make sure llm_config is correctly flattened."""
+        dumped = super().model_dump(**kwargs)
+        dumped['llm_config'] = self.llm_config.model_dump(flatten=True)
+        return dumped
+
 
 class GenerationOutput(pydantic.BaseModel):
     model_config = {"extra": "forbid"}
diff --git a/src/openllm/_service.py b/src/openllm/_service.py
index 17a38c44..d3b9f2f2 100644
--- a/src/openllm/_service.py
+++ b/src/openllm/_service.py
@@ -25,7 +25,6 @@ svc = bentoml.Service(name=f"llm-{llm_config.__openllm_start_name__}-service", r
     route="/v1/generate",
 )
 async def generate_v1(qa: openllm.GenerationInput) -> openllm.GenerationOutput:
-    print(qa)
     config = llm_config.with_options(__llm_config__=qa.llm_config).model_dump()
     responses = await runner.generate.async_run(qa.prompt, **config)
     return openllm.GenerationOutput(responses=responses, configuration=config)
diff --git a/src/openllm/models/dolly_v2/modeling_dolly_v2.py b/src/openllm/models/dolly_v2/modeling_dolly_v2.py
index fd7be7a6..e26b9a58 100644
--- a/src/openllm/models/dolly_v2/modeling_dolly_v2.py
+++ b/src/openllm/models/dolly_v2/modeling_dolly_v2.py
@@ -81,7 +81,6 @@ class DollyV2(openllm.LLM):
             temperature=temperature,
             top_k=top_k,
             top_p=top_p,
-            do_sample=True,
             **kwargs,
         ).model_dump(flatten=True)
 
diff --git a/src/openllm/models/flan_t5/modeling_flan_t5.py b/src/openllm/models/flan_t5/modeling_flan_t5.py
index 2aaaca78..c9e51560 100644
--- a/src/openllm/models/flan_t5/modeling_flan_t5.py
+++ b/src/openllm/models/flan_t5/modeling_flan_t5.py
@@ -65,7 +65,6 @@ class FlanT5(openllm.LLM):
         self,
         prompt: str,
         max_new_tokens: int | None = None,
-        do_sample: bool = True,
         temperature: float | None = None,
         top_k: float | None = None,
         top_p: float | None = None,
@@ -75,7 +74,7 @@ class FlanT5(openllm.LLM):
         input_ids = t.cast("torch.Tensor", self.tokenizer(prompt, return_tensors="pt").input_ids).to(self.device)
         result_tensor = self.model.generate(
             input_ids,
-            do_sample=do_sample,
+            do_sample=True,
             generation_config=self.config.with_options(
                 max_new_tokens=max_new_tokens,
                 temperature=temperature,
diff --git a/src/openllm/models/flan_t5/modeling_flax_flan_t5.py b/src/openllm/models/flan_t5/modeling_flax_flan_t5.py
index 66c63fd8..b170868e 100644
--- a/src/openllm/models/flan_t5/modeling_flax_flan_t5.py
+++ b/src/openllm/models/flan_t5/modeling_flax_flan_t5.py
@@ -57,7 +57,6 @@ class FlaxFlanT5(openllm.LLM):
         self,
         prompt: str,
         max_new_tokens: int | None = None,
-        do_sample: bool = True,
         temperature: float | None = None,
         top_k: float | None = None,
         top_p: float | None = None,
@@ -67,7 +66,7 @@ class FlaxFlanT5(openllm.LLM):
         input_ids = self.tokenizer(prompt, return_tensors="np")["input_ids"]
         result_tensor = self.model.generate(
             input_ids,
-            do_sample=do_sample,
+            do_sample=True,
             generation_config=self.config.with_options(
                 max_new_tokens=max_new_tokens,
                 temperature=temperature,
diff --git a/src/openllm/models/flan_t5/modeling_tf_flan_t5.py b/src/openllm/models/flan_t5/modeling_tf_flan_t5.py
index 2759b5bc..f1d150a6 100644
--- a/src/openllm/models/flan_t5/modeling_tf_flan_t5.py
+++ b/src/openllm/models/flan_t5/modeling_tf_flan_t5.py
@@ -57,7 +57,6 @@ class TFFlanT5(openllm.LLM):
         self,
         prompt: str,
         max_new_tokens: int | None = None,
-        do_sample: bool = True,
         temperature: float | None = None,
         top_k: float | None = None,
         top_p: float | None = None,
@@ -67,7 +66,7 @@ class TFFlanT5(openllm.LLM):
         input_ids = self.tokenizer(prompt, return_tensors="tf").input_ids
         outputs = self.model.generate(
             input_ids,
-            do_sample=do_sample,
+            do_sample=True,
             generation_config=self.config.with_options(
                 max_new_tokens=max_new_tokens,
                 temperature=temperature,
diff --git a/src/openllm/models/starcoder/modeling_starcoder.py b/src/openllm/models/starcoder/modeling_starcoder.py
index 1e42cbb6..02425830 100644
--- a/src/openllm/models/starcoder/modeling_starcoder.py
+++ b/src/openllm/models/starcoder/modeling_starcoder.py
@@ -128,7 +128,6 @@ class StarCoder(openllm.LLM):
     def generate(
         self,
         prompt: str,
-        do_sample: bool = True,
         temperature: float | None = None,
         top_p: float | None = None,
         max_new_tokens: int | None = None,
@@ -148,7 +147,7 @@ class StarCoder(openllm.LLM):
         inputs = t.cast("torch.Tensor", self.tokenizer.encode(prompt, return_tensors="pt")).to(self.device)
         result_tensor = self.model.generate(
             inputs,
-            do_sample=do_sample,
+            do_sample=True,
             generation_config=self.config.with_options(
                 top_p=top_p,
                 temperature=temperature,
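
Illustrative sketch (not part of the patch): the GenerationInput.model_dump
override in _schema.py follows the pattern below, shown with hypothetical
DummyConfig / DummyGenerationInput stand-ins rather than the real OpenLLM
classes, and assuming pydantic v2-style model_dump. DummyConfig.flat_dump
stands in for LLMConfig.model_dump(flatten=True), which is OpenLLM-specific.

    import typing as t

    import pydantic


    class DummyConfig(pydantic.BaseModel):
        """Stand-in for an LLMConfig-like model holding generation fields."""

        max_new_tokens: int = 256
        temperature: float = 0.9

        def flat_dump(self) -> dict[str, t.Any]:
            # Stand-in for LLMConfig.model_dump(flatten=True): return the
            # generation fields as one flat dictionary.
            return {"max_new_tokens": self.max_new_tokens, "temperature": self.temperature}


    class DummyGenerationInput(pydantic.BaseModel):
        prompt: str
        llm_config: DummyConfig

        def model_dump(self, **kwargs: t.Any) -> dict[str, t.Any]:
            # Dump the outer model normally, then overwrite the nested
            # llm_config entry with the flattened dictionary, mirroring
            # the override added in _schema.py.
            dumped = super().model_dump(**kwargs)
            dumped["llm_config"] = self.llm_config.flat_dump()
            return dumped


    qa = DummyGenerationInput(prompt="What is a llama?", llm_config=DummyConfig())
    assert qa.model_dump()["llm_config"] == {"max_new_tokens": 256, "temperature": 0.9}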