fix(configuration): Make sure GenerationInput dumps the correct dictionary for llm_config

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
Aaron, 2023-05-27 00:57:33 -07:00
parent 8ee5b048f3
commit c73732db6f
7 changed files with 12 additions and 10 deletions


@@ -44,6 +44,14 @@ class GenerationInput(pydantic.BaseModel):
             llm_config=(llm_config.__class__, ...),
         )
 
+    # XXX: Need more investigation why llm_config.model_dump is not invoked
+    # recursively when GenerationInput.model_dump is called
+    def model_dump(self, **kwargs: t.Any):
+        """Override the default model_dump to make sure llm_config is correctly flattened."""
+        dumped = super().model_dump(**kwargs)
+        dumped['llm_config'] = self.llm_config.model_dump(flatten=True)
+        return dumped
+
 
 class GenerationOutput(pydantic.BaseModel):
     model_config = {"extra": "forbid"}
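
The XXX comment reflects pydantic v2 behaviour: when a parent model is dumped, nested models go through the core serializer rather than through the child's overridden model_dump, so a custom keyword such as flatten never reaches llm_config. A minimal sketch of the same pattern, where the *Sketch classes, the nested "generation" field, and the flatten keyword are illustrative stand-ins rather than openllm's real config:

import typing as t

import pydantic


class LLMConfigSketch(pydantic.BaseModel):
    # Illustrative stand-in for an openllm LLMConfig; the nested "generation"
    # field and the flatten keyword are assumptions for this demo only.
    generation: dict = {"temperature": 0.9, "top_k": 15}

    def model_dump(self, *, flatten: bool = False, **kwargs: t.Any) -> dict:
        dumped = super().model_dump(**kwargs)
        if flatten:
            # Hoist the nested generation keys to the top level.
            dumped.update(dumped.pop("generation"))
        return dumped


class GenerationInputSketch(pydantic.BaseModel):
    prompt: str
    llm_config: LLMConfigSketch

    def model_dump(self, **kwargs: t.Any) -> dict:
        # pydantic serializes nested models itself, so the child's custom
        # model_dump(flatten=True) is never invoked; call it explicitly.
        dumped = super().model_dump(**kwargs)
        dumped["llm_config"] = self.llm_config.model_dump(flatten=True)
        return dumped


inp = GenerationInputSketch(prompt="hi", llm_config=LLMConfigSketch())
print(inp.model_dump()["llm_config"])  # {'temperature': 0.9, 'top_k': 15}

With the override on GenerationInputSketch removed, the printed value would still contain the nested "generation" key, which is the mismatch this commit fixes for llm_config.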


@@ -25,7 +25,6 @@ svc = bentoml.Service(name=f"llm-{llm_config.__openllm_start_name__}-service", r
     route="/v1/generate",
 )
 async def generate_v1(qa: openllm.GenerationInput) -> openllm.GenerationOutput:
-    print(qa)
     config = llm_config.with_options(__llm_config__=qa.llm_config).model_dump()
     responses = await runner.generate.async_run(qa.prompt, **config)
     return openllm.GenerationOutput(responses=responses, configuration=config)
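
For reference, a hedged client-side sketch of calling this endpoint. The host and port assume a locally served Bento on BentoML's default port 3000, and the llm_config keys shown are placeholders rather than a documented schema:

import requests

payload = {
    "prompt": "What is the capital of France?",
    # Field names mirror openllm.GenerationInput above; the concrete
    # llm_config keys here are illustrative placeholders.
    "llm_config": {"max_new_tokens": 128, "temperature": 0.7},
}

resp = requests.post("http://localhost:3000/v1/generate", json=payload)
resp.raise_for_status()
body = resp.json()
# GenerationOutput above carries "responses" and "configuration".
print(body["responses"], body["configuration"])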


@@ -81,7 +81,6 @@ class DollyV2(openllm.LLM):
             temperature=temperature,
             top_k=top_k,
             top_p=top_p,
-            do_sample=True,
             **kwargs,
         ).model_dump(flatten=True)
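
Here, and in the model classes that follow, do_sample stops being a caller-tunable option: sampling is always on, and only the sampling hyperparameters travel through the flattened config. A toy sketch of that call shape, where the config keys and the fake_generate stub are assumptions for illustration rather than openllm code:

import typing as t

# Assumed shape of config.with_options(...).model_dump(flatten=True): a flat
# keyword dictionary that can be splatted straight into a generate call.
flat_config: t.Dict[str, t.Any] = {
    "max_new_tokens": 256,
    "temperature": 0.9,
    "top_k": 50,
    "top_p": 0.95,
}


def fake_generate(prompt: str, *, do_sample: bool, **attrs: t.Any) -> t.Dict[str, t.Any]:
    # Stub standing in for a backend's model.generate or pipeline call.
    return {"prompt": prompt, "do_sample": do_sample, **attrs}


# Sampling is pinned on; everything tunable rides in the flattened config.
print(fake_generate("Hello", do_sample=True, **flat_config))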


@@ -65,7 +65,6 @@ class FlanT5(openllm.LLM):
         self,
         prompt: str,
         max_new_tokens: int | None = None,
-        do_sample: bool = True,
         temperature: float | None = None,
         top_k: float | None = None,
         top_p: float | None = None,
@@ -75,7 +74,7 @@ class FlanT5(openllm.LLM):
         input_ids = t.cast("torch.Tensor", self.tokenizer(prompt, return_tensors="pt").input_ids).to(self.device)
         result_tensor = self.model.generate(
             input_ids,
-            do_sample=do_sample,
+            do_sample=True,
             generation_config=self.config.with_options(
                 max_new_tokens=max_new_tokens,
                 temperature=temperature,
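
The Flax, TF, and StarCoder hunks below repeat this change for their respective backends. As a stand-alone illustration of the resulting call shape using plain transformers, where the model name, the decoding values, and the use of transformers.GenerationConfig in place of openllm's config object are all assumptions:

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig

tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")

input_ids = tokenizer("Translate English to German: good morning", return_tensors="pt").input_ids
result_tensor = model.generate(
    input_ids,
    do_sample=True,  # pinned on, mirroring the change in these hunks
    generation_config=GenerationConfig(
        max_new_tokens=64,
        temperature=0.9,
        top_k=50,
        top_p=0.95,
    ),
)
print(tokenizer.decode(result_tensor[0], skip_special_tokens=True))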


@@ -57,7 +57,6 @@ class FlaxFlanT5(openllm.LLM):
         self,
         prompt: str,
         max_new_tokens: int | None = None,
-        do_sample: bool = True,
         temperature: float | None = None,
         top_k: float | None = None,
         top_p: float | None = None,
@@ -67,7 +66,7 @@ class FlaxFlanT5(openllm.LLM):
         input_ids = self.tokenizer(prompt, return_tensors="np")["input_ids"]
         result_tensor = self.model.generate(
             input_ids,
-            do_sample=do_sample,
+            do_sample=True,
             generation_config=self.config.with_options(
                 max_new_tokens=max_new_tokens,
                 temperature=temperature,


@@ -57,7 +57,6 @@ class TFFlanT5(openllm.LLM):
         self,
         prompt: str,
         max_new_tokens: int | None = None,
-        do_sample: bool = True,
         temperature: float | None = None,
         top_k: float | None = None,
         top_p: float | None = None,
@@ -67,7 +66,7 @@ class TFFlanT5(openllm.LLM):
         input_ids = self.tokenizer(prompt, return_tensors="tf").input_ids
         outputs = self.model.generate(
             input_ids,
-            do_sample=do_sample,
+            do_sample=True,
             generation_config=self.config.with_options(
                 max_new_tokens=max_new_tokens,
                 temperature=temperature,


@@ -128,7 +128,6 @@ class StarCoder(openllm.LLM):
     def generate(
         self,
         prompt: str,
-        do_sample: bool = True,
         temperature: float | None = None,
         top_p: float | None = None,
         max_new_tokens: int | None = None,
@@ -148,7 +147,7 @@ class StarCoder(openllm.LLM):
         inputs = t.cast("torch.Tensor", self.tokenizer.encode(prompt, return_tensors="pt")).to(self.device)
         result_tensor = self.model.generate(
             inputs,
-            do_sample=do_sample,
+            do_sample=True,
             generation_config=self.config.with_options(
                 top_p=top_p,
                 temperature=temperature,