From ebfed3c1166ba76eea3c14e06fcd326d814dc569 Mon Sep 17 00:00:00 2001
From: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com>
Date: Sat, 10 Jun 2023 09:46:33 +0000
Subject: [PATCH] fix(chatglm): generation tokens not concatenated correctly

Signed-off-by: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com>
---
 .../models/chatglm/configuration_chatglm.py |  2 +-
 .../models/chatglm/modeling_chatglm.py      | 29 +++----------------
 2 files changed, 5 insertions(+), 26 deletions(-)

diff --git a/src/openllm/models/chatglm/configuration_chatglm.py b/src/openllm/models/chatglm/configuration_chatglm.py
index f7958697..783b844c 100644
--- a/src/openllm/models/chatglm/configuration_chatglm.py
+++ b/src/openllm/models/chatglm/configuration_chatglm.py
@@ -41,7 +41,7 @@ class ChatGLMConfig(
     Refer to [ChatGLM's GitHub page](https://github.com/THUDM/ChatGLM-6B) for more information.
     """
 
-    retain_history: bool = True
+    retain_history: bool = False
     """Whether to retain history given to the model. If set to True, then the model will retain given history."""
 
     use_half_precision: bool = True
diff --git a/src/openllm/models/chatglm/modeling_chatglm.py b/src/openllm/models/chatglm/modeling_chatglm.py
index 86a605cb..bce2f513 100644
--- a/src/openllm/models/chatglm/modeling_chatglm.py
+++ b/src/openllm/models/chatglm/modeling_chatglm.py
@@ -13,7 +13,6 @@
 # limitations under the License.
 from __future__ import annotations
 
-import re
 import typing as t
 
 import bentoml
@@ -39,32 +38,12 @@ class InvalidScoreLogitsProcessor(LogitsProcessor):
         return scores
 
 
-def process_response(
-    response: str,
-    use_default_prompt_template: bool = True,
-):
-    response = response.strip()
-    if use_default_prompt_template:
-        response = response.replace("[[训练时间]]", "2023年")
-    punkts = [
-        [",", ","],
-        ["!", "!"],
-        [":", ":"],
-        [";", ";"],
-        ["\?", "?"],
-    ]
-    for item in punkts:
-        response = re.sub(r"([\u4e00-\u9fff])%s" % item[0], r"\1%s" % item[1], response)
-        response = re.sub(r"%s([\u4e00-\u9fff])" % item[0], r"%s\1" % item[1], response)
-    return response
-
-
 class ChatGLM(openllm.LLM):
     __openllm_internal__ = True
 
-    default_model = "THUDM/chatglm-6b-int4"
+    default_model = "thudm/chatglm-6b-int4"
 
-    pretrained = ["THUDM/chatglm-6b", "THUDM/chatglm-6b-int8", "THUDM/chatglm-6b-int4"]
+    pretrained = ["thudm/chatglm-6b", "thudm/chatglm-6b-int8", "thudm/chatglm-6b-int4"]
 
     device = torch.device("cuda")
 
@@ -121,7 +100,7 @@ class ChatGLM(openllm.LLM):
         if self.config.retain_history:
             assert chat_history is not None, "'retain_history' is True while there is no history provided."
             chat_history.append((prompt, generation_result))
-        return generation_result
+        return "".join(generation_result)
 
     @torch.inference_mode()
     def generate(self, prompt: str, use_default_prompt_template: bool = True, **attrs: t.Any) -> str:
@@ -144,4 +123,4 @@ class ChatGLM(openllm.LLM):
         )
         outputs = outputs.tolist()[0][len(inputs["input_ids"][0]) :]
         response = self.tokenizer.decode(outputs)
-        return process_response(response, use_default_prompt_template)
+        return self.model.process_response(response)
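
Note on the fix (a sketch, not part of the patch): the change to `return "".join(generation_result)` suggests that `generation_result` is a list of decoded text pieces, so returning it directly would hand callers a Python list where a string is expected; joining concatenates the pieces into a single response. The local `process_response` copy is dropped in favor of `self.model.process_response(response)`, presumably because the ChatGLM remote modeling code already exposes an equivalent method. The helper below is illustrative only; its name and arguments mirror the hunk above, but the surrounding OpenLLM plumbing is omitted.

# Illustrative sketch only -- not OpenLLM code. Assumes `generation_result`
# is a list of decoded text pieces, as the patched return statement suggests.
from __future__ import annotations

import typing as t


def postprocess_generate(
    prompt: str,
    generation_result: list[str],
    chat_history: list[tuple[str, t.Any]] | None = None,
    retain_history: bool = False,
) -> str:
    if retain_history:
        assert chat_history is not None, "'retain_history' is True while there is no history provided."
        chat_history.append((prompt, generation_result))
    # Before the fix this returned the raw list; callers expecting a string
    # would see something like "['你好', '!']". Joining concatenates the
    # pieces into one response string.
    return "".join(generation_result)


if __name__ == "__main__":
    history: list[tuple[str, t.Any]] = []
    print(postprocess_generate("你好", ["你好", "!", "有什么可以帮你?"], chat_history=history, retain_history=True))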