From 8173cb09a50c282f028b5d5dc5646cc773fff827 Mon Sep 17 00:00:00 2001 From: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com> Date: Thu, 7 Sep 2023 01:48:45 +0000 Subject: [PATCH] fix(quantize): dyn quant for int8 and int4 only set tokenizer when it is gptq Signed-off-by: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com> --- openllm-core/src/openllm_core/config/configuration_llama.py | 2 +- openllm-python/pyproject.toml | 2 +- openllm-python/src/openllm/_llm.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/openllm-core/src/openllm_core/config/configuration_llama.py b/openllm-core/src/openllm_core/config/configuration_llama.py index 221355d0..92ce6f6d 100644 --- a/openllm-core/src/openllm_core/config/configuration_llama.py +++ b/openllm-core/src/openllm_core/config/configuration_llama.py @@ -77,7 +77,7 @@ class LlamaConfig(openllm_core.LLMConfig): 'cpu': 'pt', 'nvidia.com/gpu': 'pt' }, 'architecture': 'LlamaForCausalLM', - 'requirements': ['fairscale', 'sentencepiece'], + 'requirements': ['fairscale', 'sentencepiece', 'scipy'], 'tokenizer_class': 'LlamaTokenizerFast', 'default_id': 'NousResearch/llama-2-7b-hf', 'model_ids': [ diff --git a/openllm-python/pyproject.toml b/openllm-python/pyproject.toml index b2219980..69dac610 100644 --- a/openllm-python/pyproject.toml +++ b/openllm-python/pyproject.toml @@ -108,7 +108,7 @@ full = [ ggml = ["ctransformers"] gptq = ["auto-gptq[triton]>=0.4.2", "optimum>=1.12.0"] grpc = ["openllm-client[grpc]"] -llama = ["fairscale", "sentencepiece"] +llama = ["fairscale", "sentencepiece", "scipy"] mpt = ["triton", "einops"] openai = ["openai", "tiktoken"] opt = ["flax>=0.7", "jax", "jaxlib", "tensorflow", "keras"] diff --git a/openllm-python/src/openllm/_llm.py b/openllm-python/src/openllm/_llm.py index fd4e277f..bdfa1d6f 100644 --- a/openllm-python/src/openllm/_llm.py +++ b/openllm-python/src/openllm/_llm.py @@ -444,8 +444,8 @@ class LLM(LLMInterface[M, T], ReprMixin): raise ValueError("'quantization_config' and 'quantize' are mutually exclusive. Either customise your quantization_config or use the 'quantize' argument.") if quantization_config is None and quantize is not None: # in case users input `tokenizer` to __init__, default to the _model_id - _gptq_tokenizer = attrs.pop('tokenizer', _model_id) - quantization_config, attrs = infer_quantisation_config(cls, quantize, tokenizer=_gptq_tokenizer, **attrs) + if quantize == 'gptq': attrs.setdefault('tokenizer', _model_id) + quantization_config, attrs = infer_quantisation_config(cls, quantize, **attrs) if quantize == 'gptq': serialisation = 'safetensors' elif cls.__llm_backend__ == 'vllm': serialisation = 'legacy' # Currently working-in-progress