mirror of
https://github.com/bentoml/OpenLLM.git
synced 2026-03-03 14:46:00 -05:00
fix(quantize): dynamic quantization for int8 and int4
Only set the tokenizer when the quantization method is gptq. Signed-off-by: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com>
This commit is contained in:
@@ -108,7 +108,7 @@ full = [
|
||||
ggml = ["ctransformers"]
|
||||
gptq = ["auto-gptq[triton]>=0.4.2", "optimum>=1.12.0"]
|
||||
grpc = ["openllm-client[grpc]"]
|
||||
llama = ["fairscale", "sentencepiece"]
|
||||
llama = ["fairscale", "sentencepiece", "scipy"]
|
||||
mpt = ["triton", "einops"]
|
||||
openai = ["openai", "tiktoken"]
|
||||
opt = ["flax>=0.7", "jax", "jaxlib", "tensorflow", "keras"]
|
||||
|
||||
@@ -444,8 +444,8 @@ class LLM(LLMInterface[M, T], ReprMixin):
|
||||
raise ValueError("'quantization_config' and 'quantize' are mutually exclusive. Either customise your quantization_config or use the 'quantize' argument.")
|
||||
if quantization_config is None and quantize is not None:
|
||||
# in case users input `tokenizer` to __init__, default to the _model_id
|
||||
_gptq_tokenizer = attrs.pop('tokenizer', _model_id)
|
||||
quantization_config, attrs = infer_quantisation_config(cls, quantize, tokenizer=_gptq_tokenizer, **attrs)
|
||||
if quantize == 'gptq': attrs.setdefault('tokenizer', _model_id)
|
||||
quantization_config, attrs = infer_quantisation_config(cls, quantize, **attrs)
|
||||
if quantize == 'gptq': serialisation = 'safetensors'
|
||||
elif cls.__llm_backend__ == 'vllm': serialisation = 'legacy' # Currently working-in-progress
|
||||
|
||||
|
||||
Reference in New Issue
Block a user