perf: upgrade mixtral to use expert parallelism (#783)

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
2026-06-12 02:20:32 -04:00 · 2023-12-15 11:45:08 -05:00
parent 7d2c4a4f7a
commit 88b6d3d6de
4 changed files with 6 additions and 5 deletions
--- a/openllm-python/pyproject.toml
+++ b/openllm-python/pyproject.toml
@@ -119,7 +119,7 @@ openai = ["openai[datalib]>=1", "tiktoken"]
 playground = ["jupyter", "notebook", "ipython", "jupytext", "nbformat"]
 qwen = ["cpm-kernels", "tiktoken"]
 starcoder = ["bitsandbytes"]
-vllm = ["vllm>=0.2.4", "megablocks", "stanford-stk", "ray==2.6.0"]
+vllm = ["vllm>=0.2.5", "ray==2.6.0"]

 [tool.hatch.version]
 fallback-version = "0.0.0"
--- a/openllm-python/src/openllm/serialisation/transformers/weights.py
+++ b/openllm-python/src/openllm/serialisation/transformers/weights.py
@@ -52,10 +52,10 @@ class HfIgnore:
  def ignore_patterns(cls, llm: openllm.LLM[t.Any, t.Any]) -> list[str]:
    if llm.__llm_backend__ in {'vllm', 'pt'}:
      base = [cls.tf, cls.flax, cls.gguf]
-      if llm.config['architecture'] == 'MixtralForCausalLM':  # XXX: Hack for Mixtral as safetensors is yet to be working atm
-        base.append(cls.safetensors)
-      elif has_safetensors_weights(llm.model_id):
+      if has_safetensors_weights(llm.model_id):
        base.extend([cls.pt, '*.pt'])
+      elif has_pt_weights(llm.model_id):
+        base.extend([cls.safetensors, cls.pt])
      else:
        base.append(cls.safetensors)
    elif llm.__llm_backend__ == 'ggml':