kimi k2 thinking

This commit is contained in:
Alex Cheema
2025-11-11 10:03:39 -08:00
committed by GitHub
parent 364087b91f
commit 631cb81009
9 changed files with 137 additions and 48 deletions

View File

@@ -586,8 +586,37 @@
const modelIds = cluster.model_ids || ['unknown'];
const modelName = modelIds.length === 1 ? modelIds[0] : `${modelIds.length} models`;
// Get strategy (default to 'N/A' if not specified)
const strategy = cluster.strategy || 'N/A';
// Get strategy (backwards compatible with old format)
// New format: sharding + instance_meta, e.g. "Pipeline (MLX Ring)"
// Old format: strategy field
let strategy = 'N/A';
if (cluster.strategy) {
// Backwards compatibility: use old strategy field
strategy = cluster.strategy;
} else if (cluster.sharding || cluster.instance_meta) {
// New format: combine sharding and instance_meta
const sharding = cluster.sharding || '';
const instanceMeta = cluster.instance_meta || '';
// Format instance_meta: convert camelCase/PascalCase to readable format
const formatInstanceMeta = (meta) => {
  // Convert a PascalCase instance_meta value (e.g. "MlxRing") into a
  // human-readable label (e.g. "MLX Ring"). Returns '' for empty/missing input.
  if (!meta) return '';
  return meta
    // Split at lowercase/digit -> uppercase boundaries: "MlxRing" -> "Mlx Ring".
    .replace(/([a-z0-9])([A-Z])/g, '$1 $2')
    // Split an acronym run followed by a word: "MLXRing" -> "MLX Ring".
    // (A naive split before every capital would mangle runs like "MLX" into "M L X".)
    .replace(/([A-Z]+)([A-Z][a-z])/g, '$1 $2')
    .trim()
    // Upper-case known acronyms that arrive in PascalCase form.
    .replace(/\bMlx\b/g, 'MLX')
    .replace(/\bIbv\b/g, 'IBV');
};
if (sharding && instanceMeta) {
strategy = `${sharding} (${formatInstanceMeta(instanceMeta)})`;
} else if (sharding) {
strategy = sharding;
} else if (instanceMeta) {
strategy = formatInstanceMeta(instanceMeta);
}
}
// For each stage in the configuration, create a row
stages.forEach((stageConfig, stageIdx) => {

View File

@@ -4,7 +4,7 @@
# Hardware configuration - maps runner labels to instance counts
hardware_plan:
puffin4: 1
# puffin8: 1
puffin8: 1
# Environment variables to set on each node
environment:
@@ -18,14 +18,15 @@ timeout_seconds: 1800
# Model instances to run concurrently
model_ids:
# - "mlx-community/DeepSeek-V3.1-8bit"
- "mlx-community/Kimi-K2-Instruct-4bit"
# - "mlx-community/Kimi-K2-Instruct-4bit"
- "mlx-community/Kimi-K2-Thinking"
# - "mlx-community/Qwen3-235B-A22B-4bit"
# - "mlx-community/Llama-3.3-70B-Instruct-4bit"
# - "mlx-community/Llama-3.3-70B-Instruct-8bit"
# - "mlx-community/Llama-3.2-1B-Instruct-4bit"
# Sharding strategy: "Pipeline" or "Tensor"
sharding: "Tensor"
sharding: "Pipeline"
# Instance type: "MlxRing" or "MlxIbv"
instance_meta: "MlxIbv"
@@ -46,62 +47,62 @@ stages:
prompt_length: 64
generation_length: 64
time_between_requests: 2.0
iterations: 10
- name: "pp64_g512"
prompt_length: 64
generation_length: 512
time_between_requests: 2.0
iterations: 10
iterations: 5
# - name: "pp64_g512"
# prompt_length: 64
# generation_length: 512
# time_between_requests: 2.0
# iterations: 10
- name: "pp256_g64"
prompt_length: 256
generation_length: 64
time_between_requests: 2.0
iterations: 10
- name: "pp256_g512"
prompt_length: 256
generation_length: 512
time_between_requests: 2.0
iterations: 10
iterations: 5
# - name: "pp256_g512"
# prompt_length: 256
# generation_length: 512
# time_between_requests: 2.0
# iterations: 10
- name: "pp1024_g64"
prompt_length: 1024
generation_length: 64
time_between_requests: 2.0
iterations: 10
- name: "pp1024_g512"
prompt_length: 1024
generation_length: 512
time_between_requests: 2.0
iterations: 10
iterations: 5
# - name: "pp1024_g512"
# prompt_length: 1024
# generation_length: 512
# time_between_requests: 2.0
# iterations: 10
- name: "pp2048_g64"
prompt_length: 2048
generation_length: 64
time_between_requests: 2.0
iterations: 10
- name: "pp2048_g512"
prompt_length: 2048
generation_length: 512
time_between_requests: 2.0
iterations: 10
iterations: 5
# - name: "pp2048_g512"
# prompt_length: 2048
# generation_length: 512
# time_between_requests: 2.0
# iterations: 10
- name: "pp4096_g64"
prompt_length: 4096
generation_length: 64
time_between_requests: 2.0
iterations: 10
- name: "pp4096_g512"
prompt_length: 4096
generation_length: 512
time_between_requests: 2.0
iterations: 10
iterations: 5
# - name: "pp4096_g512"
# prompt_length: 4096
# generation_length: 512
# time_between_requests: 2.0
# iterations: 10
- name: "pp8192_g64"
prompt_length: 8192
generation_length: 64
time_between_requests: 2.0
iterations: 10
- name: "pp8192_g512"
prompt_length: 8192
generation_length: 512
time_between_requests: 2.0
iterations: 10
iterations: 5
# - name: "pp8192_g512"
# prompt_length: 8192
# generation_length: 512
# time_between_requests: 2.0
# iterations: 10
# - name: "pp16384_g64"
# prompt_length: 16384
# generation_length: 64

View File

@@ -19,6 +19,7 @@
21. Make two separate things: tensor or pipeline, and ring or ibv.
22. When downloading a model for the first time, requests time out and I think the model never actually ends up loading into memory, or something.
23. Do we need cache_limit? We went back and forth on that a lot because we thought it might be causing issues. One problem is it sets it relative to model size. So if you have multiple models loaded in it will take the most recent model size for the cache_limit. This is problematic if you launch DeepSeek -> Llama for example.
24. Task cancellation. When an API HTTP request gets cancelled, it should cancel the corresponding task.
Potential refactors:

View File

@@ -36,6 +36,7 @@ dependencies = [
"bidict>=0.23.1",
"mlx>=0.29.3",
"mlx-lm>=0.28.3",
"tiktoken>=0.12.0", # required for kimi k2 tokenizer
]
[project.scripts]

View File

@@ -3,7 +3,7 @@ from functools import partial
from inspect import signature
from typing import TYPE_CHECKING, Callable, Protocol, cast, override
from mlx_lm.models.cache import KVCache
from mlx_lm.models.cache import KVCache, RotatingKVCache
from mlx_lm.models.deepseek_v3 import DeepseekV3MLP
from mlx_lm.models.deepseek_v3 import Model as DeepseekV3Model
from mlx_lm.models.llama import Model as LlamaModel
@@ -92,7 +92,7 @@ class PipelineLastLayer(CustomMlxLayer):
cache = self.original_layer_signature.bind_partial(x, *args, **kwargs).arguments.get("cache", None)
assert cache is None or isinstance(cache, KVCache)
assert cache is None or isinstance(cache, (KVCache, RotatingKVCache))
output: mx.array = self.original_layer(x, *args, **kwargs)

View File

@@ -2,7 +2,7 @@ import os
import resource
from typing import Any, Callable, cast
from mlx_lm.models.cache import KVCache
from mlx_lm.models.cache import KVCache, RotatingKVCache
from mlx_lm.sample_utils import make_sampler
from mlx_lm.tokenizer_utils import TokenizerWrapper
@@ -254,9 +254,14 @@ class NullKVCache(KVCache):
def make_kv_cache(
model: Model,
max_kv_size: int | None = None,
) -> list[KVCache]:
) -> list[KVCache | RotatingKVCache]:
assert hasattr(model, "layers")
if max_kv_size is None:
logger.info("Using default KV cache")
return [KVCache() for _ in model.layers]
else:
logger.info(f"Using rotating KV cache with {max_kv_size=}")
return [RotatingKVCache(max_size=max_kv_size) for _ in model.layers]
def mlx_force_oom(size: int = 40000) -> None:

View File

@@ -215,7 +215,7 @@ class API:
while not finished:
# TODO: how long should this timeout be?
chunk = await asyncio.wait_for(
self._chat_completion_queues[command_id].get(), timeout=60
self._chat_completion_queues[command_id].get(), timeout=600
)
assert isinstance(chunk, TokenChunk)
chunk_response: ChatCompletionResponse = chunk_to_response(

View File

@@ -93,6 +93,7 @@ MODEL_CARDS: dict[str, ModelCard] = {
n_layers=61,
),
),
# kimi k2
"kimi-k2-instruct-4bit": ModelCard(
short_id="kimi-k2-instruct-4bit",
model_id="mlx-community/Kimi-K2-Instruct-4bit",
@@ -106,6 +107,19 @@ MODEL_CARDS: dict[str, ModelCard] = {
n_layers=61,
),
),
"kimi-k2-thinking": ModelCard(
short_id="kimi-k2-thinking",
model_id="mlx-community/Kimi-K2-Thinking",
name="Kimi K2 Thinking",
description="""Kimi K2 Thinking is the latest, most capable version of open-source thinking model.""",
tags=[],
metadata=ModelMetadata(
model_id=ModelId("mlx-community/Kimi-K2-Thinking"),
pretty_name="Kimi K2 Thinking",
storage_size=Memory.from_bytes(577597603840),
n_layers=61,
),
),
# llama-3.1
"llama-3.1-8b": ModelCard(
short_id="llama-3.1-8b",

40
uv.lock generated
View File

@@ -1,5 +1,5 @@
version = 1
revision = 3
revision = 1
requires-python = ">=3.13"
resolution-markers = [
"sys_platform == 'darwin'",
@@ -361,6 +361,7 @@ dependencies = [
{ name = "sqlalchemy", extra = ["asyncio"], marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "sqlmodel", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "textual", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "tiktoken", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "transformers", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "typeguard", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "types-aiofiles", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
@@ -403,6 +404,7 @@ requires-dist = [
{ name = "sqlalchemy", extras = ["asyncio"], specifier = ">=2.0.43" },
{ name = "sqlmodel", specifier = ">=0.0.24" },
{ name = "textual", specifier = ">=5.3.0" },
{ name = "tiktoken", specifier = ">=0.12.0" },
{ name = "transformers", specifier = ">=4.55.2" },
{ name = "typeguard", specifier = ">=4.4.4" },
{ name = "types-aiofiles", specifier = ">=24.1.0.20250708" },
@@ -1458,6 +1460,42 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/42/37/1deba011782a49ea249c73adcf703a39b0249ac9b0e17d1a2e4074df8d57/textual-6.5.0-py3-none-any.whl", hash = "sha256:c5505be7fe606b8054fb88431279885f88352bddca64832f6acd293ef7d9b54f", size = 711848, upload-time = "2025-10-31T17:21:51.134Z" },
]
[[package]]
name = "tiktoken"
version = "0.12.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "regex", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "requests", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/7d/ab/4d017d0f76ec3171d469d80fc03dfbb4e48a4bcaddaa831b31d526f05edc/tiktoken-0.12.0.tar.gz", hash = "sha256:b18ba7ee2b093863978fcb14f74b3707cdc8d4d4d3836853ce7ec60772139931", size = 37806 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/00/61/441588ee21e6b5cdf59d6870f86beb9789e532ee9718c251b391b70c68d6/tiktoken-0.12.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:775c2c55de2310cc1bc9a3ad8826761cbdc87770e586fd7b6da7d4589e13dab3", size = 1050802 },
{ url = "https://files.pythonhosted.org/packages/1f/05/dcf94486d5c5c8d34496abe271ac76c5b785507c8eae71b3708f1ad9b45a/tiktoken-0.12.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:a01b12f69052fbe4b080a2cfb867c4de12c704b56178edf1d1d7b273561db160", size = 993995 },
{ url = "https://files.pythonhosted.org/packages/a0/70/5163fe5359b943f8db9946b62f19be2305de8c3d78a16f629d4165e2f40e/tiktoken-0.12.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:01d99484dc93b129cd0964f9d34eee953f2737301f18b3c7257bf368d7615baa", size = 1128948 },
{ url = "https://files.pythonhosted.org/packages/0c/da/c028aa0babf77315e1cef357d4d768800c5f8a6de04d0eac0f377cb619fa/tiktoken-0.12.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:4a1a4fcd021f022bfc81904a911d3df0f6543b9e7627b51411da75ff2fe7a1be", size = 1151986 },
{ url = "https://files.pythonhosted.org/packages/a0/5a/886b108b766aa53e295f7216b509be95eb7d60b166049ce2c58416b25f2a/tiktoken-0.12.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:981a81e39812d57031efdc9ec59fa32b2a5a5524d20d4776574c4b4bd2e9014a", size = 1194222 },
{ url = "https://files.pythonhosted.org/packages/f4/f8/4db272048397636ac7a078d22773dd2795b1becee7bc4922fe6207288d57/tiktoken-0.12.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:9baf52f84a3f42eef3ff4e754a0db79a13a27921b457ca9832cf944c6be4f8f3", size = 1255097 },
{ url = "https://files.pythonhosted.org/packages/ce/76/994fc868f88e016e6d05b0da5ac24582a14c47893f4474c3e9744283f1d5/tiktoken-0.12.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:d5f89ea5680066b68bcb797ae85219c72916c922ef0fcdd3480c7d2315ffff16", size = 1050309 },
{ url = "https://files.pythonhosted.org/packages/f6/b8/57ef1456504c43a849821920d582a738a461b76a047f352f18c0b26c6516/tiktoken-0.12.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:b4e7ed1c6a7a8a60a3230965bdedba8cc58f68926b835e519341413370e0399a", size = 993712 },
{ url = "https://files.pythonhosted.org/packages/72/90/13da56f664286ffbae9dbcfadcc625439142675845baa62715e49b87b68b/tiktoken-0.12.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:fc530a28591a2d74bce821d10b418b26a094bf33839e69042a6e86ddb7a7fb27", size = 1128725 },
{ url = "https://files.pythonhosted.org/packages/05/df/4f80030d44682235bdaecd7346c90f67ae87ec8f3df4a3442cb53834f7e4/tiktoken-0.12.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:06a9f4f49884139013b138920a4c393aa6556b2f8f536345f11819389c703ebb", size = 1151875 },
{ url = "https://files.pythonhosted.org/packages/22/1f/ae535223a8c4ef4c0c1192e3f9b82da660be9eb66b9279e95c99288e9dab/tiktoken-0.12.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:04f0e6a985d95913cabc96a741c5ffec525a2c72e9df086ff17ebe35985c800e", size = 1194451 },
{ url = "https://files.pythonhosted.org/packages/78/a7/f8ead382fce0243cb625c4f266e66c27f65ae65ee9e77f59ea1653b6d730/tiktoken-0.12.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:0ee8f9ae00c41770b5f9b0bb1235474768884ae157de3beb5439ca0fd70f3e25", size = 1253794 },
{ url = "https://files.pythonhosted.org/packages/72/05/3abc1db5d2c9aadc4d2c76fa5640134e475e58d9fbb82b5c535dc0de9b01/tiktoken-0.12.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:a90388128df3b3abeb2bfd1895b0681412a8d7dc644142519e6f0a97c2111646", size = 1050188 },
{ url = "https://files.pythonhosted.org/packages/e3/7b/50c2f060412202d6c95f32b20755c7a6273543b125c0985d6fa9465105af/tiktoken-0.12.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:da900aa0ad52247d8794e307d6446bd3cdea8e192769b56276695d34d2c9aa88", size = 993978 },
{ url = "https://files.pythonhosted.org/packages/14/27/bf795595a2b897e271771cd31cb847d479073497344c637966bdf2853da1/tiktoken-0.12.0-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:285ba9d73ea0d6171e7f9407039a290ca77efcdb026be7769dccc01d2c8d7fff", size = 1129271 },
{ url = "https://files.pythonhosted.org/packages/f5/de/9341a6d7a8f1b448573bbf3425fa57669ac58258a667eb48a25dfe916d70/tiktoken-0.12.0-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:d186a5c60c6a0213f04a7a802264083dea1bbde92a2d4c7069e1a56630aef830", size = 1151216 },
{ url = "https://files.pythonhosted.org/packages/75/0d/881866647b8d1be4d67cb24e50d0c26f9f807f994aa1510cb9ba2fe5f612/tiktoken-0.12.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:604831189bd05480f2b885ecd2d1986dc7686f609de48208ebbbddeea071fc0b", size = 1194860 },
{ url = "https://files.pythonhosted.org/packages/b3/1e/b651ec3059474dab649b8d5b69f5c65cd8fcd8918568c1935bd4136c9392/tiktoken-0.12.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:8f317e8530bb3a222547b85a58583238c8f74fd7a7408305f9f63246d1a0958b", size = 1254567 },
{ url = "https://files.pythonhosted.org/packages/ac/a4/72eed53e8976a099539cdd5eb36f241987212c29629d0a52c305173e0a68/tiktoken-0.12.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:c2c714c72bc00a38ca969dae79e8266ddec999c7ceccd603cc4f0d04ccd76365", size = 1050473 },
{ url = "https://files.pythonhosted.org/packages/e6/d7/0110b8f54c008466b19672c615f2168896b83706a6611ba6e47313dbc6e9/tiktoken-0.12.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:cbb9a3ba275165a2cb0f9a83f5d7025afe6b9d0ab01a22b50f0e74fee2ad253e", size = 993855 },
{ url = "https://files.pythonhosted.org/packages/5f/77/4f268c41a3957c418b084dd576ea2fad2e95da0d8e1ab705372892c2ca22/tiktoken-0.12.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:dfdfaa5ffff8993a3af94d1125870b1d27aed7cb97aa7eb8c1cefdbc87dbee63", size = 1129022 },
{ url = "https://files.pythonhosted.org/packages/4e/2b/fc46c90fe5028bd094cd6ee25a7db321cb91d45dc87531e2bdbb26b4867a/tiktoken-0.12.0-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:584c3ad3d0c74f5269906eb8a659c8bfc6144a52895d9261cdaf90a0ae5f4de0", size = 1150736 },
{ url = "https://files.pythonhosted.org/packages/28/c0/3c7a39ff68022ddfd7d93f3337ad90389a342f761c4d71de99a3ccc57857/tiktoken-0.12.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:54c891b416a0e36b8e2045b12b33dd66fb34a4fe7965565f1b482da50da3e86a", size = 1194908 },
{ url = "https://files.pythonhosted.org/packages/ab/0d/c1ad6f4016a3968c048545f5d9b8ffebf577774b2ede3e2e352553b685fe/tiktoken-0.12.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:5edb8743b88d5be814b1a8a8854494719080c28faaa1ccbef02e87354fe71ef0", size = 1253706 },
]
[[package]]
name = "tokenizers"
version = "0.22.1"