diff --git a/.github/benchmark-dashboard/index.html b/.github/benchmark-dashboard/index.html
index 5b64af48..5f72a831 100644
--- a/.github/benchmark-dashboard/index.html
+++ b/.github/benchmark-dashboard/index.html
@@ -586,8 +586,37 @@
           const modelIds = cluster.model_ids || ['unknown'];
           const modelName = modelIds.length === 1 ? modelIds[0] : `${modelIds.length} models`;
 
-          // Get strategy (default to 'N/A' if not specified)
-          const strategy = cluster.strategy || 'N/A';
+          // Get strategy (backwards compatible with old format)
+          // New format: sharding + instance_meta, e.g. "Pipeline (MLX Ring)"
+          // Old format: strategy field
+          let strategy = 'N/A';
+          if (cluster.strategy) {
+            // Backwards compatibility: use old strategy field
+            strategy = cluster.strategy;
+          } else if (cluster.sharding || cluster.instance_meta) {
+            // New format: combine sharding and instance_meta
+            const sharding = cluster.sharding || '';
+            const instanceMeta = cluster.instance_meta || '';
+
+            // Format instance_meta: convert camelCase/PascalCase to readable format
+            const formatInstanceMeta = (meta) => {
+              if (!meta) return '';
+              // Insert spaces before capital letters and handle common acronyms
+              return meta
+                .replace(/([A-Z])/g, ' $1')
+                .trim()
+                .replace(/\bMlx\b/g, 'MLX')
+                .replace(/\bIbv\b/g, 'IBV');
+            };
+
+            if (sharding && instanceMeta) {
+              strategy = `${sharding} (${formatInstanceMeta(instanceMeta)})`;
+            } else if (sharding) {
+              strategy = sharding;
+            } else if (instanceMeta) {
+              strategy = formatInstanceMeta(instanceMeta);
+            }
+          }
 
           // For each stage in the configuration, create a row
           stages.forEach((stageConfig, stageIdx) => {
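For reference, the dashboard hunk above is the only place the label format is defined. The same rule in Python, as a sketch (the function name and sample values are illustrative, not part of the patch):

```python
import re

def format_instance_meta(meta: str) -> str:
  """Mirror the dashboard's formatInstanceMeta: split PascalCase into
  words, then normalize the MLX and IBV acronyms."""
  if not meta:
    return ""
  spaced = re.sub(r"([A-Z])", r" \1", meta).strip()
  spaced = re.sub(r"\bMlx\b", "MLX", spaced)
  return re.sub(r"\bIbv\b", "IBV", spaced)

# "Pipeline" + "MlxRing" renders as "Pipeline (MLX Ring)"
assert format_instance_meta("MlxRing") == "MLX Ring"
assert format_instance_meta("MlxIbv") == "MLX IBV"
```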
diff --git a/.github/configs/bench_simple.yaml b/.github/configs/bench_simple.yaml
index 346df681..91c85020 100644
--- a/.github/configs/bench_simple.yaml
+++ b/.github/configs/bench_simple.yaml
@@ -4,7 +4,7 @@
 # Hardware configuration - maps runner labels to instance counts
 hardware_plan:
   puffin4: 1
-  # puffin8: 1
+  puffin8: 1
 
 # Environment variables to set on each node
 environment:
@@ -18,14 +18,15 @@ timeout_seconds: 1800
 # Model instances to run concurrently
 model_ids:
   # - "mlx-community/DeepSeek-V3.1-8bit"
-  - "mlx-community/Kimi-K2-Instruct-4bit"
+  # - "mlx-community/Kimi-K2-Instruct-4bit"
+  - "mlx-community/Kimi-K2-Thinking"
   # - "mlx-community/Qwen3-235B-A22B-4bit"
   # - "mlx-community/Llama-3.3-70B-Instruct-4bit"
   # - "mlx-community/Llama-3.3-70B-Instruct-8bit"
   # - "mlx-community/Llama-3.2-1B-Instruct-4bit"
 
 # Sharding strategy: "Pipeline" or "Tensor"
-sharding: "Tensor"
+sharding: "Pipeline"
 
 # Instance type: "MlxRing" or "MlxIbv"
 instance_meta: "MlxIbv"
@@ -46,62 +47,62 @@ stages:
     prompt_length: 64
     generation_length: 64
     time_between_requests: 2.0
-    iterations: 10
-  - name: "pp64_g512"
-    prompt_length: 64
-    generation_length: 512
-    time_between_requests: 2.0
-    iterations: 10
+    iterations: 5
+  # - name: "pp64_g512"
+  #   prompt_length: 64
+  #   generation_length: 512
+  #   time_between_requests: 2.0
+  #   iterations: 10
   - name: "pp256_g64"
     prompt_length: 256
     generation_length: 64
     time_between_requests: 2.0
-    iterations: 10
-  - name: "pp256_g512"
-    prompt_length: 256
-    generation_length: 512
-    time_between_requests: 2.0
-    iterations: 10
+    iterations: 5
+  # - name: "pp256_g512"
+  #   prompt_length: 256
+  #   generation_length: 512
+  #   time_between_requests: 2.0
+  #   iterations: 10
   - name: "pp1024_g64"
     prompt_length: 1024
     generation_length: 64
     time_between_requests: 2.0
-    iterations: 10
-  - name: "pp1024_g512"
-    prompt_length: 1024
-    generation_length: 512
-    time_between_requests: 2.0
-    iterations: 10
+    iterations: 5
+  # - name: "pp1024_g512"
+  #   prompt_length: 1024
+  #   generation_length: 512
+  #   time_between_requests: 2.0
+  #   iterations: 10
   - name: "pp2048_g64"
     prompt_length: 2048
     generation_length: 64
     time_between_requests: 2.0
-    iterations: 10
-  - name: "pp2048_g512"
-    prompt_length: 2048
-    generation_length: 512
-    time_between_requests: 2.0
-    iterations: 10
+    iterations: 5
+  # - name: "pp2048_g512"
+  #   prompt_length: 2048
+  #   generation_length: 512
+  #   time_between_requests: 2.0
+  #   iterations: 10
   - name: "pp4096_g64"
     prompt_length: 4096
     generation_length: 64
     time_between_requests: 2.0
-    iterations: 10
-  - name: "pp4096_g512"
-    prompt_length: 4096
-    generation_length: 512
-    time_between_requests: 2.0
-    iterations: 10
+    iterations: 5
+  # - name: "pp4096_g512"
+  #   prompt_length: 4096
+  #   generation_length: 512
+  #   time_between_requests: 2.0
+  #   iterations: 10
   - name: "pp8192_g64"
     prompt_length: 8192
     generation_length: 64
     time_between_requests: 2.0
-    iterations: 10
-  - name: "pp8192_g512"
-    prompt_length: 8192
-    generation_length: 512
-    time_between_requests: 2.0
-    iterations: 10
+    iterations: 5
+  # - name: "pp8192_g512"
+  #   prompt_length: 8192
+  #   generation_length: 512
+  #   time_between_requests: 2.0
+  #   iterations: 10
   # - name: "pp16384_g64"
   #   prompt_length: 16384
   #   generation_length: 64
diff --git a/TODO.md b/TODO.md
index c07c2220..85577411 100644
--- a/TODO.md
+++ b/TODO.md
@@ -19,6 +19,7 @@
 21. Make two separate things: tensor or pipeline, and ring or ibv.
 22. When downloading for the first time, stuff times out and I think the model never ends up actually loading into memory, or something.
 23. Do we need cache_limit? We went back and forth on that a lot because we thought it might be causing issues. One problem is it sets it relative to model size. So if you have multiple models loaded in it will take the most recent model size for the cache_limit. This is problematic if you launch DeepSeek -> Llama for example.
+24. Task cancellation. When an API HTTP request is cancelled, it should cancel the corresponding task.
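TODO item 24 describes a standard asyncio pattern: run inference as a child task so that, when the HTTP layer cancels the handler coroutine on client disconnect, the cancellation is forwarded instead of the task leaking. A minimal sketch with hypothetical names (`run_inference` and `handle_chat_request` are stand-ins, not exo's actual API):

```python
import asyncio

async def run_inference(prompt: str) -> str:
  # Stand-in for the real inference call.
  await asyncio.sleep(10)
  return f"response to {prompt!r}"

async def handle_chat_request(prompt: str) -> str:
  task = asyncio.create_task(run_inference(prompt))
  try:
    return await task
  except asyncio.CancelledError:
    # Awaiting a task does not cancel it when the awaiter is cancelled;
    # forward the cancellation explicitly, then re-raise.
    task.cancel()
    raise
```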
 Potential refactors:
diff --git a/pyproject.toml b/pyproject.toml
index 5a7f8fa9..cd617aee 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -36,6 +36,7 @@ dependencies = [
   "bidict>=0.23.1",
   "mlx>=0.29.3",
   "mlx-lm>=0.28.3",
+  "tiktoken>=0.12.0", # required for kimi k2 tokenizer
 ]
 
 [project.scripts]
diff --git a/src/exo/engines/mlx/auto_parallel.py b/src/exo/engines/mlx/auto_parallel.py
index 3223f86f..452b53c8 100644
--- a/src/exo/engines/mlx/auto_parallel.py
+++ b/src/exo/engines/mlx/auto_parallel.py
@@ -3,7 +3,7 @@ from functools import partial
 from inspect import signature
 from typing import TYPE_CHECKING, Callable, Protocol, cast, override
 
-from mlx_lm.models.cache import KVCache
+from mlx_lm.models.cache import KVCache, RotatingKVCache
 from mlx_lm.models.deepseek_v3 import DeepseekV3MLP
 from mlx_lm.models.deepseek_v3 import Model as DeepseekV3Model
 from mlx_lm.models.llama import Model as LlamaModel
@@ -92,7 +92,7 @@ class PipelineLastLayer(CustomMlxLayer):
 
     cache = self.original_layer_signature.bind_partial(x, *args, **kwargs).arguments.get("cache", None)
 
-    assert cache is None or isinstance(cache, KVCache)
+    assert cache is None or isinstance(cache, (KVCache, RotatingKVCache))
 
     output: mx.array = self.original_layer(x, *args, **kwargs)
diff --git a/src/exo/engines/mlx/utils_mlx.py b/src/exo/engines/mlx/utils_mlx.py
index e8e6391b..9e92e723 100644
--- a/src/exo/engines/mlx/utils_mlx.py
+++ b/src/exo/engines/mlx/utils_mlx.py
@@ -2,7 +2,7 @@ import os
 import resource
 from typing import Any, Callable, cast
 
-from mlx_lm.models.cache import KVCache
+from mlx_lm.models.cache import KVCache, RotatingKVCache
 from mlx_lm.sample_utils import make_sampler
 from mlx_lm.tokenizer_utils import TokenizerWrapper
@@ -254,9 +254,14 @@ class NullKVCache(KVCache):
 def make_kv_cache(
   model: Model,
   max_kv_size: int | None = None,
-) -> list[KVCache]:
+) -> list[KVCache | RotatingKVCache]:
   assert hasattr(model, "layers")
-  return [KVCache() for _ in model.layers]
+  if max_kv_size is None:
+    logger.info("Using default KV cache")
+    return [KVCache() for _ in model.layers]
+  else:
+    logger.info(f"Using rotating KV cache with {max_kv_size=}")
+    return [RotatingKVCache(max_size=max_kv_size) for _ in model.layers]
 
 
 def mlx_force_oom(size: int = 40000) -> None:
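The `make_kv_cache` change is the functional core of the RotatingKVCache support: the unbounded `KVCache` stays the default, and passing a cap opts into a fixed-size sliding window per layer. A sketch of the selection in isolation, assuming mlx-lm is installed and using the same `RotatingKVCache(max_size=...)` call as the patch (the function name and layer count here are illustrative):

```python
from mlx_lm.models.cache import KVCache, RotatingKVCache

def make_layer_caches(
  n_layers: int, max_kv_size: int | None = None
) -> list[KVCache | RotatingKVCache]:
  # No cap: the cache grows with the sequence. With a cap, RotatingKVCache
  # keeps at most max_kv_size tokens per layer, evicting the oldest.
  if max_kv_size is None:
    return [KVCache() for _ in range(n_layers)]
  return [RotatingKVCache(max_size=max_kv_size) for _ in range(n_layers)]

caches = make_layer_caches(n_layers=61, max_kv_size=4096)
```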
diff --git a/src/exo/master/api.py b/src/exo/master/api.py
index b52dbe29..22074064 100644
--- a/src/exo/master/api.py
+++ b/src/exo/master/api.py
@@ -215,7 +215,7 @@ class API:
       while not finished:
         # TODO: how long should this timeout be?
         chunk = await asyncio.wait_for(
-          self._chat_completion_queues[command_id].get(), timeout=60
+          self._chat_completion_queues[command_id].get(), timeout=600
         )
         assert isinstance(chunk, TokenChunk)
         chunk_response: ChatCompletionResponse = chunk_to_response(
diff --git a/src/exo/shared/models/model_cards.py b/src/exo/shared/models/model_cards.py
index 8fc85c48..12051b3b 100644
--- a/src/exo/shared/models/model_cards.py
+++ b/src/exo/shared/models/model_cards.py
@@ -93,6 +93,7 @@ MODEL_CARDS: dict[str, ModelCard] = {
       n_layers=61,
     ),
   ),
+  # kimi k2
   "kimi-k2-instruct-4bit": ModelCard(
     short_id="kimi-k2-instruct-4bit",
     model_id="mlx-community/Kimi-K2-Instruct-4bit",
@@ -106,6 +107,19 @@
       n_layers=61,
     ),
   ),
+  "kimi-k2-thinking": ModelCard(
+    short_id="kimi-k2-thinking",
+    model_id="mlx-community/Kimi-K2-Thinking",
+    name="Kimi K2 Thinking",
+    description="""Kimi K2 Thinking is the latest and most capable version of Moonshot AI's open-source thinking model.""",
+    tags=[],
+    metadata=ModelMetadata(
+      model_id=ModelId("mlx-community/Kimi-K2-Thinking"),
+      pretty_name="Kimi K2 Thinking",
+      storage_size=Memory.from_bytes(577597603840),
+      n_layers=61,
+    ),
+  ),
   # llama-3.1
   "llama-3.1-8b": ModelCard(
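The timeout bump (60 s to 600 s) bounds each `queue.get()` in the chat-completion streaming loop; long prefills (e.g. the pp8192 stage in the benchmark config above) can plausibly stall a single chunk for more than a minute. The shape of that consumer pattern, sketched with hypothetical names (`drain_chunks` and the `None` sentinel are not exo's actual API):

```python
import asyncio
from typing import AsyncIterator

async def drain_chunks(
  queue: asyncio.Queue, timeout: float = 600.0
) -> AsyncIterator[object]:
  # Bound the wait for each individual chunk so a stalled producer
  # surfaces as asyncio.TimeoutError instead of hanging the response.
  while True:
    chunk = await asyncio.wait_for(queue.get(), timeout=timeout)
    if chunk is None:  # hypothetical end-of-stream sentinel
      return
    yield chunk
```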
"https://files.pythonhosted.org/packages/00/61/441588ee21e6b5cdf59d6870f86beb9789e532ee9718c251b391b70c68d6/tiktoken-0.12.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:775c2c55de2310cc1bc9a3ad8826761cbdc87770e586fd7b6da7d4589e13dab3", size = 1050802 }, + { url = "https://files.pythonhosted.org/packages/1f/05/dcf94486d5c5c8d34496abe271ac76c5b785507c8eae71b3708f1ad9b45a/tiktoken-0.12.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:a01b12f69052fbe4b080a2cfb867c4de12c704b56178edf1d1d7b273561db160", size = 993995 }, + { url = "https://files.pythonhosted.org/packages/a0/70/5163fe5359b943f8db9946b62f19be2305de8c3d78a16f629d4165e2f40e/tiktoken-0.12.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:01d99484dc93b129cd0964f9d34eee953f2737301f18b3c7257bf368d7615baa", size = 1128948 }, + { url = "https://files.pythonhosted.org/packages/0c/da/c028aa0babf77315e1cef357d4d768800c5f8a6de04d0eac0f377cb619fa/tiktoken-0.12.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:4a1a4fcd021f022bfc81904a911d3df0f6543b9e7627b51411da75ff2fe7a1be", size = 1151986 }, + { url = "https://files.pythonhosted.org/packages/a0/5a/886b108b766aa53e295f7216b509be95eb7d60b166049ce2c58416b25f2a/tiktoken-0.12.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:981a81e39812d57031efdc9ec59fa32b2a5a5524d20d4776574c4b4bd2e9014a", size = 1194222 }, + { url = "https://files.pythonhosted.org/packages/f4/f8/4db272048397636ac7a078d22773dd2795b1becee7bc4922fe6207288d57/tiktoken-0.12.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:9baf52f84a3f42eef3ff4e754a0db79a13a27921b457ca9832cf944c6be4f8f3", size = 1255097 }, + { url = "https://files.pythonhosted.org/packages/ce/76/994fc868f88e016e6d05b0da5ac24582a14c47893f4474c3e9744283f1d5/tiktoken-0.12.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:d5f89ea5680066b68bcb797ae85219c72916c922ef0fcdd3480c7d2315ffff16", size = 1050309 }, + { url = "https://files.pythonhosted.org/packages/f6/b8/57ef1456504c43a849821920d582a738a461b76a047f352f18c0b26c6516/tiktoken-0.12.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:b4e7ed1c6a7a8a60a3230965bdedba8cc58f68926b835e519341413370e0399a", size = 993712 }, + { url = "https://files.pythonhosted.org/packages/72/90/13da56f664286ffbae9dbcfadcc625439142675845baa62715e49b87b68b/tiktoken-0.12.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:fc530a28591a2d74bce821d10b418b26a094bf33839e69042a6e86ddb7a7fb27", size = 1128725 }, + { url = "https://files.pythonhosted.org/packages/05/df/4f80030d44682235bdaecd7346c90f67ae87ec8f3df4a3442cb53834f7e4/tiktoken-0.12.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:06a9f4f49884139013b138920a4c393aa6556b2f8f536345f11819389c703ebb", size = 1151875 }, + { url = "https://files.pythonhosted.org/packages/22/1f/ae535223a8c4ef4c0c1192e3f9b82da660be9eb66b9279e95c99288e9dab/tiktoken-0.12.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:04f0e6a985d95913cabc96a741c5ffec525a2c72e9df086ff17ebe35985c800e", size = 1194451 }, + { url = "https://files.pythonhosted.org/packages/78/a7/f8ead382fce0243cb625c4f266e66c27f65ae65ee9e77f59ea1653b6d730/tiktoken-0.12.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:0ee8f9ae00c41770b5f9b0bb1235474768884ae157de3beb5439ca0fd70f3e25", size = 1253794 }, + { url = "https://files.pythonhosted.org/packages/72/05/3abc1db5d2c9aadc4d2c76fa5640134e475e58d9fbb82b5c535dc0de9b01/tiktoken-0.12.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:a90388128df3b3abeb2bfd1895b0681412a8d7dc644142519e6f0a97c2111646", size = 1050188 }, + { url = 
"https://files.pythonhosted.org/packages/e3/7b/50c2f060412202d6c95f32b20755c7a6273543b125c0985d6fa9465105af/tiktoken-0.12.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:da900aa0ad52247d8794e307d6446bd3cdea8e192769b56276695d34d2c9aa88", size = 993978 }, + { url = "https://files.pythonhosted.org/packages/14/27/bf795595a2b897e271771cd31cb847d479073497344c637966bdf2853da1/tiktoken-0.12.0-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:285ba9d73ea0d6171e7f9407039a290ca77efcdb026be7769dccc01d2c8d7fff", size = 1129271 }, + { url = "https://files.pythonhosted.org/packages/f5/de/9341a6d7a8f1b448573bbf3425fa57669ac58258a667eb48a25dfe916d70/tiktoken-0.12.0-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:d186a5c60c6a0213f04a7a802264083dea1bbde92a2d4c7069e1a56630aef830", size = 1151216 }, + { url = "https://files.pythonhosted.org/packages/75/0d/881866647b8d1be4d67cb24e50d0c26f9f807f994aa1510cb9ba2fe5f612/tiktoken-0.12.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:604831189bd05480f2b885ecd2d1986dc7686f609de48208ebbbddeea071fc0b", size = 1194860 }, + { url = "https://files.pythonhosted.org/packages/b3/1e/b651ec3059474dab649b8d5b69f5c65cd8fcd8918568c1935bd4136c9392/tiktoken-0.12.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:8f317e8530bb3a222547b85a58583238c8f74fd7a7408305f9f63246d1a0958b", size = 1254567 }, + { url = "https://files.pythonhosted.org/packages/ac/a4/72eed53e8976a099539cdd5eb36f241987212c29629d0a52c305173e0a68/tiktoken-0.12.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:c2c714c72bc00a38ca969dae79e8266ddec999c7ceccd603cc4f0d04ccd76365", size = 1050473 }, + { url = "https://files.pythonhosted.org/packages/e6/d7/0110b8f54c008466b19672c615f2168896b83706a6611ba6e47313dbc6e9/tiktoken-0.12.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:cbb9a3ba275165a2cb0f9a83f5d7025afe6b9d0ab01a22b50f0e74fee2ad253e", size = 993855 }, + { url = "https://files.pythonhosted.org/packages/5f/77/4f268c41a3957c418b084dd576ea2fad2e95da0d8e1ab705372892c2ca22/tiktoken-0.12.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:dfdfaa5ffff8993a3af94d1125870b1d27aed7cb97aa7eb8c1cefdbc87dbee63", size = 1129022 }, + { url = "https://files.pythonhosted.org/packages/4e/2b/fc46c90fe5028bd094cd6ee25a7db321cb91d45dc87531e2bdbb26b4867a/tiktoken-0.12.0-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:584c3ad3d0c74f5269906eb8a659c8bfc6144a52895d9261cdaf90a0ae5f4de0", size = 1150736 }, + { url = "https://files.pythonhosted.org/packages/28/c0/3c7a39ff68022ddfd7d93f3337ad90389a342f761c4d71de99a3ccc57857/tiktoken-0.12.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:54c891b416a0e36b8e2045b12b33dd66fb34a4fe7965565f1b482da50da3e86a", size = 1194908 }, + { url = "https://files.pythonhosted.org/packages/ab/0d/c1ad6f4016a3968c048545f5d9b8ffebf577774b2ede3e2e352553b685fe/tiktoken-0.12.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:5edb8743b88d5be814b1a8a8854494719080c28faaa1ccbef02e87354fe71ef0", size = 1253706 }, +] + [[package]] name = "tokenizers" version = "0.22.1"