Compare commits

...

8 Commits

Author SHA1 Message Date
Patrick Devine
00f67e807a Add qwen3.5-next-moe support to MLX runner and models
This change:
  * adds support for qwen3.5-next-moe models (qwen3-next/qwen3.5-next/qwen3-coder) to the MLX runner
  * introduces recurrent cache support and related MLX ops
  * updates pipeline/runner integration and adds tests
2026-02-20 17:25:23 -08:00
Patrick Devine
97323d1c68 consolidate the tokenizer (#14327)
This change adds a new x/tokenizer package which includes:
  * New BPE and SentencePiece tokenizers
  * Removing the dependency on the imagegen tokenizers
  * Fixes to multibyte decoding in the pipeline
  * Various correctness and benchmark tests

Not included in this PR is the WordPiece tokenizer for BERT models which will be
added when we add embedding models. The imagegen tokenizers will also be removed in
a follow-up PR.
2026-02-19 15:55:45 -08:00
natl-set
458dd1b9d9 mlx: try loading library via rpath before searching directories (#14322)
The existing code manually searches directories for libmlxc.* and passes
full paths to dlopen, bypassing the binary's rpath. This means MLX
libraries installed via package managers (e.g., Homebrew) aren't found
even when rpath is correctly set at link time.

This change adds a fallback that tries loading via rpath first (using
just the library name), before falling back to the existing directory
search. This follows standard Unix/macOS conventions and works with any
installation that sets rpath.

Fixes library loading on macOS with Homebrew-installed mlx-c without
requiring OLLAMA_LIBRARY_PATH environment variable.

Co-authored-by: Natl <nat@MacBook-Pro.local>
2026-02-19 10:55:02 -08:00
Bruce MacDonald
9d02d1d767 install: prevent partial download script execution (#14311)
Wrap script in main function so that a truncated partial download doesn't end up executing half a script.
2026-02-18 18:32:45 -08:00
Bruce MacDonald
1a636fb47a cmd: set codex env vars on launch and handle zstd request bodies (#14122)
The Codex runner was not setting OPENAI_BASE_URL or OPENAI_API_KEY, which caused Codex to send requests to api.openai.com instead of the local Ollama server. Setting them on launch mirrors the approach used by the Claude runner.

Codex v0.98.0 sends zstd-compressed request bodies to the /v1/responses endpoint. Add decompression support in ResponsesMiddleware with an 8MB max decompressed size limit to prevent resource exhaustion.
2026-02-18 17:19:36 -08:00
Patrick Devine
0759fface9 Revert "chore: update mlx-c bindings to 0.5.0 (#14303)" (#14316)
This reverts commit f01a9a7859.
2026-02-18 17:01:25 -08:00
Parth Sareen
325b72bc31 cmd/tui: default to single-select for editor integrations (#14302) 2026-02-17 18:17:27 -08:00
Patrick Devine
f01a9a7859 chore: update mlx-c bindings to 0.5.0 (#14303) 2026-02-17 16:48:16 -08:00
38 changed files with 4179 additions and 61 deletions

View File

@@ -6,6 +6,7 @@ import (
"os/exec"
"strings"
"github.com/ollama/ollama/envconfig"
"golang.org/x/mod/semver"
)
@@ -32,6 +33,10 @@ func (c *Codex) Run(model string, args []string) error {
cmd.Stdin = os.Stdin
cmd.Stdout = os.Stdout
cmd.Stderr = os.Stderr
cmd.Env = append(os.Environ(),
"OPENAI_BASE_URL="+envconfig.Host().String()+"/v1/",
"OPENAI_API_KEY=ollama",
)
return cmd.Run()
}

View File

@@ -415,6 +415,12 @@ type multiSelectorModel struct {
cancelled bool
confirmed bool
width int
// multi enables full multi-select editing mode. The zero value (false)
// shows a single-select picker where Enter adds the chosen model to
// the existing list. Tab toggles between modes.
multi bool
singleAdd string // model picked in single mode
}
func newMultiSelectorModel(title string, items []SelectItem, preChecked []string) multiSelectorModel {
@@ -429,13 +435,23 @@ func newMultiSelectorModel(title string, items []SelectItem, preChecked []string
m.itemIndex[item.Name] = i
}
for _, name := range preChecked {
if idx, ok := m.itemIndex[name]; ok {
// Reverse order so preChecked[0] (the current default) ends up last
// in checkOrder, matching the "last checked = default" convention.
for i := len(preChecked) - 1; i >= 0; i-- {
if idx, ok := m.itemIndex[preChecked[i]]; ok {
m.checked[idx] = true
m.checkOrder = append(m.checkOrder, idx)
}
}
// Position cursor on the current default model
if len(preChecked) > 0 {
if idx, ok := m.itemIndex[preChecked[0]]; ok {
m.cursor = idx
m.updateScroll(m.otherStart())
}
}
return m
}
@@ -546,14 +562,25 @@ func (m multiSelectorModel) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
m.cancelled = true
return m, tea.Quit
case tea.KeyTab:
m.multi = !m.multi
case tea.KeyEnter:
if len(m.checkOrder) > 0 {
if !m.multi {
if len(filtered) > 0 && m.cursor < len(filtered) {
m.singleAdd = filtered[m.cursor].Name
m.confirmed = true
return m, tea.Quit
}
} else if len(m.checkOrder) > 0 {
m.confirmed = true
return m, tea.Quit
}
case tea.KeySpace:
m.toggleItem()
if m.multi {
m.toggleItem()
}
case tea.KeyUp:
if m.cursor > 0 {
@@ -592,7 +619,9 @@ func (m multiSelectorModel) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
// On some terminals (e.g. Windows PowerShell), space arrives as
// KeyRunes instead of KeySpace. Intercept it so toggle still works.
if len(msg.Runes) == 1 && msg.Runes[0] == ' ' {
m.toggleItem()
if m.multi {
m.toggleItem()
}
} else {
m.filter += string(msg.Runes)
m.cursor = 0
@@ -604,6 +633,19 @@ func (m multiSelectorModel) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
return m, nil
}
// renderSingleItem draws one row of the single-select view: a "▸"-prefixed
// highlighted line when idx is under the cursor, a plain line otherwise,
// followed by an optional description line.
func (m multiSelectorModel) renderSingleItem(s *strings.Builder, item SelectItem, idx int) {
	line := selectorItemStyle.Render(item.Name)
	if idx == m.cursor {
		line = selectorSelectedItemStyle.Render("▸ " + item.Name)
	}
	s.WriteString(line)
	s.WriteString("\n")
	if desc := item.Description; desc != "" {
		s.WriteString(selectorDescLineStyle.Render(desc))
		s.WriteString("\n")
	}
}
func (m multiSelectorModel) renderMultiItem(s *strings.Builder, item SelectItem, idx int) {
origIdx := m.itemIndex[item.Name]
@@ -615,7 +657,7 @@ func (m multiSelectorModel) renderMultiItem(s *strings.Builder, item SelectItem,
}
suffix := ""
if len(m.checkOrder) > 0 && m.checkOrder[0] == origIdx {
if len(m.checkOrder) > 0 && m.checkOrder[len(m.checkOrder)-1] == origIdx {
suffix = " " + selectorDefaultTagStyle.Render("(default)")
}
@@ -637,6 +679,11 @@ func (m multiSelectorModel) View() string {
return ""
}
renderItem := m.renderSingleItem
if m.multi {
renderItem = m.renderMultiItem
}
var s strings.Builder
s.WriteString(selectorTitleStyle.Render(m.title))
@@ -661,7 +708,7 @@ func (m multiSelectorModel) View() string {
if idx >= len(filtered) {
break
}
m.renderMultiItem(&s, filtered[idx], idx)
renderItem(&s, filtered[idx], idx)
}
if remaining := len(filtered) - m.scrollOffset - displayCount; remaining > 0 {
@@ -684,7 +731,7 @@ func (m multiSelectorModel) View() string {
s.WriteString(sectionHeaderStyle.Render("Recommended"))
s.WriteString("\n")
for _, idx := range recItems {
m.renderMultiItem(&s, filtered[idx], idx)
renderItem(&s, filtered[idx], idx)
}
}
@@ -704,7 +751,7 @@ func (m multiSelectorModel) View() string {
if idx >= len(otherItems) {
break
}
m.renderMultiItem(&s, filtered[otherItems[idx]], otherItems[idx])
renderItem(&s, filtered[otherItems[idx]], otherItems[idx])
}
if remaining := len(otherItems) - m.scrollOffset - displayCount; remaining > 0 {
@@ -716,15 +763,18 @@ func (m multiSelectorModel) View() string {
s.WriteString("\n")
count := m.selectedCount()
if count == 0 {
s.WriteString(selectorDescStyle.Render(" Select at least one model."))
if !m.multi {
s.WriteString(selectorHelpStyle.Render("↑/↓ navigate • enter select • tab add multiple • esc cancel"))
} else {
s.WriteString(selectorDescStyle.Render(fmt.Sprintf(" %d selected - press enter to continue", count)))
count := m.selectedCount()
if count == 0 {
s.WriteString(selectorDescStyle.Render(" Select at least one model."))
} else {
s.WriteString(selectorDescStyle.Render(fmt.Sprintf(" %d selected - press enter to continue", count)))
}
s.WriteString("\n\n")
s.WriteString(selectorHelpStyle.Render("↑/↓ navigate • space toggle • tab select single • enter confirm • esc cancel"))
}
s.WriteString("\n\n")
s.WriteString(selectorHelpStyle.Render("↑/↓ navigate • space toggle • enter confirm • esc cancel"))
result := s.String()
if m.width > 0 {
@@ -747,18 +797,28 @@ func SelectMultiple(title string, items []SelectItem, preChecked []string) ([]st
}
fm := finalModel.(multiSelectorModel)
if fm.cancelled {
if fm.cancelled || !fm.confirmed {
return nil, ErrCancelled
}
if !fm.confirmed {
return nil, ErrCancelled
// Single-add mode: prepend the picked model, keep existing models deduped
if fm.singleAdd != "" {
result := []string{fm.singleAdd}
for _, name := range preChecked {
if name != fm.singleAdd {
result = append(result, name)
}
}
return result, nil
}
var result []string
// Multi-edit mode: last checked is default (first in result)
last := fm.checkOrder[len(fm.checkOrder)-1]
result := []string{fm.items[last].Name}
for _, idx := range fm.checkOrder {
result = append(result, fm.items[idx].Name)
if idx != last {
result = append(result, fm.items[idx].Name)
}
}
return result, nil
}

View File

@@ -539,6 +539,7 @@ func TestMultiView_CursorIndicator(t *testing.T) {
func TestMultiView_CheckedItemShowsX(t *testing.T) {
m := newMultiSelectorModel("Pick:", items("a", "b"), []string{"a"})
m.multi = true
content := m.View()
if !strings.Contains(content, "[x]") {
@@ -550,11 +551,18 @@ func TestMultiView_CheckedItemShowsX(t *testing.T) {
}
func TestMultiView_DefaultTag(t *testing.T) {
m := newMultiSelectorModel("Pick:", items("a", "b"), []string{"a"})
m := newMultiSelectorModel("Pick:", items("a", "b", "c"), []string{"a", "b"})
m.multi = true
content := m.View()
if !strings.Contains(content, "(default)") {
t.Error("first checked item should have (default) tag")
t.Error("should have (default) tag")
}
// preChecked[0] ("a") should be the default (last in checkOrder)
aIdx := strings.Index(content, "a")
defaultIdx := strings.Index(content, "(default)")
if defaultIdx < aIdx {
t.Error("(default) tag should appear after 'a' (the current default)")
}
}
@@ -585,6 +593,7 @@ func TestMultiView_OverflowIndicator(t *testing.T) {
func TestMultiUpdate_SpaceTogglesItem(t *testing.T) {
m := newMultiSelectorModel("Pick:", items("a", "b", "c"), nil)
m.multi = true
m.cursor = 1
// Simulate space delivered as tea.KeySpace
@@ -601,6 +610,7 @@ func TestMultiUpdate_SpaceTogglesItem(t *testing.T) {
func TestMultiUpdate_SpaceRuneTogglesItem(t *testing.T) {
m := newMultiSelectorModel("Pick:", items("a", "b", "c"), nil)
m.multi = true
m.cursor = 1
// Simulate space delivered as tea.KeyRunes (Windows PowerShell behavior)
@@ -618,6 +628,161 @@ func TestMultiUpdate_SpaceRuneTogglesItem(t *testing.T) {
}
}
// --- Single-add mode ---
// TestMulti_StartsInSingleMode verifies the selector's zero value opens the
// picker in single-select mode (multi == false).
func TestMulti_StartsInSingleMode(t *testing.T) {
	sel := newMultiSelectorModel("Pick:", items("a", "b"), nil)
	if sel.multi {
		t.Error("should start in single mode (multi=false)")
	}
}
// TestMulti_SingleModeNoCheckboxes asserts the single-mode view shows the
// cursor indicator but never checkbox glyphs.
func TestMulti_SingleModeNoCheckboxes(t *testing.T) {
	sel := newMultiSelectorModel("Pick:", items("a", "b"), nil)
	view := sel.View()
	if strings.Contains(view, "[x]") || strings.Contains(view, "[ ]") {
		t.Error("single mode should not show checkboxes")
	}
	if !strings.Contains(view, "▸") {
		t.Error("single mode should show cursor indicator")
	}
}
// TestMulti_SingleModeEnterPicksItem: pressing Enter in single mode records
// the item under the cursor in singleAdd and confirms the dialog.
func TestMulti_SingleModeEnterPicksItem(t *testing.T) {
	sel := newMultiSelectorModel("Pick:", items("a", "b", "c"), nil)
	sel.cursor = 1
	next, _ := sel.Update(tea.KeyMsg{Type: tea.KeyEnter})
	sel = next.(multiSelectorModel)
	if !sel.confirmed {
		t.Error("should set confirmed")
	}
	if sel.singleAdd != "b" {
		t.Errorf("enter in single mode should pick cursor item, got %q", sel.singleAdd)
	}
}
// TestMulti_SingleModeSpaceIsNoop: space must not toggle checkboxes while in
// single-select mode.
func TestMulti_SingleModeSpaceIsNoop(t *testing.T) {
	sel := newMultiSelectorModel("Pick:", items("a", "b"), nil)
	sel.cursor = 0
	next, _ := sel.Update(tea.KeyMsg{Type: tea.KeySpace})
	sel = next.(multiSelectorModel)
	if len(sel.checked) != 0 {
		t.Error("space in single mode should not toggle items")
	}
}
// TestMulti_SingleModeSpaceRuneIsNoop: a space delivered as a rune message
// (Windows PowerShell behavior) must neither toggle an item nor leak into
// the filter string while in single mode.
func TestMulti_SingleModeSpaceRuneIsNoop(t *testing.T) {
	sel := newMultiSelectorModel("Pick:", items("a", "b"), nil)
	sel.cursor = 0
	next, _ := sel.Update(tea.KeyMsg{Type: tea.KeyRunes, Runes: []rune{' '}})
	sel = next.(multiSelectorModel)
	if len(sel.checked) != 0 {
		t.Error("space rune in single mode should not toggle items")
	}
	if sel.filter != "" {
		t.Error("space rune in single mode should not add to filter")
	}
}
// TestMulti_TabTogglesMode: Tab flips the selector between single and multi
// mode, and flips it back on a second press.
func TestMulti_TabTogglesMode(t *testing.T) {
	sel := newMultiSelectorModel("Pick:", items("a", "b"), nil)
	if sel.multi {
		t.Fatal("should start in single mode")
	}
	next, _ := sel.Update(tea.KeyMsg{Type: tea.KeyTab})
	sel = next.(multiSelectorModel)
	if !sel.multi {
		t.Error("tab should switch to multi mode")
	}
	next, _ = sel.Update(tea.KeyMsg{Type: tea.KeyTab})
	sel = next.(multiSelectorModel)
	if sel.multi {
		t.Error("tab should switch back to single mode")
	}
}
// TestMulti_SingleModeHelpText: the single-mode footer advertises Tab as the
// way into multi-select.
func TestMulti_SingleModeHelpText(t *testing.T) {
	sel := newMultiSelectorModel("Pick:", items("a"), nil)
	view := sel.View()
	if !strings.Contains(view, "tab add multiple") {
		t.Error("single mode should show 'tab add multiple' in help")
	}
}
// TestMulti_MultiModeHelpText: the multi-mode footer advertises Tab as the
// way back to single-select.
func TestMulti_MultiModeHelpText(t *testing.T) {
	sel := newMultiSelectorModel("Pick:", items("a"), nil)
	sel.multi = true
	view := sel.View()
	if !strings.Contains(view, "tab select single") {
		t.Error("multi mode should show 'tab select single' in help")
	}
}
// --- preChecked initialization order ---
// TestMulti_PreCheckedDefaultIsLast: preChecked[0] is the current default
// and must land at the end of checkOrder, matching the
// "last checked = default" convention.
func TestMulti_PreCheckedDefaultIsLast(t *testing.T) {
	sel := newMultiSelectorModel("Pick:", items("a", "b", "c"), []string{"a", "b", "c"})
	if got := len(sel.checkOrder); got != 3 {
		t.Fatalf("expected 3 in checkOrder, got %d", got)
	}
	last := sel.checkOrder[len(sel.checkOrder)-1]
	if name := sel.items[last].Name; name != "a" {
		t.Errorf("preChecked[0] should be last in checkOrder, got %q", name)
	}
}
// TestMulti_CursorOnDefaultModel: the cursor starts positioned on the current
// default model, i.e. preChecked[0].
func TestMulti_CursorOnDefaultModel(t *testing.T) {
	// preChecked[0] ("b") is the default; cursor should start on it
	sel := newMultiSelectorModel("Pick:", items("a", "b", "c"), []string{"b", "c"})
	if got := sel.cursor; got != 1 {
		t.Errorf("cursor should be on preChecked[0] ('b') at index 1, got %d", got)
	}
}
// --- Multi-mode last-checked is default ---
// TestMulti_LastCheckedIsDefault: in multi mode the most recently toggled
// item ends up last in checkOrder and is the only one rendered with the
// (default) tag.
func TestMulti_LastCheckedIsDefault(t *testing.T) {
	sel := newMultiSelectorModel("Pick:", items("alpha", "beta", "gamma"), nil)
	sel.multi = true
	// Check "alpha" then "gamma"
	sel.cursor = 0
	sel.toggleItem()
	sel.cursor = 2
	sel.toggleItem()
	// Last checked ("gamma") should be at the end of checkOrder
	last := sel.checkOrder[len(sel.checkOrder)-1]
	if name := sel.items[last].Name; name != "gamma" {
		t.Errorf("last checked should be 'gamma', got %q", name)
	}
	// The (default) tag renders based on checkOrder[len-1]
	view := sel.View()
	if !strings.Contains(view, "(default)") {
		t.Fatal("should show (default) tag")
	}
	// "alpha" line should NOT have the default tag
	for _, line := range strings.Split(view, "\n") {
		if strings.Contains(line, "alpha") && strings.Contains(line, "(default)") {
			t.Error("'alpha' (first checked) should not have (default) tag")
		}
	}
}
// Key message helpers for testing
// NOTE(review): keyType aliases int; presumably this mirrors the underlying
// key-type enum for test helpers, but the helpers themselves are not visible
// here — confirm against the rest of the file.
type keyType = int

View File

@@ -429,8 +429,24 @@ func (m model) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
}
if m.multiModalSelector.confirmed {
var selected []string
for _, idx := range m.multiModalSelector.checkOrder {
selected = append(selected, m.multiModalSelector.items[idx].Name)
if m.multiModalSelector.singleAdd != "" {
// Single-add mode: prepend picked model, keep existing deduped
selected = []string{m.multiModalSelector.singleAdd}
for _, name := range config.IntegrationModels(m.items[m.cursor].integration) {
if name != m.multiModalSelector.singleAdd {
selected = append(selected, name)
}
}
} else {
// Last checked is default (first in result)
co := m.multiModalSelector.checkOrder
last := co[len(co)-1]
selected = []string{m.multiModalSelector.items[last].Name}
for _, idx := range co {
if idx != last {
selected = append(selected, m.multiModalSelector.items[idx].Name)
}
}
}
if len(selected) > 0 {
m.changeModels = selected

1
go.mod
View File

@@ -26,6 +26,7 @@ require (
github.com/d4l3k/go-bfloat16 v0.0.0-20211005043715-690c3bdd05f1
github.com/dlclark/regexp2 v1.11.4
github.com/emirpasic/gods/v2 v2.0.0-alpha
github.com/klauspost/compress v1.18.3
github.com/mattn/go-runewidth v0.0.16
github.com/nlpodyssey/gopickle v0.3.0
github.com/pdevine/tensor v0.0.0-20240510204454-f88f4562727c

4
go.sum
View File

@@ -122,7 +122,6 @@ github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaS
github.com/golang/protobuf v1.5.2/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY=
github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek=
github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps=
github.com/golang/snappy v0.0.3 h1:fHPg5GQYlCeLIPB9BZqMVR5nR9A+IM5zcgeTdjMYmLA=
github.com/golang/snappy v0.0.3/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
github.com/google/flatbuffers v2.0.0+incompatible/go.mod h1:1AeVuKshWv4vARoZatz6mlQ0JxURH0Kv5+zNeJKJCa8=
github.com/google/flatbuffers v24.3.25+incompatible h1:CX395cjN9Kke9mmalRoL3d81AtFUxJM+yDthflgJGkI=
@@ -150,8 +149,9 @@ github.com/jung-kurt/gofpdf v1.0.0/go.mod h1:7Id9E/uU8ce6rXgefFLlgrJj/GYY22cpxn+
github.com/jung-kurt/gofpdf v1.0.3-0.20190309125859-24315acbbda5/go.mod h1:7Id9E/uU8ce6rXgefFLlgrJj/GYY22cpxn+r32jIOes=
github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8=
github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
github.com/klauspost/compress v1.13.1 h1:wXr2uRxZTJXHLly6qhJabee5JqIhTRoLBhDOA74hDEQ=
github.com/klauspost/compress v1.13.1/go.mod h1:8dP1Hq4DHOhN9w426knH3Rhby4rFm6D8eO+e+Dq5Gzg=
github.com/klauspost/compress v1.18.3 h1:9PJRvfbmTabkOX8moIpXPbMMbYN60bWImDDU7L+/6zw=
github.com/klauspost/compress v1.18.3/go.mod h1:R0h/fSBs8DE4ENlcrlib3PsXS61voFxhIs2DeRhCvJ4=
github.com/klauspost/cpuid/v2 v2.0.9/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg=
github.com/klauspost/cpuid/v2 v2.2.7 h1:ZWSB3igEs+d0qvnxR/ZBzXVmxkgt8DdzP6m9pfuVLDM=
github.com/klauspost/cpuid/v2 v2.2.7/go.mod h1:Lcz8mBdAVJIBVzewtcLocK12l3Y+JytZYpaMropDUws=

View File

@@ -11,6 +11,7 @@ import (
"time"
"github.com/gin-gonic/gin"
"github.com/klauspost/compress/zstd"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/openai"
@@ -496,6 +497,17 @@ func (w *ResponsesWriter) Write(data []byte) (int, error) {
func ResponsesMiddleware() gin.HandlerFunc {
return func(c *gin.Context) {
if c.GetHeader("Content-Encoding") == "zstd" {
reader, err := zstd.NewReader(c.Request.Body, zstd.WithDecoderMaxMemory(8<<20))
if err != nil {
c.AbortWithStatusJSON(http.StatusBadRequest, openai.NewError(http.StatusBadRequest, "failed to decompress zstd body"))
return
}
defer reader.Close()
c.Request.Body = io.NopCloser(reader)
c.Request.Header.Del("Content-Encoding")
}
var req openai.ResponsesRequest
if err := c.ShouldBindJSON(&req); err != nil {
c.AbortWithStatusJSON(http.StatusBadRequest, openai.NewError(http.StatusBadRequest, err.Error()))

View File

@@ -14,6 +14,7 @@ import (
"github.com/gin-gonic/gin"
"github.com/google/go-cmp/cmp"
"github.com/klauspost/compress/zstd"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/openai"
@@ -1238,3 +1239,102 @@ func TestImageEditsMiddleware(t *testing.T) {
})
}
}
// zstdCompress returns data compressed with zstd, failing the test
// immediately on any writer error.
func zstdCompress(t *testing.T, data []byte) []byte {
	t.Helper()
	var out bytes.Buffer
	enc, err := zstd.NewWriter(&out)
	if err != nil {
		t.Fatal(err)
	}
	if _, err = enc.Write(data); err != nil {
		t.Fatal(err)
	}
	if err = enc.Close(); err != nil {
		t.Fatal(err)
	}
	return out.Bytes()
}
// TestResponsesMiddlewareZstd exercises ResponsesMiddleware against plain
// JSON bodies, zstd-compressed bodies, and a zstd body whose decompressed
// size exceeds the middleware's limit (which must be rejected with 400).
func TestResponsesMiddlewareZstd(t *testing.T) {
tests := []struct {
name string
body string
useZstd bool
// oversized builds a 9 MiB payload to exceed the decoder's memory cap.
oversized bool
wantCode int
wantModel string
wantMessage string
}{
{
name: "plain JSON",
body: `{"model": "test-model", "input": "Hello"}`,
wantCode: http.StatusOK,
wantModel: "test-model",
wantMessage: "Hello",
},
{
name: "zstd compressed",
body: `{"model": "test-model", "input": "Hello"}`,
useZstd: true,
wantCode: http.StatusOK,
wantModel: "test-model",
wantMessage: "Hello",
},
{
name: "zstd over max decompressed size",
oversized: true,
useZstd: true,
wantCode: http.StatusBadRequest,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
var capturedRequest *api.ChatRequest
gin.SetMode(gin.TestMode)
router := gin.New()
// captureRequestMiddleware records the translated ChatRequest so the
// test can inspect what the middleware produced.
router.Use(ResponsesMiddleware(), captureRequestMiddleware(&capturedRequest))
router.Handle(http.MethodPost, "/v1/responses", func(c *gin.Context) {
c.Status(http.StatusOK)
})
// Build the request body per test case: oversized zstd, normal zstd,
// or uncompressed JSON.
var bodyReader io.Reader
if tt.oversized {
bodyReader = bytes.NewReader(zstdCompress(t, bytes.Repeat([]byte("A"), 9<<20)))
} else if tt.useZstd {
bodyReader = bytes.NewReader(zstdCompress(t, []byte(tt.body)))
} else {
bodyReader = strings.NewReader(tt.body)
}
req, _ := http.NewRequest(http.MethodPost, "/v1/responses", bodyReader)
req.Header.Set("Content-Type", "application/json")
if tt.useZstd || tt.oversized {
req.Header.Set("Content-Encoding", "zstd")
}
resp := httptest.NewRecorder()
router.ServeHTTP(resp, req)
if resp.Code != tt.wantCode {
t.Fatalf("expected status %d, got %d: %s", tt.wantCode, resp.Code, resp.Body.String())
}
// Error cases stop here; only successful requests have a captured body.
if tt.wantCode != http.StatusOK {
return
}
if capturedRequest == nil {
t.Fatal("expected captured request, got nil")
}
if capturedRequest.Model != tt.wantModel {
t.Fatalf("expected model %q, got %q", tt.wantModel, capturedRequest.Model)
}
if len(capturedRequest.Messages) != 1 || capturedRequest.Messages[0].Content != tt.wantMessage {
t.Fatalf("expected single user message %q, got %+v", tt.wantMessage, capturedRequest.Messages)
}
})
}
}

View File

@@ -2,6 +2,10 @@
# This script installs Ollama on Linux and macOS.
# It detects the current operating system architecture and installs the appropriate version of Ollama.
# Wrap script in main function so that a truncated partial download doesn't end
# up executing half a script.
main() {
set -eu
red="$( (/usr/bin/tput bold || :; /usr/bin/tput setaf 1 || :) 2>&-)"
@@ -446,3 +450,6 @@ fi
status "NVIDIA GPU ready."
install_success
}
main

View File

@@ -4,13 +4,19 @@ package cache
import (
"log/slog"
"os"
"github.com/ollama/ollama/x/mlxrunner/mlx"
)
func kvCacheGrowDebugEnabled() bool {
return os.Getenv("OLLAMA_MLX_DEBUG_CACHE_GROW") != ""
}
type Cache interface {
Update(keys, values *mlx.Array) (newKeys, newValues *mlx.Array)
State() (keys, values *mlx.Array)
Materialize() []*mlx.Array
Trim(int) int
Clone() Cache
Offset() int
@@ -48,6 +54,9 @@ func (c *KVCache) Update(keys, values *mlx.Array) (*mlx.Array, *mlx.Array) {
} else {
c.keys, c.values = newKeys, newValues
}
if kvCacheGrowDebugEnabled() {
slog.Info("KVCache grow", "prev", prev, "new_capacity", c.keys.Dim(2), "step", c.step)
}
}
c.offset += L
@@ -66,6 +75,17 @@ func (c *KVCache) State() (*mlx.Array, *mlx.Array) {
c.values.Slice(mlx.Slice(), mlx.Slice(), mlx.Slice(0, c.offset), mlx.Slice())
}
// Materialize returns the cache's live backing arrays (keys first, then
// values), skipping any that are nil or invalid.
func (c *KVCache) Materialize() []*mlx.Array {
	var out []*mlx.Array
	for _, a := range []*mlx.Array{c.keys, c.values} {
		if a != nil && a.Valid() {
			out = append(out, a)
		}
	}
	return out
}
func (c *KVCache) Trim(n int) int {
n = min(c.offset, n)
c.offset -= n

17
x/mlxrunner/cache/cache_test.go vendored Normal file
View File

@@ -0,0 +1,17 @@
//go:build mlx
package cache
import "testing"
// TestKVCacheGrowDebugEnabled checks that an empty env value disables the
// flag and a non-empty value enables it.
func TestKVCacheGrowDebugEnabled(t *testing.T) {
	t.Setenv("OLLAMA_MLX_DEBUG_CACHE_GROW", "")
	if got := kvCacheGrowDebugEnabled(); got {
		t.Fatal("kvCacheGrowDebugEnabled() = true, want false")
	}
	t.Setenv("OLLAMA_MLX_DEBUG_CACHE_GROW", "1")
	if got := kvCacheGrowDebugEnabled(); !got {
		t.Fatal("kvCacheGrowDebugEnabled() = false, want true")
	}
}

162
x/mlxrunner/cache/recurrent.go vendored Normal file
View File

@@ -0,0 +1,162 @@
//go:build mlx
package cache
import "github.com/ollama/ollama/x/mlxrunner/mlx"
// RecurrentCache stores state for linear-recurrent layers.
//
// Conv state shape: [B, convTail, convDim]
// Delta state shape: [B, numVHeads, headVDim, headKDim]
type RecurrentCache struct {
// convState is the rolling convolution tail; allocated lazily (see ensure).
convState *mlx.Array
// deltaState is the recurrent per-head state; allocated lazily (see ensure).
deltaState *mlx.Array
// offset counts tokens consumed so far (Advance/Trim adjust it).
offset int
// Static dimensions, fixed at construction by NewRecurrentCache.
convTail int
convDim int
numVHeads int
headVDim int
headKDim int
}
// setStateMaterialized replaces *dst with a detached, evaluated snapshot of v.
// Nil, invalid, or already-installed values are ignored. The free/release
// ordering below is deliberate; do not reorder.
func (c *RecurrentCache) setStateMaterialized(dst **mlx.Array, v *mlx.Array) {
if v == nil || !v.Valid() {
return
}
if *dst == v {
return
}
// Break dependency chains so recurrent state does not retain the full
// per-token compute graph over time.
snap := mlx.Snapshot(v)
mlx.Eval(snap)
old := *dst
*dst = snap
// Release previous cached state root, then recursively free the transient
// incoming graph root now that a detached snapshot is retained in cache.
if old != nil && old != snap {
mlx.Release(old)
}
if v != snap && v != old {
mlx.Free(v)
}
}
// setStateRaw installs v as the new state without snapshotting, releasing the
// previous array (if any and distinct). Nil/invalid values are ignored.
// Unlike setStateMaterialized, v is stored as-is, graph references included.
func (c *RecurrentCache) setStateRaw(dst **mlx.Array, v *mlx.Array) {
if v == nil || !v.Valid() {
return
}
old := *dst
*dst = v
if old != nil && old != v {
mlx.Release(old)
}
}
// NewRecurrentCache builds an empty recurrent cache with the given static
// dimensions. State arrays are allocated lazily on first access (see ensure).
func NewRecurrentCache(convTail, convDim, numVHeads, headVDim, headKDim int32) *RecurrentCache {
	c := new(RecurrentCache)
	c.convTail = int(convTail)
	c.convDim = int(convDim)
	c.numVHeads = int(numVHeads)
	c.headVDim = int(headVDim)
	c.headKDim = int(headKDim)
	return c
}
// ensure (re)allocates zeroed state arrays whenever the requested batch size
// or dtype differs from the current arrays' shape/dtype, or when they have
// not been allocated yet. A non-positive batch is clamped to 1.
func (c *RecurrentCache) ensure(batch int, dtype mlx.DType) {
if batch <= 0 {
batch = 1
}
// Conv state must match [batch, convTail, convDim] in the requested dtype.
if c.convState == nil || c.convState.DType() != dtype ||
c.convState.Dim(0) != batch || c.convState.Dim(1) != c.convTail || c.convState.Dim(2) != c.convDim {
c.setStateRaw(&c.convState, mlx.Zeros(dtype, batch, c.convTail, c.convDim))
}
// Delta state must match [batch, numVHeads, headVDim, headKDim].
if c.deltaState == nil || c.deltaState.DType() != dtype ||
c.deltaState.Dim(0) != batch || c.deltaState.Dim(1) != c.numVHeads || c.deltaState.Dim(2) != c.headVDim || c.deltaState.Dim(3) != c.headKDim {
c.setStateRaw(&c.deltaState, mlx.Zeros(dtype, batch, c.numVHeads, c.headVDim, c.headKDim))
}
}
// ConvState returns the convolution state, allocating a zeroed array for the
// given batch/dtype if needed.
func (c *RecurrentCache) ConvState(batch int, dtype mlx.DType) *mlx.Array {
c.ensure(batch, dtype)
return c.convState
}
// SetConvState stores v as the new conv state via a detached snapshot
// (see setStateMaterialized).
func (c *RecurrentCache) SetConvState(v *mlx.Array) {
c.setStateMaterialized(&c.convState, v)
}
// DeltaState returns the recurrent delta state, allocating a zeroed array for
// the given batch/dtype if needed.
func (c *RecurrentCache) DeltaState(batch int, dtype mlx.DType) *mlx.Array {
c.ensure(batch, dtype)
return c.deltaState
}
// SetDeltaState stores v as the new delta state via a detached snapshot
// (see setStateMaterialized).
func (c *RecurrentCache) SetDeltaState(v *mlx.Array) {
c.setStateMaterialized(&c.deltaState, v)
}
// Advance records that n more tokens have been consumed.
func (c *RecurrentCache) Advance(n int) {
c.offset += n
}
// Update is a pass-through: recurrent layers keep their state in
// conv/delta arrays, so keys/values are returned unchanged. Present to
// satisfy the Cache interface.
func (c *RecurrentCache) Update(keys, values *mlx.Array) (*mlx.Array, *mlx.Array) {
return keys, values
}
// State returns the conv and delta state arrays, allocating them for batch 1
// if absent. NOTE(review): the float32 here only applies when the state has
// never been allocated; an existing state keeps its dtype — confirm callers
// expect that.
func (c *RecurrentCache) State() (*mlx.Array, *mlx.Array) {
c.ensure(1, mlx.DTypeFloat32)
return c.convState, c.deltaState
}
// Materialize returns the live state arrays (conv first, then delta),
// skipping any that are nil or invalid.
func (c *RecurrentCache) Materialize() []*mlx.Array {
	var out []*mlx.Array
	for _, a := range []*mlx.Array{c.convState, c.deltaState} {
		if a != nil && a.Valid() {
			out = append(out, a)
		}
	}
	return out
}
// Trim rolls the cache back by up to n tokens and returns how many were
// actually trimmed. Because recurrent state cannot be rewound, any non-zero
// trim resets both state arrays to zeros (same shape/dtype as before).
func (c *RecurrentCache) Trim(n int) int {
n = min(c.offset, n)
c.offset -= n
// Recurrent state cannot be reversed cheaply; reset to a clean state when trimming.
if n > 0 {
if c.convState != nil {
c.setStateRaw(&c.convState, mlx.Zeros(c.convState.DType(), c.convState.Dim(0), c.convState.Dim(1), c.convState.Dim(2)))
}
if c.deltaState != nil {
c.setStateRaw(&c.deltaState, mlx.Zeros(c.deltaState.DType(), c.deltaState.Dim(0), c.deltaState.Dim(1), c.deltaState.Dim(2), c.deltaState.Dim(3)))
}
}
return n
}
// Clone returns an independent copy of the cache: scalar fields are copied
// and state arrays (when present) are deep-cloned so the copy can diverge.
func (c *RecurrentCache) Clone() Cache {
	out := &RecurrentCache{
		offset:    c.offset,
		convTail:  c.convTail,
		convDim:   c.convDim,
		numVHeads: c.numVHeads,
		headVDim:  c.headVDim,
		headKDim:  c.headKDim,
	}
	if s := c.convState; s != nil {
		out.convState = s.Clone()
	}
	if s := c.deltaState; s != nil {
		out.deltaState = s.Clone()
	}
	return out
}
// Offset reports the number of tokens consumed so far.
func (c *RecurrentCache) Offset() int { return c.offset }
// Len mirrors Offset: the cache's logical length equals tokens consumed.
func (c *RecurrentCache) Len() int { return c.offset }

View File

@@ -7,4 +7,6 @@ import (
_ "github.com/ollama/ollama/x/models/glm4_moe_lite"
_ "github.com/ollama/ollama/x/models/llama"
_ "github.com/ollama/ollama/x/models/qwen3"
_ "github.com/ollama/ollama/x/models/qwen3_5"
_ "github.com/ollama/ollama/x/models/qwen3_5_moe"
)

View File

@@ -272,3 +272,39 @@ func Free(s ...*Array) (n int) {
return n
}
// Release forcibly frees arrays regardless of reference accounting.
// Use only for arrays that are known to be unreachable by any live model state.
// Duplicates and nil/invalid entries are skipped. Returns the total bytes
// freed. The handle is neutralized (ctx nil, inputs cleared, refs zeroed) so
// later Valid() checks and Free traversals see it as dead.
func Release(s ...*Array) (n int) {
seen := make(map[*Array]bool, len(s))
for _, t := range s {
if t == nil || !t.Valid() || seen[t] {
continue
}
seen[t] = true
// Account size before freeing, while the array is still valid.
n += t.NumBytes()
C.mlx_array_free(t.ctx)
t.ctx.ctx = nil
t.desc.inputs = nil
t.desc.numRefs = 0
}
return n
}
// pinnedNumRefs is a reference-count floor large enough that normal
// decrement traffic can never drive a pinned array's count to zero.
const pinnedNumRefs = 1 << 30
// Pin keeps arrays alive for the process lifetime by setting a very high
// reference count floor. Use for model parameter tensors shared across many
// decode steps, where recursive Free traversals must never reclaim them.
// Duplicates and nil/invalid entries are skipped; already-pinned arrays are
// left untouched.
func Pin(s ...*Array) {
seen := make(map[*Array]bool, len(s))
for _, t := range s {
if t == nil || !t.Valid() || seen[t] {
continue
}
seen[t] = true
if t.desc.numRefs < pinnedNumRefs {
t.desc.numRefs = pinnedNumRefs
}
}
}

View File

@@ -55,6 +55,30 @@ func tryLoadFromDir(dir string) bool {
return false
}
// tryLoadByName attempts to load the library using just its name,
// allowing the system to use rpath, LD_LIBRARY_PATH, or standard search paths.
// Returns true if the library was successfully loaded.
// tryLoadByName attempts to load the library using just its name,
// allowing the system to use rpath, LD_LIBRARY_PATH, or standard search paths.
// Returns true if the library was successfully loaded.
func tryLoadByName() bool {
	name := "libmlxc.dylib"
	if runtime.GOOS == "linux" {
		name = "libmlxc.so"
	}
	cName := C.CString(name)
	defer C.free(unsafe.Pointer(cName))
	var handle C.mlx_dynamic_handle
	if C.mlx_dynamic_load(&handle, cName) != 0 {
		return false
	}
	if C.mlx_dynamic_load_symbols(handle) != 0 {
		// Library loaded but symbols are missing: unload so the caller can
		// fall back to the directory search cleanly.
		C.mlx_dynamic_unload(&handle)
		return false
	}
	return true
}
func init() {
switch runtime.GOOS {
case "darwin":
@@ -73,6 +97,11 @@ func init() {
}
}
// Try loading via rpath/standard library search
if tryLoadByName() {
return
}
// Build search paths: executable directory, then build directories
var searchDirs []string
if exe, err := os.Executable(); err == nil {

View File

// Sigmoid applies the logistic sigmoid element-wise, delegating to the
// method form on Array.
func Sigmoid(a *Array) *Array {
return a.Sigmoid()
}
// Exp returns a new array holding element-wise e^a, evaluated lazily on the
// default stream.
func Exp(a *Array) *Array {
out := New("EXP", a)
C.mlx_exp(&out.ctx, a.ctx, DefaultStream().ctx)
return out
}
// Log returns a new array holding the element-wise natural logarithm of a,
// evaluated lazily on the default stream.
func Log(a *Array) *Array {
out := New("LOG", a)
C.mlx_log(&out.ctx, a.ctx, DefaultStream().ctx)
return out
}
// SoftmaxAxis computes softmax over the given axis. precise is forwarded to
// the MLX op; per MLX semantics it requests higher-precision accumulation.
func SoftmaxAxis(a *Array, axis int, precise bool) *Array {
out := New("SOFTMAX_AXIS", a)
C.mlx_softmax_axis(&out.ctx, a.ctx, C.int(axis), C.bool(precise), DefaultStream().ctx)
return out
}
func ScaledDotProductAttentionCausal(q, k, v *Array, scale float32, causalMask bool) *Array {
mask := New("")
sinks := New("")
@@ -386,6 +404,52 @@ func Collect(v any) []*Array {
return arrays
}
// Snapshot copies an array into a fresh leaf value with no Go-side graph inputs.
// Nil or invalid inputs are returned unchanged. The returned array is a lazy
// copy; callers that need it materialized must Eval it themselves.
func Snapshot(a *Array) *Array {
if a == nil || !a.Valid() {
return a
}
// New with no inputs => leaf node; mlx_copy detaches the value itself.
out := New("SNAPSHOT")
C.mlx_copy(&out.ctx, a.ctx, DefaultStream().ctx)
return out
}
// CollectReachable collects arrays from v and all transitive graph inputs.
// Traversal is an iterative depth-first walk; nil, invalid, and
// already-visited arrays are skipped.
func CollectReachable(v any) []*Array {
	roots := Collect(v)
	if len(roots) == 0 {
		return nil
	}
	visited := make(map[*Array]bool, len(roots))
	result := make([]*Array, 0, len(roots))
	pending := append([]*Array(nil), roots...)
	for len(pending) > 0 {
		a := pending[len(pending)-1]
		pending = pending[:len(pending)-1]
		if a == nil || !a.Valid() || visited[a] {
			continue
		}
		visited[a] = true
		result = append(result, a)
		pending = append(pending, a.desc.inputs...)
	}
	return result
}
// Detach returns a new Array handle that shares the same MLX value but does
// not retain Go-side graph input references.
// Nil or invalid inputs are returned unchanged. Unlike Snapshot, no data is
// copied: mlx_array_set aliases the underlying value.
func Detach(a *Array) *Array {
if a == nil || !a.Valid() {
return a
}
out := New("DETACH")
C.mlx_array_set(&out.ctx, a.ctx)
return out
}
func collect(v reflect.Value, arrays *[]*Array, seen map[uintptr]bool) {
if !v.IsValid() {
return

View File

@@ -8,10 +8,10 @@ import (
"log/slog"
"sync"
"github.com/ollama/ollama/x/imagegen/tokenizer"
"github.com/ollama/ollama/x/mlxrunner/cache"
"github.com/ollama/ollama/x/mlxrunner/mlx"
"github.com/ollama/ollama/x/mlxrunner/model"
"github.com/ollama/ollama/x/tokenizer"
)
// Model is the interface that model implementations must satisfy.

View File

@@ -6,13 +6,43 @@ import (
"bytes"
"errors"
"log/slog"
"os"
"strconv"
"time"
"unicode/utf8"
"github.com/ollama/ollama/x/mlxrunner/cache"
"github.com/ollama/ollama/x/mlxrunner/mlx"
)
func prefillChunkSize(lowMemoryDecode bool) int {
if v := os.Getenv("OLLAMA_MLX_PREFILL_CHUNK"); v != "" {
if n, err := strconv.Atoi(v); err == nil && n > 0 {
return n
}
}
if lowMemoryDecode {
// Recurrent/no-prompt-cache path favors lower peak memory over prefill throughput.
// Keep this conservative to avoid transient prefill spikes and allocator thrash.
return 32
}
return 2 << 10
}
func mlxDebugMemoryEnabled() bool {
return os.Getenv("OLLAMA_MLX_DEBUG_MEMORY") != ""
}
// finalizeRequestCaches runs end-of-request cache cleanup: cached requests
// insert their caches for reuse, uncached requests free them. Exactly one of
// insertCache/freeCaches is invoked, followed by the matching memory log phase.
func finalizeRequestCaches(usePromptCache bool, insertCache func(), freeCaches func(), logMemory func(string, int)) {
	if !usePromptCache {
		freeCaches()
		logMemory("request_done_freed", -1)
		return
	}
	insertCache()
	logMemory("request_done_cached", -1)
}
func (r *Runner) TextGenerationPipeline(request Request) error {
if r.Model == nil {
return errors.New("model not loaded")
@@ -30,7 +60,21 @@ func (r *Runner) TextGenerationPipeline(request Request) error {
inputs := r.Tokenizer.Encode(request.Prompt, true)
caches, tokens := r.FindNearestCache(inputs)
usePromptCache := true
if m, ok := r.Model.(interface{ DisablePromptCache() bool }); ok && m.DisablePromptCache() {
usePromptCache = false
}
lowMemoryDecode := !usePromptCache
prefillChunk := prefillChunkSize(lowMemoryDecode)
var caches []cache.Cache
var tokens []int32
if usePromptCache {
caches, tokens = r.FindNearestCache(inputs)
} else {
tokens = inputs
}
if len(caches) == 0 {
if cacheFactory, ok := r.Model.(interface{ NewCaches() []cache.Cache }); ok {
caches = cacheFactory.NewCaches()
@@ -42,23 +86,54 @@ func (r *Runner) TextGenerationPipeline(request Request) error {
}
}
materializeCaches := func() {
state := make([]*mlx.Array, 0, 2*len(caches))
for _, c := range caches {
state = append(state, c.Materialize()...)
}
if len(state) == 0 {
return
}
mlx.Eval(state...)
}
freeCaches := func() {
state := make([]*mlx.Array, 0, 2*len(caches))
for _, c := range caches {
state = append(state, c.Materialize()...)
}
if len(state) == 0 {
return
}
// Non-prompt-cache requests allocate fresh caches every generation.
// Explicitly free cache roots so graph chains are reclaimed promptly.
mlx.Free(state...)
mlx.ClearCache()
}
debugMemory := mlxDebugMemoryEnabled()
logMemory := func(phase string, token int) {
if !debugMemory {
return
}
if token >= 0 {
slog.Info("MLX memory", "phase", phase, "token", token, "memory", mlx.Memory{})
return
}
slog.Info("MLX memory", "phase", phase, "memory", mlx.Memory{})
}
logMemory("prefill_start", -1)
total, processed := len(tokens), 0
slog.Info("Prompt processing progress", "processed", processed, "total", total)
for total-processed > 1 {
n := min(2<<10, total-processed-1)
n := min(prefillChunk, total-processed-1)
temp := r.Model.Forward(mlx.FromValues(tokens[processed:processed+n], n).ExpandDims(0), caches)
defer mlx.Free(temp)
mlx.Eval(func() []*mlx.Array {
s := make([]*mlx.Array, 2*len(caches))
for i, c := range caches {
s[2*i], s[2*i+1] = c.State()
}
return s
}()...)
materializeCaches()
mlx.Free(temp)
processed += n
slog.Info("Prompt processing progress", "processed", processed, "total", total)
mlx.ClearCache()
}
logMemory("prefill_done", -1)
step := func(token *mlx.Array) (*mlx.Array, *mlx.Array) {
fwd := r.Model.Forward(token.ExpandDims(0), caches)
@@ -70,7 +145,13 @@ func (r *Runner) TextGenerationPipeline(request Request) error {
}
sample, logprobs := step(mlx.FromValues(tokens[processed:], total-processed))
mlx.AsyncEval(sample, logprobs)
if !lowMemoryDecode {
mlx.AsyncEval(sample, logprobs)
} else {
// Materialize cache updates to prevent transform graph growth.
materializeCaches()
}
logMemory("decode_init", -1)
var b bytes.Buffer
@@ -78,12 +159,10 @@ func (r *Runner) TextGenerationPipeline(request Request) error {
final := Response{Done: true, PromptTokens: total, CompletionTokens: request.Options.MaxTokens, DoneReason: 1}
outputs := make([]int32, 0, request.Options.MaxTokens)
for i := range request.Options.MaxTokens {
nextSample, nextLogprobs := step(sample)
mlx.AsyncEval(nextSample, nextLogprobs)
if i == 0 {
slog.Info("Prompt processing progress", "processed", total, "total", total)
mlx.Eval(sample)
logMemory("decode_first_eval", i)
final.PromptTokensDuration = time.Since(now)
now = time.Now()
}
@@ -95,6 +174,7 @@ func (r *Runner) TextGenerationPipeline(request Request) error {
final.Token = int(output)
final.DoneReason = 0
final.CompletionTokens = i
mlx.Free(sample, logprobs)
break
}
@@ -103,18 +183,43 @@ func (r *Runner) TextGenerationPipeline(request Request) error {
Token: int(output),
}
// For recurrent linear-attention models, avoid async prefetch to reduce
// peak memory and clear allocator cache every token.
if lowMemoryDecode {
mlx.Free(sample, logprobs)
if i+1 >= request.Options.MaxTokens {
break
}
mlx.ClearCache()
sample, logprobs = step(mlx.FromValues([]int32{output}, 1))
// Materialize cache updates to avoid unbounded transform chains.
materializeCaches()
if i%32 == 0 {
logMemory("decode_lowmem_step", i)
}
continue
}
nextSample, nextLogprobs := step(sample)
mlx.AsyncEval(nextSample, nextLogprobs)
mlx.Free(sample, logprobs)
if i%256 == 0 {
mlx.ClearCache()
}
if i%64 == 0 {
logMemory("decode_async_step", i)
}
sample, logprobs = nextSample, nextLogprobs
}
mlx.Free(sample, logprobs)
final.CompletionTokensDuration = time.Since(now)
request.Responses <- final
r.InsertCache(append(inputs, outputs...), caches)
finalizeRequestCaches(usePromptCache,
func() { r.InsertCache(append(inputs, outputs...), caches) },
freeCaches,
logMemory,
)
return nil
}
@@ -126,13 +231,5 @@ func (r Runner) Decode(sample int32, b *bytes.Buffer) string {
return ""
}
if text := b.String(); utf8.ValidString(text) {
b.Reset()
return text
} else if b.Len() >= utf8.UTFMax {
b.Reset()
return text
}
return ""
return flushValidUTF8Prefix(b)
}

View File

@@ -0,0 +1,83 @@
//go:build mlx
package mlxrunner
import "testing"
// TestPrefillChunkSize verifies the built-in defaults when the env override
// is cleared: 2048 for the normal path and 32 for low-memory decode.
func TestPrefillChunkSize(t *testing.T) {
	t.Setenv("OLLAMA_MLX_PREFILL_CHUNK", "")
	if got := prefillChunkSize(false); got != 2<<10 {
		t.Fatalf("prefillChunkSize(false) = %d, want %d", got, 2<<10)
	}
	if got := prefillChunkSize(true); got != 32 {
		t.Fatalf("prefillChunkSize(true) = %d, want %d", got, 32)
	}
}
// TestPrefillChunkSizeEnvOverride verifies OLLAMA_MLX_PREFILL_CHUNK takes
// precedence over both the normal and low-memory defaults.
func TestPrefillChunkSizeEnvOverride(t *testing.T) {
	t.Setenv("OLLAMA_MLX_PREFILL_CHUNK", "96")
	if got := prefillChunkSize(false); got != 96 {
		t.Fatalf("prefillChunkSize(false) with env = %d, want %d", got, 96)
	}
	if got := prefillChunkSize(true); got != 96 {
		t.Fatalf("prefillChunkSize(true) with env = %d, want %d", got, 96)
	}
}
// TestMLXDebugMemoryEnabled verifies the debug-memory toggle tracks the
// emptiness of OLLAMA_MLX_DEBUG_MEMORY.
func TestMLXDebugMemoryEnabled(t *testing.T) {
	t.Setenv("OLLAMA_MLX_DEBUG_MEMORY", "")
	if mlxDebugMemoryEnabled() {
		t.Fatal("mlxDebugMemoryEnabled() = true, want false")
	}
	t.Setenv("OLLAMA_MLX_DEBUG_MEMORY", "1")
	if !mlxDebugMemoryEnabled() {
		t.Fatal("mlxDebugMemoryEnabled() = false, want true")
	}
}
// TestFinalizeRequestCachesUsesPromptCachePath verifies that a cached request
// inserts its caches exactly once, frees nothing, and logs the cached phase.
func TestFinalizeRequestCachesUsesPromptCachePath(t *testing.T) {
	insertCalls := 0
	freeCalls := 0
	logPhase := ""
	finalizeRequestCaches(
		true,
		func() { insertCalls++ },
		func() { freeCalls++ },
		func(phase string, _ int) { logPhase = phase },
	)
	if insertCalls != 1 {
		t.Fatalf("insert calls = %d, want 1", insertCalls)
	}
	if freeCalls != 0 {
		t.Fatalf("free calls = %d, want 0", freeCalls)
	}
	if logPhase != "request_done_cached" {
		t.Fatalf("log phase = %q, want %q", logPhase, "request_done_cached")
	}
}
// TestFinalizeRequestCachesUsesFreePath verifies that an uncached request
// frees its caches exactly once, inserts nothing, and logs the freed phase.
func TestFinalizeRequestCachesUsesFreePath(t *testing.T) {
	insertCalls := 0
	freeCalls := 0
	logPhase := ""
	finalizeRequestCaches(
		false,
		func() { insertCalls++ },
		func() { freeCalls++ },
		func(phase string, _ int) { logPhase = phase },
	)
	if insertCalls != 0 {
		t.Fatalf("insert calls = %d, want 0", insertCalls)
	}
	if freeCalls != 1 {
		t.Fatalf("free calls = %d, want 1", freeCalls)
	}
	if logPhase != "request_done_freed" {
		t.Fatalf("log phase = %q, want %q", logPhase, "request_done_freed")
	}
}

View File

@@ -12,12 +12,12 @@ import (
"golang.org/x/sync/errgroup"
"github.com/ollama/ollama/x/imagegen/tokenizer"
"github.com/ollama/ollama/x/mlxrunner/cache"
"github.com/ollama/ollama/x/mlxrunner/mlx"
"github.com/ollama/ollama/x/mlxrunner/model"
"github.com/ollama/ollama/x/mlxrunner/model/base"
"github.com/ollama/ollama/x/mlxrunner/sample"
"github.com/ollama/ollama/x/tokenizer"
)
type Request struct {
@@ -64,6 +64,38 @@ type Runner struct {
CacheEntries map[int32]*CacheEntry
}
// releaseTensorMap removes entries from tensors and releases the backing MLX
// arrays, skipping any array present in keep. Multiple names may alias one
// array; each array is released at most once. It returns how many distinct
// arrays were released and how many bytes mlx.Release reported freed.
func releaseTensorMap(tensors map[string]*mlx.Array, keep map[*mlx.Array]struct{}) (count int, bytes int) {
	if len(tensors) == 0 {
		return 0, 0
	}
	visited := make(map[*mlx.Array]bool, len(tensors))
	victims := make([]*mlx.Array, 0, len(tensors))
	for name, arr := range tensors {
		// Drop dead entries without counting them as released.
		if arr == nil || !arr.Valid() {
			delete(tensors, name)
			continue
		}
		if keep != nil {
			if _, pinned := keep[arr]; pinned {
				continue
			}
		}
		delete(tensors, name)
		if visited[arr] {
			continue
		}
		visited[arr] = true
		victims = append(victims, arr)
	}
	if len(victims) == 0 {
		return 0, 0
	}
	return len(victims), mlx.Release(victims...)
}
func (r *Runner) Load(modelName string) error {
root, err := model.Open(modelName)
if err != nil {
@@ -85,9 +117,29 @@ func (r *Runner) Load(modelName string) error {
// Assign weights to model (model-specific logic)
loadWeights := base.Weights(m)
if err := loadWeights(tensors); err != nil {
if count, bytes := releaseTensorMap(tensors, nil); count > 0 {
slog.Info("Released tensors after load failure", "count", count, "bytes", mlx.PrettyBytes(bytes))
mlx.ClearCache()
}
return err
}
// Pin only model-owned tensor roots. Pinning the full transitive graph can
// retain large load-time intermediates and inflate steady-state memory.
roots := mlx.Collect(m)
mlx.Pin(roots...)
keep := make(map[*mlx.Array]struct{})
for _, arr := range roots {
if arr != nil && arr.Valid() {
keep[arr] = struct{}{}
}
}
if count, bytes := releaseTensorMap(tensors, keep); count > 0 {
slog.Info("Released unused model tensors", "count", count, "bytes", mlx.PrettyBytes(bytes))
mlx.ClearCache()
}
r.Model = m
r.Tokenizer = m.Tokenizer()
return nil

View File

@@ -0,0 +1,47 @@
package mlxrunner
import (
"bytes"
"unicode/utf8"
)
// flushValidUTF8Prefix returns and consumes the longest valid UTF-8 prefix
// currently buffered, leaving any incomplete trailing bytes in place.
func flushValidUTF8Prefix(b *bytes.Buffer) string {
data := b.Bytes()
if len(data) == 0 {
return ""
}
prefix := validUTF8PrefixLen(data)
if prefix == 0 {
return ""
}
text := string(data[:prefix])
b.Next(prefix)
return text
}
func validUTF8PrefixLen(data []byte) int {
i := 0
prefix := 0
for i < len(data) {
r, size := utf8.DecodeRune(data[i:])
if r == utf8.RuneError && size == 1 {
if !utf8.FullRune(data[i:]) {
break
}
// Invalid UTF-8 byte; consume one byte to guarantee forward progress.
i++
prefix = i
continue
}
i += size
prefix = i
}
return prefix
}

View File

@@ -0,0 +1,46 @@
package mlxrunner
import (
"bytes"
"testing"
)
// TestFlushValidUTF8Prefix_PreservesIncompleteRune feeds a 3-byte rune in
// pieces and checks that incomplete trailing bytes stay buffered between
// flushes while completed runes are emitted.
func TestFlushValidUTF8Prefix_PreservesIncompleteRune(t *testing.T) {
	var b bytes.Buffer
	// First two bytes of こ (E3 81 93): nothing complete yet.
	b.Write([]byte{0xE3, 0x81})
	if got := flushValidUTF8Prefix(&b); got != "" {
		t.Fatalf("first flush = %q, want empty", got)
	}
	// Third byte completes こ; trailing 0xE3 starts a new rune.
	b.Write([]byte{0x93, 0xE3})
	if got := flushValidUTF8Prefix(&b); got != "こ" {
		t.Fatalf("second flush = %q, want %q", got, "こ")
	}
	if got := b.Bytes(); !bytes.Equal(got, []byte{0xE3}) {
		t.Fatalf("buffer after second flush = %v, want %v", got, []byte{0xE3})
	}
	// Remaining bytes complete ん (E3 82 93).
	b.Write([]byte{0x82, 0x93})
	if got := flushValidUTF8Prefix(&b); got != "ん" {
		t.Fatalf("third flush = %q, want %q", got, "ん")
	}
	if b.Len() != 0 {
		t.Fatalf("buffer not empty after third flush: %d", b.Len())
	}
}
// TestFlushValidUTF8Prefix_ValidText checks that fully valid text is flushed
// in one call and the buffer is left empty.
func TestFlushValidUTF8Prefix_ValidText(t *testing.T) {
	var b bytes.Buffer
	b.WriteString("hello 世界")
	if got := flushValidUTF8Prefix(&b); got != "hello 世界" {
		t.Fatalf("flush = %q, want %q", got, "hello 世界")
	}
	if b.Len() != 0 {
		t.Fatalf("buffer not empty after flush: %d", b.Len())
	}
}

View File

@@ -8,12 +8,12 @@ import (
"fmt"
"math"
"github.com/ollama/ollama/x/imagegen/tokenizer"
"github.com/ollama/ollama/x/mlxrunner/cache"
"github.com/ollama/ollama/x/mlxrunner/mlx"
"github.com/ollama/ollama/x/mlxrunner/model"
"github.com/ollama/ollama/x/mlxrunner/model/base"
"github.com/ollama/ollama/x/models/nn"
"github.com/ollama/ollama/x/tokenizer"
)
func init() {

View File

@@ -9,12 +9,12 @@ import (
"fmt"
"math"
"github.com/ollama/ollama/x/imagegen/tokenizer"
"github.com/ollama/ollama/x/mlxrunner/cache"
"github.com/ollama/ollama/x/mlxrunner/mlx"
"github.com/ollama/ollama/x/mlxrunner/model"
"github.com/ollama/ollama/x/mlxrunner/model/base"
"github.com/ollama/ollama/x/models/nn"
"github.com/ollama/ollama/x/tokenizer"
)
func init() {

View File

@@ -8,12 +8,12 @@ import (
"fmt"
"math"
"github.com/ollama/ollama/x/imagegen/tokenizer"
"github.com/ollama/ollama/x/mlxrunner/cache"
"github.com/ollama/ollama/x/mlxrunner/mlx"
"github.com/ollama/ollama/x/mlxrunner/model"
"github.com/ollama/ollama/x/mlxrunner/model/base"
"github.com/ollama/ollama/x/models/nn"
"github.com/ollama/ollama/x/tokenizer"
)
func init() {

View File

@@ -8,12 +8,12 @@ import (
"fmt"
"math"
"github.com/ollama/ollama/x/imagegen/tokenizer"
"github.com/ollama/ollama/x/mlxrunner/cache"
"github.com/ollama/ollama/x/mlxrunner/mlx"
"github.com/ollama/ollama/x/mlxrunner/model"
"github.com/ollama/ollama/x/mlxrunner/model/base"
"github.com/ollama/ollama/x/models/nn"
"github.com/ollama/ollama/x/tokenizer"
)
func init() {

1254
x/models/qwen3_5/qwen3_5.go Normal file
View File

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,120 @@
//go:build mlx
package qwen3_5
import (
"testing"
"github.com/ollama/ollama/x/mlxrunner/cache"
)
// TestParseConfigNestedDefaults parses a nested text_config and checks the
// derived/default fields: rope values lifted from rope_parameters, RopeDim
// computed from head_dim * partial_rotary_factor (128 * 0.5 = 64), and the
// MoE defaults full_attention_interval=4 and norm_topk_prob=true.
func TestParseConfigNestedDefaults(t *testing.T) {
	data := []byte(`{
		"model_type": "Qwen3_5MoeForConditionalGeneration",
		"text_config": {
			"hidden_size": 4096,
			"intermediate_size": 14336,
			"num_hidden_layers": 8,
			"num_attention_heads": 32,
			"num_key_value_heads": 8,
			"head_dim": 128,
			"linear_num_value_heads": 64,
			"linear_num_key_heads": 16,
			"linear_key_head_dim": 128,
			"linear_value_head_dim": 128,
			"linear_conv_kernel_dim": 4,
			"num_experts": 16,
			"num_experts_per_tok": 4,
			"moe_intermediate_size": 2048,
			"shared_expert_intermediate_size": 4096,
			"rope_parameters": {
				"rope_theta": 500000,
				"partial_rotary_factor": 0.5
			}
		}
	}`)
	cfg, err := parseConfig(data)
	if err != nil {
		t.Fatalf("parseConfig failed: %v", err)
	}
	if cfg.RopeTheta != 500000 {
		t.Fatalf("rope theta mismatch: got %v", cfg.RopeTheta)
	}
	if cfg.RopeDim != 64 {
		t.Fatalf("rope dim mismatch: got %d want 64", cfg.RopeDim)
	}
	if cfg.FullAttentionInterval != 4 {
		t.Fatalf("full_attention_interval default mismatch: got %d want 4", cfg.FullAttentionInterval)
	}
	if !cfg.NormTopKProb {
		t.Fatalf("norm_topk_prob should default to true for MoE")
	}
}
// TestLayerSelectionHelpers checks the per-layer routing predicates:
// layerIsLinear picks linear attention except every FullAttentionInterval-th
// layer, and layerUsesMoE honors both mlp_only_layers overrides and
// decoder_sparse_step spacing.
func TestLayerSelectionHelpers(t *testing.T) {
	cfg := &Config{
		NumHiddenLayers:       6,
		FullAttentionInterval: 3,
		NumExperts:            8,
		DecoderSparseStep:     2,
		MLPOnlyLayers:         []int32{1},
	}
	if !layerIsLinear(cfg, 0) {
		t.Fatalf("layer 0 should be linear")
	}
	// Interval of 3 makes every third layer full attention.
	if layerIsLinear(cfg, 2) {
		t.Fatalf("layer 2 should be full attention")
	}
	if layerUsesMoE(cfg, 1) {
		t.Fatalf("layer 1 should be forced dense by mlp_only_layers")
	}
	if !layerUsesMoE(cfg, 3) {
		t.Fatalf("layer 3 should use moe with decoder_sparse_step=2")
	}
}
// TestModelRuntimeToggles pins the runner-facing toggles for this
// architecture: prompt caching disabled and compilation disabled.
func TestModelRuntimeToggles(t *testing.T) {
	m := &Model{}
	if !m.DisablePromptCache() {
		t.Fatal("DisablePromptCache() = false, want true")
	}
	if m.EnableCompile() {
		t.Fatal("EnableCompile() = true, want false")
	}
}
// TestNewCachesLayout verifies NewCaches returns one cache per layer, with
// the cache type matching the layer kind: RecurrentCache for linear-attention
// layers and KVCache for full-attention layers.
func TestNewCachesLayout(t *testing.T) {
	m := &Model{
		Config: &Config{
			LinearConvKernelDim: 4,
			LinearNumKeyHeads:   2,
			LinearKeyHeadDim:    8,
			LinearNumValueHeads: 4,
			LinearValueHeadDim:  16,
		},
		Layers: []*Layer{
			{IsLinear: true},
			{IsLinear: false},
			{IsLinear: true},
		},
	}
	caches := m.NewCaches()
	if len(caches) != len(m.Layers) {
		t.Fatalf("len(caches) = %d, want %d", len(caches), len(m.Layers))
	}
	if _, ok := caches[0].(*cache.RecurrentCache); !ok {
		t.Fatalf("cache[0] = %T, want *cache.RecurrentCache", caches[0])
	}
	if _, ok := caches[1].(*cache.KVCache); !ok {
		t.Fatalf("cache[1] = %T, want *cache.KVCache", caches[1])
	}
	if _, ok := caches[2].(*cache.RecurrentCache); !ok {
		t.Fatalf("cache[2] = %T, want *cache.RecurrentCache", caches[2])
	}
}

View File

@@ -0,0 +1,16 @@
//go:build mlx
// Package qwen3_5_moe registers Qwen 3.5 MoE architecture aliases.
package qwen3_5_moe
import (
"github.com/ollama/ollama/x/mlxrunner/model/base"
"github.com/ollama/ollama/x/models/qwen3_5"
)
// init registers the Qwen 3.5 MoE / Qwen3-Next MoE architecture names so the
// runner can resolve them from config.json to the shared qwen3_5 model.
func init() {
	base.Register("Qwen3_5MoeForConditionalGeneration", qwen3_5.NewModel)
	base.Register("Qwen3_5MoeForCausalLM", qwen3_5.NewModel)
	base.Register("Qwen3NextMoeForConditionalGeneration", qwen3_5.NewModel)
	base.Register("Qwen3NextMoeForCausalLM", qwen3_5.NewModel)
}

108
x/tokenizer/tokenizer.go Normal file
View File

@@ -0,0 +1,108 @@
//go:build mlx
// tokenizer.go - BPE and SentencePiece tokenizer for HuggingFace models
//
// Based on standard BPE algorithm (Sennrich et al. 2015) with:
// - GPT-2 byte-level encoding (OpenAI tiktoken)
// - HuggingFace tokenizer.json pretokenizer patterns
// - SentencePiece ▁-style space handling
package tokenizer
import "regexp"
// TokenizerType identifies the tokenization algorithm.
type TokenizerType int

const (
	TokenizerBPE           TokenizerType = iota // GPT-2 style byte-level BPE
	TokenizerSentencePiece                      // SentencePiece with ▁ for spaces
)

// Vocabulary holds the tokenizer vocabulary and merges.
type Vocabulary struct {
	Values  []string         // token ID -> token string
	Reverse map[string]int32 // token string -> token ID
	Merges  map[string]int   // "left right" pair -> merge rank (lower merges first)
	BOS     int32            // beginning-of-sequence token ID
	EOS     []int32          // Multiple EOS tokens supported (e.g., Gemma has <eos> and <end_of_turn>)
	PAD     int32            // Padding token (often <|endoftext|> or <pad>)
	AddBOS  bool             // prepend BOS when encoding
	AddEOS  bool             // append EOS when encoding
	// Precomputed byte token IDs for <0xNN> fallback (256 entries, -1 if not found)
	byteTokens [256]int32
}

// Tokenizer handles BPE and SentencePiece tokenization.
type Tokenizer struct {
	vocab               *Vocabulary
	pretokenizer        *regexp.Regexp
	specialTokens       map[string]int32 // Special tokens for direct lookup
	sortedSpecialTokens []string         // Special tokens sorted by length, longest first
	typ                 TokenizerType    // Algorithm type
}
// byteToRune is the precomputed GPT-2 byte-level encoding table, mapping each
// raw byte value to its encoded rune equivalent.
var byteToRune [256]rune

// init fills byteToRune: printable bytes map to themselves, while control
// bytes (<= 0x20), the DEL..NBSP range (0x7f..0xa0), and the soft hyphen
// (0xad) are shifted to distinct printable code points.
func init() {
	for i := range byteToRune {
		r := rune(i)
		if r == 0x00ad {
			r = 0x0143
		} else if r <= 0x0020 {
			r += 0x0100
		} else if r >= 0x007f && r <= 0x00a0 {
			r += 0x00a2
		}
		byteToRune[i] = r
	}
}
// VocabSize returns the vocabulary size.
func (t *Tokenizer) VocabSize() int {
	return len(t.vocab.Values)
}

// BOS returns the beginning of sequence token ID.
func (t *Tokenizer) BOS() int32 {
	return t.vocab.BOS
}

// EOS returns the first end of sequence token ID (for backwards compatibility),
// or -1 when the vocabulary defines no EOS tokens.
func (t *Tokenizer) EOS() int32 {
	if len(t.vocab.EOS) > 0 {
		return t.vocab.EOS[0]
	}
	return -1
}

// EOSTokens returns all end of sequence token IDs.
func (t *Tokenizer) EOSTokens() []int32 {
	return t.vocab.EOS
}

// PAD returns the padding token ID, or -1 if not set.
func (t *Tokenizer) PAD() int32 {
	return t.vocab.PAD
}
// IsEOS reports whether id is one of the vocabulary's end-of-sequence tokens.
func (t *Tokenizer) IsEOS(id int32) bool {
	for _, candidate := range t.vocab.EOS {
		if candidate == id {
			return true
		}
	}
	return false
}
// GetSpecialToken returns the token ID for a special token string, and
// whether the token exists.
func (t *Tokenizer) GetSpecialToken(name string) (int32, bool) {
	id, ok := t.specialTokens[name]
	return id, ok
}

View File

@@ -0,0 +1,251 @@
//go:build mlx
package tokenizer
import (
"os"
"path/filepath"
"runtime"
"strings"
"testing"
)
// Benchmark sinks: results are stored here so the compiler cannot eliminate
// the benchmarked calls as dead code.
var (
	benchmarkSinkIDs []int32
	benchmarkSinkStr string
	benchmarkSinkTok *Tokenizer
)

// benchmarkWordPieceJSON is a minimal WordPiece tokenizer.json fixture.
const benchmarkWordPieceJSON = `{
	"model": {
		"type": "WordPiece",
		"vocab": {
			"[UNK]": 0,
			"hello": 1,
			"##world": 2,
			"##ly": 3,
			"##hello": 4
		}
	},
	"added_tokens": []
}`

// benchmarkSentencePieceJSON is a minimal SentencePiece-style BPE fixture
// using ▁ (U+2581) for spaces and a <0x0A> byte-fallback token for newlines.
const benchmarkSentencePieceJSON = `{
	"model": {
		"type": "BPE",
		"vocab": {
			"\u2581": 0,
			"h": 1,
			"e": 2,
			"l": 3,
			"o": 4,
			"w": 5,
			"r": 6,
			"d": 7,
			"<0x0A>": 8
		},
		"merges": []
	},
	"decoder": {
		"type": "Sequence",
		"decoders": [
			{
				"type": "Replace",
				"pattern": {
					"String": "\u2581"
				}
			}
		]
	},
	"added_tokens": []
}`
func benchmarkMiniLlamaPath(tb testing.TB) string {
tb.Helper()
_, filename, _, ok := runtime.Caller(0)
if !ok {
tb.Fatal("failed to resolve benchmark file path")
}
return filepath.Join(filepath.Dir(filename), "..", "imagegen", "tokenizer", "testdata", "mini_llama.json")
}
// benchmarkLoadMiniLlama loads the mini llama tokenizer fixture, failing the
// test or benchmark on any error.
func benchmarkLoadMiniLlama(tb testing.TB) *Tokenizer {
	tb.Helper()
	raw := benchmarkLoadMiniLlamaBytes(tb)
	tok, err := LoadFromBytes(raw)
	if err != nil {
		tb.Fatalf("failed to load mini llama tokenizer: %v", err)
	}
	return tok
}
// benchmarkLoadMiniLlamaBytes reads the raw mini llama tokenizer.json bytes,
// failing the test or benchmark on any error.
func benchmarkLoadMiniLlamaBytes(tb testing.TB) []byte {
	tb.Helper()
	raw, err := os.ReadFile(benchmarkMiniLlamaPath(tb))
	if err != nil {
		tb.Fatalf("failed to read mini llama tokenizer: %v", err)
	}
	return raw
}
// benchmarkLoadFromBytes parses a tokenizer from raw JSON bytes, failing the
// test or benchmark on any error.
func benchmarkLoadFromBytes(tb testing.TB, data []byte) *Tokenizer {
	tb.Helper()
	parsed, err := LoadFromBytes(data)
	if err != nil {
		tb.Fatalf("failed to load tokenizer from bytes: %v", err)
	}
	return parsed
}
// BenchmarkTokenizerEncodeBPE measures Encode throughput on the mini llama
// BPE tokenizer across input sizes and a special-token-heavy prompt.
func BenchmarkTokenizerEncodeBPE(b *testing.B) {
	tok := benchmarkLoadMiniLlama(b)
	inputs := []struct {
		name string
		text string
	}{
		{name: "short", text: "Hello, world!"},
		{name: "medium", text: strings.Repeat("The quick brown fox jumps over the lazy dog. ", 16)},
		{name: "long_sequential", text: strings.Repeat("The quick brown fox jumps over the lazy dog. ", 80)},
		{name: "long_parallel", text: strings.Repeat("The quick brown fox jumps over the lazy dog. ", 160)},
		{name: "huge_parallel", text: strings.Repeat("The quick brown fox jumps over the lazy dog. ", 640)},
		{name: "special_tokens", text: "<|begin_of_text|>system\nYou are concise.<|end_of_text|>"},
	}
	for _, input := range inputs {
		b.Run(input.name, func(b *testing.B) {
			b.ReportAllocs()
			b.SetBytes(int64(len(input.text)))
			b.ResetTimer()
			for i := 0; i < b.N; i++ {
				// Sink assignment keeps the call from being optimized away.
				benchmarkSinkIDs = tok.Encode(input.text, false)
			}
		})
	}
}
// BenchmarkTokenizerDecodeBPE measures Decode throughput on the mini llama
// BPE tokenizer; inputs are pre-encoded outside the timed loop.
func BenchmarkTokenizerDecodeBPE(b *testing.B) {
	tok := benchmarkLoadMiniLlama(b)
	inputs := []struct {
		name string
		text string
	}{
		{name: "medium", text: strings.Repeat("The quick brown fox jumps over the lazy dog. ", 16)},
		{name: "long", text: strings.Repeat("The quick brown fox jumps over the lazy dog. ", 160)},
	}
	for _, input := range inputs {
		ids := tok.Encode(input.text, false)
		b.Run(input.name, func(b *testing.B) {
			b.ReportAllocs()
			b.SetBytes(int64(len(input.text)))
			b.ResetTimer()
			for i := 0; i < b.N; i++ {
				benchmarkSinkStr = tok.Decode(ids)
			}
		})
	}
}
// BenchmarkTokenizerLoadFromBytes measures tokenizer construction cost from
// raw JSON, both with and without a supplementary TokenizerConfig.
func BenchmarkTokenizerLoadFromBytes(b *testing.B) {
	data := benchmarkLoadMiniLlamaBytes(b)
	config := &TokenizerConfig{
		TokenizerConfigJSON: []byte(`{
			"bos_token": {"content": "<|begin_of_text|>"},
			"eos_token": {"content": "<|end_of_text|>"},
			"add_bos_token": true
		}`),
		GenerationConfigJSON: []byte(`{"bos_token_id": 128000, "eos_token_id": 128001}`),
	}
	b.Run("without_config", func(b *testing.B) {
		b.ReportAllocs()
		b.SetBytes(int64(len(data)))
		b.ResetTimer()
		for i := 0; i < b.N; i++ {
			tok, err := LoadFromBytes(data)
			if err != nil {
				b.Fatalf("LoadFromBytes failed: %v", err)
			}
			benchmarkSinkTok = tok
		}
	})
	b.Run("with_config", func(b *testing.B) {
		b.ReportAllocs()
		b.SetBytes(int64(len(data)))
		b.ResetTimer()
		for i := 0; i < b.N; i++ {
			tok, err := LoadFromBytesWithConfig(data, config)
			if err != nil {
				b.Fatalf("LoadFromBytesWithConfig failed: %v", err)
			}
			benchmarkSinkTok = tok
		}
	})
}
// BenchmarkTokenizerEncodeWordPiece measures Encode throughput on a minimal
// WordPiece vocabulary.
func BenchmarkTokenizerEncodeWordPiece(b *testing.B) {
	tok := benchmarkLoadFromBytes(b, []byte(benchmarkWordPieceJSON))
	text := strings.Repeat("helloworldly", 16)
	b.ReportAllocs()
	b.SetBytes(int64(len(text)))
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		benchmarkSinkIDs = tok.Encode(text, false)
	}
}
// BenchmarkTokenizerDecodeWordPiece measures Decode throughput on a minimal
// WordPiece vocabulary; encoding happens outside the timed loop.
func BenchmarkTokenizerDecodeWordPiece(b *testing.B) {
	tok := benchmarkLoadFromBytes(b, []byte(benchmarkWordPieceJSON))
	text := strings.Repeat("helloworldly", 16)
	ids := tok.Encode(text, false)
	b.ReportAllocs()
	b.SetBytes(int64(len(text)))
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		benchmarkSinkStr = tok.Decode(ids)
	}
}
// BenchmarkTokenizerEncodeSentencePiece measures Encode throughput on a
// minimal SentencePiece-style vocabulary with byte-fallback newlines.
func BenchmarkTokenizerEncodeSentencePiece(b *testing.B) {
	tok := benchmarkLoadFromBytes(b, []byte(benchmarkSentencePieceJSON))
	text := strings.Repeat("hello world\n", 64)
	b.ReportAllocs()
	b.SetBytes(int64(len(text)))
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		benchmarkSinkIDs = tok.Encode(text, false)
	}
}
// BenchmarkTokenizerDecodeSentencePiece measures Decode throughput on a
// minimal SentencePiece-style vocabulary; encoding happens outside the loop.
func BenchmarkTokenizerDecodeSentencePiece(b *testing.B) {
	tok := benchmarkLoadFromBytes(b, []byte(benchmarkSentencePieceJSON))
	text := strings.Repeat("hello world\n", 64)
	ids := tok.Encode(text, false)
	b.ReportAllocs()
	b.SetBytes(int64(len(text)))
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		benchmarkSinkStr = tok.Decode(ids)
	}
}

View File

@@ -0,0 +1,175 @@
//go:build mlx
package tokenizer
import "container/heap"
type bpeMergeNode struct {
prev int
next int
token string
}
type bpePair struct {
left int
right int
rank int
value string
}
type bpePairHeap []*bpePair
func (h bpePairHeap) Len() int { return len(h) }
func (h bpePairHeap) Less(i, j int) bool {
return h[i].rank < h[j].rank || (h[i].rank == h[j].rank && h[i].left < h[j].left)
}
func (h bpePairHeap) Swap(i, j int) { h[i], h[j] = h[j], h[i] }
func (h *bpePairHeap) Push(x any) {
*h = append(*h, x.(*bpePair))
}
func (h *bpePairHeap) Pop() any {
old := *h
n := len(old)
item := old[n-1]
*h = old[:n-1]
return item
}
// encodeBPEMerge encodes using BPE merge algorithm.
// Uses the heap/linked-list pair merge strategy from tokenizer/bytepairencoding.go:
// merge the lowest-rank valid pair, then only recheck adjacent pairs.
// Tokens that remain absent from the vocabulary fall back to byte tokens via
// appendByteFallback. Results are appended to ids and the slice is returned.
func (t *Tokenizer) encodeBPEMerge(encoded string, ids []int32) []int32 {
	runes := []rune(encoded)
	if len(runes) == 0 {
		return ids
	}
	// Doubly linked list over rune positions; a merged-away node keeps its
	// slot but gets an empty token.
	nodes := make([]bpeMergeNode, len(runes))
	for i := range runes {
		nodes[i] = bpeMergeNode{
			prev:  i - 1,
			next:  i + 1,
			token: string(runes[i]),
		}
	}
	// pairwise builds a merge candidate for two adjacent live nodes, or nil
	// when the pair has no merge rank or the merged token is out of vocabulary.
	pairwise := func(left, right int) *bpePair {
		if left < 0 || right >= len(nodes) {
			return nil
		}
		if nodes[left].token == "" || nodes[right].token == "" {
			return nil
		}
		leftToken, rightToken := nodes[left].token, nodes[right].token
		rank, ok := t.vocab.Merges[leftToken+" "+rightToken]
		if !ok {
			return nil
		}
		value := leftToken + rightToken
		if _, ok := t.vocab.Reverse[value]; !ok {
			return nil
		}
		return &bpePair{
			left:  left,
			right: right,
			rank:  rank,
			value: value,
		}
	}
	pairs := bpePairHeap{}
	heap.Init(&pairs)
	for i := 0; i < len(runes)-1; i++ {
		if pair := pairwise(i, i+1); pair != nil {
			heap.Push(&pairs, pair)
		}
	}
	for pairs.Len() > 0 {
		pair := heap.Pop(&pairs).(*bpePair)
		left, right := nodes[pair.left], nodes[pair.right]
		// Discard stale heap entries: a side already merged away, the nodes
		// are no longer adjacent, or their tokens changed since the push.
		if left.token == "" || right.token == "" {
			continue
		}
		if left.next != pair.right || right.prev != pair.left {
			continue
		}
		if left.token+right.token != pair.value {
			continue
		}
		// Merge right into left and unlink right from the list.
		nodes[pair.left].token = pair.value
		nodes[pair.right].token = ""
		nodes[pair.left].next = right.next
		if right.next < len(nodes) {
			nodes[right.next].prev = pair.left
		}
		// Recheck the two pairs now adjacent to the merged node. Distinct
		// names here replace the original's shadowed `pair :=` declarations,
		// which read the outer pair in their own initializers — correct, but
		// an easy misread.
		if prevPair := pairwise(nodes[pair.left].prev, pair.left); prevPair != nil {
			heap.Push(&pairs, prevPair)
		}
		if nextPair := pairwise(pair.left, nodes[pair.left].next); nextPair != nil {
			heap.Push(&pairs, nextPair)
		}
	}
	for _, node := range nodes {
		if node.token == "" {
			continue
		}
		if id, ok := t.vocab.Reverse[node.token]; ok {
			ids = append(ids, id)
			continue
		}
		// Out-of-vocabulary token: emit byte-fallback tokens instead.
		ids = t.appendByteFallback(ids, node.token)
	}
	return ids
}
// appendByteFallback appends byte-token IDs for an out-of-vocabulary token.
// BPE tokens are first decoded from their byte-level rune encoding; for
// SentencePiece the raw UTF-8 bytes map directly to <0xNN> tokens. Bytes with
// no byte token in the vocabulary are silently dropped.
func (t *Tokenizer) appendByteFallback(ids []int32, token string) []int32 {
	if t.typ != TokenizerBPE {
		// SentencePiece fallback uses the UTF-8 bytes for <0xNN> tokens.
		for _, raw := range []byte(token) {
			if id := t.vocab.byteTokens[raw]; id >= 0 {
				ids = append(ids, id)
			}
		}
		return ids
	}
	for _, r := range token {
		raw, ok := decodeByteLevelRune(r)
		if !ok {
			continue
		}
		if id := t.vocab.byteTokens[raw]; id >= 0 {
			ids = append(ids, id)
		}
	}
	return ids
}
// decodeByteLevelRune maps a GPT-2 byte-level encoded rune back to its raw
// byte, inverting the byteToRune table. It reports false for runes outside
// the byte-level alphabet.
func decodeByteLevelRune(r rune) (byte, bool) {
	if r >= 0x00 && r <= 0xFF {
		// Bytes that encode to themselves.
		return byte(r), true
	}
	if r == 0x0100 {
		return 0x00, true
	}
	if r == 0x0143 {
		// Soft hyphen's dedicated slot.
		return 0x00ad, true
	}
	if r > 0x0100 && r <= 0x0120 {
		return byte(r - 0x0100), true
	}
	if r > 0x0120 && r <= 0x0142 {
		return byte(r - 0x00a2), true
	}
	return 0, false
}

View File

@@ -0,0 +1,137 @@
//go:build mlx
package tokenizer
import (
"runtime"
"strings"
"testing"
)
func equalIDs(a, b []int32) bool {
if len(a) != len(b) {
return false
}
for i := range a {
if a[i] != b[i] {
return false
}
}
return true
}
func TestEncodeRoundtripMiniLlama(t *testing.T) {
tok := benchmarkLoadMiniLlama(t)
inputs := []string{
"",
"hello",
"hello world",
" hello world ",
"don't we'll they're",
"1234567890",
"こんにちは世界",
"Hello 世界",
"func main() {}",
"<|begin_of_text|>system\nYou are concise.<|end_of_text|>",
strings.Repeat("The quick brown fox jumps over the lazy dog. ", 32),
}
for _, input := range inputs {
ids := tok.Encode(input, false)
got := tok.Decode(ids)
if got != input {
t.Fatalf("roundtrip mismatch for %q: got %q", input, got)
}
}
}
func TestSplitBySpecialTokensGreedyLongest(t *testing.T) {
data := []byte(`{
"model": {
"type": "BPE",
"vocab": {"a": 0, "b": 1},
"merges": []
},
"added_tokens": [
{"id": 2, "content": "<tag>", "special": true},
{"id": 3, "content": "<tag>x", "special": true}
]
}`)
tok, err := LoadFromBytes(data)
if err != nil {
t.Fatalf("failed to load tokenizer: %v", err)
}
input := "a<tag>xb"
want := []string{"a", "<tag>x", "b"}
got := tok.splitBySpecialTokens(input)
if len(got) != len(want) {
t.Fatalf("split length mismatch: got %v want %v", got, want)
}
for i := range want {
if got[i] != want[i] {
t.Fatalf("split mismatch at %d: got %v want %v", i, got, want)
}
}
}
func TestSplitBySpecialTokensFallbackWithoutCache(t *testing.T) {
data := []byte(`{
"model": {
"type": "BPE",
"vocab": {"a": 0, "b": 1},
"merges": []
},
"added_tokens": [
{"id": 2, "content": "<tag>", "special": true},
{"id": 3, "content": "<tag>x", "special": true}
]
}`)
tok, err := LoadFromBytes(data)
if err != nil {
t.Fatalf("failed to load tokenizer: %v", err)
}
input := "a<tag>xb"
want := []string{"a", "<tag>x", "b"}
// Simulate construction outside loader path where cache is not set.
tok.sortedSpecialTokens = nil
got := tok.splitBySpecialTokens(input)
if len(got) != len(want) {
t.Fatalf("split length mismatch: got %v want %v", got, want)
}
for i := range want {
if got[i] != want[i] {
t.Fatalf("split mismatch at %d: got %v want %v", i, got, want)
}
}
}
func TestEncodeDeterministicAcrossGOMAXPROCS(t *testing.T) {
tok := benchmarkLoadMiniLlama(t)
input := strings.Repeat("The quick brown fox jumps over the lazy dog. ", 640)
prev := runtime.GOMAXPROCS(0)
defer runtime.GOMAXPROCS(prev)
runtime.GOMAXPROCS(1)
seq := tok.Encode(input, false)
if prev < 2 {
runtime.GOMAXPROCS(2)
} else {
runtime.GOMAXPROCS(prev)
}
par := tok.Encode(input, false)
if !equalIDs(seq, par) {
t.Fatalf("encode mismatch between sequential and parallel paths: seq=%d par=%d", len(seq), len(par))
}
}

View File

@@ -0,0 +1,56 @@
//go:build mlx
package tokenizer
import (
"strconv"
"strings"
)
// Decode converts token IDs back to text.
// Out-of-range IDs are skipped. SentencePiece tokens have ▁ replaced with
// spaces and <0xNN> byte-fallback tokens emitted as raw bytes; BPE tokens are
// reversed through the GPT-2 byte-level encoding.
// NOTE(review): a negative id would panic on the Values index below — callers
// appear to pass only non-negative IDs; confirm.
func (t *Tokenizer) Decode(ids []int32) string {
	var sb strings.Builder
	for _, id := range ids {
		if int(id) >= len(t.vocab.Values) {
			continue
		}
		token := t.vocab.Values[id]
		switch t.typ {
		case TokenizerSentencePiece:
			// SentencePiece style: replace ▁ with space, decode byte tokens
			token = strings.ReplaceAll(token, "▁", " ")
			// Handle byte fallback tokens like <0x0D>
			if len(token) == 6 && token[0] == '<' && token[1] == '0' && token[2] == 'x' && token[5] == '>' {
				if v, err := strconv.ParseUint(token[3:5], 16, 8); err == nil {
					sb.WriteByte(byte(v))
					continue
				}
			}
			sb.WriteString(token)
		default:
			// GPT-2 BPE style: decode byte-level encoding
			for _, r := range token {
				switch {
				case r == 0x0100:
					// Mirror GGML tokenizer behavior for NULL byte.
					// 0x00 is omitted during decode.
					continue
				case r == 0x0143:
					// Soft hyphen's dedicated encoded slot.
					r = 0x00ad
				case r > 0x0100 && r <= 0x0120:
					// Shifted control bytes 0x00..0x20.
					r = r - 0x0100
				case r > 0x0120 && r <= 0x0142:
					// Shifted 0x7f..0xa0 range.
					r = r - 0x00a2
				}
				// Write as byte, not UTF-8 encoded rune
				sb.WriteByte(byte(r))
			}
		}
	}
	return sb.String()
}

View File

@@ -0,0 +1,289 @@
//go:build mlx
package tokenizer
import (
"runtime"
"sort"
"strings"
"sync"
"unicode"
"unicode/utf8"
)
const (
	// encodeParallelMinInputBytes is the minimum input size, in bytes, before
	// Encode even considers the parallel chunk-encoding path.
	encodeParallelMinInputBytes = 4 * 1024
	// encodeParallelMinChunksPerWorker is the minimum number of chunks each
	// worker must receive for parallel encoding to amortize its overhead.
	encodeParallelMinChunksPerWorker = 8
)

// tokenMatch is a half-open byte range [start, end) of a pretokenizer match
// within the string being encoded.
type tokenMatch struct {
	start int
	end   int
}

// encodeChunk is one unit of text to encode. isSpecial marks chunks that are
// special tokens, which map directly to an ID and bypass BPE merging.
type encodeChunk struct {
	text      string
	isSpecial bool
}
// isNonNewlineWhitespace reports whether s is non-empty and consists solely of
// whitespace runes, none of which are '\n' or '\r'.
func isNonNewlineWhitespace(s string) bool {
	if s == "" {
		return false
	}
	// Look for any rune that disqualifies the string: a newline, a carriage
	// return, or anything that is not whitespace.
	return strings.IndexFunc(s, func(r rune) bool {
		return r == '\n' || r == '\r' || !unicode.IsSpace(r)
	}) < 0
}
// splitBySpecialTokens splits text into parts, keeping special tokens as
// separate elements. Longer special tokens take priority over shorter ones.
func (t *Tokenizer) splitBySpecialTokens(s string) []string {
	if len(t.specialTokens) == 0 {
		return []string{s}
	}

	specials := t.sortedSpecialTokens
	if len(specials) == 0 {
		// Fallback for tokenizers constructed outside the loaders: sort the
		// special tokens longest-first so prefix matching prefers them.
		specials = make([]string, 0, len(t.specialTokens))
		for tok := range t.specialTokens {
			specials = append(specials, tok)
		}
		sort.Slice(specials, func(i, j int) bool {
			return len(specials[i]) > len(specials[j])
		})
	}

	var parts []string
	rest := s
	for len(rest) > 0 {
		// Does a special token start right here?
		matched := false
		for _, tok := range specials {
			if strings.HasPrefix(rest, tok) {
				parts = append(parts, tok)
				rest = rest[len(tok):]
				matched = true
				break
			}
		}
		if matched {
			continue
		}
		// No: take everything up to the nearest special token occurrence
		// (or the whole remainder if none appears).
		cut := len(rest)
		for _, tok := range specials {
			if idx := strings.Index(rest, tok); idx >= 0 && idx < cut {
				cut = idx
			}
		}
		if cut > 0 {
			parts = append(parts, rest[:cut])
		}
		rest = rest[cut:]
	}
	return parts
}
// adjustWhitespaceBoundary emulates the PCRE `\s+(?!\S)` lookahead split that
// RE2 cannot express: when curr is pure non-newline whitespace and the next
// match begins with a letter, the final space of curr is moved onto next (or
// all of curr, if it is a single rune).
func adjustWhitespaceBoundary(part string, curr, next *tokenMatch) {
	ws := part[curr.start:curr.end]
	following := part[next.start:next.end]
	if !isNonNewlineWhitespace(ws) || len(following) == 0 {
		return
	}
	if first, _ := utf8.DecodeRuneInString(following); !unicode.IsLetter(first) {
		return
	}

	// Find the byte offset where the last whitespace rune of curr begins.
	splitAt := curr.end
	for j := curr.end; j > curr.start; {
		r, size := utf8.DecodeLastRuneInString(part[curr.start:j])
		j -= size
		if unicode.IsSpace(r) {
			splitAt = j
			break
		}
	}

	if splitAt > curr.start {
		// Keep the leading whitespace in curr; hand the last space to next.
		curr.end = splitAt
		next.start = splitAt
	} else {
		// curr was a single space: give it entirely to next.
		next.start = curr.start
		curr.end = curr.start
	}
}
// forEachPartChunk decomposes one part (as produced by splitBySpecialTokens)
// into encode chunks and calls fn for each, in order. A part that is itself a
// special token yields exactly one special chunk. Otherwise the pretokenizer
// regex is applied repeatedly; adjacent matches get their whitespace boundary
// adjusted (see adjustWhitespaceBoundary) before the earlier match is emitted.
func (t *Tokenizer) forEachPartChunk(part string, fn func(encodeChunk)) {
	if _, ok := t.specialTokens[part]; ok {
		fn(encodeChunk{text: part, isSpecial: true})
		return
	}
	if t.pretokenizer == nil {
		// No pretokenizer configured: the whole part is a single chunk.
		fn(encodeChunk{text: part, isSpecial: false})
		return
	}
	re := t.pretokenizer
	offset := 0
	// Prime the loop with the first match; no match means nothing to emit.
	loc := re.FindStringIndex(part[offset:])
	if loc == nil {
		return
	}
	curr := tokenMatch{start: offset + loc[0], end: offset + loc[1]}
	offset += loc[1]
	for {
		loc = re.FindStringIndex(part[offset:])
		if loc == nil {
			// Last match: emit it (unless boundary adjustment emptied it).
			if curr.end > curr.start {
				fn(encodeChunk{text: part[curr.start:curr.end], isSpecial: false})
			}
			return
		}
		next := tokenMatch{start: offset + loc[0], end: offset + loc[1]}
		offset += loc[1]
		// May shrink curr and grow next to mimic the lookahead split.
		adjustWhitespaceBoundary(part, &curr, &next)
		if curr.end > curr.start {
			fn(encodeChunk{text: part[curr.start:curr.end], isSpecial: false})
		}
		curr = next
	}
}
// appendEncodedChunk appends the token IDs for a single chunk to ids and
// returns the extended slice. Special chunks map directly to their ID;
// unknown special tokens are dropped.
func (t *Tokenizer) appendEncodedChunk(ids []int32, c encodeChunk) []int32 {
	if !c.isSpecial {
		return t.encodeChunkInto(c.text, ids)
	}
	id, ok := t.specialTokens[c.text]
	if !ok {
		return ids
	}
	return append(ids, id)
}
// Encode tokenizes text to token IDs.
// Parallel encoding is used only for very large inputs with enough chunks per worker.
func (t *Tokenizer) Encode(s string, addBOS bool) []int32 {
	// First: split by special tokens
	parts := t.splitBySpecialTokens(s)
	// Fast path: encode sequentially without materializing chunk slices.
	if len(s) < encodeParallelMinInputBytes {
		var ids []int32
		for _, part := range parts {
			t.forEachPartChunk(part, func(c encodeChunk) {
				ids = t.appendEncodedChunk(ids, c)
			})
		}
		// Prepend BOS only when the vocabulary actually defines one.
		if addBOS && t.vocab.BOS >= 0 {
			ids = append([]int32{t.vocab.BOS}, ids...)
		}
		return ids
	}
	// For large inputs collect chunks to enable parallel processing.
	var allChunks []encodeChunk
	for _, part := range parts {
		t.forEachPartChunk(part, func(c encodeChunk) {
			allChunks = append(allChunks, c)
		})
	}
	// Encode chunks. Use the parallel path only when the chunk count is
	// large enough to amortize goroutine/synchronization overhead.
	useParallel := true
	numWorkers := runtime.GOMAXPROCS(0)
	if numWorkers > len(allChunks) {
		numWorkers = len(allChunks)
	}
	if numWorkers < 2 || len(allChunks) < numWorkers*encodeParallelMinChunksPerWorker {
		useParallel = false
	}
	var ids []int32
	if !useParallel {
		for _, c := range allChunks {
			ids = t.appendEncodedChunk(ids, c)
		}
	} else {
		// Each worker encodes a contiguous slice of chunks into its own
		// results slot; concatenating results[0..n) in order preserves the
		// original chunk order, so output is identical to the sequential path.
		chunksPer := (len(allChunks) + numWorkers - 1) / numWorkers
		results := make([][]int32, numWorkers)
		var wg sync.WaitGroup
		for i := 0; i < numWorkers; i++ {
			start := i * chunksPer
			end := start + chunksPer
			if end > len(allChunks) {
				end = len(allChunks)
			}
			if start >= end {
				continue
			}
			wg.Add(1)
			go func(i int, chunks []encodeChunk) {
				defer wg.Done()
				var r []int32
				for _, c := range chunks {
					r = t.appendEncodedChunk(r, c)
				}
				results[i] = r
			}(i, allChunks[start:end])
		}
		wg.Wait()
		for _, r := range results {
			ids = append(ids, r...)
		}
	}
	if addBOS && t.vocab.BOS >= 0 {
		ids = append([]int32{t.vocab.BOS}, ids...)
	}
	return ids
}
// encodeChunkInto appends encoded tokens to ids and returns the extended
// slice. Both tokenizer types share the BPE merge loop; they differ only in
// how raw text is mapped into vocabulary symbols first.
func (t *Tokenizer) encodeChunkInto(s string, ids []int32) []int32 {
	if len(s) == 0 {
		return ids
	}

	var encoded string
	switch t.typ {
	case TokenizerSentencePiece:
		// SentencePiece represents spaces with the ▁ metasymbol.
		encoded = strings.ReplaceAll(s, " ", "▁")
	default:
		// GPT-2 byte-level encoding: map every byte through the precomputed
		// byte-to-rune table.
		var b strings.Builder
		b.Grow(2 * len(s))
		for i := 0; i < len(s); i++ {
			b.WriteRune(byteToRune[s[i]])
		}
		encoded = b.String()
	}

	// Fast path: the entire chunk may already be a single vocabulary entry.
	if id, ok := t.vocab.Reverse[encoded]; ok {
		return append(ids, id)
	}
	return t.encodeBPEMerge(encoded, ids)
}

View File

@@ -0,0 +1,207 @@
//go:build mlx
package tokenizer
import (
"bufio"
"encoding/json"
"os"
"path/filepath"
"runtime"
"strings"
"testing"
)
// llama32GGMLFixturePath resolves file inside the shared llama3.2 testdata
// directory, located relative to this source file.
func llama32GGMLFixturePath(tb testing.TB, file string) string {
	tb.Helper()
	_, src, _, ok := runtime.Caller(0)
	if !ok {
		tb.Fatal("failed to resolve test file path")
	}
	base := filepath.Dir(src)
	return filepath.Join(base, "..", "..", "tokenizer", "testdata", "llama3.2", file)
}
// loadLlama32FromGGMLFixture builds a tokenizer from the GGML-style fixture
// files (encoder.json vocabulary + vocab.bpe merges) by synthesizing an
// in-memory tokenizer.json payload and loading it with LoadFromBytes.
func loadLlama32FromGGMLFixture(tb testing.TB) *Tokenizer {
	tb.Helper()
	f, err := os.Open(llama32GGMLFixturePath(tb, "encoder.json"))
	if err != nil {
		tb.Fatalf("failed to open encoder.json: %v", err)
	}
	defer f.Close()
	vocab := make(map[string]int32)
	if err := json.NewDecoder(f).Decode(&vocab); err != nil {
		tb.Fatalf("failed to decode encoder.json: %v", err)
	}
	type addedToken struct {
		ID      int32  `json:"id"`
		Content string `json:"content"`
		Special bool   `json:"special"`
	}
	// Append the Llama special tokens if the fixture vocabulary lacks them,
	// assigning the next free IDs in order.
	var addedTokens []addedToken
	for _, token := range []string{"<|begin_of_text|>", "<|end_of_text|>"} {
		if _, ok := vocab[token]; !ok {
			id := int32(len(vocab))
			vocab[token] = id
			addedTokens = append(addedTokens, addedToken{ID: id, Content: token, Special: true})
		}
	}
	mf, err := os.Open(llama32GGMLFixturePath(tb, "vocab.bpe"))
	if err != nil {
		tb.Fatalf("failed to open vocab.bpe: %v", err)
	}
	defer mf.Close()
	// Read merges line by line, skipping the "#version" header and blanks.
	var merges []string
	scanner := bufio.NewScanner(mf)
	for scanner.Scan() {
		line := scanner.Text()
		if strings.HasPrefix(line, "#") {
			continue
		}
		line = strings.TrimSpace(line)
		if line != "" {
			merges = append(merges, line)
		}
	}
	if err := scanner.Err(); err != nil {
		tb.Fatalf("failed to read vocab.bpe: %v", err)
	}
	// Synthesize a minimal HuggingFace tokenizer.json structure: a BPE model
	// plus a Sequence pre_tokenizer carrying the Llama 3 split regex.
	payload := struct {
		Model struct {
			Type   string           `json:"type"`
			Vocab  map[string]int32 `json:"vocab"`
			Merges []string         `json:"merges"`
		} `json:"model"`
		PreTokenizer struct {
			Type          string `json:"type"`
			Pretokenizers []struct {
				Type    string `json:"type"`
				Pattern struct {
					Regex string `json:"Regex"`
				} `json:"pattern"`
			} `json:"pretokenizers"`
		} `json:"pre_tokenizer"`
		AddedTokens []addedToken `json:"added_tokens"`
	}{}
	payload.Model.Type = "BPE"
	payload.Model.Vocab = vocab
	payload.Model.Merges = merges
	payload.PreTokenizer.Type = "Sequence"
	payload.PreTokenizer.Pretokenizers = []struct {
		Type    string `json:"type"`
		Pattern struct {
			Regex string `json:"Regex"`
		} `json:"pattern"`
	}{
		{
			Type: "Split",
			Pattern: struct {
				Regex string `json:"Regex"`
			}{
				Regex: `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
			},
		},
	}
	payload.AddedTokens = addedTokens
	data, err := json.Marshal(payload)
	if err != nil {
		tb.Fatalf("failed to marshal synthetic tokenizer.json: %v", err)
	}
	tok, err := LoadFromBytes(data)
	if err != nil {
		tb.Fatalf("failed to load tokenizer from fixture data: %v", err)
	}
	return tok
}
// TestGGMLLlamaKnownEncodings checks a handful of inputs against token IDs
// produced by the reference GGML Llama 3.2 tokenizer, covering special tokens
// at the start, middle, and end of the input.
func TestGGMLLlamaKnownEncodings(t *testing.T) {
	tok := loadLlama32FromGGMLFixture(t)
	for input, want := range map[string][]int32{
		"hello world":           {15339, 1917},
		"hello <|end_of_text|>": {15339, 220, 128001},
		"<|begin_of_text|>A B!": {128000, 32, 426, 0},
		"<|begin_of_text|>A<|end_of_text|>B!":                                 {128000, 32, 128001, 33, 0},
		"<|begin_of_text|>A<|end_of_text|>B<|begin_of_text|>!":                {128000, 32, 128001, 33, 128000, 0},
		"<|begin_of_text|>A<|end_of_text|>B<|begin_of_text|>!<|end_of_text|>": {128000, 32, 128001, 33, 128000, 0, 128001},
	} {
		if got := tok.Encode(input, false); !equalIDs(got, want) {
			t.Fatalf("encode mismatch for %q:\n got: %v\n want: %v", input, got, want)
		}
	}
}
// TestGGMLLlamaRepeatedZeros verifies greedy grouping of digit runs: per the
// table, "000" encodes to 931, "00" to 410, and "0" to 15, so a run of n
// zeros becomes floor(n/3) copies of 931 plus one token for the remainder.
func TestGGMLLlamaRepeatedZeros(t *testing.T) {
	tok := loadLlama32FromGGMLFixture(t)
	// wants[i] is the expected encoding for a run of i+1 zeros.
	wants := [][]int32{
		{15},
		{410},
		{931},
		{931, 15},
		{931, 410},
		{931, 931},
		{931, 931, 15},
		{931, 931, 410},
		{931, 931, 931},
		{931, 931, 931, 15},
		{931, 931, 931, 410},
		{931, 931, 931, 931},
		{931, 931, 931, 931, 15},
		{931, 931, 931, 931, 410},
		{931, 931, 931, 931, 931},
		{931, 931, 931, 931, 931, 15},
		{931, 931, 931, 931, 931, 410},
	}
	for i, want := range wants {
		input := strings.Repeat("0", i+1)
		if got := tok.Encode(input, false); !equalIDs(got, want) {
			t.Fatalf("encode mismatch for %q:\n got: %v\n want: %v", input, got, want)
		}
	}
}
// TestGGMLLlamaRoundtripAndByteBehavior verifies Encode/Decode round-trips
// for ASCII, leading/trailing whitespace, and multibyte text, and that a NUL
// byte is dropped on decode to match GGML tokenizer behavior.
func TestGGMLLlamaRoundtripAndByteBehavior(t *testing.T) {
	tok := loadLlama32FromGGMLFixture(t)
	inputs := []string{
		"hello",
		"hello ",
		"hello ",
		" hello",
		" hello ",
		" hello ",
		"hello world",
		"请考试我的软件12345",
	}
	for _, input := range inputs {
		if decoded := tok.Decode(tok.Encode(input, false)); decoded != input {
			t.Fatalf("roundtrip mismatch for %q: got %q", input, decoded)
		}
	}

	// Match GGML tokenizer behavior: 0x00 is omitted when decoding.
	ids := tok.Encode(string(rune(0x00)), false)
	if decoded := tok.Decode(ids); decoded != "" {
		t.Fatalf("expected empty decode for 0x00, got %q (ids=%v)", decoded, ids)
	}
}

View File

@@ -0,0 +1,458 @@
//go:build mlx
package tokenizer
import (
"encoding/json"
"fmt"
"regexp"
"sort"
"strings"
)
// TokenizerConfig holds optional configuration data that can be passed to
// LoadFromBytesWithConfig. Each field carries the raw bytes of the matching
// HuggingFace companion file; any field may be nil when the file is absent.
type TokenizerConfig struct {
	TokenizerConfigJSON  []byte // tokenizer_config.json content
	GenerationConfigJSON []byte // generation_config.json content
	SpecialTokensMapJSON []byte // special_tokens_map.json content
	ConfigJSON           []byte // config.json content
}
// LoadFromBytes loads a tokenizer from tokenizer.json bytes.
// This is useful when loading from blob storage where the file content is already in memory.
//
// Note: This won't load special token config from companion files. Use
// LoadFromBytesWithConfig to provide tokenizer_config.json data for proper
// PAD/EOS token loading.
func LoadFromBytes(data []byte) (*Tokenizer, error) {
	return loadFromTokenizerJSON(data)
}
// LoadFromBytesWithConfig loads a tokenizer from tokenizer.json bytes and, if
// config is non-nil, applies special token settings from the companion files.
// This is useful when loading from blob storage where the companion config
// files are also blobs.
func LoadFromBytesWithConfig(data []byte, config *TokenizerConfig) (*Tokenizer, error) {
	tok, err := loadFromTokenizerJSON(data)
	if err != nil {
		return nil, err
	}
	if config != nil {
		loadSpecialTokenConfigFromBytes(tok, config)
	}
	return tok, nil
}
// loadFromTokenizerJSON parses tokenizer.json content from bytes and builds a
// Tokenizer: vocabulary, merges, special (added) tokens, byte-fallback table,
// tokenizer type detection, and a compiled pretokenizer regex for BPE models.
func loadFromTokenizerJSON(data []byte) (*Tokenizer, error) {
	var raw struct {
		Model struct {
			Type   string           `json:"type"` // "BPE"
			Vocab  map[string]int32 `json:"vocab"`
			Merges json.RawMessage  `json:"merges"` // Can be []string or [][]string (BPE only)
		} `json:"model"`
		PreTokenizer json.RawMessage `json:"pre_tokenizer"`
		Decoder      json.RawMessage `json:"decoder"`
		AddedTokens  []struct {
			ID      int32  `json:"id"`
			Content string `json:"content"`
			Special bool   `json:"special"`
		} `json:"added_tokens"`
	}
	if err := json.Unmarshal(data, &raw); err != nil {
		return nil, fmt.Errorf("failed to parse tokenizer: %w", err)
	}
	// Covers SentencePiece and BPE models (SentencePiece-style tokenizer.json
	// files also declare model type "BPE"); anything else is rejected.
	if raw.Model.Type != "BPE" {
		return nil, fmt.Errorf("unsupported tokenizer type: %s", raw.Model.Type)
	}
	// Parse merges - can be []string (Llama) or [][]string (GPT-OSS).
	var mergesStrings []string
	if raw.Model.Merges != nil {
		var mergesArrays [][]string
		if err := json.Unmarshal(raw.Model.Merges, &mergesStrings); err != nil {
			// Try array of arrays format
			if err := json.Unmarshal(raw.Model.Merges, &mergesArrays); err != nil {
				return nil, fmt.Errorf("failed to parse merges: %w", err)
			}
			// Convert [][]string to []string, joining each pair with a space
			// to match the classic merges.txt line format.
			mergesStrings = make([]string, len(mergesArrays))
			for i, pair := range mergesArrays {
				if len(pair) != 2 {
					return nil, fmt.Errorf("failed to parse merges: expected merge pair of length 2, got %d", len(pair))
				}
				mergesStrings[i] = pair[0] + " " + pair[1]
			}
		}
	}
	// Build tokenizer. Reverse aliases the decoded vocab map directly;
	// BOS/PAD start at -1 meaning "not configured".
	t := &Tokenizer{
		vocab: &Vocabulary{
			Values:  make([]string, len(raw.Model.Vocab)),
			Reverse: raw.Model.Vocab,
			Merges:  make(map[string]int, len(mergesStrings)),
			BOS:     -1,
			PAD:     -1,
		},
		specialTokens: make(map[string]int32),
	}
	// Build values array, growing it when an ID exceeds the current length
	// (IDs are not guaranteed to be dense or in range of len(vocab)).
	for token, id := range raw.Model.Vocab {
		if int(id) >= len(t.vocab.Values) {
			newValues := make([]string, id+1)
			copy(newValues, t.vocab.Values)
			t.vocab.Values = newValues
		}
		t.vocab.Values[id] = token
	}
	// Build merges map: merge string -> priority (lower index wins).
	for i, merge := range mergesStrings {
		t.vocab.Merges[merge] = i
	}
	// Add all added_tokens to vocabulary and special tokens map.
	// HuggingFace treats ALL added_tokens as special for tokenization purposes -
	// they bypass BPE and get their own token ID. The "special" flag just indicates
	// if it's a "truly special" token like BOS/EOS/PAD, but for tokenization we need
	// to treat all added_tokens as special to match HuggingFace behavior.
	for _, tok := range raw.AddedTokens {
		if int(tok.ID) >= len(t.vocab.Values) {
			newValues := make([]string, tok.ID+1)
			copy(newValues, t.vocab.Values)
			t.vocab.Values = newValues
		}
		t.vocab.Values[tok.ID] = tok.Content
		t.specialTokens[tok.Content] = tok.ID // Add ALL added_tokens to special tokens
	}
	// Precompute byte token IDs for <0xNN> fallback
	initByteTokens(t)
	// Determine tokenizer type from the decoder configuration.
	switch {
	case detectSentencePiece(raw.Decoder):
		t.typ = TokenizerSentencePiece
	default:
		t.typ = TokenizerBPE
	}
	// Parse and compile pretokenizer pattern (BPE only - SentencePiece doesn't use pretokenizer)
	if t.typ == TokenizerBPE {
		pattern := extractPretokenizer(raw.PreTokenizer)
		if pattern == "" {
			// Fall back to the classic GPT-2 split pattern.
			pattern = `'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+`
		}
		re, err := regexp.Compile(rewritePatternForRE2(pattern))
		if err != nil {
			return nil, fmt.Errorf("failed to compile pretokenizer regex %q: %w", pattern, err)
		}
		t.pretokenizer = re
	}
	cacheSortedSpecialTokens(t)
	return t, nil
}
// cacheSortedSpecialTokens rebuilds the longest-first list of special tokens
// used by splitBySpecialTokens for greedy prefix matching.
func cacheSortedSpecialTokens(t *Tokenizer) {
	t.sortedSpecialTokens = nil
	if len(t.specialTokens) == 0 {
		return
	}
	sorted := make([]string, 0, len(t.specialTokens))
	for tok := range t.specialTokens {
		sorted = append(sorted, tok)
	}
	// Longer tokens first so the longest possible match wins.
	sort.Slice(sorted, func(a, b int) bool {
		return len(sorted[a]) > len(sorted[b])
	})
	t.sortedSpecialTokens = sorted
}
// specialTokenConfigData bundles the raw companion-config payloads consumed
// by applySpecialTokenConfig. Any field may be nil/empty when absent.
type specialTokenConfigData struct {
	tokenizerConfigJSON  []byte
	generationConfigJSON []byte
	specialTokensMapJSON []byte
	configJSON           []byte
}
// applySpecialTokenConfig fills in BOS/EOS/PAD and AddBOS/AddEOS on the
// tokenizer from the companion config files, in priority order:
// generation_config.json, then config.json, then tokenizer_config.json, then
// special_tokens_map.json. Later sources only fill values still unset
// (BOS/PAD < 0, empty EOS list). Unparseable files are silently skipped.
func applySpecialTokenConfig(t *Tokenizer, config specialTokenConfigData) {
	// parseTokenIDs accepts the two shapes HuggingFace uses for token IDs:
	// a single number or an array of numbers.
	parseTokenIDs := func(v interface{}) []int32 {
		switch val := v.(type) {
		case float64:
			return []int32{int32(val)}
		case []interface{}:
			ids := make([]int32, 0, len(val))
			for _, id := range val {
				if f, ok := id.(float64); ok {
					ids = append(ids, int32(f))
				}
			}
			return ids
		}
		return nil
	}
	// Priority 1: generation_config.json
	if len(config.generationConfigJSON) > 0 {
		var genConfig struct {
			EOSTokenID interface{} `json:"eos_token_id"`
			BOSTokenID interface{} `json:"bos_token_id"`
		}
		if err := json.Unmarshal(config.generationConfigJSON, &genConfig); err == nil {
			if ids := parseTokenIDs(genConfig.EOSTokenID); len(ids) > 0 {
				t.vocab.EOS = ids
			}
			if ids := parseTokenIDs(genConfig.BOSTokenID); len(ids) > 0 {
				t.vocab.BOS = ids[0]
			}
		}
	}
	// Priority 2: config.json (only consulted when something is still unset)
	if len(config.configJSON) > 0 && (len(t.vocab.EOS) == 0 || t.vocab.BOS < 0) {
		var modelConfig struct {
			EOSTokenID interface{} `json:"eos_token_id"`
			BOSTokenID interface{} `json:"bos_token_id"`
		}
		if err := json.Unmarshal(config.configJSON, &modelConfig); err == nil {
			if len(t.vocab.EOS) == 0 {
				if ids := parseTokenIDs(modelConfig.EOSTokenID); len(ids) > 0 {
					t.vocab.EOS = ids
				}
			}
			if t.vocab.BOS < 0 {
				if ids := parseTokenIDs(modelConfig.BOSTokenID); len(ids) > 0 {
					t.vocab.BOS = ids[0]
				}
			}
		}
	}
	// Priority 3: tokenizer_config.json - tokens are given by content string
	// here, so they are resolved through the special-token map.
	if len(config.tokenizerConfigJSON) > 0 {
		var tokConfig struct {
			BOSToken    interface{} `json:"bos_token"`
			EOSToken    interface{} `json:"eos_token"`
			PADToken    interface{} `json:"pad_token"`
			AddBOSToken *bool       `json:"add_bos_token"`
			AddEOSToken *bool       `json:"add_eos_token"`
		}
		if err := json.Unmarshal(config.tokenizerConfigJSON, &tokConfig); err == nil {
			if t.vocab.BOS < 0 {
				if bosStr := extractTokenString(tokConfig.BOSToken); bosStr != "" {
					if id, ok := t.specialTokens[bosStr]; ok {
						t.vocab.BOS = id
					}
				}
			}
			if len(t.vocab.EOS) == 0 {
				if eosStr := extractTokenString(tokConfig.EOSToken); eosStr != "" {
					if id, ok := t.specialTokens[eosStr]; ok {
						t.vocab.EOS = []int32{id}
					}
				}
			}
			if t.vocab.PAD < 0 {
				if padStr := extractTokenString(tokConfig.PADToken); padStr != "" {
					if id, ok := t.specialTokens[padStr]; ok {
						t.vocab.PAD = id
					}
				}
			}
			// add_bos_token / add_eos_token override unconditionally when present.
			if tokConfig.AddBOSToken != nil {
				t.vocab.AddBOS = *tokConfig.AddBOSToken
			}
			if tokConfig.AddEOSToken != nil {
				t.vocab.AddEOS = *tokConfig.AddEOSToken
			}
		}
	}
	// Priority 4: special_tokens_map.json
	if len(config.specialTokensMapJSON) > 0 {
		var tokensMap map[string]interface{}
		if err := json.Unmarshal(config.specialTokensMapJSON, &tokensMap); err == nil {
			if t.vocab.BOS < 0 {
				if bosStr := extractTokenString(tokensMap["bos_token"]); bosStr != "" {
					if id, ok := t.specialTokens[bosStr]; ok {
						t.vocab.BOS = id
					}
				}
			}
			if len(t.vocab.EOS) == 0 {
				if eosStr := extractTokenString(tokensMap["eos_token"]); eosStr != "" {
					if id, ok := t.specialTokens[eosStr]; ok {
						t.vocab.EOS = []int32{id}
					}
				}
			}
			if t.vocab.PAD < 0 {
				if padStr := extractTokenString(tokensMap["pad_token"]); padStr != "" {
					if id, ok := t.specialTokens[padStr]; ok {
						t.vocab.PAD = id
					}
				}
			}
		}
	}
}
// extractTokenString extracts the token string from various formats used in
// HuggingFace configs. Tokens can be represented as:
//   - string: "token"
//   - object: {"content": "token", ...}
//
// Any other shape (including nil) yields "".
func extractTokenString(v interface{}) string {
	switch tok := v.(type) {
	case string:
		return tok
	case map[string]interface{}:
		if content, ok := tok["content"].(string); ok {
			return content
		}
	}
	return ""
}
// rewritePatternForRE2 rewrites HuggingFace pretokenizer regex patterns to be
// compatible with Go's regexp package (RE2). HuggingFace patterns use PCRE
// features RE2 lacks:
//   - (?!\S) negative lookahead
//   - (?i:...) inline case-insensitive groups
//
// The lookahead alternation \s+(?!\S)|\s+ is collapsed to \s+ — the boundary
// behavior it encoded is restored after matching (see adjustWhitespaceBoundary).
// The case-insensitive contraction group is expanded into explicit
// per-character alternations.
func rewritePatternForRE2(pattern string) string {
	// Ordered: the optional (?i:...)? form must be rewritten before the bare
	// form so the trailing '?' is carried over intact.
	replacements := [...][2]string{
		{`\s+(?!\S)|\s+`, `\s+`},
		{`(?i:'s|'t|'re|'ve|'m|'ll|'d)?`, `(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?`},
		{`(?i:'s|'t|'re|'ve|'m|'ll|'d)`, `(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])`},
	}
	for _, r := range replacements {
		pattern = strings.ReplaceAll(pattern, r[0], r[1])
	}
	return pattern
}
// loadSpecialTokenConfigFromBytes loads special token configuration from the
// byte slices carried by config, delegating to applySpecialTokenConfig.
func loadSpecialTokenConfigFromBytes(t *Tokenizer, config *TokenizerConfig) {
	data := specialTokenConfigData{
		tokenizerConfigJSON:  config.TokenizerConfigJSON,
		generationConfigJSON: config.GenerationConfigJSON,
		specialTokensMapJSON: config.SpecialTokensMapJSON,
		configJSON:           config.ConfigJSON,
	}
	applySpecialTokenConfig(t, data)
}
// detectSentencePiece checks if the decoder uses SentencePiece-style decoding
// (▁ for spaces) vs GPT-2 byte-level encoding. It returns true only for a
// Sequence decoder containing a Replace step that maps ▁ to a space; every
// other decoder shape — including a plain GPT-2 style ByteLevel decoder,
// which the previous implementation probed with a redundant second unmarshal
// that could only ever return the same false result — falls through to false.
func detectSentencePiece(data json.RawMessage) bool {
	if data == nil {
		return false
	}
	// Check for Sequence decoder with Replace step (SentencePiece style)
	var seq struct {
		Type     string `json:"type"`
		Decoders []struct {
			Type    string `json:"type"`
			Pattern struct {
				String string `json:"String"`
			} `json:"pattern"`
		} `json:"decoders"`
	}
	if err := json.Unmarshal(data, &seq); err == nil && seq.Type == "Sequence" {
		for _, dec := range seq.Decoders {
			// Look for a Replace decoder that converts ▁ to space.
			if dec.Type == "Replace" && dec.Pattern.String == "▁" {
				return true
			}
		}
	}
	return false
}
// initByteTokens precomputes byte token IDs for <0xNN> fallback encoding.
// Bytes without a matching vocabulary token are left at -1.
func initByteTokens(t *Tokenizer) {
	// Mark every byte as having no fallback token.
	for i := range t.vocab.byteTokens {
		t.vocab.byteTokens[i] = -1
	}
	// Record the ID of each <0xNN> token present in the vocabulary.
	for b := 0; b < 256; b++ {
		token := fmt.Sprintf("<0x%02X>", b)
		if id, ok := t.vocab.Reverse[token]; ok {
			t.vocab.byteTokens[b] = id
		}
	}
}
// extractPretokenizer extracts the regex pattern from the pre_tokenizer
// config. It handles a single Split pretokenizer directly, or a Sequence of
// pretokenizers, in which case the first Split entry with a pattern wins.
// An empty string means no usable pattern was found.
func extractPretokenizer(data json.RawMessage) string {
	if data == nil {
		return ""
	}
	type splitConfig struct {
		Type    string `json:"type"`
		Pattern struct {
			Regex string `json:"Regex"`
		} `json:"pattern"`
	}
	// A single Split pretokenizer carries the pattern directly.
	var single splitConfig
	if err := json.Unmarshal(data, &single); err == nil && single.Pattern.Regex != "" {
		return single.Pattern.Regex
	}
	// A Sequence wraps several pretokenizers; take the first Split pattern.
	var seq struct {
		Type          string        `json:"type"`
		Pretokenizers []splitConfig `json:"pretokenizers"`
	}
	if err := json.Unmarshal(data, &seq); err == nil && seq.Type == "Sequence" {
		for _, pt := range seq.Pretokenizers {
			if pt.Type == "Split" && pt.Pattern.Regex != "" {
				return pt.Pattern.Regex
			}
		}
	}
	return ""
}

View File

@@ -0,0 +1,26 @@
//go:build mlx
package tokenizer
import (
"strings"
"testing"
)
// TestLoadFromBytesRejectsWordPiece ensures loading a WordPiece model fails
// with a descriptive error, since only BPE-style models are supported.
func TestLoadFromBytesRejectsWordPiece(t *testing.T) {
	payload := []byte(`{
"model": {
"type": "WordPiece",
"vocab": {"[UNK]": 0, "hello": 1}
},
"added_tokens": []
}`)
	_, err := LoadFromBytes(payload)
	if err == nil {
		t.Fatal("expected WordPiece load to fail")
	}
	if !strings.Contains(err.Error(), "unsupported tokenizer type: WordPiece") {
		t.Fatalf("unexpected error: %v", err)
	}
}