Compare commits


1 Commit

Author SHA1 Message Date
Patrick Devine
c75b428249 fix: fixes a memory leak in bfloat16 package
This change vendors in the bfloat16 package from
github.com/d4l3k/go-bfloat16/ and fixes a memory leak which
was being caused by using unsafe pointers instead of the
math package.
2025-03-17 21:46:12 -07:00
73 changed files with 427 additions and 3841 deletions

View File

@@ -512,7 +512,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Ollama for Zig](https://github.com/dravenk/ollama-zig)
- [Abso](https://github.com/lunary-ai/abso) (OpenAI-compatible TypeScript SDK for any LLM provider)
- [Nichey](https://github.com/goodreasonai/nichey) is a Python package for generating custom wikis for your research topic
- [Ollama for D](https://github.com/kassane/ollama-d)
### Mobile

View File

@@ -1,178 +0,0 @@
package benchmark
import (
"context"
"flag"
"fmt"
"testing"
"time"
"github.com/ollama/ollama/api"
)
// Command line flags
var modelFlag string
func init() {
flag.StringVar(&modelFlag, "m", "", "Name of the model to benchmark")
flag.Lookup("m").DefValue = "model"
}
// modelName returns the model name from flags, failing the test if not set
func modelName(b *testing.B) string {
if modelFlag == "" {
b.Fatal("Error: -m flag is required for benchmark tests")
}
return modelFlag
}
type TestCase struct {
name string
prompt string
maxTokens int
}
// runGenerateBenchmark contains the common generate and metrics logic
func runGenerateBenchmark(b *testing.B, ctx context.Context, client *api.Client, req *api.GenerateRequest) {
start := time.Now()
var ttft time.Duration
var metrics api.Metrics
err := client.Generate(ctx, req, func(resp api.GenerateResponse) error {
if ttft == 0 && resp.Response != "" {
ttft = time.Since(start)
}
if resp.Done {
metrics = resp.Metrics
}
return nil
})
// Report custom metrics as part of the benchmark results
b.ReportMetric(float64(ttft.Milliseconds()), "ttft_ms")
b.ReportMetric(float64(metrics.LoadDuration.Milliseconds()), "load_ms")
// Token throughput metrics
promptThroughput := float64(metrics.PromptEvalCount) / metrics.PromptEvalDuration.Seconds()
genThroughput := float64(metrics.EvalCount) / metrics.EvalDuration.Seconds()
b.ReportMetric(promptThroughput, "prompt_tok/s")
b.ReportMetric(genThroughput, "gen_tok/s")
// Token counts
b.ReportMetric(float64(metrics.PromptEvalCount), "prompt_tokens")
b.ReportMetric(float64(metrics.EvalCount), "gen_tokens")
if err != nil {
b.Fatal(err)
}
}
// BenchmarkColdStart runs benchmarks with model loading from cold state
func BenchmarkColdStart(b *testing.B) {
client := setup(b)
tests := []TestCase{
{"short_prompt", "Write a long story", 100},
{"medium_prompt", "Write a detailed economic analysis", 500},
{"long_prompt", "Write a comprehensive AI research paper", 1000},
}
m := modelName(b)
for _, tt := range tests {
b.Run(fmt.Sprintf("%s/cold/%s", m, tt.name), func(b *testing.B) {
ctx := context.Background()
// Set number of tokens as our throughput metric
b.SetBytes(int64(tt.maxTokens))
for b.Loop() {
b.StopTimer()
// Ensure model is unloaded before each iteration
unload(client, m, b)
b.StartTimer()
req := &api.GenerateRequest{
Model: m,
Prompt: tt.prompt,
Options: map[string]interface{}{"num_predict": tt.maxTokens, "temperature": 0.1},
}
runGenerateBenchmark(b, ctx, client, req)
}
})
}
}
// BenchmarkWarmStart runs benchmarks with pre-loaded model
func BenchmarkWarmStart(b *testing.B) {
client := setup(b)
tests := []TestCase{
{"short_prompt", "Write a long story", 100},
{"medium_prompt", "Write a detailed economic analysis", 500},
{"long_prompt", "Write a comprehensive AI research paper", 1000},
}
m := modelName(b)
for _, tt := range tests {
b.Run(fmt.Sprintf("%s/warm/%s", m, tt.name), func(b *testing.B) {
ctx := context.Background()
// Pre-warm the model
warmup(client, m, tt.prompt, b)
// Set number of tokens as our throughput metric
b.SetBytes(int64(tt.maxTokens))
for b.Loop() {
req := &api.GenerateRequest{
Model: m,
Prompt: tt.prompt,
Options: map[string]any{"num_predict": tt.maxTokens, "temperature": 0.1},
}
runGenerateBenchmark(b, ctx, client, req)
}
})
}
}
// setup verifies server and model availability
func setup(b *testing.B) *api.Client {
client, err := api.ClientFromEnvironment()
if err != nil {
b.Fatal(err)
}
if _, err := client.Show(context.Background(), &api.ShowRequest{Model: modelName(b)}); err != nil {
b.Fatalf("Model unavailable: %v", err)
}
return client
}
// warmup ensures the model is loaded and warmed up
func warmup(client *api.Client, model string, prompt string, b *testing.B) {
for range 3 {
err := client.Generate(
context.Background(),
&api.GenerateRequest{
Model: model,
Prompt: prompt,
Options: map[string]interface{}{"num_predict": 50, "temperature": 0.1},
},
func(api.GenerateResponse) error { return nil },
)
if err != nil {
b.Logf("Error during model warm-up: %v", err)
}
}
}
// unload forces model unloading by setting KeepAlive to 0
func unload(client *api.Client, model string, b *testing.B) {
req := &api.GenerateRequest{
Model: model,
KeepAlive: &api.Duration{Duration: 0},
}
if err := client.Generate(context.Background(), req, func(api.GenerateResponse) error { return nil }); err != nil {
b.Logf("Unload error: %v", err)
}
time.Sleep(1 * time.Second)
}

View File

@@ -703,8 +703,6 @@ func showInfo(resp *api.ShowResponse, verbose bool, w io.Writer) error {
for _, k := range keys {
var v string
switch vData := resp.ModelInfo[k].(type) {
case bool:
v = fmt.Sprintf("%t", vData)
case string:
v = vData
case float64:

View File

@@ -87,8 +87,6 @@ func TestShowInfo(t *testing.T) {
ModelInfo: map[string]any{
"general.architecture": "test",
"general.parameter_count": float64(8_000_000_000),
"some.true_bool": true,
"some.false_bool": false,
"test.context_length": float64(1000),
"test.embedding_length": float64(11434),
},
@@ -113,8 +111,6 @@ func TestShowInfo(t *testing.T) {
Metadata
general.architecture test
general.parameter_count 8e+09
some.false_bool false
some.true_bool true
test.context_length 1000
test.embedding_length 11434

View File

@@ -201,7 +201,7 @@ func ConvertModel(fsys fs.FS, ws io.WriteSeeker) error {
case "CohereForCausalLM":
conv = &commandrModel{}
default:
return fmt.Errorf("unsupported architecture %q", p.Architectures[0])
return errors.New("unsupported architecture")
}
if err := json.Unmarshal(bts, conv); err != nil {

View File

@@ -11,9 +11,10 @@ import (
"slices"
"strings"
"github.com/d4l3k/go-bfloat16"
"github.com/x448/float16"
"golang.org/x/exp/maps"
"github.com/ollama/ollama/types/bfloat16"
)
type safetensorMetadata struct {
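For context on the import swap above: bfloat16 can be decoded without unsafe pointers, since a bfloat16 value is just the upper 16 bits of a float32, so a shift plus `math.Float32frombits` suffices. A minimal sketch of the approach the commit message describes (not the vendored package's exact code):

```go
package main

import (
	"fmt"
	"math"
)

// fromBF16 widens a bfloat16 bit pattern into the top half of a float32.
func fromBF16(b uint16) float32 {
	return math.Float32frombits(uint32(b) << 16)
}

func main() {
	fmt.Println(fromBF16(0x3F80)) // 1
	fmt.Println(fromBF16(0x4049)) // 3.140625, bfloat16's nearest value to pi
}
```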

View File

@@ -558,10 +558,6 @@ Final response:
{
"model": "llama3.2",
"created_at": "2023-08-04T19:22:45.499127Z",
"message": {
"role": "assistant",
"content": ""
},
"done": true,
"total_duration": 4883583458,
"load_duration": 1334875,

View File

@@ -1,59 +0,0 @@
# Benchmark
Go benchmark tests that measure end-to-end performance of a running Ollama server. Run these tests to evaluate model inference performance on your hardware and measure the impact of code changes.
## When to use
Run these benchmarks when:
- Making changes to the model inference engine
- Modifying model loading/unloading logic
- Changing prompt processing or token generation code
- Implementing a new model architecture
- Testing performance across different hardware setups
## Prerequisites
- Ollama server running locally with `ollama serve` on `127.0.0.1:11434`
## Usage and Examples
>[!NOTE]
>All commands must be run from the root directory of the Ollama project.
Basic syntax:
```bash
go test -bench=. ./benchmark/... -m $MODEL_NAME
```
Required flags:
- `-bench=.`: Run all benchmarks
- `-m`: Model name to benchmark
Optional flags:
- `-count N`: Number of times to run the benchmark (useful for statistical analysis)
- `-timeout T`: Maximum time for the benchmark to run (e.g. "10m" for 10 minutes)
Common usage patterns:
Single benchmark run with a model specified:
```bash
go test -bench=. ./benchmark/... -m llama3.3
```
## Output metrics
The benchmark reports several key metrics:
- `gen_tok/s`: Generated tokens per second
- `prompt_tok/s`: Prompt processing tokens per second
- `ttft_ms`: Time to first token in milliseconds
- `load_ms`: Model load time in milliseconds
- `gen_tokens`: Total tokens generated
- `prompt_tokens`: Total prompt tokens processed
Each benchmark runs two scenarios:
- Cold start: Model is loaded from disk for each test
- Warm start: Model is pre-loaded in memory
Three prompt lengths are tested for each scenario:
- Short prompt (100 tokens)
- Medium prompt (500 tokens)
- Long prompt (1000 tokens)
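For reference, the throughput figures above are derived from the `api.Metrics` fields on the final streamed response, as in `runGenerateBenchmark` earlier in this commit. A self-contained sketch with made-up metric values:

```go
package main

import (
	"fmt"
	"time"

	"github.com/ollama/ollama/api"
)

func main() {
	// Hypothetical values; real runs take these from the final GenerateResponse.
	m := api.Metrics{
		PromptEvalCount:    512,
		PromptEvalDuration: 250 * time.Millisecond,
		EvalCount:          100,
		EvalDuration:       2 * time.Second,
	}
	fmt.Printf("prompt_tok/s=%.0f gen_tok/s=%.0f\n",
		float64(m.PromptEvalCount)/m.PromptEvalDuration.Seconds(),
		float64(m.EvalCount)/m.EvalDuration.Seconds())
	// prompt_tok/s=2048 gen_tok/s=50
}
```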

go.mod (1 change)
View File

@@ -16,7 +16,6 @@ require (
require (
github.com/agnivade/levenshtein v1.1.1
github.com/d4l3k/go-bfloat16 v0.0.0-20211005043715-690c3bdd05f1
github.com/dlclark/regexp2 v1.11.4
github.com/emirpasic/gods/v2 v2.0.0-alpha
github.com/google/go-cmp v0.6.0

go.sum (2 changes)
View File

@@ -35,8 +35,6 @@ github.com/containerd/console v1.0.3 h1:lIr7SlA5PxZyMV30bDW0MGbiOPXwc63yRuCP0ARu
github.com/containerd/console v1.0.3/go.mod h1:7LqA/THxQ86k76b8c/EMSiaJ3h1eZkMkXar0TQ1gf3U=
github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
github.com/d4l3k/go-bfloat16 v0.0.0-20211005043715-690c3bdd05f1 h1:cBzrdJPAFBsgCrDPnZxlp1dF2+k4r1kVpD7+1S1PVjY=
github.com/d4l3k/go-bfloat16 v0.0.0-20211005043715-690c3bdd05f1/go.mod h1:uw2gLcxEuYUlAd/EXyjc/v55nd3+47YAgWbSXVxPrNI=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=

View File

@@ -1,22 +0,0 @@
//go:build go1.24
package grammar
import "testing"
func BenchmarkFromSchema(b *testing.B) {
for tt := range testCases(b) {
b.Run("", func(b *testing.B) {
s := []byte(tt.schema)
b.ReportAllocs()
for b.Loop() {
_, err := FromSchema(nil, s)
if err != nil {
b.Fatalf("GrammarFromSchema: %v", err)
}
}
})
return
}
}

View File

@@ -1,227 +0,0 @@
package grammar
import (
"bytes"
"encoding/json"
"fmt"
"iter"
"strconv"
"github.com/ollama/ollama/grammar/jsonschema"
)
const jsonTerms = `
# Unicode
#
# Unicode characters can be specified directly in the grammar, for example
# hiragana ::= [ぁ-ゟ], or with escapes: 8-bit (\xXX), 16-bit (\uXXXX) or 32-bit
# (\UXXXXXXXX).
unicode ::= \x{hex}{2} | \u{hex}{4} | \U{hex}{8}
# JSON grammar from RFC 7159
null ::= "null"
object ::= "{" (kv ("," kv)*)? "}"
array ::= "[" (value ("," value)*)? "]"
kv ::= string ":" value
integer ::= "0" | [1-9] [0-9]*
number ::= "-"? integer frac? exp?
frac ::= "." [0-9]+
exp ::= ("e" | "E") ("+" | "-") [0-9]+
string ::= "\"" char* "\""
escape ::= ["/" | "b" | "f" | "n" | "r" | "t" | unicode]
char ::= [^"\\] | escape
space ::= (" " | "\t" | "\n" | "\r")*
hex ::= [0-9] | [a-f] | [A-F]
boolean ::= "true" | "false"
value ::= object | array | string | number | boolean | "null"
# User-defined
`
// FromSchema generates a grammar from a JSON schema.
func FromSchema(buf []byte, jsonSchema []byte) ([]byte, error) {
var s *jsonschema.Schema
if err := json.Unmarshal(jsonSchema, &s); err != nil {
return nil, err
}
var g builder
// "root" is the only rule that is guaranteed to exist, so we start
// with its length for padding, and then adjust it as we go.
g.pad = len("root")
for id := range dependencies("root", s) {
g.pad = max(g.pad, len(id))
}
g.b.WriteString(jsonTerms)
ids := make(map[*jsonschema.Schema]string)
for id, s := range dependencies("root", s) {
ids[s] = id
g.define(id)
if err := fromSchema(&g, ids, s); err != nil {
return nil, err
}
}
g.define("root")
if err := fromSchema(&g, ids, s); err != nil {
return nil, err
}
g.define("") // finalize the last rule
return g.b.Bytes(), nil
}
func fromSchema(g *builder, ids map[*jsonschema.Schema]string, s *jsonschema.Schema) error {
switch typ := s.EffectiveType(); typ {
case "array":
if len(s.PrefixItems) == 0 && s.Items == nil {
g.u("array")
} else {
g.q("[")
for i, s := range s.PrefixItems {
if i > 0 {
g.q(",")
}
g.u(ids[s])
}
if s.Items != nil {
g.u("(")
if len(s.PrefixItems) > 0 {
g.q(",")
}
g.u(ids[s.Items])
g.u(")*")
}
g.q("]")
}
case "object":
if len(s.Properties) == 0 {
g.u("object")
} else {
g.q("{")
for i, p := range s.Properties {
name := ids[p]
if i > 0 {
g.q(",")
}
g.q(p.Name)
g.q(":")
g.u(name)
}
g.q("}")
}
case "number":
buildConstrainedNumber(g, s)
case "string":
if len(s.Enum) == 0 {
g.u("string")
} else {
g.u("(")
for i, e := range s.Enum {
if i > 0 {
g.q("|")
}
g.q(string(e))
}
g.u(")")
}
case "boolean", "value", "null", "integer":
g.u(typ)
default:
return fmt.Errorf("%s: unsupported type %q", s.Name, typ)
}
return nil
}
// dependencies returns a sequence of all child dependencies of the schema in
// post-order.
//
// The first value is the id/pointer to the dependency, and the second value
// is the schema.
func dependencies(id string, s *jsonschema.Schema) iter.Seq2[string, *jsonschema.Schema] {
return func(yield func(string, *jsonschema.Schema) bool) {
for i, p := range s.Properties {
id := fmt.Sprintf("%s_%d", id, i)
for did, d := range dependencies(id, p) {
if !yield(did, d) {
return
}
}
if !yield(id, p) {
return
}
}
for i, p := range s.PrefixItems {
id := fmt.Sprintf("tuple_%d", i)
for did, d := range dependencies(id, p) {
id := fmt.Sprintf("%s_%s", id, did)
if !yield(id, d) {
return
}
}
if !yield(id, p) {
return
}
}
if s.Items != nil {
id := fmt.Sprintf("%s_tuple_%d", id, len(s.PrefixItems))
for did, d := range dependencies(id, s.Items) {
if !yield(did, d) {
return
}
}
if !yield(id, s.Items) {
return
}
}
}
}
type builder struct {
b bytes.Buffer
pad int
rules int
items int
}
// define terminates the current rule, if any, and then either starts a new
// rule or does nothing else if the name is empty.
func (b *builder) define(name string) {
if b.rules > 0 {
b.b.WriteString(";\n")
}
if name == "" {
return
}
fmt.Fprintf(&b.b, "% -*s", b.pad, name)
b.b.WriteString(" ::=")
b.rules++
b.items = 0
}
// q appends a quoted terminal to the current rule.
func (b *builder) q(s string) {
if b.items > 0 {
b.b.WriteString(" ")
}
b.b.WriteString(" ")
b.b.WriteString(strconv.Quote(s))
}
// u appends a non-terminal to the current rule.
func (b *builder) u(s string) {
if b.items > 0 {
b.b.WriteString(" ")
}
b.b.WriteString(" ")
b.b.WriteString(s)
}
func buildConstrainedNumber(b *builder, s *jsonschema.Schema) {
if s.Minimum == 0 && s.Maximum == 0 {
b.u("TODO")
} else {
b.u("number")
}
}
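A minimal usage sketch for the package being removed here, assuming the pre-removal import path `github.com/ollama/ollama/grammar`:

```go
package main

import (
	"fmt"

	"github.com/ollama/ollama/grammar" // removed by this commit
)

func main() {
	schema := []byte(`{"properties": {"n": {"type": "number"}}}`)
	g, err := grammar.FromSchema(nil, schema)
	if err != nil {
		panic(err)
	}
	// Prints jsonTerms followed by the generated rules, e.g.
	//   root_0 ::= number;
	//   root   ::= "{" "n" ":" root_0 "}";
	fmt.Println(string(g))
}
```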

View File

@@ -1,75 +0,0 @@
package grammar
import (
"bufio"
"cmp"
"iter"
"strings"
"testing"
_ "embed"
"github.com/ollama/ollama/grammar/internal/diff"
)
func TestFromSchema(t *testing.T) {
for tt := range testCases(t) {
t.Run(tt.name, func(t *testing.T) {
g, err := FromSchema(nil, []byte(tt.schema))
if err != nil {
t.Fatalf("FromSchema: %v", err)
}
got := string(g)
got = strings.TrimPrefix(got, jsonTerms)
if got != tt.want {
t.Logf("schema:\n%s", tt.schema)
t.Fatal(string(diff.Diff("got", []byte(got), "want", []byte(tt.want))))
}
})
}
}
type testCase struct {
name string
schema string
want string
}
//go:embed testdata/schemas.txt
var tests string
func testCases(t testing.TB) iter.Seq[testCase] {
t.Helper()
return func(yield func(testCase) bool) {
t.Helper()
sc := bufio.NewScanner(strings.NewReader(tests))
name := ""
for sc.Scan() {
line := strings.TrimSpace(sc.Text())
if line == "" {
name = ""
continue
}
if line[0] == '#' {
name = cmp.Or(name, strings.TrimSpace(line[1:]))
continue
}
s := sc.Text()
g := ""
for sc.Scan() {
line = strings.TrimSpace(sc.Text())
if line == "" || line[0] == '#' {
break
}
g += sc.Text() + "\n"
}
if !yield(testCase{name, s, g}) {
return
}
name = strings.TrimSpace(strings.TrimPrefix(line, "#"))
}
if err := sc.Err(); err != nil {
t.Fatalf("error reading tests: %v", err)
}
}
}

View File

@@ -1,261 +0,0 @@
// Copyright 2022 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package diff
import (
"bytes"
"fmt"
"sort"
"strings"
)
// A pair is a pair of values tracked for both the x and y side of a diff.
// It is typically a pair of line indexes.
type pair struct{ x, y int }
// Diff returns an anchored diff of the two texts old and new
// in the “unified diff” format. If old and new are identical,
// Diff returns a nil slice (no output).
//
// Unix diff implementations typically look for a diff with
// the smallest number of lines inserted and removed,
// which can in the worst case take time quadratic in the
// number of lines in the texts. As a result, many implementations
// either can be made to run for a long time or cut off the search
// after a predetermined amount of work.
//
// In contrast, this implementation looks for a diff with the
// smallest number of “unique” lines inserted and removed,
// where unique means a line that appears just once in both old and new.
// We call this an “anchored diff” because the unique lines anchor
// the chosen matching regions. An anchored diff is usually clearer
// than a standard diff, because the algorithm does not try to
// reuse unrelated blank lines or closing braces.
// The algorithm also guarantees to run in O(n log n) time
// instead of the standard O(n²) time.
//
// Some systems call this approach a “patience diff,” named for
// the “patience sorting” algorithm, itself named for a solitaire card game.
// We avoid that name for two reasons. First, the name has been used
// for a few different variants of the algorithm, so it is imprecise.
// Second, the name is frequently interpreted as meaning that you have
// to wait longer (to be patient) for the diff, meaning that it is a slower algorithm,
// when in fact the algorithm is faster than the standard one.
func Diff(oldName string, old []byte, newName string, new []byte) []byte {
if bytes.Equal(old, new) {
return nil
}
x := lines(old)
y := lines(new)
// Print diff header.
var out bytes.Buffer
fmt.Fprintf(&out, "diff %s %s\n", oldName, newName)
fmt.Fprintf(&out, "--- %s\n", oldName)
fmt.Fprintf(&out, "+++ %s\n", newName)
// Loop over matches to consider,
// expanding each match to include surrounding lines,
// and then printing diff chunks.
// To avoid setup/teardown cases outside the loop,
// tgs returns a leading {0,0} and trailing {len(x), len(y)} pair
// in the sequence of matches.
var (
done pair // printed up to x[:done.x] and y[:done.y]
chunk pair // start lines of current chunk
count pair // number of lines from each side in current chunk
ctext []string // lines for current chunk
)
for _, m := range tgs(x, y) {
if m.x < done.x {
// Already handled scanning forward from earlier match.
continue
}
// Expand matching lines as far as possible,
// establishing that x[start.x:end.x] == y[start.y:end.y].
// Note that on the first (or last) iteration we may (or definitely do)
// have an empty match: start.x==end.x and start.y==end.y.
start := m
for start.x > done.x && start.y > done.y && x[start.x-1] == y[start.y-1] {
start.x--
start.y--
}
end := m
for end.x < len(x) && end.y < len(y) && x[end.x] == y[end.y] {
end.x++
end.y++
}
// Emit the mismatched lines before start into this chunk.
// (No effect on first sentinel iteration, when start = {0,0}.)
for _, s := range x[done.x:start.x] {
ctext = append(ctext, "-"+s)
count.x++
}
for _, s := range y[done.y:start.y] {
ctext = append(ctext, "+"+s)
count.y++
}
// If we're not at EOF and have too few common lines,
// the chunk includes all the common lines and continues.
const C = 3 // number of context lines
if (end.x < len(x) || end.y < len(y)) &&
(end.x-start.x < C || (len(ctext) > 0 && end.x-start.x < 2*C)) {
for _, s := range x[start.x:end.x] {
ctext = append(ctext, " "+s)
count.x++
count.y++
}
done = end
continue
}
// End chunk with common lines for context.
if len(ctext) > 0 {
n := end.x - start.x
if n > C {
n = C
}
for _, s := range x[start.x : start.x+n] {
ctext = append(ctext, " "+s)
count.x++
count.y++
}
done = pair{start.x + n, start.y + n}
// Format and emit chunk.
// Convert line numbers to 1-indexed.
// Special case: empty file shows up as 0,0 not 1,0.
if count.x > 0 {
chunk.x++
}
if count.y > 0 {
chunk.y++
}
fmt.Fprintf(&out, "@@ -%d,%d +%d,%d @@\n", chunk.x, count.x, chunk.y, count.y)
for _, s := range ctext {
out.WriteString(s)
}
count.x = 0
count.y = 0
ctext = ctext[:0]
}
// If we reached EOF, we're done.
if end.x >= len(x) && end.y >= len(y) {
break
}
// Otherwise start a new chunk.
chunk = pair{end.x - C, end.y - C}
for _, s := range x[chunk.x:end.x] {
ctext = append(ctext, " "+s)
count.x++
count.y++
}
done = end
}
return out.Bytes()
}
// lines returns the lines in the file x, including newlines.
// If the file does not end in a newline, one is supplied
// along with a warning about the missing newline.
func lines(x []byte) []string {
l := strings.SplitAfter(string(x), "\n")
if l[len(l)-1] == "" {
l = l[:len(l)-1]
} else {
// Treat last line as having a message about the missing newline attached,
// using the same text as BSD/GNU diff (including the leading backslash).
l[len(l)-1] += "\n\\ No newline at end of file\n"
}
return l
}
// tgs returns the pairs of indexes of the longest common subsequence
// of unique lines in x and y, where a unique line is one that appears
// once in x and once in y.
//
// The longest common subsequence algorithm is as described in
// Thomas G. Szymanski, “A Special Case of the Maximal Common
// Subsequence Problem,” Princeton TR #170 (January 1975),
// available at https://research.swtch.com/tgs170.pdf.
func tgs(x, y []string) []pair {
// Count the number of times each string appears in a and b.
// We only care about 0, 1, many, counted as 0, -1, -2
// for the x side and 0, -4, -8 for the y side.
// Using negative numbers now lets us distinguish positive line numbers later.
m := make(map[string]int)
for _, s := range x {
if c := m[s]; c > -2 {
m[s] = c - 1
}
}
for _, s := range y {
if c := m[s]; c > -8 {
m[s] = c - 4
}
}
// Now unique strings can be identified by m[s] = -1+-4.
//
// Gather the indexes of those strings in x and y, building:
// xi[i] = increasing indexes of unique strings in x.
// yi[i] = increasing indexes of unique strings in y.
// inv[i] = index j such that x[xi[i]] = y[yi[j]].
var xi, yi, inv []int
for i, s := range y {
if m[s] == -1+-4 {
m[s] = len(yi)
yi = append(yi, i)
}
}
for i, s := range x {
if j, ok := m[s]; ok && j >= 0 {
xi = append(xi, i)
inv = append(inv, j)
}
}
// Apply Algorithm A from Szymanski's paper.
// In those terms, A = J = inv and B = [0, n).
// We add sentinel pairs {0,0}, and {len(x),len(y)}
// to the returned sequence, to help the processing loop.
J := inv
n := len(xi)
T := make([]int, n)
L := make([]int, n)
for i := range T {
T[i] = n + 1
}
for i := range n {
k := sort.Search(n, func(k int) bool {
return T[k] >= J[i]
})
T[k] = J[i]
L[i] = k + 1
}
k := 0
for _, v := range L {
if k < v {
k = v
}
}
seq := make([]pair, 2+k)
seq[1+k] = pair{len(x), len(y)} // sentinel at end
lastj := n
for i := n - 1; i >= 0; i-- {
if L[i] == k && J[i] < lastj {
seq[k] = pair{xi[i], yi[J[i]]}
k--
}
}
seq[0] = pair{0, 0} // sentinel at start
return seq
}
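A short sketch of calling this vendored `Diff`; note the package lives under `internal/`, so outside the module this is illustrative only:

```go
package main

import (
	"fmt"

	"github.com/ollama/ollama/grammar/internal/diff" // internal; illustrative only
)

func main() {
	old := []byte("a\nb\nc\n")
	new := []byte("a\nB\nc\n")
	// Emits a unified diff anchored on unique lines, or nil if equal.
	fmt.Printf("%s", diff.Diff("old", old, "new", new))
}
```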

View File

@@ -1,44 +0,0 @@
// Copyright 2022 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package diff
import (
"bytes"
"path/filepath"
"testing"
"golang.org/x/tools/txtar"
)
func clean(text []byte) []byte {
text = bytes.ReplaceAll(text, []byte("$\n"), []byte("\n"))
text = bytes.TrimSuffix(text, []byte("^D\n"))
return text
}
func Test(t *testing.T) {
files, _ := filepath.Glob("testdata/*.txt")
if len(files) == 0 {
t.Fatalf("no testdata")
}
for _, file := range files {
t.Run(filepath.Base(file), func(t *testing.T) {
a, err := txtar.ParseFile(file)
if err != nil {
t.Fatal(err)
}
if len(a.Files) != 3 || a.Files[2].Name != "diff" {
t.Fatalf("%s: want three files, third named \"diff\"", file)
}
diffs := Diff(a.Files[0].Name, clean(a.Files[0].Data), a.Files[1].Name, clean(a.Files[1].Data))
want := clean(a.Files[2].Data)
if !bytes.Equal(diffs, want) {
t.Fatalf("%s: have:\n%s\nwant:\n%s\n%s", file,
diffs, want, Diff("have", diffs, "want", want))
}
})
}
}

View File

@@ -1,13 +0,0 @@
-- old --
-- new --
a
b
c
-- diff --
diff old new
--- old
+++ new
@@ -0,0 +1,3 @@
+a
+b
+c

View File

@@ -1,13 +0,0 @@
-- old --
a
b
c
-- new --
-- diff --
diff old new
--- old
+++ new
@@ -1,3 +0,0 @@
-a
-b
-c

View File

@@ -1,35 +0,0 @@
Example from Hunt and McIlroy, “An Algorithm for Differential File Comparison.”
https://www.cs.dartmouth.edu/~doug/diff.pdf
-- old --
a
b
c
d
e
f
g
-- new --
w
a
b
x
y
z
e
-- diff --
diff old new
--- old
+++ new
@@ -1,7 +1,7 @@
+w
a
b
-c
-d
+x
+y
+z
e
-f
-g

View File

@@ -1,40 +0,0 @@
-- old --
a
b
c
d
e
f
-- new --
a
B
C
d
e
f
-- diff --
diff old new
--- old
+++ new
@@ -1,8 +1,8 @@
a
$
-b
-
-c
+B
+
+C
$
d
$

View File

@@ -1,38 +0,0 @@
-- old --
1
2
3
4
5
6
7
eight
nine
ten
eleven
-- new --
1
2
3
4
5
6
7
8
9
10
-- diff --
diff old new
--- old
+++ new
@@ -5,7 +5,6 @@
5
6
7
-eight
-nine
-ten
-eleven
+8
+9
+10

View File

@@ -1,9 +0,0 @@
-- old --
a
b
c^D
-- new --
a
b
c^D
-- diff --

View File

@@ -1,18 +0,0 @@
-- old --
a
b
c
-- new --
a
b
c^D
-- diff --
diff old new
--- old
+++ new
@@ -1,3 +1,3 @@
a
b
-c
+c
\ No newline at end of file

View File

@@ -1,18 +0,0 @@
-- old --
a
b
c^D
-- new --
a
b
c
-- diff --
diff old new
--- old
+++ new
@@ -1,3 +1,3 @@
a
b
-c
\ No newline at end of file
+c

View File

@@ -1,62 +0,0 @@
-- old --
1
2
3
4
5
6
7
8
9
10
11
12
13
14
14½
15
16
17
18
19
20
-- new --
1
2
3
4
5
6
8
9
10
11
12
13
14
17
18
19
20
-- diff --
diff old new
--- old
+++ new
@@ -4,7 +4,6 @@
4
5
6
-7
8
9
10
@@ -12,9 +11,6 @@
12
13
14
-14½
-15
-16
17
18
19

View File

@@ -1,5 +0,0 @@
-- old --
hello world
-- new --
hello world
-- diff --

View File

@@ -1,34 +0,0 @@
-- old --
e
pi
4
5
6
7
8
9
10
-- new --
1
2
3
4
5
6
7
8
9
10
-- diff --
diff old new
--- old
+++ new
@@ -1,5 +1,6 @@
-e
-pi
+1
+2
+3
4
5
6

View File

@@ -1,40 +0,0 @@
Another example from Hunt and McIlroy,
“An Algorithm for Differential File Comparison.”
https://www.cs.dartmouth.edu/~doug/diff.pdf
Anchored diff gives up on finding anything,
since there are no unique lines.
-- old --
a
b
c
a
b
b
a
-- new --
c
a
b
a
b
c
-- diff --
diff old new
--- old
+++ new
@@ -1,7 +1,6 @@
-a
-b
-c
-a
-b
-b
-a
+c
+a
+b
+a
+b
+c

View File

@@ -1,171 +0,0 @@
package jsonschema
import (
"bytes"
"encoding/json"
"errors"
)
// Schema holds a JSON schema.
type Schema struct {
// Name is the name of the property. For the parent/root property, this
// is "root". For child properties, this is the name of the property.
Name string `json:"-"`
// Type is the type of the property.
//
// TODO: Union types (e.g. make this a []string).
Type string
// PrefixItems is a list of schemas for each item in a tuple. By
// default, the tuple is "closed" unless Items is set to true or a
// valid Schema.
PrefixItems []*Schema
// Items is the schema for each item in a list.
//
// If it is missing, or its JSON value is "null" or "false", it is nil.
// If the JSON value is "true", it is set to the empty Schema. If the
// JSON value is an object, it will be decoded as a Schema.
Items *Schema
// MinItems specifies the minimum number of items allowed in a list.
MinItems int
// MaxItems specifies the maximum number of items allowed in a list.
MaxItems int
// Properties is the schema for each property of an object.
Properties []*Schema
// Format is the format of the property. This is used to validate the
// property against a specific format.
//
// It is the caller's responsibility to validate the property against
// the format.
Format string
// Minimum specifies the minimum value for numeric properties.
Minimum float64
// Maximum specifies the maximum value for numeric properties.
Maximum float64
// Enum is a list of valid values for the property.
Enum []json.RawMessage
}
func (s *Schema) UnmarshalJSON(data []byte) error {
type S Schema
w := struct {
Properties props
Items items
*S
}{
S: (*S)(s),
}
if err := json.Unmarshal(data, &w); err != nil {
return err
}
if w.Items.set {
s.Items = &w.Items.Schema
}
s.Properties = w.Properties
return nil
}
type items struct {
Schema
set bool
}
func (s *items) UnmarshalJSON(data []byte) error {
switch b := data[0]; b {
case 't':
*s = items{set: true}
case '{':
type I items
if err := json.Unmarshal(data, (*I)(s)); err != nil {
return err
}
s.set = true
case 'n', 'f':
default:
return errors.New("invalid Items")
}
return nil
}
// EffectiveType returns the effective type of the schema. If the Type field is
// not empty, it is returned; otherwise:
//
// - If the schema has both Properties and Items, it returns "object".
// - If the schema has Properties, it returns "object".
// - If the schema has Items, it returns "array".
// - If the schema has neither Properties nor Items, it returns "value".
//
// The returned string is never empty.
func (d *Schema) EffectiveType() string {
if d.Type == "" {
if len(d.Properties) > 0 {
return "object"
}
if len(d.PrefixItems) > 0 || d.Items != nil {
return "array"
}
return "value"
}
return d.Type
}
// props is an ordered list of properties. The order of the properties
// is the order in which they were defined in the schema.
type props []*Schema
var _ json.Unmarshaler = (*props)(nil)
func (v *props) UnmarshalJSON(data []byte) error {
if len(data) == 0 {
return nil
}
if data[0] != '{' {
return errors.New("expected object")
}
d := json.NewDecoder(bytes.NewReader(data))
// TODO(bmizerany): Consider DisallowUnknownFields. Currently, we, like
// llama.cpp, ignore unknown fields, which could lead to unexpected
// behavior for clients of this package, since they may not be aware
// that "additionalFields", "itemsPrefix", etc, are being ignored.
//
// For now, just do what llama.cpp does.
t, err := d.Token()
if err != nil {
return err
}
if t != json.Delim('{') {
return errors.New("expected object")
}
for d.More() {
// Use the first token (map key) as the property name, then
// decode the rest of the object fields into a Schema and
// append.
t, err := d.Token()
if err != nil {
return err
}
if t == json.Delim('}') {
return nil
}
s := &Schema{
Name: t.(string),
}
if err := d.Decode(s); err != nil {
return err
}
*v = append(*v, s)
}
return nil
}
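A small sketch of what the custom unmarshalling above buys: property order is preserved, and `"items": true` becomes an empty `*Schema` (import path as before this commit's removal):

```go
package main

import (
	"encoding/json"
	"fmt"

	"github.com/ollama/ollama/grammar/jsonschema" // removed by this commit
)

func main() {
	data := []byte(`{"properties": {"b": {"type": "string"}, "a": {"items": true}}}`)
	var s *jsonschema.Schema
	if err := json.Unmarshal(data, &s); err != nil {
		panic(err)
	}
	for _, p := range s.Properties {
		fmt.Println(p.Name, p.EffectiveType(), p.Items != nil)
	}
	// Output (declaration order kept):
	//   b string false
	//   a array true
}
```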

View File

@@ -1,104 +0,0 @@
package jsonschema
import (
"encoding/json"
"reflect"
"strings"
"testing"
"github.com/google/go-cmp/cmp"
)
const testSchemaBasic = `
{
"properties": {
"tupleClosedEmpty": { "prefixItems": [] },
"tupleClosedMissing": { "prefixItems": [{}] },
"tupleClosedNull": { "prefixItems": [{}], "items": null },
"tupleClosedFalse": { "prefixItems": [{}], "items": false },
"tupleOpenTrue": { "prefixItems": [{}], "items": true },
"tupleOpenEmpty": { "prefixItems": [{}], "items": {} },
"tupleOpenTyped": { "prefixItems": [{}], "items": {"type": "boolean"} },
"tupleOpenMax": { "prefixItems": [{}], "items": true, "maxItems": 3},
"array": { "items": {"type": "number"} },
"null": { "type": "null" },
"string": { "type": "string" },
"boolean": { "type": "boolean" }
}
}
`
func TestSchemaUnmarshal(t *testing.T) {
var got *Schema
if err := json.Unmarshal([]byte(testSchemaBasic), &got); err != nil {
t.Fatalf("Unmarshal: %v", err)
}
want := &Schema{
Properties: []*Schema{
{Name: "tupleClosedEmpty", PrefixItems: []*Schema{}, Items: nil},
{Name: "tupleClosedMissing", PrefixItems: []*Schema{{}}, Items: nil},
{Name: "tupleClosedNull", PrefixItems: []*Schema{{}}, Items: nil},
{Name: "tupleClosedFalse", PrefixItems: []*Schema{{}}, Items: nil},
{Name: "tupleOpenTrue", PrefixItems: []*Schema{{}}, Items: &Schema{}},
{Name: "tupleOpenEmpty", PrefixItems: []*Schema{{}}, Items: &Schema{}},
{Name: "tupleOpenTyped", PrefixItems: []*Schema{{}}, Items: &Schema{Type: "boolean"}},
{Name: "tupleOpenMax", PrefixItems: []*Schema{{}}, Items: &Schema{}, MaxItems: 3},
{Name: "array", Items: &Schema{Type: "number"}},
{Name: "null", Type: "null"},
{Name: "string", Type: "string"},
{Name: "boolean", Type: "boolean"},
},
}
if diff := cmp.Diff(want, got); diff != "" {
t.Errorf("(-want, +got)\n%s", diff)
}
}
func TestEffectiveType(t *testing.T) {
const schema = `
{"properties": {
"o": {"type": "object"},
"a": {"type": "array"},
"n": {"type": "number"},
"s": {"type": "string"},
"z": {"type": "null"},
"b": {"type": "boolean"},
"t0": {"prefixItems": [{}], "items": {"type": "number"}},
"t1": {"items": {"type": "number"}, "maxItems": 3},
"v": {"maxItems": 3}
}}
`
var s *Schema
if err := json.Unmarshal([]byte(schema), &s); err != nil {
t.Fatalf("json.Unmarshal: %v", err)
}
var got []string
for _, p := range s.Properties {
got = append(got, p.EffectiveType())
}
want := strings.Fields(`
object
array
number
string
null
boolean
array
array
value
`)
if !reflect.DeepEqual(want, got) {
t.Errorf("\ngot:\n\t%v\nwant:\n\t%v", got, want)
}
}

View File

@@ -1,76 +0,0 @@
# This file holds tests for JSON schema to EBNF grammar conversions.
#
# The format is a JSON schema, followed by the expected EBNF grammar. Each test
# MAY be preceded by a comment that describes the test (e.g. the test name), followed by
# the JSON schema and the expected EBNF grammar. If no comment is present, the test
# name is the test's number in the file (e.g. "#0", "#1", etc.).
#
# Blank lines signify the end or start of a new test. Comments can be added
# anywhere in the file, but they must be preceded by a '#' character and start at
# the beginning of the line.
# default
{}
root ::= value;
{"properties": {}}
root ::= value;
# array
{"properties": {"a": {"type": "array", "items": {"type": "string"}}}}
root_0_tuple_0 ::= string;
root_0 ::= "[" ( root_0_tuple_0 )* "]";
root ::= "{" "a" ":" root_0 "}";
# array with nested array
{"type": "array", "items": {"type": "array", "items": {"type": "string"}}}
root_tuple_0_tuple_0 ::= string;
root_tuple_0 ::= "[" ( root_tuple_0_tuple_0 )* "]";
root ::= "[" ( root_tuple_0 )* "]";
# object
{"properties": {"e": {}}}
root_0 ::= value;
root ::= "{" "e" ":" root_0 "}";
# object with nested object
{"properties": {"o": {"type": "object", "properties": {"e": {}}}}}
root_0_0 ::= value;
root_0 ::= "{" "e" ":" root_0_0 "}";
root ::= "{" "o" ":" root_0 "}";
# boolean
{"type": "boolean"}
root ::= boolean;
# number
{"properties": {"n": {"type": "number", "minimum": 123, "maximum": 4567}}}
root_0 ::= number;
root ::= "{" "n" ":" root_0 "}";
# string
{"type": "string"}
root ::= string;
# string with enum
{"type": "string", "enum": ["a", "b", "c"]}
root ::= ( "\"a\"" "|" "\"b\"" "|" "\"c\"" );
# spaces in key
{"properties": {"a b": {}}}
root_0 ::= value;
root ::= "{" "a b" ":" root_0 "}";
# issue7978
{ "type": "object", "properties": { "steps": { "type": "array", "items": { "type": "object", "properties": { "explanation": { "type": "string" }, "output": { "type": "string" } }, "required": [ "explanation", "output" ], "additionalProperties": false } }, "final_answer": { "type": "string" } }, "required": [ "steps", "final_answer" ], "additionalProperties": false }
root_0_tuple_0_0 ::= string;
root_0_tuple_0_1 ::= string;
root_0_tuple_0 ::= "{" "explanation" ":" root_0_tuple_0_0 "," "output" ":" root_0_tuple_0_1 "}";
root_0 ::= "[" ( root_0_tuple_0 )* "]";
root_1 ::= string;
root ::= "{" "steps" ":" root_0 "," "final_answer" ":" root_1 "}";
# !! # special characters in key
# !! {"properties": {"a!b": {}}}
# !! !invalid character '!' in key
# !!

View File

@@ -43,13 +43,8 @@ type Cache interface {
// ** cache management **
// Init sets up runtime parameters.
// backend: Used to allocate cache data storage and execute management operations (such as defrag)
// dtype: The data type for storing cache entries
// maxSequences: The maximum number of sequences stored in the cache - across all batches
// capacity: The number of cache entries to store, per sequence
// maxBatch: The maximum number of tokens that can occur in a single batch
Init(backend ml.Backend, dtype ml.DType, maxSequences, capacity, maxBatch int)
// Init sets up runtime parameters
Init(backend ml.Backend, dtype ml.DType, capacity int32)
// Close closes the cache and frees resources associated with it
Close()
@@ -57,7 +52,7 @@ type Cache interface {
// StartForward is called before the start of the model's forward pass.
// For each token in the coming batch, there must be a corresponding
// entry in positions and seqs.
StartForward(ctx ml.Context, batch input.Batch) error
StartForward(ctx ml.Context, opts input.Options) error
// CopyPrefix copies tokens in the range [0, len) from srcSeq to dstSeq
CopyPrefix(srcSeq, dstSeq int, len int32)
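To make the signature change concrete: the pre-revert Init multiplies out per-sequence capacity inside the cache, while the reverted Init takes one total capacity (see the Causal.Init bodies in the next file). A tiny sketch of the arithmetic, with hypothetical values:

```go
package main

import "fmt"

func main() {
	maxSequences, numCtx := 4, 2048
	// Pre-revert: the cache computes total cells per sequence itself.
	fmt.Println("cells:", maxSequences*numCtx) // cells: 8192
	// Reverted: the caller passes one total capacity, padded by the cache.
	fmt.Println("capacity:", int32(numCtx)) // capacity: 2048
}
```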

View File

@@ -20,6 +20,7 @@ type shiftFn func(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, e
// The mask is of shape history size, batch size
type Causal struct {
DType ml.DType
Capacity int32
windowSize int32
opts CausalOptions
@@ -97,7 +98,7 @@ func NewSWACache(windowSize int32, shift shiftFn) *Causal {
}
}
func (c *Causal) Init(backend ml.Backend, dtype ml.DType, maxSequences, capacity, maxBatch int) {
func (c *Causal) Init(backend ml.Backend, dtype ml.DType, capacity int32) {
if c.config == nil {
var config ml.CacheConfig
if cc, ok := backend.(ml.BackendCacheConfig); ok {
@@ -118,16 +119,9 @@ func (c *Causal) Init(backend ml.Backend, dtype ml.DType, maxSequences, capacity
c.config.MaskDType = ml.DTypeF32
}
var cacheSize int
if c.windowSize == math.MaxInt32 || capacity < int(c.windowSize)+maxBatch {
cacheSize = maxSequences * capacity
} else {
cacheSize = maxSequences * (int(c.windowSize) + maxBatch)
}
cacheSize = roundUp(cacheSize, c.config.CachePadding)
c.cells = make([]cacheCell, cacheSize)
c.DType = dtype
c.Capacity = int32(roundUp(int(capacity), c.config.CachePadding))
c.cells = make([]cacheCell, c.Capacity)
c.cellRanges = make(map[int]cellRange)
c.backend = backend
}
@@ -146,14 +140,12 @@ func (c *Causal) Close() {
}
}
func (c *Causal) StartForward(ctx ml.Context, batch input.Batch) error {
c.curBatchSize = len(batch.Positions)
c.curSequences = batch.Sequences
c.curPositions = batch.Positions
func (c *Causal) StartForward(ctx ml.Context, opts input.Options) error {
c.curBatchSize = len(opts.Positions)
c.curSequences = opts.Sequences
c.curPositions = opts.Positions
c.opts.Except = nil
c.updateSlidingWindow()
var err error
c.curLoc, err = c.findStartLoc()
if errors.Is(err, ErrKvCacheFull) {
@@ -165,8 +157,8 @@ func (c *Causal) StartForward(ctx ml.Context, batch input.Batch) error {
}
c.curCellRange = newRange()
for i, pos := range batch.Positions {
seq := batch.Sequences[i]
for i, pos := range opts.Positions {
seq := opts.Sequences[i]
c.cells[c.curLoc+i] = cacheCell{pos: pos, sequences: []int{seq}}
@@ -218,51 +210,7 @@ func (c *Causal) findStartLoc() (int, error) {
}
}
return 0, fmt.Errorf("%w (length: %v)", ErrKvCacheFull, len(c.cells))
}
func (c *Causal) updateSlidingWindow() {
if c.windowSize == math.MaxInt32 {
return
}
// create a map of unique sequences to the lowest position in that sequence
lowestPos := make(map[int]int32)
for i := range c.curPositions {
seq := c.curSequences[i]
pos, ok := lowestPos[seq]
if !ok {
pos = c.curPositions[i]
} else if c.curPositions[i] < pos {
pos = c.curPositions[i]
}
lowestPos[seq] = pos
}
// delete any entries that are beyond the window of the oldest position in the sequence
for seq, pos := range lowestPos {
oldRange, ok := c.cellRanges[seq]
if !ok {
continue
}
newRange := newRange()
for i := oldRange.min; i <= oldRange.max; i++ {
if slices.Contains(c.cells[i].sequences, seq) {
if c.cells[i].pos < pos-c.windowSize {
c.cells[i].sequences = slices.DeleteFunc(c.cells[i].sequences, func(s int) bool { return s == seq })
} else {
newRange.min = min(newRange.min, i)
newRange.max = max(newRange.max, i)
}
}
}
c.cellRanges[seq] = newRange
}
return 0, fmt.Errorf("%w (length: %v)", ErrKvCacheFull, c.Capacity)
}
func roundDown(length, pad int) int {
@@ -317,7 +265,7 @@ func (c *Causal) buildMask(ctx ml.Context) (ml.Tensor, error) {
return maskTensor, nil
}
func (c *Causal) moveCells(ctx ml.Context, src, dst, length int) {
func (c *Causal) moveCells(ctx ml.Context, src, dst, len int) {
for i, key := range c.keys {
if key == nil {
continue
@@ -327,8 +275,8 @@ func (c *Causal) moveCells(ctx ml.Context, src, dst, length int) {
numKVHeads := key.Dim(1)
rowSize := key.Stride(2)
kSrcView := key.View(ctx, rowSize*src, kHeadDim*numKVHeads*length)
kDstView := key.View(ctx, rowSize*dst, kHeadDim*numKVHeads*length)
kSrcView := key.View(ctx, rowSize*src, kHeadDim*numKVHeads*len)
kDstView := key.View(ctx, rowSize*dst, kHeadDim*numKVHeads*len)
value := c.values[i]
var vSrcView, vDstView ml.Tensor
@@ -336,14 +284,14 @@ func (c *Causal) moveCells(ctx ml.Context, src, dst, length int) {
vHeadDim := value.Dim(1)
elemSize := value.Stride(0)
vSrcView = value.View(ctx, elemSize*src, length, len(c.cells)*elemSize, vHeadDim*numKVHeads)
vDstView = value.View(ctx, elemSize*dst, length, len(c.cells)*elemSize, vHeadDim*numKVHeads)
vSrcView = value.View(ctx, elemSize*src, len, int(c.Capacity)*elemSize, vHeadDim*numKVHeads)
vDstView = value.View(ctx, elemSize*dst, len, int(c.Capacity)*elemSize, vHeadDim*numKVHeads)
} else {
vHeadDim := value.Dim(0)
rowSize := value.Stride(2)
vSrcView = value.View(ctx, rowSize*src, vHeadDim*numKVHeads*length)
vDstView = value.View(ctx, rowSize*dst, vHeadDim*numKVHeads*length)
vSrcView = value.View(ctx, rowSize*src, vHeadDim*numKVHeads*len)
vDstView = value.View(ctx, rowSize*dst, vHeadDim*numKVHeads*len)
}
ctx.Forward(
@@ -373,8 +321,7 @@ func (c *Causal) defrag() {
ctx := c.backend.NewContext()
// For every move, 6 tensors are required per layer (2 views and a
// copy for each of k and v). We also need to refer to the original
// k and v cache tensors - once per layer, not per move.
// copy for each of k and v).
layers := 0
for _, key := range c.keys {
if key == nil {
@@ -383,7 +330,7 @@ func (c *Causal) defrag() {
layers++
}
maxMoves := (ctx.MaxGraphNodes() - 2*layers) / (6 * layers)
maxMoves := ctx.MaxGraphNodes() / (6 * layers)
moves := 0
var pendingSrc, pendingDst, pendingLen int
@@ -532,14 +479,14 @@ func (c *Causal) Put(ctx ml.Context, key, value ml.Tensor) {
}
if _, ok := c.keys[c.curLayer]; !ok {
c.keys[c.curLayer] = c.ctxs[c.curLayer].Zeros(c.DType, kHeadDim, numKVHeads, len(c.cells))
c.keys[c.curLayer] = c.ctxs[c.curLayer].Zeros(c.DType, kHeadDim, numKVHeads, int(c.Capacity))
}
if _, ok := c.values[c.curLayer]; !ok {
if c.config.PermutedV {
c.values[c.curLayer] = c.ctxs[c.curLayer].Zeros(c.DType, len(c.cells), vHeadDim, numKVHeads)
c.values[c.curLayer] = c.ctxs[c.curLayer].Zeros(c.DType, int(c.Capacity), vHeadDim, numKVHeads)
} else {
c.values[c.curLayer] = c.ctxs[c.curLayer].Zeros(c.DType, vHeadDim, numKVHeads, len(c.cells))
c.values[c.curLayer] = c.ctxs[c.curLayer].Zeros(c.DType, vHeadDim, numKVHeads, int(c.Capacity))
}
}
@@ -550,7 +497,7 @@ func (c *Causal) Put(ctx ml.Context, key, value ml.Tensor) {
elemSize := c.values[c.curLayer].Stride(0)
value = value.Permute(ctx, 1, 2, 0, 3)
ctx.Forward(value.Copy(ctx, c.values[c.curLayer].View(ctx, elemSize*c.curLoc, batchSize, len(c.cells)*elemSize, vHeadDim*numKVHeads)))
ctx.Forward(value.Copy(ctx, c.values[c.curLayer].View(ctx, elemSize*c.curLoc, batchSize, int(c.Capacity)*elemSize, vHeadDim*numKVHeads)))
} else {
rowSize := c.values[c.curLayer].Stride(2)
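The removed `updateSlidingWindow` implements a simple eviction rule: any cached cell whose position trails the lowest position in the incoming batch by more than the window is dropped from that sequence. A standalone toy of the same rule:

```go
package main

import "fmt"

func main() {
	const windowSize int32 = 3
	cached := []int32{0, 1, 2, 3, 4, 5} // positions held for one sequence
	lowestIncoming := int32(6)          // lowest position in the new batch
	var kept []int32
	for _, pos := range cached {
		if pos >= lowestIncoming-windowSize { // cells older than this are evicted
			kept = append(kept, pos)
		}
	}
	fmt.Println(kept) // [3 4 5]
}
```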

View File

@@ -25,7 +25,7 @@ func TestStore(t *testing.T) {
cache := NewCausalCache(nil)
defer cache.Close()
cache.Init(backend, ml.DTypeF16, 1, 16, 16)
cache.Init(backend, ml.DTypeF16, 16)
tests := []testCase{
{
@@ -58,11 +58,11 @@ func TestSWA(t *testing.T) {
cache := NewSWACache(1, nil)
defer cache.Close()
cache.Init(backend, ml.DTypeF16, 1, 16, 16)
cache.Init(backend, ml.DTypeF32, 16)
tests := []testCase{
{
name: "FirstBatch",
name: "SlidingWindow",
in: []float32{1, 2, 3, 4},
inShape: []int{1, 1, 4},
seqs: []int{0, 0, 0, 0},
@@ -71,16 +71,6 @@ func TestSWA(t *testing.T) {
expectedShape: []int{1, 1, 4},
expectedMask: []float32{0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0},
},
{
name: "SecondBatch",
in: []float32{5, 6},
inShape: []int{1, 1, 2},
seqs: []int{0, 0},
pos: []int32{4, 5},
expected: []float32{5, 6, 3, 4},
expectedShape: []int{1, 1, 4},
expectedMask: []float32{0, float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1))},
},
}
testCache(t, backend, cache, tests)
@@ -91,7 +81,7 @@ func TestSequences(t *testing.T) {
cache := NewCausalCache(nil)
defer cache.Close()
cache.Init(backend, ml.DTypeF16, 1, 16, 16)
cache.Init(backend, ml.DTypeF16, 16)
tests := []testCase{
{
@@ -126,7 +116,7 @@ func TestRemove(t *testing.T) {
})
defer cache.Close()
cache.Init(backend, ml.DTypeF16, 1, 16, 16)
cache.Init(backend, ml.DTypeF16, 16)
tests := []testCase{
{
@@ -191,7 +181,7 @@ func TestDefrag(t *testing.T) {
})
defer cache.Close()
cache.Init(backend, ml.DTypeF16, 1, 16, 16)
cache.Init(backend, ml.DTypeF16, 16)
tests := []testCase{
{
@@ -239,7 +229,7 @@ func TestCopy(t *testing.T) {
cache := NewCausalCache(func(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) { return key, nil })
defer cache.Close()
cache.Init(backend, ml.DTypeF16, 1, 16, 16)
cache.Init(backend, ml.DTypeF16, 16)
tests := []testCase{
{
@@ -280,7 +270,7 @@ func testCache(t *testing.T, backend ml.Backend, cache Cache, tests []testCase)
context := backend.NewContext()
defer context.Close()
err := cache.StartForward(context, input.Batch{Positions: test.pos, Sequences: test.seqs})
err := cache.StartForward(context, input.Options{Positions: test.pos, Sequences: test.seqs})
if err != nil {
panic(err)
}

View File

@@ -49,7 +49,7 @@ func NewEncoderCache() *EncoderCache {
}
}
func (c *EncoderCache) Init(backend ml.Backend, dtype ml.DType, maxSequences, capacity, maxBatch int) {
func (c *EncoderCache) Init(backend ml.Backend, dtype ml.DType, capacity int32) {
if c.config == nil {
var config ml.CacheConfig
if cc, ok := backend.(ml.BackendCacheConfig); ok {
@@ -58,10 +58,6 @@ func (c *EncoderCache) Init(backend ml.Backend, dtype ml.DType, maxSequences, ca
c.config = &config
}
if maxSequences > 1 {
panic(fmt.Errorf("encoder cache does not support multiple sequences; requested: %v", maxSequences))
}
if c.config.CachePadding != 0 && c.config.CachePadding != 1 {
panic(fmt.Errorf("encoder cache is unable to enforce requested CachePadding (%v)", c.config.CachePadding))
}
@@ -83,10 +79,10 @@ func (c *EncoderCache) Close() {
}
}
func (c *EncoderCache) StartForward(ctx ml.Context, batch input.Batch) error {
func (c *EncoderCache) StartForward(ctx ml.Context, opts input.Options) error {
// We work with the most recent image
if len(batch.Multimodal) > 0 {
c.curPos = batch.Positions[batch.Multimodal[len(batch.Multimodal)-1].Index]
if len(opts.Multimodal) > 0 {
c.curPos = opts.Positions[opts.Multimodal[len(opts.Multimodal)-1].Index]
}
return nil

View File

@@ -23,9 +23,9 @@ func NewWrapperCache(caches ...Cache) *WrapperCache {
}
}
func (c *WrapperCache) Init(backend ml.Backend, dtype ml.DType, maxSequences, capacity, maxBatch int) {
func (c *WrapperCache) Init(backend ml.Backend, dtype ml.DType, capacity int32) {
for _, cache := range c.caches {
cache.Init(backend, dtype, maxSequences, capacity, maxBatch)
cache.Init(backend, dtype, capacity)
}
}
@@ -41,14 +41,14 @@ func (c *WrapperCache) Close() {
}
}
func (c *WrapperCache) StartForward(ctx ml.Context, batch input.Batch) error {
func (c *WrapperCache) StartForward(ctx ml.Context, opts input.Options) error {
for i, cache := range c.caches {
err := cache.StartForward(ctx, batch)
err := cache.StartForward(ctx, opts)
if err != nil {
// unwind on error - Remove with endIndex set to math.MaxInt32 does not fail
for j := i - 1; j >= 0; j-- {
for k := range batch.Positions {
_ = c.caches[j].Remove(batch.Sequences[k], batch.Positions[k], math.MaxInt32)
for k := range opts.Positions {
_ = c.caches[j].Remove(opts.Sequences[k], opts.Positions[k], math.MaxInt32)
}
}
return err

View File

@@ -29,7 +29,6 @@ import (
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/grammar"
"github.com/ollama/ollama/llama"
"github.com/ollama/ollama/model"
)
@@ -701,9 +700,9 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
}
// User provided a JSON schema
g, err := grammar.FromSchema(nil, req.Format)
if err != nil {
return fmt.Errorf("invalid JSON schema in format: %w", err)
g := llama.SchemaToGrammar(req.Format)
if g == nil {
return fmt.Errorf("invalid JSON schema in format")
}
req.Grammar = string(g)
}
@@ -714,11 +713,6 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
req.Options = &opts
}
if req.Options == nil {
opts := api.DefaultOptions()
req.Options = &opts
}
if err := s.sem.Acquire(ctx, 1); err != nil {
if errors.Is(err, context.Canceled) {
slog.Info("aborting completion request due to client closing the connection")
@@ -733,6 +727,7 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
if req.Options.NumPredict < 0 || req.Options.NumPredict > 10*s.options.NumCtx {
req.Options.NumPredict = 10 * s.options.NumCtx
}
// Make sure the server is ready
status, err := s.getServerStatusRetry(ctx)
if err != nil {

View File

@@ -2,7 +2,6 @@ package ml
import (
"bytes"
"context"
"encoding/binary"
"fmt"
"os"
@@ -61,10 +60,6 @@ type CacheConfig struct {
// BackendParams controls how the backend loads and executes models
type BackendParams struct {
// Progress is a callback function that allows reporting percentage completion
// of model loading
Progress func(float32)
// NumThreads sets the number of threads to use if running on the CPU
NumThreads int
@@ -81,9 +76,9 @@ type BackendParams struct {
FlashAttention bool
}
var backends = make(map[string]func(context.Context, *os.File, BackendParams) (Backend, error))
var backends = make(map[string]func(*os.File, BackendParams) (Backend, error))
func RegisterBackend(name string, f func(context.Context, *os.File, BackendParams) (Backend, error)) {
func RegisterBackend(name string, f func(*os.File, BackendParams) (Backend, error)) {
if _, ok := backends[name]; ok {
panic("backend: backend already registered")
}
@@ -91,9 +86,9 @@ func RegisterBackend(name string, f func(context.Context, *os.File, BackendParam
backends[name] = f
}
func NewBackend(ctx context.Context, f *os.File, params BackendParams) (Backend, error) {
func NewBackend(f *os.File, params BackendParams) (Backend, error) {
if backend, ok := backends["ggml"]; ok {
return backend(ctx, f, params)
return backend(f, params)
}
return nil, fmt.Errorf("unsupported backend")

View File

@@ -9,17 +9,15 @@ package ggml
import "C"
import (
"context"
"errors"
"fmt"
"io"
"log/slog"
"maps"
"os"
"runtime"
"slices"
"strconv"
"strings"
"sync/atomic"
"unicode"
"unsafe"
@@ -60,7 +58,7 @@ type Backend struct {
maxGraphNodes int
}
func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend, error) {
func New(r *os.File, params ml.BackendParams) (ml.Backend, error) {
meta, n, err := fs.Decode(r, -1)
if err != nil {
return nil, err
@@ -299,16 +297,12 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
}
}
var doneBytes atomic.Uint64
totalBytes := uint64(n) - meta.Tensors().Offset
g, ctx := errgroup.WithContext(ctx)
g.SetLimit(runtime.GOMAXPROCS(0))
// concurrently read in tensor data. uses a section reader which is safe for concurrent reads
sr := io.NewSectionReader(r, int64(meta.Tensors().Offset), n-int64(meta.Tensors().Offset))
var g errgroup.Group
for _, t := range meta.Tensors().Items() {
g.Go(func() error {
tts := make([]*C.struct_ggml_tensor, max(1, len(targets[t.Name])))
for i := range tts {
target := targets[t.Name][i]
for _, target := range targets[t.Name] {
g.Go(func() error {
if target == "" {
target = t.Name
}
@@ -318,44 +312,25 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
return fmt.Errorf("unassigned tensor: %s", t.Name)
}
tts[i] = tt
}
bts := C.malloc(C.size_t(t.Size()))
if bts == nil {
return errors.New("failed to allocate tensor buffer")
}
defer C.free(bts)
sr := io.NewSectionReader(r, int64(meta.Tensors().Offset+t.Offset), int64(t.Size()))
bts := make([]byte, 128*format.KibiByte)
var s uint64
for s < t.Size() {
n, err := io.ReadFull(sr, bts[:min(len(bts), int(t.Size()-s))])
if err != nil {
return err
buf := unsafe.Slice((*byte)(bts), t.Size())
n, err := io.ReadFull(io.NewSectionReader(sr, int64(t.Offset), int64(t.Size())), buf)
if err != nil || n != len(buf) {
return errors.New("read failed")
}
for _, tt := range tts {
C.ggml_backend_tensor_set(tt, unsafe.Pointer(&bts[0]), C.size_t(s), C.size_t(n))
}
s += uint64(n)
if params.Progress != nil {
done := doneBytes.Add(uint64(n))
params.Progress(float32(done) / float32(totalBytes))
}
}
return nil
})
C.ggml_backend_tensor_set(tt, bts, 0, C.size_t(t.Size()))
return nil
})
}
}
// start a goroutine to cancel the errgroup if the parent context is done
go func() {
<-ctx.Done()
g.Go(func() error {
return ctx.Err()
})
}()
if err := g.Wait(); err != nil {
if g.Wait() != nil {
return nil, err
}
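The pre-revert loader streams tensor data concurrently through `io.SectionReader`, which is safe for parallel use because every read carries its own offset. A minimal sketch of that pattern (file name and byte ranges are hypothetical):

```go
package main

import (
	"fmt"
	"io"
	"os"

	"golang.org/x/sync/errgroup"
)

func main() {
	f, err := os.Open("model.gguf") // hypothetical input
	if err != nil {
		panic(err)
	}
	defer f.Close()

	var g errgroup.Group
	for _, r := range []struct{ off, size int64 }{{0, 16}, {16, 16}} {
		g.Go(func() error {
			buf := make([]byte, r.size)
			_, err := io.ReadFull(io.NewSectionReader(f, r.off, r.size), buf)
			return err // each goroutine reads its own disjoint range
		})
	}
	if err := g.Wait(); err != nil {
		panic(err)
	}
	fmt.Println("all sections read")
}
```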

View File

@@ -1,7 +1,5 @@
package input
import "github.com/ollama/ollama/ml"
// Input represents one token in the input stream
type Input struct {
// Token is a single element of text.
@@ -35,24 +33,11 @@ type MultimodalIndex struct {
Multimodal any
}
// Batch contains the inputs for a model forward pass
type Batch struct {
// Inputs is the input tokens, including placeholders for multimodal inputs.
Inputs ml.Tensor
// Multimodal is a set of multimodal embeddings previously created by
// EncodeMultimodal, along with an index into Inputs. Unused for text-only
// models or for batches without multimodal elements.
// Options contains the inputs for a model forward pass
type Options struct {
Inputs []int32
Multimodal []MultimodalIndex
// Positions is the position for each Input, relative to its sequence. Equal
// in length to Inputs.
Positions []int32
// Sequences is the sequence for each Input. Equal in length to Inputs.
Sequences []int
// Outputs are the set of indices into Inputs for which output data should
// be returned.
Outputs []int32
Positions []int32
Sequences []int
Outputs []int32
}
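A construction sketch for the reverted `Options` struct, one entry per token as the field comments describe (values are illustrative):

```go
package main

import (
	"fmt"

	"github.com/ollama/ollama/model/input"
)

func main() {
	opts := input.Options{
		Inputs:    []int32{101, 102, 103}, // token ids; multimodal uses placeholders
		Positions: []int32{0, 1, 2},       // position of each input in its sequence
		Sequences: []int{0, 0, 0},         // sequence id per input
		Outputs:   []int32{2},             // indices whose logits should be returned
	}
	fmt.Println(len(opts.Positions) == len(opts.Sequences)) // true; Forward checks this
}
```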

View File

@@ -1,7 +1,6 @@
package model
import (
"context"
"errors"
"fmt"
_ "image/jpeg"
@@ -27,7 +26,7 @@ var ErrNoVisionModel = errors.New("this model is missing data required for image
// Model implements a specific model architecture, defining the forward pass and any model-specific configuration
type Model interface {
Forward(ml.Context, input.Batch) (ml.Tensor, error)
Forward(ml.Context, input.Options) (ml.Tensor, error)
Backend() ml.Backend
Config() config
@@ -95,14 +94,14 @@ func Register(name string, f func(ml.Config) (Model, error)) {
}
// New initializes a new model instance with the provided configuration based on the metadata in the model file
func New(ctx context.Context, modelPath string, params ml.BackendParams) (Model, error) {
func New(modelPath string, params ml.BackendParams) (Model, error) {
r, err := os.Open(modelPath)
if err != nil {
return nil, err
}
defer r.Close()
b, err := ml.NewBackend(ctx, r, params)
b, err := ml.NewBackend(r, params)
if err != nil {
return nil, err
}
@@ -281,30 +280,24 @@ func canNil(t reflect.Type) bool {
t.Kind() == reflect.Slice
}
func Forward(ctx ml.Context, m Model, inputs []int32, batch input.Batch) (ml.Tensor, error) {
if len(batch.Positions) != len(batch.Sequences) {
return nil, fmt.Errorf("length of positions (%v) must match length of seqs (%v)", len(batch.Positions), len(batch.Sequences))
func Forward(ctx ml.Context, m Model, opts input.Options) (ml.Tensor, error) {
if len(opts.Positions) != len(opts.Sequences) {
return nil, fmt.Errorf("length of positions (%v) must match length of seqs (%v)", len(opts.Positions), len(opts.Sequences))
}
if len(batch.Positions) < 1 {
if len(opts.Positions) < 1 {
return nil, errors.New("batch size cannot be less than 1")
}
var err error
batch.Inputs, err = ctx.Input().FromIntSlice(inputs, len(inputs))
if err != nil {
return nil, err
}
cache := m.Config().Cache
if cache != nil {
err := cache.StartForward(ctx, batch)
err := cache.StartForward(ctx, opts)
if err != nil {
return nil, err
}
}
t, err := m.Forward(ctx, batch)
t, err := m.Forward(ctx, opts)
if err != nil {
return nil, err
}

View File

@@ -163,7 +163,7 @@ func TestGetTextProcessor(t *testing.T) {
type notTextProcessorModel struct{}
func (notTextProcessorModel) Forward(ml.Context, input.Batch) (ml.Tensor, error) {
func (notTextProcessorModel) Forward(ml.Context, input.Options) (ml.Tensor, error) {
panic("unimplemented")
}

View File

@@ -168,18 +168,23 @@ func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs, outputs ml.Ten
return hiddenState.Add(ctx, residual)
}
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
func (m *Model) Forward(ctx ml.Context, opts input.Options) (ml.Tensor, error) {
inputs, err := ctx.Input().FromIntSlice(opts.Inputs, len(opts.Inputs))
if err != nil {
return nil, err
}
outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
positions, err := ctx.Input().FromIntSlice(opts.Positions, len(opts.Positions))
if err != nil {
return nil, err
}
hiddenState := m.TokenEmbedding.Forward(ctx, batch.Inputs)
outputs, err := ctx.Output().FromIntSlice(opts.Outputs, len(opts.Outputs))
if err != nil {
return nil, err
}
hiddenState := m.TokenEmbedding.Forward(ctx, inputs)
hiddenState = hiddenState.Scale(ctx, math.Sqrt(float64(m.Options.hiddenSize)))
if len(m.Layers) == gemma27BLayerCount {
@@ -206,7 +211,8 @@ func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
// final logit softcap
hiddenState = hiddenState.Scale(ctx, 1.0/float64(m.Options.finalLogitSoftcap))
hiddenState = hiddenState.Tanh(ctx)
return hiddenState.Scale(ctx, float64(m.Options.finalLogitSoftcap)), nil
hiddenState = hiddenState.Scale(ctx, float64(m.Options.finalLogitSoftcap))
return hiddenState.Rows(ctx, outputs), nil
}
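The Rows call added above gathers only the rows listed in outputs before returning logits; a standalone sketch of that gather on flat slices (hypothetical shapes, not the ml.Tensor API):

package main

import "fmt"

// gatherRows picks out the rows listed in outputs from a row-major
// [rows x cols] matrix, mirroring what Tensor.Rows does for logits.
func gatherRows(data []float32, cols int, outputs []int32) []float32 {
	picked := make([]float32, 0, len(outputs)*cols)
	for _, r := range outputs {
		picked = append(picked, data[int(r)*cols:(int(r)+1)*cols]...)
	}
	return picked
}

func main() {
	// 3 tokens x 4 vocab entries; keep logits for tokens 0 and 2 only.
	logits := []float32{
		0.1, 0.2, 0.3, 0.4,
		1.1, 1.2, 1.3, 1.4,
		2.1, 2.2, 2.3, 2.4,
	}
	fmt.Println(gatherRows(logits, 4, []int32{0, 2}))
}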
func init() {

View File

@@ -139,18 +139,23 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
return result, nil
}
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
func (m *Model) Forward(ctx ml.Context, opts input.Options) (ml.Tensor, error) {
inputs, err := ctx.Input().FromIntSlice(opts.Inputs, len(opts.Inputs))
if err != nil {
return nil, err
}
outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
positions, err := ctx.Input().FromIntSlice(opts.Positions, len(opts.Positions))
if err != nil {
return nil, err
}
return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, batch, m.Cache), nil
outputs, err := ctx.Output().FromIntSlice(opts.Outputs, len(opts.Outputs))
if err != nil {
return nil, err
}
return m.TextModel.Forward(ctx, inputs, positions, outputs, opts, m.Cache), nil
}
func init() {

View File

@@ -171,13 +171,13 @@ func (l *TextLayer) Forward(ctx ml.Context, layer int, hiddenState, positionIDs,
return hiddenState.Add(ctx, residual)
}
func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor, batch input.Batch, cache kvcache.Cache) ml.Tensor {
func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor, opts input.Options, cache kvcache.Cache) ml.Tensor {
hiddenState := m.TokenEmbedding.Forward(ctx, inputs)
hiddenState = hiddenState.Scale(ctx, math.Sqrt(float64(m.TextOptions.hiddenSize)))
// set image embeddings
var except []int
for _, image := range batch.Multimodal {
for _, image := range opts.Multimodal {
visionOutputs := image.Multimodal.(ml.Tensor)
ctx.Forward(visionOutputs.Copy(ctx, hiddenState.View(ctx, image.Index*hiddenState.Stride(1), visionOutputs.Dim(0)*visionOutputs.Dim(1))))

View File

@@ -139,18 +139,23 @@ func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs, outputs ml.Ten
return hiddenState.Add(ctx, residual)
}
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
func (m *Model) Forward(ctx ml.Context, opts input.Options) (ml.Tensor, error) {
inputs, err := ctx.Input().FromIntSlice(opts.Inputs, len(opts.Inputs))
if err != nil {
return nil, err
}
outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
positions, err := ctx.Input().FromIntSlice(opts.Positions, len(opts.Positions))
if err != nil {
return nil, err
}
hiddenState := m.TokenEmbedding.Forward(ctx, batch.Inputs)
outputs, err := ctx.Output().FromIntSlice(opts.Outputs, len(opts.Outputs))
if err != nil {
return nil, err
}
hiddenState := m.TokenEmbedding.Forward(ctx, inputs)
for i, layer := range m.Layers {
m.Cache.SetLayer(i)

View File

@@ -135,27 +135,32 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
return inputs, nil
}
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
func (m *Model) Forward(ctx ml.Context, opts input.Options) (ml.Tensor, error) {
var crossAttentionStates ml.Tensor
if len(batch.Multimodal) > 0 {
images := batch.Multimodal[len(batch.Multimodal)-1].Multimodal.([]ml.Tensor)
if len(opts.Multimodal) > 0 {
images := opts.Multimodal[len(opts.Multimodal)-1].Multimodal.([]ml.Tensor)
if len(images) > 0 {
crossAttentionStates = images[len(images)-1]
}
}
positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
inputs, err := ctx.Input().FromIntSlice(opts.Inputs, len(opts.Inputs))
if err != nil {
return nil, err
}
outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
positions, err := ctx.Input().FromIntSlice(opts.Positions, len(opts.Positions))
if err != nil {
return nil, err
}
outputs, err := ctx.Output().FromIntSlice(opts.Outputs, len(opts.Outputs))
if err != nil {
return nil, err
}
// TODO: attention mask, cross attention mask
return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, nil, crossAttentionStates, nil, m.Cache.(*kvcache.WrapperCache)), nil
return m.TextModel.Forward(ctx, inputs, positions, outputs, nil, crossAttentionStates, nil, m.Cache.(*kvcache.WrapperCache)), nil
}
func init() {

View File

@@ -32,7 +32,6 @@ type TextProcessor interface {
Encode(s string, addSpecial bool) ([]int32, error)
Decode([]int32) (string, error)
Is(int32, Special) bool
Vocab() *Vocabulary
}
type Vocabulary struct {

View File

@@ -53,10 +53,6 @@ func (spm SentencePieceModel) Is(id int32, special Special) bool {
return spm.vocab.Is(id, special)
}
func (spm SentencePieceModel) Vocab() *Vocabulary {
return spm.vocab
}
func (spm *SentencePieceModel) split(s string) iter.Seq[string] {
return func(yield func(string) bool) {
for m, _ := spm.pre.FindStringMatch(s); m != nil; m, _ = spm.pre.FindNextMatch(m) {

View File

@@ -31,10 +31,8 @@ type InputCache struct {
cache kvcache.Cache
}
func NewInputCache(model model.Model, kvCacheType string, kvSize int32, numSlots int, batchSize int, multiUserCache bool) (*InputCache, error) {
numCtx := kvSize / int32(numSlots)
if numCtx < 1 {
func NewInputCache(model model.Model, kvCacheType string, kvSize int32, numSlots int, multiUserCache bool) (*InputCache, error) {
if kvSize/int32(numSlots) < 1 {
return nil, fmt.Errorf("must have at least one kv cache entry per parallel sequence (kv: %v parallel: %v)", kvSize, numSlots)
}
@@ -46,11 +44,11 @@ func NewInputCache(model model.Model, kvCacheType string, kvSize int32, numSlots
cache := model.Config().Cache
if cache != nil {
cache.Init(model.Backend(), kvCacheTypeFromStr(kvCacheType), numSlots, int(numCtx), batchSize)
cache.Init(model.Backend(), kvCacheTypeFromStr(kvCacheType), kvSize)
}
return &InputCache{
numCtx: numCtx,
numCtx: kvSize / int32(numSlots),
enabled: cache != nil,
slots: slots,
multiUserCache: multiUserCache,
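The per-slot arithmetic above divides the KV cache evenly across parallel sequences; a small standalone sketch of the same check (assumed parameter names):

package main

import "fmt"

// perSlotContext mirrors the check in NewInputCache: the KV cache is
// divided evenly across parallel sequences, and each slot needs at
// least one entry.
func perSlotContext(kvSize, numSlots int32) (int32, error) {
	numCtx := kvSize / numSlots
	if numCtx < 1 {
		return 0, fmt.Errorf("must have at least one kv cache entry per parallel sequence (kv: %v parallel: %v)", kvSize, numSlots)
	}
	return numCtx, nil
}

func main() {
	n, err := perSlotContext(2048, 4)
	fmt.Println(n, err) // 512 <nil>
}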

View File

@@ -348,8 +348,7 @@ func (s *Server) processBatch() error {
}
defer s.mu.Unlock()
var batchInputs []int32
var batch input.Batch
var options input.Options
for i, seq := range s.seqs {
if seq == nil {
@@ -396,17 +395,17 @@ func (s *Server) processBatch() error {
}
}
batchInputs = append(batchInputs, inp.Token)
options.Inputs = append(options.Inputs, inp.Token)
if inp.Multimodal != nil {
batch.Multimodal = append(batch.Multimodal, input.MultimodalIndex{Index: len(batchInputs) - 1, Multimodal: inp.Multimodal})
options.Multimodal = append(options.Multimodal, input.MultimodalIndex{Index: len(options.Inputs) - 1, Multimodal: inp.Multimodal})
}
batch.Positions = append(batch.Positions, int32(len(seq.cache.Inputs)+len(seq.pendingInputs)))
batch.Sequences = append(batch.Sequences, seq.cache.Id)
options.Positions = append(options.Positions, int32(len(seq.cache.Inputs)+len(seq.pendingInputs)))
options.Sequences = append(options.Sequences, seq.cache.Id)
seq.iBatch = len(batch.Outputs)
seq.iBatch = len(options.Outputs)
if j+1 == len(seq.inputs) {
batch.Outputs = append(batch.Outputs, int32(len(batchInputs)-1))
options.Outputs = append(options.Outputs, int32(len(options.Inputs)-1))
}
seq.pendingInputs = append(seq.pendingInputs, inp)
}
@@ -414,14 +413,14 @@ func (s *Server) processBatch() error {
seq.inputs = seq.inputs[len(seq.pendingInputs):]
}
if len(batchInputs) == 0 {
if len(options.Inputs) == 0 {
return nil
}
ctx := s.model.Backend().NewContext()
defer ctx.Close()
modelOutput, err := model.Forward(ctx, s.model, batchInputs, batch)
modelOutput, err := model.Forward(ctx, s.model, options)
if err != nil {
return fmt.Errorf("failed to decode batch: %w", err)
}
@@ -461,27 +460,13 @@ func (s *Server) processBatch() error {
}
// sample a token
vocabSize := len(logits) / len(batch.Outputs)
vocabSize := len(logits) / len(options.Outputs)
token, err := seq.sampler.Sample(logits[seq.iBatch*vocabSize : (seq.iBatch+1)*vocabSize])
if err != nil {
return fmt.Errorf("failed to sample token: %w", err)
}
if seq.sampler.JSONSampler != nil {
_, err = seq.sampler.JSONSampler.UpdateState([]int32{token})
if err != nil {
return fmt.Errorf("failed to update state: %w", err)
}
}
if seq.sampler.PythonSampler != nil {
err = seq.sampler.PythonSampler.UpdateState(token)
if err != nil {
return fmt.Errorf("failed to update state: %w", err)
}
}
// if it's an end of sequence token, break
if s.model.(model.TextProcessor).Is(token, model.SpecialEOS) {
// TODO (jmorganca): we should send this back
@@ -576,21 +561,6 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
}
}
// jsonSampler, err := sample.NewJSONSampler(s.model.(model.TextProcessor), nil)
// if err != nil {
// http.Error(w, "failed to load model vocabulary required for format", http.StatusInternalServerError)
// return
// }
// jsonSampler = nil
pythonSampler := &sample.PythonSampler{}
functions := []sample.PythonFunction{
{
Name: "add_two_strings",
Args: []string{"s1", "s2"},
Types: []string{"string", "string"},
},
}
pythonSampler.Init(functions, s.model.(model.TextProcessor))
sampler := sample.NewSampler(
req.Options.Temperature,
req.Options.TopK,
@@ -598,9 +568,6 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
req.Options.MinP,
req.Options.Seed,
grammar,
nil,
pythonSampler,
// nil,
)
seq, err := s.NewSequence(req.Prompt, req.Images, NewSequenceParams{
@@ -710,7 +677,6 @@ func (m *multiLPath) String() string {
}
func (s *Server) loadModel(
ctx context.Context,
mpath string,
params ml.BackendParams,
lpath multiLPath,
@@ -720,7 +686,7 @@ func (s *Server) loadModel(
multiUserCache bool,
) {
var err error
s.model, err = model.New(ctx, mpath, params)
s.model, err = model.New(mpath, params)
if err != nil {
panic(err)
}
@@ -732,7 +698,7 @@ func (s *Server) loadModel(
panic("loras are not yet implemented")
}
s.cache, err = NewInputCache(s.model, kvCacheType, int32(kvSize), parallel, s.batchSize, multiUserCache)
s.cache, err = NewInputCache(s.model, kvCacheType, int32(kvSize), parallel, multiUserCache)
if err != nil {
panic(err)
}
@@ -816,9 +782,6 @@ func Execute(args []string) error {
}
params := ml.BackendParams{
Progress: func(progress float32) {
server.progress = progress
},
NumThreads: *threads,
NumGPULayers: *numGPULayers,
MainGPU: *mainGPU,
@@ -827,13 +790,13 @@ func Execute(args []string) error {
}
server.ready.Add(1)
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
go server.loadModel(ctx, *mpath, params, lpaths, *parallel, *kvCacheType, *kvSize, *multiUserCache)
go server.loadModel(*mpath, params, lpaths, *parallel, *kvCacheType, *kvSize, *multiUserCache)
server.cond = sync.NewCond(&server.mu)
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
go server.run(ctx)
addr := "127.0.0.1:" + strconv.Itoa(*port)
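Earlier in processBatch, each sequence's logits are sliced out of the flat model output using iBatch and vocabSize; a standalone sketch of that indexing, assuming the layout is one vocab-sized row per requested output, concatenated:

package main

import "fmt"

// sliceLogits extracts the logits row for a single sequence from the
// flat model output, mirroring logits[iBatch*vocabSize:(iBatch+1)*vocabSize]
// in processBatch.
func sliceLogits(logits []float32, numOutputs, iBatch int) []float32 {
	vocabSize := len(logits) / numOutputs
	return logits[iBatch*vocabSize : (iBatch+1)*vocabSize]
}

func main() {
	// Two outputs, vocab of 3: rows are [seq0 | seq1].
	logits := []float32{0.1, 0.9, 0.0, 0.7, 0.2, 0.1}
	fmt.Println(sliceLogits(logits, 2, 1)) // [0.7 0.2 0.1]
}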

View File

@@ -1,53 +0,0 @@
package sample
var DefaultGrammar = map[string]string{
"unicode": `\x{hex}{2} | \u{hex}{4} | \U{hex}{8}`,
"null": `"null"`,
"object": `"{" (kv ("," kv)*)? "}"`,
"array": `"[" (value ("," value)*)? "]"`,
"kv": `string ":" value`,
"integer": `"0" | [1-9] [0-9]*`,
"number": `"-"? integer frac? exp?`,
"frac": `"." [0-9]+`,
"exp": `("e" | "E") ("+" | "-") [0-9]+`,
"string": `"\"" char* "\""`,
"escape": `["/" | "b" | "f" | "n" | "r" | "t" | unicode]`,
"char": `[^"\\] | escape`,
"space": `(" " | "\t" | "\n" | "\r")*`,
"hex": `[0-9] | [a-f] | [A-F]`,
"boolean": `"true" | "false"`,
"value": `object | array | string | number | boolean | "null"`,
}
const jsonString = `object | array`
type StateMachine struct {
states map[rune]State
}
type State struct {
NextStates []string
// bitmask?
Mask []bool
IsTerminal bool
}
func NewStateMachine(grammar map[string]string, startRule string) *StateMachine {
states := make(map[rune]State)
var cumu string
flag := false
for _, r := range startRule {
if r == '"' {
flag = !flag
}
if flag {
cumu += string(r)
}
}
sm := &StateMachine{
states: states,
}
return sm
}

View File

@@ -1,138 +0,0 @@
package sample
import (
"testing"
)
func TestGrammarParsing(t *testing.T) {
tests := []struct {
name string
grammar map[string]string
startRule string
input string
want bool
}{
{
name: "simple object",
grammar: map[string]string{
"object": `"{" "}"`,
},
startRule: "object",
input: "{}",
want: true,
},
{
name: "simple array",
grammar: map[string]string{
"array": `"[" "]"`,
},
startRule: "array",
input: "[]",
want: true,
},
{
name: "character class",
grammar: map[string]string{
"digit": `[0-9]`,
},
startRule: "digit",
input: "5",
want: true,
},
{
name: "alternation",
grammar: map[string]string{
"bool": `"true" | "false"`,
},
startRule: "bool",
input: "true",
want: true,
},
{
name: "repetition",
grammar: map[string]string{
"digits": `[0-9]+`,
},
startRule: "digits",
input: "123",
want: true,
},
{
name: "nested rules",
grammar: map[string]string{
"value": `object | array`,
"object": `"{" "}"`,
"array": `"[" "]"`,
},
startRule: "value",
input: "{}",
want: true,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
parser := NewParser(tt.grammar)
machine, err := parser.Parse(tt.startRule)
if err != nil {
t.Fatalf("Parse() error = %v", err)
}
matcher := NewMatcher(machine)
got, err := matcher.Match(tt.input)
if err != nil {
t.Fatalf("Match() error = %v", err)
}
if got != tt.want {
t.Errorf("Match() = %v, want %v", got, tt.want)
}
})
}
}
func TestJSONGrammar(t *testing.T) {
tests := []struct {
name string
input string
want bool
}{
{"empty object", "{}", true},
{"empty array", "[]", true},
{"simple string", `"hello"`, true},
{"simple number", "123", true},
{"simple boolean", "true", true},
{"simple null", "null", true},
{"object with string", `{"key": "value"}`, true},
{"array with numbers", "[1, 2, 3]", true},
{"nested object", `{"obj": {"key": "value"}}`, true},
{"nested array", `[1, [2, 3], 4]`, true},
{"invalid object", "{", false},
{"invalid array", "[1, 2", false},
{"invalid string", `"hello`, false},
}
parser := NewParser(DefaultGrammar)
machine, err := parser.Parse("value")
if err != nil {
t.Fatalf("Parse() error = %v", err)
}
matcher := NewMatcher(machine)
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got, err := matcher.Match(tt.input)
if tt.want {
if err != nil {
t.Errorf("Match() error = %v", err)
}
if !got {
t.Errorf("Match() = false, want true")
}
} else {
if err == nil && got {
t.Errorf("Match() = true, want false")
}
}
})
}
}

View File

@@ -1,160 +0,0 @@
package sample
import (
"fmt"
)
type JSONState int
const (
StateStart JSONState = iota
StateInObject
StateInObjectKey
StateInStructuredKey
StateInStructuredValue
StateNewline
StateTab
StateSpace
StateInString
StateInInt
StateInFloat
StateInBool
StateInNull
StateInColon
StateInComma
StateInTab
StateInSpaceToValue
StateInSpaceEndValue
StateInNewlineEndValue
StateInObjSpace
StateInList
StateInListComma
StateInValue
StateInValueEnd
StateInListEnd
StateInListObjectEnd
StateInNewline
StateInNumber
StateInNumberEnd
StateInStringEnd
StateInObjectKeyEnd
StateTerminate
StateInObjectEnd
StateTransitioningToTerminate
StateInListStartJSON
)
var JSONStates = []JSONState{
StateStart,
StateInObject,
StateInObjectKey,
StateInStructuredKey,
StateInStructuredValue,
StateNewline,
StateTab,
StateSpace,
StateInString,
StateInInt,
StateInFloat,
StateInBool,
StateInNull,
StateInColon,
StateInComma,
StateInTab,
StateInSpaceToValue,
StateInSpaceEndValue,
StateInNewlineEndValue,
StateInObjSpace,
StateInListStartJSON,
StateInList,
StateInListComma,
StateInValue,
StateInValueEnd,
StateInListEnd,
StateInListObjectEnd,
StateInNewline,
StateInNumber,
StateInNumberEnd,
StateInStringEnd,
StateInObjectKeyEnd,
StateTerminate,
StateInObjectEnd,
StateTransitioningToTerminate,
}
func (s JSONState) String() string {
switch s {
case StateStart:
return "StateStart"
case StateInObject:
return "StateInObject"
case StateInObjectKey:
return "StateInObjectKey"
case StateInStructuredKey:
return "StateInStructuredKey"
case StateInStructuredValue:
return "StateInStructuredValue"
case StateNewline:
return "StateNewline"
case StateTab:
return "StateTab"
case StateSpace:
return "StateSpace"
case StateInString:
return "StateInString"
case StateInInt:
return "StateInInt"
case StateInFloat:
return "StateInFloat"
case StateInBool:
return "StateInBool"
case StateInNull:
return "StateInNull"
case StateInColon:
return "StateInColon"
case StateInComma:
return "StateInComma"
case StateInTab:
return "StateInTab"
case StateInSpaceToValue:
return "StateInSpaceToValue"
case StateInSpaceEndValue:
return "StateInSpaceEndValue"
case StateInNewlineEndValue:
return "StateInNewlineEndValue"
case StateInObjSpace:
return "StateInObjSpace"
case StateInList:
return "StateInList"
case StateInListComma:
return "StateInListComma"
case StateInValue:
return "StateInValue"
case StateInValueEnd:
return "StateInValueEnd"
case StateInListEnd:
return "StateInListEnd"
case StateInListObjectEnd:
return "StateInListObjectEnd"
case StateInNewline:
return "StateInNewline"
case StateInNumber:
return "StateInNumber"
case StateInNumberEnd:
return "StateInNumberEnd"
case StateInStringEnd:
return "StateInStringEnd"
case StateInObjectKeyEnd:
return "StateInObjectKeyEnd"
case StateTerminate:
return "StateTerminate"
case StateInObjectEnd:
return "StateInObjectEnd"
case StateTransitioningToTerminate:
return "StateTransitioningToTerminate"
case StateInListStartJSON:
return "StateInListStartJSON"
default:
return fmt.Sprintf("Unknown state: %d", s)
}
}

View File

@@ -1,327 +0,0 @@
package sample
import (
"fmt"
"slices"
"github.com/ollama/ollama/model"
)
/*
Key JSON rules to consider:
1. Whitespace handling:
- Need to handle all valid JSON whitespace characters (\r, spaces between tokens)
- Current code only handles some whitespace cases
2. Number validation:
- Need proper validation for special number cases like -0
- Should handle .5 style decimals
- Need limits on scientific notation (e, E)
3. String escaping:
- Currently marks \ as invalid but should allow escaped sequences:
- \"
- \n
- \u1234 unicode escapes
4. Empty object/array transitions:
- Direct {} and [] cases could be more explicit
- Need clear transitions for these edge cases
5. Nested depth limits:
- No protection against excessive nesting
- Could cause stack overflow with deeply nested structures
*/
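As an illustration of point 3 above (string escaping), here is a standalone checker for the escape sequences the comment says should be allowed. This is an editorial sketch, not code from this package:

package main

import "fmt"

// validEscape reports whether the rune following a backslash starts a
// legal JSON escape sequence (\" \\ \/ \b \f \n \r \t \uXXXX).
func validEscape(r rune) bool {
	switch r {
	case '"', '\\', '/', 'b', 'f', 'n', 'r', 't', 'u':
		return true
	}
	return false
}

func main() {
	for _, r := range []rune{'n', 'u', 'x'} {
		fmt.Printf("\\%c valid: %v\n", r, validEscape(r))
	}
}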
// TODO: / should be valid but an escape character
var stringInvalidRunes = []rune{'\\', '\n', '\t', '{', '}', ':', ',', '/'}
var (
intInvalidRunes = []rune{'e', 'E', ' ', '\n', '\t', '{', '}', ':', ',', '"'}
validIntRunes = []rune{'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-'}
)
var validNumberRunes = []rune{'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '.', '-', '+', 'e', 'E'}
var validBoolRunes = []rune{'t', 'r', 'u', 'e', 'f', 'a', 'l', 's', 'e'}
var validNullRunes = []rune{'n', 'u', 'l', 'l'}
type PDA struct {
State JSONState
TransitionEdges map[rune]*PDA
MaskTokenIDToNode map[int32]*PDA
}
func NewPDANode(state JSONState) *PDA {
return &PDA{
State: state,
TransitionEdges: make(map[rune]*PDA),
MaskTokenIDToNode: make(map[int32]*PDA),
}
}
type PDAGraphBuilder struct {
proc model.TextProcessor
decodedToks []string
stateToNodeMap map[JSONState]*PDA
tokenToStatesMap map[int32][]JSONState
}
func (b *PDAGraphBuilder) BuildGraph() error {
stateToNodeMap := make(map[JSONState]*PDA)
for _, state := range JSONStates {
stateToNodeMap[state] = NewPDANode(state)
}
stateToNodeMap[StateStart].TransitionEdges['{'] = stateToNodeMap[StateInObject]
stateToNodeMap[StateStart].TransitionEdges['['] = stateToNodeMap[StateInListStartJSON]
// TODO: update naming here - and revisit values
stateToNodeMap[StateInListStartJSON].TransitionEdges['{'] = stateToNodeMap[StateInObject]
stateToNodeMap[StateInListStartJSON].TransitionEdges['['] = stateToNodeMap[StateInListStartJSON]
stateToNodeMap[StateInObject].TransitionEdges['"'] = stateToNodeMap[StateInObjectKey]
stateToNodeMap[StateInObject].TransitionEdges['\n'] = stateToNodeMap[StateInNewline]
stateToNodeMap[StateInObject].TransitionEdges[' '] = stateToNodeMap[StateInObjSpace]
stateToNodeMap[StateInObject].TransitionEdges['}'] = stateToNodeMap[StateInObjectEnd]
// new line
stateToNodeMap[StateInNewline].TransitionEdges['"'] = stateToNodeMap[StateInObjectKey]
stateToNodeMap[StateInNewline].TransitionEdges['\t'] = stateToNodeMap[StateInTab]
stateToNodeMap[StateInNewline].TransitionEdges['}'] = stateToNodeMap[StateInObjectEnd]
stateToNodeMap[StateInNewline].TransitionEdges[' '] = stateToNodeMap[StateInObjSpace]
// stateToNodeMap[StateInNewline].TransitionEdges['{'] = stateToNodeMap[StateInObject]
// new line end value
// stateToNodeMap[StateInNewlineEndValue].TransitionEdges[' '] = stateToNodeMap[StateInSpaceEndValue]
stateToNodeMap[StateInNewlineEndValue].TransitionEdges['}'] = stateToNodeMap[StateInObjectEnd]
stateToNodeMap[StateInNewlineEndValue].TransitionEdges[']'] = stateToNodeMap[StateInListEnd]
stateToNodeMap[StateInObjSpace].TransitionEdges['"'] = stateToNodeMap[StateInObjectKey]
stateToNodeMap[StateInObjSpace].TransitionEdges['\n'] = stateToNodeMap[StateInNewline]
// TODO: see if this is needed for formatting
stateToNodeMap[StateInObjSpace].TransitionEdges[' '] = stateToNodeMap[StateInObjSpace]
stateToNodeMap[StateInTab].TransitionEdges['"'] = stateToNodeMap[StateInObjectKey]
stateToNodeMap[StateInTab].TransitionEdges['\t'] = stateToNodeMap[StateInNewline]
stateToNodeMap[StateInObjectKey].TransitionEdges[rune(-1)] = stateToNodeMap[StateInObjectKey]
stateToNodeMap[StateInObjectKey].TransitionEdges['"'] = stateToNodeMap[StateInObjectKeyEnd]
stateToNodeMap[StateInObjectKeyEnd].TransitionEdges[':'] = stateToNodeMap[StateInColon]
stateToNodeMap[StateInObjectEnd].TransitionEdges[','] = stateToNodeMap[StateInComma]
stateToNodeMap[StateInObjectEnd].TransitionEdges['}'] = stateToNodeMap[StateInObjectEnd]
// where values should be
// this could be combined, but the problem might change; we're already doing a skip ahead
stateToNodeMap[StateInColon].TransitionEdges[' '] = stateToNodeMap[StateInSpaceToValue]
stateToNodeMap[StateInColon].TransitionEdges['\n'] = stateToNodeMap[StateInSpaceToValue]
stateToNodeMap[StateInColon].TransitionEdges['['] = stateToNodeMap[StateInList]
stateToNodeMap[StateInColon].TransitionEdges['{'] = stateToNodeMap[StateInObject]
addValueConnections(stateToNodeMap[StateInColon], stateToNodeMap)
// Leads to a value
stateToNodeMap[StateInSpaceToValue].TransitionEdges['['] = stateToNodeMap[StateInList]
stateToNodeMap[StateInSpaceToValue].TransitionEdges['{'] = stateToNodeMap[StateInObject]
addValueConnections(stateToNodeMap[StateInSpaceToValue], stateToNodeMap)
stateToNodeMap[StateInSpaceToValue].TransitionEdges['}'] = stateToNodeMap[StateInObjectEnd]
stateToNodeMap[StateInSpaceToValue].TransitionEdges['\n'] = stateToNodeMap[StateInSpaceToValue]
// Values
// string node
stateToNodeMap[StateInString].TransitionEdges[rune(-1)] = stateToNodeMap[StateInString]
stateToNodeMap[StateInString].TransitionEdges['"'] = stateToNodeMap[StateInStringEnd]
// String end node
addEnds(stateToNodeMap[StateInStringEnd], stateToNodeMap)
// stateToNodeMap[StateInStringEnd].TransitionEdges[' '] = stateToNodeMap[StateInSpaceEndValue]
stateToNodeMap[StateInStringEnd].TransitionEdges['\n'] = stateToNodeMap[StateInNewlineEndValue]
// TODO: add counters for allowable number of decimals, e, E, etc
// number node
for _, r := range validNumberRunes {
stateToNodeMap[StateInNumber].TransitionEdges[r] = stateToNodeMap[StateInNumber]
}
addEnds(stateToNodeMap[StateInNumber], stateToNodeMap)
// stateToNodeMap[StateInNumber].TransitionEdges[' '] = stateToNodeMap[StateInSpaceEndValue]
stateToNodeMap[StateInNumber].TransitionEdges['\n'] = stateToNodeMap[StateInNewlineEndValue]
// list node
stateToNodeMap[StateInList].TransitionEdges[','] = stateToNodeMap[StateInComma]
stateToNodeMap[StateInList].TransitionEdges['{'] = stateToNodeMap[StateInObject]
stateToNodeMap[StateInList].TransitionEdges[' '] = stateToNodeMap[StateInList]
stateToNodeMap[StateInList].TransitionEdges['\n'] = stateToNodeMap[StateInList]
// early end
stateToNodeMap[StateInList].TransitionEdges[']'] = stateToNodeMap[StateInListEnd]
// list end node
stateToNodeMap[StateInListEnd].TransitionEdges['}'] = stateToNodeMap[StateInObjectEnd]
// stateToNodeMap[StateInListEnd].TransitionEdges[' '] = stateToNodeMap[StateInSpaceEndValue]
stateToNodeMap[StateInListEnd].TransitionEdges[','] = stateToNodeMap[StateInComma]
stateToNodeMap[StateInListEnd].TransitionEdges['\n'] = stateToNodeMap[StateInNewlineEndValue]
// empty list
stateToNodeMap[StateInList].TransitionEdges[']'] = stateToNodeMap[StateInListEnd]
addValueConnections(stateToNodeMap[StateInList], stateToNodeMap)
// null node
for _, r := range validNullRunes {
stateToNodeMap[StateInNull].TransitionEdges[r] = stateToNodeMap[StateInNull]
}
addEnds(stateToNodeMap[StateInNull], stateToNodeMap)
stateToNodeMap[StateInNull].TransitionEdges[' '] = stateToNodeMap[StateInSpaceToValue]
stateToNodeMap[StateInNull].TransitionEdges['\n'] = stateToNodeMap[StateInNewlineEndValue]
// list comma
// should point to values
stateToNodeMap[StateInListComma].TransitionEdges[' '] = stateToNodeMap[StateInListComma]
stateToNodeMap[StateInListComma].TransitionEdges['{'] = stateToNodeMap[StateInObject]
stateToNodeMap[StateInListComma].TransitionEdges['\n'] = stateToNodeMap[StateInList]
stateToNodeMap[StateInListComma].TransitionEdges[' '] = stateToNodeMap[StateInList]
stateToNodeMap[StateInListComma].TransitionEdges['\t'] = stateToNodeMap[StateInList]
addValueConnections(stateToNodeMap[StateInListComma], stateToNodeMap)
// list object end
stateToNodeMap[StateInListObjectEnd].TransitionEdges[','] = stateToNodeMap[StateInListComma]
stateToNodeMap[StateInListObjectEnd].TransitionEdges[']'] = stateToNodeMap[StateInListEnd]
// TODO: not sure if this is needed
stateToNodeMap[StateInListObjectEnd].TransitionEdges['\n'] = stateToNodeMap[StateInNewlineEndValue]
// bool node
for _, r := range validBoolRunes {
stateToNodeMap[StateInBool].TransitionEdges[r] = stateToNodeMap[StateInBool]
}
stateToNodeMap[StateInBool].TransitionEdges['\n'] = stateToNodeMap[StateInNewline]
addEnds(stateToNodeMap[StateInBool], stateToNodeMap)
// stateToNodeMap[StateInBool].TransitionEdges[' '] = stateToNodeMap[StateInSpaceEndValue]
stateToNodeMap[StateInBool].TransitionEdges['\n'] = stateToNodeMap[StateInNewlineEndValue]
// comma node
stateToNodeMap[StateInComma].TransitionEdges['{'] = stateToNodeMap[StateInObject]
stateToNodeMap[StateInComma].TransitionEdges['\n'] = stateToNodeMap[StateInNewline]
stateToNodeMap[StateInComma].TransitionEdges['"'] = stateToNodeMap[StateInObjectKey]
stateToNodeMap[StateInComma].TransitionEdges[' '] = stateToNodeMap[StateInObjSpace]
// todo: review this space transition
// stateToNodeMap[StateInComma].TransitionEdges[' '] = stateToNodeMap[StateInSpaceToValue]
// space end value
// stateToNodeMap[StateInSpaceEndValue].TransitionEdges[' '] = stateToNodeMap[StateInSpaceEndValue]
stateToNodeMap[StateInSpaceEndValue].TransitionEdges['}'] = stateToNodeMap[StateInObjectEnd]
stateToNodeMap[StateInSpaceEndValue].TransitionEdges[']'] = stateToNodeMap[StateInListEnd]
stateToNodeMap[StateInSpaceEndValue].TransitionEdges['\n'] = stateToNodeMap[StateInNewlineEndValue]
b.stateToNodeMap = stateToNodeMap
if err := b.preComputeValidStates(); err != nil {
return err
}
return nil
}
func addEnds(node *PDA, stateToNodeMap map[JSONState]*PDA) {
node.TransitionEdges[','] = stateToNodeMap[StateInComma]
node.TransitionEdges['}'] = stateToNodeMap[StateInObjectEnd]
node.TransitionEdges[']'] = stateToNodeMap[StateInListEnd]
}
func addValueConnections(node *PDA, stateToNodeMap map[JSONState]*PDA) {
node.TransitionEdges['"'] = stateToNodeMap[StateInString]
for _, r := range validNumberRunes {
node.TransitionEdges[r] = stateToNodeMap[StateInNumber]
}
// TODO(parthsareen): force the output and shift similar to structured outputs
node.TransitionEdges['t'] = stateToNodeMap[StateInBool]
node.TransitionEdges['f'] = stateToNodeMap[StateInBool]
node.TransitionEdges['n'] = stateToNodeMap[StateInNull]
}
func (b *PDAGraphBuilder) preComputeValidStates() error {
for _, node := range b.stateToNodeMap {
// if node.State == StateInObjectKey {
// if len(b.stateToNodeMap[StateInString].MaskTokenIDToNode) > 0 {
// b.stateToNodeMap[StateInObjectKey].MaskTokenIDToNode = b.stateToNodeMap[StateInString].MaskTokenIDToNode
// fmt.Println("copying string mask to object key mask")
// }
// }
if err := b.CreateMask(node); err != nil {
return err
}
}
return nil
}
func (b *PDAGraphBuilder) preComputeTokenToStatesMap() error {
// TODO: the make() call can live somewhere else too
b.tokenToStatesMap = make(map[int32][]JSONState)
for i, t := range b.decodedToks {
for _, r := range t {
if r == '"' {
b.tokenToStatesMap[int32(i)] = append(b.tokenToStatesMap[int32(i)], StateInString)
}
}
}
return nil
}
// TODO: the mask for obj key and string should be the same?
func (b *PDAGraphBuilder) CreateMask(node *PDA) error {
if node == nil {
return fmt.Errorf("node cannot be nil")
}
for i := range b.decodedToks {
token := b.decodedToks[i]
// Skip EOS/BOS tokens and empty tokens since they are not valid in JSON
if b.proc.Is(int32(i), model.SpecialEOS) || b.proc.Is(int32(i), model.SpecialBOS) || token == "" || token == "\"\"" {
continue
}
curNode := node
valid := true
consumedSpecialRunes := make(map[rune]bool)
for _, r := range token {
curNode, valid = isRuneValid(r, curNode, consumedSpecialRunes)
if curNode == nil || !valid {
break
}
}
if valid {
node.MaskTokenIDToNode[int32(i)] = curNode
}
}
return nil
}
func isRuneValid(r rune, curNode *PDA, consumedSpecialRunes map[rune]bool) (*PDA, bool) {
if consumedSpecialRunes[r] {
return nil, false
}
specialRune := slices.Contains(stringInvalidRunes, r)
if specialRune {
if curNode.State == StateInString || curNode.State == StateInObjectKey {
return nil, false
}
}
// Check for specific rune transition
if nextNode, ok := curNode.TransitionEdges[r]; ok {
// fmt.Println("next node", nextNode)
if specialRune {
if curNode.State == nextNode.State {
return nil, false
}
consumedSpecialRunes[r] = true
}
return nextNode, true
}
// Check for sentinel value - if present, any rune is valid
if nextNode, ok := curNode.TransitionEdges[rune(-1)]; ok {
return nextNode, true
}
return nil, false
}
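CreateMask and isRuneValid together precompute, for every vocabulary token, whether it can be consumed starting from a given node; a simplified standalone sketch of that precomputation over a toy transition graph:

package main

import "fmt"

type node struct {
	name  string
	edges map[rune]*node
}

// precomputeMask walks each vocabulary token through the transition
// graph once, as CreateMask does, so that per-step sampling reduces to
// a map lookup instead of per-rune graph walks.
func precomputeMask(start *node, vocab []string) map[int]*node {
	mask := make(map[int]*node)
	for i, tok := range vocab {
		cur := start
		ok := true
		for _, r := range tok {
			next, found := cur.edges[r]
			if !found {
				ok = false
				break
			}
			cur = next
		}
		if ok {
			mask[i] = cur
		}
	}
	return mask
}

func main() {
	end := &node{name: "end", edges: map[rune]*node{}}
	start := &node{name: "start", edges: map[rune]*node{'{': end}}
	mask := precomputeMask(start, []string{"{", "}", "{x"})
	fmt.Println(len(mask), "of 3 tokens valid from start") // 1
}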

View File

@@ -1,264 +0,0 @@
package sample
import (
"fmt"
"math"
"runtime"
"time"
"github.com/ollama/ollama/model"
)
// TODO: safety in case of invalid json
// TODO: partial JSON matching?
// TODO: interfaces to cleanup with return values
// TODO this interface shouldn't be the sampler - should just use Sampler
// TODO: add penalties for string \n stuff
// TODO: minimize number of fwd passes if there is only one match
// TODO: greedy sample initially and then backtrack if no match
type PushdownSampler struct {
PDAGraphBuilder
curNode *PDA
braceStack []rune
stateCounter uint32
}
// graph should be built once and reused per tokenizer
func NewPushdownSampler(proc model.TextProcessor) (*PushdownSampler, error) {
start := time.Now()
fmt.Println("--------------------------------")
fmt.Println("PDA sampler")
fmt.Println("--------------------------------")
var m runtime.MemStats
runtime.ReadMemStats(&m)
before := m.Alloc
fmt.Printf("Alloc = %.2f MB\n", float64(before)/(1024*1024))
vocab := proc.Vocab()
decodedToks := make([]string, len(vocab.Values))
for i := range vocab.Values {
token, err := proc.Decode([]int32{int32(i)})
if err != nil {
return nil, err
}
decodedToks[i] = token
}
gb := &PDAGraphBuilder{
proc: proc,
decodedToks: decodedToks,
}
if err := gb.BuildGraph(); err != nil {
return nil, err
}
runtime.ReadMemStats(&m)
after := m.Alloc
fmt.Printf("Alloc = %.2f MB\n", float64(after)/(1024*1024))
fmt.Printf("Graph memory usage = %.2f MB\n", float64(after-before)/(1024*1024))
fmt.Printf("Graph build time = %v\n", time.Since(start))
// TODO: this can be simplified
return &PushdownSampler{
curNode: gb.stateToNodeMap[StateStart],
PDAGraphBuilder: *gb,
braceStack: []rune{},
stateCounter: 0,
}, nil
}
// TODO: need to add resampling logic if the first sample was not good
// greedy sample + backtrack?
func (s *PushdownSampler) Apply(logits []float32) ([]float32, error) {
switch s.curNode.State {
case StateInString:
return s.maskLogits(logits, s.curNode)
case StateInListEnd:
// force finish if no braces left
if len(s.braceStack) == 0 {
s.curNode = NewPDANode(StateTerminate)
return forceFinish(s, logits)
}
logits, err := s.maskLogits(logits, s.curNode)
if err != nil {
return nil, err
}
return logits, nil
case StateTerminate:
return forceFinish(s, logits)
case StateInObjectEnd:
// force finish if no braces left
if len(s.braceStack) == 0 {
s.curNode = NewPDANode(StateTerminate)
return forceFinish(s, logits)
}
peek := s.braceStack[len(s.braceStack)-1]
if peek == rune('[') {
s.curNode = s.stateToNodeMap[StateInListObjectEnd]
}
logits, err := s.maskLogits(logits, s.curNode)
if err != nil {
return nil, err
}
return logits, nil
case StateInComma:
peek := s.braceStack[len(s.braceStack)-1]
if peek == rune('[') {
s.curNode = s.stateToNodeMap[StateInListComma]
}
logits, err := s.maskLogits(logits, s.curNode)
if err != nil {
return nil, err
}
return logits, nil
default:
fmt.Println("masking logits current state", s.curNode.State)
logits, err := s.maskLogits(logits, s.curNode)
if err != nil {
return nil, err
}
return logits, nil
}
}
func forceFinish(s *PushdownSampler, logits []float32) ([]float32, error) {
for i := range logits {
if s.proc.Is(int32(i), model.SpecialEOS) {
logits[i] = 1.0
} else {
logits[i] = float32(math.Inf(-1))
}
}
return logits, nil
}
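forceFinish pins the distribution to the EOS token; the same idea as a standalone sketch (eosID is a hypothetical token index):

package main

import (
	"fmt"
	"math"
)

// forceEOS zeroes out every token except the end-of-sequence ID so the
// sampler can only terminate, mirroring forceFinish above.
func forceEOS(logits []float32, eosID int) []float32 {
	for i := range logits {
		if i == eosID {
			logits[i] = 1.0
		} else {
			logits[i] = float32(math.Inf(-1))
		}
	}
	return logits
}

func main() {
	fmt.Println(forceEOS([]float32{0.3, 0.2, 0.9}, 2))
}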
func (s *PushdownSampler) UpdateState(tokenSlice []int32) ([]int32, error) {
fmt.Println("current state - updating", s.curNode.State)
mappedString, err := s.proc.Decode(tokenSlice)
if err != nil {
return nil, err
}
fmt.Printf(">>> mappedString: %q\n", mappedString)
// Special handling for EOS token in terminate state
if s.curNode.State == StateTerminate {
for _, tokenID := range tokenSlice {
if s.proc.Is(tokenID, model.SpecialEOS) {
return tokenSlice, nil
}
}
}
// flag := -1
// endBraceRunes := []rune{'}', ']'}
for _, r := range mappedString {
// TODO: if this is enabled again, make sure to appropriately handle the state transitions
// if slices.Contains(endBraceRunes, r) && len(s.braceStack) == 0 {
// fmt.Printf("stack is empty, extra closing brace %c\n", r)
// // flag = i
// break
// }
if r == rune('{') {
s.braceStack = append(s.braceStack, r)
}
if r == rune('[') {
s.braceStack = append(s.braceStack, r)
}
if r == rune('}') {
if len(s.braceStack) == 0 {
return nil, fmt.Errorf("stack is empty, extra closing brace %c", r)
}
top := s.braceStack[len(s.braceStack)-1]
if top != rune('{') {
return nil, fmt.Errorf("unmatched closing brace, got%c, want%c", top, '{')
}
s.braceStack = s.braceStack[:len(s.braceStack)-1]
}
if r == rune(']') {
if len(s.braceStack) == 0 {
return nil, fmt.Errorf("stack is empty, extra closing brace %c", r)
}
top := s.braceStack[len(s.braceStack)-1]
if top != rune('[') {
return nil, fmt.Errorf("unmatched closing brace, got%c, want%c", top, '[')
}
s.braceStack = s.braceStack[:len(s.braceStack)-1]
}
}
// if flag != -1 {
// tokenSlice = tokenSlice[:flag]
// }
// fmt.Println("flag!", flag)
for _, tokenID := range tokenSlice {
// transition to the next node
nextNode, ok := s.curNode.MaskTokenIDToNode[tokenID]
if !ok {
return nil, fmt.Errorf("invalid token: %q", mappedString)
}
fmt.Println("transitioning to", nextNode.State)
// TODO: add a penalty for staying in the same state too long
if nextNode.State == s.curNode.State {
s.stateCounter++
} else {
s.stateCounter = 0
}
s.curNode = nextNode
fmt.Println("updated curNode state", s.curNode.State)
}
return tokenSlice, nil
}
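UpdateState's brace bookkeeping is a classic stack match; a standalone sketch of the same push/pop logic and error messages:

package main

import "fmt"

// checkBraces mirrors the brace-stack bookkeeping in UpdateState:
// push on '{' and '[', pop and verify on '}' and ']'.
func checkBraces(s string) error {
	var stack []rune
	pairs := map[rune]rune{'}': '{', ']': '['}
	for _, r := range s {
		switch r {
		case '{', '[':
			stack = append(stack, r)
		case '}', ']':
			if len(stack) == 0 {
				return fmt.Errorf("stack is empty, extra closing brace %c", r)
			}
			top := stack[len(stack)-1]
			if top != pairs[r] {
				return fmt.Errorf("unmatched closing brace, got %c, want %c", top, pairs[r])
			}
			stack = stack[:len(stack)-1]
		}
	}
	return nil
}

func main() {
	fmt.Println(checkBraces(`{"a": [1, 2]}`)) // <nil>
	fmt.Println(checkBraces(`{"a": [1, 2}`))  // unmatched closing brace
}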
// greedy sample + backtrack?
func (s *PushdownSampler) maskLogits(logits []float32, node *PDA) ([]float32, error) {
// Create a new slice with same length as logits, initialized to -Inf
maskedLogits := make([]float32, len(logits))
for i := range maskedLogits {
maskedLogits[i] = float32(math.Inf(-1))
}
// Only update values for valid token IDs from the mask map
for tokenID := range node.MaskTokenIDToNode {
if int(tokenID) < len(logits) {
maskedLogits[tokenID] = logits[tokenID]
}
}
return maskedLogits, nil
}
func (s *PushdownSampler) fastMaskLogits(logits []float32, node *PDA) ([]float32, error) {
maxLogit := float32(math.Inf(-1))
maxIndex := -1
// Find the maximum logit value among valid tokens
for tokenID := range node.MaskTokenIDToNode {
if int(tokenID) < len(logits) && logits[tokenID] > maxLogit {
maxLogit = logits[tokenID]
maxIndex = int(tokenID)
}
}
if maxIndex == -1 {
return nil, fmt.Errorf("no valid tokens found in mask")
}
logits[0] = float32(maxIndex)
return logits, nil
// return maxIndex, nil
}
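The masking pattern used above, initialize everything to -Inf and copy through only whitelisted token IDs, is simple enough to sketch standalone:

package main

import (
	"fmt"
	"math"
)

// maskLogits keeps only whitelisted token IDs, forcing everything else
// to -Inf so softmax assigns those tokens zero probability.
func maskLogits(logits []float32, allowed map[int32]bool) []float32 {
	masked := make([]float32, len(logits))
	for i := range masked {
		masked[i] = float32(math.Inf(-1))
	}
	for id := range allowed {
		if int(id) < len(logits) {
			masked[id] = logits[id]
		}
	}
	return masked
}

func main() {
	logits := []float32{0.5, 1.5, -0.3, 2.0}
	fmt.Println(maskLogits(logits, map[int32]bool{1: true, 3: true}))
}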

View File

@@ -17,34 +17,15 @@ type token struct {
}
type Sampler struct {
rng *rand.Rand
topK int
topP float32
minP float32
temperature float32
grammar *Grammar
JSONSampler *JSONSampler
PythonSampler *PythonSampler
rng *rand.Rand
topK int
topP float32
minP float32
temperature float32
grammar *Grammar
}
func (s *Sampler) Sample(logits []float32) (int32, error) {
if len(logits) == 0 {
return -1, errors.New("sample: no logits provided to sample")
}
var err error
if s.JSONSampler != nil {
logits, err = s.JSONSampler.Apply(logits)
if err != nil {
return -1, err
}
}
if s.PythonSampler != nil {
logits, err = s.PythonSampler.ApplyMask(logits)
if err != nil {
return -1, err
}
}
tokens := make([]token, len(logits))
for i := range logits {
tokens[i].id = int32(i)
@@ -113,6 +94,13 @@ func (s *Sampler) sample(tokens []token) (token, error) {
tokens = topP(tokens, s.topP)
tokens = minP(tokens, s.minP)
// TODO: this should fall back to greedy sampling
// or topP, topK values etc should be such that
// there are always tokens to sample from
if len(tokens) == 0 {
return token{}, errors.New("no tokens to sample from")
}
var r float32
if s.rng != nil {
r = s.rng.Float32()
@@ -135,14 +123,11 @@ func (s *Sampler) sample(tokens []token) (token, error) {
return 1
})
if math.IsNaN(float64(sum)) {
return token{}, errors.New("sample: logits sum to NaN, check model output")
}
return tokens[idx], nil
}
// TODO(parthsareen): update sampler interface to use json unmarshal https://github.com/ollama/ollama/issues/9278
func NewSampler(temperature float32, topK int, topP float32, minP float32, seed int, grammar *Grammar, jsonSampler *JSONSampler, pythonSampler *PythonSampler) Sampler {
func NewSampler(temperature float32, topK int, topP float32, minP float32, seed int, grammar *Grammar) Sampler {
var rng *rand.Rand
if seed != -1 {
// PCG requires two parameters: sequence and stream
@@ -170,14 +155,12 @@ func NewSampler(temperature float32, topK int, topP float32, minP float32, seed
}
return Sampler{
rng: rng,
topK: topK,
topP: topP,
minP: minP,
temperature: temperature,
grammar: grammar,
JSONSampler: jsonSampler,
PythonSampler: pythonSampler,
rng: rng,
topK: topK,
topP: topP,
minP: minP,
temperature: temperature,
grammar: grammar,
}
}
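With the JSON and Python samplers removed, constructing a sampler takes only the six remaining arguments. A hedged usage sketch, assuming the sample package import path used elsewhere in this repository:

package main

import (
	"fmt"

	"github.com/ollama/ollama/sample"
)

func main() {
	// Matches the trimmed signature above: temperature, topK, topP,
	// minP, seed, grammar. The jsonSampler and pythonSampler
	// arguments are gone.
	s := sample.NewSampler(0.7, 40, 0.9, 0.05, -1, nil)
	fmt.Printf("%T\n", s)
}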

View File

@@ -1,7 +1,6 @@
package sample
import (
"math"
"math/rand/v2"
"testing"
)
@@ -30,29 +29,6 @@ func TestWeighted(t *testing.T) {
if want != got {
t.Errorf("index mismatch: want %d, got %d", want, got)
}
// Test very high p
logits = []float32{1.0, 0.9999999999999999, 0.5, 0.1}
// Use extremely small topP to filter out all tokens
sampler = NewSampler(1.0, 0, 1e-10, 0, 0, nil)
got, err = sampler.Sample(logits)
if err != nil {
t.Error(err)
return
}
// Should get the token with the highest logit
want = int32(0)
if want != got {
t.Errorf("index mismatch: want %d, got %d", want, got)
}
logits = []float32{float32(math.NaN()), float32(math.NaN()), float32(math.NaN())}
sampler = NewSampler(1, 0, 0.95, 0.05, 0, nil)
got, err = sampler.Sample(logits)
if err == nil {
t.Errorf("expected error, got %d", got)
return
}
}
func BenchmarkSample(b *testing.B) {

View File

@@ -1,299 +0,0 @@
package sample
import (
"fmt"
"log/slog"
"runtime"
"time"
"github.com/ollama/ollama/grammar/jsonschema"
"github.com/ollama/ollama/model"
)
type JSONSampler struct {
schema *jsonschema.Schema
propIdx int
propToNodeMap map[string]*PDA
pdaSampler *PushdownSampler
decodedToks []string
}
func NewJSONSampler(proc model.TextProcessor, schema *jsonschema.Schema) (*JSONSampler, error) {
slog.Info("NewJSONSampler", "schema", schema)
if proc == nil {
return nil, fmt.Errorf("TextProcessor cannot be nil")
}
pdaSampler, err := NewPushdownSampler(proc)
if err != nil {
return nil, fmt.Errorf("failed to create PushdownSampler: %w", err)
}
if schema == nil {
return &JSONSampler{
schema: nil,
propIdx: -1,
propToNodeMap: nil,
pdaSampler: pdaSampler,
}, nil
}
// fmt.Println("schema not nil")
so := &JSONSampler{
schema: schema,
propIdx: -1,
propToNodeMap: make(map[string]*PDA),
pdaSampler: pdaSampler,
}
so.schemaToGraph()
// Benchmark token decoding
start := time.Now()
var m runtime.MemStats
runtime.ReadMemStats(&m)
before := m.Alloc
vocab := proc.Vocab()
decodedToks := make([]string, len(vocab.Values))
for i := range vocab.Values {
token, err := proc.Decode([]int32{int32(i)})
if err != nil {
return nil, err
}
decodedToks[i] = token
}
so.decodedToks = decodedToks
runtime.ReadMemStats(&m)
after := m.Alloc
fmt.Printf("Token decode memory usage = %.2f MB\n", float64(after-before)/(1024*1024))
fmt.Printf("Token decode time = %v\n", time.Since(start))
fmt.Println("--------------------------------")
fmt.Println("SOSampler")
fmt.Println("--------------------------------")
// Benchmark this section
start = time.Now()
runtime.ReadMemStats(&m)
before = m.Alloc
// TODO: still messed up
// TODO: recursion use case
// key masks
for _, prop := range so.schema.Properties {
node := so.propToNodeMap[prop.Name]
// propName -> node
curState := node.State
fromNode := node
so.pdaSampler.CreateMask(fromNode)
for curState == StateInStructuredKey {
// there is only one edge
for r, toNode := range fromNode.TransitionEdges {
fmt.Println("rune", r, "edge", toNode.State)
so.pdaSampler.CreateMask(toNode)
fmt.Printf("created mask for %c\n", r)
curState = toNode.State
fmt.Println("next state", curState)
// TODO: theres an extra gen for " right now
fromNode = toNode
}
}
if curState != StateInColon {
return nil, fmt.Errorf("expected state to be StateInColon, got %v", curState)
}
// so.pdaSampler.CreateMask(fromNode)
fromNode = fromNode.TransitionEdges[' ']
so.pdaSampler.CreateMask(fromNode)
curState = fromNode.State
for _, toNode := range fromNode.TransitionEdges {
fmt.Println("toNode", toNode.State)
}
}
// runtime.ReadMemStats(&m)
// after = m.Alloc
// fmt.Printf("Mask creation memory usage = %.2f MB\n", float64(after-before)/(1024*1024))
// fmt.Printf("Mask creation time = %v\n", time.Since(start))
// fmt.Println("--------------------------------")
return so, nil
}
func (s *JSONSampler) schemaToGraph() {
schemaType := s.schema.EffectiveType()
switch schemaType {
case "object":
// TODO: see if we need to connect these to the JSON graph
// each prop is a key
for _, prop := range s.schema.Properties {
// name of key
name := prop.Name
keyNode := &PDA{
State: StateInStructuredKey, // this is unchanging, will impact sampling
TransitionEdges: make(map[rune]*PDA),
MaskTokenIDToNode: make(map[int32]*PDA),
}
prevNode := keyNode
for _, r := range name {
runeNode := &PDA{
State: StateInStructuredKey, // this is unchanging, will impact sampling
TransitionEdges: make(map[rune]*PDA),
MaskTokenIDToNode: make(map[int32]*PDA),
}
// fmt.Println("runeNode created", runeNode.State)
// fmt.Printf("runeNode created %c\n", r)
// since the nodes are allocated on the heap, the connections will still map
prevNode.TransitionEdges[r] = runeNode
prevNode = runeNode
}
// point to end of object key node after all chars are done
// prevNode.TransitionEdges['"'] = s.pdaSampler.stateToNodeMap[StateInObjectKeyEnd]
// link to value node
// Create a node for the end of the key (after the closing quote)
stringEndNode := &PDA{
State: StateInStructuredKey,
TransitionEdges: make(map[rune]*PDA),
MaskTokenIDToNode: make(map[int32]*PDA),
}
prevNode.TransitionEdges['"'] = stringEndNode
prevNode = stringEndNode
// Add transition for colon after key
colonNode := &PDA{
State: StateInColon,
TransitionEdges: make(map[rune]*PDA),
MaskTokenIDToNode: make(map[int32]*PDA),
}
prevNode.TransitionEdges[':'] = colonNode
prevNode = colonNode
// Add transition for space after colon
spaceNode := &PDA{
State: StateInSpaceToValue,
TransitionEdges: make(map[rune]*PDA),
MaskTokenIDToNode: make(map[int32]*PDA),
}
prevNode.TransitionEdges[' '] = spaceNode
prevNode = spaceNode
value := prop.Type
switch value {
case "object":
fmt.Println("object under key: ", name)
case "array":
fmt.Println("array under key: ", name)
case "string":
fmt.Println("string under key: ", name)
prevNode.TransitionEdges['"'] = s.pdaSampler.stateToNodeMap[StateInString]
case "number":
fmt.Println("number under key: ", name)
for _, r := range validNumberRunes {
prevNode.TransitionEdges[r] = s.pdaSampler.stateToNodeMap[StateInNumber]
}
case "boolean":
fmt.Println("boolean under key: ", name)
prevNode.TransitionEdges['t'] = s.pdaSampler.stateToNodeMap[StateInBool]
prevNode.TransitionEdges['f'] = s.pdaSampler.stateToNodeMap[StateInBool]
prevNode.TransitionEdges['n'] = s.pdaSampler.stateToNodeMap[StateInNull]
}
// points to start of the key
s.propToNodeMap[name] = keyNode
fmt.Println("name", name, "keyNode", keyNode.State)
}
}
// TODO: do values + recursion
}
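schemaToGraph forces each schema key rune by rune through a chain of freshly allocated nodes; a standalone sketch of building such a chain:

package main

import "fmt"

type node struct {
	edges map[rune]*node
}

// chainForKey builds one node per rune of a schema key, as
// schemaToGraph does, so generation is forced through the key
// character by character.
func chainForKey(key string) *node {
	head := &node{edges: map[rune]*node{}}
	prev := head
	for _, r := range key {
		next := &node{edges: map[rune]*node{}}
		prev.edges[r] = next
		prev = next
	}
	return head
}

func main() {
	head := chainForKey("name")
	n, depth := head, 0
	for len(n.edges) > 0 {
		for _, next := range n.edges {
			n = next
		}
		depth++
	}
	fmt.Println("chain length:", depth) // 4
}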
func (s *JSONSampler) Apply(logits []float32) ([]float32, error) {
if s.schema == nil {
return s.pdaSampler.Apply(logits)
}
switch s.pdaSampler.curNode.State {
// TODO: doesn't account for the multi-rune case
case StateInObjectKey:
if s.propIdx > len(s.schema.Properties)-1 {
return nil, fmt.Errorf("propIdx out of bounds")
}
// fmt.Println("in object key - structured outputs")
// TODO: this tracking should probably be coming from a stack to track nested objects
// simple case
s.propIdx++
fmt.Println("propIdx", s.propIdx)
prop := s.schema.Properties[s.propIdx]
fmt.Println("prop", prop.Name)
s.pdaSampler.curNode = s.propToNodeMap[prop.Name]
fmt.Println("changed curNode state to", s.pdaSampler.curNode.State)
logits, err := s.pdaSampler.maskLogits(logits, s.pdaSampler.curNode)
if err != nil {
return nil, err
}
return logits, nil
default:
// Will only happen for the last prop - can also be precomputed.
if s.propIdx == len(s.schema.Properties)-1 {
// TODO: if propIdx is incremented here, we also know we're in the last value
switch s.pdaSampler.curNode.State {
case StateInObjectEnd:
fmt.Println("<<<<< in obj end - generating mask for", s.pdaSampler.curNode.State)
s.pdaSampler.curNode.TransitionEdges = make(map[rune]*PDA)
s.pdaSampler.curNode = NewPDANode(StateTerminate)
s.propIdx++
// TODO: this needs to be optimized in some way, computing mask on the fly is expensive
case StateInNumber, StateInString, StateInBool, StateInNull, StateInListEnd:
fmt.Println("<<<<< last prop - generating mask for", s.pdaSampler.curNode.State)
delete(s.pdaSampler.curNode.TransitionEdges, ',')
s.pdaSampler.curNode.MaskTokenIDToNode = make(map[int32]*PDA)
s.pdaSampler.CreateMask(s.pdaSampler.curNode)
s.propIdx++
}
}
return s.pdaSampler.Apply(logits)
}
}
func (s *JSONSampler) UpdateState(tokenSlice []int32) ([]int32, error) {
tokenSlice, err := s.pdaSampler.UpdateState(tokenSlice)
if err != nil {
return nil, err
}
if s.schema == nil {
// Don't need to update state for unconstrained JSON sampling
return tokenSlice, nil
}
switch s.pdaSampler.curNode.State {
case StateInObjectKey:
s.propIdx++
fmt.Println("propIdx", s.propIdx)
prop := s.schema.Properties[s.propIdx]
fmt.Println("prop", prop.Name)
s.pdaSampler.curNode = s.propToNodeMap[prop.Name]
// TODO: this does not work - mike
// str, err := s.pdaSampler.proc.Decode(tokenSlice)
// if err != nil {
// return nil, err
// }
// fmt.Println("str", str)
return tokenSlice, nil
default:
return tokenSlice, nil
}
}

View File

@@ -1,352 +0,0 @@
package sample
import (
"fmt"
"math"
"slices"
"github.com/ollama/ollama/model"
)
type PythonState int
const (
PythonStateStart PythonState = iota
StateInFunction
StateInFunctionArgs
StateInFunctionArgsType
StateInFunctionEnd
PStateInString
PStateInStringEnd
PStateInNumber
PStateInList
PStateInListEnd
PStateInDict
PStateInDictEnd
PStateInTuple
PStateInTupleEnd
PStateTerminate
)
func (s PythonState) String() string {
switch s {
case PythonStateStart:
return "PythonStateStart"
case StateInFunction:
return "StateInFunction"
case StateInFunctionArgs:
return "StateInFunctionArgs"
case StateInFunctionArgsType:
return "StateInFunctionArgsType"
case StateInFunctionEnd:
return "StateInFunctionEnd"
case PStateInString:
return "PStateInString"
case PStateInStringEnd:
return "PStateInStringEnd"
case PStateInNumber:
return "PStateInNumber"
case PStateInList:
return "PStateInList"
case PStateInListEnd:
return "PStateInListEnd"
case PStateInDict:
return "PStateInDict"
case PStateInDictEnd:
return "PStateInDictEnd"
case PStateInTuple:
return "PStateInTuple"
case PStateInTupleEnd:
return "PStateInTupleEnd"
case PStateTerminate:
return "PStateTerminate"
default:
return fmt.Sprintf("PythonState(%d)", s)
}
}
var PythonStates = []PythonState{
PythonStateStart,
StateInFunction,
StateInFunctionArgs,
StateInFunctionArgsType,
StateInFunctionEnd,
PStateInString,
PStateInStringEnd,
PStateInNumber,
PStateInList,
PStateInListEnd,
PStateInDict,
PStateInDictEnd,
PStateInTuple,
PStateInTupleEnd,
PStateTerminate,
}
type Node struct {
State PythonState
TransitionEdges map[rune]*Node
MaskTokenIDToNode map[int32]*Node
}
func NewNode(state PythonState) *Node {
return &Node{
State: state,
TransitionEdges: make(map[rune]*Node),
MaskTokenIDToNode: make(map[int32]*Node),
}
}
type PythonFunction struct {
Name string
Args []string
Types []string
}
type PythonSampler struct {
stateToNodes map[PythonState]*Node
proc model.TextProcessor
decodedToks []string
curNode *Node
completed int
functions []PythonFunction
}
func (s *PythonSampler) Init(functions []PythonFunction, proc model.TextProcessor) error {
s.proc = proc
s.functions = functions
decodedToks := make([]string, len(proc.Vocab().Values))
for i := range proc.Vocab().Values {
token, err := proc.Decode([]int32{int32(i)})
if err != nil {
return err
}
decodedToks[i] = token
}
s.decodedToks = decodedToks
s.BuildGraph()
for _, function := range functions {
prevNode := s.stateToNodes[PythonStateStart]
for _, r := range function.Name {
nextNode := NewNode(StateInFunction)
prevNode.TransitionEdges[r] = nextNode
if err := s.CreateMask(nextNode); err != nil {
return err
}
fmt.Println("prevNode", prevNode.State)
fmt.Printf("transition edge: %q\n", r)
fmt.Println("nextNode", nextNode.State)
prevNode = nextNode
}
prevNode.TransitionEdges['('] = s.stateToNodes[StateInFunctionArgs]
s.CreateMask(prevNode)
prevNode = s.stateToNodes[StateInFunctionArgs]
for i, arg := range function.Args {
for _, r := range arg {
nextNode := NewNode(StateInFunctionArgs)
prevNode.TransitionEdges[r] = nextNode
s.CreateMask(prevNode)
prevNode = nextNode
}
prevNode.TransitionEdges[','] = s.stateToNodes[StateInFunctionArgs]
// prevNode = s.stateToNodes[StateInFunctionArgs]
prevNode.TransitionEdges['='] = NewNode(StateInFunctionArgsType)
s.CreateMask(prevNode)
prevNode = prevNode.TransitionEdges['=']
switch function.Types[i] {
case "string":
prevNode.TransitionEdges['"'] = s.stateToNodes[PStateInString]
s.CreateMask(prevNode.TransitionEdges['"'])
case "number":
prevNode.TransitionEdges['"'] = s.stateToNodes[PStateInNumber]
s.CreateMask(prevNode.TransitionEdges['"'])
}
}
}
s.curNode = s.stateToNodes[PythonStateStart]
fmt.Println("curNode", s.curNode.State)
fmt.Println("transition edges", s.curNode.TransitionEdges)
if err := s.CreateMask(s.curNode); err != nil {
return err
}
fmt.Println("maskTokenIDToNode", s.curNode.MaskTokenIDToNode)
for tokenID, node := range s.curNode.MaskTokenIDToNode {
fmt.Printf("tokenID: %d, node: %v\n", s.decodedToks[tokenID], node.State)
}
return nil
}
func (s *PythonSampler) BuildGraph() error {
s.stateToNodes = make(map[PythonState]*Node)
for _, state := range PythonStates {
s.stateToNodes[state] = NewNode(state)
}
for _, state := range s.stateToNodes {
if err := s.CreateMask(state); err != nil {
return err
}
}
// String
s.stateToNodes[PStateInString].TransitionEdges[rune(-1)] = s.stateToNodes[PStateInString]
s.stateToNodes[PStateInString].TransitionEdges['"'] = s.stateToNodes[PStateInStringEnd]
// String end
s.stateToNodes[PStateInStringEnd].TransitionEdges[','] = s.stateToNodes[StateInFunctionArgs]
// s.stateToNodes[PStateInStringEnd].TransitionEdges[')'] = s.stateToNodes[PStateTerminate]
// Number
for _, r := range validNumberRunes {
s.stateToNodes[PStateInNumber].TransitionEdges[r] = s.stateToNodes[PStateInNumber]
}
s.stateToNodes[PStateInNumber].TransitionEdges[')'] = s.stateToNodes[PStateTerminate]
s.stateToNodes[PStateInNumber].TransitionEdges[','] = s.stateToNodes[StateInFunctionArgs]
s.stateToNodes[PStateInNumber].TransitionEdges[' '] = s.stateToNodes[StateInFunctionArgs]
return nil
}
func (s *PythonSampler) ApplyMask(logits []float32) ([]float32, error) {
if s.curNode.State == PStateTerminate {
logits, err := finish(s, logits)
if err != nil {
return nil, err
}
return logits, nil
}
logits, err := s.maskLogits(logits, s.curNode)
if err != nil {
return nil, err
}
return logits, nil
}
func (s *PythonSampler) UpdateState(token int32) error {
mappedString, err := s.proc.Decode([]int32{token})
if err != nil {
return err
}
fmt.Printf(">>> mappedString: %q\n", mappedString)
if s.curNode.State == PStateTerminate {
if s.proc.Is(token, model.SpecialEOS) {
return nil
}
}
nextNode, ok := s.curNode.MaskTokenIDToNode[token]
if !ok {
return fmt.Errorf("invalid token: %q", mappedString)
}
if mappedString == "\"" {
if s.curNode.State == PStateInStringEnd {
s.completed++
}
if s.completed == len(s.functions) {
s.curNode.TransitionEdges[')'] = s.stateToNodes[PStateTerminate]
s.CreateMask(s.curNode)
}
}
s.curNode = nextNode
fmt.Println("curNode", s.curNode.State)
for r, node := range s.curNode.TransitionEdges {
fmt.Printf("transition edge: %q -> %v\n", r, node.State)
}
if err := s.CreateMask(s.curNode); err != nil {
return err
}
return nil
}
func (s *PythonSampler) CreateMask(node *Node) error {
if node == nil {
return fmt.Errorf("node cannot be nil")
}
for i := range s.decodedToks {
token := s.decodedToks[i]
// Skip EOS/BOS tokens and empty tokens since they are not valid in the constrained output
if s.proc.Is(int32(i), model.SpecialEOS) || s.proc.Is(int32(i), model.SpecialBOS) || token == "" || token == "\"\"" {
continue
}
curNode := node
valid := true
consumedSpecialRunes := make(map[rune]bool)
for _, r := range token {
curNode, valid = isRValid(r, curNode, consumedSpecialRunes)
if curNode == nil || !valid {
break
}
}
if valid {
if curNode.State == StateInFunction {
// fmt.Println("cm curNode", curNode.State)
// fmt.Println("cm token", s.decodedToks[i])
}
node.MaskTokenIDToNode[int32(i)] = curNode
}
}
return nil
}
func isRValid(r rune, curNode *Node, consumedSpecialRunes map[rune]bool) (*Node, bool) {
if consumedSpecialRunes[r] {
return nil, false
}
specialRune := slices.Contains(stringInvalidRunes, r)
if specialRune {
if curNode.State == PStateInString || curNode.State == PStateInStringEnd {
return nil, false
}
}
// Check for specific rune transition
if nextNode, ok := curNode.TransitionEdges[r]; ok {
// fmt.Println("next node", nextNode)
if specialRune {
if curNode.State == nextNode.State {
return nil, false
}
consumedSpecialRunes[r] = true
}
return nextNode, true
}
// Check for sentinel value - if present, any rune is valid
if nextNode, ok := curNode.TransitionEdges[rune(-1)]; ok {
return nextNode, true
}
return nil, false
}
func (s *PythonSampler) maskLogits(logits []float32, node *Node) ([]float32, error) {
// Create a new slice with same length as logits, initialized to -Inf
maskedLogits := make([]float32, len(logits))
for i := range maskedLogits {
maskedLogits[i] = float32(math.Inf(-1))
}
// Only update values for valid token IDs from the mask map
for tokenID := range node.MaskTokenIDToNode {
if int(tokenID) < len(logits) {
maskedLogits[tokenID] = logits[tokenID]
}
}
return maskedLogits, nil
}
func finish(s *PythonSampler, logits []float32) ([]float32, error) {
for i := range logits {
if s.proc.Is(int32(i), model.SpecialEOS) {
logits[i] = 1.0
} else {
logits[i] = float32(math.Inf(-1))
}
}
return logits, nil
}
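
Taken together, ApplyMask, UpdateState, and CreateMask implement constrained decoding: every vocabulary token is walked through the state machine once per state, the survivors are recorded in MaskTokenIDToNode, and everything else is driven to -Inf so softmax assigns it zero probability. A minimal, self-contained sketch of that masking idea — toy vocabulary and transition table, not the types above:

package main

import (
	"fmt"
	"math"
)

// buildMask walks each candidate token string through a toy transition
// table and keeps only tokens whose every rune has an edge, mirroring
// how CreateMask above walks decoded tokens through TransitionEdges.
func buildMask(vocab []string, edges map[rune]bool) map[int]bool {
	mask := make(map[int]bool)
	for id, tok := range vocab {
		valid := true
		for _, r := range tok {
			if !edges[r] {
				valid = false
				break
			}
		}
		if valid {
			mask[id] = true
		}
	}
	return mask
}

func main() {
	vocab := []string{"12", "1a", "3", ")"}
	digits := map[rune]bool{'0': true, '1': true, '2': true, '3': true}
	mask := buildMask(vocab, digits)

	logits := []float64{1.0, 2.0, 0.5, 0.1}
	probs := make([]float64, len(logits))
	var sum float64
	for i := range logits {
		if !mask[i] {
			logits[i] = math.Inf(-1) // exp(-Inf) == 0, so softmax zeroes it
		}
		probs[i] = math.Exp(logits[i])
		sum += probs[i]
	}
	for i := range probs {
		probs[i] /= sum
	}
	fmt.Println(probs) // "1a" and ")" end up with probability 0
}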

View File

@@ -168,53 +168,27 @@ func TestTopP(t *testing.T) {
 	softmax(tokens)
 	tokens = topK(tokens, 20)

-	// Test with very high p value
-	got := topP(tokens, 1.0)
-	// Should keep all tokens since p is 1
-	if len(got) != len(input) {
-		t.Errorf("topP(1.0): should keep all tokens, got %d, want %d", len(got), len(input))
-	}
-
-	// Test with normal p value
-	got = topP(tokens, 0.95)
-	if len(got) > 3 {
-		t.Logf("got: %v", got)
+	// Then apply topP
+	tokens = topP(tokens, 0.95)
+
+	// Should keep tokens until cumsum > 0.95
+	if len(tokens) > 3 {
+		t.Errorf("topP(0.95): kept too many tokens: got %d", len(tokens))
+		t.Logf("got: %v", tokens)
 	}

 	// Test edge case - ensure at least one token remains
-	input = []float32{-1e6, -1e6, -1e7}
+	input = []float32{-1e6, -1e6, -1e6} // One dominant token
 	tokens = toTokens(input)
 	tokens = topK(tokens, 20)
 	softmax(tokens)
-	got = topP(tokens, 0.0)
-	if len(got) < 1 {
+	tokens = topP(tokens, 0.0) // Very small p
+	if len(tokens) < 1 {
 		t.Error("topP should keep at least one token")
 	}
-
-	// Test with zero p value
-	got = topP(tokens, 0.0)
-	// Should keep only the highest probability token
-	if len(got) != 1 {
-		t.Errorf("topP(0.0): should keep only one token, got %d", len(got))
-		t.Logf("got: %v", got)
-	}
-
-	tokens = toTokens(input)
-	tokens = topK(tokens, 20)
-	softmax(tokens)
-	got = topP(tokens, 1e-10)
-	if len(got) == 0 {
-		t.Errorf("topP(1e-10): should keep at least one token, got %d", len(got))
-		t.Logf("got: %v", got)
-	}
 }

 func TestMinP(t *testing.T) {
-	input := []float32{-2, 0, -1, -3, 2, 1, 4, 3}
+	input := []float32{-3, -2, -1, 0, 1, 2, 4, 3}
 	tokens := toTokens(input)

 	// First apply temperature and softmax
@@ -251,48 +225,30 @@ func TestMinP(t *testing.T) {
 		t.Logf("got: %v", tokens)
 	}

 	// Test with single token
 	tokens = toTokens(input[:1])
 	tokens = topK(tokens, 20)
 	softmax(tokens)
 	tokens = minP(tokens, 0.1)

 	// Should keep only the highest probability token
 	if len(tokens) != 1 {
 		t.Errorf("minP(0.1): should return single token, got %d", len(tokens))
 		t.Logf("got: %v", tokens)
 	}

-	got := minP(tokens, 1.0)
-	if len(got) != 1 {
-		t.Errorf("minP(1.0): should keep all tokens, got %d, want %d", len(got), len(tokens))
-	}
-
-	// Test with normal p value
-	got = minP(tokens, 0.2)
-	// Should keep tokens with prob >= 0.2 * max_prob
-	if len(got) > 3 {
-		t.Errorf("minP(0.2): kept too many tokens: got %d", len(got))
-		t.Logf("got: %v", got)
-	}
-
-	// Test with zero p value
-	got = minP(tokens, 0.0)
-	// Should keep only the highest probability token
-	if len(got) != len(tokens) {
-		t.Errorf("minP(0.0): should keep only one token, got %d", len(got))
-		t.Logf("got: %v", got)
-	}
+	input = []float32{1e-10, 1e-10, 1e-10}
+	tokens = toTokens(input)
+	softmax(tokens)
+	tokens = minP(tokens, 1.0)
+	if len(tokens) < 1 {
+		t.Error("minP should keep at least one token even with extreme probabilities")
+	}
 }

 func TestSortLogits(t *testing.T) {
 	input := []float32{0.026986899, 0.043722924, 0.036774673, 0.27755088, 0.0046718004, 0.08582123, 0.20409796, 0.00412893, 0.15720603, 0.045046154, 0.0030491839, 0.01681367}
 	tokens := toTokens(input)

+	tokens = topK(tokens, 20)

 	for i := 1; i < len(tokens); i++ {
 		if tokens[i].value > tokens[i-1].value {
 			t.Errorf("sortLogits: tokens not sorted in descending order at index %d: %f > %f",
 				i, tokens[i].value, tokens[i-1].value)
 		}
 	}

 	want := []float32{0.27755088, 0.20409796, 0.15720603, 0.08582123, 0.045046154, 0.043722924, 0.036774673, 0.026986899, 0.01681367, 0.0046718004, 0.00412893, 0.0030491839}
 	compareLogits(t, "sortLogits", want, tokens)
 }
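
After this change the tests pin down a fixed pipeline of softmax, then topK, then topP. For reference, the top-p (nucleus) rule they exercise keeps the smallest probability-sorted prefix whose cumulative mass exceeds p, and never drops the final surviving token. A rough sketch with a hypothetical token type, not the package's real one:

package main

import (
	"fmt"
	"sort"
)

type token struct {
	id   int
	prob float32 // assumed already softmax-normalized
}

// topP keeps tokens, highest probability first, until the running total
// exceeds p; at least one token always survives, matching the edge case
// the tests above assert.
func topP(tokens []token, p float32) []token {
	sort.Slice(tokens, func(i, j int) bool { return tokens[i].prob > tokens[j].prob })
	var cum float32
	for i, t := range tokens {
		cum += t.prob
		if cum > p {
			return tokens[:i+1]
		}
	}
	return tokens
}

func main() {
	ts := []token{{0, 0.6}, {1, 0.3}, {2, 0.08}, {3, 0.02}}
	fmt.Println(topP(ts, 0.95)) // 0.6+0.3+0.08 > 0.95, so three tokens survive
	fmt.Println(topP(ts, 0.0))  // degenerate p still keeps one token
}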
func BenchmarkTransforms(b *testing.B) {

View File

@@ -37,6 +37,7 @@ import (
 	"golang.org/x/sync/errgroup"

 	"github.com/ollama/ollama/server/internal/cache/blob"
 	"github.com/ollama/ollama/server/internal/internal/backoff"
+	"github.com/ollama/ollama/server/internal/internal/names"

 	_ "embed"
@@ -59,11 +60,6 @@ var (
 	// ErrCached is passed to [Trace.PushUpdate] when a layer already
 	// exists. It is a non-fatal error and is never returned by [Registry.Push].
 	ErrCached = errors.New("cached")
-
-	// ErrIncomplete is returned by [Registry.Pull] when a model pull was
-	// incomplete due to one or more layer download failures. Users that
-	// want specific errors should use [WithTrace].
-	ErrIncomplete = errors.New("incomplete")
 )

 // Defaults
// Defaults
@@ -217,6 +213,12 @@ type Registry struct {
// request. If zero, [DefaultChunkingThreshold] is used.
ChunkingThreshold int64
// MaxChunkSize is the maximum size of a chunk to download. If zero,
// the default is [DefaultMaxChunkSize].
//
// It is only used when a layer is larger than [MaxChunkingThreshold].
MaxChunkSize int64
// Mask, if set, is the name used to convert non-fully qualified names
// to fully qualified names. If empty, [DefaultMask] is used.
Mask string
@@ -276,19 +278,8 @@ func DefaultRegistry() (*Registry, error) {
 func UserAgent() string {
 	buildinfo, _ := debug.ReadBuildInfo()

-	version := buildinfo.Main.Version
-	if version == "(devel)" {
-		// When using `go run .` the version is "(devel)". This is seen
-		// as an invalid version by ollama.com and so it defaults to
-		// "needs upgrade" for some requests, such as pulls. These
-		// checks can be skipped by using the special version "v0.0.0",
-		// so we set it to that here.
-		version = "v0.0.0"
-	}
-
 	return fmt.Sprintf("ollama/%s (%s %s) Go/%s",
-		version,
+		buildinfo.Main.Version,
 		runtime.GOARCH,
 		runtime.GOOS,
 		runtime.Version(),
@@ -434,14 +425,13 @@ func canRetry(err error) bool {
 //
 // It always calls update with a nil error.
 type trackingReader struct {
-	l      *Layer
-	r      io.Reader
-	update func(l *Layer, n int64, err error)
+	r io.Reader
+	n *atomic.Int64
 }

 func (r *trackingReader) Read(p []byte) (n int, err error) {
 	n, err = r.r.Read(p)
-	r.update(r.l, int64(n), nil)
+	r.n.Add(int64(n))
 	return
 }
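
The new trackingReader drops the per-layer callback in favor of an atomic byte counter that the progress loop can poll. The same wrapper pattern in isolation, with illustrative names only:

package main

import (
	"fmt"
	"io"
	"strings"
	"sync/atomic"
)

// countingReader tallies bytes as they pass through Read, like the
// trackingReader above, so another goroutine can poll download progress.
type countingReader struct {
	r io.Reader
	n *atomic.Int64
}

func (c *countingReader) Read(p []byte) (n int, err error) {
	n, err = c.r.Read(p)
	c.n.Add(int64(n))
	return
}

func main() {
	var n atomic.Int64
	src := strings.NewReader("some blob bytes")
	if _, err := io.Copy(io.Discard, &countingReader{r: src, n: &n}); err != nil {
		panic(err)
	}
	fmt.Println(n.Load(), "bytes observed") // 15 bytes observed
}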
@@ -457,11 +447,6 @@ func (r *Registry) Pull(ctx context.Context, name string) error {
 	if err != nil {
 		return err
 	}
-
-	// TODO(bmizerany): decide if this should be considered valid. Maybe
-	// server-side we special case '{}' to have some special meaning? Maybe
-	// "archiving" a tag (which is how we reason about it in the registry
-	// already, just with a different twist).
 	if len(m.Layers) == 0 {
 		return fmt.Errorf("%w: no layers", ErrManifestInvalid)
 	}
@@ -471,7 +456,11 @@ func (r *Registry) Pull(ctx context.Context, name string) error {
 		return err
 	}

+	// TODO(bmizerany): work to remove the need to do this
+	exists := func(l *Layer) bool {
+		info, err := c.Get(l.Digest)
+		return err == nil && info.Size == l.Size
+	}
 	layers := m.Layers
 	if m.Config != nil && m.Config.Digest.IsValid() {
 		layers = append(layers, m.Config)
@@ -479,97 +468,99 @@ func (r *Registry) Pull(ctx context.Context, name string) error {
 	// Send initial layer trace events to allow clients to have an
 	// understanding of work to be done before work starts.
 	var expected int64
 	t := traceFromContext(ctx)
-	for _, l := range layers {
+	skip := make([]bool, len(layers))
+	for i, l := range layers {
 		t.update(l, 0, nil)
 		expected += l.Size
+		if exists(l) {
+			skip[i] = true
+			t.update(l, l.Size, ErrCached)
+		}
 	}

-	var received atomic.Int64
-	var g errgroup.Group
+	g, ctx := errgroup.WithContext(ctx)
 	g.SetLimit(r.maxStreams())
-	for _, l := range layers {
-		info, err := c.Get(l.Digest)
-		if err == nil && info.Size == l.Size {
-			received.Add(l.Size)
-			t.update(l, l.Size, ErrCached)
+	for i, l := range layers {
+		if skip[i] {
 			continue
 		}

-		var wg sync.WaitGroup
 		chunked, err := c.Chunked(l.Digest, l.Size)
 		if err != nil {
 			t.update(l, 0, err)
 			continue
 		}
+		defer chunked.Close()

+		var progress atomic.Int64
 		for cs, err := range r.chunksums(ctx, name, l) {
 			if err != nil {
-				// Chunksum stream interrupted. Note in trace
-				// log and let in-flight downloads complete.
-				// This will naturally trigger ErrIncomplete
-				// since received < expected bytes.
-				t.update(l, 0, err)
+				t.update(l, progress.Load(), err)
 				break
 			}

-			wg.Add(1)
 			g.Go(func() (err error) {
-				defer func() {
-					if err == nil {
-						received.Add(cs.Chunk.Size())
-					} else {
-						err = fmt.Errorf("error downloading %s: %w", cs.Digest.Short(), err)
-					}
-					wg.Done()
-				}()
-
-				req, err := http.NewRequestWithContext(ctx, "GET", cs.URL, nil)
-				if err != nil {
-					return err
-				}
-				req.Header.Set("Range", fmt.Sprintf("bytes=%d-%d", cs.Chunk.Start, cs.Chunk.End))
-				res, err := sendRequest(r.client(), req)
-				if err != nil {
-					return err
-				}
-				defer res.Body.Close()
-
-				body := &trackingReader{l: l, r: res.Body, update: t.update}
-				return chunked.Put(cs.Chunk, cs.Digest, body)
+				defer func() { t.update(l, progress.Load(), err) }()
+
+				for _, err := range backoff.Loop(ctx, 3*time.Second) {
+					if err != nil {
+						return err
+					}
+					err := func() error {
+						req, err := http.NewRequestWithContext(ctx, "GET", cs.URL, nil)
+						if err != nil {
+							return err
+						}
+						req.Header.Set("Range", fmt.Sprintf("bytes=%d-%d", cs.Chunk.Start, cs.Chunk.End))
+						res, err := sendRequest(r.client(), req)
+						if err != nil {
+							return err
+						}
+						defer res.Body.Close()

+						// Count bytes towards
+						// progress, as they arrive, so
+						// that our bytes piggyback
+						// other chunk updates on
+						// completion.
+						//
+						// This tactic is enough to
+						// show "smooth" progress given
+						// the current CLI client. In
+						// the near future, the server
+						// should report download rate
+						// since it knows better than
+						// a client that is measuring
+						// rate based on wall-clock
+						// time-since-last-update.
+						body := &trackingReader{r: res.Body, n: &progress}
+						err = chunked.Put(cs.Chunk, cs.Digest, body)
+						if err != nil {
+							return err
+						}
+						return nil
+					}()
+					if !canRetry(err) {
+						return err
+					}
+				}
+				return nil
 			})
 		}
-
-		// Close writer immediately after downloads finish, not at Pull
-		// exit. Using defer would keep file descriptors open until all
-		// layers complete, potentially exhausting system limits with
-		// many layers.
-		//
-		// The WaitGroup tracks when all chunks finish downloading,
-		// allowing precise writer closure in a background goroutine.
-		// Each layer briefly uses one extra goroutine while at most
-		// maxStreams()-1 chunks download in parallel.
-		//
-		// This caps file descriptors at maxStreams() instead of
-		// growing with layer count.
-		g.Go(func() error {
-			wg.Wait()
-			chunked.Close()
-			return nil
-		})
 	}

 	if err := g.Wait(); err != nil {
 		return err
 	}
-	if received.Load() != expected {
-		return fmt.Errorf("%w: received %d/%d", ErrIncomplete, received.Load(), expected)
-	}

 	// store the manifest blob
 	md := blob.DigestFromBytes(m.Data)
 	if err := blob.PutBytes(c, md, m.Data); err != nil {
 		return err
 	}

 	// commit the manifest with a link
 	return c.Link(m.Name, md)
 }
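
Under both versions of Pull, a layer is fetched as independent byte ranges downloaded concurrently under an errgroup limit. A stripped-down sketch of that ranged-GET pattern, with a hypothetical URL and none of the real code's retries, digest verification, or chunk index (requires Go 1.22+ for per-iteration loop variables):

package main

import (
	"context"
	"fmt"
	"io"
	"net/http"

	"golang.org/x/sync/errgroup"
)

// fetchRange downloads bytes [start, end] (inclusive, per RFC 7233)
// directly into its slice of the destination buffer.
func fetchRange(ctx context.Context, url string, start, end int64, dst []byte) error {
	req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
	if err != nil {
		return err
	}
	req.Header.Set("Range", fmt.Sprintf("bytes=%d-%d", start, end))
	res, err := http.DefaultClient.Do(req)
	if err != nil {
		return err
	}
	defer res.Body.Close()
	_, err = io.ReadFull(res.Body, dst)
	return err
}

func main() {
	const url = "http://blob.store/v2/library/test/blobs/example" // hypothetical
	const size, chunk = 10 << 20, 1 << 20

	buf := make([]byte, size)
	g, ctx := errgroup.WithContext(context.Background())
	g.SetLimit(4) // bounded parallelism, like maxStreams()
	for off := int64(0); off < size; off += chunk {
		end := min(off+chunk, size) - 1
		g.Go(func() error { return fetchRange(ctx, url, off, end, buf[off:end+1]) })
	}
	if err := g.Wait(); err != nil {
		fmt.Println("pull incomplete:", err)
	}
}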

View File

@@ -17,7 +17,6 @@ import (
"reflect"
"slices"
"strings"
"sync"
"testing"
"time"
@@ -25,28 +24,6 @@ import (
"github.com/ollama/ollama/server/internal/testutil"
)
func ExampleRegistry_cancelOnFirstError() {
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
ctx = WithTrace(ctx, &Trace{
Update: func(l *Layer, n int64, err error) {
if err != nil {
// Discontinue pulling layers if there is an
// error instead of continuing to pull more
// data.
cancel()
}
},
})
var r Registry
if err := r.Pull(ctx, "model"); err != nil {
// panic for demo purposes
panic(err)
}
}
func TestManifestMarshalJSON(t *testing.T) {
// All manifests should contain an "empty" config object.
var m Manifest
@@ -79,21 +56,21 @@ func (rr recordRoundTripper) RoundTrip(req *http.Request) (*http.Response, error
 // newClient constructs a cache with predefined manifests for testing. The manifests are:
 //
 //	empty: no data
 //	zero: no layers
 //	single: one layer with the contents "exists"
 //	multiple: two layers with the contents "exists" and "here"
 //	notfound: a layer that does not exist in the cache
 //	null: one null layer (e.g. [null])
 //	sizemismatch: one valid layer, and one with a size mismatch (file size is less than the reported size)
 //	invalid: a layer with invalid JSON data
 //
 // Tests that want to ensure the client does not communicate with the upstream
 // registry should pass a nil handler, which will cause a panic if
 // communication is attempted.
 //
 // To simulate a network error, pass a handler that returns a 499 status code.
-func newClient(t *testing.T, upstreamRegistry http.HandlerFunc) (*Registry, *blob.DiskCache) {
+func newClient(t *testing.T, h http.HandlerFunc) (*Registry, *blob.DiskCache) {
 	t.Helper()

 	c, err := blob.Open(t.TempDir())
@@ -111,7 +88,7 @@ func newClient(t *testing.T, upstreamRegistry http.HandlerFunc) (*Registry, *blo
 	r := &Registry{
 		Cache: c,
 		HTTPClient: &http.Client{
-			Transport: recordRoundTripper(upstreamRegistry),
+			Transport: recordRoundTripper(h),
 		},
 	}
@@ -790,79 +767,3 @@ func TestUnlink(t *testing.T) {
 		}
 	})
 }
-
-func TestPullChunksums(t *testing.T) {
-	check := testutil.Checker(t)
-
-	content := "hello"
-	var chunksums string
-	contentDigest := func() blob.Digest {
-		return blob.DigestFromBytes(content)
-	}
-	rc, c := newClient(t, func(w http.ResponseWriter, r *http.Request) {
-		switch {
-		case strings.Contains(r.URL.Path, "/manifests/latest"):
-			fmt.Fprintf(w, `{"layers":[{"digest":%q,"size":%d}]}`, contentDigest(), len(content))
-		case strings.HasSuffix(r.URL.Path, "/chunksums/"+contentDigest().String()):
-			loc := fmt.Sprintf("http://blob.store/v2/library/test/blobs/%s", contentDigest())
-			w.Header().Set("Content-Location", loc)
-			io.WriteString(w, chunksums)
-		case strings.Contains(r.URL.Path, "/blobs/"+contentDigest().String()):
-			http.ServeContent(w, r, contentDigest().String(), time.Time{}, strings.NewReader(content))
-		default:
-			t.Errorf("unexpected request: %v", r)
-			http.NotFound(w, r)
-		}
-	})
-
-	rc.MaxStreams = 1        // prevent concurrent chunk downloads
-	rc.ChunkingThreshold = 1 // for all blobs to be chunked
-
-	var mu sync.Mutex
-	var reads []int64
-	ctx := WithTrace(t.Context(), &Trace{
-		Update: func(l *Layer, n int64, err error) {
-			t.Logf("Update: %v %d %v", l, n, err)
-			mu.Lock()
-			reads = append(reads, n)
-			mu.Unlock()
-		},
-	})
-
-	chunksums = fmt.Sprintf("%s 0-2\n%s 3-4\n",
-		blob.DigestFromBytes("hel"),
-		blob.DigestFromBytes("lo"),
-	)
-	err := rc.Pull(ctx, "test")
-	check(err)
-	wantReads := []int64{
-		0, // initial signaling of layer pull starting
-		3, // first chunk read
-		2, // second chunk read
-	}
-	if !slices.Equal(reads, wantReads) {
-		t.Errorf("reads = %v; want %v", reads, wantReads)
-	}
-
-	mw, err := rc.Resolve(t.Context(), "test")
-	check(err)
-	mg, err := rc.ResolveLocal("test")
-	check(err)
-	if !reflect.DeepEqual(mw, mg) {
-		t.Errorf("mw = %v; mg = %v", mw, mg)
-	}
-	for i := range mg.Layers {
-		_, err = c.Get(mg.Layers[i].Digest)
-		if err != nil {
-			t.Errorf("Get(%v): %v", mg.Layers[i].Digest, err)
-		}
-	}
-
-	// missing chunks
-	content = "llama"
-	chunksums = fmt.Sprintf("%s 0-1\n", blob.DigestFromBytes("ll"))
-	err = rc.Pull(ctx, "missingchunks")
-	if err == nil {
-		t.Error("expected error because of missing chunks")
-	}
-}
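
The chunksums fixture above hands Pull one line per chunk in the form "<digest> <start>-<end>" (inclusive byte offsets). A throwaway parser for that shape, assuming only the line format the test writes:

package main

import (
	"fmt"
	"strings"
)

type chunk struct {
	digest     string
	start, end int64
}

// parseChunksums splits "<digest> <start>-<end>" lines like the ones the
// test emits, e.g. "sha256:... 0-2".
func parseChunksums(s string) ([]chunk, error) {
	var out []chunk
	for _, line := range strings.Split(strings.TrimSpace(s), "\n") {
		var c chunk
		if _, err := fmt.Sscanf(line, "%s %d-%d", &c.digest, &c.start, &c.end); err != nil {
			return nil, fmt.Errorf("bad chunksum line %q: %w", line, err)
		}
		out = append(out, c)
	}
	return out, nil
}

func main() {
	cs, err := parseChunksums("sha256:aaa 0-2\nsha256:bbb 3-4\n")
	if err != nil {
		panic(err)
	}
	fmt.Println(cs) // [{sha256:aaa 0 2} {sha256:bbb 3 4}]
}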

View File

@@ -200,7 +200,7 @@ type params struct {
 	//
 	// Unfortunately, this API was designed to be a bit awkward. Stream is
 	// defined to default to true if not present, so we need a way to check
-	// if the client decisively set it to false. So, we use a pointer to a
+	// if the client decisively it to false. So, we use a pointer to a
 	// bool. Gross.
 	//
 	// Use [stream()] to get the correct value for this field.
@@ -280,17 +280,17 @@ func (s *Local) handlePull(w http.ResponseWriter, r *http.Request) error {
 	progress := make(map[*ollama.Layer]int64)

-	progressCopy := make(map[*ollama.Layer]int64, len(progress))
-	flushProgress := func() {
+	pushUpdate := func() {
 		defer maybeFlush()

-		// TODO(bmizerany): Flushing every layer in one update doesn't
-		// scale well. We could flush only the modified layers or track
-		// the full download. Needs further consideration, though it's
-		// fine for now.
+		// TODO(bmizerany): This scales poorly with more layers due to
+		// needing to flush out them all in one big update. We _could_
+		// just flush on the changed ones, or just track the whole
+		// download. Needs more thought. This is fine for now.
 		mu.Lock()
-		maps.Copy(progressCopy, progress)
 		mu.Unlock()
-		for l, n := range progressCopy {
+		for l, n := range progress {
 			enc.Encode(progressUpdateJSON{
 				Digest: l.Digest,
 				Total:  l.Size,
@@ -298,26 +298,19 @@ func (s *Local) handlePull(w http.ResponseWriter, r *http.Request) error {
 			})
 		}
 	}
-	defer flushProgress()

-	t := time.NewTicker(1000 * time.Hour) // "unstarted" timer
+	t := time.NewTicker(time.Hour) // "unstarted" timer
 	start := sync.OnceFunc(func() {
-		flushProgress() // flush initial state
+		pushUpdate()
 		t.Reset(100 * time.Millisecond)
 	})
 	ctx := ollama.WithTrace(r.Context(), &ollama.Trace{
 		Update: func(l *ollama.Layer, n int64, err error) {
 			if n > 0 {
-				// Block flushing progress updates until every
-				// layer is accounted for. Clients depend on a
-				// complete model size to calculate progress
-				// correctly; if they use an incomplete total,
-				// progress indicators would erratically jump
-				// as new layers are registered.
-				start()
+				start() // flush initial state
 			}
 			mu.Lock()
-			progress[l] += n
+			progress[l] = n
 			mu.Unlock()
 		},
 	})
@@ -330,9 +323,9 @@ func (s *Local) handlePull(w http.ResponseWriter, r *http.Request) error {
 	for {
 		select {
 		case <-t.C:
-			flushProgress()
+			pushUpdate()
 		case err := <-done:
-			flushProgress()
+			pushUpdate()
 			if err != nil {
 				var status string
 				if errors.Is(err, ollama.ErrModelNotFound) {
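
One detail shared by both sides of this hunk: the ticker is created with an interval so long it effectively never fires, and the first real progress event arms the actual 100ms cadence through sync.OnceFunc. The pattern in isolation:

package main

import (
	"fmt"
	"sync"
	"time"
)

func main() {
	t := time.NewTicker(1000 * time.Hour) // "unstarted": effectively never fires
	defer t.Stop()

	// The first progress event resets to the real 100ms cadence exactly once.
	start := sync.OnceFunc(func() {
		t.Reset(100 * time.Millisecond)
	})

	events := time.After(50 * time.Millisecond) // pretend the first bytes arrive here
	done := time.After(400 * time.Millisecond)
	for {
		select {
		case <-events:
			start()
		case <-t.C:
			fmt.Println("flush progress")
		case <-done:
			fmt.Println("final flush")
			return
		}
	}
}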

View File

@@ -82,7 +82,7 @@ func detectChatTemplate(layers []*layerGGML) ([]*layerGGML, error) {
 	for _, layer := range layers {
 		if s := layer.GGML.KV().ChatTemplate(); s != "" {
 			if t, err := template.Named(s); err != nil {
-				slog.Debug("template detection", "error", err, "template", s)
+				slog.Debug("template detection", "error", err)
 			} else {
 				layer, err := NewLayer(t.Reader(), "application/vnd.ollama.image.template")
 				if err != nil {

View File

@@ -1,13 +0,0 @@
{{- range $i, $_ := .Messages }}
{{- $last := eq (len (slice $.Messages $i)) 1 }}
{{- if eq .Role "user" }}<start_of_turn>user
{{- if and (eq $i 1) $.System }}
{{ $.System }}
{{ end }}
{{ .Content }}<end_of_turn>
{{ else if eq .Role "assistant" }}<start_of_turn>model
{{ .Content }}<end_of_turn>
{{ end }}
{{- if $last }}<start_of_turn>model
{{ end }}
{{- end }}
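
The deleted file above is a Go text/template that folds a message list into Gemma's <start_of_turn> turn format. A cut-down harness showing what such a template renders, using a simplified template and a hypothetical Message type rather than Ollama's own:

package main

import (
	"os"
	"text/template"
)

type Message struct{ Role, Content string }

// A simplified version of the removed Gemma template: user turns render as
// "user", assistant turns as "model", and a trailing "model" header cues
// the model to generate.
const tmpl = `{{- range .Messages }}<start_of_turn>{{ if eq .Role "assistant" }}model{{ else }}user{{ end }}
{{ .Content }}<end_of_turn>
{{ end }}<start_of_turn>model
`

func main() {
	t := template.Must(template.New("gemma").Parse(tmpl))
	err := t.Execute(os.Stdout, map[string]any{"Messages": []Message{
		{Role: "user", Content: "Hello, how are you?"},
		{Role: "assistant", Content: "I'm doing great. How can I help you today?"},
		{Role: "user", Content: "I'd like to show off how chat templating works!"},
	}})
	if err != nil {
		panic(err)
	}
}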

View File

@@ -1,6 +0,0 @@
{
"stop": [
"<end_of_turn>"
],
"temperature": 0.1
}

View File

@@ -87,10 +87,6 @@
"template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '<start_of_turn>' + role + '\n' + message['content'] | trim + '<end_of_turn>\n' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model\n'}}{% endif %}",
"name": "gemma-instruct"
},
{
"template": "{{ bos_token }}\n{%- if messages[0]['role'] == 'system' -%}\n {%- if messages[0]['content'] is string -%}\n {%- set first_user_prefix = messages[0]['content'] + '\n\n' -%}\n {%- else -%}\n {%- set first_user_prefix = messages[0]['content'][0]['text'] + '\n\n' -%}\n {%- endif -%}\n {%- set loop_messages = messages[1:] -%}\n{%- else -%}\n {%- set first_user_prefix = \"\" -%}\n {%- set loop_messages = messages -%}\n{%- endif -%}\n{%- for message in loop_messages -%}\n {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}\n {{ raise_exception(\"Conversation roles must alternate user/assistant/user/assistant/...\") }}\n {%- endif -%}\n {%- if (message['role'] == 'assistant') -%}\n {%- set role = \"model\" -%}\n {%- else -%}\n {%- set role = message['role'] -%}\n {%- endif -%}\n {{ '<start_of_turn>' + role + '\n' + (first_user_prefix if loop.first else \"\") }}\n {%- if message['content'] is string -%}\n {{ message['content'] | trim }}\n {%- elif message['content'] is iterable -%}\n {%- for item in message['content'] -%}\n {%- if item['type'] == 'image' -%}\n {{ '<start_of_image>' }}\n {%- elif item['type'] == 'text' -%}\n {{ item['text'] | trim }}\n {%- endif -%}\n {%- endfor -%}\n {%- else -%}\n {{ raise_exception(\"Invalid content type\") }}\n {%- endif -%}\n {{ '<end_of_turn>\n' }}\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n {{'<start_of_turn>model\n'}}\n{%- endif -%}\n",
"name": "gemma3-instruct"
},
{
"template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}",
"name": "llama3-instruct"

View File

@@ -1,10 +0,0 @@
<start_of_turn>user
You are a helpful assistant.
Hello, how are you?<end_of_turn>
<start_of_turn>model
I'm doing great. How can I help you today?<end_of_turn>
<start_of_turn>user
I'd like to show off how chat templating works!<end_of_turn>
<start_of_turn>model

View File

@@ -1,4 +0,0 @@
<start_of_turn>user
Hello, how are you?<end_of_turn>
<start_of_turn>model

View File

@@ -1,8 +0,0 @@
<start_of_turn>user
Hello, how are you?<end_of_turn>
<start_of_turn>model
I'm doing great. How can I help you today?<end_of_turn>
<start_of_turn>user
I'd like to show off how chat templating works!<end_of_turn>
<start_of_turn>model

types/bfloat16/LICENSE Normal file
View File

@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2021 Tristan Rice
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@@ -0,0 +1,57 @@
// Vendored code from https://github.com/d4l3k/go-bfloat16
// unsafe pointer replaced by "math"
package bfloat16
import "math"
type BF16 uint16
func FromBytes(buf []byte) BF16 {
return BF16(uint16(buf[0]) + uint16(buf[1])<<8)
}
func ToBytes(b BF16) []byte {
return []byte{byte(b & 0xFF), byte(b >> 8)}
}
func Decode(buf []byte) []BF16 {
var out []BF16
for i := 0; i < len(buf); i += 2 {
out = append(out, FromBytes(buf[i:]))
}
return out
}
func Encode(f []BF16) []byte {
var out []byte
for _, a := range f {
out = append(out, ToBytes(a)...)
}
return out
}
func DecodeFloat32(buf []byte) []float32 {
var out []float32
for i := 0; i < len(buf); i += 2 {
out = append(out, ToFloat32(FromBytes(buf[i:])))
}
return out
}
func EncodeFloat32(f []float32) []byte {
var out []byte
for _, a := range f {
out = append(out, ToBytes(FromFloat32(a))...)
}
return out
}
func ToFloat32(b BF16) float32 {
u32 := uint32(b) << 16
return math.Float32frombits(u32)
}
func FromFloat32(f float32) BF16 {
u32 := math.Float32bits(f)
return BF16(u32 >> 16)
}
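
The conversions above treat a bfloat16 as the top half of an IEEE 754 float32: widening shifts the 16 bits left, narrowing truncates the low 16 mantissa bits with no rounding. The same bit math, stated directly:

package main

import (
	"fmt"
	"math"
)

func main() {
	f := float32(3.14159)
	bits := math.Float32bits(f)

	// Narrow: keep the sign, the 8 exponent bits, and the top 7 mantissa bits.
	bf16 := uint16(bits >> 16)

	// Widen: restore the float32 layout with the low mantissa bits zeroed.
	back := math.Float32frombits(uint32(bf16) << 16)

	fmt.Printf("%v -> 0x%04x -> %v\n", f, bf16, back)
	// Truncation error is bounded by roughly one part in 2^7 of the magnitude.
	fmt.Println("abs err:", math.Abs(float64(f-back)))
}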

View File

@@ -0,0 +1,53 @@
package bfloat16
import (
"crypto/rand"
"reflect"
"testing"
)
func randomBytes(n int) []byte {
out := make([]byte, n)
if _, err := rand.Read(out); err != nil {
panic(err)
}
return out
}
func TestEncodeDecode(t *testing.T) {
b := randomBytes(1024)
bf16 := Decode(b)
out := Encode(bf16)
if !reflect.DeepEqual(b, out) {
t.Fatalf("%+v != %+v", b, out)
}
}
func TestEncodeDecodeFloat32(t *testing.T) {
b := randomBytes(1024)
bf16 := DecodeFloat32(b)
out := EncodeFloat32(bf16)
if !reflect.DeepEqual(b, out) {
t.Fatalf("%+v != %+v", b, out)
}
}
func TestBasicFloat32(t *testing.T) {
var in float32 = 1.0
out := ToFloat32(FromFloat32(in))
if !reflect.DeepEqual(in, out) {
t.Fatalf("%+v != %+v", in, out)
}
}
func TestComplexFloat32(t *testing.T) {
var in float32 = 123456789123456789.123456789
var want float32 = 123286039799267328.0
out := ToFloat32(FromFloat32(in))
if in == out {
t.Fatalf("no loss of precision")
}
if out != want {
t.Fatalf("%.16f != %.16f", want, out)
}
}
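
The want value in TestComplexFloat32 is exactly what truncation predicts: bfloat16 keeps a 7-bit mantissa, and 123456789123456789 ≈ 1.713 × 2^56 truncates to (1 + 91/128) × 2^56. Verifying that arithmetic:

package main

import "fmt"

func main() {
	// 123456789123456789 ≈ 1.713... × 2^56; a 7-bit mantissa truncates
	// the fractional part down to 91/128.
	exp2_56 := uint64(1) << 56 // 72057594037927936
	got := exp2_56 + exp2_56/128*91

	fmt.Println(got) // 123286039799267328, the value the test expects
}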