Apply suggestion from @ParthSareen

Co-authored-by: Parth Sareen <parth.sareen@ollama.com>
prefer tokenizer.json
2026-01-02 04:29:51 -05:00 · 2025-12-17 15:29:55 -08:00 · 2025-12-17 15:14:08 -08:00
1 changed file with 11 additions and 4 deletions
--- a/convert/tokenizer.go
+++ b/convert/tokenizer.go
@@ -204,12 +204,14 @@ func parseTokenizer(fsys fs.FS, specialTokenTypes []string) (*Tokenizer, error)

 type tokenizer struct {
 	AddedTokens []token `json:"added_tokens"`
-	Model       struct {
+	Decoder     struct {
+		Type string `json:"type"`
+	} `json:"decoder"`
+	Model struct {
 		Type   string          `json:"type"`
 		Vocab  map[string]int  `json:"vocab"`
 		Merges json.RawMessage `json:"merges"`
 	} `json:"model"`
-
 	PreTokenizer struct {
 		PreTokenizers []struct {
 			Type    string `json:"type"`
@@ -246,6 +248,11 @@ func parseVocabularyFromTokenizer(fsys fs.FS) (*Vocabulary, error) {
 		return nil, err
 	}

+	model := "gpt2"
+	if t.Decoder.Type == "Sequence" {
+		model = "llama"
+	}
+
 	tokens := make(map[int]token, len(t.Model.Vocab))
 	for k, v := range t.Model.Vocab {
 		tokens[v] = token{
@@ -259,7 +266,7 @@ func parseVocabularyFromTokenizer(fsys fs.FS) (*Vocabulary, error) {
 		tokens[token.ID] = token
 	}

-	v := Vocabulary{Model: "gpt2"}
+	v := Vocabulary{Model: model}
 	for _, k := range slices.Sorted(maps.Keys(tokens)) {
 		token := tokens[k]
 		v.Tokens = append(v.Tokens, token.Content)
@@ -283,8 +290,8 @@ func parseVocabulary(fsys fs.FS) (*Vocabulary, error) {
 		Pattern string
 		Func    func(fs.FS) (*Vocabulary, error)
 	}{
-		{"tokenizer.model", parseSentencePiece},
 		{"tokenizer.json", parseVocabularyFromTokenizer},
+		{"tokenizer.model", parseSentencePiece},
 	}

 	for _, pattern := range patterns {
Author	SHA1	Message	Date
Michael Yang	4a0f90ec38	Apply suggestion from @ParthSareen Co-authored-by: Parth Sareen <parth.sareen@ollama.com>	2025-12-17 15:29:55 -08:00
Michael Yang	bddcb3fb16	prefer tokenizer.json; check tokenizer decoder for tokenizer type	2025-12-17 15:14:08 -08:00