Compare commits

...

2 Commits

Author SHA1 Message Date
Michael Yang
4a0f90ec38 Apply suggestion from @ParthSareen
Co-authored-by: Parth Sareen <parth.sareen@ollama.com>
2025-12-17 15:29:55 -08:00
Michael Yang
bddcb3fb16 prefer tokenizer.json
check tokenizer decoder for tokenizer type
2025-12-17 15:14:08 -08:00

View File

@@ -204,12 +204,14 @@ func parseTokenizer(fsys fs.FS, specialTokenTypes []string) (*Tokenizer, error)
type tokenizer struct {
AddedTokens []token `json:"added_tokens"`
-Model struct {
+Decoder struct {
+Type string `json:"type"`
+} `json:"decoder"`
+Model struct {
Type string `json:"type"`
Vocab map[string]int `json:"vocab"`
Merges json.RawMessage `json:"merges"`
} `json:"model"`
PreTokenizer struct {
PreTokenizers []struct {
Type string `json:"type"`
@@ -246,6 +248,11 @@ func parseVocabularyFromTokenizer(fsys fs.FS) (*Vocabulary, error) {
return nil, err
}
+model := "gpt2"
+if t.Decoder.Type == "Sequence" {
+model = "llama"
+}
tokens := make(map[int]token, len(t.Model.Vocab))
for k, v := range t.Model.Vocab {
tokens[v] = token{
@@ -259,7 +266,7 @@ func parseVocabularyFromTokenizer(fsys fs.FS) (*Vocabulary, error) {
tokens[token.ID] = token
}
-v := Vocabulary{Model: "gpt2"}
+v := Vocabulary{Model: model}
for _, k := range slices.Sorted(maps.Keys(tokens)) {
token := tokens[k]
v.Tokens = append(v.Tokens, token.Content)
@@ -283,8 +290,8 @@ func parseVocabulary(fsys fs.FS) (*Vocabulary, error) {
Pattern string
Func func(fs.FS) (*Vocabulary, error)
}{
-{"tokenizer.model", parseSentencePiece},
{"tokenizer.json", parseVocabularyFromTokenizer},
+{"tokenizer.model", parseSentencePiece},
}
for _, pattern := range patterns {