chore(gallery-agent): extract readme

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
Ettore Di Giacinto
2026-04-13 20:31:49 +00:00
parent be1b8d56c9
commit 7ce675af21
2 changed files with 69 additions and 4 deletions

View File

@@ -141,6 +141,65 @@ func resolveReadme(client *hfapi.Client, modelID string, hfTags []string) (strin
return cleanTextContent(content), nil
}
// extractDescription turns a raw HuggingFace README into a concise plain-text
// description suitable for embedding in gallery/index.yaml: strips YAML
// frontmatter, HTML tags/comments, markdown images, link URLs (keeping the
// link text), markdown tables, and then truncates at a paragraph boundary
// around ~1200 characters. Raw README should still be used for icon
// extraction — call this only for the `description:` field.
func extractDescription(readme string) string {
s := readme
// Strip leading YAML frontmatter: `---\n...\n---\n` at start of file.
if strings.HasPrefix(strings.TrimLeft(s, " \t\n"), "---") {
trimmed := strings.TrimLeft(s, " \t\n")
rest := strings.TrimPrefix(trimmed, "---")
if idx := strings.Index(rest, "\n---"); idx >= 0 {
after := rest[idx+len("\n---"):]
after = strings.TrimPrefix(after, "\n")
s = after
}
}
// Strip HTML comments and tags.
s = regexp.MustCompile(`(?s)<!--.*?-->`).ReplaceAllString(s, "")
s = regexp.MustCompile(`(?is)<[^>]+>`).ReplaceAllString(s, "")
// Strip markdown images entirely.
s = regexp.MustCompile(`!\[[^\]]*\]\([^)]*\)`).ReplaceAllString(s, "")
// Replace markdown links `[text](url)` with just `text`.
s = regexp.MustCompile(`\[([^\]]+)\]\([^)]+\)`).ReplaceAllString(s, "$1")
// Drop table lines and horizontal rules.
var kept []string
for _, line := range strings.Split(s, "\n") {
t := strings.TrimSpace(line)
if strings.HasPrefix(t, "|") {
continue
}
if strings.HasPrefix(t, ":--") || strings.HasPrefix(t, "---") || strings.HasPrefix(t, "===") {
continue
}
kept = append(kept, line)
}
s = strings.Join(kept, "\n")
// Normalise whitespace.
s = cleanTextContent(s)
// Truncate at a paragraph boundary around maxLen chars.
const maxLen = 1200
if len(s) > maxLen {
cut := strings.LastIndex(s[:maxLen], "\n\n")
if cut < maxLen/3 {
cut = maxLen
}
s = strings.TrimRight(s[:cut], " \t\n") + "\n\n..."
}
return s
}
// cleanTextContent removes trailing spaces/tabs and collapses multiple empty
// lines so README content embeds cleanly into YAML without lint noise.
func cleanTextContent(text string) string {

View File

@@ -200,18 +200,24 @@ func main() {
}
// Deterministic README resolution: follow base_model tag if set.
// Keep the raw (HTML-bearing) README around while we extract the
// icon, then strip it down to a plain-text description for the
// `description:` YAML field.
readme, err := resolveReadme(client, m.ModelID, m.Tags)
if err == nil {
pm.ReadmeContent = readme
pm.ReadmeContentPreview = truncateString(readme, 200)
} else {
if err != nil {
fmt.Printf(" Warning: failed to fetch README: %v\n", err)
}
pm.ReadmeContent = readme
pm.License = licenseFromTags(m.Tags)
pm.Tags = curatedTags(m.Tags)
pm.Icon = extractModelIcon(pm)
if pm.ReadmeContent != "" {
pm.ReadmeContent = extractDescription(pm.ReadmeContent)
pm.ReadmeContentPreview = truncateString(pm.ReadmeContent, 200)
}
fmt.Printf(" License: %s, Tags: %v, Icon: %s\n", pm.License, pm.Tags, pm.Icon)
processed = append(processed, pm)
}