diff --git a/.github/gallery-agent/helpers.go b/.github/gallery-agent/helpers.go
index e90dc6ba2..bc66eebef 100644
--- a/.github/gallery-agent/helpers.go
+++ b/.github/gallery-agent/helpers.go
@@ -141,6 +141,81 @@ func resolveReadme(client *hfapi.Client, modelID string, hfTags []string) (strin
 	return cleanTextContent(content), nil
 }
 
+// Patterns used by extractDescription, compiled once at package init rather
+// than on every call.
+var (
+	// descHTMLCommentRe matches HTML comments, including multi-line ones.
+	descHTMLCommentRe = regexp.MustCompile(`(?s)<!--.*?-->`)
+	// descHTMLTagRe matches any remaining HTML tag.
+	descHTMLTagRe = regexp.MustCompile(`(?is)<[^>]+>`)
+	// descMDImageRe matches markdown images `![alt](url)`.
+	descMDImageRe = regexp.MustCompile(`!\[[^\]]*\]\([^)]*\)`)
+	// descMDLinkRe matches markdown links `[text](url)`, capturing the text.
+	descMDLinkRe = regexp.MustCompile(`\[([^\]]+)\]\([^)]+\)`)
+)
+
+// extractDescription turns a raw HuggingFace README into a concise plain-text
+// description suitable for embedding in gallery/index.yaml: strips YAML
+// frontmatter, HTML tags/comments, markdown images, link URLs (keeping the
+// link text), markdown tables, and then truncates at a paragraph boundary
+// around ~1200 bytes without splitting a UTF-8 character. Raw README should
+// still be used for icon extraction — call this only for the `description:`
+// field.
+func extractDescription(readme string) string {
+	s := readme
+
+	// Strip leading YAML frontmatter: `---\n...\n---\n` at start of file.
+	if trimmed := strings.TrimLeft(s, " \t\n"); strings.HasPrefix(trimmed, "---") {
+		rest := strings.TrimPrefix(trimmed, "---")
+		if idx := strings.Index(rest, "\n---"); idx >= 0 {
+			s = strings.TrimPrefix(rest[idx+len("\n---"):], "\n")
+		}
+	}
+
+	// Strip HTML comments first (they may contain `>`), then remaining tags.
+	s = descHTMLCommentRe.ReplaceAllString(s, "")
+	s = descHTMLTagRe.ReplaceAllString(s, "")
+
+	// Strip markdown images entirely.
+	s = descMDImageRe.ReplaceAllString(s, "")
+	// Replace markdown links `[text](url)` with just `text`.
+	s = descMDLinkRe.ReplaceAllString(s, "$1")
+
+	// Drop table lines and horizontal rules.
+	var kept []string
+	for _, line := range strings.Split(s, "\n") {
+		t := strings.TrimSpace(line)
+		if strings.HasPrefix(t, "|") {
+			continue
+		}
+		if strings.HasPrefix(t, ":--") || strings.HasPrefix(t, "---") || strings.HasPrefix(t, "===") {
+			continue
+		}
+		kept = append(kept, line)
+	}
+	s = strings.Join(kept, "\n")
+
+	// Normalise whitespace.
+	s = cleanTextContent(s)
+
+	// Truncate at a paragraph boundary around maxLen bytes.
+	const maxLen = 1200
+	if len(s) > maxLen {
+		cut := strings.LastIndex(s[:maxLen], "\n\n")
+		if cut < maxLen/3 {
+			// No usable paragraph break: hard-cut, but back up over UTF-8
+			// continuation bytes (10xxxxxx) so no character is split.
+			cut = maxLen
+			for cut > 0 && s[cut]&0xC0 == 0x80 {
+				cut--
+			}
+		}
+		s = strings.TrimRight(s[:cut], " \t\n") + "\n\n..."
+	}
+
+	return s
+}
+
 // cleanTextContent removes trailing spaces/tabs and collapses multiple empty
 // lines so README content embeds cleanly into YAML without lint noise.
 func cleanTextContent(text string) string {
diff --git a/.github/gallery-agent/main.go b/.github/gallery-agent/main.go
index 5201573b3..d9c6449f7 100644
--- a/.github/gallery-agent/main.go
+++ b/.github/gallery-agent/main.go
@@ -200,18 +200,24 @@ func main() {
 		}
 
 		// Deterministic README resolution: follow base_model tag if set.
+		// Keep the raw (HTML-bearing) README around while we extract the
+		// icon, then strip it down to a plain-text description for the
+		// `description:` YAML field.
 		readme, err := resolveReadme(client, m.ModelID, m.Tags)
-		if err == nil {
-			pm.ReadmeContent = readme
-			pm.ReadmeContentPreview = truncateString(readme, 200)
-		} else {
+		if err != nil {
 			fmt.Printf(" Warning: failed to fetch README: %v\n", err)
 		}
+		pm.ReadmeContent = readme
 
 		pm.License = licenseFromTags(m.Tags)
 		pm.Tags = curatedTags(m.Tags)
 		pm.Icon = extractModelIcon(pm)
 
+		if pm.ReadmeContent != "" {
+			pm.ReadmeContent = extractDescription(pm.ReadmeContent)
+			pm.ReadmeContentPreview = truncateString(pm.ReadmeContent, 200)
+		}
+
 		fmt.Printf(" License: %s, Tags: %v, Icon: %s\n", pm.License, pm.Tags, pm.Icon)
 		processed = append(processed, pm)
 	}