chore(gallery-agent): extract readme

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-06-08 16:57:08 -04:00 · 2026-04-13 20:31:49 +00:00
parent be1b8d56c9
commit 7ce675af21
2 changed files with 69 additions and 4 deletions
--- a/.github/gallery-agent/helpers.go
+++ b/.github/gallery-agent/helpers.go
@@ -141,6 +141,65 @@ func resolveReadme(client *hfapi.Client, modelID string, hfTags []string) (strin
 	return cleanTextContent(content), nil
 }

+// extractDescription turns a raw HuggingFace README into a concise plain-text
+// description suitable for embedding in gallery/index.yaml: strips YAML
+// frontmatter, HTML tags/comments, markdown images, link URLs (keeping the
+// link text), markdown tables, and then truncates (on a UTF-8 boundary) at a
+// paragraph break around ~1200 bytes. Raw README should still be used for
+// icon extraction — call this only for the `description:` field.
+func extractDescription(readme string) string {
+	s := readme
+	// Strip leading YAML frontmatter: `---\n...\n---\n` at start of file.
+	trimmed := strings.TrimLeft(s, " \t\n")
+	if strings.HasPrefix(trimmed, "---") {
+		rest := strings.TrimPrefix(trimmed, "---")
+		if idx := strings.Index(rest, "\n---"); idx >= 0 {
+			s = strings.TrimPrefix(rest[idx+len("\n---"):], "\n")
+		}
+	}
+
+	// Strip HTML comments and tags.
+	s = regexp.MustCompile(`(?s)<!--.*?-->`).ReplaceAllString(s, "")
+	s = regexp.MustCompile(`(?is)<[^>]+>`).ReplaceAllString(s, "")
+
+	// Strip markdown images entirely.
+	s = regexp.MustCompile(`!\[[^\]]*\]\([^)]*\)`).ReplaceAllString(s, "")
+	// Replace markdown links `[text](url)` with just `text`.
+	s = regexp.MustCompile(`\[([^\]]+)\]\([^)]+\)`).ReplaceAllString(s, "$1")
+
+	// Drop table lines and horizontal rules.
+	var kept []string
+	for _, line := range strings.Split(s, "\n") {
+		t := strings.TrimSpace(line)
+		if strings.HasPrefix(t, "|") || strings.HasPrefix(t, ":--") ||
+			strings.HasPrefix(t, "---") || strings.HasPrefix(t, "===") {
+			continue
+		}
+		kept = append(kept, line)
+	}
+	s = strings.Join(kept, "\n")
+
+	// Normalise whitespace.
+	s = cleanTextContent(s)
+
+	// Truncate at a paragraph boundary around maxLen bytes.
+	const maxLen = 1200
+	if len(s) > maxLen {
+		cut := strings.LastIndex(s[:maxLen], "\n\n")
+		if cut < maxLen/3 {
+			cut = maxLen
+			// maxLen is a byte offset: back up past UTF-8 continuation bytes
+			// (10xxxxxx) so a multi-byte rune is never split.
+			for cut > 0 && s[cut]&0xC0 == 0x80 {
+				cut--
+			}
+		}
+		s = strings.TrimRight(s[:cut], " \t\n") + "\n\n..."
+	}
+
+	return s
+}
+
 // cleanTextContent removes trailing spaces/tabs and collapses multiple empty
 // lines so README content embeds cleanly into YAML without lint noise.
 func cleanTextContent(text string) string {
--- a/.github/gallery-agent/main.go
+++ b/.github/gallery-agent/main.go
@@ -200,18 +200,24 @@ func main() {
 		}

 		// Deterministic README resolution: follow base_model tag if set.
+		// Keep the raw (HTML-bearing) README around while we extract the
+		// icon, then strip it down to a plain-text description for the
+		// `description:` YAML field.
 		readme, err := resolveReadme(client, m.ModelID, m.Tags)
-		if err == nil {
-			pm.ReadmeContent = readme
-			pm.ReadmeContentPreview = truncateString(readme, 200)
-		} else {
+		if err != nil {
 			fmt.Printf("  Warning: failed to fetch README: %v\n", err)
 		}
+		pm.ReadmeContent = readme

 		pm.License = licenseFromTags(m.Tags)
 		pm.Tags = curatedTags(m.Tags)
 		pm.Icon = extractModelIcon(pm)

+		if pm.ReadmeContent != "" {
+			pm.ReadmeContent = extractDescription(pm.ReadmeContent)
+			pm.ReadmeContentPreview = truncateString(pm.ReadmeContent, 200)
+		}
+
 		fmt.Printf("  License: %s, Tags: %v, Icon: %s\n", pm.License, pm.Tags, pm.Icon)
 		processed = append(processed, pm)
 	}