import {Readability} from '@mozilla/readability'
import {JSONContent} from '@tiptap/core'
import {debug} from 'common/logger'
import {JSDOM} from 'jsdom'
import {marked} from 'marked'

/**
 * Converts a page's HTML into TipTap `JSONContent`.
 *
 * Two extraction tiers are always run and their output is merged:
 * nodes derived from the page's `__NEXT_DATA__` payload (if any) are placed
 * in front of the nodes produced by Readability.
 *
 * @param html - HTML source of the page.
 * @param url - Page URL; passed through to `tryReadability`.
 * @returns The Readability result, with any `__NEXT_DATA__`-derived nodes
 *   prepended to its `content` array.
 */
export function htmlToJSONContent(html: string, url: string): JSONContent {
  // Tier 1: Try __NEXT_DATA__ (Next.js, free, structured)
  const nextData = extractNextData(html)
  const nextContent = nextDataToJSONContent(nextData)

  // Tier 2: Try Readability on raw HTML (works for ~75% of the web)
  const result = tryReadability(html, url)

  // Merge rather than pick one tier: structured Next.js nodes come first,
  // followed by whatever Readability extracted (possibly nothing).
  if (nextContent.content) {
    result.content = [...nextContent.content, ...(result.content ?? [])]
  }
  return result

  // Tier 3: Puppeteer fallback (CSR catch-all, expensive, high mem usage, and
  // needs chrome deps in container — only if needed).
  // To implement if really needed (i.e., lots of users want to extract profile
  // info from client-side rendered pages):
  // const renderedHtml = await fetchWithBrowser(url)
  // return tryReadability(renderedHtml, url) ?? emptyContent()
}

function extractNextData(html: string): Record | null { const match = html.match(/