From 524adfda753760f18dbf5a54022ff3166e3f784c Mon Sep 17 00:00:00 2001 From: MartinBraquet Date: Sun, 5 Apr 2026 11:55:37 +0200 Subject: [PATCH] Enhance `htmlToJSONContent` to support Next.js `__NEXT_DATA__` extraction and improve Readability fallback handling --- backend/shared/src/parse.ts | 81 +++++++++++++++++++++++++++++++------ 1 file changed, 69 insertions(+), 12 deletions(-) diff --git a/backend/shared/src/parse.ts b/backend/shared/src/parse.ts index 6ea0d7c4..df4290aa 100644 --- a/backend/shared/src/parse.ts +++ b/backend/shared/src/parse.ts @@ -1,23 +1,80 @@ +import {Readability} from '@mozilla/readability' import {JSONContent} from '@tiptap/core' import {debug} from 'common/logger' import {JSDOM} from 'jsdom' import {marked} from 'marked' export function htmlToJSONContent(html: string, url: string): JSONContent { - const originalDom = new JSDOM(html, {url}) - const classStyles = extractClassStyles(originalDom.window.document) + // Tier 1: Try __NEXT_DATA__ (Next.js, free, structured) + const nextData = extractNextData(html) + const nextContent = nextDataToJSONContent(nextData) - // const isGoogleDoc = !!extractGoogleDocId(url) - // if (!isGoogleDoc) { - // const reader = new Readability(originalDom.window.document) - // const article = reader.parse() - // if (article?.content) { - // const cleanDom = new JSDOM(article.content) - // return parseHtmlBodyToJSONContent(cleanDom.window.document, classStyles) - // } - // } + // Tier 2: Try Readability on raw HTML (works for ~75% of the web) + const result = tryReadability(html, url) - return parseHtmlBodyToJSONContent(originalDom.window.document, classStyles) + if (nextContent.content) result.content = [...nextContent.content, ...(result.content || [])] + + return result + + // Tier 3: Puppeteer fallback (CSR catch-all, expensive, high mem usage, and needs chrome deps in container — only if needed) + // To implement if really needed (i.e., lots of users want to extract profile info from client-side rendered pages) + // const renderedHtml = await fetchWithBrowser(url) + // return tryReadability(renderedHtml, url) ?? emptyContent() +} + +function extractNextData(html: string): Record | null { + const match = html.match(/