Files
Compass/backend/shared/src/parse.ts
2026-03-29 23:05:58 +02:00

496 lines
14 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import {JSONContent} from '@tiptap/core'
import {debug} from 'common/logger'
import {JSDOM} from 'jsdom'
import {marked} from 'marked'
/**
 * Parses an HTML string with JSDOM and converts its body into a TipTap document.
 *
 * @param html - Raw HTML markup to parse.
 * @param url - Handed to JSDOM as the document URL.
 * @returns The `doc` node built by `parseHtmlBodyToJSONContent`.
 */
export function htmlToJSONContent(html: string, url: string): JSONContent {
  const {document} = new JSDOM(html, {url}).window
  // Class-based styles are collected once and passed down to the body parser.
  const stylesByClass = extractClassStyles(document)

  // Disabled Readability pass, kept for reference:
  // const isGoogleDoc = !!extractGoogleDocId(url)
  // if (!isGoogleDoc) {
  //   const reader = new Readability(document)
  //   const article = reader.parse()
  //   if (article?.content) {
  //     const cleanDom = new JSDOM(article.content)
  //     return parseHtmlBodyToJSONContent(cleanDom.window.document, stylesByClass)
  //   }
  // }

  return parseHtmlBodyToJSONContent(document, stylesByClass)
}
/**
 * Converts plain text into a TipTap document with one paragraph per block of
 * text separated by blank lines.
 *
 * A "blank line" is a line break followed by one or more lines that contain
 * nothing but spaces or tabs. Both LF and CRLF line endings are recognised, so
 * text fetched from Windows sources is split the same way as Unix text.
 *
 * @param text - Raw plain text.
 * @returns A `doc` node whose children are non-empty, trimmed paragraphs.
 */
function plainTextToJSONContent(text: string): JSONContent {
  // A line break, then at least one more line that is empty or whitespace-only.
  const blankLineSeparator = /\r?\n(?:[ \t]*\r?\n)+/
  const paragraphs: JSONContent[] = []
  for (const chunk of text.split(blankLineSeparator)) {
    const trimmed = chunk.trim()
    // Skip empty chunks (leading/trailing blank lines, empty input).
    if (!trimmed) continue
    paragraphs.push({
      type: 'paragraph',
      content: [{type: 'text', text: trimmed}],
    })
  }
  return {type: 'doc', content: paragraphs}
}
/**
 * Collects the declarations of simple `.className { ... }` rules from every
 * `style` element in the document.
 *
 * Only the class name immediately before the opening brace is recognised; the
 * rest of the selector is ignored. When the same class appears in several
 * rules, the declarations are merged (later declarations override earlier ones
 * for the same property) instead of the last rule replacing all earlier ones.
 *
 * @param document - Document whose `style` elements are scanned.
 * @returns Map from class name to camelCased style properties.
 */
function extractClassStyles(document: Document): Map<string, Record<string, string>> {
  const classStyles = new Map<string, Record<string, string>>()
  // Match .className { prop: value; prop: value }
  const ruleRegex = /\.([a-zA-Z0-9_-]+)\s*\{([^}]+)}/g
  for (const styleEl of Array.from(document.querySelectorAll('style'))) {
    const css = styleEl.textContent ?? ''
    for (const match of css.matchAll(ruleRegex)) {
      const className = match[1]
      const declarations = match[2]
      // Merge with anything already recorded for this class.
      classStyles.set(className, {
        ...classStyles.get(className),
        ...parseStyleString(declarations),
      })
    }
  }
  return classStyles
}
/**
 * Converts the children of `document.body` into a TipTap `doc` node.
 *
 * @param document - Parsed DOM document.
 * @param classStyles - Optional pre-computed class styles; extracted from the
 *   document when omitted.
 * @returns A `doc` node containing the parsed block nodes.
 */
export function parseHtmlBodyToJSONContent(
  document: Document,
  classStyles?: Map<string, Record<string, string>>,
): JSONContent {
  const stylesByClass = classStyles ?? extractClassStyles(document)
  return {
    type: 'doc',
    content: parseBlockElements(document.body.children, stylesByClass),
  }
}
/**
 * Parses a list of elements as block nodes.
 *
 * Elements that `parseBlockElement` cannot convert are skipped, and
 * `__fragment` results are expanded in place so the returned array never
 * contains a fragment wrapper.
 *
 * @param children - Elements to convert, in document order.
 * @param classStyles - Class name to style lookup used for inline marks.
 * @returns Flat list of block nodes.
 */
function parseBlockElements(
  children: HTMLCollection | Element[],
  classStyles: Map<string, Record<string, string>>,
): JSONContent[] {
  return Array.from(children).flatMap((element): JSONContent[] => {
    const parsed = parseBlockElement(element, element.tagName.toLowerCase(), classStyles)
    if (!parsed) return []
    // Fragments can contain fragments, so flatten recursively.
    return (parsed as any).type === '__fragment' ? flattenFragment(parsed as any) : [parsed]
  })
}
/**
 * Expands a `__fragment` pseudo-node into the real nodes it contains,
 * descending into nested fragments and preserving document order.
 *
 * @param node - Fragment node whose `content` array is flattened.
 * @returns The non-fragment descendants in order.
 */
function flattenFragment(node: any): JSONContent[] {
  const flattened: JSONContent[] = []
  for (const child of node.content) {
    if (child.type === '__fragment') {
      flattened.push(...flattenFragment(child))
    } else {
      flattened.push(child)
    }
  }
  return flattened
}
/**
 * Converts a single block-level element into a TipTap node.
 *
 * Supported tags: h1-h6, p, ol, ul, blockquote, pre, code, hr, img, figure,
 * table and the generic containers div/section/article/main/header/footer/aside.
 *
 * @param el - Element to convert.
 * @param tag - Lower-cased tag name of `el`.
 * @param classStyles - Class name to style lookup used for inline marks.
 * @returns The node, a `__fragment` pseudo-node when a container holds several
 *   blocks (callers must flatten it), or `null` when the element is
 *   unsupported or produces no content.
 */
function parseBlockElement(
  el: Element,
  tag: string,
  classStyles: Map<string, Record<string, string>>,
): JSONContent | null {
  // Headings h1-h6: the digit after "h" is the level
  if (/^h[1-6]$/.test(tag)) {
    return {
      type: 'heading',
      attrs: {level: Number.parseInt(tag.slice(1), 10)},
      content: parseInlineElements(el, classStyles),
    }
  }
  // Paragraph (dropped when it has no inline content)
  if (tag === 'p') {
    const inline = parseInlineElements(el, classStyles)
    return inline.length > 0 ? {type: 'paragraph', content: inline} : null
  }
  // Lists
  if (tag === 'ol') {
    // `start` is required by TipTap's OrderedList extension; honour the HTML
    // attribute when it is a valid integer and fall back to 1 otherwise.
    const start = Number.parseInt(el.getAttribute('start') ?? '', 10)
    return {
      type: 'orderedList',
      attrs: {start: Number.isNaN(start) ? 1 : start},
      content: parseListItems(el, classStyles),
    }
  }
  if (tag === 'ul') {
    return {
      type: 'bulletList',
      attrs: {},
      content: parseListItems(el, classStyles),
    }
  }
  // Blockquote
  if (tag === 'blockquote') {
    return {
      type: 'blockquote',
      content: parseBlockElements(el.children, classStyles),
    }
  }
  // Code block: pre, optionally wrapping a code element that names the language
  if (tag === 'pre') {
    const codeEl = el.querySelector('code')
    const language = codeEl?.className.match(/language-(\w+)/)?.[1] ?? null
    const code = (codeEl ?? el).textContent ?? ''
    const codeBlock: JSONContent = {type: 'codeBlock', attrs: {language}}
    // ProseMirror does not allow empty text nodes, so an empty block gets no content.
    if (code) codeBlock.content = [{type: 'text', text: code}]
    return codeBlock
  }
  // Inline code outside of pre (treat as paragraph)
  if (tag === 'code') {
    const code = el.textContent ?? ''
    // Same empty-text-node restriction as above: nothing to emit.
    if (!code) return null
    return {
      type: 'paragraph',
      content: [{type: 'text', text: code, marks: [{type: 'code'}]}],
    }
  }
  // Horizontal rule
  if (tag === 'hr') {
    return {type: 'horizontalRule'}
  }
  // Image (only sources starting with "http" are kept)
  if (tag === 'img') {
    const src = el.getAttribute('src')
    if (!src || !src.startsWith('http')) return null
    return {
      type: 'image',
      attrs: {
        src,
        alt: el.getAttribute('alt') ?? null,
        title: el.getAttribute('title') ?? null,
      },
    }
  }
  // Figure (image + optional caption)
  if (tag === 'figure') {
    const img = el.querySelector('img')
    const caption = el.querySelector('figcaption')?.textContent ?? null
    const src = img?.getAttribute('src')
    if (!src || !src.startsWith('http')) return null
    return {
      type: 'image',
      attrs: {
        src, // already validated above
        alt: img?.getAttribute('alt') ?? caption,
        title: caption,
      },
    }
  }
  // Table
  if (tag === 'table') {
    return parseTable(el, classStyles)
  }
  // Container elements: recurse into children
  const containerTags = ['div', 'section', 'article', 'main', 'header', 'footer', 'aside']
  if (containerTags.includes(tag)) {
    const inner = parseBlockElementsWithText(el, classStyles)
    if (inner.length === 0) return null
    if (inner.length === 1) return inner[0]
    // Always use fragment, never paragraph, for multiple block children
    return {type: '__fragment', content: inner} as any
  }
  return null
}
/**
 * Parses the child nodes of a container element into block nodes, also
 * handling content that is not wrapped in a block element:
 *
 * - a bare, non-blank text node becomes its own paragraph (trimmed);
 * - a `span.section-header` becomes a level-2 heading;
 * - every other element goes through `parseBlockElement`; elements it does not
 *   convert are skipped and fragments are flattened.
 *
 * @param el - Container whose `childNodes` are walked.
 * @param classStyles - Class name to style lookup used for inline marks.
 * @returns Flat list of block nodes.
 */
function parseBlockElementsWithText(
  el: Element,
  classStyles: Map<string, Record<string, string>>,
): JSONContent[] {
  const TEXT_NODE = 3
  const ELEMENT_NODE = 1
  const blocks: JSONContent[] = []

  for (const node of Array.from(el.childNodes)) {
    if (node.nodeType === TEXT_NODE) {
      // Bare text directly inside the container: wrap it in a paragraph
      const trimmed = (node.textContent ?? '').trim()
      if (trimmed) {
        blocks.push({type: 'paragraph', content: [{type: 'text', text: trimmed}]})
      }
    } else if (node.nodeType === ELEMENT_NODE) {
      const element = node as Element
      const tagName = element.tagName.toLowerCase()

      if (tagName === 'span' && element.classList.contains('section-header')) {
        // Treat span.section-header as a heading
        const title = element.textContent?.trim()
        if (title) {
          blocks.push({type: 'heading', attrs: {level: 2}, content: [{type: 'text', text: title}]})
        }
      } else {
        const parsed = parseBlockElement(element, tagName, classStyles)
        if (parsed) {
          if ((parsed as any).type === '__fragment') {
            blocks.push(...flattenFragment(parsed as any))
          } else {
            blocks.push(parsed)
          }
        }
      }
    }
  }

  return blocks
}
/**
 * Parses a CSS declaration list (`prop: value; prop: value`) into an object.
 *
 * Property names are converted from kebab-case to camelCase. Only the first
 * colon separates name from value, so values containing colons (e.g. URLs)
 * stay intact. A declaration without a colon maps to an empty string.
 *
 * @param style - Declaration list, e.g. the contents of a `style` attribute.
 * @returns Object keyed by camelCased property name.
 */
function parseStyleString(style: string): Record<string, string> {
  const entries: Array<[string, string]> = []
  for (const rawDeclaration of style.split(';')) {
    const declaration = rawDeclaration.trim()
    if (!declaration) continue

    const colonAt = declaration.indexOf(':')
    const rawName = colonAt === -1 ? declaration : declaration.slice(0, colonAt)
    const rawValue = colonAt === -1 ? '' : declaration.slice(colonAt + 1)

    // font-weight -> fontWeight
    const name = rawName.trim().replace(/-([a-z])/g, (_, letter) => letter.toUpperCase())
    entries.push([name, rawValue.trim()])
  }
  return Object.fromEntries(entries)
}
/**
 * Converts the direct `li` children of a list element into `listItem` nodes.
 *
 * Each item starts with a paragraph holding the item's inline content (nested
 * lists excluded), followed by one list node per nested `ul`/`ol` that is a
 * direct child of the item. When the item has no direct child list, the first
 * descendant list (if any) is used instead, so lists wrapped in another
 * element are still picked up.
 *
 * Nested lists carry the same attrs as top-level ones: `start` for ordered
 * lists (taken from the HTML attribute, default 1) and an empty attrs object
 * for bullet lists.
 *
 * @param listEl - The `ul` or `ol` element.
 * @param classStyles - Class name to style lookup used for inline marks.
 * @returns One `listItem` node per direct `li` child.
 */
function parseListItems(
  listEl: Element,
  classStyles: Map<string, Record<string, string>>,
): JSONContent[] {
  const isList = (el: Element): boolean => ['ul', 'ol'].includes(el.tagName.toLowerCase())

  return Array.from(listEl.querySelectorAll(':scope > li')).map((li) => {
    const blockContent: JSONContent[] = [
      {type: 'paragraph', content: parseInlineElements(li, classStyles, true)},
    ]

    // Every direct child list, not just the first one.
    let nestedLists = Array.from(li.children).filter(isList)
    if (nestedLists.length === 0) {
      const wrapped = li.querySelector('ul, ol')
      if (wrapped) nestedLists = [wrapped]
    }

    for (const nestedList of nestedLists) {
      const content = parseListItems(nestedList, classStyles)
      if (nestedList.tagName.toLowerCase() === 'ul') {
        blockContent.push({type: 'bulletList', attrs: {}, content})
      } else {
        // TipTap's OrderedList extension requires a `start` attr.
        const start = Number.parseInt(nestedList.getAttribute('start') ?? '', 10)
        blockContent.push({
          type: 'orderedList',
          attrs: {start: Number.isNaN(start) ? 1 : start},
          content,
        })
      }
    }

    return {type: 'listItem', content: blockContent}
  })
}
/**
 * Converts a `table` element into a TipTap `table` node.
 *
 * Only rows and cells that belong to this table are used; rows and cells of
 * tables nested inside a cell are excluded so they do not show up as extra
 * rows/cells of the outer table. Every cell of the first row, and every `th`
 * anywhere, becomes a `tableHeader`; other cells become `tableCell`.
 * `colspan`/`rowspan` default to 1 when missing, non-numeric or below 1.
 *
 * @param tableEl - The `table` element.
 * @param classStyles - Class name to style lookup used for inline marks.
 * @returns The `table` node.
 */
function parseTable(
  tableEl: Element,
  classStyles: Map<string, Record<string, string>>,
): JSONContent {
  // Reads a span attribute, falling back to 1 for anything unusable.
  const readSpan = (cell: Element, attr: 'colspan' | 'rowspan'): number => {
    const parsed = Number.parseInt(cell.getAttribute(attr) ?? '', 10)
    return Number.isNaN(parsed) || parsed < 1 ? 1 : parsed
  }

  // querySelectorAll also finds rows of nested tables; keep only our own.
  const rows = Array.from(tableEl.querySelectorAll('tr')).filter(
    (row) => row.closest('table') === tableEl,
  )

  return {
    type: 'table',
    content: rows.map((row, rowIndex) => {
      // Same filtering for cells: skip those owned by a nested table's row.
      const cells = Array.from(row.querySelectorAll('td, th')).filter(
        (cell) => cell.closest('tr') === row,
      )
      return {
        type: 'tableRow',
        content: cells.map((cell) => ({
          type: rowIndex === 0 || cell.tagName.toLowerCase() === 'th' ? 'tableHeader' : 'tableCell',
          attrs: {
            colspan: readSpan(cell, 'colspan'),
            rowspan: readSpan(cell, 'rowspan'),
          },
          content: [{type: 'paragraph', content: parseInlineElements(cell, classStyles)}],
        })),
      }
    }),
  }
}
/**
 * Converts the child nodes of an element into inline TipTap nodes (text with
 * marks, hard breaks and images).
 *
 * - Text nodes become text nodes; whitespace-only text nodes are dropped.
 *   Every literal `<aside>` / `</aside>` found in the text is replaced by a
 *   newline.
 * - `br` becomes `hardBreak`; `img` becomes an `image` when its `src` starts
 *   with "http" and is skipped otherwise.
 * - Inline containers that have child elements are parsed recursively and
 *   their marks are added to the inner text nodes (no duplicate mark types).
 * - Any other element contributes its `textContent` as one text node carrying
 *   the element's marks; elements with empty text are skipped.
 *
 * @param el - Element whose `childNodes` are converted.
 * @param classStyles - Class name to style lookup used by `getMarks`.
 * @param skipNested - When true, direct `ul`/`ol` children are ignored (used
 *   when extracting the text of a list item).
 * @returns Inline nodes in document order.
 */
function parseInlineElements(
  el: Element,
  classStyles: Map<string, Record<string, string>>,
  skipNested = false,
): JSONContent[] {
  // Tags whose children are parsed recursively. Includes every tag that
  // getMarks maps to a mark, so nested formatting inside e.g. del is kept.
  const inlineContainerTags = [
    'span',
    'a',
    'strong',
    'em',
    'b',
    'i',
    'u',
    's',
    'strike',
    'del',
    'mark',
    'code',
    'label',
  ]
  const nodes: JSONContent[] = []
  for (const child of Array.from(el.childNodes)) {
    // Plain text node
    if (child.nodeType === 3) {
      // Replace every literal aside tag in the text, not only the first one
      const text = (child.textContent ?? '').replace(/<\/?aside>/g, '\n')
      if (text.trim()) nodes.push({type: 'text', text})
      continue
    }
    if (child.nodeType !== 1) continue
    const childEl = child as Element
    const tag = childEl.tagName.toLowerCase()
    // Skip nested lists when extracting list item text
    if (skipNested && ['ul', 'ol'].includes(tag)) continue
    // Line break
    if (tag === 'br') {
      nodes.push({type: 'hardBreak'})
      continue
    }
    // Inline image
    if (tag === 'img') {
      const src = childEl.getAttribute('src')
      if (src && src.startsWith('http')) nodes.push({type: 'image', attrs: {src}})
      continue
    }
    // Marks
    const marks = getMarks(childEl, tag, classStyles)
    const hasChildElements = childEl.children.length > 0
    if (inlineContainerTags.includes(tag) && hasChildElements) {
      // Recurse into children and apply this element's marks on top
      const innerNodes = parseInlineElements(childEl, classStyles, skipNested)
      for (const inner of innerNodes) {
        if (inner.type === 'text' && marks.length > 0) {
          // Merge marks, avoiding duplicates of a type the inner node already has
          const existingTypes = new Set((inner.marks ?? []).map((m: any) => m.type))
          const newMarks = marks.filter((m) => !existingTypes.has(m.type as string))
          nodes.push({
            ...inner,
            marks: [...(inner.marks ?? []), ...newMarks],
          } as JSONContent)
        } else {
          nodes.push(inner)
        }
      }
      continue
    }
    // Leaf element (or non-container): emit its text with the element's marks
    const text = childEl.textContent ?? ''
    if (!text) continue
    nodes.push({
      type: 'text',
      text,
      ...(marks.length > 0 && {marks: marks as Array<{type: string; attrs?: Record<string, any>}>}),
    })
  }
  return nodes
}
/**
 * Derives the TipTap marks that apply to an inline element.
 *
 * Marks come from two sources, in this order:
 * 1. the tag itself (bold, italic, underline, strike, code, highlight, link);
 * 2. the effective style, built from the element's classes (looked up in
 *    `classStyles`, later classes overriding earlier ones) and then its inline
 *    `style` attribute, which overrides class styles.
 *
 * A style-derived mark is only added when the tag did not already produce a
 * mark of the same type.
 *
 * @param el - The inline element.
 * @param tag - Lower-cased tag name of `el`.
 * @param classStyles - Class name to camelCased style properties.
 * @returns Marks to apply to the element's text.
 */
function getMarks(
  el: Element,
  tag: string,
  classStyles: Map<string, Record<string, string>>,
): JSONContent[] {
  const marks: JSONContent[] = []
  const hasMark = (type: string): boolean => marks.some((mark) => mark.type === type)

  // Tag-based marks
  switch (tag) {
    case 'b':
    case 'strong':
      marks.push({type: 'bold'})
      break
    case 'i':
    case 'em':
      marks.push({type: 'italic'})
      break
    case 'u':
      marks.push({type: 'underline'})
      break
    case 's':
    case 'strike':
    case 'del':
      marks.push({type: 'strike'})
      break
    case 'code':
      marks.push({type: 'code'})
      break
    case 'mark':
      marks.push({type: 'highlight'})
      break
    case 'a':
      marks.push({
        type: 'link',
        attrs: {href: cleanHref(el.getAttribute('href') ?? ''), target: '_blank'},
      })
      break
  }

  // Effective style: class rules first, inline style last so it wins
  const style: Record<string, string> = {}
  for (const className of Array.from(el.classList)) {
    const fromClass = classStyles.get(className)
    if (fromClass) Object.assign(style, fromClass)
  }
  Object.assign(style, parseStyleString(el.getAttribute('style') ?? ''))

  // Style-based marks
  const fontWeight = style.fontWeight ?? ''
  const textDecoration = style.textDecoration
  if (/^(bold|[7-9]\d{2})$/.test(fontWeight) && !hasMark('bold')) {
    marks.push({type: 'bold'})
  }
  if (style.fontStyle === 'italic' && !hasMark('italic')) {
    marks.push({type: 'italic'})
  }
  if (textDecoration?.includes('underline') && !hasMark('underline')) {
    marks.push({type: 'underline'})
  }
  if (textDecoration?.includes('line-through') && !hasMark('strike')) {
    marks.push({type: 'strike'})
  }
  return marks
}
/**
 * Unwraps Google redirect links.
 *
 * When `href` is an absolute URL on `www.google.com` with path `/url`, the
 * value of its `q` query parameter is returned (or `href` itself when `q` is
 * absent). Any other URL is returned unchanged. Values that cannot be parsed
 * as a URL are logged via `debug` and returned unchanged.
 *
 * @param href - Raw link target.
 * @returns The unwrapped target or the original `href`.
 */
function cleanHref(href: string): string {
  let parsed: URL
  try {
    parsed = new URL(href)
  } catch (error) {
    debug('Invalid URL:', href, error)
    return href
  }
  const isGoogleRedirect = parsed.hostname === 'www.google.com' && parsed.pathname === '/url'
  if (!isGoogleRedirect) return href
  return parsed.searchParams.get('q') ?? href
}
/**
 * Extracts a Google Docs document ID from a URL or a raw ID string.
 *
 * Patterns are tried in order and the first match wins:
 * 1. `/document/d/{id}` anywhere in the string;
 * 2. `id={id}` anywhere in the string;
 * 3. the whole string consisting only of ID characters.
 *
 * @param url - URL or raw document ID.
 * @returns The ID, or `null` when no pattern matches.
 */
export function extractGoogleDocId(url: string) {
  const match =
    url.match(/\/document\/d\/([a-zA-Z0-9-_]+)/) ?? // standard /d/{id}/ format
    url.match(/id=([a-zA-Z0-9-_]+)/) ?? // ?id= query param format
    url.match(/^([a-zA-Z0-9-_]+)$/) // raw ID passed directly
  return match ? match[1] : null
}
/**
 * Converts Markdown to a TipTap document by rendering it to HTML with `marked`
 * and running the result through the HTML body parser.
 *
 * @param markdown - Markdown source.
 * @returns The `doc` node for the rendered HTML.
 */
function markdownToJSONContent(markdown: string): JSONContent {
  // NOTE(review): the cast assumes `marked` returns a string synchronously here - confirm for the installed version.
  const renderedHtml = marked(markdown) as string
  const {document} = new JSDOM(renderedHtml).window
  return parseHtmlBodyToJSONContent(document)
}
/**
 * Converts fetched content into a TipTap document, choosing the parser from
 * the content type and URL.
 *
 * - HTML when the content type contains `text/html`;
 * - Markdown when the content type contains `text/markdown` or the URL path
 *   ends in `.md`;
 * - plain text otherwise.
 *
 * The content type and the `.md` extension are compared case-insensitively,
 * and the extension check ignores any query string or fragment, so
 * `README.md?raw=1` is still treated as Markdown.
 *
 * @param content - Raw body text.
 * @param contentType - Value of the Content-Type header (may include parameters).
 * @param url - URL the content was fetched from.
 * @returns The converted document.
 */
export function convertToJSONContent(
  content: string,
  contentType: string,
  url: string,
): JSONContent {
  const mimeType = contentType.toLowerCase()
  if (mimeType.includes('text/html')) {
    return htmlToJSONContent(content, url)
  }
  // Drop "?query" and "#fragment" before looking at the file extension.
  const urlPath = (url.split(/[?#]/, 1)[0] ?? url).toLowerCase()
  if (mimeType.includes('text/markdown') || urlPath.endsWith('.md')) {
    return markdownToJSONContent(content)
  }
  // plain text fallback
  return plainTextToJSONContent(content)
}