// Mirror of https://github.com/CompassConnections/Compass.git
// Synced 2026-04-03 22:44:35 -04:00
// 496 lines · 14 KiB · TypeScript
import {JSONContent} from '@tiptap/core'
|
||
import {debug} from 'common/logger'
|
||
import {JSDOM} from 'jsdom'
|
||
import {marked} from 'marked'
|
||
|
||
/**
 * Parses an HTML string into TipTap JSON content.
 *
 * The markup is loaded into a JSDOM document (with `url` passed as JSDOM's `url` option), the
 * class-based styles are collected from that document, and both are handed to
 * `parseHtmlBodyToJSONContent`.
 *
 * @param html Raw HTML markup.
 * @param url Value forwarded to JSDOM as the document URL.
 * @returns The JSON content produced by `parseHtmlBodyToJSONContent`.
 */
export function htmlToJSONContent(html: string, url: string): JSONContent {
  const {document} = new JSDOM(html, {url}).window

  // Disabled Readability pass, kept for reference:
  // const isGoogleDoc = !!extractGoogleDocId(url)
  // if (!isGoogleDoc) {
  //   const reader = new Readability(document)
  //   const article = reader.parse()
  //   if (article?.content) {
  //     const cleanDom = new JSDOM(article.content)
  //     return parseHtmlBodyToJSONContent(cleanDom.window.document, classStyles)
  //   }
  // }

  return parseHtmlBodyToJSONContent(document, extractClassStyles(document))
}
|
||
|
||
/**
 * Converts plain text into a document with one paragraph per block of text.
 *
 * Blocks are separated by one or more blank lines. A line holding only whitespace counts as
 * blank, and CRLF line endings are handled (`\r` is whitespace), so text written on Windows is
 * split the same way as LF-only text. Each block is trimmed and empty blocks are dropped.
 *
 * @param text Raw plain text.
 * @returns A `doc` node whose content is a (possibly empty) list of paragraph nodes.
 */
function plainTextToJSONContent(text: string): JSONContent {
  const paragraphs = text
    // A newline, optional whitespace (covers "\r" and whitespace-only lines), then a newline.
    .split(/\n\s*\n/)
    .map((block) => block.trim())
    .filter(Boolean)
    .map((block) => ({
      type: 'paragraph' as const,
      content: [{type: 'text' as const, text: block}],
    }))

  return {type: 'doc', content: paragraphs}
}
|
||
|
||
/**
 * Collects the declarations of simple `.className { ... }` rules from every `<style>` element.
 *
 * Rules are found with a regular expression, not a CSS parser: only the class name directly in
 * front of the `{` is recorded. When the same class appears in several rules the declarations
 * are merged, with later declarations overriding earlier ones property by property, so an
 * earlier `font-weight` is not lost when a later rule only sets another property.
 *
 * @param document Document whose `<style>` elements are scanned.
 * @returns Map from class name to its declarations, keyed by camelCased property name.
 */
function extractClassStyles(document: Document): Map<string, Record<string, string>> {
  const classStyles = new Map<string, Record<string, string>>()

  for (const styleEl of document.querySelectorAll('style')) {
    const css = styleEl.textContent ?? ''

    // Match .className { prop: value; prop: value }
    for (const [, className, declarations] of css.matchAll(/\.([a-zA-Z0-9_-]+)\s*\{([^}]+)}/g)) {
      classStyles.set(className, {
        ...classStyles.get(className),
        ...parseStyleString(declarations),
      })
    }
  }

  return classStyles
}
|
||
|
||
/**
 * Builds a `doc` node from the element children of `document.body`.
 *
 * @param document Parsed DOM document.
 * @param classStyles Optional pre-computed class styles; when omitted they are extracted from
 *   `document`.
 * @returns A `doc` node whose content is the parsed block elements of the body.
 */
export function parseHtmlBodyToJSONContent(
  document: Document,
  classStyles?: Map<string, Record<string, string>>,
): JSONContent {
  const styles = classStyles ?? extractClassStyles(document)
  return {type: 'doc', content: parseBlockElements(document.body.children, styles)}
}
|
||
|
||
/**
 * Parses each element with `parseBlockElement` and returns the resulting nodes in order.
 *
 * Elements that parse to `null` are skipped; `__fragment` results are replaced by their
 * flattened children.
 *
 * @param children Elements to parse.
 * @param classStyles Class-name → style map forwarded to `parseBlockElement`.
 * @returns Flat list of block nodes.
 */
function parseBlockElements(
  children: HTMLCollection | Element[],
  classStyles: Map<string, Record<string, string>>,
): JSONContent[] {
  return Array.from(children).flatMap((element): JSONContent[] => {
    const parsed = parseBlockElement(element, element.tagName.toLowerCase(), classStyles)
    if (!parsed) return []

    // Fragments can contain fragments, so flatten recursively.
    return (parsed as any).type === '__fragment' ? flattenFragment(parsed as any) : [parsed]
  })
}
|
||
|
||
/**
 * Returns the children of a `__fragment` node, with nested `__fragment` children replaced
 * (recursively) by their own children.
 *
 * @param node Node whose `content` array is flattened.
 * @returns The non-fragment descendants in document order.
 */
function flattenFragment(node: any): JSONContent[] {
  const flattened: JSONContent[] = []

  for (const child of node.content) {
    if (child.type === '__fragment') {
      flattened.push(...flattenFragment(child))
    } else {
      flattened.push(child)
    }
  }

  return flattened
}
|
||
|
||
/**
 * Converts one block-level element into a JSON content node.
 *
 * Handles headings, paragraphs, lists, blockquotes, code blocks, stray `<code>`, horizontal
 * rules, images, figures, tables and generic containers. A container with several block
 * children is returned as an internal `{type: '__fragment'}` node that callers flatten.
 *
 * Text nodes are only emitted when their text is non-empty: an empty `<pre>` yields a code block
 * without content and an empty `<code>` yields `null`.
 *
 * @param el Element to convert.
 * @param tag Lower-cased tag name of `el`.
 * @param classStyles Class-name → style map used when resolving inline marks.
 * @returns The converted node, or `null` when the element is unsupported or has no content.
 */
function parseBlockElement(
  el: Element,
  tag: string,
  classStyles: Map<string, Record<string, string>>,
): JSONContent | null {
  // Headings h1–h6
  if (/^h[1-6]$/.test(tag)) {
    return {
      type: 'heading',
      attrs: {level: Number(tag.charAt(1))},
      content: parseInlineElements(el, classStyles),
    }
  }

  // Paragraph
  if (tag === 'p') {
    const inline = parseInlineElements(el, classStyles)
    return inline.length > 0 ? {type: 'paragraph', content: inline} : null
  }

  // Lists
  if (tag === 'ol') {
    // Honour <ol start="N">; fall back to 1 when the attribute is missing or not a number.
    const start = parseInt(el.getAttribute('start') ?? '', 10)
    return {
      type: 'orderedList',
      attrs: {start: Number.isNaN(start) ? 1 : start}, // ← required by TipTap's OrderedList extension
      content: parseListItems(el, classStyles),
    }
  }
  if (tag === 'ul') {
    return {
      type: 'bulletList',
      attrs: {},
      content: parseListItems(el, classStyles),
    }
  }

  // Blockquote
  if (tag === 'blockquote') {
    return {
      type: 'blockquote',
      content: parseBlockElements(el.children, classStyles),
    }
  }

  // Code block <pre><code>...</code></pre>
  if (tag === 'pre') {
    const codeEl = el.querySelector('code')
    const language = codeEl?.className.match(/language-(\w+)/)?.[1] ?? null
    const text = (codeEl ?? el).textContent ?? ''
    return {
      type: 'codeBlock',
      attrs: {language},
      // Empty text nodes are invalid, so an empty <pre> gets no content at all.
      ...(text ? {content: [{type: 'text', text}]} : {}),
    }
  }

  // Inline code outside of pre (treat as paragraph)
  if (tag === 'code') {
    const text = el.textContent ?? ''
    if (!text) return null
    return {
      type: 'paragraph',
      content: [{type: 'text', text, marks: [{type: 'code'}]}],
    }
  }

  // Horizontal rule
  if (tag === 'hr') {
    return {type: 'horizontalRule'}
  }

  // Image — only sources starting with "http" are kept
  if (tag === 'img') {
    const src = el.getAttribute('src')
    if (!src || !src.startsWith('http')) return null
    return {
      type: 'image',
      attrs: {
        src,
        alt: el.getAttribute('alt') ?? null,
        title: el.getAttribute('title') ?? null,
      },
    }
  }

  // Figure (image + optional caption)
  if (tag === 'figure') {
    const img = el.querySelector('img')
    const caption = el.querySelector('figcaption')?.textContent ?? null
    const src = img?.getAttribute('src')
    if (!src || !src.startsWith('http')) return null
    return {
      type: 'image',
      attrs: {
        src,
        alt: img?.getAttribute('alt') ?? caption,
        title: caption,
      },
    }
  }

  // Table
  if (tag === 'table') {
    return parseTable(el, classStyles)
  }

  // Container elements — recurse into children
  if (['div', 'section', 'article', 'main', 'header', 'footer', 'aside'].includes(tag)) {
    const inner = parseBlockElementsWithText(el, classStyles)
    if (inner.length === 0) return null
    if (inner.length === 1) return inner[0]

    // Always use fragment — never paragraph — for multiple block children
    return {type: '__fragment', content: inner} as any
  }

  return null
}
|
||
|
||
/**
 * Parses the child nodes of a container, including text that sits directly inside it.
 *
 * - Non-blank text nodes become paragraphs holding the trimmed text.
 * - `span.section-header` elements with text become level-2 headings.
 * - Other elements go through `parseBlockElement`; `__fragment` results are flattened.
 * - Every other node type is ignored.
 *
 * @param el Container element whose `childNodes` are walked.
 * @param classStyles Class-name → style map forwarded to `parseBlockElement`.
 * @returns Block nodes in document order.
 */
function parseBlockElementsWithText(
  el: Element,
  classStyles: Map<string, Record<string, string>>,
): JSONContent[] {
  const TEXT_NODE = 3
  const ELEMENT_NODE = 1
  const blocks: JSONContent[] = []

  for (const node of el.childNodes) {
    if (node.nodeType === TEXT_NODE) {
      // Bare text directly in the container — wrap it in a paragraph.
      const trimmed = (node.textContent ?? '').trim()
      if (trimmed) blocks.push({type: 'paragraph', content: [{type: 'text', text: trimmed}]})
    } else if (node.nodeType === ELEMENT_NODE) {
      const element = node as Element
      const tagName = element.tagName.toLowerCase()

      if (tagName === 'span' && element.classList.contains('section-header')) {
        // span.section-header is treated as a heading.
        const title = element.textContent?.trim()
        if (title) {
          blocks.push({type: 'heading', attrs: {level: 2}, content: [{type: 'text', text: title}]})
        }
      } else {
        const parsed = parseBlockElement(element, tagName, classStyles)
        if (parsed) {
          if ((parsed as any).type === '__fragment') {
            blocks.push(...flattenFragment(parsed as any))
          } else {
            blocks.push(parsed)
          }
        }
      }
    }
  }

  return blocks
}
|
||
|
||
/**
 * Parses a CSS declaration list (`"prop: value; prop: value"`) into an object.
 *
 * Declarations are separated by `;`; blank ones are ignored. Everything before the first `:` is
 * the property name, converted from kebab-case to camelCase (e.g. `font-weight` → `fontWeight`);
 * everything after it is the trimmed value. A declaration without `:` maps to an empty string.
 *
 * @param style Declaration list, e.g. the value of a `style` attribute.
 * @returns Object keyed by camelCased property name.
 */
function parseStyleString(style: string): Record<string, string> {
  const entries: Array<[string, string]> = []

  for (const chunk of style.split(';')) {
    const declaration = chunk.trim()
    if (declaration === '') continue

    const colonAt = declaration.indexOf(':')
    const rawName = colonAt === -1 ? declaration : declaration.slice(0, colonAt)
    const rawValue = colonAt === -1 ? '' : declaration.slice(colonAt + 1)

    const camelName = rawName.trim().replace(/-([a-z])/g, (_, letter) => letter.toUpperCase())
    entries.push([camelName, rawValue.trim()])
  }

  return Object.fromEntries(entries)
}
|
||
|
||
/**
 * Converts the direct `<li>` children of a list element into `listItem` nodes.
 *
 * Each item starts with a paragraph holding the item's inline content (nested lists are skipped
 * there) followed by one list node per nested list. Every `<ul>` / `<ol>` that is a direct child
 * of the `<li>` is kept, so a second nested list is not dropped. When the item has no direct
 * list child, the first descendant list (if any) is used instead.
 *
 * Nested lists carry the same `attrs` as top-level lists: `{}` for bullet lists and `{start}`
 * for ordered lists, where `start` comes from the `start` attribute and defaults to 1.
 *
 * @param listEl The `<ul>` or `<ol>` element.
 * @param classStyles Class-name → style map used when resolving inline marks.
 * @returns One `listItem` node per direct `<li>` child.
 */
function parseListItems(
  listEl: Element,
  classStyles: Map<string, Record<string, string>>,
): JSONContent[] {
  const isList = (node: Element): boolean => ['ul', 'ol'].includes(node.tagName.toLowerCase())

  return Array.from(listEl.querySelectorAll(':scope > li')).map((li) => {
    const blockContent: JSONContent[] = [
      {type: 'paragraph', content: parseInlineElements(li, classStyles, true)},
    ]

    const directLists = Array.from(li.children).filter(isList)
    const firstDescendantList = li.querySelector('ul, ol')
    const nestedLists =
      directLists.length > 0 ? directLists : firstDescendantList ? [firstDescendantList] : []

    for (const nestedList of nestedLists) {
      const content = parseListItems(nestedList, classStyles)

      if (nestedList.tagName.toLowerCase() === 'ul') {
        blockContent.push({type: 'bulletList', attrs: {}, content})
      } else {
        const start = parseInt(nestedList.getAttribute('start') ?? '', 10)
        blockContent.push({
          type: 'orderedList',
          attrs: {start: Number.isNaN(start) ? 1 : start},
          content,
        })
      }
    }

    return {type: 'listItem', content: blockContent}
  })
}
|
||
|
||
/**
 * Converts a `<table>` element into a `table` node.
 *
 * Every `<tr>` found under the table becomes a `tableRow`; every `<td>` / `<th>` found under a
 * row becomes a cell holding one paragraph of inline content. Cells in the first row and all
 * `<th>` cells are emitted as `tableHeader`, the rest as `tableCell`.
 *
 * `colspan` / `rowspan` fall back to 1 when the attribute is missing, empty, non-numeric or
 * below 1, so the output never contains `NaN` spans.
 *
 * @param tableEl The `<table>` element.
 * @param classStyles Class-name → style map used when resolving inline marks.
 * @returns The `table` node.
 */
function parseTable(
  tableEl: Element,
  classStyles: Map<string, Record<string, string>>,
): JSONContent {
  const readSpan = (cell: Element, attr: 'colspan' | 'rowspan'): number => {
    const span = parseInt(cell.getAttribute(attr) ?? '', 10)
    // NaN fails the comparison, so unparsable values also fall back to 1.
    return span >= 1 ? span : 1
  }

  const rows = Array.from(tableEl.querySelectorAll('tr'))

  return {
    type: 'table',
    content: rows.map((row, rowIndex) => ({
      type: 'tableRow',
      content: Array.from(row.querySelectorAll('td, th')).map((cell) => ({
        type: rowIndex === 0 || cell.tagName.toLowerCase() === 'th' ? 'tableHeader' : 'tableCell',
        attrs: {
          colspan: readSpan(cell, 'colspan'),
          rowspan: readSpan(cell, 'rowspan'),
        },
        content: [{type: 'paragraph', content: parseInlineElements(cell, classStyles)}],
      })),
    })),
  }
}
|
||
|
||
/**
 * Converts the child nodes of an element into inline JSON content (text, hard breaks, images).
 *
 * - Text nodes become `text` nodes; every literal `<aside>` / `</aside>` in the text is replaced
 *   by a newline. Text nodes that are blank after trimming are dropped.
 * - `<br>` becomes `hardBreak`; `<img>` with a source starting with "http" becomes `image`.
 * - Inline containers (`span`, `a`, `strong`, `del`, …) that have child elements are parsed
 *   recursively, and the container's marks are added to each inner text node unless a mark of
 *   that type is already present.
 * - Any other element contributes its `textContent` as one text node carrying its marks.
 *
 * @param el Element whose `childNodes` are converted.
 * @param classStyles Class-name → style map used by `getMarks`.
 * @param skipNested When true, `<ul>` / `<ol>` children are skipped (used for list-item text).
 * @returns Inline nodes in document order.
 */
function parseInlineElements(
  el: Element,
  classStyles: Map<string, Record<string, string>>,
  skipNested = false,
): JSONContent[] {
  // 'strike' and 'del' are included because getMarks maps them to the strike mark too.
  const inlineContainers = [
    'span',
    'a',
    'strong',
    'em',
    'b',
    'i',
    'u',
    's',
    'strike',
    'del',
    'mark',
    'code',
    'label',
  ]
  const nodes: JSONContent[] = []

  for (const child of el.childNodes) {
    // Plain text node
    if (child.nodeType === 3) {
      // A global regex is needed here: String.replace with a string pattern only replaces the
      // first occurrence, which left later <aside> tags in the text.
      const text = (child.textContent ?? '').replace(/<\/?aside>/g, '\n')

      if (text.trim()) nodes.push({type: 'text', text})
      continue
    }

    if (child.nodeType !== 1) continue
    const childEl = child as Element
    const tag = childEl.tagName.toLowerCase()

    // Skip nested lists when extracting list item text
    if (skipNested && ['ul', 'ol'].includes(tag)) continue

    // Line break
    if (tag === 'br') {
      nodes.push({type: 'hardBreak'})
      continue
    }

    // Inline image
    if (tag === 'img') {
      const src = childEl.getAttribute('src')
      if (src && src.startsWith('http')) nodes.push({type: 'image', attrs: {src}})
      continue
    }

    // Marks
    const marks = getMarks(childEl, tag, classStyles)

    const isInlineContainer = inlineContainers.includes(tag)
    const hasChildElements = childEl.children.length > 0

    if (isInlineContainer && hasChildElements) {
      // Recurse into children and apply this element's marks on top
      const innerNodes = parseInlineElements(childEl, classStyles, skipNested)
      for (const inner of innerNodes) {
        if (inner.type === 'text' && marks.length > 0) {
          // Merge marks — avoid duplicates
          const existingTypes = new Set((inner.marks ?? []).map((m: any) => m.type))
          const newMarks = marks.filter((m) => !existingTypes.has(m.type as string))
          nodes.push({
            ...inner,
            marks: [...(inner.marks ?? []), ...newMarks],
          } as JSONContent)
        } else {
          nodes.push(inner)
        }
      }
      continue
    }

    const text = childEl.textContent ?? ''
    if (!text) continue

    nodes.push({
      type: 'text',
      text,
      ...(marks.length > 0 && {marks: marks as Array<{type: string; attrs?: Record<string, any>}>}),
    })
  }

  return nodes
}
|
||
|
||
/**
 * Determines the marks an inline element contributes to its text.
 *
 * Marks come from two sources:
 * 1. The tag itself: `b`/`strong` → bold, `i`/`em` → italic, `u` → underline,
 *    `s`/`strike`/`del` → strike, `code` → code, `mark` → highlight, `a` → link.
 * 2. The element's effective style — class styles from `classStyles` in class order, overridden
 *    by the inline `style` attribute: `fontWeight` of `bold` or 700–999 → bold,
 *    `fontStyle: italic` → italic, `textDecoration` containing `underline` / `line-through` →
 *    underline / strike. A style never adds a mark type that is already present.
 *
 * An `<a>` without a non-blank `href` (for example a named anchor) gets no link mark, which
 * avoids links with an empty target.
 *
 * NOTE(review): the href scheme is not checked here (e.g. `javascript:`) — confirm that it is
 * validated or sanitised wherever the resulting content is rendered.
 *
 * @param el Element to inspect.
 * @param tag Lower-cased tag name of `el`.
 * @param classStyles Class-name → style map.
 * @returns The marks for the element, possibly empty.
 */
function getMarks(
  el: Element,
  tag: string,
  classStyles: Map<string, Record<string, string>>,
): JSONContent[] {
  const marks: JSONContent[] = []
  const hasMark = (type: string): boolean => marks.some((m) => m.type === type)

  if (['b', 'strong'].includes(tag)) marks.push({type: 'bold'})
  if (['i', 'em'].includes(tag)) marks.push({type: 'italic'})
  if (tag === 'u') marks.push({type: 'underline'})
  if (['s', 'strike', 'del'].includes(tag)) marks.push({type: 'strike'})
  if (tag === 'code') marks.push({type: 'code'})
  if (tag === 'mark') marks.push({type: 'highlight'})

  if (tag === 'a') {
    const rawHref = el.getAttribute('href') ?? ''
    // Anchors without a target are not links.
    if (rawHref.trim() !== '') {
      marks.push({
        type: 'link',
        attrs: {href: cleanHref(rawHref), target: '_blank'},
      })
    }
  }

  // Effective style: class styles first, inline style last so it wins.
  const style: Record<string, string> = {}
  for (const cls of Array.from(el.classList)) {
    const resolved = classStyles.get(cls)
    if (resolved) Object.assign(style, resolved)
  }
  Object.assign(style, parseStyleString(el.getAttribute('style') ?? ''))

  if (!hasMark('bold') && /^(bold|[7-9]\d{2})$/.test(style.fontWeight ?? '')) {
    marks.push({type: 'bold'})
  }

  if (!hasMark('italic') && style.fontStyle === 'italic') {
    marks.push({type: 'italic'})
  }

  if (style.textDecoration?.includes('underline') && !hasMark('underline')) {
    marks.push({type: 'underline'})
  }

  if (style.textDecoration?.includes('line-through') && !hasMark('strike')) {
    marks.push({type: 'strike'})
  }

  return marks
}
|
||
|
||
/**
 * Unwraps Google redirect links.
 *
 * When `href` parses as a URL on host `www.google.com` with path `/url`, the value of its `q`
 * query parameter is returned (or `href` itself when `q` is absent). Every other value is
 * returned unchanged; values that fail to parse as a URL are additionally reported via `debug`.
 *
 * @param href Link target to clean.
 * @returns The unwrapped target, or `href` unchanged.
 */
function cleanHref(href: string): string {
  let parsed: URL
  try {
    parsed = new URL(href)
  } catch (error) {
    debug('Invalid URL:', href, error)
    return href
  }

  const isGoogleRedirect = parsed.hostname === 'www.google.com' && parsed.pathname === '/url'
  if (!isGoogleRedirect) return href

  return parsed.searchParams.get('q') ?? href
}
|
||
|
||
/**
 * Extracts a Google Doc id from a URL or from a bare id.
 *
 * The patterns are tried in order and the first match wins:
 * 1. `/document/d/{id}` anywhere in the string,
 * 2. `id={id}` anywhere in the string,
 * 3. the whole string consisting only of id characters (letters, digits, `-`, `_`).
 *
 * @param url URL or raw id.
 * @returns The captured id, or `null` when no pattern matches.
 */
export function extractGoogleDocId(url: string) {
  const idPatterns = [
    /\/document\/d\/([a-zA-Z0-9-_]+)/, // standard /d/{id}/ format
    /id=([a-zA-Z0-9-_]+)/, // ?id= query param format
    /^([a-zA-Z0-9-_]+)$/, // raw ID passed directly
  ]

  for (const idPattern of idPatterns) {
    const found = idPattern.exec(url)
    if (found !== null) return found[1]
  }

  return null
}
|
||
|
||
/**
 * Converts Markdown to JSON content by rendering it to HTML with `marked`, loading that HTML
 * into JSDOM and parsing the resulting document body.
 *
 * @param markdown Markdown source.
 * @returns The JSON content produced by `parseHtmlBodyToJSONContent`.
 */
function markdownToJSONContent(markdown: string): JSONContent {
  const rendered = marked(markdown) as string
  const {document} = new JSDOM(rendered).window
  return parseHtmlBodyToJSONContent(document)
}
|
||
|
||
/**
 * Converts fetched content into JSON content, choosing the parser from the content type and URL.
 *
 * - A content type containing `text/html` is parsed as HTML.
 * - A content type containing `text/markdown`, or a URL ending in `.md`, is parsed as Markdown.
 * - Anything else is treated as plain text.
 *
 * The content type is compared case-insensitively. The `.md` check also ignores case, and it is
 * applied both to the full URL and to the URL with its query string and fragment removed, so
 * `notes.md?raw=1` is recognised as Markdown.
 *
 * @param content Raw body text.
 * @param contentType Value of the `Content-Type` header (may include parameters).
 * @param url URL the content came from.
 * @returns The converted JSON content.
 */
export function convertToJSONContent(
  content: string,
  contentType: string,
  url: string,
): JSONContent {
  const mediaType = contentType.toLowerCase()

  if (mediaType.includes('text/html')) {
    return htmlToJSONContent(content, url)
  }

  const urlWithoutQuery = url.split(/[?#]/, 1)[0] ?? ''
  const hasMarkdownExtension = [url, urlWithoutQuery].some((candidate) =>
    candidate.toLowerCase().endsWith('.md'),
  )

  if (mediaType.includes('text/markdown') || hasMarkdownExtension) {
    return markdownToJSONContent(content)
  }

  // plain text fallback
  return plainTextToJSONContent(content)
}
|