// Copyright (C) 2023-2025 Lightpanda (Selecy SAS) // // Francis Bouvier // Pierre Tachoire // // This program is free software: you can redistribute it and/or modify // it under the terms of the GNU Affero General Public License as // published by the Free Software Foundation, either version 3 of the // License, or (at your option) any later version. // // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Affero General Public License for more details. // // You should have received a copy of the GNU Affero General Public License // along with this program. If not, see . mod sink; mod types; #[cfg(debug_assertions)] #[global_allocator] static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; use std::cell::Cell; use std::os::raw::{c_uchar, c_void}; use types::*; use encoding_rs::Encoding; use html5ever::interface::tree_builder::QuirksMode; use html5ever::tendril::{StrTendril, TendrilSink}; use html5ever::{ns, parse_document, parse_fragment, LocalName, ParseOpts, Parser, QualName}; #[no_mangle] pub extern "C" fn html5ever_parse_document( html: *mut c_uchar, len: usize, document: Ref, ctx: Ref, create_element_callback: CreateElementCallback, get_data_callback: GetDataCallback, append_callback: AppendCallback, parse_error_callback: ParseErrorCallback, pop_callback: PopCallback, create_comment_callback: CreateCommentCallback, create_processing_instruction: CreateProcessingInstruction, append_doctype_to_document: AppendDoctypeToDocumentCallback, add_attrs_if_missing_callback: AddAttrsIfMissingCallback, get_template_contents_callback: GetTemplateContentsCallback, remove_from_parent_callback: RemoveFromParentCallback, reparent_children_callback: ReparentChildrenCallback, append_before_sibling_callback: AppendBeforeSiblingCallback, append_based_on_parent_node_callback: AppendBasedOnParentNodeCallback, ) -> () { if html.is_null() || len == 0 { return (); } let arena = typed_arena::Arena::new(); let sink = sink::Sink { ctx: ctx, arena: &arena, document: document, quirks_mode: Cell::new(QuirksMode::NoQuirks), pop_callback: pop_callback, append_callback: append_callback, get_data_callback: get_data_callback, parse_error_callback: parse_error_callback, create_element_callback: create_element_callback, create_comment_callback: create_comment_callback, create_processing_instruction: create_processing_instruction, append_doctype_to_document: append_doctype_to_document, add_attrs_if_missing_callback: add_attrs_if_missing_callback, get_template_contents_callback: get_template_contents_callback, remove_from_parent_callback: remove_from_parent_callback, reparent_children_callback: reparent_children_callback, append_before_sibling_callback: append_before_sibling_callback, append_based_on_parent_node_callback: append_based_on_parent_node_callback, }; let bytes = unsafe { std::slice::from_raw_parts(html, len) }; parse_document(sink, Default::default()) .from_utf8() .one(bytes); } /// Parse an HTML document with encoding conversion. /// If charset is provided, converts from that encoding to UTF-8 before parsing. /// Uses Cow internally so no allocation if content is already valid UTF-8. #[no_mangle] pub extern "C" fn html5ever_parse_document_with_encoding( html: *mut c_uchar, len: usize, charset: *const c_uchar, charset_len: usize, document: Ref, ctx: Ref, create_element_callback: CreateElementCallback, get_data_callback: GetDataCallback, append_callback: AppendCallback, parse_error_callback: ParseErrorCallback, pop_callback: PopCallback, create_comment_callback: CreateCommentCallback, create_processing_instruction: CreateProcessingInstruction, append_doctype_to_document: AppendDoctypeToDocumentCallback, add_attrs_if_missing_callback: AddAttrsIfMissingCallback, get_template_contents_callback: GetTemplateContentsCallback, remove_from_parent_callback: RemoveFromParentCallback, reparent_children_callback: ReparentChildrenCallback, append_before_sibling_callback: AppendBeforeSiblingCallback, append_based_on_parent_node_callback: AppendBasedOnParentNodeCallback, ) -> () { if html.is_null() || len == 0 { return (); } let input = unsafe { std::slice::from_raw_parts(html, len) }; let charset_bytes = unsafe { std::slice::from_raw_parts(charset, charset_len) }; // Decode to UTF-8. Returns Cow - no allocation if already valid UTF-8. let encoding = Encoding::for_label(charset_bytes).unwrap_or(encoding_rs::UTF_8); let (decoded, _, _) = encoding.decode(input); let arena = typed_arena::Arena::new(); let sink = sink::Sink { ctx: ctx, arena: &arena, document: document, quirks_mode: Cell::new(QuirksMode::NoQuirks), pop_callback: pop_callback, append_callback: append_callback, get_data_callback: get_data_callback, parse_error_callback: parse_error_callback, create_element_callback: create_element_callback, create_comment_callback: create_comment_callback, create_processing_instruction: create_processing_instruction, append_doctype_to_document: append_doctype_to_document, add_attrs_if_missing_callback: add_attrs_if_missing_callback, get_template_contents_callback: get_template_contents_callback, remove_from_parent_callback: remove_from_parent_callback, reparent_children_callback: reparent_children_callback, append_before_sibling_callback: append_before_sibling_callback, append_based_on_parent_node_callback: append_based_on_parent_node_callback, }; // Parse directly from decoded string parse_document(sink, Default::default()) .one(StrTendril::from(decoded.as_ref())); } // === Encoding API for TextDecoder === /// Result of encoding label lookup #[repr(C)] pub struct EncodingInfo { /// 0 = not found, 1 = found pub found: u8, /// Opaque handle to the encoding (actually &'static Encoding) pub handle: *const c_void, /// Length of canonical name pub name_len: usize, /// Pointer to canonical encoding name (static, lowercase) pub name_ptr: *const c_uchar, } /// Look up an encoding by its label (case-insensitive, whitespace-trimmed) #[no_mangle] pub extern "C" fn encoding_for_label( label: *const c_uchar, label_len: usize, ) -> EncodingInfo { if label.is_null() || label_len == 0 { return EncodingInfo { found: 0, name_len: 0, handle: std::ptr::null(), name_ptr: std::ptr::null(), }; } let label_bytes = unsafe { std::slice::from_raw_parts(label, label_len) }; match Encoding::for_label(label_bytes) { Some(encoding) => { let name = encoding.name(); EncodingInfo { found: 1, name_len: name.len(), name_ptr: name.as_ptr(), handle: encoding as *const _ as *const c_void, } } None => EncodingInfo { found: 0, name_len: 0, name_ptr: std::ptr::null(), handle: std::ptr::null(), }, } } /// Calculate maximum UTF-8 buffer size needed for decoding #[no_mangle] pub extern "C" fn encoding_max_utf8_buffer_length( handle: *const c_void, input_len: usize, ) -> usize { if handle.is_null() { return 0; } let encoding: &'static Encoding = unsafe { &*(handle as *const Encoding) }; let decoder = encoding.new_decoder(); decoder.max_utf8_buffer_length(input_len).unwrap_or(0) } /// Result of decoding operation #[repr(C)] pub struct DecodeResult { /// 0 = no errors, 1 = had malformed sequences (replaced with U+FFFD) pub had_errors: u8, /// Number of input bytes consumed pub bytes_read: usize, /// Number of UTF-8 bytes written to output buffer pub bytes_written: usize, } /// Decode bytes from source encoding to UTF-8 /// For streaming, set is_last=0; for final/complete decode, set is_last=1 #[no_mangle] pub extern "C" fn encoding_decode( handle: *const c_void, input: *const c_uchar, input_len: usize, output: *mut c_uchar, output_len: usize, is_last: u8, ) -> DecodeResult { if handle.is_null() || output.is_null() { return DecodeResult { had_errors: 1, bytes_read: 0, bytes_written: 0, }; } let encoding: &'static Encoding = unsafe { &*(handle as *const Encoding) }; let input_bytes = if input.is_null() || input_len == 0 { &[] } else { unsafe { std::slice::from_raw_parts(input, input_len) } }; let output_slice = unsafe { std::slice::from_raw_parts_mut(output, output_len) }; let mut decoder = encoding.new_decoder(); let last = is_last != 0; let (result, bytes_read, bytes_written, had_errors) = decoder.decode_to_utf8(input_bytes, output_slice, last); // If output buffer was too small, we still report what we could process let _ = result; // CoderResult::InputEmpty or CoderResult::OutputFull DecodeResult { had_errors: if had_errors { 1 } else { 0 }, bytes_read, bytes_written, } } // === Streaming Decoder API === use encoding_rs::Decoder; /// Create a streaming decoder that maintains state across calls #[no_mangle] pub extern "C" fn encoding_decoder_new(handle: *const c_void) -> *mut c_void { if handle.is_null() { return std::ptr::null_mut(); } let encoding: &'static Encoding = unsafe { &*(handle as *const Encoding) }; let decoder = Box::new(encoding.new_decoder()); Box::into_raw(decoder) as *mut c_void } /// Decode using a streaming decoder (maintains state for incomplete sequences) #[no_mangle] pub extern "C" fn encoding_decoder_decode( decoder_ptr: *mut c_void, input: *const c_uchar, input_len: usize, output: *mut c_uchar, output_len: usize, is_last: u8, ) -> DecodeResult { if decoder_ptr.is_null() || output.is_null() { return DecodeResult { had_errors: 1, bytes_read: 0, bytes_written: 0, }; } let decoder: &mut Decoder = unsafe { &mut *(decoder_ptr as *mut Decoder) }; let input_bytes = if input.is_null() || input_len == 0 { &[] } else { unsafe { std::slice::from_raw_parts(input, input_len) } }; let output_slice = unsafe { std::slice::from_raw_parts_mut(output, output_len) }; let last = is_last != 0; let (result, bytes_read, bytes_written, had_errors) = decoder.decode_to_utf8(input_bytes, output_slice, last); let _ = result; DecodeResult { had_errors: if had_errors { 1 } else { 0 }, bytes_read, bytes_written, } } /// Free a streaming decoder #[no_mangle] pub extern "C" fn encoding_decoder_free(decoder_ptr: *mut c_void) { if !decoder_ptr.is_null() { unsafe { drop(Box::from_raw(decoder_ptr as *mut Decoder)); } } } // === Encoding API (UTF-8 to legacy encoding with NCR fallback) === /// Result of encoding operation #[repr(C)] pub struct EncodeResult { /// 0 = success, 1 = output buffer too small pub status: u8, /// Number of input bytes consumed pub bytes_read: usize, /// Number of bytes written to output buffer pub bytes_written: usize, } /// Encode UTF-8 to a legacy encoding, replacing unencodable characters with /// HTML decimal numeric character references (&#codepoint;). /// /// This is used for URL query string encoding per WHATWG URL spec. /// encoding_rs's encode_from_utf8 already produces NCRs for unmappable chars. #[no_mangle] pub extern "C" fn encoding_encode_with_ncr( handle: *const c_void, input: *const c_uchar, input_len: usize, output: *mut c_uchar, output_capacity: usize, ) -> EncodeResult { if handle.is_null() || output.is_null() { return EncodeResult { status: 1, bytes_read: 0, bytes_written: 0, }; } let encoding: &'static Encoding = unsafe { &*(handle as *const Encoding) }; let input_str = if input.is_null() || input_len == 0 { "" } else { let bytes = unsafe { std::slice::from_raw_parts(input, input_len) }; match std::str::from_utf8(bytes) { Ok(s) => s, Err(_) => { return EncodeResult { status: 1, bytes_read: 0, bytes_written: 0, }; } } }; // For UTF-8 encoding, just copy directly (no NCR needed) if encoding == encoding_rs::UTF_8 { if input_len > output_capacity { return EncodeResult { bytes_read: 0, bytes_written: 0, status: 1, }; } let output_slice = unsafe { std::slice::from_raw_parts_mut(output, output_capacity) }; output_slice[..input_len].copy_from_slice(input_str.as_bytes()); return EncodeResult { bytes_read: input_len, bytes_written: input_len, status: 0, }; } let output_slice = unsafe { std::slice::from_raw_parts_mut(output, output_capacity) }; let mut encoder = encoding.new_encoder(); // encode_from_utf8 automatically produces NCRs for unmappable characters let (result, bytes_read, bytes_written, _had_unmappables) = encoder.encode_from_utf8(input_str, output_slice, true); match result { encoding_rs::CoderResult::InputEmpty => EncodeResult { bytes_read, bytes_written, status: 0, }, encoding_rs::CoderResult::OutputFull => EncodeResult { bytes_read, bytes_written, status: 1, }, } } /// Calculate maximum output buffer size needed for encoding with NCR fallback. /// Worst case: every character becomes &#codepoint; where codepoint is up to 7 digits. #[no_mangle] pub extern "C" fn encoding_max_encode_buffer_length( handle: *const c_void, input_len: usize, ) -> usize { if handle.is_null() { return 0; } let encoding: &'static Encoding = unsafe { &*(handle as *const Encoding) }; let encoder = encoding.new_encoder(); // This returns the max buffer size accounting for NCR expansion encoder .max_buffer_length_from_utf8_if_no_unmappables(input_len) .map(|len| { // Add extra space for potential NCRs (each char could become &#nnnnnn; = 10 bytes) // But realistically, most chars are mappable, so add 2x as safety margin len.saturating_mul(2) }) .unwrap_or(input_len * 10) } #[no_mangle] pub extern "C" fn html5ever_parse_fragment( html: *mut c_uchar, len: usize, document: Ref, ctx: Ref, create_element_callback: CreateElementCallback, get_data_callback: GetDataCallback, append_callback: AppendCallback, parse_error_callback: ParseErrorCallback, pop_callback: PopCallback, create_comment_callback: CreateCommentCallback, create_processing_instruction: CreateProcessingInstruction, append_doctype_to_document: AppendDoctypeToDocumentCallback, add_attrs_if_missing_callback: AddAttrsIfMissingCallback, get_template_contents_callback: GetTemplateContentsCallback, remove_from_parent_callback: RemoveFromParentCallback, reparent_children_callback: ReparentChildrenCallback, append_before_sibling_callback: AppendBeforeSiblingCallback, append_based_on_parent_node_callback: AppendBasedOnParentNodeCallback, ) -> () { if html.is_null() || len == 0 { return (); } let arena = typed_arena::Arena::new(); let sink = sink::Sink { ctx: ctx, arena: &arena, document: document, quirks_mode: Cell::new(QuirksMode::NoQuirks), pop_callback: pop_callback, append_callback: append_callback, get_data_callback: get_data_callback, parse_error_callback: parse_error_callback, create_element_callback: create_element_callback, create_comment_callback: create_comment_callback, create_processing_instruction: create_processing_instruction, append_doctype_to_document: append_doctype_to_document, add_attrs_if_missing_callback: add_attrs_if_missing_callback, get_template_contents_callback: get_template_contents_callback, remove_from_parent_callback: remove_from_parent_callback, reparent_children_callback: reparent_children_callback, append_before_sibling_callback: append_before_sibling_callback, append_based_on_parent_node_callback: append_based_on_parent_node_callback, }; let bytes = unsafe { std::slice::from_raw_parts(html, len) }; parse_fragment( sink, Default::default(), QualName::new(None, ns!(html), LocalName::from("body")), vec![], // attributes false, // context_element_allows_scripting ) .from_utf8() .one(bytes); } #[no_mangle] pub extern "C" fn html5ever_attribute_iterator_next( c_iter: *const c_void, ) -> CNullable { let iter: &mut CAttributeIterator = unsafe { &mut *(c_iter as *mut CAttributeIterator) }; let pos = iter.pos; if pos == iter.vec.len() { return CNullable::::none(); } let attr = &iter.vec[pos]; iter.pos += 1; CNullable::::some(CAttribute { name: CQualName::create(&attr.name), value: StringSlice { ptr: attr.value.as_ptr(), len: attr.value.len(), }, }) } #[no_mangle] pub extern "C" fn html5ever_attribute_iterator_count(c_iter: *const c_void) -> usize { let iter: &mut CAttributeIterator = unsafe { &mut *(c_iter as *mut CAttributeIterator) }; return iter.vec.len(); } #[cfg(debug_assertions)] #[repr(C)] pub struct Memory { pub resident: usize, pub allocated: usize, } #[cfg(debug_assertions)] #[no_mangle] pub extern "C" fn html5ever_get_memory_usage() -> Memory { use tikv_jemalloc_ctl::{epoch, stats}; // many statistics are cached and only updated when the epoch is advanced. drop(epoch::advance()); Memory { resident: stats::resident::read().unwrap_or(0), allocated: stats::allocated::read().unwrap_or(0), } } // Streaming parser API // The Parser type from html5ever implements TendrilSink and supports streaming pub struct StreamingParser { #[allow(dead_code)] arena: Box>, parser: Box, } #[no_mangle] pub extern "C" fn html5ever_streaming_parser_create( document: Ref, ctx: Ref, create_element_callback: CreateElementCallback, get_data_callback: GetDataCallback, append_callback: AppendCallback, parse_error_callback: ParseErrorCallback, pop_callback: PopCallback, create_comment_callback: CreateCommentCallback, create_processing_instruction: CreateProcessingInstruction, append_doctype_to_document: AppendDoctypeToDocumentCallback, add_attrs_if_missing_callback: AddAttrsIfMissingCallback, get_template_contents_callback: GetTemplateContentsCallback, remove_from_parent_callback: RemoveFromParentCallback, reparent_children_callback: ReparentChildrenCallback, append_before_sibling_callback: AppendBeforeSiblingCallback, append_based_on_parent_node_callback: AppendBasedOnParentNodeCallback, ) -> *mut c_void { let arena = Box::new(typed_arena::Arena::new()); // SAFETY: We're creating a self-referential structure here. // The arena is stored in the StreamingParser and lives as long as the parser. // The sink contains a reference to the arena that's valid for the parser's lifetime. let arena_ref: &'static typed_arena::Arena = unsafe { std::mem::transmute(arena.as_ref()) }; let sink = sink::Sink { ctx: ctx, arena: arena_ref, document: document, quirks_mode: Cell::new(QuirksMode::NoQuirks), pop_callback: pop_callback, append_callback: append_callback, get_data_callback: get_data_callback, parse_error_callback: parse_error_callback, create_element_callback: create_element_callback, create_comment_callback: create_comment_callback, create_processing_instruction: create_processing_instruction, append_doctype_to_document: append_doctype_to_document, add_attrs_if_missing_callback: add_attrs_if_missing_callback, get_template_contents_callback: get_template_contents_callback, remove_from_parent_callback: remove_from_parent_callback, reparent_children_callback: reparent_children_callback, append_before_sibling_callback: append_before_sibling_callback, append_based_on_parent_node_callback: append_based_on_parent_node_callback, }; // Create a parser which implements TendrilSink for streaming parsing let parser = parse_document(sink, ParseOpts::default()); let streaming_parser = Box::new(StreamingParser { arena, parser: Box::new(parser), }); return Box::into_raw(streaming_parser) as *mut c_void; } #[no_mangle] pub extern "C" fn html5ever_streaming_parser_feed( parser_ptr: *mut c_void, html: *const c_uchar, len: usize, ) -> i32 { if parser_ptr.is_null() || html.is_null() || len == 0 { return 0; } let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| { let streaming_parser = unsafe { &mut *(parser_ptr as *mut StreamingParser) }; let bytes = unsafe { std::slice::from_raw_parts(html, len) }; // Convert bytes to UTF-8 string if let Ok(s) = std::str::from_utf8(bytes) { let tendril = StrTendril::from(s); // Feed the chunk to the parser // The Parser implements TendrilSink, so we can call process() on it let parser = streaming_parser .parser .downcast_mut::>() .expect("Invalid parser type"); parser.process(tendril); } })); match result { Ok(_) => 0, // Success Err(_) => -1, // Panic occurred } } #[no_mangle] pub extern "C" fn html5ever_streaming_parser_finish(parser_ptr: *mut c_void) { if parser_ptr.is_null() { return; } let streaming_parser = unsafe { Box::from_raw(parser_ptr as *mut StreamingParser) }; // Extract and finish the parser let parser = streaming_parser .parser .downcast::>() .expect("Invalid parser type"); // Finish consumes the parser, which will call finish() on the sink parser.finish(); // Note: The arena will be dropped here automatically } #[no_mangle] pub extern "C" fn html5ever_streaming_parser_destroy(parser_ptr: *mut c_void) { if parser_ptr.is_null() { return; } // Drop the parser box without finishing // This is for cases where you want to cancel parsing unsafe { drop(Box::from_raw(parser_ptr as *mut StreamingParser)); } } #[no_mangle] pub extern "C" fn xml5ever_parse_document( xml: *mut c_uchar, len: usize, document: Ref, ctx: Ref, create_element_callback: CreateElementCallback, get_data_callback: GetDataCallback, append_callback: AppendCallback, parse_error_callback: ParseErrorCallback, pop_callback: PopCallback, create_comment_callback: CreateCommentCallback, create_processing_instruction: CreateProcessingInstruction, append_doctype_to_document: AppendDoctypeToDocumentCallback, add_attrs_if_missing_callback: AddAttrsIfMissingCallback, get_template_contents_callback: GetTemplateContentsCallback, remove_from_parent_callback: RemoveFromParentCallback, reparent_children_callback: ReparentChildrenCallback, append_before_sibling_callback: AppendBeforeSiblingCallback, append_based_on_parent_node_callback: AppendBasedOnParentNodeCallback, ) -> () { if xml.is_null() || len == 0 { return (); } let arena = typed_arena::Arena::new(); let sink = sink::Sink { ctx: ctx, arena: &arena, document: document, quirks_mode: Cell::new(QuirksMode::NoQuirks), pop_callback: pop_callback, append_callback: append_callback, get_data_callback: get_data_callback, parse_error_callback: parse_error_callback, create_element_callback: create_element_callback, create_comment_callback: create_comment_callback, create_processing_instruction: create_processing_instruction, append_doctype_to_document: append_doctype_to_document, add_attrs_if_missing_callback: add_attrs_if_missing_callback, get_template_contents_callback: get_template_contents_callback, remove_from_parent_callback: remove_from_parent_callback, reparent_children_callback: reparent_children_callback, append_before_sibling_callback: append_before_sibling_callback, append_based_on_parent_node_callback: append_based_on_parent_node_callback, }; let bytes = unsafe { std::slice::from_raw_parts(xml, len) }; xml5ever::driver::parse_document(sink, xml5ever::driver::XmlParseOpts::default()) .from_utf8() .one(bytes); }