diff --git a/README.md b/README.md
index 02834eee..8f117bbd 100644
--- a/README.md
+++ b/README.md
@@ -25,23 +25,15 @@ Not a Chromium fork. Not a WebKit patch. A new browser, written in Zig.
](https://github.com/lightpanda-io/demo)
-_chromedp requesting 933 real web pages over the network on a AWS EC2 m5.large instance.
-See [benchmark details](https://github.com/lightpanda-io/demo/blob/main/BENCHMARKS.md#crawler-benchmark)._
+## Benchmarks
-Lightpanda is the open-source browser made for headless usage:
+Requesting 933 real web pages over the network on an AWS EC2 m5.large instance.
+See [benchmark details](https://github.com/lightpanda-io/demo/blob/main/BENCHMARKS.md#crawler-benchmark).
-- Javascript execution
-- Support of Web APIs (partial, WIP)
-- Compatible with Playwright[^1], Puppeteer, chromedp through [CDP](https://chromedevtools.github.io/devtools-protocol/)
-
-Fast web automation for AI agents, LLM training, scraping and testing:
-
-- Ultra-low memory footprint (16x less than Chrome)
-- Exceptionally fast execution (9x faster than Chrome)
-- Instant startup
-
-[^1]: **Playwright support disclaimer:**
-Due to the nature of Playwright, a script that works with the current version of the browser may not function correctly with a future version. Playwright uses an intermediate JavaScript layer that selects an execution strategy based on the browser's available features. If Lightpanda adds a new [Web API](https://developer.mozilla.org/en-US/docs/Web/API), Playwright may choose to execute different code for the same script. This new code path could attempt to use features that are not yet implemented. Lightpanda makes an effort to add compatibility tests, but we can't cover all scenarios. If you encounter an issue, please create a [GitHub issue](https://github.com/lightpanda-io/browser/issues) and include the last known working version of the script.
+| Metric | Lightpanda | Headless Chrome | Difference |
+| :---- | :---- | :---- | :---- |
+| Memory (peak, 100 pages) | 123MB | 2GB | ~16x less |
+| Execution time (100 pages) | 5s | 46s | ~9x faster |
## Quick start
@@ -58,12 +50,16 @@ curl -L -o lightpanda https://github.com/lightpanda-io/browser/releases/download
chmod a+x ./lightpanda
```
+[Linux aarch64 is also available](https://github.com/lightpanda-io/browser/releases/tag/nightly)
+
*For MacOS*
```console
curl -L -o lightpanda https://github.com/lightpanda-io/browser/releases/download/nightly/lightpanda-aarch64-macos && \
chmod a+x ./lightpanda
```
+[MacOS x86_64 is also available](https://github.com/lightpanda-io/browser/releases/tag/nightly)
+
*For Windows + WSL2*
The Lightpanda browser is compatible to run on windows inside WSL. Follow the Linux instruction for installation from a WSL terminal.
@@ -82,57 +78,25 @@ docker run -d --name lightpanda -p 127.0.0.1:9222:9222 lightpanda/browser:nightl
### Dump a URL
```console
-./lightpanda fetch --obey-robots --log-format pretty --log-level info https://demo-browser.lightpanda.io/campfire-commerce/
+./lightpanda fetch --obey-robots --dump html --log-format pretty --log-level info https://demo-browser.lightpanda.io/campfire-commerce/
```
-```console
-INFO telemetry : telemetry status . . . . . . . . . . . . . [+0ms]
- disabled = false
-INFO page : navigate . . . . . . . . . . . . . . . . . . . . [+6ms]
- url = https://demo-browser.lightpanda.io/campfire-commerce/
- method = GET
- reason = address_bar
- body = false
- req_id = 1
-
-INFO browser : executing script . . . . . . . . . . . . . . [+118ms]
- src = https://demo-browser.lightpanda.io/campfire-commerce/script.js
- kind = javascript
- cacheable = true
-
-INFO http : request complete . . . . . . . . . . . . . . . . [+140ms]
- source = xhr
- url = https://demo-browser.lightpanda.io/campfire-commerce/json/product.json
- status = 200
- len = 4770
-
-INFO http : request complete . . . . . . . . . . . . . . . . [+141ms]
- source = fetch
- url = https://demo-browser.lightpanda.io/campfire-commerce/json/reviews.json
- status = 200
- len = 1615
-
-```
+You can use `--dump markdown` to convert directly into markdown.
+`--wait-until`, `--wait-ms`, `--wait-selector` and `--wait-script` are
+available to adjust waiting time before dump.
### Start a CDP server
```console
./lightpanda serve --obey-robots --log-format pretty --log-level info --host 127.0.0.1 --port 9222
```
-```console
-INFO telemetry : telemetry status . . . . . . . . . . . . . [+0ms]
- disabled = false
-
-INFO app : server running . . . . . . . . . . . . . . . . . [+0ms]
- address = 127.0.0.1:9222
-```
-
Once the CDP server started, you can run a Puppeteer script by configuring the
`browserWSEndpoint`.
-```js
-'use strict'
+Example Puppeteer script
+```js
import puppeteer from 'puppeteer-core';
// use browserWSEndpoint to pass the Lightpanda's CDP server address.
@@ -159,6 +123,27 @@ await page.close();
await context.close();
await browser.disconnect();
```
+
");
@@ -975,7 +980,7 @@ fn pageDataCallback(response: HttpClient.Response, data: []const u8) !void {
}
switch (self._parse_state) {
- .html => |*buf| try buf.appendSlice(self.arena, data),
+ .html => |*html| try html.buf.appendSlice(self.arena, data),
.text => |*buf| {
// we have to escape the data...
var v = data;
@@ -1024,8 +1029,13 @@ fn pageDoneCallback(ctx: *anyopaque) !void {
var parser = Parser.init(parse_arena, self.document.asNode(), self);
switch (self._parse_state) {
- .html => |buf| {
- parser.parse(buf.items);
+ .html => |*html_state| {
+ const raw_html = html_state.buf.items;
+ if (html_state.needsEncodingConversion()) {
+ parser.parseWithEncoding(raw_html, html_state.mime.charsetString());
+ } else {
+ parser.parse(raw_html);
+ }
self._script_manager.staticScriptsDone();
self._parse_state = .complete;
},
@@ -1092,7 +1102,6 @@ fn pageErrorCallback(ctx: *anyopaque, err: anyerror) void {
return;
};
}
-
pub fn isGoingAway(self: *const Page) bool {
if (self._queued_navigation != null) {
return true;
@@ -3166,11 +3175,21 @@ const ParseState = union(enum) {
pre,
complete,
err: anyerror,
- html: std.ArrayList(u8),
+ html: Html,
text: std.ArrayList(u8),
image: std.ArrayList(u8),
raw: std.ArrayList(u8),
raw_done: []const u8,
+
+    const Html = struct {
+        mime: Mime, // supplies the charset label consulted before parsing
+        buf: std.ArrayList(u8) = .empty, // body bytes, appended as they arrive
+        /// Reports whether the charset label is anything other than "utf-8"/"utf8" (ASCII case-insensitive).
+        fn needsEncodingConversion(self: *const Html) bool {
+            const label = self.mime.charsetString();
+            return !(std.ascii.eqlIgnoreCase(label, "utf-8") or std.ascii.eqlIgnoreCase(label, "utf8"));
+        }
+    };
};
const LoadState = enum {
diff --git a/src/browser/parser/Parser.zig b/src/browser/parser/Parser.zig
index f259bdd5..0c06cbcc 100644
--- a/src/browser/parser/Parser.zig
+++ b/src/browser/parser/Parser.zig
@@ -103,6 +103,32 @@ pub fn parse(self: *Parser, html: []const u8) void {
);
}
+/// Parse `html` as a full document, transcoding it from `charset` (an encoding label such as "gbk") to UTF-8 first; an unknown label is treated as UTF-8.
+pub fn parseWithEncoding(self: *Parser, html: []const u8, charset: []const u8) void {
+ h5e.html5ever_parse_document_with_encoding(
+ html.ptr,
+ html.len,
+ charset.ptr,
+ charset.len,
+ &self.container,
+ self,
+ createElementCallback,
+ getDataCallback,
+ appendCallback,
+ parseErrorCallback,
+ popCallback,
+ createCommentCallback,
+ createProcessingInstruction,
+ appendDoctypeToDocument,
+ addAttrsIfMissingCallback,
+ getTemplateContentsCallback,
+ removeFromParentCallback,
+ reparentChildrenCallback,
+ appendBeforeSiblingCallback,
+ appendBasedOnParentNodeCallback,
+ );
+}
+
pub fn parseXML(self: *Parser, xml: []const u8) void {
h5e.xml5ever_parse_document(
xml.ptr,
diff --git a/src/browser/parser/html5ever.zig b/src/browser/parser/html5ever.zig
index 8ee873e2..f6f81583 100644
--- a/src/browser/parser/html5ever.zig
+++ b/src/browser/parser/html5ever.zig
@@ -39,6 +39,30 @@ pub extern "c" fn html5ever_parse_document(
appendBasedOnParentNodeCallback: *const fn (ctx: *anyopaque, element_ref: *anyopaque, prev_element_ref: *anyopaque, NodeOrText) callconv(.c) void,
) void;
+/// Rust-side entry point: decodes `html` (`len` bytes) from the encoding named by `charset` (`charset_len` bytes, not NUL-terminated) to UTF-8, then parses it as a document.
+pub extern "c" fn html5ever_parse_document_with_encoding(
+ html: [*c]const u8,
+ len: usize,
+ charset: [*c]const u8,
+ charset_len: usize,
+ doc: *anyopaque,
+ ctx: *anyopaque,
+ createElementCallback: *const fn (ctx: *anyopaque, data: *anyopaque, QualName, AttributeIterator) callconv(.c) ?*anyopaque,
+ elemNameCallback: *const fn (node_ref: *anyopaque) callconv(.c) *anyopaque,
+ appendCallback: *const fn (ctx: *anyopaque, parent_ref: *anyopaque, NodeOrText) callconv(.c) void,
+ parseErrorCallback: *const fn (ctx: *anyopaque, StringSlice) callconv(.c) void,
+ popCallback: *const fn (ctx: *anyopaque, node_ref: *anyopaque) callconv(.c) void,
+ createCommentCallback: *const fn (ctx: *anyopaque, StringSlice) callconv(.c) ?*anyopaque,
+ createProcessingInstruction: *const fn (ctx: *anyopaque, StringSlice, StringSlice) callconv(.c) ?*anyopaque,
+ appendDoctypeToDocument: *const fn (ctx: *anyopaque, StringSlice, StringSlice, StringSlice) callconv(.c) void,
+ addAttrsIfMissingCallback: *const fn (ctx: *anyopaque, target_ref: *anyopaque, AttributeIterator) callconv(.c) void,
+ getTemplateContentsCallback: *const fn (ctx: *anyopaque, target_ref: *anyopaque) callconv(.c) ?*anyopaque,
+ removeFromParentCallback: *const fn (ctx: *anyopaque, target_ref: *anyopaque) callconv(.c) void,
+ reparentChildrenCallback: *const fn (ctx: *anyopaque, node_ref: *anyopaque, new_parent_ref: *anyopaque) callconv(.c) void,
+ appendBeforeSiblingCallback: *const fn (ctx: *anyopaque, sibling_ref: *anyopaque, NodeOrText) callconv(.c) void,
+ appendBasedOnParentNodeCallback: *const fn (ctx: *anyopaque, element_ref: *anyopaque, prev_element_ref: *anyopaque, NodeOrText) callconv(.c) void,
+) void;
+
pub extern "c" fn html5ever_parse_fragment(
html: [*c]const u8,
len: usize,
diff --git a/src/browser/tests/page/encoding.html b/src/browser/tests/page/encoding.html
new file mode 100644
index 00000000..af532b82
--- /dev/null
+++ b/src/browser/tests/page/encoding.html
@@ -0,0 +1,75 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/src/browser/tests/page/encoding/content_type.GB2312.html b/src/browser/tests/page/encoding/content_type.GB2312.html
new file mode 100644
index 00000000..818566a8
--- /dev/null
+++ b/src/browser/tests/page/encoding/content_type.GB2312.html
@@ -0,0 +1,4 @@
+
+
+
+
diff --git a/src/browser/tests/page/encoding/gbk.html b/src/browser/tests/page/encoding/gbk.html
new file mode 100644
index 00000000..68a6106a
--- /dev/null
+++ b/src/browser/tests/page/encoding/gbk.html
@@ -0,0 +1,4 @@
+
+
+
+
diff --git a/src/browser/tests/page/encoding/latin1.html b/src/browser/tests/page/encoding/latin1.html
new file mode 100644
index 00000000..be7d46ee
--- /dev/null
+++ b/src/browser/tests/page/encoding/latin1.html
@@ -0,0 +1,4 @@
+
+
+Caf
+
diff --git a/src/browser/tests/page/encoding/no_charset.html b/src/browser/tests/page/encoding/no_charset.html
new file mode 100644
index 00000000..818566a8
--- /dev/null
+++ b/src/browser/tests/page/encoding/no_charset.html
@@ -0,0 +1,4 @@
+
+
+
+
diff --git a/src/browser/tests/page/encoding/shift_jis.html b/src/browser/tests/page/encoding/shift_jis.html
new file mode 100644
index 00000000..2984bb6c
--- /dev/null
+++ b/src/browser/tests/page/encoding/shift_jis.html
@@ -0,0 +1,4 @@
+
+
+{
+
diff --git a/src/html5ever/Cargo.lock b/src/html5ever/Cargo.lock
index d94a7fd7..5faadb8e 100644
--- a/src/html5ever/Cargo.lock
+++ b/src/html5ever/Cargo.lock
@@ -30,6 +30,15 @@ version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
+[[package]]
+name = "encoding_rs"
+version = "0.8.35"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3"
+dependencies = [
+ "cfg-if",
+]
+
[[package]]
name = "find-msvc-tools"
version = "0.1.2"
@@ -67,6 +76,7 @@ checksum = "d750af042f7ef4f724306de029d18836c26c1765a54a6a3f094cbd23a7267ffa"
name = "litefetch-html5ever"
version = "0.1.0"
dependencies = [
+ "encoding_rs",
"html5ever",
"string_cache 0.9.0",
"tikv-jemalloc-ctl",
diff --git a/src/html5ever/Cargo.toml b/src/html5ever/Cargo.toml
index b4004404..b82ac73b 100644
--- a/src/html5ever/Cargo.toml
+++ b/src/html5ever/Cargo.toml
@@ -15,6 +15,7 @@ typed-arena = "2.0.2"
tikv-jemallocator = {version = "0.6.0", features = ["stats"]}
tikv-jemalloc-ctl = {version = "0.6.0", features = ["stats"]}
xml5ever = "0.35.0"
+encoding_rs = "0.8"
[profile.release]
lto = true
diff --git a/src/html5ever/lib.rs b/src/html5ever/lib.rs
index 29d62539..6fab9763 100644
--- a/src/html5ever/lib.rs
+++ b/src/html5ever/lib.rs
@@ -27,6 +27,7 @@ use std::cell::Cell;
use std::os::raw::{c_uchar, c_void};
use types::*;
+use encoding_rs::Encoding;
use html5ever::interface::tree_builder::QuirksMode;
use html5ever::tendril::{StrTendril, TendrilSink};
use html5ever::{ns, parse_document, parse_fragment, LocalName, ParseOpts, Parser, QualName};
@@ -85,6 +86,71 @@ pub extern "C" fn html5ever_parse_document(
.one(bytes);
}
+/// Parse an HTML document, transcoding it from `charset` to UTF-8 before parsing.
+/// `charset` is an encoding label (not NUL-terminated); a null, empty or unknown label falls back to UTF-8.
+/// `html` must point to `len` readable bytes; a null `html` or `len == 0` returns without parsing.
+#[no_mangle]
+pub extern "C" fn html5ever_parse_document_with_encoding(
+    html: *mut c_uchar,
+    len: usize,
+    charset: *const c_uchar,
+    charset_len: usize,
+    document: Ref,
+    ctx: Ref,
+    create_element_callback: CreateElementCallback,
+    get_data_callback: GetDataCallback,
+    append_callback: AppendCallback,
+    parse_error_callback: ParseErrorCallback,
+    pop_callback: PopCallback,
+    create_comment_callback: CreateCommentCallback,
+    create_processing_instruction: CreateProcessingInstruction,
+    append_doctype_to_document: AppendDoctypeToDocumentCallback,
+    add_attrs_if_missing_callback: AddAttrsIfMissingCallback,
+    get_template_contents_callback: GetTemplateContentsCallback,
+    remove_from_parent_callback: RemoveFromParentCallback,
+    reparent_children_callback: ReparentChildrenCallback,
+    append_before_sibling_callback: AppendBeforeSiblingCallback,
+    append_based_on_parent_node_callback: AppendBasedOnParentNodeCallback,
+) {
+    if html.is_null() || len == 0 {
+        return;
+    }
+    // SAFETY: both pointers are null-checked before use; the caller guarantees the lengths are readable.
+    let input = unsafe { std::slice::from_raw_parts(html, len) };
+    let charset_bytes: &[u8] = if charset.is_null() {
+        &[] // `from_raw_parts` is UB on a null pointer, even for a zero length
+    } else {
+        unsafe { std::slice::from_raw_parts(charset, charset_len) }
+    };
+    let encoding = Encoding::for_label(charset_bytes).unwrap_or(encoding_rs::UTF_8);
+    let (decoded, _, _) = encoding.decode(input);
+    let arena = typed_arena::Arena::new();
+    let sink = sink::Sink {
+        ctx,
+        arena: &arena,
+        document,
+        quirks_mode: Cell::new(QuirksMode::NoQuirks),
+        pop_callback,
+        append_callback,
+        get_data_callback,
+        parse_error_callback,
+        create_element_callback,
+        create_comment_callback,
+        create_processing_instruction,
+        append_doctype_to_document,
+        add_attrs_if_missing_callback,
+        get_template_contents_callback,
+        remove_from_parent_callback,
+        reparent_children_callback,
+        append_before_sibling_callback,
+        append_based_on_parent_node_callback,
+    };
+
+    // Hand the decoded UTF-8 text to the parser as a single chunk.
+    parse_document(sink, Default::default())
+        .one(StrTendril::from(decoded.as_ref()));
+}
+
#[no_mangle]
pub extern "C" fn html5ever_parse_fragment(
html: *mut c_uchar,
diff --git a/src/network/cache/FsCache.zig b/src/network/cache/FsCache.zig
index 3d67a945..b719ce23 100644
--- a/src/network/cache/FsCache.zig
+++ b/src/network/cache/FsCache.zig
@@ -476,6 +476,10 @@ test "FsCache: put override" {
}
test "FsCache: garbage file" {
+ const LogFilter = @import("../../testing.zig").LogFilter;
+ const filter: LogFilter = .init(&.{.cache});
+ defer filter.deinit();
+
var setup = try setupCache();
defer {
setup.cache.deinit();