From 36b4c83b39a4d91aba88b1511e87f7a9818991cf Mon Sep 17 00:00:00 2001 From: Zoltan Kochan Date: Thu, 7 May 2026 08:14:24 +0200 Subject: [PATCH] fix: skip Content-Length check when response is content-encoded (#11508) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix: skip Content-Length check when response is content-encoded Per the HTTP spec, Content-Length refers to the encoded payload when Content-Encoding is set, but undici fetch yields decoded bytes — so the strict size check rejected legitimate downloads with ERR_PNPM_BAD_TARBALL_SIZE. Closes #11506. * fix(tarball-fetcher): match v10's no-compression behavior Verified the user's report against v10 source: v10 worked because it called node-fetch with `compress: false` (network/fetch/src/fetchFromRegistry.ts on the v10 branch), which suppressed Accept-Encoding and prevented auto-decompression. v11's switch to undici fetch lost that opt-out — undici sends Accept-Encoding: gzip, deflate, br by default and transparently decodes the body, while keeping Content-Length pointed at the encoded payload (confirmed empirically). The strict size check then rejects the download. Restore v10's effective behavior by sending Accept-Encoding: identity for tarball requests, and as defense in depth against misbehaving servers that stamp Content-Encoding regardless, skip the strict size check when the response declares a non-identity Content-Encoding. * fix(tarball-fetcher): parse Content-Encoding as a coding list Per RFC 9110 §8.4 the header is a comma-separated, case-insensitive list that may include whitespace and mixed codings (e.g. `gzip, identity`). The previous string-equality check misclassified those — the response is now treated as encoded iff any coding is non-`identity`. --- .changeset/tarball-content-encoding.md | 6 ++ .../src/remoteTarballFetcher.ts | 21 ++++- fetching/tarball-fetcher/test/fetch.ts | 94 +++++++++++++++++++ 3 files changed, 120 insertions(+), 1 deletion(-) create mode 100644 .changeset/tarball-content-encoding.md diff --git a/.changeset/tarball-content-encoding.md b/.changeset/tarball-content-encoding.md new file mode 100644 index 0000000000..fb1d9425ba --- /dev/null +++ b/.changeset/tarball-content-encoding.md @@ -0,0 +1,6 @@ +--- +"@pnpm/fetching.tarball-fetcher": patch +"pnpm": patch +--- + +Fixed `ERR_PNPM_BAD_TARBALL_SIZE` when a registry serves tarballs with an end-to-end `Content-Encoding` (e.g. `gzip`). Tarballs are already compressed, so the fetcher now requests them with `Accept-Encoding: identity` (matching pnpm v10's effective behavior) and, as defense in depth against misbehaving servers, no longer enforces the strict `Content-Length` check when the response declares a `Content-Encoding` — `Content-Length` in that case refers to the encoded payload, not the decoded bytes the fetch implementation yields [#11506](https://github.com/pnpm/pnpm/issues/11506). diff --git a/fetching/tarball-fetcher/src/remoteTarballFetcher.ts b/fetching/tarball-fetcher/src/remoteTarballFetcher.ts index 43a70d082d..085c7df0bc 100644 --- a/fetching/tarball-fetcher/src/remoteTarballFetcher.ts +++ b/fetching/tarball-fetcher/src/remoteTarballFetcher.ts @@ -120,6 +120,10 @@ export function createDownloader ( try { const res = await fetchFromRegistry(url, { authHeaderValue, + // Tarballs are already compressed; ask the server not to apply an additional + // Content-Encoding so Content-Length matches the body we receive and we don't + // waste CPU on round-trip re-compression. See https://github.com/pnpm/pnpm/issues/11506 + headers: { 'accept-encoding': 'identity' }, // The fetch library can retry requests on bad HTTP responses. // However, it is not enough to retry on bad HTTP responses only. // Requests should also be retried when the tarball's integrity check fails. @@ -133,7 +137,11 @@ export function createDownloader ( throw new FetchError({ url, authHeaderValue }, res) } - const contentLength = res.headers.has('content-length') && res.headers.get('content-length') + // When Content-Encoding is present, Content-Length refers to the encoded form + // of the data, not the decoded bytes that the fetch implementation yields. + // See: https://developer.mozilla.org/en-US/docs/Web/HTTP/Reference/Headers/Content-Encoding + const isEncoded = isContentEncoded(res.headers.get('content-encoding')) + const contentLength = !isEncoded && res.headers.has('content-length') && res.headers.get('content-length') const parsedLength = typeof contentLength === 'string' ? parseInt(contentLength, 10) : NaN const size = Number.isFinite(parsedLength) && parsedLength >= 0 ? parsedLength : null if (opts.onStart != null) { @@ -213,3 +221,14 @@ export function createDownloader ( } } } + +// Per RFC 9110 §8.4, Content-Encoding is a comma-separated list of codings. +// The response is encoded unless every coding is `identity` (case-insensitive, +// surrounding whitespace ignored). +function isContentEncoded (header: string | null): boolean { + if (header == null) return false + return header + .split(',') + .map(coding => coding.trim().toLowerCase()) + .some(coding => coding !== '' && coding !== 'identity') +} diff --git a/fetching/tarball-fetcher/test/fetch.ts b/fetching/tarball-fetcher/test/fetch.ts index 2aea9598c8..065db6dc56 100644 --- a/fetching/tarball-fetcher/test/fetch.ts +++ b/fetching/tarball-fetcher/test/fetch.ts @@ -116,6 +116,100 @@ test('fail when tarball size does not match content-length', async () => { ) }) +test('request tarballs with Accept-Encoding: identity', async () => { + // Tarballs are already gzipped. Asking the server for an additional Content-Encoding + // wastes CPU and (when servers misbehave per RFC 7230 §3.3.2) breaks the size check + // because Content-Length refers to the encoded payload but undici yields decoded bytes. + // See: https://github.com/pnpm/pnpm/issues/11506 + const tarballContent = fs.readFileSync(tarballPath) + const mockPool = mockAgent.get(registry) + + let observedAcceptEncoding: string | string[] | undefined + mockPool.intercept({ path: '/foo.tgz', method: 'GET' }).reply(({ headers }) => { + observedAcceptEncoding = (headers as Record)['accept-encoding'] + return { statusCode: 200, data: tarballContent } + }) + + process.chdir(temporaryDirectory()) + + const resolution = { tarball: `${registry}/foo.tgz` } + + await fetch.remoteTarball(cafs, resolution, { + filesIndexFile, + lockfileDir: process.cwd(), + pkg, + }) + + expect(observedAcceptEncoding).toBe('identity') +}) + +test.each([ + ['gzip'], + ['GZIP'], + ['identity, gzip'], + ['gzip, identity'], + [' gzip '], +])("don't fail on content-length mismatch when Content-Encoding is %j", async (contentEncoding) => { + // Defense in depth: a misbehaving server may stamp Content-Encoding on the response + // even when the client sent Accept-Encoding: identity. In that case undici decodes + // the body and Content-Length (encoded form) no longer matches the bytes we read. + // Per RFC 9110 §8.4, Content-Encoding is a comma-separated, case-insensitive list, + // so the parser must handle whitespace, casing, and mixed codings. + const tarballContent = fs.readFileSync(tarballPath) + const mockPool = mockAgent.get(registry) + + mockPool.intercept({ path: '/foo.tgz', method: 'GET' }).reply(200, tarballContent, { + headers: { + 'Content-Length': (tarballSize + 100).toString(), + 'Content-Encoding': contentEncoding, + }, + }) + + process.chdir(temporaryDirectory()) + + const resolution = { tarball: `${registry}/foo.tgz` } + + const result = await fetch.remoteTarball(cafs, resolution, { + filesIndexFile, + lockfileDir: process.cwd(), + pkg, + }) + + expect(result.filesMap).toBeTruthy() +}) + +test('still enforce content-length when Content-Encoding is identity', async () => { + // identity (alone, with whitespace, or with mixed casing) means "no transformation"; + // Content-Length still reflects the bytes on the wire and the strict check applies. + const tarballContent = fs.readFileSync(tarballPath) + const mockPool = mockAgent.get(registry) + + mockPool.intercept({ path: '/foo.tgz', method: 'GET' }).reply(200, tarballContent, { + headers: { + 'Content-Length': (1024 * 1024).toString(), + 'Content-Encoding': ' Identity ', + }, + }) + mockPool.intercept({ path: '/foo.tgz', method: 'GET' }).reply(200, tarballContent, { + headers: { + 'Content-Length': (1024 * 1024).toString(), + 'Content-Encoding': ' Identity ', + }, + }) + + process.chdir(temporaryDirectory()) + + const resolution = { tarball: `${registry}/foo.tgz` } + + await expect( + fetch.remoteTarball(cafs, resolution, { + filesIndexFile, + lockfileDir: process.cwd(), + pkg, + }) + ).rejects.toThrow(BadTarballError) +}) + test('retry when tarball size does not match content-length', async () => { const tarballContent = fs.readFileSync(tarballPath) const mockPool = mockAgent.get(registry)