fix: skip Content-Length check when response is content-encoded (#11508)

* fix: skip Content-Length check when response is content-encoded

Per the HTTP spec, Content-Length refers to the encoded payload when
Content-Encoding is set, but undici fetch yields decoded bytes — so the
strict size check rejected legitimate downloads with
ERR_PNPM_BAD_TARBALL_SIZE. Closes #11506.

* fix(tarball-fetcher): match v10's no-compression behavior

Verified the user's report against v10 source: v10 worked because it
called node-fetch with `compress: false` (network/fetch/src/fetchFromRegistry.ts
on the v10 branch), which suppressed Accept-Encoding and prevented
auto-decompression. v11's switch to undici fetch lost that opt-out — undici
sends Accept-Encoding: gzip, deflate, br by default and transparently
decodes the body, while keeping Content-Length pointed at the encoded
payload (confirmed empirically). The strict size check then rejects the
download.

Restore v10's effective behavior by sending Accept-Encoding: identity for
tarball requests, and as defense in depth against misbehaving servers
that stamp Content-Encoding regardless, skip the strict size check when
the response declares a non-identity Content-Encoding.

* fix(tarball-fetcher): parse Content-Encoding as a coding list

Per RFC 9110 §8.4 the header is a comma-separated, case-insensitive list
that may include whitespace and mixed codings (e.g. `gzip, identity`).
The previous string-equality check misclassified those — the response is
now treated as encoded iff any coding is non-`identity`.
This commit is contained in:
Zoltan Kochan
2026-05-07 08:14:24 +02:00
parent 165eb57b49
commit 36b4c83b39
3 changed files with 120 additions and 1 deletions

View File

@@ -0,0 +1,6 @@
---
"@pnpm/fetching.tarball-fetcher": patch
"pnpm": patch
---
Fixed `ERR_PNPM_BAD_TARBALL_SIZE` when a registry serves tarballs with an end-to-end `Content-Encoding` (e.g. `gzip`). Tarballs are already compressed, so the fetcher now requests them with `Accept-Encoding: identity` (matching pnpm v10's effective behavior) and, as defense in depth against misbehaving servers, no longer enforces the strict `Content-Length` check when the response declares a `Content-Encoding``Content-Length` in that case refers to the encoded payload, not the decoded bytes the fetch implementation yields [#11506](https://github.com/pnpm/pnpm/issues/11506).

View File

@@ -120,6 +120,10 @@ export function createDownloader (
try {
const res = await fetchFromRegistry(url, {
authHeaderValue,
// Tarballs are already compressed; ask the server not to apply an additional
// Content-Encoding so Content-Length matches the body we receive and we don't
// waste CPU on round-trip re-compression. See https://github.com/pnpm/pnpm/issues/11506
headers: { 'accept-encoding': 'identity' },
// The fetch library can retry requests on bad HTTP responses.
// However, it is not enough to retry on bad HTTP responses only.
// Requests should also be retried when the tarball's integrity check fails.
@@ -133,7 +137,11 @@ export function createDownloader (
throw new FetchError({ url, authHeaderValue }, res)
}
const contentLength = res.headers.has('content-length') && res.headers.get('content-length')
// When Content-Encoding is present, Content-Length refers to the encoded form
// of the data, not the decoded bytes that the fetch implementation yields.
// See: https://developer.mozilla.org/en-US/docs/Web/HTTP/Reference/Headers/Content-Encoding
const isEncoded = isContentEncoded(res.headers.get('content-encoding'))
const contentLength = !isEncoded && res.headers.has('content-length') && res.headers.get('content-length')
const parsedLength = typeof contentLength === 'string' ? parseInt(contentLength, 10) : NaN
const size = Number.isFinite(parsedLength) && parsedLength >= 0 ? parsedLength : null
if (opts.onStart != null) {
@@ -213,3 +221,14 @@ export function createDownloader (
}
}
}
// Per RFC 9110 §8.4, Content-Encoding is a comma-separated list of codings.
// The response is encoded unless every coding is `identity` (case-insensitive,
// surrounding whitespace ignored).
function isContentEncoded (header: string | null): boolean {
if (header == null) return false
return header
.split(',')
.map(coding => coding.trim().toLowerCase())
.some(coding => coding !== '' && coding !== 'identity')
}

View File

@@ -116,6 +116,100 @@ test('fail when tarball size does not match content-length', async () => {
)
})
test('request tarballs with Accept-Encoding: identity', async () => {
// Tarballs are already gzipped. Asking the server for an additional Content-Encoding
// wastes CPU and (when servers misbehave per RFC 7230 §3.3.2) breaks the size check
// because Content-Length refers to the encoded payload but undici yields decoded bytes.
// See: https://github.com/pnpm/pnpm/issues/11506
const tarballContent = fs.readFileSync(tarballPath)
const mockPool = mockAgent.get(registry)
let observedAcceptEncoding: string | string[] | undefined
mockPool.intercept({ path: '/foo.tgz', method: 'GET' }).reply(({ headers }) => {
observedAcceptEncoding = (headers as Record<string, string | string[]>)['accept-encoding']
return { statusCode: 200, data: tarballContent }
})
process.chdir(temporaryDirectory())
const resolution = { tarball: `${registry}/foo.tgz` }
await fetch.remoteTarball(cafs, resolution, {
filesIndexFile,
lockfileDir: process.cwd(),
pkg,
})
expect(observedAcceptEncoding).toBe('identity')
})
test.each([
['gzip'],
['GZIP'],
['identity, gzip'],
['gzip, identity'],
[' gzip '],
])("don't fail on content-length mismatch when Content-Encoding is %j", async (contentEncoding) => {
// Defense in depth: a misbehaving server may stamp Content-Encoding on the response
// even when the client sent Accept-Encoding: identity. In that case undici decodes
// the body and Content-Length (encoded form) no longer matches the bytes we read.
// Per RFC 9110 §8.4, Content-Encoding is a comma-separated, case-insensitive list,
// so the parser must handle whitespace, casing, and mixed codings.
const tarballContent = fs.readFileSync(tarballPath)
const mockPool = mockAgent.get(registry)
mockPool.intercept({ path: '/foo.tgz', method: 'GET' }).reply(200, tarballContent, {
headers: {
'Content-Length': (tarballSize + 100).toString(),
'Content-Encoding': contentEncoding,
},
})
process.chdir(temporaryDirectory())
const resolution = { tarball: `${registry}/foo.tgz` }
const result = await fetch.remoteTarball(cafs, resolution, {
filesIndexFile,
lockfileDir: process.cwd(),
pkg,
})
expect(result.filesMap).toBeTruthy()
})
test('still enforce content-length when Content-Encoding is identity', async () => {
// identity (alone, with whitespace, or with mixed casing) means "no transformation";
// Content-Length still reflects the bytes on the wire and the strict check applies.
const tarballContent = fs.readFileSync(tarballPath)
const mockPool = mockAgent.get(registry)
mockPool.intercept({ path: '/foo.tgz', method: 'GET' }).reply(200, tarballContent, {
headers: {
'Content-Length': (1024 * 1024).toString(),
'Content-Encoding': ' Identity ',
},
})
mockPool.intercept({ path: '/foo.tgz', method: 'GET' }).reply(200, tarballContent, {
headers: {
'Content-Length': (1024 * 1024).toString(),
'Content-Encoding': ' Identity ',
},
})
process.chdir(temporaryDirectory())
const resolution = { tarball: `${registry}/foo.tgz` }
await expect(
fetch.remoteTarball(cafs, resolution, {
filesIndexFile,
lockfileDir: process.cwd(),
pkg,
})
).rejects.toThrow(BadTarballError)
})
test('retry when tarball size does not match content-length', async () => {
const tarballContent = fs.readFileSync(tarballPath)
const mockPool = mockAgent.get(registry)