fix: skip Content-Length check when response is content-encoded (#11508)

* fix: skip Content-Length check when response is content-encoded Per the HTTP spec, Content-Length refers to the encoded payload when Content-Encoding is set, but undici fetch yields decoded bytes — so the strict size check rejected legitimate downloads with ERR_PNPM_BAD_TARBALL_SIZE. Closes #11506. * fix(tarball-fetcher): match v10's no-compression behavior Verified the user's report against v10 source: v10 worked because it called node-fetch with `compress: false` (network/fetch/src/fetchFromRegistry.ts on the v10 branch), which suppressed Accept-Encoding and prevented auto-decompression. v11's switch to undici fetch lost that opt-out — undici sends Accept-Encoding: gzip, deflate, br by default and transparently decodes the body, while keeping Content-Length pointed at the encoded payload (confirmed empirically). The strict size check then rejects the download. Restore v10's effective behavior by sending Accept-Encoding: identity for tarball requests, and as defense in depth against misbehaving servers that stamp Content-Encoding regardless, skip the strict size check when the response declares a non-identity Content-Encoding. * fix(tarball-fetcher): parse Content-Encoding as a coding list Per RFC 9110 §8.4 the header is a comma-separated, case-insensitive list that may include whitespace and mixed codings (e.g. `gzip, identity`). The previous string-equality check misclassified those — the response is now treated as encoded iff any coding is non-`identity`.
2026-05-19 06:07:59 -04:00 · 2026-05-07 08:14:24 +02:00
parent 165eb57b49
commit 36b4c83b39
3 changed files with 120 additions and 1 deletions
--- a/.changeset/tarball-content-encoding.md
+++ b/.changeset/tarball-content-encoding.md
@@ -0,0 +1,6 @@
+---
+"@pnpm/fetching.tarball-fetcher": patch
+"pnpm": patch
+---
+
+Fixed `ERR_PNPM_BAD_TARBALL_SIZE` when a registry serves tarballs with an end-to-end `Content-Encoding` (e.g. `gzip`). Tarballs are already compressed, so the fetcher now requests them with `Accept-Encoding: identity` (matching pnpm v10's effective behavior) and, as defense in depth against misbehaving servers, no longer enforces the strict `Content-Length` check when the response declares a `Content-Encoding` — `Content-Length` in that case refers to the encoded payload, not the decoded bytes the fetch implementation yields [#11506](https://github.com/pnpm/pnpm/issues/11506).
--- a/fetching/tarball-fetcher/src/remoteTarballFetcher.ts
+++ b/fetching/tarball-fetcher/src/remoteTarballFetcher.ts
@@ -120,6 +120,10 @@ export function createDownloader (
      try {
        const res = await fetchFromRegistry(url, {
          authHeaderValue,
+          // Tarballs are already compressed; ask the server not to apply an additional
+          // Content-Encoding so Content-Length matches the body we receive and we don't
+          // waste CPU on round-trip re-compression. See https://github.com/pnpm/pnpm/issues/11506
+          headers: { 'accept-encoding': 'identity' },
          // The fetch library can retry requests on bad HTTP responses.
          // However, it is not enough to retry on bad HTTP responses only.
          // Requests should also be retried when the tarball's integrity check fails.
@@ -133,7 +137,11 @@ export function createDownloader (
          throw new FetchError({ url, authHeaderValue }, res)
        }

-        const contentLength = res.headers.has('content-length') && res.headers.get('content-length')
+        // When Content-Encoding is present, Content-Length refers to the encoded form
+        // of the data, not the decoded bytes that the fetch implementation yields.
+        // See: https://developer.mozilla.org/en-US/docs/Web/HTTP/Reference/Headers/Content-Encoding
+        const isEncoded = isContentEncoded(res.headers.get('content-encoding'))
+        const contentLength = !isEncoded && res.headers.has('content-length') && res.headers.get('content-length')
        const parsedLength = typeof contentLength === 'string' ? parseInt(contentLength, 10) : NaN
        const size = Number.isFinite(parsedLength) && parsedLength >= 0 ? parsedLength : null
        if (opts.onStart != null) {
@@ -213,3 +221,14 @@ export function createDownloader (
    }
  }
 }
+
+// Per RFC 9110 §8.4, Content-Encoding is a comma-separated list of codings.
+// The response is encoded unless every coding is `identity` (case-insensitive,
+// surrounding whitespace ignored).
+function isContentEncoded (header: string | null): boolean {
+  if (header == null) return false
+  return header
+    .split(',')
+    .map(coding => coding.trim().toLowerCase())
+    .some(coding => coding !== '' && coding !== 'identity')
+}
--- a/fetching/tarball-fetcher/test/fetch.ts
+++ b/fetching/tarball-fetcher/test/fetch.ts
@@ -116,6 +116,100 @@ test('fail when tarball size does not match content-length', async () => {
  )
 })

+test('request tarballs with Accept-Encoding: identity', async () => {
+  // Tarballs are already gzipped. Asking the server for an additional Content-Encoding
+  // wastes CPU and (when servers misbehave per RFC 7230 §3.3.2) breaks the size check
+  // because Content-Length refers to the encoded payload but undici yields decoded bytes.
+  // See: https://github.com/pnpm/pnpm/issues/11506
+  const tarballContent = fs.readFileSync(tarballPath)
+  const mockPool = mockAgent.get(registry)
+
+  let observedAcceptEncoding: string | string[] | undefined
+  mockPool.intercept({ path: '/foo.tgz', method: 'GET' }).reply(({ headers }) => {
+    observedAcceptEncoding = (headers as Record<string, string | string[]>)['accept-encoding']
+    return { statusCode: 200, data: tarballContent }
+  })
+
+  process.chdir(temporaryDirectory())
+
+  const resolution = { tarball: `${registry}/foo.tgz` }
+
+  await fetch.remoteTarball(cafs, resolution, {
+    filesIndexFile,
+    lockfileDir: process.cwd(),
+    pkg,
+  })
+
+  expect(observedAcceptEncoding).toBe('identity')
+})
+
+test.each([
+  ['gzip'],
+  ['GZIP'],
+  ['identity, gzip'],
+  ['gzip, identity'],
+  [' gzip '],
+])("don't fail on content-length mismatch when Content-Encoding is %j", async (contentEncoding) => {
+  // Defense in depth: a misbehaving server may stamp Content-Encoding on the response
+  // even when the client sent Accept-Encoding: identity. In that case undici decodes
+  // the body and Content-Length (encoded form) no longer matches the bytes we read.
+  // Per RFC 9110 §8.4, Content-Encoding is a comma-separated, case-insensitive list,
+  // so the parser must handle whitespace, casing, and mixed codings.
+  const tarballContent = fs.readFileSync(tarballPath)
+  const mockPool = mockAgent.get(registry)
+
+  mockPool.intercept({ path: '/foo.tgz', method: 'GET' }).reply(200, tarballContent, {
+    headers: {
+      'Content-Length': (tarballSize + 100).toString(),
+      'Content-Encoding': contentEncoding,
+    },
+  })
+
+  process.chdir(temporaryDirectory())
+
+  const resolution = { tarball: `${registry}/foo.tgz` }
+
+  const result = await fetch.remoteTarball(cafs, resolution, {
+    filesIndexFile,
+    lockfileDir: process.cwd(),
+    pkg,
+  })
+
+  expect(result.filesMap).toBeTruthy()
+})
+
+test('still enforce content-length when Content-Encoding is identity', async () => {
+  // identity (alone, with whitespace, or with mixed casing) means "no transformation";
+  // Content-Length still reflects the bytes on the wire and the strict check applies.
+  const tarballContent = fs.readFileSync(tarballPath)
+  const mockPool = mockAgent.get(registry)
+
+  mockPool.intercept({ path: '/foo.tgz', method: 'GET' }).reply(200, tarballContent, {
+    headers: {
+      'Content-Length': (1024 * 1024).toString(),
+      'Content-Encoding': ' Identity ',
+    },
+  })
+  mockPool.intercept({ path: '/foo.tgz', method: 'GET' }).reply(200, tarballContent, {
+    headers: {
+      'Content-Length': (1024 * 1024).toString(),
+      'Content-Encoding': ' Identity ',
+    },
+  })
+
+  process.chdir(temporaryDirectory())
+
+  const resolution = { tarball: `${registry}/foo.tgz` }
+
+  await expect(
+    fetch.remoteTarball(cafs, resolution, {
+      filesIndexFile,
+      lockfileDir: process.cwd(),
+      pkg,
+    })
+  ).rejects.toThrow(BadTarballError)
+})
+
 test('retry when tarball size does not match content-length', async () => {
  const tarballContent = fs.readFileSync(tarballPath)
  const mockPool = mockAgent.get(registry)