fix(audit): compute reachable vulnerabilities with Tarjan SCC (#12467)

`pnpm audit` enumerates the install paths to every vulnerable package. The reachability-based pruning added in 11.5.1 (pnpm/pnpm#12087) lets the walker skip subtrees that reach no unsaturated finding by precomputing, per node, the set of vulnerabilities reachable from it. That getter only memoised acyclic subtrees: a node whose subtree contained a cycle was `complete === false`, and so was every ancestor up to the importer roots. None of them were cached, so their reachable set was recomputed on every query. Real dependency graphs commonly contain cycles, and a single cycle high in the graph makes a large fraction of nodes non-memoisable, yielding an O(N^2) walk. This matched the report in pnpm/pnpm#12212 exactly (CPU-bound, identical audit output across versions). Reachability is now computed with Tarjan's strongly-connected-components algorithm. Every node is scanned once; all members of an SCC reach the same set of vulnerabilities and share one set, finalised in reverse-topological order. Cyclic graphs are handled in O(N + E). The reachable set is used only to prune, so it must never under-approximate (that would hide a real finding). Tarjan yields the exact set for every node, so no finding can be dropped, and the path-recording logic is unchanged. The getter returns ReadonlySet<string> so the shared sets cannot be mutated by callers, and a missing memo entry (an impossible-by-construction state) throws rather than silently returning an empty set. A regression test asserts the read-count growth ratio between two cycle sizes (L=200 and L=400) is sub-quadratic: the fix scales ~2x (linear), the previous code ~4x (quadratic). Asserting the ratio cancels the per-node constant, so the test is not brittle to constant-factor changes. Closes pnpm/pnpm#12212. --------- Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com> Co-authored-by: Zoltan Kochan <z@kochan.io>
2026-06-27 17:35:30 -04:00 · 2026-06-20 21:34:06 +08:00
parent 8c3a372048
commit d3f68e2aa4
4 changed files with 277 additions and 71 deletions
--- a/.changeset/fix-audit-cyclic-path-traversal-perf.md
+++ b/.changeset/fix-audit-cyclic-path-traversal-perf.md
@@ -0,0 +1,8 @@
+---
+"@pnpm/deps.compliance.audit": patch
+"pnpm": patch
+---
+
+Fix a `pnpm audit` performance regression on lockfiles that contain dependency cycles. The reachable-vulnerability pruning added in pnpm 11.5.1 only memoized acyclic subtrees, so any node whose subtree touched a cycle — together with all of its ancestors — was recomputed on every query, making the path walk quadratic. Reachability is now computed once per node using Tarjan's strongly-connected-components algorithm, so cyclic graphs are handled in linear time [#12212](https://github.com/pnpm/pnpm/issues/12212).
+
+The audit path walk also no longer recurses, so a deeply nested dependency graph can no longer overflow the call stack, and the install path to each finding is tracked without per-node copying, keeping memory linear in the graph depth.
--- a/cspell.json
+++ b/cspell.json
@@ -185,6 +185,7 @@
    "longlink",
    "longpaths",
    "loong",
+    "lowlink",
    "luca",
    "martensson",
    "maxtimeout",
@@ -359,6 +360,7 @@
    "stdtype",
    "streamsearch",
    "stringifying",
+    "strongconnect",
    "subcmd",
    "subdep",
    "subdependencies",
@@ -381,6 +383,7 @@
    "taffydb",
    "taiki",
    "tarballtemplate",
+    "Tarjan",
    "teambit",
    "tempy",
    "testcase",
--- a/pnpm11/deps/compliance/audit/src/lockfileToAuditIndex.ts
+++ b/pnpm11/deps/compliance/audit/src/lockfileToAuditIndex.ts
@@ -94,10 +94,20 @@ export function lockfileToAuditRequest (
  }

  // Build a visitor for one lockfile graph. The walker already de-duplicates
-  // by depPath internally, so we don't need a second visited set here.
+  // by depPath internally, so we don't need a second visited set here. An
+  // explicit frame stack stands in for recursion (registering each dependency
+  // before descending, preserving pre-order) so a deep dependency chain from an
+  // untrusted lockfile cannot overflow the call stack.
  const makeVisitor = (graphDepTypes: DepTypes, graphOptionalOnly: Set<DepPath>) => {
-    const visit = (step: LockfileWalkerStep): void => {
-      for (const { depPath, pkgSnapshot, next } of step.dependencies) {
+    return (rootStep: LockfileWalkerStep): void => {
+      const stack: Array<{ dependencies: LockfileWalkerStep['dependencies'], next: number }> = [{ dependencies: rootStep.dependencies, next: 0 }]
+      while (stack.length > 0) {
+        const frame = stack[stack.length - 1]
+        if (frame.next >= frame.dependencies.length) {
+          stack.pop()
+          continue
+        }
+        const { depPath, pkgSnapshot, next } = frame.dependencies[frame.next++]
        const { name, version } = nameVerFromPkgSnapshot(depPath, pkgSnapshot)
        if (version) {
          registerOccurrence({
@@ -107,10 +117,9 @@ export function lockfileToAuditRequest (
            optionalOnly: graphOptionalOnly.has(depPath),
          })
        }
-        visit(next())
+        stack.push({ dependencies: next().dependencies, next: 0 })
      }
    }
-    return visit
  }

  const visitMain = makeVisitor(depTypes, optionalOnly)
@@ -191,11 +200,21 @@ function walkForPaths (ctx: WalkForPathsCtx): void {
  const packages = lockfile.packages ?? {}
  const reachableVulnerabilities = createReachableVulnerabilitiesGetter(lockfile, vulnerableNames, includeOptDeps)

-  // Reused across every root to avoid per-node Set cloning. visit adds the
-  // current depPath before recursing and removes it on the way back, so the
-  // set always reflects the current trail.
+  // Tracks the depPaths on the current DFS trail so cycles terminate. A frame is
+  // added when its node is opened and removed when the frame is unwound, so the
+  // set always reflects the path from the root to the current node. Explicit
+  // stack rather than recursion: a lockfile is untrusted input, and a deep
+  // dependency chain would otherwise overflow the call stack and crash the audit.
  const inTrail = new Set<DepPath>()
-  const visit = (edge: { name: string, depPath: DepPath }, trail: string[]): void => {
+  // The trail is a parent-linked chain of names rather than a copied array per
+  // node: copying would cost O(depth) memory and time at every node and make a
+  // deep chain O(depth^2). The chain is materialized into a path string only
+  // when a vulnerable node is recorded.
+  const stack: Array<{ depPath: DepPath, trail: TrailNode, children: Array<{ name: string, depPath: DepPath }>, next: number }> = []
+
+  // Apply the per-node logic and, unless the node is pruned, push a frame so its
+  // children are visited. Records a path when the node is itself vulnerable.
+  const open = (edge: { name: string, depPath: DepPath }, parentTrail: TrailNode): void => {
    const reachable = reachableVulnerabilities(edge)
    if (reachable.size === 0 || allReachableVulnerabilitiesSaturated(paths, reachable, depTypes, optionalOnly)) return
    if (inTrail.has(edge.depPath)) return
@@ -203,92 +222,191 @@ function walkForPaths (ctx: WalkForPathsCtx): void {
    if (pkgSnapshot == null) return
    const { name, version } = nameVerFromPkgSnapshot(edge.depPath, pkgSnapshot)
    const resolvedName = name ?? edge.name
-    const fullPath = [...trail, resolvedName]
+    const trail: TrailNode = { name: resolvedName, parent: parentTrail }
    if (version && vulnerableNames.has(resolvedName)) {
-      recordPath(paths, resolvedName, version, fullPath.join('>'),
+      recordPath(paths, resolvedName, version, joinTrail(trail),
        depTypes[edge.depPath] === DepType.DevOnly,
        optionalOnly.has(edge.depPath))
    }
    if (allReachableVulnerabilitiesSaturated(paths, reachable, depTypes, optionalOnly)) return
-    inTrail.add(edge.depPath)
-    try {
-      for (const child of resolvedDepsToNamedDepPaths(pkgSnapshot.dependencies ?? {})) {
-        visit(child, fullPath)
-      }
-      if (includeOptDeps) {
-        for (const child of resolvedDepsToNamedDepPaths(pkgSnapshot.optionalDependencies ?? {})) {
-          visit(child, fullPath)
-        }
-      }
-    } finally {
-      inTrail.delete(edge.depPath)
+    const children: Array<{ name: string, depPath: DepPath }> = []
+    appendNamedDepPaths(children, pkgSnapshot.dependencies ?? {})
+    if (includeOptDeps) {
+      appendNamedDepPaths(children, pkgSnapshot.optionalDependencies ?? {})
    }
+    inTrail.add(edge.depPath)
+    stack.push({ depPath: edge.depPath, trail, children, next: 0 })
  }

  for (const [importerId, importer] of Object.entries(lockfile.importers)) {
-    const trail = [importerSegmentOf(importerId)]
+    const trail: TrailNode = { name: importerSegmentOf(importerId), parent: null }
    const roots: Array<{ name: string, depPath: DepPath }> = []
-    if (includeDeps) roots.push(...resolvedDepsToNamedDepPaths(importer.dependencies ?? {}))
-    if (includeDevDeps) roots.push(...resolvedDepsToNamedDepPaths(importer.devDependencies ?? {}))
-    if (includeOptDeps) roots.push(...resolvedDepsToNamedDepPaths(importer.optionalDependencies ?? {}))
+    if (includeDeps) appendNamedDepPaths(roots, importer.dependencies ?? {})
+    if (includeDevDeps) appendNamedDepPaths(roots, importer.devDependencies ?? {})
+    if (includeOptDeps) appendNamedDepPaths(roots, importer.optionalDependencies ?? {})
    for (const root of roots) {
-      visit(root, trail)
+      open(root, trail)
+      while (stack.length > 0) {
+        const frame = stack[stack.length - 1]
+        if (frame.next < frame.children.length) {
+          open(frame.children[frame.next++], frame.trail)
+        } else {
+          inTrail.delete(frame.depPath)
+          stack.pop()
+        }
+      }
    }
  }
 }

+// A node in the current DFS trail. Linking to the parent rather than copying the
+// whole path keeps per-node memory O(1); `joinTrail` walks the chain to the root
+// to produce the `a>b>c` string only when a path is actually recorded.
+interface TrailNode {
+  name: string
+  parent: TrailNode | null
+}
+
+function joinTrail (node: TrailNode): string {
+  const parts: string[] = []
+  let current: TrailNode | null = node
+  while (current != null) {
+    parts.push(current.name)
+    current = current.parent
+  }
+  parts.reverse()
+  return parts.join('>')
+}
+
+// For each node, the set of vulnerabilities reachable from it (itself included),
+// used by the walker to prune subtrees that reach no unsaturated finding.
+// Tarjan's SCC algorithm scans every node once and shares one set across a
+// cycle, avoiding the quadratic recompute of memoizing only acyclic subtrees.
 function createReachableVulnerabilitiesGetter (
  lockfile: LockfileObject,
  vulnerableNames: Set<string>,
  includeOptDeps: boolean
-): (edge: { name: string, depPath: DepPath }) => Set<string> {
+): (edge: { name: string, depPath: DepPath }) => ReadonlySet<string> {
  const packages = lockfile.packages ?? {}
-  // Only fully-resolved (acyclic) subtrees are memoized. A node whose reachable
-  // set still depends on a back-edge to an ancestor is incomplete until the
-  // enclosing cycle is left, so caching it would under-report reachable
-  // vulnerabilities and let the walker prune valid audit paths. Such nodes are
-  // recomputed on each top-level query, which is cheap because cycles in a
-  // lockfile are rare and small.
+  // Final reachable set per node, shared across its SCC.
  const memo = new Map<DepPath, Set<string>>()
+  // Per-node contribution, merged into the shared set when the SCC closes.
+  const partial = new Map<DepPath, Set<string>>()
+  const index = new Map<DepPath, number>()
+  const lowlink = new Map<DepPath, number>()
  const onStack = new Set<DepPath>()
-  const compute = (edge: { name: string, depPath: DepPath }): { result: Set<string>, complete: boolean } => {
-    const cached = memo.get(edge.depPath)
-    if (cached) return { result: cached, complete: true }
-    if (onStack.has(edge.depPath)) return { result: new Set(), complete: false }
-    const pkgSnapshot = packages[edge.depPath]
-    if (pkgSnapshot == null) return { result: new Set(), complete: true }
+  const sccStack: DepPath[] = []
+  let counter = 0

-    onStack.add(edge.depPath)
-    const result = new Set<string>()
-    let complete = true
-    const { name, version } = nameVerFromPkgSnapshot(edge.depPath, pkgSnapshot)
-    const resolvedName = name ?? edge.name
-    if (version && vulnerableNames.has(resolvedName)) {
-      result.add(vulnerabilityKey(resolvedName, version, edge.depPath))
+  // Iterative Tarjan: an explicit frame stack stands in for the recursive
+  // strongconnect, so a deep dependency chain from an untrusted lockfile cannot
+  // overflow the call stack. Each frame's `own` set is shared by reference with
+  // `partial`, so contributions merged in while the children run are visible
+  // when the SCC closes.
+  const buildScc = (rootEdge: { name: string, depPath: DepPath }): void => {
+    const work: Array<{ edge: { name: string, depPath: DepPath }, own: Set<string>, children: Array<{ name: string, depPath: DepPath }>, next: number }> = []
+
+    const pushFrame = (edge: { name: string, depPath: DepPath }): void => {
+      index.set(edge.depPath, counter)
+      lowlink.set(edge.depPath, counter)
+      counter++
+      sccStack.push(edge.depPath)
+      onStack.add(edge.depPath)
+
+      // Derive children from this single read rather than via a helper that would
+      // read the snapshot again.
+      const pkgSnapshot = packages[edge.depPath]
+      const own = new Set<string>()
+      const children: Array<{ name: string, depPath: DepPath }> = []
+      if (pkgSnapshot != null) {
+        const { name, version } = nameVerFromPkgSnapshot(edge.depPath, pkgSnapshot)
+        const resolvedName = name ?? edge.name
+        if (version && vulnerableNames.has(resolvedName)) {
+          own.add(vulnerabilityKey(resolvedName, version, edge.depPath))
+        }
+        appendNamedDepPaths(children, pkgSnapshot.dependencies ?? {})
+        if (includeOptDeps) {
+          appendNamedDepPaths(children, pkgSnapshot.optionalDependencies ?? {})
+        }
+      }
+      partial.set(edge.depPath, own)
+      work.push({ edge, own, children, next: 0 })
    }
-    const collect = (child: { name: string, depPath: DepPath }): void => {
-      const childResult = compute(child)
-      addAll(result, childResult.result)
-      if (!childResult.complete) complete = false
-    }
-    for (const child of resolvedDepsToNamedDepPaths(pkgSnapshot.dependencies ?? {})) {
-      collect(child)
-    }
-    if (includeOptDeps) {
-      for (const child of resolvedDepsToNamedDepPaths(pkgSnapshot.optionalDependencies ?? {})) {
-        collect(child)
+
+    pushFrame(rootEdge)
+    while (work.length > 0) {
+      const frame = work[work.length - 1]
+      if (frame.next < frame.children.length) {
+        const child = frame.children[frame.next++]
+        if (!index.has(child.depPath)) {
+          pushFrame(child)
+          continue
+        }
+        if (onStack.has(child.depPath)) {
+          lowlink.set(frame.edge.depPath, Math.min(lowlink.get(frame.edge.depPath)!, index.get(child.depPath)!))
+        }
+        // Finalized successors are already in `memo`; same-SCC ones are folded in
+        // when the SCC closes.
+        const childReachable = memo.get(child.depPath)
+        if (childReachable) addAll(frame.own, childReachable)
+        continue
+      }
+
+      const edge = frame.edge
+      if (lowlink.get(edge.depPath) === index.get(edge.depPath)) {
+        const members: DepPath[] = []
+        // Reuse the first member's own set as the shared accumulator instead of
+        // allocating a fresh one, so the common singleton-SCC case finalizes
+        // without any extra Set allocation or copy.
+        let shared: Set<string> | undefined
+        let member: DepPath
+        do {
+          member = sccStack.pop()!
+          onStack.delete(member)
+          members.push(member)
+          const own = partial.get(member)!
+          partial.delete(member)
+          if (shared === undefined) {
+            shared = own
+          } else {
+            addAll(shared, own)
+          }
+        } while (member !== edge.depPath)
+        for (const m of members) {
+          memo.set(m, shared!)
+        }
+      }
+
+      work.pop()
+      // Apply the post-DFS update to the parent: propagate this node's lowlink
+      // and fold in its reachable set once finalized (same-SCC nodes are folded
+      // later, via `partial`, when the shared SCC root closes).
+      const parent = work[work.length - 1]
+      if (parent != null) {
+        lowlink.set(parent.edge.depPath, Math.min(lowlink.get(parent.edge.depPath)!, lowlink.get(edge.depPath)!))
+        const childReachable = memo.get(edge.depPath)
+        if (childReachable) addAll(parent.own, childReachable)
      }
    }
-    onStack.delete(edge.depPath)
-    if (complete) memo.set(edge.depPath, result)
-    return { result, complete }
  }
-  return (edge) => compute(edge).result
+
+  return (edge) => {
+    if (!index.has(edge.depPath)) buildScc(edge)
+    // strongconnect always finalizes the queried node's SCC, so its reachable
+    // set is present afterwards. A missing entry would be a bug that silently
+    // under-reports and hides a real finding, so fail loudly instead of
+    // returning an empty set.
+    const reachable = memo.get(edge.depPath)
+    if (reachable == null) {
+      throw new Error(`Reachable vulnerabilities were not computed for ${edge.depPath}`)
+    }
+    return reachable
+  }
 }

 function allReachableVulnerabilitiesSaturated (
  paths: AuditPathIndex,
-  reachable: Set<string>,
+  reachable: ReadonlySet<string>,
  depTypes: DepTypes,
  optionalOnly: Set<DepPath>
 ): boolean {
@@ -344,13 +462,15 @@ function recordPath (paths: AuditPathIndex, name: string, version: string, joine
  info.paths.push(joined)
 }

-function resolvedDepsToNamedDepPaths (deps: ResolvedDependencies): Array<{ name: string, depPath: DepPath }> {
-  const result: Array<{ name: string, depPath: DepPath }> = []
+// Append rather than `target.push(...mapped(deps))`: a lockfile is untrusted
+// input, and spreading a pathologically large dependency list into push()
+// arguments can exceed the engine's argument limit and throw, crashing the
+// audit. Appending in a loop also avoids the intermediate array.
+function appendNamedDepPaths (target: Array<{ name: string, depPath: DepPath }>, deps: ResolvedDependencies): void {
  for (const [alias, ref] of Object.entries(deps)) {
    const depPath = dp.refToRelative(ref, alias)
-    if (depPath != null) result.push({ name: alias, depPath })
+    if (depPath != null) target.push({ name: alias, depPath })
  }
-  return result
 }

 // Returns the set of depPaths that are reachable only through optional edges
@@ -392,16 +512,22 @@ export function collectOptionalOnlyDepPaths (
  return result
 }

+// Explicit stack rather than recursion: a lockfile is untrusted input, and a
+// deep dependency chain would otherwise overflow the call stack. Order does not
+// matter — the result is the reachable set, so a LIFO walk is equivalent.
 function walkReachable (lockfile: LockfileObject, depPaths: DepPath[], seen: Set<DepPath>, includeOptionalEdges: boolean): void {
  const packages = lockfile.packages ?? {}
-  for (const depPath of depPaths) {
+  const stack: DepPath[] = []
+  for (const depPath of depPaths) stack.push(depPath)
+  while (stack.length > 0) {
+    const depPath = stack.pop()!
    if (seen.has(depPath)) continue
    seen.add(depPath)
    const snapshot = packages[depPath]
    if (!snapshot) continue
-    walkReachable(lockfile, resolvedDepsToDepPaths(snapshot.dependencies ?? {}), seen, includeOptionalEdges)
+    for (const child of resolvedDepsToDepPaths(snapshot.dependencies ?? {})) stack.push(child)
    if (includeOptionalEdges) {
-      walkReachable(lockfile, resolvedDepsToDepPaths(snapshot.optionalDependencies ?? {}), seen, includeOptionalEdges)
+      for (const child of resolvedDepsToDepPaths(snapshot.optionalDependencies ?? {})) stack.push(child)
    }
  }
 }
--- a/pnpm11/deps/compliance/audit/test/index.ts
+++ b/pnpm11/deps/compliance/audit/test/index.ts
@@ -308,6 +308,75 @@ describe('audit', () => {
    expect(result['x']!.get('1.0.0')!.paths.sort()).toEqual(['.>b>c>x', '.>c>x'])
  })

+  test('buildAuditPathIndex() scales linearly, not quadratically, with cycle size', () => {
+    // n0 -> ... -> n(L-1) -> n0 is one big cycle with a single vulnerable leaf.
+    // Recomputing the cycle for every ancestor scans ~L^2 nodes, so doubling L
+    // quadruples the reads; linear work only doubles them. Comparing the growth
+    // rate of two sizes — rather than an absolute count — keeps the assertion
+    // robust to constant-factor changes in future refactors.
+    const countReads = (L: number): number => {
+      let reads = 0
+      const importers = {
+        ['.' as ProjectId]: { dependencies: { n0: '1.0.0' }, specifiers: { n0: '^1.0.0' } },
+      }
+      const packages: PackageSnapshots = {}
+      for (let i = 0; i < L; i++) {
+        const nextName = i + 1 < L ? `n${i + 1}` : 'n0' // the last node loops back to n0
+        const deps = { [nextName]: '1.0.0', [`leaf${i}`]: '1.0.0' }
+        Object.defineProperty(packages, `n${i}@1.0.0`, {
+          enumerable: true,
+          get: () => {
+            reads++
+            return { dependencies: deps, resolution: { integrity: `n${i}-integrity` } }
+          },
+        })
+        packages[`leaf${i}@1.0.0` as DepPath] = { resolution: { integrity: `leaf${i}-integrity` } }
+      }
+
+      const result = buildAuditPathIndex({
+        importers,
+        lockfileVersion: LOCKFILE_VERSION,
+        packages,
+      }, new Set(['leaf0']), { depTypes: {}, optionalOnly: new Set() })
+
+      // Correctness: the only vulnerable leaf is still reported with its path.
+      expect(result['leaf0']!.get('1.0.0')).toEqual({
+        paths: ['.>n0>leaf0'],
+        dev: false,
+        optional: false,
+      })
+      return reads
+    }
+
+    // Linear ⇒ ratio ≈ 2; quadratic ⇒ ratio ≈ 4. Assert clearly sub-quadratic.
+    expect(countReads(400) / countReads(200)).toBeLessThan(3)
+  })
+
+  test('buildAuditPathIndex() handles a very deep dependency chain without overflowing the stack', () => {
+    // n0 -> n1 -> ... -> n(L-1) -> vuln is a single chain far deeper than the JS
+    // call-stack limit. A recursive walk (reachability or path traversal) would
+    // throw RangeError on this lockfile (a lockfile is untrusted input); the
+    // iterative implementation must complete and still report the leaf.
+    const L = 60_000
+    const packages: PackageSnapshots = {}
+    for (let i = 0; i < L; i++) {
+      const child = i + 1 < L ? `n${i + 1}` : 'vuln'
+      packages[`n${i}@1.0.0` as DepPath] = { dependencies: { [child]: '1.0.0' }, resolution: { integrity: `n${i}-integrity` } }
+    }
+    packages['vuln@1.0.0' as DepPath] = { resolution: { integrity: 'vuln-integrity' } }
+
+    const result = buildAuditPathIndex({
+      importers: { ['.' as ProjectId]: { dependencies: { n0: '1.0.0' }, specifiers: { n0: '^1.0.0' } } },
+      lockfileVersion: LOCKFILE_VERSION,
+      packages,
+    }, new Set(['vuln']), { depTypes: {}, optionalOnly: new Set() })
+
+    const info = result['vuln']!.get('1.0.0')!
+    expect(info.paths).toHaveLength(1)
+    expect(info.paths[0].startsWith('.>n0>n1>')).toBe(true)
+    expect(info.paths[0].endsWith('>vuln')).toBe(true)
+  })
+
  test('buildAuditPathIndex() replaces slashes in workspace importer ids', () => {
    const lockfile = {
      importers: {