diff --git a/.changeset/fix-audit-cyclic-path-traversal-perf.md b/.changeset/fix-audit-cyclic-path-traversal-perf.md new file mode 100644 index 0000000000..192d14397f --- /dev/null +++ b/.changeset/fix-audit-cyclic-path-traversal-perf.md @@ -0,0 +1,8 @@ +--- +"@pnpm/deps.compliance.audit": patch +"pnpm": patch +--- + +Fix a `pnpm audit` performance regression on lockfiles that contain dependency cycles. The reachable-vulnerability pruning added in pnpm 11.5.1 only memoized acyclic subtrees, so any node whose subtree touched a cycle — together with all of its ancestors — was recomputed on every query, making the path walk quadratic. Reachability is now computed once per node using Tarjan's strongly-connected-components algorithm, so cyclic graphs are handled in linear time [#12212](https://github.com/pnpm/pnpm/issues/12212). + +The audit path walk also no longer recurses, so a deeply nested dependency graph can no longer overflow the call stack, and the install path to each finding is tracked without per-node copying, keeping memory linear in the graph depth. diff --git a/cspell.json b/cspell.json index 341cfa1baf..d8d7aba62a 100644 --- a/cspell.json +++ b/cspell.json @@ -185,6 +185,7 @@ "longlink", "longpaths", "loong", + "lowlink", "luca", "martensson", "maxtimeout", @@ -359,6 +360,7 @@ "stdtype", "streamsearch", "stringifying", + "strongconnect", "subcmd", "subdep", "subdependencies", @@ -381,6 +383,7 @@ "taffydb", "taiki", "tarballtemplate", + "Tarjan", "teambit", "tempy", "testcase", diff --git a/pnpm11/deps/compliance/audit/src/lockfileToAuditIndex.ts b/pnpm11/deps/compliance/audit/src/lockfileToAuditIndex.ts index c1398d68ab..0aa6aeba5c 100644 --- a/pnpm11/deps/compliance/audit/src/lockfileToAuditIndex.ts +++ b/pnpm11/deps/compliance/audit/src/lockfileToAuditIndex.ts @@ -94,10 +94,20 @@ export function lockfileToAuditRequest ( } // Build a visitor for one lockfile graph. The walker already de-duplicates - // by depPath internally, so we don't need a second visited set here. + // by depPath internally, so we don't need a second visited set here. An + // explicit frame stack stands in for recursion (registering each dependency + // before descending, preserving pre-order) so a deep dependency chain from an + // untrusted lockfile cannot overflow the call stack. const makeVisitor = (graphDepTypes: DepTypes, graphOptionalOnly: Set) => { - const visit = (step: LockfileWalkerStep): void => { - for (const { depPath, pkgSnapshot, next } of step.dependencies) { + return (rootStep: LockfileWalkerStep): void => { + const stack: Array<{ dependencies: LockfileWalkerStep['dependencies'], next: number }> = [{ dependencies: rootStep.dependencies, next: 0 }] + while (stack.length > 0) { + const frame = stack[stack.length - 1] + if (frame.next >= frame.dependencies.length) { + stack.pop() + continue + } + const { depPath, pkgSnapshot, next } = frame.dependencies[frame.next++] const { name, version } = nameVerFromPkgSnapshot(depPath, pkgSnapshot) if (version) { registerOccurrence({ @@ -107,10 +117,9 @@ export function lockfileToAuditRequest ( optionalOnly: graphOptionalOnly.has(depPath), }) } - visit(next()) + stack.push({ dependencies: next().dependencies, next: 0 }) } } - return visit } const visitMain = makeVisitor(depTypes, optionalOnly) @@ -191,11 +200,21 @@ function walkForPaths (ctx: WalkForPathsCtx): void { const packages = lockfile.packages ?? {} const reachableVulnerabilities = createReachableVulnerabilitiesGetter(lockfile, vulnerableNames, includeOptDeps) - // Reused across every root to avoid per-node Set cloning. visit adds the - // current depPath before recursing and removes it on the way back, so the - // set always reflects the current trail. + // Tracks the depPaths on the current DFS trail so cycles terminate. A frame is + // added when its node is opened and removed when the frame is unwound, so the + // set always reflects the path from the root to the current node. Explicit + // stack rather than recursion: a lockfile is untrusted input, and a deep + // dependency chain would otherwise overflow the call stack and crash the audit. const inTrail = new Set() - const visit = (edge: { name: string, depPath: DepPath }, trail: string[]): void => { + // The trail is a parent-linked chain of names rather than a copied array per + // node: copying would cost O(depth) memory and time at every node and make a + // deep chain O(depth^2). The chain is materialized into a path string only + // when a vulnerable node is recorded. + const stack: Array<{ depPath: DepPath, trail: TrailNode, children: Array<{ name: string, depPath: DepPath }>, next: number }> = [] + + // Apply the per-node logic and, unless the node is pruned, push a frame so its + // children are visited. Records a path when the node is itself vulnerable. + const open = (edge: { name: string, depPath: DepPath }, parentTrail: TrailNode): void => { const reachable = reachableVulnerabilities(edge) if (reachable.size === 0 || allReachableVulnerabilitiesSaturated(paths, reachable, depTypes, optionalOnly)) return if (inTrail.has(edge.depPath)) return @@ -203,92 +222,191 @@ function walkForPaths (ctx: WalkForPathsCtx): void { if (pkgSnapshot == null) return const { name, version } = nameVerFromPkgSnapshot(edge.depPath, pkgSnapshot) const resolvedName = name ?? edge.name - const fullPath = [...trail, resolvedName] + const trail: TrailNode = { name: resolvedName, parent: parentTrail } if (version && vulnerableNames.has(resolvedName)) { - recordPath(paths, resolvedName, version, fullPath.join('>'), + recordPath(paths, resolvedName, version, joinTrail(trail), depTypes[edge.depPath] === DepType.DevOnly, optionalOnly.has(edge.depPath)) } if (allReachableVulnerabilitiesSaturated(paths, reachable, depTypes, optionalOnly)) return - inTrail.add(edge.depPath) - try { - for (const child of resolvedDepsToNamedDepPaths(pkgSnapshot.dependencies ?? {})) { - visit(child, fullPath) - } - if (includeOptDeps) { - for (const child of resolvedDepsToNamedDepPaths(pkgSnapshot.optionalDependencies ?? {})) { - visit(child, fullPath) - } - } - } finally { - inTrail.delete(edge.depPath) + const children: Array<{ name: string, depPath: DepPath }> = [] + appendNamedDepPaths(children, pkgSnapshot.dependencies ?? {}) + if (includeOptDeps) { + appendNamedDepPaths(children, pkgSnapshot.optionalDependencies ?? {}) } + inTrail.add(edge.depPath) + stack.push({ depPath: edge.depPath, trail, children, next: 0 }) } for (const [importerId, importer] of Object.entries(lockfile.importers)) { - const trail = [importerSegmentOf(importerId)] + const trail: TrailNode = { name: importerSegmentOf(importerId), parent: null } const roots: Array<{ name: string, depPath: DepPath }> = [] - if (includeDeps) roots.push(...resolvedDepsToNamedDepPaths(importer.dependencies ?? {})) - if (includeDevDeps) roots.push(...resolvedDepsToNamedDepPaths(importer.devDependencies ?? {})) - if (includeOptDeps) roots.push(...resolvedDepsToNamedDepPaths(importer.optionalDependencies ?? {})) + if (includeDeps) appendNamedDepPaths(roots, importer.dependencies ?? {}) + if (includeDevDeps) appendNamedDepPaths(roots, importer.devDependencies ?? {}) + if (includeOptDeps) appendNamedDepPaths(roots, importer.optionalDependencies ?? {}) for (const root of roots) { - visit(root, trail) + open(root, trail) + while (stack.length > 0) { + const frame = stack[stack.length - 1] + if (frame.next < frame.children.length) { + open(frame.children[frame.next++], frame.trail) + } else { + inTrail.delete(frame.depPath) + stack.pop() + } + } } } } +// A node in the current DFS trail. Linking to the parent rather than copying the +// whole path keeps per-node memory O(1); `joinTrail` walks the chain to the root +// to produce the `a>b>c` string only when a path is actually recorded. +interface TrailNode { + name: string + parent: TrailNode | null +} + +function joinTrail (node: TrailNode): string { + const parts: string[] = [] + let current: TrailNode | null = node + while (current != null) { + parts.push(current.name) + current = current.parent + } + parts.reverse() + return parts.join('>') +} + +// For each node, the set of vulnerabilities reachable from it (itself included), +// used by the walker to prune subtrees that reach no unsaturated finding. +// Tarjan's SCC algorithm scans every node once and shares one set across a +// cycle, avoiding the quadratic recompute of memoizing only acyclic subtrees. function createReachableVulnerabilitiesGetter ( lockfile: LockfileObject, vulnerableNames: Set, includeOptDeps: boolean -): (edge: { name: string, depPath: DepPath }) => Set { +): (edge: { name: string, depPath: DepPath }) => ReadonlySet { const packages = lockfile.packages ?? {} - // Only fully-resolved (acyclic) subtrees are memoized. A node whose reachable - // set still depends on a back-edge to an ancestor is incomplete until the - // enclosing cycle is left, so caching it would under-report reachable - // vulnerabilities and let the walker prune valid audit paths. Such nodes are - // recomputed on each top-level query, which is cheap because cycles in a - // lockfile are rare and small. + // Final reachable set per node, shared across its SCC. const memo = new Map>() + // Per-node contribution, merged into the shared set when the SCC closes. + const partial = new Map>() + const index = new Map() + const lowlink = new Map() const onStack = new Set() - const compute = (edge: { name: string, depPath: DepPath }): { result: Set, complete: boolean } => { - const cached = memo.get(edge.depPath) - if (cached) return { result: cached, complete: true } - if (onStack.has(edge.depPath)) return { result: new Set(), complete: false } - const pkgSnapshot = packages[edge.depPath] - if (pkgSnapshot == null) return { result: new Set(), complete: true } + const sccStack: DepPath[] = [] + let counter = 0 - onStack.add(edge.depPath) - const result = new Set() - let complete = true - const { name, version } = nameVerFromPkgSnapshot(edge.depPath, pkgSnapshot) - const resolvedName = name ?? edge.name - if (version && vulnerableNames.has(resolvedName)) { - result.add(vulnerabilityKey(resolvedName, version, edge.depPath)) + // Iterative Tarjan: an explicit frame stack stands in for the recursive + // strongconnect, so a deep dependency chain from an untrusted lockfile cannot + // overflow the call stack. Each frame's `own` set is shared by reference with + // `partial`, so contributions merged in while the children run are visible + // when the SCC closes. + const buildScc = (rootEdge: { name: string, depPath: DepPath }): void => { + const work: Array<{ edge: { name: string, depPath: DepPath }, own: Set, children: Array<{ name: string, depPath: DepPath }>, next: number }> = [] + + const pushFrame = (edge: { name: string, depPath: DepPath }): void => { + index.set(edge.depPath, counter) + lowlink.set(edge.depPath, counter) + counter++ + sccStack.push(edge.depPath) + onStack.add(edge.depPath) + + // Derive children from this single read rather than via a helper that would + // read the snapshot again. + const pkgSnapshot = packages[edge.depPath] + const own = new Set() + const children: Array<{ name: string, depPath: DepPath }> = [] + if (pkgSnapshot != null) { + const { name, version } = nameVerFromPkgSnapshot(edge.depPath, pkgSnapshot) + const resolvedName = name ?? edge.name + if (version && vulnerableNames.has(resolvedName)) { + own.add(vulnerabilityKey(resolvedName, version, edge.depPath)) + } + appendNamedDepPaths(children, pkgSnapshot.dependencies ?? {}) + if (includeOptDeps) { + appendNamedDepPaths(children, pkgSnapshot.optionalDependencies ?? {}) + } + } + partial.set(edge.depPath, own) + work.push({ edge, own, children, next: 0 }) } - const collect = (child: { name: string, depPath: DepPath }): void => { - const childResult = compute(child) - addAll(result, childResult.result) - if (!childResult.complete) complete = false - } - for (const child of resolvedDepsToNamedDepPaths(pkgSnapshot.dependencies ?? {})) { - collect(child) - } - if (includeOptDeps) { - for (const child of resolvedDepsToNamedDepPaths(pkgSnapshot.optionalDependencies ?? {})) { - collect(child) + + pushFrame(rootEdge) + while (work.length > 0) { + const frame = work[work.length - 1] + if (frame.next < frame.children.length) { + const child = frame.children[frame.next++] + if (!index.has(child.depPath)) { + pushFrame(child) + continue + } + if (onStack.has(child.depPath)) { + lowlink.set(frame.edge.depPath, Math.min(lowlink.get(frame.edge.depPath)!, index.get(child.depPath)!)) + } + // Finalized successors are already in `memo`; same-SCC ones are folded in + // when the SCC closes. + const childReachable = memo.get(child.depPath) + if (childReachable) addAll(frame.own, childReachable) + continue + } + + const edge = frame.edge + if (lowlink.get(edge.depPath) === index.get(edge.depPath)) { + const members: DepPath[] = [] + // Reuse the first member's own set as the shared accumulator instead of + // allocating a fresh one, so the common singleton-SCC case finalizes + // without any extra Set allocation or copy. + let shared: Set | undefined + let member: DepPath + do { + member = sccStack.pop()! + onStack.delete(member) + members.push(member) + const own = partial.get(member)! + partial.delete(member) + if (shared === undefined) { + shared = own + } else { + addAll(shared, own) + } + } while (member !== edge.depPath) + for (const m of members) { + memo.set(m, shared!) + } + } + + work.pop() + // Apply the post-DFS update to the parent: propagate this node's lowlink + // and fold in its reachable set once finalized (same-SCC nodes are folded + // later, via `partial`, when the shared SCC root closes). + const parent = work[work.length - 1] + if (parent != null) { + lowlink.set(parent.edge.depPath, Math.min(lowlink.get(parent.edge.depPath)!, lowlink.get(edge.depPath)!)) + const childReachable = memo.get(edge.depPath) + if (childReachable) addAll(parent.own, childReachable) } } - onStack.delete(edge.depPath) - if (complete) memo.set(edge.depPath, result) - return { result, complete } } - return (edge) => compute(edge).result + + return (edge) => { + if (!index.has(edge.depPath)) buildScc(edge) + // strongconnect always finalizes the queried node's SCC, so its reachable + // set is present afterwards. A missing entry would be a bug that silently + // under-reports and hides a real finding, so fail loudly instead of + // returning an empty set. + const reachable = memo.get(edge.depPath) + if (reachable == null) { + throw new Error(`Reachable vulnerabilities were not computed for ${edge.depPath}`) + } + return reachable + } } function allReachableVulnerabilitiesSaturated ( paths: AuditPathIndex, - reachable: Set, + reachable: ReadonlySet, depTypes: DepTypes, optionalOnly: Set ): boolean { @@ -344,13 +462,15 @@ function recordPath (paths: AuditPathIndex, name: string, version: string, joine info.paths.push(joined) } -function resolvedDepsToNamedDepPaths (deps: ResolvedDependencies): Array<{ name: string, depPath: DepPath }> { - const result: Array<{ name: string, depPath: DepPath }> = [] +// Append rather than `target.push(...mapped(deps))`: a lockfile is untrusted +// input, and spreading a pathologically large dependency list into push() +// arguments can exceed the engine's argument limit and throw, crashing the +// audit. Appending in a loop also avoids the intermediate array. +function appendNamedDepPaths (target: Array<{ name: string, depPath: DepPath }>, deps: ResolvedDependencies): void { for (const [alias, ref] of Object.entries(deps)) { const depPath = dp.refToRelative(ref, alias) - if (depPath != null) result.push({ name: alias, depPath }) + if (depPath != null) target.push({ name: alias, depPath }) } - return result } // Returns the set of depPaths that are reachable only through optional edges @@ -392,16 +512,22 @@ export function collectOptionalOnlyDepPaths ( return result } +// Explicit stack rather than recursion: a lockfile is untrusted input, and a +// deep dependency chain would otherwise overflow the call stack. Order does not +// matter — the result is the reachable set, so a LIFO walk is equivalent. function walkReachable (lockfile: LockfileObject, depPaths: DepPath[], seen: Set, includeOptionalEdges: boolean): void { const packages = lockfile.packages ?? {} - for (const depPath of depPaths) { + const stack: DepPath[] = [] + for (const depPath of depPaths) stack.push(depPath) + while (stack.length > 0) { + const depPath = stack.pop()! if (seen.has(depPath)) continue seen.add(depPath) const snapshot = packages[depPath] if (!snapshot) continue - walkReachable(lockfile, resolvedDepsToDepPaths(snapshot.dependencies ?? {}), seen, includeOptionalEdges) + for (const child of resolvedDepsToDepPaths(snapshot.dependencies ?? {})) stack.push(child) if (includeOptionalEdges) { - walkReachable(lockfile, resolvedDepsToDepPaths(snapshot.optionalDependencies ?? {}), seen, includeOptionalEdges) + for (const child of resolvedDepsToDepPaths(snapshot.optionalDependencies ?? {})) stack.push(child) } } } diff --git a/pnpm11/deps/compliance/audit/test/index.ts b/pnpm11/deps/compliance/audit/test/index.ts index f4a68872e7..a773478246 100644 --- a/pnpm11/deps/compliance/audit/test/index.ts +++ b/pnpm11/deps/compliance/audit/test/index.ts @@ -308,6 +308,75 @@ describe('audit', () => { expect(result['x']!.get('1.0.0')!.paths.sort()).toEqual(['.>b>c>x', '.>c>x']) }) + test('buildAuditPathIndex() scales linearly, not quadratically, with cycle size', () => { + // n0 -> ... -> n(L-1) -> n0 is one big cycle with a single vulnerable leaf. + // Recomputing the cycle for every ancestor scans ~L^2 nodes, so doubling L + // quadruples the reads; linear work only doubles them. Comparing the growth + // rate of two sizes — rather than an absolute count — keeps the assertion + // robust to constant-factor changes in future refactors. + const countReads = (L: number): number => { + let reads = 0 + const importers = { + ['.' as ProjectId]: { dependencies: { n0: '1.0.0' }, specifiers: { n0: '^1.0.0' } }, + } + const packages: PackageSnapshots = {} + for (let i = 0; i < L; i++) { + const nextName = i + 1 < L ? `n${i + 1}` : 'n0' // the last node loops back to n0 + const deps = { [nextName]: '1.0.0', [`leaf${i}`]: '1.0.0' } + Object.defineProperty(packages, `n${i}@1.0.0`, { + enumerable: true, + get: () => { + reads++ + return { dependencies: deps, resolution: { integrity: `n${i}-integrity` } } + }, + }) + packages[`leaf${i}@1.0.0` as DepPath] = { resolution: { integrity: `leaf${i}-integrity` } } + } + + const result = buildAuditPathIndex({ + importers, + lockfileVersion: LOCKFILE_VERSION, + packages, + }, new Set(['leaf0']), { depTypes: {}, optionalOnly: new Set() }) + + // Correctness: the only vulnerable leaf is still reported with its path. + expect(result['leaf0']!.get('1.0.0')).toEqual({ + paths: ['.>n0>leaf0'], + dev: false, + optional: false, + }) + return reads + } + + // Linear ⇒ ratio ≈ 2; quadratic ⇒ ratio ≈ 4. Assert clearly sub-quadratic. + expect(countReads(400) / countReads(200)).toBeLessThan(3) + }) + + test('buildAuditPathIndex() handles a very deep dependency chain without overflowing the stack', () => { + // n0 -> n1 -> ... -> n(L-1) -> vuln is a single chain far deeper than the JS + // call-stack limit. A recursive walk (reachability or path traversal) would + // throw RangeError on this lockfile (a lockfile is untrusted input); the + // iterative implementation must complete and still report the leaf. + const L = 60_000 + const packages: PackageSnapshots = {} + for (let i = 0; i < L; i++) { + const child = i + 1 < L ? `n${i + 1}` : 'vuln' + packages[`n${i}@1.0.0` as DepPath] = { dependencies: { [child]: '1.0.0' }, resolution: { integrity: `n${i}-integrity` } } + } + packages['vuln@1.0.0' as DepPath] = { resolution: { integrity: 'vuln-integrity' } } + + const result = buildAuditPathIndex({ + importers: { ['.' as ProjectId]: { dependencies: { n0: '1.0.0' }, specifiers: { n0: '^1.0.0' } } }, + lockfileVersion: LOCKFILE_VERSION, + packages, + }, new Set(['vuln']), { depTypes: {}, optionalOnly: new Set() }) + + const info = result['vuln']!.get('1.0.0')! + expect(info.paths).toHaveLength(1) + expect(info.paths[0].startsWith('.>n0>n1>')).toBe(true) + expect(info.paths[0].endsWith('>vuln')).toBe(true) + }) + test('buildAuditPathIndex() replaces slashes in workspace importer ids', () => { const lockfile = { importers: {