From abd66bfca3c19288b6a12df80da943b547658837 Mon Sep 17 00:00:00 2001
From: Ben Meadors <benmmeadors@gmail.com>
Date: Sat, 13 Jun 2026 16:31:13 -0500
Subject: [PATCH] Add multi-hop NextHop recovery tests and unit tests for
 routing reliability

- Introduced a new test suite for multi-hop NextHop directed-message delivery and relay recovery in `test_nexthop_multihop_recovery.py`. This includes tests for end-to-end delivery and recovery after relay drop.
- Implemented unit tests in `test_main.cpp` for NextHop routing reliability mitigations, covering:
  - M1: Ambiguity-aware last-byte resolution.
  - M2: NextHopRouter's strict-neighbor gate and hop limit checks.
  - M3: Route-health freshness and failure decay.
- Enhanced mock classes to facilitate controlled testing of node behaviors and routing logic.
---
 docs/nexthop-routing-reliability.md           | 456 +++++++++++++++++
 .../mesh/test_nexthop_multihop_recovery.py    | 347 +++++++++++++
 src/mesh/MeshTypes.h                          |   5 +
 src/mesh/NextHopRouter.cpp                    | 159 +++++-
 src/mesh/NextHopRouter.h                      |  57 +++
 src/mesh/NodeDB.cpp                           |  67 +++
 src/mesh/NodeDB.h                             |  31 ++
 src/mesh/PacketHistory.cpp                    |   6 +-
 src/mesh/ReliableRouter.cpp                   |   4 +
 src/mesh/Router.cpp                           |  41 +-
 test/test_nexthop_routing/test_main.cpp       | 465 ++++++++++++++++++
 11 files changed, 1604 insertions(+), 34 deletions(-)
 create mode 100644 docs/nexthop-routing-reliability.md
 create mode 100644 mcp-server/tests/mesh/test_nexthop_multihop_recovery.py
 create mode 100644 test/test_nexthop_routing/test_main.cpp

diff --git a/docs/nexthop-routing-reliability.md b/docs/nexthop-routing-reliability.md
new file mode 100644
index 000000000..a41b32ef4
--- /dev/null
+++ b/docs/nexthop-routing-reliability.md
@@ -0,0 +1,456 @@
+# NextHop direct-message reliability on dense meshes — findings & plan
+
+**Status:** Proposal / design doc for branch handoff (no code changes yet)
+**Date:** 2026-06-13
+**Area:** `src/mesh` router stack (`NextHopRouter`, `ReliableRouter`, `FloodingRouter`, `Router`, `NodeDB`, `PacketHistory`)
+**Constraint:** No over-the-air / wire-format changes — `next_hop` and `relay_node` stay 1 byte, no `PacketHeader` changes, no breaking protobuf changes. All new state is RAM-only.
+
+This document captures the analysis and the proposed mitigations so the work can be
+continued on this branch by anyone. It is intentionally code-grounded (file:line
+references throughout) and standalone — you should not need the original investigation
+context to pick it up.
+
+---
+
+## TL;DR
+
+NextHop routing for direct messages (DMs) is unreliable on dense meshes. The headline
+cause is the **birthday problem**: `next_hop` and `relay_node` are each a single byte
+(the last byte of a 32-bit node number), so on a mesh of N nodes the probability that
+two share the same byte hits ~50% at **~19 nodes** and is near-certain by 50–100. But
+there are **other, equally important issues**: that single byte is trusted blindly at
+five different code sites, learned routes **never decay**, routes are learned from the
+**reverse (ACK) path** (asymmetric-link hazard), and collision-driven spurious
+rebroadcasts **amplify congestion** exactly when the mesh is busy.
+
+Because we can't widen the on-wire field, the fix is **interpretation-side** ("don't
+trust a byte that doesn't map to a unique reachable neighbor — flood instead") plus
+**recovery-side** ("decay stale/failing routes so they get re-discovered"). Four
+mitigations, M1–M4, all RAM-only. The net behavioral change: on dense/mobile meshes a
+DM that today silently misroutes or black-holes instead falls back to managed flooding
+(which still delivers) and re-learns a fresh route quickly. Sparse-mesh happy paths are
+unchanged.
+
+---
+
+## How NextHop routing works today (mechanics)
+
+Inheritance chain: `Router` → `FloodingRouter` → `NextHopRouter` → `ReliableRouter`.
+
+**The single-byte identifiers.** Both routing bytes come from one helper:
+
+```cpp
+// src/mesh/NodeDB.h:255
+uint8_t getLastByteOfNodeNum(NodeNum num) { return (uint8_t)((num & 0xFF) ? (num & 0xFF) : 0xFF); }
+```
+
+It projects a 32-bit node number onto 255 values (`0x00` is remapped to `0xFF` so it
+never collides with the `0`-valued sentinels `NO_NEXT_HOP_PREFERENCE` / `NO_RELAY_NODE`,
+`src/mesh/MeshTypes.h:44-46`). `next_hop` and `relay_node` in the packet header are
+`uint8_t` (`src/mesh/mesh.pb.h`, comments "Last byte of the node number…"). The learned
+route stored per destination, `meshtastic_NodeInfoLite::next_hop`, is also a single byte
+(`src/mesh/generated/meshtastic/deviceonly.pb.h:83`).
+
+**Sending a DM** — `NextHopRouter::send` (`src/mesh/NextHopRouter.cpp:23`):
+
+1. `p->relay_node = getLastByteOfNodeNum(getNodeNum())` (mark ourselves as relayer).
+2. `p->next_hop = getNextHop(p->to, p->relay_node)` (`src/mesh/NextHopRouter.cpp:192`):
+   look up `nodeDB->getMeshNode(to)->next_hop`; return it unless it equals the relayer
+   byte; otherwise `NO_NEXT_HOP_PREFERENCE` (→ flood).
+
+**Relaying** — `NextHopRouter::perhapsRebroadcast` (`src/mesh/NextHopRouter.cpp:133`):
+rebroadcast iff `next_hop == NO_NEXT_HOP_PREFERENCE` (flood) **or**
+`next_hop == getLastByteOfNodeNum(getNodeNum())` (we are the addressed next hop)
+(`:147`). Each node only ever compares against **its own** byte.
+
+**Learning** — `NextHopRouter::sniffReceived` (`src/mesh/NextHopRouter.cpp:89`): on an
+ACK/reply (`request_id`/`reply_id` set), if the relayer of the ACK was also a relayer of
+the original packet (validated via `PacketHistory::checkRelayers`), set
+`origTx->next_hop = p->relay_node` (`:114`). I.e. the **forward** next-hop is learned
+from the **reverse** path's relayer.
+
+**Retransmission / fallback** — `NextHopRouter::doRetransmissions`
+(`src/mesh/NextHopRouter.cpp:284`). Budgets: `NUM_RELIABLE_RETX=3` (originator: initial
+
+- 2 retries), `NUM_INTERMEDIATE_RETX=2` (relayer: 1 retry). On the **last** retry
+  (`numRetransmissions==1`) it resets `next_hop` to `NO_NEXT_HOP_PREFERENCE` on the packet
+  **and** clears `sentTo->next_hop` in NodeDB, then floods (`:313-321`). Retransmit timing
+  comes from `iface->getRetransmissionMsec`, whose contention window **grows with channel
+  utilization** (`src/mesh/RadioInterface.cpp` `getTxDelayMsec`/`getTxDelayMsecWeighted`).
+
+**Dedup / relayer history** — `PacketHistory` (`src/mesh/PacketHistory.cpp`): a bounded
+ring (`PACKETHISTORY_MAX = max(MAX_NUM_NODES*2, 100)`, 20 B/record) keyed by
+`(sender,id)`, tracking up to `NUM_RELAYERS=6` relayer **bytes** per packet in
+`relayed_by[]`. `wasRelayer` (`:490`) and `checkRelayers` (`:517`) match bytes against
+that array.
+
+---
+
+## Root-cause analysis
+
+### 1. The single byte is trusted blindly at five sites (the birthday problem)
+
+| #   | Site                             | File:line                   | Failure on collision                                                                                                      |
+| --- | -------------------------------- | --------------------------- | ------------------------------------------------------------------------------------------------------------------------- |
+| 1   | Rebroadcast self-check           | `NextHopRouter.cpp:147`     | A remote "impostor" node sharing the intended next-hop's byte also rebroadcasts → wasted airtime / congestion.            |
+| 2   | Route learning                   | `NextHopRouter.cpp:111-114` | Stores an ambiguous byte as the route; later resolves to the wrong physical node.                                         |
+| 3   | Relayer validation               | `PacketHistory.cpp:490-538` | `wasRelayer(byte)` returns true for the wrong node → mis-validated ACK / mis-learn.                                       |
+| 4   | Favorite-router hop preservation | `Router.cpp:120-145`        | **First** NodeDB node whose last byte matches wins — non-deterministic; can preserve hops for the wrong relay (hop leak). |
+| 5   | Send-path lookup                 | `NextHopRouter.cpp:192-207` | Emits a byte that may address the wrong node; no check it still maps to a reachable neighbor.                             |
+
+Collision math (uniform last byte over 255 buckets): P(collision) ≈ 50% at ~19 nodes,
+
+> 99% by ~75 nodes. Dense meshes are squarely in the "always colliding" regime.
+
+### 2. Stale routes never decay
+
+The learned `next_hop` byte is cleared only on the **current DM's** last retry
+(`NextHopRouter.cpp:313-321`). A route learned hours ago that has since gone dead is
+still trusted on the **next** DM's first attempt — which on a congested mesh is also the
+slowest attempt. Result: silent black-hole at a dead hop until the retransmission budget
+drains, then a late flood. Intermediate nodes hold stale routes indefinitely.
+
+### 3. Reverse-path (asymmetric-link) learning
+
+`origTx->next_hop` is learned from the ACK's relayer (`NextHopRouter.cpp:110-114`) — the
+**reverse** direction. RF links are frequently asymmetric, so the best reverse relay can
+be a poor forward relay. Worse, the next reverse ACK immediately re-learns the same bad
+hop, so the route **flaps** back to the bad value even after a failure reset.
+
+### 4. Congestion amplification
+
+Collision-driven impostor rebroadcasts (issue 1) add airtime; the contention window
+grows with channel utilization, so retransmit intervals **lengthen** exactly when the
+mesh is busy. The 3-try reliable budget can then expire before delivery. On dense
+meshes, efficiency _is_ reliability.
+
+### Note: pubkey-derived node numbers (develop / 2.8) — does not change the plan
+
+develop derives the node number from the public key:
+`my_node_num = crc32Buffer(public_key)` (`src/mesh/NodeDB.cpp:481`), re-derived on key
+change in `createNewIdentity()` (`src/mesh/NodeDB.cpp:3113`). This **reinforces** the
+plan rather than changing it:
+
+- **Birthday problem unchanged and now textbook-exact.** CRC32 mixes well → the last
+  byte is uniformly distributed over 256 values. Derivation adds no wire bits.
+- **Node numbers are now immutable / identity-bound.** Pre-2.8 `pickNewNodeNum()` could
+  renumber a node to dodge a conflict; now the number is fixed by the key, so a last-byte
+  collision **cannot be resolved operationally by renumbering** → M1/M2/M3 become _more_
+  necessary.
+- **Resolver gets cleaner inputs.** Stable node numbers keep a learned byte bound to one
+  identity (good for M3 freshness). `createNewIdentity()` retires the old entry by marking
+  it **ignored** and clearing its pubkey (`src/mesh/NodeDB.cpp:3123-3125`), which M1's
+  candidate gate already skips — so key rotation can't pollute resolution.
+- **No wire-free disambiguation unlocked.** A receiver still gets only 1 byte and cannot
+  recover which full node number a colliding value meant — so "detect ambiguity → flood"
+  remains the correct strategy.
+
+---
+
+## Proposed mitigations
+
+Key insight for all of M1/M2: **a 1-byte ID only needs to be unique among a node's
+direct neighbors / plausible relays, not the whole mesh.** That candidate set is small
+(typically 5–15), so a byte usually resolves unambiguously there; when it doesn't, fall
+back to the _safe_ behavior (flood / decrement / don't-learn).
+
+### M1 — Ambiguity-aware last-byte resolution (new NodeDB primitive)
+
+New types + methods in `src/mesh/NodeDB.h` (near line 255) / `src/mesh/NodeDB.cpp`
+(near `getMeshNode`, ~2936):
+
+```cpp
+enum class LastByteResolution : uint8_t { None, Unique, Ambiguous };
+struct ResolvedNode { LastByteResolution status = LastByteResolution::None; NodeNum num = 0; };
+
+// Resolve a single on-wire last-byte to a unique full NodeNum among relevant candidates.
+ResolvedNode resolveLastByte(uint8_t lastByte, bool requireDirectNeighbor);
+// Convenience: true iff exactly one relevant candidate (Ambiguous and None both -> false = SAFE).
+bool resolveUniqueLastByte(uint8_t lastByte, bool requireDirectNeighbor, NodeNum *outNum = nullptr);
+```
+
+- **One linear pass** over `meshNodes`, reusing `getNumMeshNodes()`/`getMeshNodeByIndex()`,
+  the bitfield helpers (`nodeInfoLiteIsFavorite/HasUser/IsIgnored`), `sinceLastSeen()`,
+  and `getLastByteOfNodeNum()`. **Early-exit** on the 2nd match (return `Ambiguous`).
+- **Guard:** `if (lastByte == 0) return {None, 0};` (covers `NO_RELAY_NODE` / MQTT-invalid).
+- **Candidate gate** (skip): `num == getNodeNum()` (never resolve to ourselves), `num == 0`,
+  `num == NODENUM_BROADCAST`, `nodeInfoLiteIsIgnored`. Then match
+  `getLastByteOfNodeNum(node->num) == lastByte` (cheapest test last, mirroring `Router.cpp:119`).
+- **Relevance gate:**
+  - `requireDirectNeighbor == true` (strict, for SEND): `has_hops_away && hops_away == 0`
+    **and** `sinceLastSeen(node) < NEXTHOP_NEIGHBOR_FRESH_SECS`.
+  - `requireDirectNeighbor == false` (lenient, for learn / hop-preserve): accept if direct
+    neighbor **or** `nodeInfoLiteIsFavorite` **or** role ∈ {ROUTER, ROUTER_LATE, CLIENT_BASE}.
+- **No tie-break.** A collision must return `Ambiguous` — picking "best SNR" would
+  resurrect the silent-misroute bug. (Deliberate non-goal; document in code.)
+
+New constant in `src/mesh/MeshTypes.h` (near line 44):
+`#define NEXTHOP_NEIGHBOR_FRESH_SECS (60 * 60 * 2)` (mirrors `NUM_ONLINE_SECS`).
+
+### M2 — Only route on bytes that resolve to a unique, reachable neighbor
+
+In `getNextHop` (`src/mesh/NextHopRouter.cpp:192-207`), after the existing split-horizon
+check (`node->next_hop != relay_node`), require the stored byte to resolve to a **unique,
+currently-fresh direct neighbor**; else flood:
+
+```cpp
+if (node->next_hop != relay_node) {
+    ResolvedNode r = nodeDB->resolveLastByte(node->next_hop, /*requireDirectNeighbor=*/true);
+    if (r.status == LastByteResolution::Unique) return node->next_hop;
+    LOG_WARN("Next hop 0x%x for 0x%x %s -> flood", node->next_hop, to,
+             r.status == LastByteResolution::Ambiguous ? "ambiguous among neighbors" : "no longer a neighbor");
+    return std::nullopt;
+}
+```
+
+This self-heals when a neighbor goes away (unicast-into-a-void becomes a flood). It
+applies to originating, relaying, and retrying, since all route through `getNextHop`.
+
+Apply M1's safe fallback at the other sites:
+
+- **Learning** (`NextHopRouter.cpp:111-114`): gate `origTx->next_hop = p->relay_node` on
+  `resolveUniqueLastByte(p->relay_node, /*direct=*/false)`. Ambiguous/unknown → don't
+  learn (leave route unset → flood).
+- **Favorite-router preservation** (`Router.cpp:120-145`): replace the "first match wins"
+  loop with `resolveUniqueLastByte(p->relay_node, /*direct=*/false)` + a re-check that the
+  resolved node is favorite/has_user/router. Ambiguous/none/not-favorite → **decrement**
+  (safe). Net: removes one full DB scan, adds one resolver scan (wash).
+
+**Left unchanged, by design (document why in code):**
+
+- **Site 1** rebroadcast self-check (`NextHopRouter.cpp:147`) and self-identity checks
+  (`ReliableRouter.cpp:127`): a node matches its **own** byte — no DB resolution helps. A
+  remote impostor sharing the intended next-hop's byte will still rebroadcast. M1/M2
+  shrink the blast radius by reducing how often an ambiguous byte is ever stored or
+  originated; a true fix needs a wider field (out of scope). **This is the one residual
+  the plan cannot fully close.**
+- **Site 3** `wasRelayer`/`checkRelayers` (`PacketHistory.cpp:490-538`): intentionally
+  byte-domain (both sides are on-wire bytes); the consumer (learning) is now hardened.
+  Add a one-line comment; do not change.
+
+### M3 — Route freshness / failure memory (RAM table on NextHopRouter)
+
+A bounded, LRU-evicted table keyed by destination, mirroring `PacketHistory`'s
+reuse-oldest discipline (not an unbounded map) to cap RAM.
+
+`src/mesh/NextHopRouter.h` (near `pending`, line 99):
+
+```cpp
+struct RouteHealth {
+    NodeNum dest = 0;                  // 0 == empty slot
+    uint32_t learnedAtMsec = 0;        // millis() at last (re)learn; rollover-aware
+    uint8_t consecutiveFailures = 0;
+    uint8_t lastNextHop = NO_NEXT_HOP_PREFERENCE; // byte this health refers to
+};
+static constexpr uint8_t ROUTE_HEALTH_MAX = 32;   // ~384B; drop to 16 if RAM-tight
+RouteHealth routeHealth[ROUTE_HEALTH_MAX] = {};
+// Helpers take `now` (pure/testable): findRouteHealth, getOrAllocRouteHealth,
+// noteRouteLearned, noteRouteSuccess, noteRouteFailure, isRouteStale, clearRouteHealth
+```
+
+Policy:
+
+| Constant                  | Value  | Rationale                                                                                                                                              |
+| ------------------------- | ------ | ------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `ROUTE_TTL_MSEC`          | 30 min | Survives a normal conversation; re-discovers a moved node within a telemetry interval.                                                                 |
+| `ROUTE_FAILURE_THRESHOLD` | 3      | 1–2 consecutive failures are transient LoRa collisions; 3 to the same hop = dead. Accumulates **across** DMs (independent of the per-DM 3-try budget). |
+
+`isRouteStale(h, now)` = `(now - h.learnedAtMsec) >= ROUTE_TTL_MSEC || h.consecutiveFailures >= ROUTE_FAILURE_THRESHOLD`.
+All age math uses **unsigned subtraction** (rollover-safe, matching
+`PacketHistory.cpp:364`); treat `learnedAtMsec == 0` as "set now".
+
+Wiring (as built — `src/mesh/NextHopRouter.cpp`, `src/mesh/ReliableRouter.cpp`):
+
+- `getNextHop`: if a health record matches the stored byte and `isRouteStale`, clear
+  `node->next_hop` (NodeDB) **and** `clearRouteHealth`, return `nullopt` (flood). No
+  record yet (cold path, first DM after boot) → trust NodeDB, but the M2 strict-neighbor
+  gate still applies.
+- `sniffReceived` learn: gate the write through `resolveUniqueLastByte` (M2), then
+  `noteRouteLearned(p->from, p->relay_node, millis())` — resets `consecutiveFailures`
+  **only if the hop changed** (anti-flap for asymmetric re-learn); otherwise just refreshes
+  `learnedAtMsec`. (No success signal is taken on the intermediate reverse-pass: an ACK
+  merely passing through us is not proof that _we_ delivered, and resetting failures there
+  would reintroduce the asymmetric flap.)
+- `doRetransmissions`: on the last-retransmission branch (`numRetransmissions == 1`, the
+  point a directed delivery has gone un-ACKed for both originator and intermediate) →
+  `noteRouteFailure(to)`, then the existing NodeDB `next_hop` reset + flood. We deliberately
+  do **not** `clearRouteHealth` here: keeping the record is what lets the failure count
+  accumulate across DMs so a flapping reverse-path-relearned dead hop eventually ages out.
+- `ReliableRouter::sniffReceived` ACK path → `noteRouteSuccess(getFrom(p), millis())`
+  (an end-to-end ACK addressed to us is genuine forward-delivery proof; clears failures and
+  refreshes freshness). `noteRouteSuccess`/`noteRouteFailure` are no-ops when no record
+  exists, so flood-only destinations never pollute the table.
+
+**Reconciliation (no double-handling):** `doRetransmissions` owns _in-flight_ failure of
+the current DM (reset NodeDB `next_hop` + flood, and bump the cross-DM failure counter);
+`getNextHop` owns _between-DM_ staleness (TTL or failure-threshold → flood + clear). The
+only place that erases a health record is the `getNextHop` decay path; the retransmission
+path leaves it intact so the counter survives a reverse-path re-learn.
+
+### M4 — Earlier flood for unverified routes (gated, off by default)
+
+Compile-gated so healthy sparse meshes are untouched. **Default is off** — the define
+lives in `NextHopRouter.h` and must be flipped to measure:
+`#define NEXTHOP_EARLY_FLOOD_ON_UNVERIFIED 1`.
+
+In `doRetransmissions`, the directed-retry `else` branch: if the route is **not verified**
+(`!findRouteHealth(to) || consecutiveFailures > 0 || isRouteStale`), reset `next_hop` and
+flood on this attempt instead of spending another directed try. A **verified** route
+(record present, `consecutiveFailures == 0`, within TTL — i.e. recently ACKed) takes the
+unchanged directed-retry path, so the sparse-mesh happy path is untouched. Trade-off:
+airtime ↔ latency; the gate ensures we never pay the flood cost on a proven route, only on
+one we already distrust. Off by default precisely so it can be A/B-measured on the
+simulator before broad enable.
+
+---
+
+## Files to modify
+
+| File                                        | Change                                                                                                                                               |
+| ------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `src/mesh/MeshTypes.h`                      | `NEXTHOP_NEIGHBOR_FRESH_SECS`, `ROUTE_TTL_MSEC`, `ROUTE_FAILURE_THRESHOLD`, `NEXTHOP_EARLY_FLOOD_ON_UNVERIFIED`                                      |
+| `src/mesh/NodeDB.h` / `src/mesh/NodeDB.cpp` | `LastByteResolution`, `ResolvedNode`, `resolveLastByte`, `resolveUniqueLastByte`                                                                     |
+| `src/mesh/NextHopRouter.h`                  | `RouteHealth` + array + helpers; `#ifdef PIO_UNIT_TESTING public:` for helpers and `getNextHop`                                                      |
+| `src/mesh/NextHopRouter.cpp`                | `getNextHop` (M2 gate + M3 decay); `sniffReceived` (learn gate + health seed + success); `doRetransmissions` (failure counting + M4); comment site 1 |
+| `src/mesh/Router.cpp`                       | `shouldDecrementHopLimit` → resolver + favorite/router re-check                                                                                      |
+| `src/mesh/ReliableRouter.cpp`               | ACK path → `noteRouteSuccess`                                                                                                                        |
+| `test/test_nexthop_routing/test_main.cpp`   | **new** unit suite (auto-built under `[env:native]`)                                                                                                 |
+
+**Reuse, don't reinvent:** `getLastByteOfNodeNum`, `sinceLastSeen`, the bitfield helpers,
+`getMeshNodeByIndex`/`getNumMeshNodes`, PacketHistory's reuse-oldest eviction shape, and
+`MockNodeDB::addTestNode` (from `test/test_hop_scaling/test_main.cpp`).
+
+---
+
+## Edge cases
+
+- **`0x00`↔`0xFF` projection:** the resolver compares via `getLastByteOfNodeNum` on both
+  sides, so a `…00` node and a `…FF` node correctly collide on `0xFF` → `Ambiguous`. Test
+  explicitly.
+- **MQTT packets:** `relay_node`/`next_hop` are forced invalid when `hop_start == 0`
+  (`src/mesh/RadioLibInterface.cpp:603-605`) → byte 0 → resolver `None` → don't learn
+  (correct).
+- **`has_hops_away == false`** nodes are excluded from the strict gate (never fabricate a
+  Unique neighbor for M2); admitted to the lenient gate only via favorite/router role.
+  Safe; self-corrects once `hops_away` is learned.
+- **Self / broadcast:** the resolver skips `getNodeNum()` and `NODENUM_BROADCAST`;
+  `getNextHop` already early-returns for broadcast.
+- **Perf:** M2 adds one O(N) resolver scan per directed send/relay (early-exit on the 2nd
+  match), cheaper than the crypto already on that path; site-4 is a wash. If ever hot, a
+  future 256-entry last-byte index is the optimization (not now — RAM).
+
+---
+
+## Verification (all tiers)
+
+### 1. Native unit tests — new `test/test_nexthop_routing/test_main.cpp`
+
+`pio test -e native -f test_nexthop_routing`; on macOS `./bin/test-native-docker.sh -f test_nexthop_routing`.
+Design the RouteHealth helpers to take `now` as a parameter so the 30-min TTL logic is
+testable without a clock mock.
+
+- **Resolver:** None / Unique / **Ambiguous (birthday collision)** / strict-excludes-stale /
+  strict-excludes-far / lenient-includes-favorite-router / lenient-collision / skips-self /
+  skips-ignored / **`0x00`↔`0xFF` collision** / early-exit.
+- **`getNextHop`:** unique→byte, **ambiguous→nullopt**, stale-neighbor→nullopt,
+  split-horizon (relay==next_hop)→nullopt, broadcast→nullopt.
+- **RouteHealth:** TTL boundary, **rollover** (learn near `0xFFFFFFFF`, check after wrap),
+  failure threshold, success-resets, **re-learn-same-hop keeps fails (anti-flap)**,
+  re-learn-new-hop resets, LRU eviction bound, clear.
+- **Site-4:** preserve on unique favorite router; **decrement on two colliding favorites**;
+  decrement when the resolved node is not a favorite.
+- **Sparse-mesh regression:** all-distinct last bytes → every resolve Unique, `getNextHop`
+  returns the stored byte unchanged (proves no happy-path change).
+- Re-run `test_packet_history` and `test_hop_scaling` for no regression.
+
+### 2. portduino SimRadio simulator
+
+`pio run -e native && ./bin/test-simulator.sh`. Best vehicle for the **intermediate-node**
+path the 2-device bench can't reach. Line topology A — B — C: establish A→C (B learns a
+directed route), stop B relaying that dest, confirm A re-discovers via flood within
+`ROUTE_FAILURE_THRESHOLD` and that B's `noteRouteFailure`/`clearRouteHealth` fires (visible
+via the `LOG_INFO "Route to … stale"` / "Resetting next hop" lines). Use this to A/B M4
+(attempts-to-delivery, total airtime).
+
+### 3. Hardware via meshtastic MCP (auto-detect; 3+ devices for a real hop)
+
+- `mcp-server/tests/mesh/test_nexthop_multihop_recovery.py` — **the multi-hop validator
+  for this work** (added on this branch). Self-discovers an A — relay — C line, asserts a
+  directed DM is delivered across the relay (next_hop + M1/M2/M3 engaged), and asserts
+  delivery recovers after the relay is power-cycled (M3). Skips unless the bench is a true
+  multi-hop line (≥3 roles via `--hub-profile`, endpoints out of direct RF range).
+- `mcp-server/tests/mesh/test_direct_with_ack.py` — happy-path regression: a fresh/unique
+  route still delivers a want_ack DM on the first/second try (M4's gate must keep this
+  green).
+- `mcp-server/tests/mesh/test_peer_offline_recovery.py` — 2-device recovery validator: peer
+  off mid-conversation then back. Must stay green and ideally recover in fewer attempts.
+
+### 4. Build / format sanity
+
+native-macos **and** Docker both ways; trunk clang-format@16.0.3; a release `pio run` to
+confirm the `#ifdef PIO_UNIT_TESTING` visibility widening does **not** leak into
+production; sanity-check RAM headroom on the smallest nRF52 build for the ~384 B table.
+
+---
+
+## Verification status (as built on `nexthop-redux`)
+
+| Tier                             | What ran                                                                            | Result              |
+| -------------------------------- | ----------------------------------------------------------------------------------- | ------------------- |
+| Unit (native-macos)              | `test_nexthop_routing` (31 cases)                                                   | ✅ 31/31            |
+| Unit (Docker / Linux, CI parity) | `test_nexthop_routing`                                                              | ✅ 31/31            |
+| Regression                       | `test_packet_history`, `test_hop_scaling`, `test_mqtt`, `test_traffic_management`   | ✅ 105/105          |
+| Build                            | `pio run -e native-macos` (M4 off) and with `-DNEXTHOP_EARLY_FLOOD_ON_UNVERIFIED=1` | ✅ both link        |
+| Format                           | trunk `clang-format@16.0.3`                                                         | ✅ no issues        |
+| Simulator (CI `simulator-tests`) | `meshtasticd -s` + `meshtastic.test.testSimulator()` on native-macos                | ✅ exit 0, no crash |
+
+**Pending (environment-blocked, not yet run):**
+
+- **Multi-hop A–B–C recovery sim** — the `simulator/` broker hub is **not git-tracked**
+  (only stale local `.pyc`), and two `meshtasticd -s` instances can't hear each other
+  without it. The intermediate-node failure-count path and the M4 A/B therefore have unit
+  coverage of their logic but no end-to-end multi-node run yet.
+- **Hardware / multi-hop tier** — a committable bench test now exists:
+  `mcp-server/tests/mesh/test_nexthop_multihop_recovery.py`. It self-discovers a real
+  multi-hop pair (A — relay — C), asserts a directed DM is delivered across the relay, and
+  asserts delivery recovers after the relay is power-cycled (the M3 path). It
+  `pytest.skip`s cleanly unless the bench is a true line with endpoints out of direct RF
+  range (≥3 roles via `--hub-profile`), so it's safe to commit and only asserts when the
+  NextHop path is genuinely exercised. Collected + verified to skip without hardware;
+  not yet run on a bench. `test_direct_with_ack.py` / `test_peer_offline_recovery.py`
+  remain the 2-device happy-path/recovery regressions.
+
+---
+
+## Risks & limitations
+
+- **Site-1 impostor rebroadcast** is unfixable without a wider field — documented; M1/M2
+  only shrink its frequency.
+- **Dense meshes flood DMs more often** — intended (a flooded DM arrives; a mis-unicast one
+  black-holes). Call out in the PR so reviewers expect a slightly higher DM flood rate on
+  very dense meshes.
+- **M4 airtime** if the gate is too loose → default conservative + compile-gated +
+  simulator A/B before broad enable.
+- **RAM** ~384 B (32 slots); 16 slots (~192 B) with graceful LRU degradation if tight.
+- **Asymmetric flap** not fully closed (a _new_ bad hop resets the counter); the TTL
+  backstop bounds it. Per-hop failure history is future work (more RAM).
+
+---
+
+## How to continue this work (commit sequencing)
+
+Each step is independently testable; land them as separate commits.
+
+1. **M1 resolver + unit tests** — `NodeDB` only; no behavior change until wired. Lands the
+   `resolveLastByte`/`resolveUniqueLastByte` primitive and its full unit-test matrix.
+2. **M2 + wiring + tests** — `getNextHop` strict gate, learning gate, favorite-router
+   preservation rewrite. Adds the `getNextHop` and site-4 tests.
+3. **M3 health table + decay + tests** — RAM `RouteHealth` table, decay-on-read, failure/
+   success accounting, reconciliation with the existing last-retry reset. Adds the
+   route-health unit tests and the simulator recovery check.
+4. **M4 gated tuning** — early-flood-on-unverified behind the compile flag; simulator A/B
+   and hardware regression.
+
+Reference plan (with the same content) was developed at
+`~/.claude/plans/nexthop-routing-for-direct-lexical-shell.md` on the author's machine; this
+in-repo doc is the canonical handoff copy.
diff --git a/mcp-server/tests/mesh/test_nexthop_multihop_recovery.py b/mcp-server/tests/mesh/test_nexthop_multihop_recovery.py
new file mode 100644
index 000000000..a61645c91
--- /dev/null
+++ b/mcp-server/tests/mesh/test_nexthop_multihop_recovery.py
@@ -0,0 +1,347 @@
+"""Multi-hop NextHop directed-message delivery + relay-recovery (bench test).
+
+This is the hardware/tier-3 validator for the NextHop DM reliability work
+(see `docs/nexthop-routing-reliability.md`). The unit suite
+`test/test_nexthop_routing` covers the routing *logic* exhaustively; this test
+covers the *end-to-end* multi-hop behavior that only a real (or RF-separated)
+mesh exercises:
+
+  * a directed DM that must traverse a relay is delivered (next_hop routing +
+    the M1/M2 ambiguity gate + M3 route learning all engage), and
+  * when the established relay drops and returns, delivery recovers rather than
+    black-holing (the M3 stale-route decay / re-learn path).
+
+TOPOLOGY REQUIREMENT — why this usually SKIPS:
+  A NextHop relay only happens when the two endpoints are NOT direct neighbors.
+  Three co-located radios all hear each other, so A→C is a single direct hop and
+  next_hop never engages. To run this test the bench must be a *line* — A — B — C
+  — with the endpoints out of each other's direct RF range (physical distance or
+  attenuators). The `multihop_topology` fixture detects this automatically: it
+  warms the mesh, looks for a pair that is ≥1 hop apart, confirms the relay via
+  traceroute, and `pytest.skip`s cleanly when the bench is all-direct. So this
+  file is safe to commit and run anywhere — it only *asserts* when the topology
+  genuinely requires a relay.
+
+REQUIREMENTS:
+  * ≥3 baked devices. The default hub profile is 2 roles (nrf52, esp32s3); add a
+    third via `--hub-profile=path/to/hub.yaml` (see conftest `hub_profile`).
+  * The relay-recovery test additionally needs uhubctl + a power-controllable
+    relay port (same gate the other power tests use).
+"""
+
+from __future__ import annotations
+
+import time
+from typing import Any
+
+import pytest
+from meshtastic_mcp.connection import connect
+from tests import _power
+from tests._port_discovery import resolve_port_by_role
+
+from ._receive import ReceiveCollector, nudge_nodeinfo, nudge_nodeinfo_port
+
+
+def _hops_away(rec: dict[str, Any]) -> int | None:
+    """Read a node's hop distance from a `nodesByNum` entry, tolerating either
+    the camelCase (`hopsAway`) or snake_case (`hops_away`) spelling depending on
+    the meshtastic-python version."""
+    for key in ("hopsAway", "hops_away"):
+        val = rec.get(key)
+        if isinstance(val, int):
+            return val
+    return None
+
+
+def _warm_mesh(ports: list[str], rounds: int = 2, settle: float = 6.0) -> None:
+    """Flood a fresh NodeInfo from every node so the whole mesh (including
+    multi-hop pairs, reached via relayed broadcasts) populates pubkeys and hop
+    distances. Best-effort — a single node failing to nudge shouldn't abort."""
+    for _ in range(rounds):
+        for port in ports:
+            try:
+                nudge_nodeinfo_port(port)
+            except Exception:  # noqa: BLE001 — warmup is best-effort
+                pass
+            time.sleep(0.5)
+        time.sleep(settle)
+
+
+def _wait_for_pubkey(
+    tx_iface: Any, rx_num: int, rx_port: str, deadline_s: float = 90.0
+) -> bool:
+    """Block until `tx_iface` holds `rx_num`'s public key (directed PKI sends
+    NAK without it). Re-nudges both sides periodically; multi-hop warmup is
+    slower than the 2-device case because NodeInfo must be relayed, hence the
+    longer default deadline."""
+    deadline = time.monotonic() + deadline_s
+    last_nudge = time.monotonic()
+    while time.monotonic() < deadline:
+        rec = (tx_iface.nodesByNum or {}).get(rx_num, {})
+        if rec.get("user", {}).get("publicKey"):
+            return True
+        if time.monotonic() - last_nudge > 20.0:
+            nudge_nodeinfo_port(rx_port)
+            nudge_nodeinfo(tx_iface)
+            last_nudge = time.monotonic()
+        time.sleep(1.0)
+    return False
+
+
+def _traceroute_route(tx_port: str, rx_num: int, rx_port: str) -> list[int] | None:
+    """Run a traceroute TX→RX and return the forward `route` (list of relay node
+    numbers), or None if it couldn't be obtained. Mirrors test_traceroute's
+    request/PKI/retry pattern."""
+    from meshtastic.mesh_interface import MeshInterface
+
+    with ReceiveCollector(tx_port, topic="meshtastic.receive.traceroute") as tx:
+        nudge_nodeinfo_port(rx_port)
+        tx.broadcast_nodeinfo_ping()
+        if not _wait_for_pubkey(tx._iface, rx_num, rx_port, 60.0):
+            return None
+        for _attempt in range(2):
+            try:
+                tx._iface.sendTraceRoute(dest=rx_num, hopLimit=5)
+                break
+            except MeshInterface.MeshInterfaceError:
+                time.sleep(5.0)
+        else:
+            return None
+        pkt = tx.wait_for(lambda p: p.get("from") == rx_num, timeout=8.0)
+        if pkt is None:
+            return None
+        tr = (pkt.get("decoded", {}) or {}).get("traceroute") or {}
+        return [int(n) for n in (tr.get("route") or [])]
+
+
+@pytest.fixture(scope="session")
+def multihop_topology(baked_mesh: dict[str, Any]) -> dict[str, Any]:
+    """Discover a real multi-hop pier (tx → relay → rx) on the bench, or skip.
+
+    Returns {tx_role, tx_port, rx_role, rx_port, rx_num, relay_role, relay_num}.
+    """
+    roles = sorted(baked_mesh)
+    if len(roles) < 3:
+        pytest.skip(
+            "multi-hop NextHop test needs ≥3 baked devices arranged as a line "
+            "(endpoints out of direct RF range). Add a third role via "
+            f"--hub-profile. Detected roles: {roles}"
+        )
+
+    by_role = {r: (baked_mesh[r]["port"], baked_mesh[r]["my_node_num"]) for r in roles}
+    if any(num is None for _, num in by_role.values()):
+        pytest.skip("a baked device is missing my_node_num; can't map the topology")
+
+    _warm_mesh([port for port, _ in by_role.values()])
+
+    # Find an ordered pair that is ≥1 hop apart, using each node's own nodeDB
+    # (cheap — no traceroute yet). On an all-direct bench nothing qualifies.
+    multihop_pair: tuple[str, str] | None = None
+    for a_role in roles:
+        a_port, _ = by_role[a_role]
+        try:
+            with connect(port=a_port) as a_iface:
+                nodes = a_iface.nodesByNum or {}
+        except Exception:  # noqa: BLE001
+            continue
+        for c_role in roles:
+            if c_role == a_role:
+                continue
+            _, c_num = by_role[c_role]
+            hops = _hops_away(nodes.get(c_num, {}))
+            if hops is not None and hops >= 1:
+                multihop_pair = (a_role, c_role)
+                break
+        if multihop_pair:
+            break
+
+    if not multihop_pair:
+        pytest.skip(
+            "no multi-hop pair found — every device appears to be a direct "
+            "neighbor. Arrange the bench as a line (A — B — C) with the "
+            "endpoints out of direct RF range (distance or attenuators) so a "
+            "relay is actually required, then re-run."
+        )
+
+    a_role, c_role = multihop_pair
+    a_port, _ = by_role[a_role]
+    c_port, c_num = by_role[c_role]
+
+    route = _traceroute_route(a_port, c_num, c_port)
+    if not route:
+        pytest.skip(
+            f"{a_role}→{c_role} looked multi-hop but traceroute returned no "
+            "intermediate relay; can't identify the relay node to drive the "
+            "recovery test"
+        )
+
+    relay_num = route[0]
+    relay_role = next((r for r in roles if by_role[r][1] == relay_num), None)
+    return {
+        "tx_role": a_role,
+        "tx_port": a_port,
+        "rx_role": c_role,
+        "rx_port": c_port,
+        "rx_num": c_num,
+        "relay_role": relay_role,
+        "relay_num": relay_num,
+    }
+
+
+@pytest.mark.timeout(300)
+def test_multihop_dm_delivers(multihop_topology: dict[str, Any]) -> None:
+    """A directed wantAck DM that must traverse the relay is delivered.
+
+    Exercises the NextHop routing path end-to-end: TX picks a next hop toward
+    RX (M2 gate), the relay resolves the next_hop byte and forwards (M1), and
+    the route is learned from the returning ACK (M3). Retries absorb transient
+    LoRa loss; the assertion is on eventual delivery.
+    """
+    tx_port = multihop_topology["tx_port"]
+    rx_port = multihop_topology["rx_port"]
+    rx_num = multihop_topology["rx_num"]
+    tx_role = multihop_topology["tx_role"]
+    rx_role = multihop_topology["rx_role"]
+    relay_role = multihop_topology["relay_role"]
+
+    unique = f"nexthop-mh-{tx_role}-to-{rx_role}-{int(time.time())}"
+
+    with ReceiveCollector(rx_port, topic="meshtastic.receive.text") as rx:
+        rx.broadcast_nodeinfo_ping()
+        with connect(port=tx_port) as tx_iface:
+            nudge_nodeinfo(tx_iface)
+            if not _wait_for_pubkey(tx_iface, rx_num, rx_port, 90.0):
+                pytest.skip(
+                    f"{tx_role} never learned {rx_role}'s pubkey over the relay; "
+                    "multi-hop PKI warmup didn't complete"
+                )
+            got = None
+            for _attempt in range(3):
+                pkt = tx_iface.sendText(unique, destinationId=rx_num, wantAck=True)
+                assert pkt is not None
+                got = rx.wait_for(
+                    lambda p: p.get("decoded", {}).get("text") == unique,
+                    timeout=45,
+                )
+                if got is not None:
+                    break
+                rx.broadcast_nodeinfo_ping()
+                nudge_nodeinfo(tx_iface)
+                time.sleep(5.0)
+
+    assert got is not None, (
+        f"multi-hop directed DM {tx_role}→{rx_role} via relay "
+        f"{relay_role!r} never landed — NextHop multi-hop delivery is broken"
+    )
+
+
+@pytest.mark.timeout(600)
+def test_multihop_relay_recovery(
+    multihop_topology: dict[str, Any],
+    power_cycle,  # noqa: ARG001 — forces the uhubctl-availability skip
+) -> None:
+    """Delivery recovers after the established relay drops and returns.
+
+    Establishes a baseline DM (route via relay learned), powers the relay OFF
+    (confirming TX survives sending across a downed relay), then powers it back
+    ON and asserts directed delivery resumes — the M3 stale-route decay /
+    re-learn path. With a strict A — B — C line there is no path while B is down,
+    so we only assert TX doesn't crash during the outage; the delivery assertion
+    is after B returns.
+    """
+    relay_role = multihop_topology["relay_role"]
+    if not relay_role:
+        pytest.skip(
+            "relay node isn't one of the baked hub roles, so it can't be "
+            "power-cycled; recovery test needs a controllable relay"
+        )
+
+    tx_port = multihop_topology["tx_port"]
+    rx_port = multihop_topology["rx_port"]
+    rx_num = multihop_topology["rx_num"]
+    tx_role = multihop_topology["tx_role"]
+    rx_role = multihop_topology["rx_role"]
+
+    base = f"mh-recover-base-{int(time.time())}"
+    post = f"mh-recover-post-{int(time.time())}"
+
+    # Baseline: confirm delivery works (so the route via the relay is learned)
+    # before we perturb anything — otherwise a later failure is ambiguous.
+    with ReceiveCollector(rx_port, topic="meshtastic.receive.text") as rx:
+        rx.broadcast_nodeinfo_ping()
+        with connect(port=tx_port) as tx_iface:
+            nudge_nodeinfo(tx_iface)
+            if not _wait_for_pubkey(tx_iface, rx_num, rx_port, 90.0):
+                pytest.skip("multi-hop PKI warmup failed; can't run recovery test")
+            tx_iface.sendText(base, destinationId=rx_num, wantAck=True)
+            assert (
+                rx.wait_for(
+                    lambda p: p.get("decoded", {}).get("text") == base, timeout=45
+                )
+                is not None
+            ), "baseline multi-hop delivery failed — skipping recovery to avoid a false result"
+
+    # Power the relay OFF.
+    try:
+        _power.power_off(relay_role)
+        _power.wait_for_absence(relay_role, timeout_s=15.0)
+    except Exception as exc:  # noqa: BLE001
+        try:
+            _power.power_on(relay_role)
+            resolve_port_by_role(relay_role, timeout_s=30.0)
+        except Exception:  # noqa: BLE001
+            pass
+        pytest.skip(f"can't power-control relay {relay_role!r}: {exc}")
+
+    # With the only relay down there's no path; we just confirm TX accepts the
+    # send and survives its internal retries (it must not crash / wedge).
+    try:
+        with connect(port=tx_port) as tx_iface:
+            pkt = tx_iface.sendText(
+                f"mh-while-down-{int(time.time())}",
+                destinationId=rx_num,
+                wantAck=True,
+            )
+            assert pkt is not None
+            time.sleep(8.0)  # let retransmissions + route decay run
+    except Exception as exc:  # noqa: BLE001 — restore bench state before failing
+        _power.power_on(relay_role)
+        resolve_port_by_role(relay_role, timeout_s=30.0)
+        raise AssertionError(
+            f"TX crashed sending across a downed relay: {exc}"
+        ) from exc
+
+    # Power the relay back ON and let it re-enumerate + boot.
+    _power.power_on(relay_role)
+    time.sleep(0.5)
+    try:
+        resolve_port_by_role(relay_role, timeout_s=30.0)
+    except Exception:  # noqa: BLE001 — relay port isn't one we connect to directly
+        pass
+    time.sleep(8.0)
+    _warm_mesh([tx_port, rx_port], rounds=1)  # re-flood so the relay re-learns
+
+    # Delivery should resume once the relay is back (M3 re-learn / decay path).
+    got = None
+    with ReceiveCollector(rx_port, topic="meshtastic.receive.text") as rx:
+        rx.broadcast_nodeinfo_ping()
+        with connect(port=tx_port) as tx_iface:
+            nudge_nodeinfo(tx_iface)
+            _wait_for_pubkey(tx_iface, rx_num, rx_port, 90.0)
+            for _attempt in range(4):
+                pkt = tx_iface.sendText(post, destinationId=rx_num, wantAck=True)
+                assert pkt is not None
+                got = rx.wait_for(
+                    lambda p: p.get("decoded", {}).get("text") == post,
+                    timeout=45,
+                )
+                if got is not None:
+                    break
+                rx.broadcast_nodeinfo_ping()
+                nudge_nodeinfo(tx_iface)
+                time.sleep(6.0)
+
+    assert got is not None, (
+        f"after relay {relay_role!r} returned, multi-hop DM {tx_role}→{rx_role} "
+        "never resumed — stale-route recovery (M3) may be broken"
+    )
diff --git a/src/mesh/MeshTypes.h b/src/mesh/MeshTypes.h
index 680926d3c..58ef32150 100644
--- a/src/mesh/MeshTypes.h
+++ b/src/mesh/MeshTypes.h
@@ -45,6 +45,11 @@ enum RxSource {
 // For old firmware there is no relay node set
 #define NO_RELAY_NODE 0
 
+// How recently we must have heard a direct neighbor for its single-byte relay id to be trusted as a
+// unique next hop. Mirrors NUM_ONLINE_SECS (NodeDB.cpp). Used by NodeDB::resolveLastByte() to scope
+// last-byte collision resolution to currently-reachable neighbors.
+#define NEXTHOP_NEIGHBOR_FRESH_SECS (60 * 60 * 2) // 2 hrs
+
 typedef int ErrorCode;
 
 /// Alloc and free packets to our global, ISR safe pool
diff --git a/src/mesh/NextHopRouter.cpp b/src/mesh/NextHopRouter.cpp
index e8613d457..76858ada2 100644
--- a/src/mesh/NextHopRouter.cpp
+++ b/src/mesh/NextHopRouter.cpp
@@ -109,9 +109,18 @@ void NextHopRouter::sniffReceived(const meshtastic_MeshPacket *p, const meshtast
                               &weWereSoleRelayer);
                 if ((weWereRelayer && wasAlreadyRelayer) || (getHopsAway(*p) == 0 && weWereSoleRelayer)) {
                     if (origTx->next_hop != p->relay_node) { // Not already set
-                        LOG_INFO("Update next hop of 0x%x to 0x%x based on ACK/reply (was relayer %d we were sole %d)", p->from,
-                                 p->relay_node, wasAlreadyRelayer, weWereSoleRelayer);
-                        origTx->next_hop = p->relay_node;
+                        // M1/M2: only learn a next hop whose last byte maps to a single plausible relay. On a
+                        // dense mesh the byte may be ambiguous; storing it would aim future DMs at the wrong
+                        // node. If ambiguous/unknown, leave the route unset so we keep flooding (safe).
+                        if (nodeDB->resolveUniqueLastByte(p->relay_node, /*requireDirectNeighbor=*/false)) {
+                            LOG_INFO("Update next hop of 0x%x to 0x%x based on ACK/reply (was relayer %d we were sole %d)",
+                                     p->from, p->relay_node, wasAlreadyRelayer, weWereSoleRelayer);
+                            origTx->next_hop = p->relay_node;
+                            noteRouteLearned(p->from, p->relay_node, millis()); // M3: anchor freshness
+                        } else {
+                            LOG_DEBUG("Not learning next hop for 0x%x: relay byte 0x%x ambiguous/unknown; keep flooding", p->from,
+                                      p->relay_node);
+                        }
                     }
                 }
             }
@@ -144,6 +153,11 @@ bool NextHopRouter::perhapsRebroadcast(const meshtastic_MeshPacket *p)
     if (!isToUs(p) && !isFromUs(p) && (p->hop_limit > 0 || exhaustHops)) {
         if (p->id != 0) {
             if (isRebroadcaster()) {
+                // NOTE: this is a self-identity match (is the addressed next_hop OUR last byte?), so it
+                // cannot be hardened with resolveLastByte() — a remote node that legitimately shares our
+                // last byte will also match here and rebroadcast. That residual collision needs a wider
+                // on-wire field to fix. M1/M2 instead shrink the blast radius by reducing how often an
+                // ambiguous next_hop byte is ever learned (sniffReceived) or originated (getNextHop).
                 if (p->next_hop == NO_NEXT_HOP_PREFERENCE || p->next_hop == nodeDB->getLastByteOfNodeNum(getNodeNum())) {
                     meshtastic_MeshPacket *tosend = packetPool.allocCopy(*p); // keep a copy because we will be sending it
                     LOG_INFO("Rebroadcast received message coming from %x", p->relay_node);
@@ -196,10 +210,29 @@ std::optional<uint8_t> NextHopRouter::getNextHop(NodeNum to, uint8_t relay_node)
 
     meshtastic_NodeInfoLite *node = nodeDB->getMeshNode(to);
     if (node && node->next_hop) {
+        // M3: proactively decay a stale or repeatedly-failing route back to flooding, so a dead hop
+        // isn't trusted on the next DM's first (and on dense meshes, slowest) attempt. We only act on
+        // a health record that still matches the stored byte; a next_hop set by another path (e.g.
+        // TraceRouteModule) with no matching record is left authoritative.
+        RouteHealth *h = findRouteHealth(to);
+        if (h && h->lastNextHop == node->next_hop && isRouteStale(*h, millis())) {
+            LOG_INFO("Next hop 0x%x for 0x%x is stale (age/fails); flood and clear", node->next_hop, to);
+            node->next_hop = NO_NEXT_HOP_PREFERENCE; // clear persisted route
+            clearRouteHealth(to);                    // clear RAM health
+            return std::nullopt;
+        }
+
         // We are careful not to return the relay node as the next hop
         if (node->next_hop != relay_node) {
-            // LOG_DEBUG("Next hop for 0x%x is 0x%x", to, node->next_hop);
-            return node->next_hop;
+            // M1/M2: only emit a stored next_hop if its last byte still maps to a UNIQUE, currently
+            // reachable direct neighbor. On a dense mesh the last byte collides, so an ambiguous byte
+            // would unicast a hint toward the wrong physical node; if the neighbor has gone away we'd
+            // unicast into a void. In both cases flood instead (managed flooding still delivers).
+            ResolvedNode r = nodeDB->resolveLastByte(node->next_hop, /*requireDirectNeighbor=*/true);
+            if (r.status == LastByteResolution::Unique)
+                return node->next_hop;
+            LOG_WARN("Next hop 0x%x for 0x%x %s; set no pref", node->next_hop, to,
+                     r.status == LastByteResolution::Ambiguous ? "ambiguous among neighbors" : "not a known neighbor");
         } else
             LOG_WARN("Next hop for 0x%x is 0x%x, same as relayer; set no pref", to, node->next_hop);
     }
@@ -311,7 +344,10 @@ int32_t NextHopRouter::doRetransmissions()
 
                 if (!isBroadcast(p.packet->to)) {
                     if (p.numRetransmissions == 1) {
-                        // Last retransmission, reset next_hop (fallback to FloodingRouter)
+                        // Last retransmission: this directed delivery went un-ACKed. Record the failure
+                        // (M3 — accumulates across DMs to age out a flapping/dead route) and reset
+                        // next_hop so the final try falls back to FloodingRouter.
+                        noteRouteFailure(p.packet->to);
                         p.packet->next_hop = NO_NEXT_HOP_PREFERENCE;
                         // Also reset it in the nodeDB
                         meshtastic_NodeInfoLite *sentTo = nodeDB->getMeshNode(p.packet->to);
@@ -321,7 +357,25 @@ int32_t NextHopRouter::doRetransmissions()
                         }
                         FloodingRouter::send(packetPool.allocCopy(*p.packet));
                     } else {
+#if NEXTHOP_EARLY_FLOOD_ON_UNVERIFIED
+                        // M4 (gated): if the route isn't proven healthy, don't spend a second directed
+                        // attempt — start flooding one retry sooner to cut recovery latency. A verified
+                        // route (fresh, zero recent failures) keeps the unchanged directed-retry path so
+                        // the sparse-mesh happy path is untouched.
+                        RouteHealth *h = findRouteHealth(p.packet->to);
+                        bool verified = h && h->consecutiveFailures == 0 && !isRouteStale(*h, now);
+                        if (!verified) {
+                            p.packet->next_hop = NO_NEXT_HOP_PREFERENCE;
+                            meshtastic_NodeInfoLite *sentTo = nodeDB->getMeshNode(p.packet->to);
+                            if (sentTo)
+                                sentTo->next_hop = NO_NEXT_HOP_PREFERENCE;
+                            FloodingRouter::send(packetPool.allocCopy(*p.packet));
+                        } else {
+                            NextHopRouter::send(packetPool.allocCopy(*p.packet));
+                        }
+#else
                         NextHopRouter::send(packetPool.allocCopy(*p.packet));
+#endif
                     }
                 } else {
                     // Note: we call the superclass version because we don't want to have our version of send() add a new
@@ -355,3 +409,96 @@ void NextHopRouter::setNextTx(PendingPacket *pending)
     printPacket("", pending->packet);
     setReceivedMessage(); // Run ASAP, so we can figure out our correct sleep time
 }
+
+// ---------------------------------------------------------------------------
+// M3: RAM route-health table. Bounded array with reuse-oldest eviction (same discipline as
+// PacketHistory). All age comparisons use unsigned subtraction so they survive the 49.7-day millis()
+// rollover. dest == 0 marks an empty slot; learnedAtMsec is normalized to 1 on write so an occupied
+// slot is never read as infinitely old.
+// ---------------------------------------------------------------------------
+
+RouteHealth *NextHopRouter::findRouteHealth(NodeNum dest)
+{
+    if (dest == 0)
+        return nullptr;
+    for (auto &h : routeHealth)
+        if (h.dest == dest)
+            return &h;
+    return nullptr;
+}
+
+RouteHealth *NextHopRouter::getOrAllocRouteHealth(NodeNum dest, uint32_t now)
+{
+    if (dest == 0)
+        return nullptr;
+
+    RouteHealth *oldest = &routeHealth[0];
+    RouteHealth *freeSlot = nullptr;
+    for (auto &h : routeHealth) {
+        if (h.dest == dest)
+            return &h; // existing record
+        if (h.dest == 0) {
+            if (!freeSlot)
+                freeSlot = &h; // remember the first free slot; prefer it over evicting
+            continue;
+        }
+        // Track the oldest occupied slot in case the table is full (rollover-safe).
+        if ((uint32_t)(now - h.learnedAtMsec) > (uint32_t)(now - oldest->learnedAtMsec))
+            oldest = &h;
+    }
+    // Claim the free slot if there is one, else reuse the oldest. Reset before use and stamp the dest
+    // so the record is findable.
+    RouteHealth *slot = freeSlot ? freeSlot : oldest;
+    *slot = RouteHealth{};
+    slot->dest = dest;
+    return slot;
+}
+
+void NextHopRouter::noteRouteLearned(NodeNum dest, uint8_t nextHop, uint32_t now)
+{
+    if (dest == 0 || nextHop == NO_NEXT_HOP_PREFERENCE)
+        return;
+    RouteHealth *h = getOrAllocRouteHealth(dest, now);
+    if (!h)
+        return;
+    // A genuinely new next hop earns a clean slate; re-learning the SAME hop keeps the accumulated
+    // failure count so an asymmetric reverse path that keeps re-teaching a dead forward hop still ages
+    // out instead of resetting the counter every time.
+    if (h->lastNextHop != nextHop) {
+        h->lastNextHop = nextHop;
+        h->consecutiveFailures = 0;
+    }
+    h->learnedAtMsec = now ? now : 1;
+}
+
+void NextHopRouter::noteRouteSuccess(NodeNum dest, uint32_t now)
+{
+    RouteHealth *h = findRouteHealth(dest);
+    if (!h)
+        return; // only routes we actually learned have health to refresh
+    h->consecutiveFailures = 0;
+    h->learnedAtMsec = now ? now : 1;
+}
+
+void NextHopRouter::noteRouteFailure(NodeNum dest)
+{
+    RouteHealth *h = findRouteHealth(dest);
+    if (!h)
+        return; // nothing to penalize (we were flooding, or never learned a route here)
+    if (h->consecutiveFailures < 255)
+        h->consecutiveFailures++;
+}
+
+bool NextHopRouter::isRouteStale(const RouteHealth &h, uint32_t now) const
+{
+    if (h.consecutiveFailures >= ROUTE_FAILURE_THRESHOLD)
+        return true;
+    return (uint32_t)(now - h.learnedAtMsec) >= ROUTE_TTL_MSEC;
+}
+
+void NextHopRouter::clearRouteHealth(NodeNum dest)
+{
+    RouteHealth *h = findRouteHealth(dest);
+    if (h)
+        *h = RouteHealth{};
+}
diff --git a/src/mesh/NextHopRouter.h b/src/mesh/NextHopRouter.h
index 42ef13cd9..467d9ca68 100644
--- a/src/mesh/NextHopRouter.h
+++ b/src/mesh/NextHopRouter.h
@@ -43,6 +43,28 @@ struct PendingPacket {
     explicit PendingPacket(meshtastic_MeshPacket *p, uint8_t numRetransmissions);
 };
 
+/**
+ * RAM-only per-destination route health. Tracks how fresh a learned next_hop is and how many
+ * consecutive directed deliveries to it have failed, so getNextHop() can proactively decay a stale or
+ * repeatedly-failing route back to flooding instead of trusting a dead hop on the next (and on dense
+ * meshes, slowest) attempt. Not persisted: the learned next_hop itself lives in NodeInfoLite; this is
+ * just freshness/failure metadata.
+ */
+struct RouteHealth {
+    NodeNum dest = 0;                             ///< destination this record describes; 0 == empty slot
+    uint32_t learnedAtMsec = 0;                   ///< millis() when next_hop was last (re)learned (rollover-aware)
+    uint8_t consecutiveFailures = 0;              ///< directed deliveries to `dest` that went un-ACKed
+    uint8_t lastNextHop = NO_NEXT_HOP_PREFERENCE; ///< the relay byte this health refers to
+};
+
+// M4 (optional, off by default): when a route is not proven healthy, fall back to flooding one retry
+// earlier instead of spending a second directed attempt. Trades airtime for recovery latency on dense
+// meshes; leaves the sparse-mesh happy path (fresh, verified routes) unchanged. Measure on the
+// simulator before enabling broadly.
+#ifndef NEXTHOP_EARLY_FLOOD_ON_UNVERIFIED
+#define NEXTHOP_EARLY_FLOOD_ON_UNVERIFIED 0
+#endif
+
 class GlobalPacketIdHashFunction
 {
   public:
@@ -92,12 +114,22 @@ class NextHopRouter : public FloodingRouter
     // The number of retransmissions the original sender will do
     constexpr static uint8_t NUM_RELIABLE_RETX = 3;
 
+    // M3: bounded RAM route-health table (reuse-oldest eviction, like PacketHistory)
+    constexpr static uint8_t ROUTE_HEALTH_MAX = 32;              // ~12B/slot -> ~384B
+    constexpr static uint32_t ROUTE_TTL_MSEC = 30UL * 60 * 1000; // re-discover a route unconfirmed for 30 min
+    constexpr static uint8_t ROUTE_FAILURE_THRESHOLD = 3;        // consecutive un-ACKed directed deliveries -> dead
+
   protected:
     /**
      * Pending retransmissions
      */
     std::unordered_map<GlobalPacketId, PendingPacket, GlobalPacketIdHashFunction> pending;
 
+    /**
+     * Per-destination route health (M3). Bounded array, reuse-oldest eviction. RAM-only.
+     */
+    RouteHealth routeHealth[ROUTE_HEALTH_MAX] = {};
+
     /**
      * Should this incoming filter be dropped?
      *
@@ -142,13 +174,38 @@ class NextHopRouter : public FloodingRouter
 
     void setNextTx(PendingPacket *pending);
 
+    // --- M3 route-health helpers (RAM-only). Protected so ReliableRouter (a subclass) can record
+    // delivery success, and so the unit-test shim can reach them via `using`. All take `now` where
+    // time matters so the decay logic is pure and testable without a clock mock. ---
+
+    /// @return the health record for `dest`, or nullptr if we hold none.
+    RouteHealth *findRouteHealth(NodeNum dest);
+    /// @return an existing record for `dest`, else a freshly claimed slot (reuse-oldest on overflow).
+    RouteHealth *getOrAllocRouteHealth(NodeNum dest, uint32_t now);
+    /// Record that we (re)learned `nextHop` for `dest`. Resets the failure count only when the hop
+    /// changed (so a flapping reverse-path re-learn of the same dead hop still ages out).
+    void noteRouteLearned(NodeNum dest, uint8_t nextHop, uint32_t now);
+    /// Record an end-to-end delivery success to `dest` (clears failures, refreshes freshness).
+    void noteRouteSuccess(NodeNum dest, uint32_t now);
+    /// Record that a directed delivery to `dest` went un-ACKed (no-op if we hold no record).
+    void noteRouteFailure(NodeNum dest);
+    /// @return true if the route is too old (TTL) or has failed too many times in a row.
+    bool isRouteStale(const RouteHealth &h, uint32_t now) const;
+    /// Forget any health record for `dest`.
+    void clearRouteHealth(NodeNum dest);
+
+#ifdef PIO_UNIT_TESTING
+  public: // expose getNextHop to the test shim without widening production visibility
+#else
   private:
+#endif
     /**
      * Get the next hop for a destination, given the relay node
      * @return the node number of the next hop, 0 if no preference (fallback to FloodingRouter)
      */
     std::optional<uint8_t> getNextHop(NodeNum to, uint8_t relay_node);
 
+  private:
     /** Check if we should be rebroadcasting this packet if so, do so.
      *  @return true if we did rebroadcast */
     bool perhapsRebroadcast(const meshtastic_MeshPacket *p) override;
diff --git a/src/mesh/NodeDB.cpp b/src/mesh/NodeDB.cpp
index 2481eb8ca..ab24dc529 100644
--- a/src/mesh/NodeDB.cpp
+++ b/src/mesh/NodeDB.cpp
@@ -2942,6 +2942,73 @@ meshtastic_NodeInfoLite *NodeDB::getMeshNode(NodeNum n)
     return NULL;
 }
 
+ResolvedNode NodeDB::resolveLastByte(uint8_t lastByte, bool requireDirectNeighbor)
+{
+    ResolvedNode result; // defaults to {None, 0}
+
+    // 0 is the NO_RELAY_NODE / NO_NEXT_HOP_PREFERENCE sentinel (also what MQTT-sourced packets carry
+    // when hop_start==0). getLastByteOfNodeNum() never yields 0, so nothing can legitimately match.
+    if (lastByte == 0)
+        return result;
+
+    const NodeNum self = getNodeNum();
+    NodeNum firstMatch = 0;
+    uint8_t matches = 0;
+
+    for (size_t i = 0; i < numMeshNodes; i++) {
+        const meshtastic_NodeInfoLite *node = &meshNodes->at(i);
+
+        // Candidate gate: never resolve to ourselves, the sentinels, or an ignored node.
+        if (node->num == self || node->num == 0 || node->num == NODENUM_BROADCAST)
+            continue;
+        if (nodeInfoLiteIsIgnored(node))
+            continue;
+        if (getLastByteOfNodeNum(node->num) != lastByte) // cheapest discriminator last
+            continue;
+
+        // Relevance gate: is this node a plausible relay for the requested scope?
+        bool relevant;
+        if (requireDirectNeighbor) {
+            relevant = node->has_hops_away && node->hops_away == 0 && sinceLastSeen(node) < NEXTHOP_NEIGHBOR_FRESH_SECS;
+        } else {
+            const bool directNeighbor = node->has_hops_away && node->hops_away == 0;
+            const bool routerRole =
+                IS_ONE_OF(node->role, meshtastic_Config_DeviceConfig_Role_ROUTER, meshtastic_Config_DeviceConfig_Role_ROUTER_LATE,
+                          meshtastic_Config_DeviceConfig_Role_CLIENT_BASE);
+            relevant = directNeighbor || nodeInfoLiteIsFavorite(node) || routerRole;
+        }
+        if (!relevant)
+            continue;
+
+        if (++matches == 1) {
+            firstMatch = node->num;
+        } else {
+            // A second relevant candidate shares this byte: ambiguous. No further scanning can
+            // change that, so stop early and report the collision.
+            result.status = LastByteResolution::Ambiguous;
+            result.num = 0;
+            return result;
+        }
+    }
+
+    if (matches == 1) {
+        result.status = LastByteResolution::Unique;
+        result.num = firstMatch;
+    }
+    return result;
+}
+
+bool NodeDB::resolveUniqueLastByte(uint8_t lastByte, bool requireDirectNeighbor, NodeNum *outNum)
+{
+    ResolvedNode r = resolveLastByte(lastByte, requireDirectNeighbor);
+    if (r.status == LastByteResolution::Unique) {
+        if (outNum)
+            *outNum = r.num;
+        return true;
+    }
+    return false;
+}
+
 // returns true if the maximum number of nodes is reached or we are running low on memory
 bool NodeDB::isFull()
 {
diff --git a/src/mesh/NodeDB.h b/src/mesh/NodeDB.h
index 3fe244294..5a3b3c551 100644
--- a/src/mesh/NodeDB.h
+++ b/src/mesh/NodeDB.h
@@ -114,6 +114,20 @@ uint32_t sinceLastSeen(const meshtastic_NodeInfoLite *n);
 /// Given a packet, return how many seconds in the past (vs now) it was received
 uint32_t sinceReceived(const meshtastic_MeshPacket *p);
 
+/// Outcome of mapping a single on-wire last-byte (next_hop / relay_node) back to a full NodeNum.
+/// Because the wire only carries the last byte of a 32-bit node number, the mapping is ambiguous on
+/// dense meshes (the "birthday problem"). Callers must treat Ambiguous and None as "don't trust it".
+enum class LastByteResolution : uint8_t {
+    None,      ///< no relevant candidate node has this last byte
+    Unique,    ///< exactly one relevant candidate -> `num` is valid
+    Ambiguous, ///< two or more relevant candidates collide on this byte
+};
+
+struct ResolvedNode {
+    LastByteResolution status = LastByteResolution::None;
+    NodeNum num = 0; ///< valid only when status == Unique
+};
+
 /// Given a packet, return the number of hops used to reach this node.
 /// Returns defaultIfUnknown if the number of hops couldn't be determined.
 int8_t getHopsAway(const meshtastic_MeshPacket &p, int8_t defaultIfUnknown = -1);
@@ -296,6 +310,23 @@ class NodeDB
     virtual meshtastic_NodeInfoLite *getMeshNode(NodeNum n);
     size_t getNumMeshNodes() { return numMeshNodes; }
 
+    /**
+     * Resolve a single on-wire last-byte (e.g. next_hop / relay_node) back to a unique full NodeNum,
+     * detecting last-byte collisions instead of silently picking the first match. A 1-byte id only
+     * needs to be unique among a node's plausible relays, not the whole mesh, so we scope the search:
+     *  - requireDirectNeighbor == true  : candidates are direct neighbors (hops_away==0) heard within
+     *                                     NEXTHOP_NEIGHBOR_FRESH_SECS. Use on the SEND path.
+     *  - requireDirectNeighbor == false : also accept favorites and router-role nodes (unknown hop
+     *                                     distance allowed). Use when learning / preserving hops.
+     * Ignored nodes, our own node, and the broadcast/0 sentinels are never candidates. On a tie the
+     * result is Ambiguous (no tie-break) so callers fall back to flooding rather than misroute.
+     */
+    ResolvedNode resolveLastByte(uint8_t lastByte, bool requireDirectNeighbor);
+
+    /// Convenience wrapper around resolveLastByte(): true iff exactly one relevant candidate matches.
+    /// Ambiguous and None both return false (the safe answer for learning / hop preservation).
+    bool resolveUniqueLastByte(uint8_t lastByte, bool requireDirectNeighbor, NodeNum *outNum = nullptr);
+
     // Thread-safe satellite-map accessors. Return false if absent or the
     // corresponding DB is compiled out.
     bool copyNodePosition(NodeNum n, meshtastic_PositionLite &out) const;
diff --git a/src/mesh/PacketHistory.cpp b/src/mesh/PacketHistory.cpp
index f5c36b083..e4f565d1a 100644
--- a/src/mesh/PacketHistory.cpp
+++ b/src/mesh/PacketHistory.cpp
@@ -486,7 +486,11 @@ bool PacketHistory::wasRelayer(const uint8_t relayer, const uint32_t id, const N
 }
 
 /* Check if a certain node was a relayer of a packet in the history given iterator
- * @return true if node was indeed a relayer, false if not */
+ * @return true if node was indeed a relayer, false if not
+ * NOTE: intentionally byte-domain. Both `relayer` and relayed_by[] are on-wire last bytes, so this
+ * answers "did a relayer with this byte touch the packet" — correct without resolving to a NodeNum.
+ * The collision risk is neutralized where the result is consumed (route learning in
+ * NextHopRouter::sniffReceived now gates the write through NodeDB::resolveUniqueLastByte). */
 bool PacketHistory::wasRelayer(const uint8_t relayer, const PacketRecord &r, bool *wasSole)
 {
     bool found = false;
diff --git a/src/mesh/ReliableRouter.cpp b/src/mesh/ReliableRouter.cpp
index a1cfb1200..3126c820e 100644
--- a/src/mesh/ReliableRouter.cpp
+++ b/src/mesh/ReliableRouter.cpp
@@ -151,6 +151,10 @@ void ReliableRouter::sniffReceived(const meshtastic_MeshPacket *p, const meshtas
             LOG_DEBUG("Received a %s for 0x%x, stopping retransmissions", ackId ? "ACK" : "NAK", ackId);
             if (ackId) {
                 stopRetransmission(p->to, ackId);
+                // M3: an end-to-end ACK proves the directed route to the ACK's sender currently works,
+                // so clear its failure count and refresh freshness (keeps a good route pinned).
+                if (!isBroadcast(getFrom(p)))
+                    noteRouteSuccess(getFrom(p), millis());
             } else {
                 stopRetransmission(p->to, nakId);
             }
diff --git a/src/mesh/Router.cpp b/src/mesh/Router.cpp
index f176f60e6..2f3142087 100644
--- a/src/mesh/Router.cpp
+++ b/src/mesh/Router.cpp
@@ -114,37 +114,24 @@ bool Router::shouldDecrementHopLimit(const meshtastic_MeshPacket *p)
     }
 #endif
 
-    // For subsequent hops, check if previous relay is a favorite router
-    // Optimized search for favorite routers with matching last byte
-    // Check ordering optimized for IoT devices (cheapest checks first)
-    for (size_t i = 0; i < nodeDB->getNumMeshNodes(); i++) {
-        meshtastic_NodeInfoLite *node = nodeDB->getMeshNodeByIndex(i);
-        if (!node)
-            continue;
-
-        // Check 1: is_favorite (cheapest - single bit test)
-        if (!nodeInfoLiteIsFavorite(node))
-            continue;
-
-        // Check 2: has_user (cheap - single bit test)
-        if (!nodeInfoLiteHasUser(node))
-            continue;
-
-        // Check 3: role check (moderate cost - multiple comparisons)
-        if (!IS_ONE_OF(node->role, meshtastic_Config_DeviceConfig_Role_ROUTER, meshtastic_Config_DeviceConfig_Role_ROUTER_LATE,
-                       meshtastic_Config_DeviceConfig_Role_CLIENT_BASE)) {
-            continue;
-        }
-
-        // Check 4: last byte extraction and comparison (most expensive)
-        if (nodeDB->getLastByteOfNodeNum(node->num) == p->relay_node) {
-            // Found a favorite router match
-            LOG_DEBUG("Identified favorite relay router 0x%x from last byte 0x%x", node->num, p->relay_node);
+    // For subsequent hops, preserve hop_limit only when the previous relay is UNAMBIGUOUSLY a favorite
+    // router. The relay_node byte is just the last byte of a 32-bit node number, so on a dense mesh it
+    // collides; the old "first matching node wins" scan could preserve hops for the wrong node
+    // (non-deterministic, depends on NodeDB order). resolveLastByte() reports a collision instead, and
+    // we re-check the favorite/router predicate on the single resolved node. On ambiguity/none we
+    // decrement (the safe default).
+    NodeNum resolved = 0;
+    if (nodeDB->resolveUniqueLastByte(p->relay_node, /*requireDirectNeighbor=*/false, &resolved)) {
+        meshtastic_NodeInfoLite *node = nodeDB->getMeshNode(resolved);
+        if (node && nodeInfoLiteIsFavorite(node) && nodeInfoLiteHasUser(node) &&
+            IS_ONE_OF(node->role, meshtastic_Config_DeviceConfig_Role_ROUTER, meshtastic_Config_DeviceConfig_Role_ROUTER_LATE,
+                      meshtastic_Config_DeviceConfig_Role_CLIENT_BASE)) {
+            LOG_DEBUG("Identified unique favorite relay router 0x%x from last byte 0x%x", resolved, p->relay_node);
             return false; // Don't decrement hop_limit
         }
     }
 
-    // No favorite router match found, decrement hop_limit
+    // No unambiguous favorite router match found, decrement hop_limit
     return true;
 }
 
diff --git a/test/test_nexthop_routing/test_main.cpp b/test/test_nexthop_routing/test_main.cpp
new file mode 100644
index 000000000..e766cbdee
--- /dev/null
+++ b/test/test_nexthop_routing/test_main.cpp
@@ -0,0 +1,465 @@
+// Unit tests for NextHop direct-message reliability mitigations (see docs/nexthop-routing-reliability.md):
+//   M1 - NodeDB::resolveLastByte / resolveUniqueLastByte (ambiguity-aware last-byte resolution)
+//   M2 - NextHopRouter::getNextHop strict-neighbor gate + Router::shouldDecrementHopLimit favorite check
+//   M3 - NextHopRouter route-health freshness / failure decay
+//
+// Time handling: the route-health helpers take `now` as a parameter so the 30-minute TTL logic is
+// pure and testable without a clock mock. getNextHop()/sinceLastSeen() use the real native clock;
+// we back-date timestamps relative to it, and the unsigned-subtraction age math is rollover-safe.
+
+#include "MeshTypes.h" // before TestUtil.h: provides NodeNum etc.
+#include "TestUtil.h"
+#include <unity.h>
+
+#include "configuration.h"
+#include "gps/RTC.h"
+#include "mesh/NextHopRouter.h"
+#include "mesh/NodeDB.h"
+#include <cstdio>
+#include <cstring>
+#include <memory>
+
+#define MSG_BUF_LEN 200
+#define TEST_MSG_FMT(fmt, ...)                                                                                                   \
+    do {                                                                                                                         \
+        char _buf[MSG_BUF_LEN];                                                                                                  \
+        snprintf(_buf, sizeof(_buf), fmt, __VA_ARGS__);                                                                          \
+        TEST_MESSAGE(_buf);                                                                                                      \
+    } while (0)
+
+static constexpr NodeNum kLocalNode = 0x11111111; // last byte 0x11
+
+// ---------------------------------------------------------------------------
+// MockNodeDB — inject nodes with controlled last byte, hop distance, age, role, favorite flag.
+// ---------------------------------------------------------------------------
+class MockNodeDB : public NodeDB
+{
+  public:
+    void clearTestNodes()
+    {
+        testNodes.clear();
+        meshNodes = &testNodes;
+        numMeshNodes = 0;
+    }
+
+    // ageSecs is how long ago we last heard the node; getTime() returns a large Unix timestamp on
+    // native, so getTime()-ageSecs does not underflow for the ranges used here.
+    void addNode(NodeNum num, uint8_t hopsAway, bool hasHops, uint32_t ageSecs,
+                 meshtastic_Config_DeviceConfig_Role role = meshtastic_Config_DeviceConfig_Role_CLIENT, bool favorite = false,
+                 bool ignored = false, uint8_t nextHop = NO_NEXT_HOP_PREFERENCE)
+    {
+        meshtastic_NodeInfoLite node = meshtastic_NodeInfoLite_init_zero;
+        node.num = num;
+        node.has_hops_away = hasHops;
+        node.hops_away = hopsAway;
+        node.role = role;
+        node.next_hop = nextHop;
+        node.last_heard = getTime() - ageSecs;
+        nodeInfoLiteSetBit(&node, NODEINFO_BITFIELD_IS_FAVORITE_MASK, favorite);
+        nodeInfoLiteSetBit(&node, NODEINFO_BITFIELD_IS_IGNORED_MASK, ignored);
+        nodeInfoLiteSetBit(&node, NODEINFO_BITFIELD_HAS_USER_MASK, true);
+        testNodes.push_back(node);
+        meshNodes = &testNodes;
+        numMeshNodes = testNodes.size();
+    }
+
+    std::vector<meshtastic_NodeInfoLite> testNodes;
+};
+
+// ---------------------------------------------------------------------------
+// Test shim — expose getNextHop and the route-health helpers; reset health between tests.
+// Nulls cryptLock so the Router base can be (re)constructed (same pattern as test_mqtt MockRouter).
+// ---------------------------------------------------------------------------
+class NextHopRouterTestShim : public NextHopRouter
+{
+  public:
+    NextHopRouterTestShim() : NextHopRouter()
+    {
+        delete cryptLock;
+        cryptLock = nullptr;
+    }
+
+    using NextHopRouter::clearRouteHealth;
+    using NextHopRouter::findRouteHealth;
+    using NextHopRouter::getNextHop;
+    using NextHopRouter::getOrAllocRouteHealth;
+    using NextHopRouter::isRouteStale;
+    using NextHopRouter::noteRouteFailure;
+    using NextHopRouter::noteRouteLearned;
+    using NextHopRouter::noteRouteSuccess;
+    using Router::shouldDecrementHopLimit; // protected in Router
+
+    void resetRouteHealthForTest()
+    {
+        for (auto &h : routeHealth)
+            h = RouteHealth{};
+    }
+};
+
+static MockNodeDB *mockNodeDB = nullptr;
+static NextHopRouterTestShim *shim = nullptr;
+
+static constexpr uint32_t TTL = NextHopRouter::ROUTE_TTL_MSEC;
+static constexpr uint8_t THRESH = NextHopRouter::ROUTE_FAILURE_THRESHOLD;
+static constexpr uint8_t HEALTH_MAX = NextHopRouter::ROUTE_HEALTH_MAX;
+
+// Helper: a decoded packet whose hops-away is `hopsAway`, relayed by last byte `relay`.
+static meshtastic_MeshPacket makeRelayedPacket(uint8_t relay, uint8_t hopsAway)
+{
+    meshtastic_MeshPacket p = meshtastic_MeshPacket_init_zero;
+    p.which_payload_variant = meshtastic_MeshPacket_decoded_tag;
+    p.relay_node = relay;
+    p.hop_start = 4;
+    p.hop_limit = 4 - hopsAway; // getHopsAway() == hop_start - hop_limit
+    return p;
+}
+
+void setUp(void)
+{
+    myNodeInfo.my_node_num = kLocalNode;
+    config.device.role = meshtastic_Config_DeviceConfig_Role_CLIENT;
+    mockNodeDB->clearTestNodes();
+    shim->resetRouteHealthForTest();
+}
+
+void tearDown(void) {}
+
+// ===========================================================================
+// Group 1 — resolveLastByte (M1)
+// ===========================================================================
+
+void test_resolve_none_when_empty(void)
+{
+    ResolvedNode r = mockNodeDB->resolveLastByte(0xAB, true);
+    TEST_ASSERT_EQUAL(LastByteResolution::None, r.status);
+}
+
+void test_resolve_zero_byte_is_none(void)
+{
+    // 0 is the NO_RELAY_NODE / NO_NEXT_HOP_PREFERENCE sentinel — never resolves.
+    mockNodeDB->addNode(0x22222200, 0, true, 60); // last byte maps to 0xFF, not 0
+    TEST_ASSERT_EQUAL(LastByteResolution::None, mockNodeDB->resolveLastByte(0x00, true).status);
+    TEST_ASSERT_EQUAL(LastByteResolution::None, mockNodeDB->resolveLastByte(0x00, false).status);
+}
+
+void test_resolve_unique_neighbor(void)
+{
+    mockNodeDB->addNode(0x000005AB, 0, true, 60); // direct, fresh, last byte 0xAB
+    ResolvedNode r = mockNodeDB->resolveLastByte(0xAB, true);
+    TEST_ASSERT_EQUAL(LastByteResolution::Unique, r.status);
+    TEST_ASSERT_EQUAL_HEX32(0x000005AB, r.num);
+}
+
+void test_resolve_collision_is_ambiguous(void)
+{
+    // Birthday collision: two fresh direct neighbors share last byte 0xAB.
+    mockNodeDB->addNode(0x000005AB, 0, true, 60);
+    mockNodeDB->addNode(0x000006AB, 0, true, 60);
+    ResolvedNode r = mockNodeDB->resolveLastByte(0xAB, true);
+    TEST_ASSERT_EQUAL(LastByteResolution::Ambiguous, r.status);
+    TEST_ASSERT_EQUAL_HEX32(0, r.num); // never silently picks one
+}
+
+void test_resolve_strict_excludes_stale(void)
+{
+    mockNodeDB->addNode(0x000005AB, 0, true, 60);                                // fresh
+    mockNodeDB->addNode(0x000006AB, 0, true, NEXTHOP_NEIGHBOR_FRESH_SECS + 100); // stale
+    ResolvedNode r = mockNodeDB->resolveLastByte(0xAB, true);
+    TEST_ASSERT_EQUAL(LastByteResolution::Unique, r.status);
+    TEST_ASSERT_EQUAL_HEX32(0x000005AB, r.num);
+}
+
+void test_resolve_strict_excludes_far(void)
+{
+    mockNodeDB->addNode(0x000005AB, 0, true, 60); // direct neighbor
+    mockNodeDB->addNode(0x000006AB, 2, true, 60); // 2 hops away -> not a direct neighbor
+    ResolvedNode r = mockNodeDB->resolveLastByte(0xAB, true);
+    TEST_ASSERT_EQUAL(LastByteResolution::Unique, r.status);
+    TEST_ASSERT_EQUAL_HEX32(0x000005AB, r.num);
+}
+
+void test_resolve_lenient_includes_favorite_router(void)
+{
+    // Unknown hop distance, but a favorite ROUTER: lenient gate accepts, strict gate does not.
+    mockNodeDB->addNode(0x000007AB, 0, false, 60, meshtastic_Config_DeviceConfig_Role_ROUTER, /*favorite=*/true);
+    TEST_ASSERT_EQUAL(LastByteResolution::Unique, mockNodeDB->resolveLastByte(0xAB, false).status);
+    TEST_ASSERT_EQUAL(LastByteResolution::None, mockNodeDB->resolveLastByte(0xAB, true).status);
+}
+
+void test_resolve_lenient_collision_favorite_plus_neighbor(void)
+{
+    mockNodeDB->addNode(0x000007AB, 0, false, 60, meshtastic_Config_DeviceConfig_Role_ROUTER, /*favorite=*/true);
+    mockNodeDB->addNode(0x000005AB, 0, true, 60); // direct neighbor, same byte
+    TEST_ASSERT_EQUAL(LastByteResolution::Ambiguous, mockNodeDB->resolveLastByte(0xAB, false).status);
+}
+
+void test_resolve_skips_self(void)
+{
+    // A node equal to us (last byte 0x11) must never resolve to ourselves.
+    mockNodeDB->addNode(kLocalNode, 0, true, 60);
+    TEST_ASSERT_EQUAL(LastByteResolution::None, mockNodeDB->resolveLastByte(0x11, true).status);
+}
+
+void test_resolve_skips_ignored(void)
+{
+    mockNodeDB->addNode(0x000005AB, 0, true, 60, meshtastic_Config_DeviceConfig_Role_CLIENT, false, /*ignored=*/true);
+    TEST_ASSERT_EQUAL(LastByteResolution::None, mockNodeDB->resolveLastByte(0xAB, true).status);
+}
+
+void test_resolve_0x00_maps_to_0xFF_and_collides(void)
+{
+    // getLastByteOfNodeNum() maps ...00 -> 0xFF, so a ...00 node and a ...FF node collide on 0xFF.
+    TEST_ASSERT_EQUAL_HEX8(0xFF, mockNodeDB->getLastByteOfNodeNum(0x11111100));
+    mockNodeDB->addNode(0x11111100, 0, true, 60); // last byte 0xFF (remapped)
+    TEST_ASSERT_EQUAL(LastByteResolution::Unique, mockNodeDB->resolveLastByte(0xFF, true).status);
+    mockNodeDB->addNode(0x222222FF, 0, true, 60); // genuine ...FF
+    TEST_ASSERT_EQUAL(LastByteResolution::Ambiguous, mockNodeDB->resolveLastByte(0xFF, true).status);
+}
+
+void test_resolve_unique_helper(void)
+{
+    mockNodeDB->addNode(0x000005AB, 0, true, 60);
+    NodeNum out = 0;
+    TEST_ASSERT_TRUE(mockNodeDB->resolveUniqueLastByte(0xAB, true, &out));
+    TEST_ASSERT_EQUAL_HEX32(0x000005AB, out);
+    mockNodeDB->addNode(0x000006AB, 0, true, 60); // create collision
+    TEST_ASSERT_FALSE(mockNodeDB->resolveUniqueLastByte(0xAB, true));
+}
+
+// ===========================================================================
+// Group 2 — getNextHop (M2 send-path gate + M3 decay)
+// ===========================================================================
+
+static constexpr NodeNum DEST = 0x000000B0; // DM destination (last byte 0xB0, distinct from 0xAB)
+
+void test_getnexthop_unique_returns_byte(void)
+{
+    mockNodeDB->addNode(DEST, 2, true, 60, meshtastic_Config_DeviceConfig_Role_CLIENT, false, false, /*nextHop=*/0xAB);
+    mockNodeDB->addNode(0x000005AB, 0, true, 60); // unique fresh neighbor with byte 0xAB
+    auto nh = shim->getNextHop(DEST, /*relay=*/0x11);
+    TEST_ASSERT_TRUE(nh.has_value());
+    TEST_ASSERT_EQUAL_HEX8(0xAB, nh.value());
+}
+
+void test_getnexthop_ambiguous_floods(void)
+{
+    mockNodeDB->addNode(DEST, 2, true, 60, meshtastic_Config_DeviceConfig_Role_CLIENT, false, false, /*nextHop=*/0xAB);
+    mockNodeDB->addNode(0x000005AB, 0, true, 60);
+    mockNodeDB->addNode(0x000006AB, 0, true, 60); // collision -> ambiguous
+    TEST_ASSERT_FALSE(shim->getNextHop(DEST, 0x11).has_value());
+}
+
+void test_getnexthop_vanished_neighbor_floods(void)
+{
+    mockNodeDB->addNode(DEST, 2, true, 60, meshtastic_Config_DeviceConfig_Role_CLIENT, false, false, /*nextHop=*/0xAB);
+    // The only 0xAB node is stale -> strict gate yields None -> flood.
+    mockNodeDB->addNode(0x000005AB, 0, true, NEXTHOP_NEIGHBOR_FRESH_SECS + 100);
+    TEST_ASSERT_FALSE(shim->getNextHop(DEST, 0x11).has_value());
+}
+
+void test_getnexthop_split_horizon_floods(void)
+{
+    mockNodeDB->addNode(DEST, 2, true, 60, meshtastic_Config_DeviceConfig_Role_CLIENT, false, false, /*nextHop=*/0xAB);
+    mockNodeDB->addNode(0x000005AB, 0, true, 60);
+    // relay_node == stored next_hop -> don't send it back the way it came.
+    TEST_ASSERT_FALSE(shim->getNextHop(DEST, /*relay=*/0xAB).has_value());
+}
+
+void test_getnexthop_broadcast_is_nullopt(void)
+{
+    TEST_ASSERT_FALSE(shim->getNextHop(NODENUM_BROADCAST, 0x11).has_value());
+}
+
+void test_getnexthop_decays_stale_route(void)
+{
+    mockNodeDB->addNode(DEST, 2, true, 60, meshtastic_Config_DeviceConfig_Role_CLIENT, false, false, /*nextHop=*/0xAB);
+    mockNodeDB->addNode(0x000005AB, 0, true, 60); // a valid unique neighbor exists...
+    // ...but the health record is older than the TTL, so the route should decay to flooding.
+    shim->noteRouteLearned(DEST, 0xAB, millis() - (TTL + 5000));
+    TEST_ASSERT_FALSE(shim->getNextHop(DEST, 0x11).has_value());
+    // Decay also clears the persisted next_hop and the RAM health record.
+    TEST_ASSERT_EQUAL_HEX8(NO_NEXT_HOP_PREFERENCE, mockNodeDB->getMeshNode(DEST)->next_hop);
+    TEST_ASSERT_NULL(shim->findRouteHealth(DEST));
+}
+
+// ===========================================================================
+// Group 3 — route-health helpers (M3)
+// ===========================================================================
+
+void test_health_fresh_not_stale(void)
+{
+    shim->noteRouteLearned(DEST, 0xAB, 1000);
+    RouteHealth *h = shim->findRouteHealth(DEST);
+    TEST_ASSERT_NOT_NULL(h);
+    TEST_ASSERT_FALSE(shim->isRouteStale(*h, 1000 + TTL - 1));
+}
+
+void test_health_ttl_expiry(void)
+{
+    shim->noteRouteLearned(DEST, 0xAB, 1000);
+    RouteHealth *h = shim->findRouteHealth(DEST);
+    TEST_ASSERT_TRUE(shim->isRouteStale(*h, 1000 + TTL)); // boundary is inclusive (>=)
+}
+
+void test_health_ttl_rollover_safe(void)
+{
+    const uint32_t learnAt = 0xFFFFFFFFu - 1000; // learned just before the millis() rollover
+    shim->noteRouteLearned(DEST, 0xAB, learnAt);
+    RouteHealth *h = shim->findRouteHealth(DEST);
+    // 1500 ms later (wrapped to now=500): unsigned subtraction yields ~1500 ms, not "stale".
+    TEST_ASSERT_FALSE(shim->isRouteStale(*h, 500));
+}
+
+void test_health_failure_threshold(void)
+{
+    shim->noteRouteLearned(DEST, 0xAB, 1000);
+    for (uint8_t i = 1; i < THRESH; i++)
+        shim->noteRouteFailure(DEST);
+    TEST_ASSERT_FALSE(shim->isRouteStale(*shim->findRouteHealth(DEST), 1000)); // THRESH-1 failures: ok
+    shim->noteRouteFailure(DEST);
+    TEST_ASSERT_TRUE(shim->isRouteStale(*shim->findRouteHealth(DEST), 1000)); // THRESH failures: stale
+}
+
+void test_health_success_resets_failures(void)
+{
+    shim->noteRouteLearned(DEST, 0xAB, 1000);
+    shim->noteRouteFailure(DEST);
+    shim->noteRouteFailure(DEST);
+    shim->noteRouteSuccess(DEST, 2000);
+    RouteHealth *h = shim->findRouteHealth(DEST);
+    TEST_ASSERT_EQUAL_UINT8(0, h->consecutiveFailures);
+    TEST_ASSERT_FALSE(shim->isRouteStale(*h, 2000));
+}
+
+void test_health_relearn_same_hop_keeps_failures(void)
+{
+    // Anti-flap: an asymmetric reverse path re-teaching the same dead hop must not reset failures.
+    shim->noteRouteLearned(DEST, 0xAB, 1000);
+    shim->noteRouteFailure(DEST);
+    shim->noteRouteFailure(DEST);
+    shim->noteRouteLearned(DEST, 0xAB, 2000); // same hop
+    TEST_ASSERT_EQUAL_UINT8(2, shim->findRouteHealth(DEST)->consecutiveFailures);
+}
+
+void test_health_relearn_new_hop_resets_failures(void)
+{
+    shim->noteRouteLearned(DEST, 0xAB, 1000);
+    shim->noteRouteFailure(DEST);
+    shim->noteRouteFailure(DEST);
+    shim->noteRouteLearned(DEST, 0xCD, 2000); // genuinely new hop -> clean slate
+    RouteHealth *h = shim->findRouteHealth(DEST);
+    TEST_ASSERT_EQUAL_UINT8(0, h->consecutiveFailures);
+    TEST_ASSERT_EQUAL_HEX8(0xCD, h->lastNextHop);
+}
+
+void test_health_failure_without_record_is_noop(void)
+{
+    shim->noteRouteFailure(DEST); // no record yet
+    TEST_ASSERT_NULL(shim->findRouteHealth(DEST));
+}
+
+void test_health_clear(void)
+{
+    shim->noteRouteLearned(DEST, 0xAB, 1000);
+    TEST_ASSERT_NOT_NULL(shim->findRouteHealth(DEST));
+    shim->clearRouteHealth(DEST);
+    TEST_ASSERT_NULL(shim->findRouteHealth(DEST));
+}
+
+void test_health_lru_eviction_bounds_table(void)
+{
+    // Fill every slot with increasing learn times, then add one more: the oldest must be evicted.
+    for (uint8_t i = 0; i < HEALTH_MAX; i++)
+        shim->noteRouteLearned(0x1000 + i, 0xAB, 1000 + (uint32_t)i * 1000);
+    NodeNum oldest = 0x1000;
+    TEST_ASSERT_NOT_NULL(shim->findRouteHealth(oldest));
+    shim->noteRouteLearned(0x2000, 0xAB, 1000 + (uint32_t)HEALTH_MAX * 1000); // overflow
+    TEST_ASSERT_NULL(shim->findRouteHealth(oldest));                          // evicted
+    TEST_ASSERT_NOT_NULL(shim->findRouteHealth(0x2000));                      // newest present
+}
+
+// ===========================================================================
+// Group 4 — shouldDecrementHopLimit favorite-router resolution (M2, site 4)
+// ===========================================================================
+
+void test_hoplimit_preserve_unique_favorite_router(void)
+{
+    config.device.role = meshtastic_Config_DeviceConfig_Role_ROUTER;
+    mockNodeDB->addNode(0x000007AB, 0, true, 60, meshtastic_Config_DeviceConfig_Role_ROUTER, /*favorite=*/true);
+    meshtastic_MeshPacket p = makeRelayedPacket(/*relay=*/0xAB, /*hopsAway=*/1);
+    TEST_ASSERT_FALSE(shim->shouldDecrementHopLimit(&p)); // preserve
+}
+
+void test_hoplimit_decrement_on_colliding_favorites(void)
+{
+    // Headline regression: two favorite routers share the relay byte -> ambiguous -> decrement
+    // (the old "first NodeDB match wins" scan would non-deterministically preserve).
+    config.device.role = meshtastic_Config_DeviceConfig_Role_ROUTER;
+    mockNodeDB->addNode(0x000007AB, 0, true, 60, meshtastic_Config_DeviceConfig_Role_ROUTER, /*favorite=*/true);
+    mockNodeDB->addNode(0x000008AB, 0, true, 60, meshtastic_Config_DeviceConfig_Role_ROUTER, /*favorite=*/true);
+    meshtastic_MeshPacket p = makeRelayedPacket(0xAB, 1);
+    TEST_ASSERT_TRUE(shim->shouldDecrementHopLimit(&p)); // decrement
+}
+
+void test_hoplimit_decrement_when_resolved_not_favorite(void)
+{
+    config.device.role = meshtastic_Config_DeviceConfig_Role_ROUTER;
+    mockNodeDB->addNode(0x000007AB, 0, true, 60, meshtastic_Config_DeviceConfig_Role_ROUTER, /*favorite=*/false);
+    meshtastic_MeshPacket p = makeRelayedPacket(0xAB, 1);
+    TEST_ASSERT_TRUE(shim->shouldDecrementHopLimit(&p)); // unique but not a favorite -> decrement
+}
+
+// ===========================================================================
+
+void setup()
+{
+    initializeTestEnvironment();
+    UNITY_BEGIN();
+
+    mockNodeDB = new MockNodeDB();
+    shim = new NextHopRouterTestShim();
+    nodeDB = mockNodeDB;
+
+    printf("\n=== resolveLastByte (M1) ===\n");
+    RUN_TEST(test_resolve_none_when_empty);
+    RUN_TEST(test_resolve_zero_byte_is_none);
+    RUN_TEST(test_resolve_unique_neighbor);
+    RUN_TEST(test_resolve_collision_is_ambiguous);
+    RUN_TEST(test_resolve_strict_excludes_stale);
+    RUN_TEST(test_resolve_strict_excludes_far);
+    RUN_TEST(test_resolve_lenient_includes_favorite_router);
+    RUN_TEST(test_resolve_lenient_collision_favorite_plus_neighbor);
+    RUN_TEST(test_resolve_skips_self);
+    RUN_TEST(test_resolve_skips_ignored);
+    RUN_TEST(test_resolve_0x00_maps_to_0xFF_and_collides);
+    RUN_TEST(test_resolve_unique_helper);
+
+    printf("\n=== getNextHop (M2 + M3 decay) ===\n");
+    RUN_TEST(test_getnexthop_unique_returns_byte);
+    RUN_TEST(test_getnexthop_ambiguous_floods);
+    RUN_TEST(test_getnexthop_vanished_neighbor_floods);
+    RUN_TEST(test_getnexthop_split_horizon_floods);
+    RUN_TEST(test_getnexthop_broadcast_is_nullopt);
+    RUN_TEST(test_getnexthop_decays_stale_route);
+
+    printf("\n=== route-health helpers (M3) ===\n");
+    RUN_TEST(test_health_fresh_not_stale);
+    RUN_TEST(test_health_ttl_expiry);
+    RUN_TEST(test_health_ttl_rollover_safe);
+    RUN_TEST(test_health_failure_threshold);
+    RUN_TEST(test_health_success_resets_failures);
+    RUN_TEST(test_health_relearn_same_hop_keeps_failures);
+    RUN_TEST(test_health_relearn_new_hop_resets_failures);
+    RUN_TEST(test_health_failure_without_record_is_noop);
+    RUN_TEST(test_health_clear);
+    RUN_TEST(test_health_lru_eviction_bounds_table);
+
+    printf("\n=== shouldDecrementHopLimit (M2 site 4) ===\n");
+    RUN_TEST(test_hoplimit_preserve_unique_favorite_router);
+    RUN_TEST(test_hoplimit_decrement_on_colliding_favorites);
+    RUN_TEST(test_hoplimit_decrement_when_resolved_not_favorite);
+
+    exit(UNITY_END());
+}
+
+void loop() {}