fix(ble): Restore bounded bonded reconnect fallback (#5960)

This commit is contained in:
Jeremiah K
2026-06-26 06:12:10 -05:00
committed by GitHub
parent f60525ab22
commit 780bec03f9
4 changed files with 73 additions and 51 deletions

View File

@@ -28,9 +28,10 @@ import kotlinx.coroutines.flow.asStateFlow
* When created from a live BLE scan, [advertisement] is populated and used for optimal peripheral construction via
* `Peripheral(advertisement)` with a direct (non-autoConnect) connection attempt.
*
* Bonded-only devices (address only, [advertisement] null) can still be constructed — e.g. for display in the device
* list — but [BleRadioTransport.findDevice] requires a fresh advertisement before connecting and will throw
* [RadioNotConnectedException] rather than returning a stale handle.
* Bonded-only devices (address only, [advertisement] null) can still be constructed for display in the device list and
* as a bounded fallback when fresh scans miss. In that fallback, Kable creates an address-only peripheral so Android
* can wait for the device to advertise through its `autoConnect` path while the transport keeps the connection attempt
* bounded.
*
* @param address The device's MAC address (or platform identifier string).
* @param name The device's display name, if known.

View File

@@ -78,20 +78,19 @@ private val CONNECTION_TIMEOUT = 15.seconds
*/
private val HEARTBEAT_DRAIN_DELAY = 200.milliseconds
private val TARGETED_SCAN_TIMEOUT = 2.seconds
internal val TARGETED_SCAN_TIMEOUT = 2.seconds
/**
* Bounded scan duration used by both discovery paths in [findDevice]:
* - Bonded escalation: after the 2s [TARGETED_SCAN_TIMEOUT] misses the low-duty advertisement window, this is the
* single bounded retry before declaring the bonded device unreachable.
* single bounded retry before falling back to the bonded handle.
* - Non-bonded retry: each of the [SCAN_RETRY_COUNT] attempts uses this duration.
*
* 5s covers multiple advertising intervals for typical BLE power-save slots (~12s each) while keeping the total bonded
* scan window bounded (~7s scan, ~10s total attempt including the existing reconnect settle delay) so
* [BleReconnectPolicy] iterates on its normal schedule instead of parking on [CONNECTION_TIMEOUT] (15s) via an
* `autoConnect=true` hang on a stale handle.
* 5s covers multiple advertising intervals for typical BLE power-save slots (~12s each). If the bounded bonded scan
* window still misses, [findDevice] falls back to the bonded handle and [attemptConnection] keeps that patient
* `autoConnect` path bounded through [CONNECTION_TIMEOUT].
*/
private val SCAN_TIMEOUT = 5.seconds
internal val SCAN_TIMEOUT = 5.seconds
private val GATT_CLEANUP_TIMEOUT = 5.seconds
/**
@@ -246,18 +245,21 @@ class BleRadioTransport(
}
// Escalation: radio may be in a low-duty advertising slot. Try one bounded SCAN_TIMEOUT scan.
Logger.i { "[${address.anonymize()}] Targeted scan missed; escalating to bounded scan before giving up" }
Logger.i { "[${address.anonymize()}] Targeted scan missed; escalating to bounded scan before fallback" }
scanForFreshDevice(SCAN_TIMEOUT)?.let {
Logger.i { "[${address.anonymize()}] Fresh advertisement found during escalated scan" }
return it
}
// Never fall back to the stale bonded handle — that path forces autoConnect=true and hangs for
// CONNECTION_TIMEOUT (15s) before throwing. Bounded scan window (~7s scan, ~10s total attempt
// including the existing reconnect settle delay) lets the reconnect policy iterate on its
// normal schedule instead of parking on the autoConnect hang.
Logger.w { "[${address.anonymize()}] No fresh advertisement within $SCAN_TIMEOUT; reporting unreachable" }
throw RadioNotConnectedException("Bonded device is not advertising")
// If both scans miss, fall back to the bonded handle. Bonded-only devices have no fresh advertisement, so
// Kable uses autoConnect=true and Android can patiently wait for the device to advertise again.
// This remains bounded by CONNECTION_TIMEOUT in connectAndAwait(), after which BleReconnectPolicy owns
// retry/backoff.
Logger.w {
"[${address.anonymize()}] No fresh advertisement within $SCAN_TIMEOUT; " +
"falling back to bonded handle for bounded autoConnect"
}
return bondedDevice
}
// Non-bonded path: preserve existing retry behavior (SCAN_RETRY_COUNT attempts at SCAN_TIMEOUT).
@@ -279,8 +281,7 @@ class BleRadioTransport(
*
* One scan attempt only — no retry, no backoff. Both bonded and non-bonded paths in [findDevice] share this
* primitive so retry policy stays centralized:
* - Bonded: escalated from [TARGETED_SCAN_TIMEOUT] to [SCAN_TIMEOUT] before [findDevice] declares the device
* unreachable.
* - Bonded: escalated from [TARGETED_SCAN_TIMEOUT] to [SCAN_TIMEOUT] before [findDevice] returns the bonded handle.
* - Non-bonded: [SCAN_RETRY_COUNT] attempts at [SCAN_TIMEOUT] with [SCAN_RETRY_DELAY] between attempts.
*
* The outer [withTimeoutOrNull] is binding: the scanner receives [timeout] as a hint, but this coroutine resumes on

View File

@@ -59,6 +59,15 @@ import kotlin.test.assertEquals
import kotlin.test.assertTrue
import kotlin.time.Duration
private const val BONDED_FALLBACK_CONNECT_BUFFER_MILLIS = 1_000L
private val targetedScanTimeoutMillis = TARGETED_SCAN_TIMEOUT.inWholeMilliseconds
private val escalatedScanTimeoutMillis = SCAN_TIMEOUT.inWholeMilliseconds
private val bondedReconnectWindowMillis =
BleReconnectPolicy.DEFAULT_SETTLE_DELAY.inWholeMilliseconds +
targetedScanTimeoutMillis +
escalatedScanTimeoutMillis +
BONDED_FALLBACK_CONNECT_BUFFER_MILLIS
/**
* Tests covering the BLE reconnect crash fixes in [BleRadioTransport]:
* 1. **CancellationException / GATT 133 fix**: [discoverServicesAndSetupCharacteristics] previously had a bare `catch
@@ -1037,8 +1046,8 @@ class BleRadioTransportReconnectCrashTest {
* advertising window. The escalation path (TARGETED_SCAN_TIMEOUT → SCAN_TIMEOUT) must surface a freshly-scanned
* device.
*
* Sequenced scanner: call 0 (2s targeted) returns empty; call 1 (5s escalated) emits a fresh `FakeBleDevice` for
* the bonded address. The transport must connect using the fresh scanned device, never the stale bonded handle.
* Sequenced scanner: call 0 (targeted) returns empty; call 1 (escalated) emits a fresh `FakeBleDevice` for the
* bonded address. The transport must connect using the fresh scanned device, never the stale bonded handle.
*/
@Test
fun `bonded device missed by targeted scan is found by escalated scan`() = runTest {
@@ -1048,7 +1057,7 @@ class BleRadioTransportReconnectCrashTest {
val freshDevice = FakeBleDevice(address = address, name = "Fresh Scan Result")
val sequencedScanner =
SequencedBleScanner().apply {
responses += { flow { awaitCancellation() } } // call 0: targeted scan misses (runs until 2s timeout)
responses += { flow { awaitCancellation() } } // call 0: targeted scan misses
responses += { flow { emit(freshDevice) } } // call 1: escalated scan emits fresh device
}
@@ -1069,22 +1078,21 @@ class BleRadioTransportReconnectCrashTest {
try {
bleTransport.start()
// 2s targeted scan + immediate escalated emit + connect/handshake settle.
advanceTimeBy(8_000L)
advanceTimeBy(bondedReconnectWindowMillis)
assertTrue(
sequencedScanner.callCount >= 2,
"Expected at least 2 scan calls (targeted + escalated); got ${sequencedScanner.callCount}",
)
assertEquals(
2_000L,
targetedScanTimeoutMillis,
sequencedScanner.calls[0].timeout.inWholeMilliseconds,
"First scan must be targeted (2s)",
"First scan must use TARGETED_SCAN_TIMEOUT",
)
assertEquals(
5_000L,
escalatedScanTimeoutMillis,
sequencedScanner.calls[1].timeout.inWholeMilliseconds,
"Second scan must be escalated (5s)",
"Second scan must use SCAN_TIMEOUT",
)
assertEquals(SERVICE_UUID, sequencedScanner.calls[0].serviceUuid, "First scan must include service UUID")
assertEquals(address, sequencedScanner.calls[0].address, "First scan must target selected address")
@@ -1104,24 +1112,27 @@ class BleRadioTransportReconnectCrashTest {
}
/**
* Critical regression: when the bonded device is not advertising at all, [findDevice] must throw
* [RadioNotConnectedException] after the bounded scan window (~7s scan, ~10s total attempt including the existing
* reconnect settle delay) and must not fall back to the stale bonded handle.
* Regression: when the bonded device misses both fresh scan windows, [findDevice] must fall back to the bonded
* handle so Android's autoConnect path can patiently wait for the radio to advertise again.
*
* Before the fix, the stale handle forced `autoConnect=true` and hung for [CONNECTION_TIMEOUT] (15s) before the
* reconnect loop could iterate. This test proves `connectAndAwait` is never invoked against the stale handle.
* The fallback remains bounded by [CONNECTION_TIMEOUT] in `connectAndAwait`, after which the reconnect policy owns
* the next retry/backoff iteration.
*/
@Test
fun `bonded device never advertising results in zero connect calls`() = runTest {
fun `bonded device never advertising falls back to bounded autoConnect`() = runTest {
val bondedDevice = FakeBleDevice(address = address, name = "Bonded Handle")
bluetoothRepository.bond(bondedDevice)
val sequencedScanner =
SequencedBleScanner().apply {
responses += { flow { awaitCancellation() } } // call 0: targeted scan misses (runs until 2s timeout)
responses += { flow { awaitCancellation() } } // call 1: escalated scan misses (runs until 5s timeout)
responses += { flow { awaitCancellation() } } // call 0: targeted scan misses
responses += { flow { awaitCancellation() } } // call 1: escalated scan misses
}
// Keep profile setup stable so the test can inspect the connected device.
connection.service.addCharacteristic(FROMNUM_CHARACTERISTIC)
connection.service.addCharacteristic(FROMRADIO_CHARACTERISTIC)
val bleTransport =
BleRadioTransport(
scope = this,
@@ -1134,21 +1145,27 @@ class BleRadioTransportReconnectCrashTest {
try {
bleTransport.start()
// 2s targeted + 5s escalated + reconnect backoff + settle delay before second iteration.
// With awaitCancellation(), scans now suspend for their full timeout durations, so we
// must cover the full reconnect iteration window (settle + targeted + escalated + backoff).
advanceTimeBy(20_000L)
advanceTimeBy(bondedReconnectWindowMillis)
// Proves the stale bonded handle was not used: no connectAndAwait call.
assertEquals(
0,
connection.connectAndAwaitCalls,
"Must never call connectAndAwait when no fresh advertisement was found",
)
assertTrue(
sequencedScanner.callCount >= 2,
"Expected at least 2 scan calls (targeted + escalated); got ${sequencedScanner.callCount}",
)
assertEquals(
targetedScanTimeoutMillis,
sequencedScanner.calls[0].timeout.inWholeMilliseconds,
"First scan must use TARGETED_SCAN_TIMEOUT",
)
assertEquals(
escalatedScanTimeoutMillis,
sequencedScanner.calls[1].timeout.inWholeMilliseconds,
"Second scan must use SCAN_TIMEOUT",
)
assertEquals(1, connection.connectAndAwaitCalls, "Must connect once through the bounded bonded fallback")
assertTrue(
connection.device === bondedDevice,
"Must use the bonded handle after fresh scans miss (got: ${connection.device?.name})",
)
bleTransport.close()
} finally {

View File

@@ -27,6 +27,8 @@ import kotlinx.coroutines.ExperimentalCoroutinesApi
import kotlinx.coroutines.test.TestScope
import kotlinx.coroutines.test.advanceTimeBy
import kotlinx.coroutines.test.runTest
import org.meshtastic.core.ble.MeshtasticBleConstants.FROMNUM_CHARACTERISTIC
import org.meshtastic.core.ble.MeshtasticBleConstants.FROMRADIO_CHARACTERISTIC
import org.meshtastic.core.ble.MeshtasticBleConstants.SERVICE_UUID
import org.meshtastic.core.model.RadioNotConnectedException
import org.meshtastic.core.repository.RadioInterfaceService
@@ -209,9 +211,11 @@ class BleRadioTransportTest {
}
@Test
fun `findDevice does not fall back to bonded device when scan finds nothing`() = runTest {
fun `findDevice falls back to bonded device when fresh scans miss`() = runTest {
val bondedDevice = FakeBleDevice(address = address, name = "Bonded Device")
bluetoothRepository.bond(bondedDevice)
connection.service.addCharacteristic(FROMNUM_CHARACTERISTIC)
connection.service.addCharacteristic(FROMRADIO_CHARACTERISTIC)
val bleTransport =
BleRadioTransport(
@@ -224,12 +228,11 @@ class BleRadioTransportTest {
)
bleTransport.start()
try {
// 3s settle + 2s targeted + 5s escalated scan = 10s before RadioNotConnectedException.
// The bonded device is never advertising, so findDevice must escalate and throw
// rather than returning the stale bonded handle.
// 3s settle + 2s targeted + 5s escalated scan = 10s before bonded fallback.
advanceTimeBy(11_000)
assertEquals(0, connection.connectAndAwaitCalls, "Must not connect when bonded device is not advertising")
assertEquals(1, connection.connectAndAwaitCalls, "Must use bounded bonded fallback after fresh scans miss")
assertEquals("Bonded Device", connection.device?.name)
} finally {
bleTransport.close()
}