feat(ai): upgrade Chirpy on-device AI with proper APIs, download UX, and streaming (#5579)

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-05-24 23:01:22 -04:00 · 2026-05-22 16:51:02 -07:00
parent 5ec6d80f61
commit d870141b7c
132 changed files with 15131 additions and 150 deletions
--- a/androidApp/src/google/kotlin/org/meshtastic/app/ai/GeminiNanoDocAssistant.kt
+++ b/androidApp/src/google/kotlin/org/meshtastic/app/ai/GeminiNanoDocAssistant.kt
@@ -21,11 +21,17 @@ import com.google.firebase.Firebase
 import com.google.firebase.ai.DownloadStatus
 import com.google.firebase.ai.InferenceMode
 import com.google.firebase.ai.OnDeviceConfig
+import com.google.firebase.ai.OnDeviceModelOption
 import com.google.firebase.ai.OnDeviceModelStatus
 import com.google.firebase.ai.ai
 import com.google.firebase.ai.type.GenerativeBackend
 import com.google.firebase.ai.type.PublicPreviewAPI
 import com.google.firebase.ai.type.content
+import kotlinx.coroutines.delay
+import kotlinx.coroutines.flow.Flow
+import kotlinx.coroutines.flow.MutableStateFlow
+import kotlinx.coroutines.flow.StateFlow
+import kotlinx.coroutines.flow.asStateFlow
 import kotlinx.coroutines.flow.first
 import org.meshtastic.core.repository.NodeRepository
 import org.meshtastic.feature.docs.ai.AIDocAssistant
@@ -34,16 +40,15 @@ import org.meshtastic.feature.docs.data.KeywordSearchEngine
 import org.meshtastic.feature.docs.model.AIDocAssistantResult
 import org.meshtastic.feature.docs.model.DocPage
 import org.meshtastic.feature.docs.model.DocsAiError
+import org.meshtastic.feature.docs.model.ModelReadiness

 /**
 * Gemini on-device AI assistant for the Google flavor.
 *
- * Runs entirely on-device using the Firebase AI Logic SDK with [InferenceMode.ONLY_ON_DEVICE]. Supported on Pixel 9+,
- * Samsung Galaxy S25/S26, OnePlus 13/15, and other devices with AICore.
+ * Runs entirely on-device using the Firebase AI Logic SDK with [InferenceMode.ONLY_ON_DEVICE].
 *
 * Context strategy: extracts only the **most relevant paragraphs** from each page (those containing query terms),
- * strips markdown formatting to maximize information density, and fits within the on-device token budget (optimized for
- * modern Nano v4 with up to 32K tokens / 32,000 characters context).
+ * strips markdown formatting to maximize information density, and fits within the on-device token budget.
 *
 * Multi-turn history is supported locally by formatting the past conversation turns directly into the prompt.
 *
@@ -62,60 +67,123 @@ class GeminiNanoDocAssistant(
            .generativeModel(
                modelName = MODEL_NAME,
                systemInstruction = content { text(SYSTEM_INSTRUCTION) },
-                onDeviceConfig = OnDeviceConfig(mode = InferenceMode.ONLY_ON_DEVICE),
+                onDeviceConfig =
+                OnDeviceConfig(
+                    mode = InferenceMode.ONLY_ON_DEVICE,
+                    modelOption = OnDeviceModelOption.STABLE,
+                    maxOutputTokens = MAX_OUTPUT_TOKENS,
+                    temperature = TEMPERATURE,
+                    topK = TOP_K,
+                ),
            )
    }

+    private val _modelStatus = MutableStateFlow<ModelReadiness>(ModelReadiness.Checking)
+
+    /** Exposes model download/readiness state for UI consumption. */
+    override val modelStatus: StateFlow<ModelReadiness> = _modelStatus.asStateFlow()
+
    /** Conversation history stored as list of (Question, Answer) pairs. */
    private val history = mutableListOf<Pair<String, String>>()

    override suspend fun isSupported(): Boolean = try {
        val ext = onDeviceModel.onDeviceExtension
-        val status = ext?.checkStatus()
+        if (ext == null) {
+            _modelStatus.value = ModelReadiness.Unavailable("On-device extension not available")
+            return false
+        }
+        _modelStatus.value = ModelReadiness.Checking
+        val status = ext.checkStatus()
        Logger.d(tag = TAG) { "On-device model status: $status" }
        when (status) {
-            OnDeviceModelStatus.AVAILABLE -> true
-
-            OnDeviceModelStatus.DOWNLOADING -> true
-
-            OnDeviceModelStatus.DOWNLOADABLE -> {
-                Logger.i(tag = TAG) { "Model downloadable — requesting download" }
-                ext.download().collect { downloadStatus ->
-                    when (downloadStatus) {
-                        is DownloadStatus.DownloadStarted ->
-                            Logger.d(tag = TAG) { "Download started: ${downloadStatus.bytesToDownload} bytes" }
-
-                        is DownloadStatus.DownloadInProgress ->
-                            Logger.d(tag = TAG) {
-                                "Download progress: ${downloadStatus.totalBytesDownloaded} bytes"
-                            }
-
-                        is DownloadStatus.DownloadCompleted -> Logger.i(tag = TAG) { "Model download completed" }
-
-                        is DownloadStatus.DownloadFailed ->
-                            Logger.w(tag = TAG) { "Model download failed: $downloadStatus" }
-                    }
-                }
+            OnDeviceModelStatus.AVAILABLE -> {
+                warmUp(ext)
+                _modelStatus.value = ModelReadiness.Available
                true
            }

-            else -> false
+            OnDeviceModelStatus.DOWNLOADING -> {
+                _modelStatus.value = ModelReadiness.Downloading(bytesDownloaded = 0L, totalBytes = 0L)
+                false
+            }
+
+            OnDeviceModelStatus.DOWNLOADABLE -> {
+                Logger.i(tag = TAG) { "Model downloadable — requesting download" }
+                var downloadCompleted = false
+                var totalSize = 0L
+                ext.download().collect { downloadStatus ->
+                    when (downloadStatus) {
+                        is DownloadStatus.DownloadStarted -> {
+                            totalSize = downloadStatus.bytesToDownload
+                            _modelStatus.value = ModelReadiness.Downloading(0L, totalSize)
+                            Logger.d(tag = TAG) { "Download started: $totalSize bytes" }
+                        }
+
+                        is DownloadStatus.DownloadInProgress -> {
+                            _modelStatus.value =
+                                ModelReadiness.Downloading(downloadStatus.totalBytesDownloaded, totalSize)
+                            Logger.d(tag = TAG) {
+                                "Download progress: ${downloadStatus.totalBytesDownloaded}/$totalSize"
+                            }
+                        }
+
+                        is DownloadStatus.DownloadCompleted -> {
+                            Logger.i(tag = TAG) { "Model download completed" }
+                            warmUp(ext)
+                            _modelStatus.value = ModelReadiness.Available
+                            downloadCompleted = true
+                        }
+
+                        is DownloadStatus.DownloadFailed -> {
+                            _modelStatus.value = ModelReadiness.Unavailable("Download failed")
+                            Logger.w(tag = TAG) { "Model download failed: $downloadStatus" }
+                        }
+                    }
+                }
+                downloadCompleted
+            }
+
+            else -> {
+                _modelStatus.value = ModelReadiness.Unavailable("Model unavailable on this device")
+                false
+            }
        }
    } catch (e: Exception) {
        if (e is kotlinx.coroutines.CancellationException) throw e
        Logger.w(tag = TAG) { "isSupported() check failed: ${e.message}" }
+        _modelStatus.value = ModelReadiness.Unavailable(e.message)
        false
    }

+    private suspend fun warmUp(ext: com.google.firebase.ai.OnDeviceExtension) {
+        try {
+            ext.warmUp()
+            Logger.i(tag = TAG) { "Model warmed up successfully" }
+        } catch (e: Exception) {
+            if (e is kotlinx.coroutines.CancellationException) throw e
+            Logger.w(tag = TAG) { "Warmup failed (non-fatal): ${e.message}" }
+        }
+    }
+
    override suspend fun answer(question: String, currentPageId: String?): AIDocAssistantResult =
        answerStream(question, currentPageId).first { it !is AIDocAssistantResult.Partial }

-    @Suppress("TooGenericExceptionCaught")
    override fun answerStream(
        question: String,
        currentPageId: String?,
    ): kotlinx.coroutines.flow.Flow<AIDocAssistantResult> = kotlinx.coroutines.flow.flow {
-        try {
+        // Fast path: short prompts with no page context skip expensive doc loading.
+        val isLightweight =
+            question.length < MAX_LIGHTWEIGHT_PROMPT_LEN && currentPageId == null && history.isEmpty()
+
+        val prompt: String
+        val contextPages: List<DocPage>
+
+        if (isLightweight) {
+            prompt = "You are Chirpy, the Meshtastic mesh networking assistant mascot. $question"
+            contextPages = emptyList()
+            Logger.d(tag = TAG) { "Lightweight prompt (no context): ${question.length} chars" }
+        } else {
            val bundle = bundleLoader.load()
            val queryTerms = extractQueryTerms(question)

@@ -133,59 +201,98 @@ class GeminiNanoDocAssistant(
                "Context: ${contextResult.parts.size} pages, ${contextResult.totalChars} chars (budget $MAX_CONTEXT_CHARS)"
            }

-            val prompt = buildPrompt(question, contextResult.parts)
+            prompt = buildPrompt(question, contextResult.parts)
+            contextPages = contextResult.usedPageIds.mapNotNull { id -> bundle.pages.find { it.id == id } }
            Logger.d(tag = TAG) { "Prompt: ${prompt.length} chars, history count: ${history.size}" }
+        }

-            val contextPages = contextResult.usedPageIds.mapNotNull { id -> bundle.pages.find { it.id == id } }
-
-            val accumulatedText = StringBuilder()
-            onDeviceModel.generateContentStream(prompt).collect { chunk ->
-                val text = chunk.text
-                if (!text.isNullOrEmpty()) {
-                    accumulatedText.append(text)
-                    emit(
-                        AIDocAssistantResult.Partial(
-                            answer = accumulatedText.toString(),
-                            sourcePages = contextPages,
-                            usedOnDeviceModel = true,
-                        ),
-                    )
+        var lastError: Exception? = null
+        for (attempt in 0..MAX_RETRIES) {
+            if (attempt > 0) {
+                val backoffMs = INITIAL_BACKOFF_MS * (1L shl (attempt - 1))
+                Logger.i(tag = TAG) {
+                    "Retrying inference in ${backoffMs}ms (attempt ${attempt + 1}/${MAX_RETRIES + 1})"
                }
+                delay(backoffMs)
            }
-            val onDeviceAnswer = accumulatedText.toString().trimEnd()
-
-            val mentionedPages =
-                bundle.pages.filter { page ->
-                    page.id !in contextResult.usedPageIds && onDeviceAnswer.contains(page.title, ignoreCase = true)
+            try {
+                val accumulatedText = StringBuilder()
+                var lastEmitTime = 0L
+                onDeviceModel.generateContentStream(prompt).collect { chunk ->
+                    val text = chunk.text
+                    if (!text.isNullOrEmpty()) {
+                        accumulatedText.append(text)
+                        val now = System.nanoTime()
+                        val elapsedMs = (now - lastEmitTime) / 1_000_000
+                        if (elapsedMs >= STREAM_THROTTLE_MS) {
+                            lastEmitTime = now
+                            emit(
+                                AIDocAssistantResult.Partial(
+                                    answer = cleanResponse(accumulatedText.toString()),
+                                    sourcePages = contextPages,
+                                    usedOnDeviceModel = true,
+                                ),
+                            )
+                        }
+                    }
+                    // Log token usage from last chunk
+                    chunk.usageMetadata?.let { meta ->
+                        Logger.d(tag = TAG) {
+                            "Tokens — prompt: ${meta.promptTokenCount}, response: ${meta.candidatesTokenCount}, total: ${meta.totalTokenCount}"
+                        }
+                    }
                }
-            val allSourcePages = contextPages + mentionedPages
+                // Emit final partial to ensure UI has the complete text before Success
+                emit(
+                    AIDocAssistantResult.Partial(
+                        answer = cleanResponse(accumulatedText.toString()),
+                        sourcePages = contextPages,
+                        usedOnDeviceModel = true,
+                    ),
+                )
+                val onDeviceAnswer = cleanResponse(accumulatedText.toString().trimEnd())
+                val allSourcePages = contextPages

-            // Record this turn in the conversation history, keeping only the last 4 turns to avoid memory leaks
-            history.add(question to onDeviceAnswer)
-            if (history.size > 4) {
-                history.removeAt(0)
+                // Keep short history because on-device inference has a smaller request token budget.
+                if (!isLightweight) {
+                    history.add(question.take(MAX_HISTORY_CHARS) to onDeviceAnswer.take(MAX_HISTORY_CHARS))
+                    if (history.size > MAX_HISTORY_TURNS) {
+                        history.removeAt(0)
+                    }
+                }
+
+                emit(
+                    AIDocAssistantResult.Success(
+                        answer = onDeviceAnswer,
+                        sourcePages = allSourcePages,
+                        usedOnDeviceModel = true,
+                    ),
+                )
+                return@flow // Success — exit retry loop
+            } catch (e: Exception) {
+                if (e is kotlinx.coroutines.CancellationException) throw e
+                lastError = e
+                val isBusy =
+                    e.message?.contains("BUSY", ignoreCase = true) == true ||
+                        e.message?.contains("BATTERY", ignoreCase = true) == true ||
+                        e.message?.contains("BACKGROUND", ignoreCase = true) == true
+                if (!isBusy || attempt >= MAX_RETRIES) {
+                    Logger.w(tag = TAG) { "On-device inference failed: ${e.message}" }
+                    val errorType =
+                        when {
+                            isBusy -> DocsAiError.Busy
+
+                            e.message?.contains("UNAVAILABLE", ignoreCase = true) == true ->
+                                DocsAiError.ModelUnavailable
+
+                            else -> DocsAiError.Unknown
+                        }
+                    val fallbackPages = searchEngine.selectForTokenBudget(question, maxChars = MAX_CONTEXT_CHARS)
+                    emit(AIDocAssistantResult.Error(reason = errorType, suggestedPages = fallbackPages))
+                    return@flow
+                }
+                Logger.i(tag = TAG) { "BUSY error, will retry (attempt ${attempt + 1})" }
            }
-
-            emit(
-                AIDocAssistantResult.Success(
-                    answer = onDeviceAnswer,
-                    sourcePages = allSourcePages,
-                    usedOnDeviceModel = true,
-                ),
-            )
-        } catch (e: Exception) {
-            if (e is kotlinx.coroutines.CancellationException) throw e
-            Logger.w(tag = TAG) { "On-device inference failed: ${e.message}" }
-            val errorType =
-                when {
-                    e.message?.contains("BUSY", ignoreCase = true) == true -> DocsAiError.Busy
-                    e.message?.contains("BATTERY", ignoreCase = true) == true -> DocsAiError.Busy
-                    e.message?.contains("BACKGROUND", ignoreCase = true) == true -> DocsAiError.Busy
-                    e.message?.contains("UNAVAILABLE", ignoreCase = true) == true -> DocsAiError.ModelUnavailable
-                    else -> DocsAiError.Unknown
-                }
-            val fallbackPages = searchEngine.selectForTokenBudget(question, maxChars = MAX_CONTEXT_CHARS)
-            emit(AIDocAssistantResult.Error(reason = errorType, suggestedPages = fallbackPages))
        }
    }

@@ -194,6 +301,15 @@ class GeminiNanoDocAssistant(
        Logger.d(tag = TAG) { "Chat session reset" }
    }

+    /** Cleans on-device model response artifacts (markdown fences, excessive newlines). */
+    private fun cleanResponse(text: String): String = text
+        .removePrefix("```markdown\n")
+        .removePrefix("```\n")
+        .removeSuffix("\n```")
+        .removeSuffix("```")
+        .replace(Regex("\n{3,}"), "\n\n")
+        .trim()
+
    private data class ContextResult(val parts: List<String>, val usedPageIds: Set<String>, val totalChars: Int)

    /** Builds context parts from ranked pages within the given char budget. */
@@ -400,7 +516,7 @@ class GeminiNanoDocAssistant(
        val historyStr =
            if (history.isNotEmpty()) {
                "Previous conversation history:\n" +
-                    history.takeLast(4).joinToString("\n") { (q, a) -> "User: $q\nAssistant: $a" } +
+                    history.takeLast(MAX_HISTORY_TURNS).joinToString("\n") { (q, a) -> "User: $q\nAssistant: $a" } +
                    "\n\n"
            } else {
                ""
@@ -423,8 +539,8 @@ class GeminiNanoDocAssistant(
    companion object {
        private const val TAG = "ChirpyAI"

-        /** Gemini Nano v4 — local on-device LLM model supporting larger 32K token context window. */
-        private const val MODEL_NAME = "nano-v4-full"
+        /** Cloud model identifier; with ONLY_ON_DEVICE mode, inference remains local. */
+        private const val MODEL_NAME = "gemini-2.5-flash-lite"

        private const val SYSTEM_INSTRUCTION =
            """You are Chirpy, the friendly AI assistant and official Node mascot built into the Meshtastic Android app. You help users understand mesh networking, configure their Meshtastic nodes, troubleshoot connectivity issues, and get the most out of the Meshtastic ecosystem.
@@ -433,27 +549,34 @@ Personality: Helpful, concise, cheerful, and highly enthusiastic about mesh netw

 Knowledge sources (in priority order):
 1. Bundled app documentation provided as context below
-2. Official Meshtastic documentation at meshtastic.org/docs
-3. Official Meshtastic GitHub repositories (github.com/meshtastic)
-4. General LoRa/mesh networking knowledge
+2. Your pre-trained knowledge of Meshtastic concepts (may not reflect latest changes)
+3. General LoRa/mesh networking knowledge from training data

 Guidelines:
 - Answer the user's question directly and helpfully
 - When the bundled docs cover the topic, cite them
- When the bundled docs don't cover it, use your knowledge of official Meshtastic sources — don't refuse to help
+- If the bundled docs don't cover a topic, use your training knowledge but note it may be outdated — suggest checking meshtastic.org/docs for the latest information
 - Only reference official Meshtastic sources (meshtastic.org, github.com/meshtastic) — never cite random forums, blogs, or third-party sites
 - For firmware-specific or hardware-specific questions beyond app scope, point users to meshtastic.org/docs
 - Keep answers concise (2-4 short paragraphs max) unless the user asks for detail
+- Never give the user a nickname or pet name — just address them naturally
 - If you're truly unsure about something Meshtastic-specific, say so honestly rather than guessing"""

-        /** Total context char budget — optimized for modern on-device Nano v4 (~32K tokens context window). */
-        private const val MAX_CONTEXT_CHARS = 32_000
+        /**
+         * Total prompt context char budget. Keep this conservative because on-device requests are limited to about 4k
+         * tokens.
+         */
+        private const val MAX_CONTEXT_CHARS = 9_000

        /** Max chars for the current page (gets priority). */
-        private const val MAX_PAGE_CHARS = 16_000
+        private const val MAX_PAGE_CHARS = 4_000

        /** Max chars per additional page snippet. */
-        private const val MAX_SNIPPET_CHARS = 8_000
+        private const val MAX_SNIPPET_CHARS = 1_500
+
+        /** Keep only a short, bounded history for on-device prompt assembly. */
+        private const val MAX_HISTORY_TURNS = 2
+        private const val MAX_HISTORY_CHARS = 600

        /** Minimum useful snippet size — don't bother with tiny fragments. */
        private const val MIN_USEFUL_SNIPPET = 100
@@ -461,6 +584,27 @@ Guidelines:
        /** Minimum paragraph length to consider. */
        private const val MIN_PARAGRAPH_LEN = 20

+        /** Maximum output tokens for on-device generation. */
+        private const val MAX_OUTPUT_TOKENS = 256
+
+        /** Prompts shorter than this with no page context use the fast (no-doc-loading) path. */
+        private const val MAX_LIGHTWEIGHT_PROMPT_LEN = 100
+
+        /** Temperature for response generation (0.0 = deterministic, 1.0 = creative). */
+        private const val TEMPERATURE = 0.7f
+
+        /** Top-K for token selection. */
+        private const val TOP_K = 40
+
+        /** Max retry attempts for BUSY errors. */
+        private const val MAX_RETRIES = 3
+
+        /** Initial backoff delay in milliseconds (doubles each retry). */
+        private const val INITIAL_BACKOFF_MS = 500L
+
+        /** Minimum interval between partial stream emissions to avoid UI jank. */
+        private const val STREAM_THROTTLE_MS = 80L
+
        // Scoring weights for page ranking
        private const val CONTENT_MATCH_SCORE = 3
        private const val TITLE_MATCH_SCORE = 10
--- a/androidApp/src/google/kotlin/org/meshtastic/app/di/GoogleAiModule.kt
+++ b/androidApp/src/google/kotlin/org/meshtastic/app/di/GoogleAiModule.kt
@@ -30,6 +30,10 @@ import org.meshtastic.feature.docs.data.KeywordSearchEngine
 import org.meshtastic.feature.docs.translation.DocTranslationCache
 import org.meshtastic.feature.docs.translation.DocTranslationService

+// TODO: Enable Firebase App Check (with Play Integrity provider) if hybrid/cloud
+//  fallback is ever adopted. App Check only gates cloud proxy requests — on-device
+//  inference (ONLY_ON_DEVICE mode) bypasses it entirely, so no action needed today.
+
 /** Provides the on-device Gemini Nano AI assistant for the Google flavor. */
@Module
 class GoogleAiModule {