Sane sanitization

2026-05-24 16:58:01 -04:00 · 2026-04-23 14:19:33 -05:00
parent 031f332ec1
commit 2cc13a1132
6 changed files with 302 additions and 0 deletions
--- a/platformio.ini
+++ b/platformio.ini
@@ -29,6 +29,7 @@ build_flags = -Wno-missing-field-initializers
 	-DUSE_THREAD_NAMES
 	-DTINYGPS_OPTION_NO_CUSTOM_FIELDS
 	-DPB_ENABLE_MALLOC=1
+	-DPB_VALIDATE_UTF8=1
 	-DRADIOLIB_EXCLUDE_CC1101=1
 	-DRADIOLIB_EXCLUDE_NRF24=1
 	-DRADIOLIB_EXCLUDE_RF69=1
--- a/src/mesh/TypeConversions.cpp
+++ b/src/mesh/TypeConversions.cpp
@@ -1,6 +1,7 @@
 #include "TypeConversions.h"
 #include "mesh/generated/meshtastic/deviceonly.pb.h"
 #include "mesh/generated/meshtastic/mesh.pb.h"
+#include "meshUtils.h"

 meshtastic_NodeInfo TypeConversions::ConvertToNodeInfo(const meshtastic_NodeInfoLite *lite)
 {
@@ -82,8 +83,10 @@ meshtastic_UserLite TypeConversions::ConvertToUserLite(meshtastic_User user)

    strncpy(lite.long_name, user.long_name, sizeof(lite.long_name));
    lite.long_name[sizeof(lite.long_name) - 1] = '\0';
+    sanitizeUtf8(lite.long_name, sizeof(lite.long_name));
    strncpy(lite.short_name, user.short_name, sizeof(lite.short_name));
    lite.short_name[sizeof(lite.short_name) - 1] = '\0';
+    sanitizeUtf8(lite.short_name, sizeof(lite.short_name));
    lite.hw_model = user.hw_model;
    lite.role = user.role;
    lite.is_licensed = user.is_licensed;
@@ -102,8 +105,10 @@ meshtastic_User TypeConversions::ConvertToUser(uint32_t nodeNum, meshtastic_User
    snprintf(user.id, sizeof(user.id), "!%08x", nodeNum);
    strncpy(user.long_name, lite.long_name, sizeof(user.long_name));
    user.long_name[sizeof(user.long_name) - 1] = '\0';
+    sanitizeUtf8(user.long_name, sizeof(user.long_name));
    strncpy(user.short_name, lite.short_name, sizeof(user.short_name));
    user.short_name[sizeof(user.short_name) - 1] = '\0';
+    sanitizeUtf8(user.short_name, sizeof(user.short_name));
    user.hw_model = lite.hw_model;
    user.role = lite.role;
    user.is_licensed = lite.is_licensed;
--- a/src/meshUtils.cpp
+++ b/src/meshUtils.cpp
@@ -117,4 +117,93 @@ size_t pb_string_length(const char *str, size_t max_len)
        }
    }
    return len;
+}
+
+// Sanitize a fixed-size char buffer in place so that it holds only well-formed UTF-8.
+// Forces a terminator at buf[bufSize - 1], then overwrites every byte that is not part
+// of a well-formed sequence with '?'. Invalid bytes are replaced one at a time, so valid
+// characters that follow a broken or truncated sequence are preserved.
+// Returns true if any byte was replaced with '?'; false for a null buffer or zero size.
+bool sanitizeUtf8(char *buf, size_t bufSize)
+{
+    if (!buf || bufSize == 0)
+        return false;
+
+    // Ensure null-terminated within buffer. This write is not reported through the
+    // return value: only bytes turned into '?' count as replacements.
+    buf[bufSize - 1] = '\0';
+
+    bool replaced = false;
+    size_t i = 0;
+    const size_t len = strlen(buf);
+
+    while (i < len) {
+        const uint8_t b = (uint8_t)buf[i];
+
+        // Determine expected sequence length from lead byte
+        size_t seqLen;
+        uint32_t minCodepoint;
+        if (b <= 0x7F) {
+            // ASCII — valid single byte
+            i++;
+            continue;
+        } else if ((b & 0xE0) == 0xC0) {
+            seqLen = 2;
+            minCodepoint = 0x80; // Reject overlong
+        } else if ((b & 0xF0) == 0xE0) {
+            seqLen = 3;
+            minCodepoint = 0x800;
+        } else if ((b & 0xF8) == 0xF0) {
+            seqLen = 4;
+            minCodepoint = 0x10000;
+        } else {
+            // Invalid lead byte (0x80-0xBF or 0xF8+)
+            buf[i] = '?';
+            replaced = true;
+            i++;
+            continue;
+        }
+
+        // A sequence that would run past the end of the string is truncated. Treat it like
+        // any other malformed sequence instead of wiping the rest of the buffer, so trailing
+        // ASCII or complete characters after a stray lead byte survive.
+        bool valid = (i + seqLen <= len);
+
+        // Validate continuation bytes (must be 10xxxxxx)
+        for (size_t j = 1; valid && j < seqLen; j++) {
+            if (((uint8_t)buf[i + j] & 0xC0) != 0x80)
+                valid = false;
+        }
+
+        if (valid) {
+            // Decode codepoint to check for overlong encodings and surrogates:
+            // strip the length-marker bits from the lead byte, then append the
+            // six payload bits of each continuation byte.
+            uint32_t cp = 0;
+            if (seqLen == 2)
+                cp = b & 0x1F;
+            else if (seqLen == 3)
+                cp = b & 0x0F;
+            else
+                cp = b & 0x07;
+            for (size_t j = 1; j < seqLen; j++)
+                cp = (cp << 6) | ((uint8_t)buf[i + j] & 0x3F);
+
+            if (cp < minCodepoint || cp > 0x10FFFF || (cp >= 0xD800 && cp <= 0xDFFF)) {
+                // Overlong encoding, out of Unicode range, or surrogate half
+                valid = false;
+            }
+        }
+
+        if (valid) {
+            i += seqLen;
+        } else {
+            // Replace only the lead byte; continuation bytes will be caught on next iteration
+            buf[i] = '?';
+            replaced = true;
+            i++;
+        }
+    }
+
+    return replaced;
 }
--- a/src/meshUtils.h
+++ b/src/meshUtils.h
@@ -38,6 +38,10 @@ const std::string vformat(const char *const zcFormat, ...);
 // Get actual string length for nanopb char array fields.
 size_t pb_string_length(const char *str, size_t max_len);

+// Sanitize a fixed-size char buffer in-place by replacing invalid UTF-8 sequences with '?'.
+// Ensures the result is null-terminated within bufSize. Returns true if any bytes were replaced.
+bool sanitizeUtf8(char *buf, size_t bufSize);
+
 /// Calculate 2^n without calling pow() - used for spreading factor and other calculations
 inline uint32_t pow_of_2(uint32_t n)
 {
--- a/src/modules/AdminModule.cpp
+++ b/src/modules/AdminModule.cpp
@@ -599,10 +599,14 @@ void AdminModule::handleSetOwner(const meshtastic_User &o)
    if (*o.long_name) {
        changed |= strcmp(owner.long_name, o.long_name);
        strncpy(owner.long_name, o.long_name, sizeof(owner.long_name));
+        owner.long_name[sizeof(owner.long_name) - 1] = '\0';
+        sanitizeUtf8(owner.long_name, sizeof(owner.long_name));
    }
    if (*o.short_name) {
        changed |= strcmp(owner.short_name, o.short_name);
        strncpy(owner.short_name, o.short_name, sizeof(owner.short_name));
+        owner.short_name[sizeof(owner.short_name) - 1] = '\0';
+        sanitizeUtf8(owner.short_name, sizeof(owner.short_name));
    }
    snprintf(owner.id, sizeof(owner.id), "!%08x", nodeDB->getNodeNum());

@@ -1400,7 +1404,11 @@ void AdminModule::handleSetHamMode(const meshtastic_HamParameters &p)

    // Set call sign and override lora limitations for licensed use
    strncpy(owner.long_name, p.call_sign, sizeof(owner.long_name));
+    owner.long_name[sizeof(owner.long_name) - 1] = '\0';
+    sanitizeUtf8(owner.long_name, sizeof(owner.long_name));
    strncpy(owner.short_name, p.short_name, sizeof(owner.short_name));
+    owner.short_name[sizeof(owner.short_name) - 1] = '\0';
+    sanitizeUtf8(owner.short_name, sizeof(owner.short_name));
    owner.is_licensed = true;
    config.lora.override_duty_cycle = true;
    config.lora.tx_power = p.tx_power;
--- a/test/test_utf8/test_main.cpp
+++ b/test/test_utf8/test_main.cpp
@@ -0,0 +1,195 @@
+#include "meshUtils.h"
+#include <cstring>
+#include <unity.h>
+
+void setUp(void) {} // presumably Unity's per-test setup hook; intentionally empty
+void tearDown(void) {} // presumably Unity's per-test teardown hook; intentionally empty
+
+// --- Valid UTF-8 should pass through unchanged ---
+
+void test_ascii_unchanged()
+{
+    char ascii[32] = "Hello World"; // pure 7-bit ASCII, nothing to repair
+    TEST_ASSERT_FALSE(sanitizeUtf8(ascii, sizeof(ascii)));
+    TEST_ASSERT_EQUAL_STRING("Hello World", ascii);
+}
+
+void test_valid_2byte_unchanged()
+{
+    // U+00E9 (é) encodes as the two bytes C3 A9, so "café" is already well-formed
+    char word[16] = "caf\xC3\xA9";
+    TEST_ASSERT_FALSE(sanitizeUtf8(word, sizeof(word)));
+    TEST_ASSERT_EQUAL_STRING("caf\xC3\xA9", word);
+}
+
+void test_valid_3byte_unchanged()
+{
+    // The euro sign "€" (U+20AC) is the three-byte sequence E2 82 AC
+    char euro[16] = "\xE2\x82\xAC";
+    TEST_ASSERT_FALSE(sanitizeUtf8(euro, sizeof(euro)));
+    TEST_ASSERT_EQUAL_STRING("\xE2\x82\xAC", euro);
+}
+
+void test_valid_4byte_emoji_unchanged()
+{
+    // F0 9F 8C 99 is the complete four-byte encoding of 🌙 and must be kept as-is
+    char moon[16] = "\xF0\x9F\x8C\x99";
+    TEST_ASSERT_FALSE(sanitizeUtf8(moon, sizeof(moon)));
+    TEST_ASSERT_EQUAL_STRING("\xF0\x9F\x8C\x99", moon);
+}
+
+void test_valid_mixed_unchanged()
+{
+    // ASCII on both sides of a four-byte character: "Hi 🌙!"
+    char greeting[16] = "Hi \xF0\x9F\x8C\x99!";
+    TEST_ASSERT_FALSE(sanitizeUtf8(greeting, sizeof(greeting)));
+    TEST_ASSERT_EQUAL_STRING("Hi \xF0\x9F\x8C\x99!", greeting);
+}
+
+void test_empty_string()
+{
+    char empty[8] = ""; // zero-length string: the scan loop never runs
+    TEST_ASSERT_FALSE(sanitizeUtf8(empty, sizeof(empty)));
+    TEST_ASSERT_EQUAL_STRING("", empty);
+}
+
+// --- Invalid sequences observed in the wild ---
+
+void test_truncated_4byte_at_end()
+{
+    // A complete 🌙 (F0 9F 8C 99), then lead byte F0 with only two continuations, then '4'
+    char name[32] = "Lunar Tower \xF0\x9F\x8C\x99\xF0\x9F\x97"
+                    "4";
+    TEST_ASSERT_TRUE(sanitizeUtf8(name, sizeof(name)));
+    // '4' (0x34) cannot continue the second sequence, so F0 9F 97 each become '?'
+    // while the emoji before them and the trailing '4' stay intact
+    TEST_ASSERT_EQUAL_STRING("Lunar Tower \xF0\x9F\x8C\x99???4", name);
+}
+
+void test_lone_lead_bytes_without_continuations()
+{
+    // Lead bytes E1 and F3 sit in the middle of ASCII text with no continuation bytes
+    char name[32] = "Mesht\xE1\xF3tic 37e2";
+    TEST_ASSERT_TRUE(sanitizeUtf8(name, sizeof(name)));
+    // E1 (3-byte lead) is followed by F3, which is not 10xxxxxx → E1 becomes '?'
+    // F3 (4-byte lead) is followed by 't', 'i', 'c' → F3 becomes '?'
+    TEST_ASSERT_EQUAL_STRING("Mesht??tic 37e2", name);
+}
+
+// --- Edge cases ---
+
+void test_bare_continuation_byte()
+{
+    // A lone 0x80 has the 10xxxxxx continuation pattern but no lead byte before it
+    char stray[8] = "\x80";
+    TEST_ASSERT_TRUE(sanitizeUtf8(stray, sizeof(stray)));
+    TEST_ASSERT_EQUAL_STRING("?", stray);
+}
+
+void test_overlong_2byte()
+{
+    // C0 AF spells U+002F ('/') in two bytes — an overlong form that must be rejected
+    char overlong[8] = "\xC0\xAF";
+    TEST_ASSERT_TRUE(sanitizeUtf8(overlong, sizeof(overlong)));
+    // Structurally fine (2-byte lead + continuation) but decodes to 0x2F < 0x80,
+    // so C0 is replaced and the orphaned AF is replaced on the next pass
+    TEST_ASSERT_EQUAL_STRING("??", overlong);
+}
+
+void test_surrogate_half()
+{
+    // ED A0 80 decodes to U+D800, a surrogate half, which is rejected as invalid
+    char surrogate[8] = "\xED\xA0\x80";
+    TEST_ASSERT_TRUE(sanitizeUtf8(surrogate, sizeof(surrogate)));
+    TEST_ASSERT_EQUAL_STRING("???", surrogate);
+}
+
+void test_5byte_sequence_rejected()
+{
+    // F8 80 80 80 80 mimics a five-byte form, which is not valid UTF-8
+    char fiveByte[8] = "\xF8\x80\x80\x80\x80";
+    TEST_ASSERT_TRUE(sanitizeUtf8(fiveByte, sizeof(fiveByte)));
+    // F8 is not a legal lead byte and every 0x80 is a continuation without a lead
+    TEST_ASSERT_EQUAL_STRING("?????", fiveByte);
+}
+
+void test_truncated_3byte_at_buffer_end()
+{
+    // Four-byte buffer holding only the first two bytes of a three-byte sequence:
+    //   [0] E2  lead byte announcing three bytes
+    //   [1] 82  first continuation byte
+    //   [2] 00  terminator arrives before the sequence is complete
+    //   [3] 00  last slot, which sanitizeUtf8 forces to '\0' anyway
+    char cut[4] = {'\xE2', '\x82', '\0', '\0'};
+    TEST_ASSERT_TRUE(sanitizeUtf8(cut, sizeof(cut)));
+    TEST_ASSERT_EQUAL_STRING("??", cut);
+}
+
+void test_null_termination_enforced()
+{
+    // Fill buffer completely with no null terminator
+    char buf[5];
+    memset(buf, 'A', sizeof(buf));
+    // Forcing the terminator is not a '?' replacement, so all-ASCII input reports false
+    TEST_ASSERT_FALSE(sanitizeUtf8(buf, sizeof(buf)));
+    TEST_ASSERT_EQUAL_STRING("AAAA", buf); // terminated at buf[4], remaining ASCII preserved
+}
+
+void test_null_buffer()
+{
+    TEST_ASSERT_FALSE(sanitizeUtf8(nullptr, 10)); // null buffer: early return false, no dereference
+}
+
+void test_zero_size()
+{
+    char untouched[4] = "Hi";
+    // A zero bufSize is rejected up front, before anything is written
+    TEST_ASSERT_FALSE(sanitizeUtf8(untouched, 0));
+    TEST_ASSERT_EQUAL_STRING("Hi", untouched);
+}
+
+void test_valid_max_codepoint()
+{
+    // F4 8F BF BF is U+10FFFF, the highest code point the sanitizer accepts
+    char highest[8] = "\xF4\x8F\xBF\xBF";
+    TEST_ASSERT_FALSE(sanitizeUtf8(highest, sizeof(highest)));
+    TEST_ASSERT_EQUAL_STRING("\xF4\x8F\xBF\xBF", highest);
+}
+
+void test_above_max_codepoint()
+{
+    char buf[8] = "\xF4\x90\x80\x80"; // would decode to U+110000, one past the U+10FFFF maximum
+    TEST_ASSERT_TRUE(sanitizeUtf8(buf, sizeof(buf)));
+    TEST_ASSERT_EQUAL_STRING("????", buf); // lead byte and each orphaned continuation replaced
+}
+
+int main(int argc, char **argv) // test entry point: runs every sanitizeUtf8 case via RUN_TEST
+{
+    UNITY_BEGIN();
+
+    // Well-formed input: buffer must be unchanged and sanitizeUtf8 must return false
+    RUN_TEST(test_ascii_unchanged);
+    RUN_TEST(test_valid_2byte_unchanged);
+    RUN_TEST(test_valid_3byte_unchanged);
+    RUN_TEST(test_valid_4byte_emoji_unchanged);
+    RUN_TEST(test_valid_mixed_unchanged);
+    RUN_TEST(test_empty_string);
+
+    // Malformed input modelled on invalid sequences observed in the wild
+    RUN_TEST(test_truncated_4byte_at_end);
+    RUN_TEST(test_lone_lead_bytes_without_continuations);
+
+    // Edge cases: boundary encodings, degenerate arguments, and forced termination
+    RUN_TEST(test_bare_continuation_byte);
+    RUN_TEST(test_overlong_2byte);
+    RUN_TEST(test_surrogate_half);
+    RUN_TEST(test_5byte_sequence_rejected);
+    RUN_TEST(test_truncated_3byte_at_buffer_end);
+    RUN_TEST(test_null_termination_enforced);
+    RUN_TEST(test_null_buffer);
+    RUN_TEST(test_zero_size);
+    RUN_TEST(test_valid_max_codepoint);
+    RUN_TEST(test_above_max_codepoint);
+
+    return UNITY_END();
+}