diff --git a/platformio.ini b/platformio.ini index cd22fab6e..a97b813fa 100644 --- a/platformio.ini +++ b/platformio.ini @@ -29,6 +29,7 @@ build_flags = -Wno-missing-field-initializers -DUSE_THREAD_NAMES -DTINYGPS_OPTION_NO_CUSTOM_FIELDS -DPB_ENABLE_MALLOC=1 + -DPB_VALIDATE_UTF8=1 -DRADIOLIB_EXCLUDE_CC1101=1 -DRADIOLIB_EXCLUDE_NRF24=1 -DRADIOLIB_EXCLUDE_RF69=1 diff --git a/src/mesh/TypeConversions.cpp b/src/mesh/TypeConversions.cpp index 201a703e2..3798daf28 100644 --- a/src/mesh/TypeConversions.cpp +++ b/src/mesh/TypeConversions.cpp @@ -1,6 +1,7 @@ #include "TypeConversions.h" #include "mesh/generated/meshtastic/deviceonly.pb.h" #include "mesh/generated/meshtastic/mesh.pb.h" +#include "meshUtils.h" meshtastic_NodeInfo TypeConversions::ConvertToNodeInfo(const meshtastic_NodeInfoLite *lite) { @@ -82,8 +83,10 @@ meshtastic_UserLite TypeConversions::ConvertToUserLite(meshtastic_User user) strncpy(lite.long_name, user.long_name, sizeof(lite.long_name)); lite.long_name[sizeof(lite.long_name) - 1] = '\0'; + sanitizeUtf8(lite.long_name, sizeof(lite.long_name)); strncpy(lite.short_name, user.short_name, sizeof(lite.short_name)); lite.short_name[sizeof(lite.short_name) - 1] = '\0'; + sanitizeUtf8(lite.short_name, sizeof(lite.short_name)); lite.hw_model = user.hw_model; lite.role = user.role; lite.is_licensed = user.is_licensed; @@ -102,8 +105,10 @@ meshtastic_User TypeConversions::ConvertToUser(uint32_t nodeNum, meshtastic_User snprintf(user.id, sizeof(user.id), "!%08x", nodeNum); strncpy(user.long_name, lite.long_name, sizeof(user.long_name)); user.long_name[sizeof(user.long_name) - 1] = '\0'; + sanitizeUtf8(user.long_name, sizeof(user.long_name)); strncpy(user.short_name, lite.short_name, sizeof(user.short_name)); user.short_name[sizeof(user.short_name) - 1] = '\0'; + sanitizeUtf8(user.short_name, sizeof(user.short_name)); user.hw_model = lite.hw_model; user.role = lite.role; user.is_licensed = lite.is_licensed; diff --git a/src/meshUtils.cpp b/src/meshUtils.cpp index 
/// Length in bytes (1-4) of the well-formed UTF-8 sequence that starts at `s`,
/// or 0 when the bytes at `s` do not begin a well-formed sequence.
///
/// `avail` is the number of readable bytes starting at `s`. A sequence that would
/// need more than `avail` bytes is reported as invalid (0), never read past.
/// Rejected inputs: bare continuation bytes (0x80-0xBF), lead bytes 0xF8 and above,
/// missing/non-continuation trailing bytes, overlong encodings, UTF-16 surrogate
/// halves (U+D800-U+DFFF) and codepoints above U+10FFFF.
static size_t utf8WellFormedLength(const char *s, size_t avail)
{
    if (avail == 0)
        return 0;

    const uint8_t lead = static_cast<uint8_t>(s[0]);
    if (lead <= 0x7F)
        return 1; // ASCII

    size_t seqLen;
    uint32_t codepoint;    // payload bits accumulated so far
    uint32_t minCodepoint; // smallest value that legitimately needs seqLen bytes
    if ((lead & 0xE0) == 0xC0) {
        seqLen = 2;
        codepoint = lead & 0x1F;
        minCodepoint = 0x80;
    } else if ((lead & 0xF0) == 0xE0) {
        seqLen = 3;
        codepoint = lead & 0x0F;
        minCodepoint = 0x800;
    } else if ((lead & 0xF8) == 0xF0) {
        seqLen = 4;
        codepoint = lead & 0x07;
        minCodepoint = 0x10000;
    } else {
        return 0; // continuation byte in lead position, or 0xF8+
    }

    if (seqLen > avail)
        return 0; // sequence is cut off by the end of the string

    for (size_t j = 1; j < seqLen; j++) {
        const uint8_t cont = static_cast<uint8_t>(s[j]);
        if ((cont & 0xC0) != 0x80)
            return 0; // continuation bytes must look like 10xxxxxx
        codepoint = (codepoint << 6) | (cont & 0x3F);
    }

    const bool overlong = codepoint < minCodepoint;
    const bool surrogate = codepoint >= 0xD800 && codepoint <= 0xDFFF;
    const bool outOfRange = codepoint > 0x10FFFF;
    return (overlong || surrogate || outOfRange) ? 0 : seqLen;
}

/// Sanitize a fixed-size char buffer in place so that it holds a NUL-terminated,
/// well-formed UTF-8 string.
///
/// The last byte of the buffer is always forced to '\0'. Every byte that does not
/// start a well-formed UTF-8 sequence is replaced by '?', one byte at a time, and
/// scanning resumes at the very next byte. Because of that, valid characters that
/// follow a stray or truncated lead byte are preserved (e.g. "ab\xF0" "cd" becomes
/// "ab?cd") and the string length never changes.
///
/// @param buf     buffer to sanitize; may be nullptr (nothing is done)
/// @param bufSize total size of `buf` in bytes, including room for the terminator
/// @return true if the buffer content was changed in any way: at least one byte was
///         replaced by '?', or the forced terminator overwrote a non-NUL byte.
///         false for nullptr / zero-sized buffers and for already-clean input.
bool sanitizeUtf8(char *buf, size_t bufSize)
{
    if (!buf || bufSize == 0)
        return false;

    // Forcing the terminator truncates unterminated input; report that as a change.
    bool modified = buf[bufSize - 1] != '\0';
    buf[bufSize - 1] = '\0';

    const size_t len = strlen(buf);
    size_t i = 0;
    while (i < len) {
        const size_t seqLen = utf8WellFormedLength(buf + i, len - i);
        if (seqLen > 0) {
            i += seqLen;
            continue;
        }

        // Replace only the offending byte. Any bytes after it are re-examined on
        // their own, so orphaned continuation bytes become '?' too while valid
        // ASCII / multi-byte characters survive.
        buf[i] = '?';
        modified = true;
        i++;
    }

    return modified;
}
sizeof(owner.short_name)); + owner.short_name[sizeof(owner.short_name) - 1] = '\0'; + sanitizeUtf8(owner.short_name, sizeof(owner.short_name)); owner.is_licensed = true; config.lora.override_duty_cycle = true; config.lora.tx_power = p.tx_power; diff --git a/test/test_utf8/test_main.cpp b/test/test_utf8/test_main.cpp new file mode 100644 index 000000000..7ac64653d --- /dev/null +++ b/test/test_utf8/test_main.cpp @@ -0,0 +1,195 @@ +#include "meshUtils.h" +#include +#include + +void setUp(void) {} +void tearDown(void) {} + +// --- Valid UTF-8 should pass through unchanged --- + +void test_ascii_unchanged() +{ + char buf[32] = "Hello World"; + TEST_ASSERT_FALSE(sanitizeUtf8(buf, sizeof(buf))); + TEST_ASSERT_EQUAL_STRING("Hello World", buf); +} + +void test_valid_2byte_unchanged() +{ + // "café" — é is C3 A9 + char buf[16] = "caf\xC3\xA9"; + TEST_ASSERT_FALSE(sanitizeUtf8(buf, sizeof(buf))); + TEST_ASSERT_EQUAL_STRING("caf\xC3\xA9", buf); +} + +void test_valid_3byte_unchanged() +{ + // "€" is E2 82 AC + char buf[16] = "\xE2\x82\xAC"; + TEST_ASSERT_FALSE(sanitizeUtf8(buf, sizeof(buf))); + TEST_ASSERT_EQUAL_STRING("\xE2\x82\xAC", buf); +} + +void test_valid_4byte_emoji_unchanged() +{ + // 🌙 is F0 9F 8C 99 + char buf[16] = "\xF0\x9F\x8C\x99"; + TEST_ASSERT_FALSE(sanitizeUtf8(buf, sizeof(buf))); + TEST_ASSERT_EQUAL_STRING("\xF0\x9F\x8C\x99", buf); +} + +void test_valid_mixed_unchanged() +{ + // "Hi 🌙!" 
— mix of ASCII and 4-byte + char buf[16] = "Hi \xF0\x9F\x8C\x99!"; + TEST_ASSERT_FALSE(sanitizeUtf8(buf, sizeof(buf))); + TEST_ASSERT_EQUAL_STRING("Hi \xF0\x9F\x8C\x99!", buf); +} + +void test_empty_string() +{ + char buf[8] = ""; + TEST_ASSERT_FALSE(sanitizeUtf8(buf, sizeof(buf))); + TEST_ASSERT_EQUAL_STRING("", buf); +} + +// --- Invalid sequences observed in the wild --- + +void test_truncated_4byte_at_end() +{ + // Name with valid emoji 🌙 followed by a truncated 4-byte sequence + ASCII + char buf[32] = "Lunar Tower \xF0\x9F\x8C\x99\xF0\x9F\x97" + "4"; + TEST_ASSERT_TRUE(sanitizeUtf8(buf, sizeof(buf))); + // The 🌙 should be preserved; F0 9F 97 is an incomplete 4-byte sequence, + // '4' (0x34) is not a valid continuation byte + TEST_ASSERT_EQUAL_STRING("Lunar Tower \xF0\x9F\x8C\x99???4", buf); +} + +void test_lone_lead_bytes_without_continuations() +{ + // Mixed ASCII with stray multibyte lead bytes (E1, F3) lacking proper continuations + char buf[32] = "Mesht\xE1\xF3tic 37e2"; + TEST_ASSERT_TRUE(sanitizeUtf8(buf, sizeof(buf))); + // E1 expects 2 continuation bytes, but F3 is not a continuation → E1 replaced + // F3 expects 3 continuation bytes, 't','i','c' are not continuations → F3 replaced + TEST_ASSERT_EQUAL_STRING("Mesht??tic 37e2", buf); +} + +// --- Edge cases --- + +void test_bare_continuation_byte() +{ + // 0x80 alone is invalid (continuation byte with no lead) + char buf[8] = "\x80"; + TEST_ASSERT_TRUE(sanitizeUtf8(buf, sizeof(buf))); + TEST_ASSERT_EQUAL_STRING("?", buf); +} + +void test_overlong_2byte() +{ + // C0 AF is an overlong encoding of U+002F '/' + char buf[8] = "\xC0\xAF"; + TEST_ASSERT_TRUE(sanitizeUtf8(buf, sizeof(buf))); + // C0 is a 2-byte lead, AF is valid continuation, but codepoint 0x2F < 0x80 → overlong + // C0 replaced, AF (now bare continuation) also replaced + TEST_ASSERT_EQUAL_STRING("??", buf); +} + +void test_surrogate_half() +{ + // ED A0 80 encodes U+D800 (surrogate half — invalid in UTF-8) + char buf[8] = "\xED\xA0\x80"; + 
TEST_ASSERT_TRUE(sanitizeUtf8(buf, sizeof(buf))); + TEST_ASSERT_EQUAL_STRING("???", buf); +} + +void test_5byte_sequence_rejected() +{ + // F8 80 80 80 80 — 5-byte sequence, not valid UTF-8 + char buf[8] = "\xF8\x80\x80\x80\x80"; + TEST_ASSERT_TRUE(sanitizeUtf8(buf, sizeof(buf))); + // F8 is invalid lead (>= 0xF8), each 0x80 is bare continuation + TEST_ASSERT_EQUAL_STRING("?????", buf); +} + +void test_truncated_3byte_at_buffer_end() +{ + // Buffer is exactly 4 bytes: E2 82 then forced null at [3] + char buf[4]; + buf[0] = '\xE2'; + buf[1] = '\x82'; + buf[2] = '\0'; // String ends before the 3-byte sequence completes + buf[3] = '\0'; + TEST_ASSERT_TRUE(sanitizeUtf8(buf, sizeof(buf))); + TEST_ASSERT_EQUAL_STRING("??", buf); +} + +void test_null_termination_enforced() +{ + // Fill buffer completely with no null terminator + char buf[5]; + memset(buf, 'A', sizeof(buf)); + TEST_ASSERT_TRUE(sanitizeUtf8(buf, sizeof(buf))); + // Should be null-terminated and content preserved (all ASCII) + TEST_ASSERT_EQUAL_STRING("AAAA", buf); +} + +void test_null_buffer() +{ + TEST_ASSERT_FALSE(sanitizeUtf8(nullptr, 10)); +} + +void test_zero_size() +{ + char buf[4] = "Hi"; + TEST_ASSERT_FALSE(sanitizeUtf8(buf, 0)); + // Buffer should be untouched + TEST_ASSERT_EQUAL_STRING("Hi", buf); +} + +void test_valid_max_codepoint() +{ + // U+10FFFF = F4 8F BF BF (maximum valid Unicode codepoint) + char buf[8] = "\xF4\x8F\xBF\xBF"; + TEST_ASSERT_FALSE(sanitizeUtf8(buf, sizeof(buf))); + TEST_ASSERT_EQUAL_STRING("\xF4\x8F\xBF\xBF", buf); +} + +void test_above_max_codepoint() +{ + // U+110000 = F4 90 80 80 (just above maximum valid Unicode) + char buf[8] = "\xF4\x90\x80\x80"; + TEST_ASSERT_TRUE(sanitizeUtf8(buf, sizeof(buf))); +} + +int main(int argc, char **argv) +{ + UNITY_BEGIN(); + + // Valid UTF-8 passthrough + RUN_TEST(test_ascii_unchanged); + RUN_TEST(test_valid_2byte_unchanged); + RUN_TEST(test_valid_3byte_unchanged); + RUN_TEST(test_valid_4byte_emoji_unchanged); + 
RUN_TEST(test_valid_mixed_unchanged); + RUN_TEST(test_empty_string); + + // Invalid sequences observed in the wild + RUN_TEST(test_truncated_4byte_at_end); + RUN_TEST(test_lone_lead_bytes_without_continuations); + + // Edge cases + RUN_TEST(test_bare_continuation_byte); + RUN_TEST(test_overlong_2byte); + RUN_TEST(test_surrogate_half); + RUN_TEST(test_5byte_sequence_rejected); + RUN_TEST(test_truncated_3byte_at_buffer_end); + RUN_TEST(test_null_termination_enforced); + RUN_TEST(test_null_buffer); + RUN_TEST(test_zero_size); + RUN_TEST(test_valid_max_codepoint); + RUN_TEST(test_above_max_codepoint); + + return UNITY_END(); +}