Sane sanitization

This commit is contained in:
Ben Meadors
2026-04-23 14:19:33 -05:00
parent 031f332ec1
commit 2cc13a1132
6 changed files with 302 additions and 0 deletions

View File

@@ -29,6 +29,7 @@ build_flags = -Wno-missing-field-initializers
-DUSE_THREAD_NAMES
-DTINYGPS_OPTION_NO_CUSTOM_FIELDS
-DPB_ENABLE_MALLOC=1
-DPB_VALIDATE_UTF8=1
-DRADIOLIB_EXCLUDE_CC1101=1
-DRADIOLIB_EXCLUDE_NRF24=1
-DRADIOLIB_EXCLUDE_RF69=1

View File

@@ -1,6 +1,7 @@
#include "TypeConversions.h"
#include "mesh/generated/meshtastic/deviceonly.pb.h"
#include "mesh/generated/meshtastic/mesh.pb.h"
#include "meshUtils.h"
meshtastic_NodeInfo TypeConversions::ConvertToNodeInfo(const meshtastic_NodeInfoLite *lite)
{
@@ -82,8 +83,10 @@ meshtastic_UserLite TypeConversions::ConvertToUserLite(meshtastic_User user)
strncpy(lite.long_name, user.long_name, sizeof(lite.long_name));
lite.long_name[sizeof(lite.long_name) - 1] = '\0';
sanitizeUtf8(lite.long_name, sizeof(lite.long_name));
strncpy(lite.short_name, user.short_name, sizeof(lite.short_name));
lite.short_name[sizeof(lite.short_name) - 1] = '\0';
sanitizeUtf8(lite.short_name, sizeof(lite.short_name));
lite.hw_model = user.hw_model;
lite.role = user.role;
lite.is_licensed = user.is_licensed;
@@ -102,8 +105,10 @@ meshtastic_User TypeConversions::ConvertToUser(uint32_t nodeNum, meshtastic_User
snprintf(user.id, sizeof(user.id), "!%08x", nodeNum);
strncpy(user.long_name, lite.long_name, sizeof(user.long_name));
user.long_name[sizeof(user.long_name) - 1] = '\0';
sanitizeUtf8(user.long_name, sizeof(user.long_name));
strncpy(user.short_name, lite.short_name, sizeof(user.short_name));
user.short_name[sizeof(user.short_name) - 1] = '\0';
sanitizeUtf8(user.short_name, sizeof(user.short_name));
user.hw_model = lite.hw_model;
user.role = lite.role;
user.is_licensed = lite.is_licensed;

View File

@@ -117,4 +117,93 @@ size_t pb_string_length(const char *str, size_t max_len)
}
}
return len;
}
/**
 * Sanitize a fixed-size char buffer in place so that it holds a NUL-terminated,
 * well-formed UTF-8 string.
 *
 * Every byte that cannot be part of a well-formed sequence is overwritten with '?'.
 * The string length never changes (replacement is strictly byte-for-byte), except
 * that a buffer containing no NUL at all loses its final byte to the terminator.
 *
 * A sequence is rejected when it has an invalid lead byte (0x80-0xBF, 0xF8+), is cut
 * short by the end of the string, has a non-continuation byte where 10xxxxxx is
 * required, is an overlong encoding, encodes a surrogate half (U+D800..U+DFFF), or
 * encodes a value above U+10FFFF.
 *
 * @param buf      Buffer to sanitize; may be nullptr.
 * @param bufSize  Total size of buf in bytes, including room for the terminator.
 * @return true if the content was modified (a byte was replaced with '?', or an
 *         unterminated buffer had to be truncated); false otherwise, including when
 *         buf is nullptr or bufSize is 0.
 */
bool sanitizeUtf8(char *buf, size_t bufSize)
{
    if (!buf || bufSize == 0)
        return false;

    // A buffer with no NUL anywhere is about to lose its last byte to the terminator;
    // that is a content change, so report it to the caller.
    bool replaced = (memchr(buf, '\0', bufSize) == nullptr);

    // Ensure null-terminated within buffer
    buf[bufSize - 1] = '\0';

    const size_t len = strlen(buf);
    size_t i = 0;
    while (i < len) {
        const uint8_t lead = static_cast<uint8_t>(buf[i]);

        if (lead <= 0x7F) {
            // ASCII - valid single byte
            i++;
            continue;
        }

        // Classify the lead byte. seqLen stays 0 for an invalid lead (0x80-0xBF or 0xF8+).
        size_t seqLen = 0;
        uint32_t minCodepoint = 0; // Smallest codepoint that legitimately needs seqLen bytes
        uint32_t cp = 0;           // Payload bits of the lead byte; continuation bits are appended below
        if ((lead & 0xE0) == 0xC0) {
            seqLen = 2;
            minCodepoint = 0x80;
            cp = lead & 0x1F;
        } else if ((lead & 0xF0) == 0xE0) {
            seqLen = 3;
            minCodepoint = 0x800;
            cp = lead & 0x0F;
        } else if ((lead & 0xF8) == 0xF0) {
            seqLen = 4;
            minCodepoint = 0x10000;
            cp = lead & 0x07;
        }

        // The sequence must fit inside the string. A sequence that runs off the end is
        // handled like any other malformed one (only its lead byte is replaced), so
        // valid ASCII or shorter valid sequences that follow the stray lead survive.
        bool valid = (seqLen != 0) && (seqLen <= len - i);

        // Validate continuation bytes (must be 10xxxxxx) while decoding the codepoint
        for (size_t j = 1; valid && j < seqLen; j++) {
            const uint8_t cont = static_cast<uint8_t>(buf[i + j]);
            if ((cont & 0xC0) != 0x80)
                valid = false;
            else
                cp = (cp << 6) | (cont & 0x3F);
        }

        // Reject overlong encodings, values beyond Unicode range, and surrogate halves
        if (valid && (cp < minCodepoint || cp > 0x10FFFF || (cp >= 0xD800 && cp <= 0xDFFF)))
            valid = false;

        if (valid) {
            i += seqLen;
        } else {
            // Replace only the offending byte; whatever follows is re-examined on the
            // next iteration (orphaned continuation bytes get replaced there).
            buf[i] = '?';
            replaced = true;
            i++;
        }
    }
    return replaced;
}

View File

@@ -38,6 +38,10 @@ const std::string vformat(const char *const zcFormat, ...);
// Get actual string length for nanopb char array fields.
size_t pb_string_length(const char *str, size_t max_len);
// Sanitize a fixed-size char buffer in-place by replacing invalid UTF-8 sequences with '?'.
// Ensures the result is null-terminated within bufSize. Returns true if any bytes were replaced.
bool sanitizeUtf8(char *buf, size_t bufSize);
/// Calculate 2^n without calling pow() - used for spreading factor and other calculations
inline uint32_t pow_of_2(uint32_t n)
{

View File

@@ -599,10 +599,14 @@ void AdminModule::handleSetOwner(const meshtastic_User &o)
if (*o.long_name) {
changed |= strcmp(owner.long_name, o.long_name);
strncpy(owner.long_name, o.long_name, sizeof(owner.long_name));
owner.long_name[sizeof(owner.long_name) - 1] = '\0';
sanitizeUtf8(owner.long_name, sizeof(owner.long_name));
}
if (*o.short_name) {
changed |= strcmp(owner.short_name, o.short_name);
strncpy(owner.short_name, o.short_name, sizeof(owner.short_name));
owner.short_name[sizeof(owner.short_name) - 1] = '\0';
sanitizeUtf8(owner.short_name, sizeof(owner.short_name));
}
snprintf(owner.id, sizeof(owner.id), "!%08x", nodeDB->getNodeNum());
@@ -1400,7 +1404,11 @@ void AdminModule::handleSetHamMode(const meshtastic_HamParameters &p)
// Set call sign and override lora limitations for licensed use
strncpy(owner.long_name, p.call_sign, sizeof(owner.long_name));
owner.long_name[sizeof(owner.long_name) - 1] = '\0';
sanitizeUtf8(owner.long_name, sizeof(owner.long_name));
strncpy(owner.short_name, p.short_name, sizeof(owner.short_name));
owner.short_name[sizeof(owner.short_name) - 1] = '\0';
sanitizeUtf8(owner.short_name, sizeof(owner.short_name));
owner.is_licensed = true;
config.lora.override_duty_cycle = true;
config.lora.tx_power = p.tx_power;

View File

@@ -0,0 +1,195 @@
#include "meshUtils.h"
#include <cstring>
#include <unity.h>
// Per-test fixture hook; these tests need no setup, so it is intentionally empty.
void setUp()
{
}
// Per-test fixture hook; these tests need no cleanup, so it is intentionally empty.
void tearDown()
{
}
// --- Valid UTF-8 should pass through unchanged ---
// Pure 7-bit ASCII must pass through untouched and report no replacement.
void test_ascii_unchanged()
{
    const char *const expected = "Hello World";
    char name[32] = "Hello World";

    const bool modified = sanitizeUtf8(name, sizeof(name));

    TEST_ASSERT_FALSE(modified);
    TEST_ASSERT_EQUAL_STRING(expected, name);
}
// A well-formed 2-byte sequence ("café": é = C3 A9) must be left alone.
void test_valid_2byte_unchanged()
{
    const char *const expected = "caf\xC3\xA9";
    char name[16] = "caf\xC3\xA9";

    const bool modified = sanitizeUtf8(name, sizeof(name));

    TEST_ASSERT_FALSE(modified);
    TEST_ASSERT_EQUAL_STRING(expected, name);
}
// A well-formed 3-byte sequence ("€" = E2 82 AC) must be left alone.
void test_valid_3byte_unchanged()
{
    const char *const expected = "\xE2\x82\xAC";
    char name[16] = "\xE2\x82\xAC";

    const bool modified = sanitizeUtf8(name, sizeof(name));

    TEST_ASSERT_FALSE(modified);
    TEST_ASSERT_EQUAL_STRING(expected, name);
}
// A well-formed 4-byte sequence (🌙 = F0 9F 8C 99) must be left alone.
void test_valid_4byte_emoji_unchanged()
{
    const char *const expected = "\xF0\x9F\x8C\x99";
    char name[16] = "\xF0\x9F\x8C\x99";

    const bool modified = sanitizeUtf8(name, sizeof(name));

    TEST_ASSERT_FALSE(modified);
    TEST_ASSERT_EQUAL_STRING(expected, name);
}
// ASCII interleaved with a 4-byte emoji ("Hi 🌙!") is valid and must be left alone.
void test_valid_mixed_unchanged()
{
    const char *const expected = "Hi \xF0\x9F\x8C\x99!";
    char name[16] = "Hi \xF0\x9F\x8C\x99!";

    const bool modified = sanitizeUtf8(name, sizeof(name));

    TEST_ASSERT_FALSE(modified);
    TEST_ASSERT_EQUAL_STRING(expected, name);
}
// An empty string has nothing to replace and must stay empty.
void test_empty_string()
{
    char name[8] = "";

    const bool modified = sanitizeUtf8(name, sizeof(name));

    TEST_ASSERT_FALSE(modified);
    TEST_ASSERT_EQUAL_STRING("", name);
}
// --- Invalid sequences observed in the wild ---
// A valid emoji followed by an incomplete 4-byte sequence and an ASCII character:
// the emoji and the trailing '4' must survive, the three stray bytes become '?'.
void test_truncated_4byte_at_end()
{
    // 🌙 (F0 9F 8C 99), then F0 9F 97 whose fourth byte is '4' (0x34) —
    // not a continuation byte, so the sequence is malformed.
    char name[32] = "Lunar Tower \xF0\x9F\x8C\x99\xF0\x9F\x97"
                    "4";
    const char *const expected = "Lunar Tower \xF0\x9F\x8C\x99???4";

    const bool modified = sanitizeUtf8(name, sizeof(name));

    TEST_ASSERT_TRUE(modified);
    TEST_ASSERT_EQUAL_STRING(expected, name);
}
// Stray multibyte lead bytes (E1, F3) embedded in ASCII, none followed by real
// continuation bytes: each lead becomes '?' and the surrounding ASCII is kept.
void test_lone_lead_bytes_without_continuations()
{
    char name[32] = "Mesht\xE1\xF3tic 37e2";
    // E1 wants two continuation bytes but is followed by F3 -> replaced.
    // F3 wants three continuation bytes but is followed by 't','i','c' -> replaced.
    const char *const expected = "Mesht??tic 37e2";

    const bool modified = sanitizeUtf8(name, sizeof(name));

    TEST_ASSERT_TRUE(modified);
    TEST_ASSERT_EQUAL_STRING(expected, name);
}
// --- Edge cases ---
// A continuation byte (0x80) with no preceding lead byte is invalid on its own.
void test_bare_continuation_byte()
{
    char name[8] = "\x80";

    const bool modified = sanitizeUtf8(name, sizeof(name));

    TEST_ASSERT_TRUE(modified);
    TEST_ASSERT_EQUAL_STRING("?", name);
}
// C0 AF is an overlong 2-byte encoding of U+002F '/', which must be rejected.
void test_overlong_2byte()
{
    char name[8] = "\xC0\xAF";

    const bool modified = sanitizeUtf8(name, sizeof(name));

    TEST_ASSERT_TRUE(modified);
    // The sequence is structurally fine (lead + continuation) but decodes to 0x2F < 0x80.
    // C0 is replaced first; AF is then a bare continuation byte and is replaced as well.
    TEST_ASSERT_EQUAL_STRING("??", name);
}
// ED A0 80 decodes to U+D800, a surrogate half, which is not allowed in UTF-8.
void test_surrogate_half()
{
    char name[8] = "\xED\xA0\x80";

    const bool modified = sanitizeUtf8(name, sizeof(name));

    TEST_ASSERT_TRUE(modified);
    TEST_ASSERT_EQUAL_STRING("???", name);
}
// F8 80 80 80 80 is a legacy 5-byte form that is not valid UTF-8.
void test_5byte_sequence_rejected()
{
    char name[8] = "\xF8\x80\x80\x80\x80";

    const bool modified = sanitizeUtf8(name, sizeof(name));

    TEST_ASSERT_TRUE(modified);
    // F8 is not a valid lead byte, and each following 0x80 is a bare continuation byte.
    TEST_ASSERT_EQUAL_STRING("?????", name);
}
// A 3-byte sequence (E2 82 ..) whose final byte is missing because the string ends.
void test_truncated_3byte_at_buffer_end()
{
    // Exactly 4 bytes: E2 82, then the terminator before the sequence completes,
    // and the last slot which sanitizeUtf8 forces to NUL anyway.
    char name[4] = {'\xE2', '\x82', '\0', '\0'};

    const bool modified = sanitizeUtf8(name, sizeof(name));

    TEST_ASSERT_TRUE(modified);
    TEST_ASSERT_EQUAL_STRING("??", name);
}
// A buffer with no terminator at all must come back NUL-terminated, and the call
// is expected to report a modification because the final byte is overwritten.
void test_null_termination_enforced()
{
    // Every slot holds 'A'; there is no NUL anywhere in the buffer.
    char name[5] = {'A', 'A', 'A', 'A', 'A'};

    const bool modified = sanitizeUtf8(name, sizeof(name));

    TEST_ASSERT_TRUE(modified);
    // The remaining content is all ASCII and must be preserved.
    TEST_ASSERT_EQUAL_STRING("AAAA", name);
}
// A null buffer pointer must be tolerated and reported as "nothing replaced".
void test_null_buffer()
{
    const bool modified = sanitizeUtf8(nullptr, 10);

    TEST_ASSERT_FALSE(modified);
}
// A zero-length buffer size means the function must not touch the buffer at all.
void test_zero_size()
{
    char name[4] = "Hi";

    const bool modified = sanitizeUtf8(name, 0);

    TEST_ASSERT_FALSE(modified);
    TEST_ASSERT_EQUAL_STRING("Hi", name);
}
// U+10FFFF (F4 8F BF BF) is the highest valid Unicode codepoint and must be accepted.
void test_valid_max_codepoint()
{
    const char *const expected = "\xF4\x8F\xBF\xBF";
    char name[8] = "\xF4\x8F\xBF\xBF";

    const bool modified = sanitizeUtf8(name, sizeof(name));

    TEST_ASSERT_FALSE(modified);
    TEST_ASSERT_EQUAL_STRING(expected, name);
}
void test_above_max_codepoint()
{
// U+110000 = F4 90 80 80 (just above maximum valid Unicode)
char buf[8] = "\xF4\x90\x80\x80";
TEST_ASSERT_TRUE(sanitizeUtf8(buf, sizeof(buf)));
}
// Test runner entry point: registers every sanitizeUtf8 test with Unity in a fixed
// order and returns whatever UNITY_END() reports. argc/argv are not used.
int main(int argc, char **argv)
{
UNITY_BEGIN();
// Valid UTF-8 passthrough
RUN_TEST(test_ascii_unchanged);
RUN_TEST(test_valid_2byte_unchanged);
RUN_TEST(test_valid_3byte_unchanged);
RUN_TEST(test_valid_4byte_emoji_unchanged);
RUN_TEST(test_valid_mixed_unchanged);
RUN_TEST(test_empty_string);
// Invalid sequences observed in the wild
RUN_TEST(test_truncated_4byte_at_end);
RUN_TEST(test_lone_lead_bytes_without_continuations);
// Edge cases
RUN_TEST(test_bare_continuation_byte);
RUN_TEST(test_overlong_2byte);
RUN_TEST(test_surrogate_half);
RUN_TEST(test_5byte_sequence_rejected);
RUN_TEST(test_truncated_3byte_at_buffer_end);
RUN_TEST(test_null_termination_enforced);
RUN_TEST(test_null_buffer);
RUN_TEST(test_zero_size);
RUN_TEST(test_valid_max_codepoint);
RUN_TEST(test_above_max_codepoint);
// NOTE(review): Unity documents UNITY_END() as returning the failure count, which
// makes a non-zero exit status mean "at least one test failed" — confirm against the Unity version in use.
return UNITY_END();
}