Files
MuditaOS/module-utils/ucs2/UCS2.cpp
mkamonMdt 19a897cf65 [EGD-4604] Fix incorrect message UCS2 decode
The problem could be noticed in an incoming message that contained
emojis. All emojis were pushed to the front of the message, because
`uint32_t` chars 'larger' than `0xffff` were decoded separately.
2021-02-12 08:09:27 +01:00

191 lines
5.8 KiB
C++

// Copyright (c) 2017-2021, Mudita Sp. z.o.o. All rights reserved.
// For licensing, see https://github.com/mudita/MuditaOS/LICENSE.md
#include "UCS2.hpp"
#include <cstring>
#include <cstdint>
#include <sstream>
#include <iomanip>
#include "log/log.hpp"
#include <iterator>
#include <locale>
#include <codecvt>
namespace ucs2
{
    /// Number of uint32_t elements allocated for a fresh buffer and added on each buffer growth.
    constexpr uint32_t bufferExt{32};
} // namespace ucs2
/// Default constructor: records a capacity of ucs2::bufferExt and allocates a
/// zero-initialised buffer of that many uint32_t elements.
/// Members not listed here keep whatever in-class defaults the header gives them.
UCS2::UCS2()
    : sizeAllocated(ucs2::bufferExt),
      buffer(std::make_unique<uint32_t[]>(ucs2::bufferExt))
{}
/// Builds the object from a UTF8 string, element by element.
///
/// @param string source text; every value returned by its operator[] is passed through
///               convertFromUtf() and appended. No terminating zero is appended here.
UCS2::UCS2(const UTF8 &string)
{
    // No member initialisers are used by this constructor, so start from the cleared state.
    clear();

    std::size_t index = 0;
    while (index < string.length()) {
        append(convertFromUtf(string[index]));
        index++;
    }
}
/// Builds the object from a string of hexadecimal digits, four digits per 16-bit unit.
///
/// @param string hex text; trailing digits that do not fill a whole 4-digit unit are ignored.
///
/// Each unit is decoded with getUcs2Char(). When the returned value uses the upper 16 bits it
/// was assembled from two consecutive units, so the following unit is skipped.
/// A terminating 0 element is appended at the end.
UCS2::UCS2(const std::string &string)
    : sizeAllocated(ucs2::bufferExt), buffer(std::make_unique<uint32_t[]>(ucs2::bufferExt))
{
    constexpr uint8_t hexDigitsPerUnit = 4;
    constexpr uint32_t upperHalfMask   = 0xffff0000;

    std::size_t unit = 0;
    while (unit < string.length() / hexDigitsPerUnit) {
        const uint32_t decoded = getUcs2Char(string, unit);
        append(decoded);
        // 32 bit value consumed two units, 16 bit value consumed one
        unit += (decoded & upperHalfMask) ? 2 : 1;
    }

    // terminate ucs2 string by 0
    append(0);
}
/// Copy-like constructor: duplicates the length, used size, capacity and buffer content of ucs.
///
/// @param ucs object to copy from (taken by non-const reference).
///
/// NOTE(review): the buffer initialiser reads this object's sizeAllocated member, which is only
/// initialised at that point if sizeAllocated is declared before buffer in the class - confirm
/// against UCS2.hpp.
UCS2::UCS2(UCS2 &ucs)
    : length(ucs.getLength()),
      sizeUsed(ucs.getSizeUsed()),
      sizeAllocated(ucs.getSizeAlocated()),
      buffer(std::make_unique<uint32_t[]>(sizeAllocated))
{
    // sizeUsed is a byte count, which is what memcpy expects.
    std::memcpy(buffer.get(), ucs.buffer.get(), sizeUsed);
}
/// Converts the stored elements to a UTF8 object.
///
/// Elements above 0xffff are treated as a packed surrogate pair (high unit in the upper 16 bits,
/// low unit in the lower 16 bits) and are turned back into a single code point that is encoded
/// by convertToUtf8String(). All other elements are encoded here as 1, 2 or 3 UTF-8 bytes.
///
/// @return default-constructed UTF8 when length is 0, otherwise a UTF8 built from the encoded
///         bytes handed over as a C string pointer.
UTF8 UCS2::toUTF8() const noexcept
{
    if (length == 0) {
        return UTF8();
    }

    constexpr auto bytesPerDirectChar = 3;
    std::string utf8Bytes;
    utf8Bytes.reserve(bytesPerDirectChar * length + 1);

    // Appends the lowest 8 bits of value as one byte of output.
    const auto pushByte = [&utf8Bytes](uint32_t value) {
        utf8Bytes.push_back(static_cast<char>(static_cast<uint8_t>(value)));
    };

    for (uint32_t pos = 0; pos < length; ++pos) {
        const uint32_t unit = buffer[pos];

        if (unit > 0xffff) {
            // 32 bit conversion
            // U' = yyyyyyyyyyxxxxxxxxxx  // U - 0x10000
            // W1 = 110110yyyyyyyyyy      // 0xD800 + yyyyyyyyyy
            // W2 = 110111xxxxxxxxxx      // 0xDC00 + xxxxxxxxxx
            const uint16_t highBits  = (unit & 0x03FF0000) >> 16;
            const uint16_t lowBits   = unit & 0x03FF;
            const uint32_t codePoint = 0x10000 + (highBits << 10) + lowBits;
            utf8Bytes.append(convertToUtf8String({codePoint}));
        }
        else if (unit > 0x07ff) {
            // three byte sequence
            pushByte(0x00E0 | ((unit & 0xF000) >> 12));
            pushByte(0x0080 | ((unit & 0x0FC0) >> 6));
            pushByte(0x0080 | (unit & 0x003F));
        }
        else if (unit > 0x07f) {
            // two byte sequence
            pushByte(0x00C0 | ((unit & 0x07C0) >> 6));
            pushByte(0x0080 | (unit & 0x003F));
        }
        else {
            // single byte
            pushByte(unit);
        }
    }

    return UTF8(utf8Bytes.c_str());
}
/// Appends one element to the end of the buffer, growing the buffer when it is full.
///
/// @param ucs2char value to store; either a single 16-bit unit or a packed 32-bit pair.
///
/// Bookkeeping: length counts elements, sizeUsed counts bytes, and sizeAllocated is the number of
/// uint32_t elements the buffer was allocated with.
void UCS2::append(const uint32_t &ucs2char)
{
    // Take a copy first: the reference may point into the buffer that is replaced below.
    const uint32_t value = ucs2char;

    // Grow only when every allocated element is in use. Comparing the element count (not the
    // byte count) against the element capacity avoids growing when the buffer is a quarter full.
    if (length >= sizeAllocated) {
        const auto newCapacity = sizeAllocated + ucs2::bufferExt;
        auto newBuffer         = std::make_unique<uint32_t[]>(newCapacity);
        // memcpy takes a byte count, so scale the element count by the element size.
        memcpy(newBuffer.get(), buffer.get(), length * sizeof(uint32_t));
        buffer        = std::move(newBuffer);
        sizeAllocated = newCapacity;
    }

    // write the element at the end, then account for one more element and its size in bytes
    buffer[length] = value;
    length++;
    sizeUsed += sizeof(value);
}
/// Renders the stored elements as upper-case hexadecimal text.
///
/// @return concatenation of all elements, each zero-padded to at least four hex digits
///         (elements wider than 16 bits print with as many digits as they need).
std::string UCS2::str() const noexcept
{
    std::ostringstream hexText;
    // These flags persist on the stream; only the field width has to be set per value.
    hexText << std::hex << std::uppercase << std::setfill('0');

    for (uint32_t pos = 0; pos < length; ++pos) {
        hexText << std::setw(4) << buffer[pos];
    }
    return hexText.str();
}
/// Resets the object to its empty state: zero elements, zero bytes used and a freshly
/// allocated buffer of ucs2::bufferExt elements (the previous buffer is released).
void UCS2::clear()
{
    sizeAllocated = ucs2::bufferExt;
    sizeUsed      = 0;
    buffer        = std::make_unique<uint32_t[]>(ucs2::bufferExt);
    length        = 0;
}
/// Converts a code point to the representation stored in the buffer.
///
/// @param utfChar code point to convert.
/// @return utfChar unchanged when it fits in 16 bits; otherwise the UTF-16 surrogate pair packed
///         into one value, high surrogate in the upper 16 bits and low surrogate in the lower 16.
uint32_t UCS2::convertFromUtf(uint32_t utfChar) const noexcept
{
    constexpr uint32_t supplementaryBase = 0x10000;
    constexpr uint32_t tenBitMask        = 0x3FF;

    if (utfChar < supplementaryBase) {
        return utfChar;
    }

    // 32 bit conversion
    // U' = yyyyyyyyyyxxxxxxxxxx  // U - 0x10000
    // W1 = 110110yyyyyyyyyy      // 0xD800 + yyyyyyyyyy
    // W2 = 110111xxxxxxxxxx      // 0xDC00 + xxxxxxxxxx
    const uint32_t offset = utfChar - supplementaryBase;
    // All ten upper bits of U' are needed; masking also keeps out-of-range input inside the
    // high-surrogate block instead of corrupting the tag bits.
    const uint32_t high = 0xD800 + ((offset >> 10) & tenBitMask);
    const uint32_t low  = 0xDC00 + (offset & tenBitMask);

    // Unsigned arithmetic throughout, so the shift never touches a sign bit.
    return (high << 16) | low;
}
/// Encodes a UTF-32 string as UTF-8 bytes using std::wstring_convert with std::codecvt_utf8.
///
/// @param s UTF-32 text to encode.
/// @return the UTF-8 encoded bytes.
inline std::string UCS2::convertToUtf8String(const std::u32string &s) const
{
    using Utf32ToUtf8 = std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t>;
    return Utf32ToUtf8{}.to_bytes(s);
}
/// Decodes the i-th 4-digit hexadecimal unit of string.
///
/// @param string text made of hexadecimal digits, four per 16-bit unit.
/// @param i      index of the unit to decode (unit i starts at character i * 4).
/// @return the 16-bit value of the unit, or - when the unit is a UTF-16 high surrogate that is
///         directly followed by a low surrogate - both units packed into one 32-bit value
///         (high surrogate in the upper 16 bits). The caller is expected to skip the next unit
///         in that case.
///
/// On a parsing failure the object is reset with clear(), an error is logged, and whatever had
/// been decoded for this unit so far (0 if nothing) is returned.
uint32_t UCS2::getUcs2Char(const std::string &string, const std::size_t &i)
{
    constexpr std::size_t hexDigitsPerUnit = 4;
    constexpr int base                     = 16;
    // The top six bits of a unit identify a surrogate: 110110.. is high, 110111.. is low.
    constexpr uint32_t surrogateTagMask = 0xFC00;
    constexpr uint32_t highSurrogateTag = 0xD800;
    constexpr uint32_t lowSurrogateTag  = 0xDC00;

    uint32_t ucs2char = 0;
    try {
        ucs2char = std::stoi(string.substr(i * hexDigitsPerUnit, hexDigitsPerUnit), nullptr, base);

        // Only a high surrogate can start a pair, and only if another whole unit follows.
        const std::size_t unitCount = string.length() / hexDigitsPerUnit;
        if (i + 1 < unitCount && (ucs2char & surrogateTagMask) == highSurrogateTag) {
            const uint32_t nextUcs2char =
                std::stoi(string.substr((i + 1) * hexDigitsPerUnit, hexDigitsPerUnit), nullptr, base);
            // Compare against the exact tag. A plain bitwise-AND truthiness test would also match
            // ordinary characters that merely share some bits with the surrogate range.
            if ((nextUcs2char & surrogateTagMask) == lowSurrogateTag) {
                // 32 bit
                ucs2char = (ucs2char << 16) | nextUcs2char;
            }
        }
    }
    catch (const std::invalid_argument &e) {
        clear();
        LOG_ERROR("UCS2::UCS2(const std::string& string) failed. Invalid argument.");
    }
    catch (const std::out_of_range &e) {
        clear();
        LOG_ERROR("UCS2::UCS2(const std::string& string) failed. Out of range.");
    }
    return ucs2char;
}