MuditaOS/module-utils/utf8/UTF8.hpp

// Copyright (c) 2017-2020, Mudita Sp. z.o.o. All rights reserved.
// For licensing, see https://github.com/mudita/MuditaOS/LICENSE.md

#pragma once

#include <string>
#include <cstdint>
#include <iosfwd> // for forward declaration for ostream
#include <memory>
#include <optional>

/// Single UTF8 character representation struct.
///
/// Stores the raw bytes of one encoded character in a fixed buffer of utf8_max_size bytes together with a size
/// field.
struct U8char
{
    /// creates an empty character: size is 0 and every byte of the utf8 buffer is zero
    U8char() = default;
    /// get UTF16 (or U+ ) value and store it as UTF8
    /// @note this is done for little endian
    U8char(uint32_t code);
    /// get UTF8 and store it
    /// @note unsafe
    U8char(char *val, unsigned int size);
    /// get UTF8 code from char* stream depending on size of data
    /// @note unsafe
    U8char(char *);

    /// size, in bytes, of the utf8 buffer
    static const unsigned int utf8_max_size = 4;

    /// raw bytes of the character; zero-initialized so that a default constructed object never exposes
    /// indeterminate bytes when it is copied, compared or printed
    unsigned char utf8[utf8_max_size] = {};

    /// NOTE(review): presumably the number of bytes of the utf8 buffer that are in use - confirm against the
    /// implementation
    uint32_t size = 0;

    /// NOTE(review): presumably the setter counterpart of U8char(char *val, unsigned int size) - confirm
    void set(char *val, unsigned int size);
    /// NOTE(review): presumably the setter counterpart of U8char(uint32_t code) - confirm
    void set(uint32_t code);
};

/// UTF8 encoded string stored in a heap buffer owned by the object.
///
/// Besides the buffer, the object keeps the allocated size, the number of used bytes and the number of characters,
/// plus a mutable record of the last indexed character (lastIndex / lastIndexData).
class UTF8
{
  protected:
    /// NOTE(review): allocated, used and len presumably correspond to sizeAllocated, sizeUsed and strLength -
    /// confirm against the implementation
    UTF8(const char *data, const uint32_t allocated, const uint32_t used, const uint32_t len);

    /// pointer to buffer
    std::unique_ptr<char[]> data;
    /// total size of buffer in bytes
    uint32_t sizeAllocated;
    /// number of bytes used in buffer
    uint32_t sizeUsed;
    /// number of characters in the string
    uint32_t strLength;
    /// last used index
    mutable uint32_t lastIndex;
    /// pointer to last indexed character
    mutable char *lastIndexData;

    /// variable used when c_str() is called for a string that has no data yet
    static const char *emptyString;
    /// holds number of bytes by which buffer will be expanded in case when current buffer can't hold new data.
    static const uint32_t stringExpansion;
    /**
     * @brief Calculates size of the buffer to store given number of data bytes.
     * @param dataBytes number of data bytes
     * @return Number of bytes needed to store provided data bytes size rounded up and rounded using stringExpansion
     * value.
     */
    uint32_t getDataBufferSize(uint32_t dataBytes);
    /// NOTE(review): presumably grows the buffer by the given number of bytes and reports success - confirm
    bool expand(uint32_t size = stringExpansion);

  public:
    UTF8();
    /// intentionally not explicit: allows implicit conversion from a C string
    UTF8(const char *str);
    /// intentionally not explicit: allows implicit conversion from std::string
    UTF8(const std::string &str);
    UTF8(const UTF8 &utf);
    UTF8(UTF8 &&utf);
    /// value returned by search functions when nothing is found, also used as the "not specified" index of the
    /// insert functions
    static const uint32_t npos;

    virtual ~UTF8() = default;

    /**
     * OPERATORS
     */

    UTF8 &operator=(const UTF8 &utf);
    UTF8 &operator=(UTF8 &&utf) noexcept;
    /// returns UTF16 value of character (this is - utf8 value encoded to utf16)
    /// for utf8 value please use getChar
    uint32_t operator[](const uint32_t &idx) const;
    UTF8 operator+(const UTF8 &utf) const;
    UTF8 &operator+=(const UTF8 &utf);
    bool operator==(const UTF8 &utf) const;
    /// negation of operator==
    bool operator!=(const UTF8 &utf) const
    {
        return !operator==(utf);
    }
    /// converts to std::string by copying the characters returned by c_str()
    operator std::string() const
    {
        return c_str();
    }
    friend std::ostream &operator<<(std::ostream &os, const UTF8 &el);

    /**
     * UTILITY FUNCTIONS
     */

    /// returns number of characters in the string
    uint32_t length() const
    {
        return strLength;
    }

    /// returns true when the string contains no characters
    bool empty() const noexcept
    {
        return strLength == 0U;
    }

    /// returns number of bytes used in buffer
    uint32_t used() const
    {
        return sizeUsed;
    }
    /// returns total size of buffer in bytes
    uint32_t allocated() const
    {
        return sizeAllocated;
    }
    const char *c_str() const;

    /// returns utf8 value on position, to get utf16 use operator[]
    U8char getChar(unsigned int pos);

    /**
     * @brief Removes all content from the string and reduce assigned memory to default value.
     */
    void clear();

    /**
     * @brief Creates substring from current string. New string starts from begin parameter and contains number of
     * characters passed by length.
     * @param begin Index of the first character in newly created string.
     * @param length Number of characters to copy.
     * @return substring created from source string.
     * @note In case of start index greater than length of source string or length that exceeds the character count,
     * empty string is returned.
     **/
    UTF8 substr(const uint32_t begin, const uint32_t length) const;

    /**
     * @brief Finds first occurrence of substring in string
     * @param s string to find
     * @param pos initial searching position
     * @return index of first matched string
     * @note returns npos when substring is not found
     */
    uint32_t find(const char *s, uint32_t pos = 0);
    /**
     * @brief Finds last occurrence of substring in string
     * @param s string to find
     * @param pos initial searching position
     * @return index of last matched string
     * @note returns npos when substring is not found.
     */
    uint32_t findLast(const char *s, uint32_t pos);
    /**
     * @brief splits UTF8 string into two strings.
     * @param idx index of character from which the division will be made.
     * @return newly created string, character under specified index will be first character in new string.
     * Returns empty string in case of invalid index.
     */
    UTF8 split(const uint32_t &idx);
    /**
     * @brief Creates substring from current string. New string is limited by '\r' or '\n'.
     * @return substring created from current string. Returns empty string in case of failure.
     */
    UTF8 getLine(void);
    /**
     * @brief Remove characters from string.
     * @param pos position of first char to remove.
     * @param count count of characters to remove.
     * @return true if there was no error, false otherwise
     */
    bool removeChar(const uint32_t &pos = 0, const uint32_t &count = 1);
    /**
     * @brief Inserts character into string on specified position. If position is not specified (UTF8::npos) char is
     * added at the end to current string.
     * @param charPtr pointer to the memory where UTF8 character is located.
     * @param index index in the current string where character should be inserted.
     * @return true if operation was successful, false otherwise.
     */
    bool insert(const char *charPtr, const uint32_t &index = UTF8::npos);
    /**
     * @brief Inserts character into string on specified position. If position is not specified (UTF8::npos) char is
     * added at the end to current string.
     * @param charCode code of the character to insert.
     * @param index index in the current string where character should be inserted.
     * @return true if operation was successful, false otherwise.
     */
    bool insertCode(const uint32_t &charCode, const uint32_t &index = UTF8::npos);
    /**
     * @brief Inserts string into current string on specified position. If position is not specified (UTF8::npos) is
     * appended at the end of the current string.
     * @param str String to be inserted into current object.
     * @param index index in the current string where character should be inserted.
     * @return true if operation was successful, false otherwise.
     */
    bool insertString(const UTF8 &str, const uint32_t &index = UTF8::npos);

    /**
     * @brief Check if string has only ASCII characters
     * @return true if there are only ASCII characters in string, false otherwise.
     * @note The check compares the number of used bytes, less one, with the number of characters.
     * NOTE(review): the subtracted byte is presumably the null terminator - confirm against the implementation.
     * @note Does not modify the object, so it can be called on const strings.
     */
    bool isAscii() const noexcept
    {
        return this->sizeUsed - 1 == this->strLength;
    }
    /**
     * @brief Encodes character with provided Unicode value as UTF8.
     * @param code Unicode of the character.
     * @param dest uint32 variable where encoded variable will be stored.
     * @param length number of useful bytes in dest variable.
     * @return True if encoding was successful, false otherwise
     * @note function returns false if character is within prohibited range - <U+D800, U+DFFF> or above value of
     * U+10FFFF.
     */
    static bool encode(const uint32_t &code, uint32_t &dest, uint32_t &length);

    /// get utf16_t value from utf8 character
    static uint32_t decode(const char *utf8_char, uint32_t &length);

    /**
     * @brief Calculates number of utf8 characters in provided stream
     */
    static uint32_t getCharactersCount(const char *stream);
    /**
     * @brief Calculates number of bytes and character IDs in the provided stream;
     * @param size Variable where number of bytes in the stream will be saved (till null terminator).
     * @param count Variable where number of characters in the stream will be saved.
     * @return true if there was no error, false otherwise.
     */
    static bool getStreamLength(const char *stream, uint32_t &size, uint32_t &count);
    /**
     * @brief Checks if numbers contained in the UTF8 creates ASCII character combination
     * eg. 778568738465 returns true
     * @return true if is ASCII combination, false otherwise
     */
    [[nodiscard]] bool isASCIICombination() const noexcept;
    /**
     * @brief Converts UTF8 to ASCII character combination
     * eg. 778568738465 returns "MUDITA"
     * @return ASCII converted string
     */
    [[nodiscard]] std::optional<std::string> toASCII() const noexcept;
};