/*
    This file is part of Konsole, a terminal emulator for KDE.

    SPDX-FileCopyrightText: 2018 Mariusz Glebocki

    SPDX-License-Identifier: GPL-2.0-or-later
*/

#include "template.h"

#include <QCommandLineParser>
#include <QCoreApplication>
#include <QEventLoop>
#include <QFile>
#include <QFileInfo>
#include <QLoggingCategory>
#include <QMap>
#include <QRegularExpression>
#include <QRegularExpressionMatch>
#include <QString>
#include <QTextStream>

#include <KIO/Job>

static constexpr unsigned int CODE_POINTS_NUM = 0x110000;
static constexpr unsigned int LAST_CODE_POINT = CODE_POINTS_NUM - 1;

/**
 * One parsed data line: the code point range it applies to and its
 * semicolon-separated data fields (the code point column is not a field).
 */
struct UcdEntry {
    struct {
        uint first;
        uint last;
    } cp;
    QStringList fields;
};

/**
 * Line-oriented parser for "cp[..cp] ; field ; field # comment" files.
 *
 * The parser writes each parsed line into the UcdEntry supplied by the derived
 * class. The source device is closed when the parser is destroyed.
 */
class UcdParserBase
{
public:
    ~UcdParserBase()
    {
        _source->close();
    }

    /**
     * Fetches the next entry unless one is already pending and returns the
     * value _hasNext had before this call.
     */
    bool hasNext()
    {
        bool hadNext = _hasNext;
        if (!_nextFetched) {
            _hasNext = fetchNext();
            _nextFetched = true;
        }
        return hadNext;
    }

protected:
    UcdParserBase(QIODevice *source, UcdEntry *entry)
        : _source(source)
        , _nextFetched(false)
        , _hasNext(true)
        , _lineNo(0)
        , _entry(entry)
    {
        Q_ASSERT(_source);
        Q_ASSERT(_entry);
    }

    /**
     * Reads lines until one parses as an entry and stores it in *_entry.
     *
     * Lines that do not match are skipped (and reported via qDebug unless they
     * are blank or comment-only). A UnicodeData.txt "<..., First>" line is
     * paired with the next matching line, whose cp1 becomes the range end.
     *
     * @return false if the source is closed or no entry could be read;
     *         otherwise whether the source still has unread data.
     */
    bool fetchNext()
    {
        Q_ASSERT(_source->isOpen());
        if (!_source->isOpen())
            return false;

        static const QRegularExpression ENTRY_RE = QRegularExpression(QStringLiteral(
            // Match 1: "cp1" - first CP / "cp2" (optional) - last CP
            R"#((?:^(?<cp1>[[:xdigit:]]+)(?:\.\.(?<cp2>[[:xdigit:]]+))?[ \t]*;)#"
            // Match 1: "field0" - first data field"
            //          "udRangeInd" (UnicodeData.txt only) - if present, the line is either first or last line of a range
            R"#([ \t]*(?<field0>[^#;\n]*?(?:, (?<udRangeInd>First|Last)>)?)[ \t]*(?:;|(?:\#.*)?$))|)#"
            // Match 2..n: "field" - n-th field
            R"#((?:\G(?<=;)[ \t]*(?<field>[^#;\n]*?)[ \t]*(?:;|(?:#.*)?$)))#"));
        static const QRegularExpression UD_RANGE_IND_RE(QStringLiteral(", (First|Last)"));
        static const QRegularExpression COMMENT_RE(QStringLiteral("^[ \t]*(#.*)?$"));

        QString line;
        bool ok;
        _entry->fields.clear();
        while (!_source->atEnd()) {
            line = QString::fromUtf8(_source->readLine());
            _lineNo++;
            auto mit = ENTRY_RE.globalMatch(line);
            if (!mit.hasNext()) {
                // Do not complain about comments and empty lines
                if (!COMMENT_RE.match(line).hasMatch())
                    qDebug() << QStringLiteral("Line %1: does not match - skipping").arg(_lineNo);
                continue;
            }

            auto match = mit.next();
            _entry->cp.first = match.captured(QStringLiteral("cp1")).toUInt(&ok, 16);
            if (!ok) {
                // QString::arg() substitutes %1-style markers, not printf-style %d
                qDebug() << QStringLiteral("Line %1: Invalid cp1 - skipping").arg(_lineNo);
                continue;
            }
            _entry->cp.last = match.captured(QStringLiteral("cp2")).toUInt(&ok, 16);
            if (!ok) {
                // No (valid) cp2: the entry describes a single code point
                _entry->cp.last = _entry->cp.first;
            }
            QString field0 = match.captured(QStringLiteral("field0"));
            if (field0.isNull()) {
                qDebug() << QStringLiteral("Line %1: Missing field0 - skipping").arg(_lineNo);
                continue;
            }
            if (!match.captured(QStringLiteral("udRangeInd")).isNull()) {
                if (match.captured(QStringLiteral("udRangeInd")) == QStringLiteral("First")) {
                    // Fetch next valid line, as it pairs with the current one to form a range
                    QRegularExpressionMatch nlMatch;
                    int firstLineNo = _lineNo;
                    while (!_source->atEnd() && !nlMatch.hasMatch()) {
                        line = QString::fromUtf8(_source->readLine());
                        _lineNo++;
                        nlMatch = ENTRY_RE.match(line);
                        if (!nlMatch.hasMatch()) {
                            qDebug() << QStringLiteral("Line %1: does not match - skipping").arg(_lineNo);
                        }
                    }
                    if (nlMatch.hasMatch()) {
                        _entry->cp.last = nlMatch.captured(QStringLiteral("cp1")).toUInt(&ok, 16);
                        if (!ok) {
                            qDebug() << QStringLiteral("Line %1-%2: Missing or invalid second cp1 (\"Last\" entry) - skipping").arg(firstLineNo).arg(_lineNo);
                            continue;
                        }
                    }
                }
                field0.remove(UD_RANGE_IND_RE);
            }
            _entry->fields.append(field0);

            while (mit.hasNext()) {
                _entry->fields.append(mit.next().captured(QStringLiteral("field")));
            }

            return !_source->atEnd();
        }
        return false;
    }

    QIODevice *_source;
    bool _nextFetched;
    bool _hasNext;

private:
    int _lineNo;
    UcdEntry *_entry;
};

/**
 * Typed front end of UcdParserBase: owns the entry object the base class
 * fills and hands it out as EntryType.
 */
template<class EntryType>
class UcdParser : public UcdParserBase
{
public:
    static_assert(std::is_base_of<UcdEntry, EntryType>::value, "'EntryType' has to be derived from UcdParser::Entry");

    explicit UcdParser(QIODevice *source)
        : UcdParserBase(source, &_typedEntry)
    {
    }

    /**
     * Returns the current entry, fetching one first if none is pending.
     * The reference stays bound to the same object, which is overwritten by
     * later fetches.
     */
    inline const EntryType &next()
    {
        if (!_nextFetched)
            fetchNext();
        _nextFetched = false;
        return _typedEntry;
    }

private:
    EntryType _typedEntry;
};

/**
 * Read-only QIODevice that downloads a URL with KIO::storedGet() when opened
 * and then serves the downloaded bytes from memory.
 */
class KIODevice : public QIODevice
{
public:
    enum Error {
        NoError,
        UnknownError,
        TimeoutError,
        UnknownHostError,
        MalformedUrlError,
        NotFoundError,
    };

    explicit KIODevice(const QUrl &url)
        : _url(url)
        , _job(nullptr)
        , _error(NoError)
    {
    }

    ~KIODevice()
    {
        close();
    }

    /**
     * Starts the download and blocks in a local event loop until the job
     * reports its result.
     *
     * @return true when the data was fetched; false if a job already exists
     *         or the transfer failed (see error() and errorString()).
     */
    bool open()
    {
        if (_job)
            return false;

        _job = KIO::storedGet(_url);
        QObject::connect(_job, &KIO::StoredTransferJob::result, _job, [&](KJob *) {
            // Leave the loop exactly once, with the code that describes the outcome
            if (_job->isErrorPage()) {
                _eventLoop.exit(KIO::ERR_DOES_NOT_EXIST);
            } else if (_job->error() != KJob::NoError) {
                _eventLoop.exit(_job->error());
            } else {
                _data = _job->data();
                _eventLoop.exit(KJob::NoError);
            }
        });

        // The loop's exit code, not _job->error(), carries the "error page" case
        const int result = _eventLoop.exec();
        switch (result) {
        case KJob::NoError:
            _error = NoError;
            setErrorString(QStringLiteral(""));
            QIODevice::open(QIODevice::ReadOnly | QIODevice::Unbuffered);
            break;
        case KJob::KilledJobError:
            _error = TimeoutError;
            break;
        case KIO::ERR_UNKNOWN_HOST:
            _error = UnknownHostError;
            break;
        case KIO::ERR_DOES_NOT_EXIST:
            _error = NotFoundError;
            break;
        case KIO::ERR_MALFORMED_URL:
            _error = MalformedUrlError;
            break;
        default:
            _error = UnknownError;
            break;
        }
        if (_error != NoError) {
            setErrorString(QStringLiteral("KIO: ") + _job->errorString());
            delete _job;
            _job = nullptr;
            _data.clear();
        }
        return _error == NoError;
    }

    /** QIODevice entry point; only QIODevice::ReadOnly is supported. */
    bool open(OpenMode mode) override
    {
        Q_ASSERT(mode == QIODevice::ReadOnly);
        Q_UNUSED(mode);
        return open();
    }

    /** Deletes the job, drops the downloaded data and resets the error state. */
    void close() override
    {
        if (_job) {
            delete _job;
            _job = nullptr;
            _error = NoError;
            setErrorString(QStringLiteral(""));
            _data.clear();
            QIODevice::close();
        }
    }

    qint64 size() const override
    {
        return _data.size();
    }

    int error() const
    {
        return _error;
    }
    void unsetError()
    {
        _error = NoError;
    }

protected:
    /** Writing is not supported. */
    qint64 writeData(const char *, qint64) override
    {
        return -1;
    }

    /**
     * Copies up to maxSize bytes of the downloaded data, starting at pos().
     * @return number of bytes copied; 0 when nothing is requested or left.
     */
    qint64 readData(char *data, qint64 maxSize) override
    {
        Q_ASSERT(_job);
        Q_ASSERT(_job->error() == NoError);
        Q_ASSERT(data != nullptr);
        if (maxSize == 0 || pos() >= _data.length()) {
            return 0;
        }
        qint64 bytesToCopy = qMin(maxSize, _data.length() - pos());
        memcpy(data, _data.data() + pos(), bytesToCopy);
        return bytesToCopy;
    }

private:
    QUrl _url;
    KIO::StoredTransferJob *_job;
    Error _error;
    QEventLoop _eventLoop;
    QByteArray _data;
};

/**
 * General category as a 32-bit value; the enumerators are generated from
 * properties.h. Strings not listed there map to Invalid.
 */
struct CategoryProperty {
    enum Flag : uint32_t {
        Invalid = 0,
#define CATEGORY_PROPERTY_VALUE(val, sym, intVal) sym = intVal,
#include "properties.h"
    };
    enum Group : uint32_t {
#define CATEGORY_PROPERTY_GROUP(val, sym, intVal) sym = intVal,
#include "properties.h"
    };

    CategoryProperty(uint32_t value = Unassigned)
        : _value(value)
    {
    }
    CategoryProperty(const QString &string)
        : _value(fromString(string))
    {
    }
    operator uint32_t &()
    {
        return _value;
    }
    operator const uint32_t &() const
    {
        return _value;
    }
    bool isValid() const
    {
        return _value != Invalid;
    }

private:
    static uint32_t fromString(const QString &string)
    {
        static const QMap<QString, uint32_t> map = {
#define CATEGORY_PROPERTY_VALUE(val, sym, intVal) {QStringLiteral(#val), sym},
#include "properties.h"
        };
        return map.value(string, uint32_t(Invalid));
    }
    uint32_t _value;
};

/**
 * East Asian Width property; the enumerators are generated from properties.h.
 * Strings not listed there map to Invalid.
 */
struct EastAsianWidthProperty {
    enum Value : uint8_t {
        Invalid = 0x80,
#define EAST_ASIAN_WIDTH_PROPERTY_VALUE(val, sym, intVal) sym = intVal,
#include "properties.h"
    };

    EastAsianWidthProperty(uint8_t value = Neutral)
        : _value(value)
    {
    }
    EastAsianWidthProperty(const QString &string)
        : _value(fromString(string))
    {
    }
    operator uint8_t &()
    {
        return _value;
    }
    operator const uint8_t &() const
    {
        return _value;
    }
    bool isValid() const
    {
        return _value != Invalid;
    }

private:
    static uint8_t fromString(const QString &string)
    {
        static const QMap<QString, Value> map = {
#define EAST_ASIAN_WIDTH_PROPERTY_VALUE(val, sym, intVal) {QStringLiteral(#val), Value::sym},
#include "properties.h"
        };
        return map.value(string, Invalid);
    }
    uint8_t _value;
};

/**
 * Emoji property bits; the enumerators are generated from properties.h.
 * The Invalid bit marks strings that are not listed there.
 */
struct EmojiProperty {
    enum Flag : uint8_t {
        Invalid = 0x80,
#define EMOJI_PROPERTY_VALUE(val, sym, intVal) sym = intVal,
#include "properties.h"
    };

    EmojiProperty(uint8_t value = None)
        : _value(value)
    {
    }
    EmojiProperty(const QString &string)
        : _value(fromString(string))
    {
    }
    operator uint8_t &()
    {
        return _value;
    }
    operator const uint8_t &() const
    {
        return _value;
    }
    bool isValid() const
    {
        return !(_value & Invalid);
    }

private:
    static uint8_t fromString(const QString &string)
    {
        static const QMap<QString, uint8_t> map = {
#define EMOJI_PROPERTY_VALUE(val, sym, intVal) {QStringLiteral(#val), sym},
#include "properties.h"
        };
        return map.value(string, uint8_t(Invalid));
    }
    uint8_t _value;
};

/**
 * Character width stored as int8_t. Valid values lie strictly between
 * _VALID_START and _VALID_END, i.e. Ambiguous (-2), NonPrintable (-1), 0, 1, 2.
 */
struct CharacterWidth {
    enum Width : int8_t {
        Invalid = SCHAR_MIN,
        _VALID_START = -3,
        Ambiguous = -2,
        NonPrintable = -1,
        // 0
        // 1
        Unassigned = 1,
        // 2
        _VALID_END = 3,
    };

    CharacterWidth(const CharacterWidth &other) = default;
    CharacterWidth(int8_t width = Invalid)
        : _width(width)
    {
    }
    CharacterWidth &operator=(const CharacterWidth &other) = default;
    int operator=(const int8_t width)
    {
        _width = width;
        return _width;
    }
    int width() const
    {
        return _width;
    }
    operator int() const
    {
        return width();
    }

    const QString toString() const
    {
        switch (_width) {
        case Ambiguous:
            return QStringLiteral("Ambiguous");
        case NonPrintable:
            return QStringLiteral("NonPrintable");
        case 0:
            return QStringLiteral("0");
        case 1:
            return QStringLiteral("1");
        case 2:
            return QStringLiteral("2");
        default:
        case Invalid:
            return QStringLiteral("Invalid");
        }
    }

    bool isValid() const
    {
        return (_width > _VALID_START && _width < _VALID_END);
    }

private:
    int8_t _width;
};

/** Everything collected about one code point from the input files. */
struct CharacterProperties {
    CategoryProperty category;
    EastAsianWidthProperty eastAsianWidth;
    EmojiProperty emoji;
    CharacterWidth customWidth;
    // For debug purposes in "details" output generator
    uint8_t widthFromPropsRule = 0;
};

/** UnicodeData.txt line; field 1 is the general category. */
struct UnicodeDataEntry : public UcdEntry {
    enum FieldId {
        NameId = 0,
        CategoryId = 1,
    };
    CategoryProperty category() const
    {
        return CategoryProperty(this->fields.value(CategoryId));
    }
};

/** EastAsianWidth.txt line; field 0 is the width class. */
struct EastAsianWidthEntry : public UcdEntry {
    enum FieldId {
        WidthId = 0,
    };
    EastAsianWidthProperty eastAsianWidth() const
    {
        return EastAsianWidthProperty(this->fields.value(WidthId));
    }
};

/** emoji-data.txt line; field 0 is the emoji property name. */
struct EmojiDataEntry : public UcdEntry {
    enum FieldId {
        EmojiId = 0,
    };
    EmojiProperty emoji() const
    {
        return EmojiProperty(this->fields.value(EmojiId));
    }
};

/** Generic width file line; field 0 is the width as a decimal integer. */
struct GenericWidthEntry : public UcdEntry {
    enum FieldId {
        WidthId = 0,
    };
    /**
     * @return the parsed width, or CharacterWidth::Invalid when the field is
     *         not a number or lies outside the valid width range.
     */
    CharacterWidth width() const
    {
        bool ok;
        const int raw = this->fields.value(WidthId).toInt(&ok, 10);
        // Validate before narrowing to int8_t, so that e.g. 256 does not wrap to 0
        if (!ok || raw <= CharacterWidth::_VALID_START || raw >= CharacterWidth::_VALID_END)
            return CharacterWidth::Invalid;
        return CharacterWidth(static_cast<int8_t>(raw));
    }
};

/** A run of consecutive code points sharing one width. */
struct WidthsRange {
    struct {
        uint first;
        uint last;
    } cp;
    CharacterWidth width;
};

/**
 * Splits widths[ucsRange.first .. ucsRange.second] into maximal runs of equal
 * width.
 *
 * @param widths   width of every code point, indexed by code point
 * @param ucsRange inclusive code point range; an upper bound at or beyond
 *                 CODE_POINTS_NUM (or beyond the table) means "up to the last
 *                 element of widths"
 * @return the runs in ascending order; empty if the range is empty
 */
QVector<WidthsRange> rangesFromWidths(const QVector<CharacterWidth> &widths, QPair<uint, uint> ucsRange = {0, CODE_POINTS_NUM})
{
    QVector<WidthsRange> ranges;

    if (widths.isEmpty())
        return ranges;
    if (ucsRange.second >= CODE_POINTS_NUM || ucsRange.second >= uint(widths.size()))
        ucsRange.second = widths.size() - 1;
    if (ucsRange.first > ucsRange.second)
        return ranges;

    uint first = ucsRange.first;
    for (uint cp = first + 1; cp <= uint(ucsRange.second); ++cp) {
        if (widths[first] != widths[cp]) {
            ranges.append({{first, cp - 1}, widths[cp - 1]});
            first = cp;
        }
    }
    ranges.append({{first, uint(ucsRange.second)}, widths[ucsRange.second]});

    return ranges;
}

// Real ranges look like this (each continuous letter sequence is a range):
//
//     D    D D D   D D        D D                   8 ranges
//         C C   C C C C     CC C CC                 9 ranges
//  BBB BBB       B     B BBB       BBBBBB           6 ranges
// A           A         A                A          4 ranges
//                                               ∑: 27 ranges
//
// To reduce total ranges count, the holes in groups can be filled with ranges
// from groups above them:
//
//     D    D D D   D D        D D                   8 ranges
//         CCC   C CCCCC     CCCCCCC                 4 ranges
//  BBBBBBB       BBBBBBB BBBBBBBBBBBBBBBB           3 ranges
// AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA          1 ranges
//                                               ∑: 16 ranges
//
// First range is always without change. Last range (A) can be dropped
// (it always contains everything). Search should be done in order: D, C, B (A).
// For simplicity the function returns all ranges, including first and last. QMap>> mergedRangesFromWidths(const QVector &widths, const QVector widthsSortOrder, QPair ucsRange = {0, CODE_POINTS_NUM}) { if (ucsRange.second >= CODE_POINTS_NUM) ucsRange.second = widths.size() - 1; QVector ranges = rangesFromWidths(widths, ucsRange); QMap>> mergedRanges; int cmwi; // Currently Merged Width Index int sri = -1; // Start Range Index (for current width) int cri; // Current Range Index // First width ranges are without change. Last one has one range spanning everything, so we can skip this for (cmwi = 1; cmwi < widthsSortOrder.size() - 1; ++cmwi) { const CharacterWidth &cmw = widthsSortOrder[cmwi]; // Currently Merged Width for (cri = 0; cri < ranges.size(); ++cri) { WidthsRange &cr = ranges[cri]; // Current Range if (cr.width == cmw) { // Range is suitable for merge if (sri < 0) { // First one, just remember it sri = cri; } else { // Merge ranges[sri].cp.last = cr.cp.last; cr.width = CharacterWidth::Invalid; } } else { // Current range has another width - can we continue merging? 
if (sri >= 0) { const int crwi = widthsSortOrder.indexOf(cr.width); // Current Range Width Index if (!(crwi < cmwi && crwi >= 0)) { // current range is not above currently merged width - stop merging sri = -1; } } } } } for (const auto &range : std::as_const(ranges)) { if (range.width.isValid() && range.width != widthsSortOrder.last()) mergedRanges[range.width].append({range.cp.first, range.cp.last}); } mergedRanges[widthsSortOrder.last()].append({ucsRange.first, ucsRange.second}); return mergedRanges; } namespace generators { using GeneratorFunc = bool (*)(QTextStream &, const QVector &, const QVector &, const QMap &); bool code(QTextStream &out, const QVector &props, const QVector &widths, const QMap &args) { static constexpr int DIRECT_LUT_SIZE = 256; Q_UNUSED(props); QTextStream eout(stderr, QIODevice::WriteOnly); if (args.value(QStringLiteral("param")).isEmpty()) { eout << QStringLiteral("Template file not specified.") << Qt::endl << Qt::endl; return false; } QFile templateFile(args.value(QStringLiteral("param"))); if (!templateFile.open(QIODevice::ReadOnly)) { eout << QStringLiteral("Could not open file ") << templateFile.fileName() << ": " << templateFile.errorString(); exit(1); } const QString templateText = QString::fromUtf8(templateFile.readAll()); templateFile.close(); Var::Map data = { {QStringLiteral("gen-file-warning"), QStringLiteral("THIS IS A GENERATED FILE. 
DO NOT EDIT.")}, {QStringLiteral("cmdline"), args.value(QStringLiteral("cmdline"))}, {QStringLiteral("direct-lut"), Var::Vector(DIRECT_LUT_SIZE)}, {QStringLiteral("direct-lut-size"), DIRECT_LUT_SIZE}, {QStringLiteral("ranges-luts"), Var::Vector()}, {QStringLiteral("ranges-lut-list"), Var::Vector()}, {QStringLiteral("ranges-lut-list-size"), 0}, }; // Fill direct-lut with widths of 0x00-0xFF for (unsigned i = 0; i < DIRECT_LUT_SIZE; ++i) { Q_ASSERT(widths[i].isValid()); data[QStringLiteral("direct-lut")].vec[i] = int(widths[i]); } static const QVector widthsSortOrder = {CharacterWidth::NonPrintable, 2, CharacterWidth::Ambiguous, 0, 1}; const QMap>> mergedRanges = mergedRangesFromWidths(widths, widthsSortOrder, {DIRECT_LUT_SIZE, CODE_POINTS_NUM}); // Find last non-empty ranges lut int lastWidthId = 0; for (int wi = widthsSortOrder.size() - 1; wi > 0; --wi) { if (mergedRanges.contains(widthsSortOrder[wi])) { lastWidthId = wi; break; } } // Create ranges-luts for all widths except last non-empty one and empty ones for (int wi = 0; lastWidthId != 0 && wi < lastWidthId; ++wi) { const CharacterWidth width = widthsSortOrder[wi]; auto currentMergedRangesIt = mergedRanges.find(width); if (currentMergedRangesIt == mergedRanges.end() || currentMergedRangesIt.value().isEmpty()) continue; const int size = mergedRanges[width].size(); const QString name = QString(QStringLiteral("LUT_%1")).arg(width.toString().toUpper()); data[QStringLiteral("ranges-luts")].vec.append(Var::Map{ {QStringLiteral("name"), name}, {QStringLiteral("ranges"), Var::Vector()}, {QStringLiteral("size"), size}, }); data[QStringLiteral("ranges-lut-list")].vec.append(Var::Map{ {QStringLiteral("width"), int(width)}, {QStringLiteral("name"), name}, {QStringLiteral("size"), size}, }); auto ¤tLut = data[QStringLiteral("ranges-luts")].vec.last()[QStringLiteral("ranges")].vec; for (const auto &range : *currentMergedRangesIt) { Q_ASSERT(range.first <= LAST_CODE_POINT); Q_ASSERT(range.second <= LAST_CODE_POINT); 
currentLut.append(Var(Var::Map{{QStringLiteral("first"), range.first}, {QStringLiteral("last"), range.second}})); } } data[QStringLiteral("ranges-lut-list")].vec.append(Var::Map{ {QStringLiteral("width"), widthsSortOrder[lastWidthId].width()}, {QStringLiteral("name"), QStringLiteral("nullptr")}, {QStringLiteral("size"), 1}, }); data[QStringLiteral("ranges-lut-list-size")] = mergedRanges.size(); Template t(templateText); t.parse(); out << t.generate(data); return true; } bool list(QTextStream &out, const QVector &props, const QVector &widths, const QMap &args) { Q_UNUSED(props); out << QStringLiteral("# generated with: ") << args.value(QStringLiteral("cmdline")) << QStringLiteral("\n"); for (uint cp = 1; cp <= LAST_CODE_POINT; ++cp) { out << QString::asprintf("%06X ; %2d\n", cp, int(widths[cp])); } return true; } bool ranges(QTextStream &out, const QVector &props, const QVector &widths, const QMap &args) { Q_UNUSED(props); const auto ranges = rangesFromWidths(widths); out << QStringLiteral("# generated with: ") << args.value(QStringLiteral("cmdline")) << QStringLiteral("\n"); for (const WidthsRange &range : ranges) { if (range.cp.first != range.cp.last) out << QString::asprintf("%06X..%06X ; %2d\n", range.cp.first, range.cp.last, int(range.width)); else out << QString::asprintf("%06X ; %2d\n", range.cp.first, int(range.width)); } return true; } bool compactRanges(QTextStream &out, const QVector &props, const QVector &widths, const QMap &args) { Q_UNUSED(props); static const QVector widthsSortOrder = {CharacterWidth::NonPrintable, 2, CharacterWidth::Ambiguous, 0, 1}; const auto mergedRanges = mergedRangesFromWidths(widths, widthsSortOrder); out << QStringLiteral("# generated with: ") << args.value(QStringLiteral("cmdline")) << QStringLiteral("\n"); for (const int width : std::as_const(widthsSortOrder)) { const auto currentMergedRangesIt = mergedRanges.find(width); if (currentMergedRangesIt == mergedRanges.end() || currentMergedRangesIt.value().isEmpty()) continue; 
for (const auto &range : currentMergedRangesIt.value()) { if (range.first != range.second) out << QString::asprintf("%06X..%06X ; %2d\n", range.first, range.second, int(width)); else out << QString::asprintf("%06X ; %2d\n", range.first, int(width)); } } return true; } bool details(QTextStream &out, const QVector &props, const QVector &widths, const QMap &args) { out.setFieldAlignment(QTextStream::AlignLeft); out << QStringLiteral("# generated with: ") << args.value(QStringLiteral("cmdline")) << QStringLiteral("\n"); out << QString::asprintf("#%-5s ; %-4s ; %-8s ; %-3s ; %-2s ; %-4s ; %-4s\n", "CP", "Wdth", "Cat", "EAW", "EM", "CstW", "Rule"); QMap widthStats; for (uint cp = 0; cp <= LAST_CODE_POINT; ++cp) { out << QString::asprintf("%06X ; %4d ; %08X ; %02X ; %02X ; %4d ; %d\n", cp, int8_t(widths[cp]), uint32_t(props[cp].category), uint8_t(props[cp].eastAsianWidth), uint8_t(props[cp].emoji), int8_t(props[cp].customWidth), props[cp].widthFromPropsRule); if (!widthStats.contains(widths[cp])) widthStats.insert(widths[cp], 0); widthStats[widths[cp]]++; } QMap rangesStats; const auto ranges = rangesFromWidths(widths); for (const auto &range : ranges) { if (!rangesStats.contains(range.width)) rangesStats.insert(range.width, 0); rangesStats[range.width]++; } out << QStringLiteral("# STATS") << Qt::endl; out << QStringLiteral("#") << Qt::endl; out << QStringLiteral("# Characters count for each width:") << Qt::endl; for (auto wi = widthStats.constBegin(); wi != widthStats.constEnd(); ++wi) { out << QString::asprintf("# %2d: %7d\n", int(wi.key()), widthStats[wi.key()]); } out << QStringLiteral("#") << Qt::endl; out << QStringLiteral("# Ranges count for each width:") << Qt::endl; int howmany = 0; for (auto wi = rangesStats.constBegin(); wi != rangesStats.constEnd(); ++wi) { if (howmany >= 20) break; howmany++; out << QString::asprintf("# %2d: %7d\n", int(wi.key()), rangesStats[wi.key()]); } return true; } } // namespace generators template static void 
processInputFiles(QVector &props, const QStringList &files, const QString &fileTypeName, void (*cb)(CharacterProperties &prop, const EntryType &entry)) { static const QRegularExpression PROTOCOL_RE(QStringLiteral(R"#(^[a-z]+://)#")); for (const QString &fileName : files) { qInfo().noquote() << QStringLiteral("Parsing as %1: %2").arg(fileTypeName).arg(fileName); QSharedPointer source = nullptr; if (PROTOCOL_RE.match(fileName).hasMatch()) { source.reset(new KIODevice(QUrl(fileName))); } else { source.reset(new QFile(fileName)); } if (!source->open(QIODevice::ReadOnly)) { qCritical() << QStringLiteral("Could not open %1: %2").arg(fileName).arg(source->errorString()); exit(1); } UcdParser p(source.data()); while (p.hasNext()) { const auto &e = p.next(); for (uint cp = e.cp.first; cp <= e.cp.last; ++cp) { cb(props[cp], e); } } } } static const QString escapeCmdline(const QStringList &args) { static QString cmdline = QString(); if (!cmdline.isEmpty()) return cmdline; QTextStream stream(&cmdline, QIODevice::WriteOnly); // basename for command name stream << QFileInfo(args[0]).baseName(); for (auto it = args.begin() + 1; it != args.end(); ++it) { if (!it->startsWith(QLatin1Char('-'))) stream << QStringLiteral(" \"") << QString(*it).replace(QRegularExpression(QStringLiteral(R"(["`$\\])")), QStringLiteral(R"(\\\1)")) << '"'; else stream << ' ' << *it; } stream.flush(); return cmdline; } enum ConvertOptions { AmbiguousWidthOpt = 0, EmojiOpt = 1, }; // Character width assignment // // Rules (from highest to lowest priority): // // * Local overlay // * (not implemented) Character unique properties described in The Unicode Standard, Version 10.0 // * Unicode category Cc, Cs: -1 // * Emoji: 2 // * Unicode category Mn, Me, Cf: 0 // * East Asian Width W, F: 2 // * East Asian Width H, N, Na: 1 // * East Asian Width A: (varies) // * Unassigned/Undefined/Private Use: 1 // // The list is loosely based on character width implementations in Vim 8.1 // and glibc 2.27. 
There are a few cases which could look better // (decomposed Hangul, emoji with modifiers, etc) with different widths, // but interactive terminal programs (at least vim, zsh, everything based // on glibc's wcwidth) would see their width as it is implemented now. static inline CharacterWidth widthFromProps(const CharacterProperties &props, uint cp, const QMap &convertOpts) { CharacterWidth cw; auto &widthFromPropsRule = const_cast(props.widthFromPropsRule); if (props.customWidth.isValid()) { widthFromPropsRule = 1; cw = props.customWidth; } else if ((CategoryProperty::Control | CategoryProperty::Surrogate) & props.category) { widthFromPropsRule = 2; cw = CharacterWidth::NonPrintable; } else if (convertOpts[EmojiOpt] & props.emoji && !(EmojiProperty::EmojiComponent & props.emoji)) { widthFromPropsRule = 3; cw = 2; } else if ((CategoryProperty::NonspacingMark | CategoryProperty::EnclosingMark | CategoryProperty::Format) & props.category) { widthFromPropsRule = 4; cw = 0; } else if ((EastAsianWidthProperty::Wide | EastAsianWidthProperty::Fullwidth) & props.eastAsianWidth) { widthFromPropsRule = 5; cw = 2; } else if ((EastAsianWidthProperty::Halfwidth | EastAsianWidthProperty::Neutral | EastAsianWidthProperty::Narrow) & props.eastAsianWidth) { widthFromPropsRule = 6; cw = 1; } else if ((CategoryProperty::Unassigned | CategoryProperty::PrivateUse) & props.category) { widthFromPropsRule = 7; cw = CharacterWidth::Unassigned; } else if ((EastAsianWidthProperty::Ambiguous)&props.eastAsianWidth) { widthFromPropsRule = 8; cw = convertOpts[AmbiguousWidthOpt]; } else if (!props.category.isValid()) { widthFromPropsRule = 9; qWarning() << QStringLiteral("Code point U+%1 has invalid category - this should not happen. Assuming \"unassigned\"").arg(cp, 4, 16, QLatin1Char('0')); cw = CharacterWidth::Unassigned; } else { widthFromPropsRule = 10; qWarning() << QStringLiteral("Code point U+%1 not classified - this should not happen. 
Assuming non-printable character").arg(cp, 4, 16, QLatin1Char('0')); cw = CharacterWidth::NonPrintable; } return cw; } int main(int argc, char *argv[]) { static const QMap GENERATOR_FUNCS_MAP = { {QStringLiteral("code"), generators::code}, {QStringLiteral("compact-ranges"), generators::compactRanges}, {QStringLiteral("ranges"), generators::ranges}, {QStringLiteral("list"), generators::list}, {QStringLiteral("details"), generators::details}, {QStringLiteral("dummy"), [](QTextStream &, const QVector &, const QVector &, const QMap &) -> bool { return true; }}, }; qSetMessagePattern(QStringLiteral("%{message}")); QCoreApplication app(argc, argv); QCommandLineParser parser; parser.setApplicationDescription(QStringLiteral("\nUCD files to characters widths converter.\n")); parser.addHelpOption(); parser.addOptions({ {{QStringLiteral("U"), QStringLiteral("unicode-data")}, QStringLiteral("Path or URL to UnicodeData.txt."), QStringLiteral("URL|file")}, {{QStringLiteral("A"), QStringLiteral("east-asian-width")}, QStringLiteral("Path or URL to EastAsianWidth.txt."), QStringLiteral("URL|file")}, {{QStringLiteral("E"), QStringLiteral("emoji-data")}, QStringLiteral("Path or URL to emoji-data.txt."), QStringLiteral("URL|file")}, {{QStringLiteral("W"), QStringLiteral("generic-width")}, QStringLiteral("Path or URL to generic file with width data. Accepts output from compact-ranges, ranges, list and details generator."), QStringLiteral("URL|file")}, {QStringLiteral("ambiguous-width"), QStringLiteral("Ambiguous characters width."), QStringLiteral("separate|1|2"), QString(QStringLiteral("%1")).arg(CharacterWidth::Ambiguous)}, {QStringLiteral("emoji"), QStringLiteral("Which emoji emoji subset is treated as emoji."), QStringLiteral("all|presentation"), QStringLiteral("presentation")}, {{QStringLiteral("g"), QStringLiteral("generator")}, QStringLiteral("Output generator (use \"-\" to list available generators). 
The code generator requires path to a template file."), QStringLiteral("generator[:template]"), QStringLiteral("details")}, }); parser.addPositionalArgument(QStringLiteral("output"), QStringLiteral("Output file (leave empty for stdout).")); parser.process(app); const QStringList unicodeDataFiles = parser.values(QStringLiteral("unicode-data")); const QStringList eastAsianWidthFiles = parser.values(QStringLiteral("east-asian-width")); const QStringList emojiDataFiles = parser.values(QStringLiteral("emoji-data")); const QStringList genericWidthFiles = parser.values(QStringLiteral("generic-width")); const QString ambiguousWidthStr = parser.value(QStringLiteral("ambiguous-width")); const QString emojiStr = parser.value(QStringLiteral("emoji")); const QString generator = parser.value(QStringLiteral("generator")); const QString outputFileName = parser.positionalArguments().value(0); QTextStream eout(stderr, QIODevice::WriteOnly); if (unicodeDataFiles.isEmpty() && eastAsianWidthFiles.isEmpty() && emojiDataFiles.isEmpty() && genericWidthFiles.isEmpty()) { eout << QStringLiteral("Input files not specified.") << Qt::endl << Qt::endl; parser.showHelp(1); } static QMap convertOpts = { {AmbiguousWidthOpt, CharacterWidth::Ambiguous}, {EmojiOpt, EmojiProperty::EmojiPresentation}, }; if (emojiStr == QStringLiteral("presentation")) convertOpts[EmojiOpt] = EmojiProperty::EmojiPresentation; else if (emojiStr == QStringLiteral("all")) convertOpts[EmojiOpt] = EmojiProperty::Emoji; else { convertOpts[EmojiOpt] = EmojiProperty::EmojiPresentation; qWarning() << QStringLiteral("invalid emoji option value: %1. 
Assuming \"presentation\".").arg(emojiStr); } if (ambiguousWidthStr == QStringLiteral("separate")) convertOpts[AmbiguousWidthOpt] = CharacterWidth::Ambiguous; else if (ambiguousWidthStr == QStringLiteral("1")) convertOpts[AmbiguousWidthOpt] = 1; else if (ambiguousWidthStr == QStringLiteral("2")) convertOpts[AmbiguousWidthOpt] = 2; else { convertOpts[AmbiguousWidthOpt] = CharacterWidth::Ambiguous; qWarning() << QStringLiteral("Invalid ambiguous-width option value: %1. Assuming \"separate\".").arg(emojiStr); } const int sepPos = generator.indexOf(QLatin1Char(':')); const auto generatorName = generator.left(sepPos); const auto generatorParam = sepPos >= 0 ? generator.mid(sepPos + 1) : QString(); if (!GENERATOR_FUNCS_MAP.contains(generatorName)) { int status = 0; if (generatorName != QStringLiteral("-")) { status = 1; eout << QStringLiteral("Invalid output generator. Available generators:") << Qt::endl; } for (auto it = GENERATOR_FUNCS_MAP.constBegin(); it != GENERATOR_FUNCS_MAP.constEnd(); ++it) { eout << it.key() << Qt::endl; } exit(status); } auto generatorFunc = GENERATOR_FUNCS_MAP[generatorName]; QFile outFile; if (!outputFileName.isEmpty()) { outFile.setFileName(outputFileName); if (!outFile.open(QIODevice::WriteOnly)) { eout << QStringLiteral("Could not open file ") << outputFileName << QStringLiteral(": ") << outFile.errorString() << Qt::endl; exit(1); } } else { outFile.open(stdout, QIODevice::WriteOnly); } QTextStream out(&outFile); QVector props(CODE_POINTS_NUM); processInputFiles(props, unicodeDataFiles, QStringLiteral("UnicodeData.txt"), [](CharacterProperties &prop, const UnicodeDataEntry &entry) { prop.category = entry.category(); }); processInputFiles(props, eastAsianWidthFiles, QStringLiteral("EastAsianWidth.txt"), [](CharacterProperties &prop, const EastAsianWidthEntry &entry) { prop.eastAsianWidth = entry.eastAsianWidth(); }); processInputFiles(props, emojiDataFiles, QStringLiteral("emoji-data.txt"), [](CharacterProperties &prop, const EmojiDataEntry 
&entry) { prop.emoji |= entry.emoji(); }); processInputFiles(props, genericWidthFiles, QStringLiteral("generic width data"), [](CharacterProperties &prop, const GenericWidthEntry &entry) { prop.customWidth = entry.width(); }); qInfo() << "Generating character width data"; QVector widths(CODE_POINTS_NUM); widths[0] = 0; // NULL character always has width 0 for (uint cp = 1; cp <= LAST_CODE_POINT; ++cp) { widths[cp] = widthFromProps(props[cp], cp, convertOpts); } const QMap generatorArgs = { {QStringLiteral("cmdline"), escapeCmdline(app.arguments())}, {QStringLiteral("param"), generatorParam}, {QStringLiteral("output"), outputFileName.isEmpty() ? QStringLiteral("") : outputFileName}, }; qInfo() << "Generating output"; if (!generatorFunc(out, props, widths, generatorArgs)) { parser.showHelp(1); } return 0; }