|
| 1 | +#include "Lacewing.h" |
| 2 | +#include "deps/utf8proc.h" |
| 3 | + |
| 4 | +std::string lacewing::codepointsallowlist::setcodepointsallowedlist(std::string acStr) |
| 5 | +{ |
| 6 | + if (acStr.empty()) |
| 7 | + { |
| 8 | + codePointCategories.clear(); |
| 9 | + specificCodePoints.clear(); |
| 10 | + codePointRanges.clear(); |
| 11 | + allAllowed = true; |
| 12 | + list = acStr; |
| 13 | + return std::string(); |
| 14 | + } |
| 15 | + |
| 16 | + codepointsallowlist acTemp = *this; |
| 17 | + |
| 18 | + // Reset to blank |
| 19 | + codePointCategories.clear(); |
| 20 | + specificCodePoints.clear(); |
| 21 | + codePointRanges.clear(); |
| 22 | + allAllowed = false; |
| 23 | + list = acStr; |
| 24 | + |
| 25 | + const auto makeError = [this, &acTemp](const char * str, ...) |
| 26 | + { |
| 27 | + va_list v, v2; |
| 28 | + va_start(v, str); |
| 29 | + va_copy(v2, v); |
| 30 | + |
| 31 | + size_t numChars = vsnprintf(nullptr, 0, str, v); |
| 32 | + std::string error(numChars, ' '); |
| 33 | + vsprintf_s(error.data(), error.size(), str, v2); |
| 34 | + |
| 35 | + va_end(v); |
| 36 | + va_end(v2); |
| 37 | + *this = acTemp; // restore old |
| 38 | + return error; |
| 39 | + }; |
| 40 | + |
| 41 | + // String should be format: |
| 42 | + // 2 letters, or 1 letter + *, or an integer number that is the UTF32 number of char |
| 43 | + if (acStr.front() == ',') |
| 44 | + return makeError("The acceptable code point list \"%hs...\" starts with a comma.", acStr.c_str()); |
| 45 | + |
| 46 | + acStr.erase(std::remove(acStr.begin(), acStr.end(), ' '), acStr.end()); |
| 47 | + if (acStr.empty()) |
| 48 | + return makeError("The acceptable code point list \"%hs\" is all spaces.", acStr.c_str()); |
| 49 | + |
| 50 | + if (acStr.back() == ',') |
| 51 | + return makeError("The acceptable code point list \"%hs...\" ends with a comma.", acStr.c_str()); |
| 52 | + |
| 53 | + if (acStr.find(",,"sv) != std::string::npos) |
| 54 | + return makeError("The acceptable code point list \"%hs...\" contains \",,\".", acStr.c_str()); |
| 55 | + |
| 56 | + acStr += ','; // to make sure when cur is +='d and passes end of string, it'll end with remaining = 0 |
| 57 | + |
| 58 | + const char * cur = acStr.data(); |
| 59 | + size_t remaining = acStr.size(); |
| 60 | + while (true) |
| 61 | + { |
| 62 | + remaining = (acStr.data() + acStr.size()) - cur; |
| 63 | + if (cur >= acStr.data() + acStr.size() - 1) |
| 64 | + break; |
| 65 | + |
| 66 | + // Two-letter category, or letter + * for all categories |
| 67 | + if (remaining >= 2 && |
| 68 | + std::isalpha(cur[0]) && (std::isalpha(cur[1]) || cur[1] == '*')) |
| 69 | + { |
| 70 | + // more than two letters |
| 71 | + if (remaining > 2 && cur[2] != ',') |
| 72 | + { |
| 73 | + return makeError("The acceptable code point list \"%hs\" has a 3+ letter category \"%hs\". Categories are 2 letters.", |
| 74 | + acStr.c_str(), cur); |
| 75 | + } |
| 76 | + |
| 77 | + // See utf8proc.cpp for these defined under utf8proc_category_t |
| 78 | + static const char categoryList[][3] = { "Cn","Lu","Ll","Lt","Lm","Lo","Mn","Mc","Me","Nd","Nl","No","Pc","Pd","Ps","Pe","Pi","Pf","Po","Sm","Sc","Sk","So","Zs","Zl","Zp","Cc","Cf","Cs","Co" }; |
| 79 | + static const char wildcardCategory[] = { 'C', 'L', 'M', 'N', 'P','S','Z' }; |
| 80 | + |
| 81 | + bool found = false; |
| 82 | + // Wildcard |
| 83 | + if (cur[1] == '*') |
| 84 | + { |
| 85 | + char firstCharUpper = std::toupper(cur[0]); |
| 86 | + for (size_t i = 0; i < sizeof(wildcardCategory); i++) |
| 87 | + { |
| 88 | + if (firstCharUpper == wildcardCategory[i]) |
| 89 | + { |
| 90 | + // Wildcard category found, yay |
| 91 | + for (size_t j = 0; j < std::size(categoryList); j++) { |
| 92 | + if (firstCharUpper == categoryList[j][0]) |
| 93 | + codePointCategories.push_back((lw_i32)j); |
| 94 | + } |
| 95 | + |
| 96 | + cur += 3; |
| 97 | + goto nextChar; |
| 98 | + } |
| 99 | + } |
| 100 | + |
| 101 | + return makeError("Wildcard category \"%.2hs\" not recognised. Check the help file.", cur); |
| 102 | + } |
| 103 | + |
| 104 | + for (size_t i = 0; i < std::size(categoryList); i++) |
| 105 | + { |
| 106 | + if (std::toupper(cur[0]) == categoryList[i][0] && std::tolower(cur[1]) == categoryList[i][1]) |
| 107 | + { |
| 108 | + // Category found, is it already added? |
| 109 | + if (std::find(codePointCategories.cbegin(), codePointCategories.cend(), i) != codePointCategories.cend()) |
| 110 | + return makeError("Category \"%.2hs\" was added twice in list \"%hs\".", cur, acStr.c_str()); |
| 111 | + |
| 112 | + codePointCategories.push_back((lw_i32)i); |
| 113 | + cur += 3; |
| 114 | + goto nextChar; |
| 115 | + } |
| 116 | + } |
| 117 | + |
| 118 | + return makeError("Category \"%.2hs\" not recognised. Check the help file.", cur); |
| 119 | + } |
| 120 | + |
| 121 | + // Numeric, or numeric range expected |
| 122 | + if (std::isdigit(cur[0])) { |
| 123 | + char * endPtr; |
| 124 | + unsigned long codePointAllowed = std::strtoul(cur, &endPtr, 0); |
| 125 | + if (codePointAllowed == 0 || codePointAllowed > INT32_MAX) // error in strtoul, or user has put in 0 and approved null char, either way bad |
| 126 | + return makeError("Specific codepoint %hs not a valid codepoint.", cur, acStr.c_str()); |
| 127 | + |
| 128 | + // Single code point, after this it's a new Unicode list, or it's end of string |
| 129 | + cur = endPtr; |
| 130 | + if (cur[0] == '\0' || cur[0] == ',') |
| 131 | + { |
| 132 | + if (std::find(specificCodePoints.cbegin(), specificCodePoints.cend(), codePointAllowed) != specificCodePoints.cend()) |
| 133 | + return makeError("Specific codepoint %lu was added twice in list \"%hs\".", codePointAllowed, acStr.c_str()); |
| 134 | + |
| 135 | + specificCodePoints.push_back(codePointAllowed); |
| 136 | + if (cur[0] == ',') |
| 137 | + ++cur; |
| 138 | + goto nextChar; |
| 139 | + } |
| 140 | + |
| 141 | + // Range of code points |
| 142 | + if (cur[0] == '-') |
| 143 | + { |
| 144 | + ++cur; |
| 145 | + unsigned long lastCodePointNum = std::strtoul(cur, &endPtr, 0); |
| 146 | + if (lastCodePointNum == 0 || lastCodePointNum > INT32_MAX) // error in strtoul, or user has put in 0 and approved null char, either way bad |
| 147 | + return makeError("Ending number in codepoint range %lu to \"%.15hs...\" could not be read.", codePointAllowed, cur, cur); |
| 148 | + // Range is reversed |
| 149 | + if (lastCodePointNum < codePointAllowed) |
| 150 | + return makeError("Range %lu to %lu is backwards.", codePointAllowed, lastCodePointNum); |
| 151 | + |
| 152 | + // Allow range overlaps - we could search by range1 max > range2 min, but we won't. |
| 153 | + // We will check for an exact match in range, though. |
| 154 | + |
| 155 | + auto range = std::make_pair((std::int32_t)codePointAllowed, (std::int32_t)lastCodePointNum); |
| 156 | + if (std::find(codePointRanges.cbegin(), codePointRanges.cend(), range) != codePointRanges.cend()) |
| 157 | + return makeError("Range %lu to %lu is in the list twice.", codePointAllowed, lastCodePointNum); |
| 158 | + |
| 159 | + codePointRanges.push_back(range); |
| 160 | + cur = endPtr + 1; // skip the ',' |
| 161 | + goto nextChar; |
| 162 | + } |
| 163 | + |
| 164 | + // fall through |
| 165 | + } |
| 166 | + |
| 167 | + return makeError("Unrecognised character list starting at \"%.15hs\".", cur); |
| 168 | + |
| 169 | + nextChar: |
| 170 | + /* go to next char */; |
| 171 | + } |
| 172 | + |
| 173 | + return std::string(); |
| 174 | +} |
| 175 | + |
| 176 | +int lacewing::codepointsallowlist::checkcodepointsallowed(const std::string_view toTest, int * const rejectedUTF32CodePoint /* = NULL */) const |
| 177 | +{ |
| 178 | + if (allAllowed) |
| 179 | + return -1; |
| 180 | + |
| 181 | + const utf8proc_uint8_t * str = (const utf8proc_uint8_t *)toTest.data(); |
| 182 | + utf8proc_int32_t thisChar; |
| 183 | + utf8proc_ssize_t numBytesInCodePoint, remainingBytes = toTest.size(); |
| 184 | + int codePointIndex = 0; |
| 185 | + while (remainingBytes > 0) |
| 186 | + { |
| 187 | + numBytesInCodePoint = utf8proc_iterate(str, remainingBytes, &thisChar); |
| 188 | + if (numBytesInCodePoint <= 0 || !utf8proc_codepoint_valid(thisChar)) |
| 189 | + goto badChar; |
| 190 | + |
| 191 | + if (std::find(specificCodePoints.cbegin(), specificCodePoints.cend(), thisChar) != specificCodePoints.cend()) |
| 192 | + goto goodChar; |
| 193 | + if (std::find_if(codePointRanges.cbegin(), codePointRanges.cend(), |
| 194 | + [=](const std::pair<std::int32_t, std::int32_t> & range) { |
| 195 | + return thisChar >= range.first && thisChar <= range.second; |
| 196 | + }) != codePointRanges.cend()) |
| 197 | + { |
| 198 | + goto goodChar; |
| 199 | + } |
| 200 | + utf8proc_category_t category = utf8proc_category(thisChar); |
| 201 | + if (std::find(codePointCategories.cbegin(), codePointCategories.cend(), category) != codePointCategories.cend()) |
| 202 | + goto goodChar; |
| 203 | + |
| 204 | + // ... fall through from above |
| 205 | + badChar: |
| 206 | + if (rejectedUTF32CodePoint != NULL) |
| 207 | + *rejectedUTF32CodePoint = thisChar; |
| 208 | + return codePointIndex; |
| 209 | + |
| 210 | + // Loop around |
| 211 | + goodChar: |
| 212 | + ++codePointIndex; |
| 213 | + str += numBytesInCodePoint; |
| 214 | + remainingBytes -= numBytesInCodePoint; |
| 215 | + } |
| 216 | + |
| 217 | + return -1; // All good |
| 218 | +} |
0 commit comments