diff --git a/src/parser/Parser.rl b/src/parser/Parser.rl index dfa0beda..9cddace4 100644 --- a/src/parser/Parser.rl +++ b/src/parser/Parser.rl @@ -116,7 +116,7 @@ unsigned parseAsDecimal(unsigned oct) { static constexpr u32 MAX_NUMBER = INT_MAX; static -void pushDec(u32 *acc, u8 raw_digit) { +void pushDec(u32 *acc, char raw_digit) { assert(raw_digit >= '0' && raw_digit <= '9'); u32 digit_val = raw_digit - '0'; @@ -130,7 +130,7 @@ void pushDec(u32 *acc, u8 raw_digit) { } static -void pushOct(u32 *acc, u8 raw_digit) { +void pushOct(u32 *acc, char raw_digit) { assert(raw_digit >= '0' && raw_digit <= '7'); u32 digit_val = raw_digit - '0'; @@ -169,8 +169,7 @@ ComponentSequence *enterSequence(ComponentSequence *parent, } static -void addLiteral(ComponentSequence *currentSeq, unsigned char c, - const ParseMode &mode) { +void addLiteral(ComponentSequence *currentSeq, char c, const ParseMode &mode) { if (mode.utf8 && mode.caseless) { /* leverage ComponentClass to generate the vertices */ auto cc = getComponentClass(mode); @@ -197,7 +196,7 @@ void addEscaped(ComponentSequence *currentSeq, unichar accum, if (accum > 255) { throw LocatedParseError(err_msg); } - addLiteral(currentSeq, (unsigned char)accum, mode); + addLiteral(currentSeq, (char)accum, mode); } } @@ -217,7 +216,7 @@ void addEscapedHex(ComponentSequence *currentSeq, unichar accum, #define SLASH_C_ERROR "\\c must be followed by an ASCII character" static -u8 decodeCtrl(u8 raw) { +u8 decodeCtrl(char raw) { if (raw & 0x80) { throw LocatedParseError(SLASH_C_ERROR); } @@ -225,10 +224,10 @@ u8 decodeCtrl(u8 raw) { } static -unichar readUtf8CodePoint2c(const u8 *ts) { +unichar readUtf8CodePoint2c(const char *s) { + auto *ts = (const u8 *)s; assert(ts[0] >= 0xc0 && ts[0] < 0xe0); assert(ts[1] >= 0x80 && ts[1] < 0xc0); - unichar val = ts[0] & 0x1f; val <<= 6; val |= ts[1] & 0x3f; @@ -238,7 +237,8 @@ unichar readUtf8CodePoint2c(const u8 *ts) { } static -unichar readUtf8CodePoint3c(const u8 *ts) { +unichar readUtf8CodePoint3c(const char *s) { + auto *ts = (const u8 *)s; assert(ts[0] >= 0xe0 && ts[0] < 0xf0); assert(ts[1] >= 0x80 && ts[1] < 0xc0); assert(ts[2] >= 0x80 && ts[2] < 0xc0); @@ -253,7 +253,8 @@ unichar readUtf8CodePoint3c(const u8 *ts) { } static -unichar readUtf8CodePoint4c(const u8 *ts) { +unichar readUtf8CodePoint4c(const char *s) { + auto *ts = (const u8 *)s; assert(ts[0] >= 0xf0 && ts[0] < 0xf8); assert(ts[1] >= 0x80 && ts[1] < 0xc0); assert(ts[2] >= 0x80 && ts[2] < 0xc0); @@ -273,12 +274,10 @@ unichar readUtf8CodePoint4c(const u8 *ts) { %%{ machine regex; - alphtype unsigned char; - action throwUnsupportedEscape { ostringstream str; - str << "'\\" << (char)*(ts + 1) << "' at index " - << ts - ptr << " not supported in a character class."; + str << "'\\" << *(ts + 1) << "' at index " << ts - ptr + << " not supported in a character class."; throw ParseError(str.str()); } action unsupportedProperty { @@ -974,7 +973,7 @@ unichar readUtf8CodePoint4c(const u8 *ts) { }; '\\o{' [0-7]+ '}' => { - string oct((const char *)ts + 3, te - ts - 4); + string oct(ts + 3, te - ts - 4); long int val = strtol(oct.c_str(), nullptr, 8); if ((!mode.utf8 && val > 255) || val > MAX_UNICODE) { throw LocatedParseError("Value in \\o{...} sequence is too large"); @@ -999,7 +998,7 @@ unichar readUtf8CodePoint4c(const u8 *ts) { }; # Unicode Hex '\\x{' xdigit+ '}' => { - string hex((const char *)ts + 3, te - ts - 4); + string hex(ts + 3, te - ts - 4); long int val = strtol(hex.c_str(), nullptr, 16); if (val > MAX_UNICODE) { throw LocatedParseError("Value in \\x{...} sequence is too large"); @@ -1089,7 +1088,7 @@ unichar readUtf8CodePoint4c(const u8 *ts) { # Literal character (any - ']') => { - currentCls->add(*ts); + currentCls->add((u8)*ts); }; ']' => { @@ -1443,7 +1442,7 @@ unichar readUtf8CodePoint4c(const u8 *ts) { // Otherwise, we interpret the first three digits as an // octal escape, and the remaining characters stand for // themselves as literals. - const u8 *s = ts; + const char *s = ts; unsigned int accum = 0; unsigned int oct_digits = 0; assert(*s == '\\'); // token starts at backslash @@ -1488,7 +1487,7 @@ unichar readUtf8CodePoint4c(const u8 *ts) { throw LocatedParseError("Invalid reference after \\g"); }; '\\o{' [0-7]+ '}' => { - string oct((const char *)ts + 3, te - ts - 4); + string oct(ts + 3, te - ts - 4); long int val = strtol(oct.c_str(), nullptr, 8); if ((!mode.utf8 && val > 255) || val > MAX_UNICODE) { throw LocatedParseError("Value in \\o{...} sequence is too large"); @@ -1505,7 +1504,7 @@ unichar readUtf8CodePoint4c(const u8 *ts) { }; # Unicode Hex '\\x{' xdigit+ '}' => { - string hex((const char *)ts + 3, te - ts - 4); + string hex(ts + 3, te - ts - 4); long int val = strtol(hex.c_str(), nullptr, 16); if (val > MAX_UNICODE) { throw LocatedParseError("Value in \\x{...} sequence is too large"); @@ -1529,8 +1528,8 @@ unichar readUtf8CodePoint4c(const u8 *ts) { # A bunch of unsupported (for now) escapes escapedUnsupported => { ostringstream str; - str << "'\\" << (char)*(ts + 1) << "' at index " - << ts - ptr << " not supported."; + str << "'\\" << *(ts + 1) << "' at index " << ts - ptr + << " not supported."; throw ParseError(str.str()); }; @@ -1831,24 +1830,22 @@ unichar readUtf8CodePoint4c(const u8 *ts) { %% write data nofinal; /** \brief Main parser call, returns root Component or nullptr. */ -unique_ptr parse(const char *c_ptr, ParseMode &globalMode) { - assert(c_ptr); +unique_ptr parse(const char *ptr, ParseMode &globalMode) { + assert(ptr); - const u8 *ptr = (const u8 *const)c_ptr; - const u8 *p = ptr; - const u8 *pe = ptr + strlen(c_ptr); + const char *p = ptr; + const char *pe = ptr + strlen(ptr); // First, read the control verbs, set any global mode flags and move the // ptr forward. - p = (const u8 *)read_control_verbs((const char *)p, (const char *)pe, - globalMode); + p = read_control_verbs(p, pe, globalMode); - const u8 *eof = pe; + const char *eof = pe; int cs; UNUSED int act; int top; vector stack; - const u8 *ts, *te; + const char *ts, *te; unichar accumulator = 0; unichar octAccumulator = 0; /* required as we are also accumulating for * back ref when looking for octals */ @@ -1894,7 +1891,7 @@ unique_ptr parse(const char *c_ptr, ParseMode &globalMode) { bool inCharClassEarly = false; // Location at which the current character class began. - const u8 *currentClsBegin = p; + const char *currentClsBegin = p; // We throw exceptions on various parsing failures beyond this point: we // use a try/catch block here to clean up our allocated memory before we