parser: switch to using char* pointers

This commit is contained in:
Justin Viiret 2017-02-02 13:56:30 +11:00 committed by Matthew Barr
parent 1875d55cf1
commit 4def0c8a52

View File

@ -116,7 +116,7 @@ unsigned parseAsDecimal(unsigned oct) {
static constexpr u32 MAX_NUMBER = INT_MAX; static constexpr u32 MAX_NUMBER = INT_MAX;
static static
void pushDec(u32 *acc, u8 raw_digit) { void pushDec(u32 *acc, char raw_digit) {
assert(raw_digit >= '0' && raw_digit <= '9'); assert(raw_digit >= '0' && raw_digit <= '9');
u32 digit_val = raw_digit - '0'; u32 digit_val = raw_digit - '0';
@ -130,7 +130,7 @@ void pushDec(u32 *acc, u8 raw_digit) {
} }
static static
void pushOct(u32 *acc, u8 raw_digit) { void pushOct(u32 *acc, char raw_digit) {
assert(raw_digit >= '0' && raw_digit <= '7'); assert(raw_digit >= '0' && raw_digit <= '7');
u32 digit_val = raw_digit - '0'; u32 digit_val = raw_digit - '0';
@ -169,8 +169,7 @@ ComponentSequence *enterSequence(ComponentSequence *parent,
} }
static static
void addLiteral(ComponentSequence *currentSeq, unsigned char c, void addLiteral(ComponentSequence *currentSeq, char c, const ParseMode &mode) {
const ParseMode &mode) {
if (mode.utf8 && mode.caseless) { if (mode.utf8 && mode.caseless) {
/* leverage ComponentClass to generate the vertices */ /* leverage ComponentClass to generate the vertices */
auto cc = getComponentClass(mode); auto cc = getComponentClass(mode);
@ -197,7 +196,7 @@ void addEscaped(ComponentSequence *currentSeq, unichar accum,
if (accum > 255) { if (accum > 255) {
throw LocatedParseError(err_msg); throw LocatedParseError(err_msg);
} }
addLiteral(currentSeq, (unsigned char)accum, mode); addLiteral(currentSeq, (char)accum, mode);
} }
} }
@ -217,7 +216,7 @@ void addEscapedHex(ComponentSequence *currentSeq, unichar accum,
#define SLASH_C_ERROR "\\c must be followed by an ASCII character" #define SLASH_C_ERROR "\\c must be followed by an ASCII character"
static static
u8 decodeCtrl(u8 raw) { u8 decodeCtrl(char raw) {
if (raw & 0x80) { if (raw & 0x80) {
throw LocatedParseError(SLASH_C_ERROR); throw LocatedParseError(SLASH_C_ERROR);
} }
@ -225,10 +224,10 @@ u8 decodeCtrl(u8 raw) {
} }
static static
unichar readUtf8CodePoint2c(const u8 *ts) { unichar readUtf8CodePoint2c(const char *s) {
auto *ts = (const u8 *)s;
assert(ts[0] >= 0xc0 && ts[0] < 0xe0); assert(ts[0] >= 0xc0 && ts[0] < 0xe0);
assert(ts[1] >= 0x80 && ts[1] < 0xc0); assert(ts[1] >= 0x80 && ts[1] < 0xc0);
unichar val = ts[0] & 0x1f; unichar val = ts[0] & 0x1f;
val <<= 6; val <<= 6;
val |= ts[1] & 0x3f; val |= ts[1] & 0x3f;
@ -238,7 +237,8 @@ unichar readUtf8CodePoint2c(const u8 *ts) {
} }
static static
unichar readUtf8CodePoint3c(const u8 *ts) { unichar readUtf8CodePoint3c(const char *s) {
auto *ts = (const u8 *)s;
assert(ts[0] >= 0xe0 && ts[0] < 0xf0); assert(ts[0] >= 0xe0 && ts[0] < 0xf0);
assert(ts[1] >= 0x80 && ts[1] < 0xc0); assert(ts[1] >= 0x80 && ts[1] < 0xc0);
assert(ts[2] >= 0x80 && ts[2] < 0xc0); assert(ts[2] >= 0x80 && ts[2] < 0xc0);
@ -253,7 +253,8 @@ unichar readUtf8CodePoint3c(const u8 *ts) {
} }
static static
unichar readUtf8CodePoint4c(const u8 *ts) { unichar readUtf8CodePoint4c(const char *s) {
auto *ts = (const u8 *)s;
assert(ts[0] >= 0xf0 && ts[0] < 0xf8); assert(ts[0] >= 0xf0 && ts[0] < 0xf8);
assert(ts[1] >= 0x80 && ts[1] < 0xc0); assert(ts[1] >= 0x80 && ts[1] < 0xc0);
assert(ts[2] >= 0x80 && ts[2] < 0xc0); assert(ts[2] >= 0x80 && ts[2] < 0xc0);
@ -273,12 +274,10 @@ unichar readUtf8CodePoint4c(const u8 *ts) {
%%{ %%{
machine regex; machine regex;
alphtype unsigned char;
action throwUnsupportedEscape { action throwUnsupportedEscape {
ostringstream str; ostringstream str;
str << "'\\" << (char)*(ts + 1) << "' at index " str << "'\\" << *(ts + 1) << "' at index " << ts - ptr
<< ts - ptr << " not supported in a character class."; << " not supported in a character class.";
throw ParseError(str.str()); throw ParseError(str.str());
} }
action unsupportedProperty { action unsupportedProperty {
@ -974,7 +973,7 @@ unichar readUtf8CodePoint4c(const u8 *ts) {
}; };
'\\o{' [0-7]+ '}' => { '\\o{' [0-7]+ '}' => {
string oct((const char *)ts + 3, te - ts - 4); string oct(ts + 3, te - ts - 4);
long int val = strtol(oct.c_str(), nullptr, 8); long int val = strtol(oct.c_str(), nullptr, 8);
if ((!mode.utf8 && val > 255) || val > MAX_UNICODE) { if ((!mode.utf8 && val > 255) || val > MAX_UNICODE) {
throw LocatedParseError("Value in \\o{...} sequence is too large"); throw LocatedParseError("Value in \\o{...} sequence is too large");
@ -999,7 +998,7 @@ unichar readUtf8CodePoint4c(const u8 *ts) {
}; };
# Unicode Hex # Unicode Hex
'\\x{' xdigit+ '}' => { '\\x{' xdigit+ '}' => {
string hex((const char *)ts + 3, te - ts - 4); string hex(ts + 3, te - ts - 4);
long int val = strtol(hex.c_str(), nullptr, 16); long int val = strtol(hex.c_str(), nullptr, 16);
if (val > MAX_UNICODE) { if (val > MAX_UNICODE) {
throw LocatedParseError("Value in \\x{...} sequence is too large"); throw LocatedParseError("Value in \\x{...} sequence is too large");
@ -1089,7 +1088,7 @@ unichar readUtf8CodePoint4c(const u8 *ts) {
# Literal character # Literal character
(any - ']') => { (any - ']') => {
currentCls->add(*ts); currentCls->add((u8)*ts);
}; };
']' => { ']' => {
@ -1443,7 +1442,7 @@ unichar readUtf8CodePoint4c(const u8 *ts) {
// Otherwise, we interpret the first three digits as an // Otherwise, we interpret the first three digits as an
// octal escape, and the remaining characters stand for // octal escape, and the remaining characters stand for
// themselves as literals. // themselves as literals.
const u8 *s = ts; const char *s = ts;
unsigned int accum = 0; unsigned int accum = 0;
unsigned int oct_digits = 0; unsigned int oct_digits = 0;
assert(*s == '\\'); // token starts at backslash assert(*s == '\\'); // token starts at backslash
@ -1488,7 +1487,7 @@ unichar readUtf8CodePoint4c(const u8 *ts) {
throw LocatedParseError("Invalid reference after \\g"); throw LocatedParseError("Invalid reference after \\g");
}; };
'\\o{' [0-7]+ '}' => { '\\o{' [0-7]+ '}' => {
string oct((const char *)ts + 3, te - ts - 4); string oct(ts + 3, te - ts - 4);
long int val = strtol(oct.c_str(), nullptr, 8); long int val = strtol(oct.c_str(), nullptr, 8);
if ((!mode.utf8 && val > 255) || val > MAX_UNICODE) { if ((!mode.utf8 && val > 255) || val > MAX_UNICODE) {
throw LocatedParseError("Value in \\o{...} sequence is too large"); throw LocatedParseError("Value in \\o{...} sequence is too large");
@ -1505,7 +1504,7 @@ unichar readUtf8CodePoint4c(const u8 *ts) {
}; };
# Unicode Hex # Unicode Hex
'\\x{' xdigit+ '}' => { '\\x{' xdigit+ '}' => {
string hex((const char *)ts + 3, te - ts - 4); string hex(ts + 3, te - ts - 4);
long int val = strtol(hex.c_str(), nullptr, 16); long int val = strtol(hex.c_str(), nullptr, 16);
if (val > MAX_UNICODE) { if (val > MAX_UNICODE) {
throw LocatedParseError("Value in \\x{...} sequence is too large"); throw LocatedParseError("Value in \\x{...} sequence is too large");
@ -1529,8 +1528,8 @@ unichar readUtf8CodePoint4c(const u8 *ts) {
# A bunch of unsupported (for now) escapes # A bunch of unsupported (for now) escapes
escapedUnsupported => { escapedUnsupported => {
ostringstream str; ostringstream str;
str << "'\\" << (char)*(ts + 1) << "' at index " str << "'\\" << *(ts + 1) << "' at index " << ts - ptr
<< ts - ptr << " not supported."; << " not supported.";
throw ParseError(str.str()); throw ParseError(str.str());
}; };
@ -1831,24 +1830,22 @@ unichar readUtf8CodePoint4c(const u8 *ts) {
%% write data nofinal; %% write data nofinal;
/** \brief Main parser call, returns root Component or nullptr. */ /** \brief Main parser call, returns root Component or nullptr. */
unique_ptr<Component> parse(const char *c_ptr, ParseMode &globalMode) { unique_ptr<Component> parse(const char *ptr, ParseMode &globalMode) {
assert(c_ptr); assert(ptr);
const u8 *ptr = (const u8 *const)c_ptr; const char *p = ptr;
const u8 *p = ptr; const char *pe = ptr + strlen(ptr);
const u8 *pe = ptr + strlen(c_ptr);
// First, read the control verbs, set any global mode flags and move the // First, read the control verbs, set any global mode flags and move the
// ptr forward. // ptr forward.
p = (const u8 *)read_control_verbs((const char *)p, (const char *)pe, p = read_control_verbs(p, pe, globalMode);
globalMode);
const u8 *eof = pe; const char *eof = pe;
int cs; int cs;
UNUSED int act; UNUSED int act;
int top; int top;
vector<int> stack; vector<int> stack;
const u8 *ts, *te; const char *ts, *te;
unichar accumulator = 0; unichar accumulator = 0;
unichar octAccumulator = 0; /* required as we are also accumulating for unichar octAccumulator = 0; /* required as we are also accumulating for
* back ref when looking for octals */ * back ref when looking for octals */
@ -1894,7 +1891,7 @@ unique_ptr<Component> parse(const char *c_ptr, ParseMode &globalMode) {
bool inCharClassEarly = false; bool inCharClassEarly = false;
// Location at which the current character class began. // Location at which the current character class began.
const u8 *currentClsBegin = p; const char *currentClsBegin = p;
// We throw exceptions on various parsing failures beyond this point: we // We throw exceptions on various parsing failures beyond this point: we
// use a try/catch block here to clean up our allocated memory before we // use a try/catch block here to clean up our allocated memory before we