mirror of
https://github.com/VectorCamp/vectorscan.git
synced 2025-06-28 16:41:01 +03:00
parser: switch to using char* pointers
This commit is contained in:
parent
1875d55cf1
commit
4def0c8a52
@ -116,7 +116,7 @@ unsigned parseAsDecimal(unsigned oct) {
|
|||||||
static constexpr u32 MAX_NUMBER = INT_MAX;
|
static constexpr u32 MAX_NUMBER = INT_MAX;
|
||||||
|
|
||||||
static
|
static
|
||||||
void pushDec(u32 *acc, u8 raw_digit) {
|
void pushDec(u32 *acc, char raw_digit) {
|
||||||
assert(raw_digit >= '0' && raw_digit <= '9');
|
assert(raw_digit >= '0' && raw_digit <= '9');
|
||||||
u32 digit_val = raw_digit - '0';
|
u32 digit_val = raw_digit - '0';
|
||||||
|
|
||||||
@ -130,7 +130,7 @@ void pushDec(u32 *acc, u8 raw_digit) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
static
|
static
|
||||||
void pushOct(u32 *acc, u8 raw_digit) {
|
void pushOct(u32 *acc, char raw_digit) {
|
||||||
assert(raw_digit >= '0' && raw_digit <= '7');
|
assert(raw_digit >= '0' && raw_digit <= '7');
|
||||||
u32 digit_val = raw_digit - '0';
|
u32 digit_val = raw_digit - '0';
|
||||||
|
|
||||||
@ -169,8 +169,7 @@ ComponentSequence *enterSequence(ComponentSequence *parent,
|
|||||||
}
|
}
|
||||||
|
|
||||||
static
|
static
|
||||||
void addLiteral(ComponentSequence *currentSeq, unsigned char c,
|
void addLiteral(ComponentSequence *currentSeq, char c, const ParseMode &mode) {
|
||||||
const ParseMode &mode) {
|
|
||||||
if (mode.utf8 && mode.caseless) {
|
if (mode.utf8 && mode.caseless) {
|
||||||
/* leverage ComponentClass to generate the vertices */
|
/* leverage ComponentClass to generate the vertices */
|
||||||
auto cc = getComponentClass(mode);
|
auto cc = getComponentClass(mode);
|
||||||
@ -197,7 +196,7 @@ void addEscaped(ComponentSequence *currentSeq, unichar accum,
|
|||||||
if (accum > 255) {
|
if (accum > 255) {
|
||||||
throw LocatedParseError(err_msg);
|
throw LocatedParseError(err_msg);
|
||||||
}
|
}
|
||||||
addLiteral(currentSeq, (unsigned char)accum, mode);
|
addLiteral(currentSeq, (char)accum, mode);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -217,7 +216,7 @@ void addEscapedHex(ComponentSequence *currentSeq, unichar accum,
|
|||||||
#define SLASH_C_ERROR "\\c must be followed by an ASCII character"
|
#define SLASH_C_ERROR "\\c must be followed by an ASCII character"
|
||||||
|
|
||||||
static
|
static
|
||||||
u8 decodeCtrl(u8 raw) {
|
u8 decodeCtrl(char raw) {
|
||||||
if (raw & 0x80) {
|
if (raw & 0x80) {
|
||||||
throw LocatedParseError(SLASH_C_ERROR);
|
throw LocatedParseError(SLASH_C_ERROR);
|
||||||
}
|
}
|
||||||
@ -225,10 +224,10 @@ u8 decodeCtrl(u8 raw) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
static
|
static
|
||||||
unichar readUtf8CodePoint2c(const u8 *ts) {
|
unichar readUtf8CodePoint2c(const char *s) {
|
||||||
|
auto *ts = (const u8 *)s;
|
||||||
assert(ts[0] >= 0xc0 && ts[0] < 0xe0);
|
assert(ts[0] >= 0xc0 && ts[0] < 0xe0);
|
||||||
assert(ts[1] >= 0x80 && ts[1] < 0xc0);
|
assert(ts[1] >= 0x80 && ts[1] < 0xc0);
|
||||||
|
|
||||||
unichar val = ts[0] & 0x1f;
|
unichar val = ts[0] & 0x1f;
|
||||||
val <<= 6;
|
val <<= 6;
|
||||||
val |= ts[1] & 0x3f;
|
val |= ts[1] & 0x3f;
|
||||||
@ -238,7 +237,8 @@ unichar readUtf8CodePoint2c(const u8 *ts) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
static
|
static
|
||||||
unichar readUtf8CodePoint3c(const u8 *ts) {
|
unichar readUtf8CodePoint3c(const char *s) {
|
||||||
|
auto *ts = (const u8 *)s;
|
||||||
assert(ts[0] >= 0xe0 && ts[0] < 0xf0);
|
assert(ts[0] >= 0xe0 && ts[0] < 0xf0);
|
||||||
assert(ts[1] >= 0x80 && ts[1] < 0xc0);
|
assert(ts[1] >= 0x80 && ts[1] < 0xc0);
|
||||||
assert(ts[2] >= 0x80 && ts[2] < 0xc0);
|
assert(ts[2] >= 0x80 && ts[2] < 0xc0);
|
||||||
@ -253,7 +253,8 @@ unichar readUtf8CodePoint3c(const u8 *ts) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
static
|
static
|
||||||
unichar readUtf8CodePoint4c(const u8 *ts) {
|
unichar readUtf8CodePoint4c(const char *s) {
|
||||||
|
auto *ts = (const u8 *)s;
|
||||||
assert(ts[0] >= 0xf0 && ts[0] < 0xf8);
|
assert(ts[0] >= 0xf0 && ts[0] < 0xf8);
|
||||||
assert(ts[1] >= 0x80 && ts[1] < 0xc0);
|
assert(ts[1] >= 0x80 && ts[1] < 0xc0);
|
||||||
assert(ts[2] >= 0x80 && ts[2] < 0xc0);
|
assert(ts[2] >= 0x80 && ts[2] < 0xc0);
|
||||||
@ -273,12 +274,10 @@ unichar readUtf8CodePoint4c(const u8 *ts) {
|
|||||||
%%{
|
%%{
|
||||||
machine regex;
|
machine regex;
|
||||||
|
|
||||||
alphtype unsigned char;
|
|
||||||
|
|
||||||
action throwUnsupportedEscape {
|
action throwUnsupportedEscape {
|
||||||
ostringstream str;
|
ostringstream str;
|
||||||
str << "'\\" << (char)*(ts + 1) << "' at index "
|
str << "'\\" << *(ts + 1) << "' at index " << ts - ptr
|
||||||
<< ts - ptr << " not supported in a character class.";
|
<< " not supported in a character class.";
|
||||||
throw ParseError(str.str());
|
throw ParseError(str.str());
|
||||||
}
|
}
|
||||||
action unsupportedProperty {
|
action unsupportedProperty {
|
||||||
@ -974,7 +973,7 @@ unichar readUtf8CodePoint4c(const u8 *ts) {
|
|||||||
};
|
};
|
||||||
|
|
||||||
'\\o{' [0-7]+ '}' => {
|
'\\o{' [0-7]+ '}' => {
|
||||||
string oct((const char *)ts + 3, te - ts - 4);
|
string oct(ts + 3, te - ts - 4);
|
||||||
long int val = strtol(oct.c_str(), nullptr, 8);
|
long int val = strtol(oct.c_str(), nullptr, 8);
|
||||||
if ((!mode.utf8 && val > 255) || val > MAX_UNICODE) {
|
if ((!mode.utf8 && val > 255) || val > MAX_UNICODE) {
|
||||||
throw LocatedParseError("Value in \\o{...} sequence is too large");
|
throw LocatedParseError("Value in \\o{...} sequence is too large");
|
||||||
@ -999,7 +998,7 @@ unichar readUtf8CodePoint4c(const u8 *ts) {
|
|||||||
};
|
};
|
||||||
# Unicode Hex
|
# Unicode Hex
|
||||||
'\\x{' xdigit+ '}' => {
|
'\\x{' xdigit+ '}' => {
|
||||||
string hex((const char *)ts + 3, te - ts - 4);
|
string hex(ts + 3, te - ts - 4);
|
||||||
long int val = strtol(hex.c_str(), nullptr, 16);
|
long int val = strtol(hex.c_str(), nullptr, 16);
|
||||||
if (val > MAX_UNICODE) {
|
if (val > MAX_UNICODE) {
|
||||||
throw LocatedParseError("Value in \\x{...} sequence is too large");
|
throw LocatedParseError("Value in \\x{...} sequence is too large");
|
||||||
@ -1089,7 +1088,7 @@ unichar readUtf8CodePoint4c(const u8 *ts) {
|
|||||||
|
|
||||||
# Literal character
|
# Literal character
|
||||||
(any - ']') => {
|
(any - ']') => {
|
||||||
currentCls->add(*ts);
|
currentCls->add((u8)*ts);
|
||||||
};
|
};
|
||||||
|
|
||||||
']' => {
|
']' => {
|
||||||
@ -1443,7 +1442,7 @@ unichar readUtf8CodePoint4c(const u8 *ts) {
|
|||||||
// Otherwise, we interpret the first three digits as an
|
// Otherwise, we interpret the first three digits as an
|
||||||
// octal escape, and the remaining characters stand for
|
// octal escape, and the remaining characters stand for
|
||||||
// themselves as literals.
|
// themselves as literals.
|
||||||
const u8 *s = ts;
|
const char *s = ts;
|
||||||
unsigned int accum = 0;
|
unsigned int accum = 0;
|
||||||
unsigned int oct_digits = 0;
|
unsigned int oct_digits = 0;
|
||||||
assert(*s == '\\'); // token starts at backslash
|
assert(*s == '\\'); // token starts at backslash
|
||||||
@ -1488,7 +1487,7 @@ unichar readUtf8CodePoint4c(const u8 *ts) {
|
|||||||
throw LocatedParseError("Invalid reference after \\g");
|
throw LocatedParseError("Invalid reference after \\g");
|
||||||
};
|
};
|
||||||
'\\o{' [0-7]+ '}' => {
|
'\\o{' [0-7]+ '}' => {
|
||||||
string oct((const char *)ts + 3, te - ts - 4);
|
string oct(ts + 3, te - ts - 4);
|
||||||
long int val = strtol(oct.c_str(), nullptr, 8);
|
long int val = strtol(oct.c_str(), nullptr, 8);
|
||||||
if ((!mode.utf8 && val > 255) || val > MAX_UNICODE) {
|
if ((!mode.utf8 && val > 255) || val > MAX_UNICODE) {
|
||||||
throw LocatedParseError("Value in \\o{...} sequence is too large");
|
throw LocatedParseError("Value in \\o{...} sequence is too large");
|
||||||
@ -1505,7 +1504,7 @@ unichar readUtf8CodePoint4c(const u8 *ts) {
|
|||||||
};
|
};
|
||||||
# Unicode Hex
|
# Unicode Hex
|
||||||
'\\x{' xdigit+ '}' => {
|
'\\x{' xdigit+ '}' => {
|
||||||
string hex((const char *)ts + 3, te - ts - 4);
|
string hex(ts + 3, te - ts - 4);
|
||||||
long int val = strtol(hex.c_str(), nullptr, 16);
|
long int val = strtol(hex.c_str(), nullptr, 16);
|
||||||
if (val > MAX_UNICODE) {
|
if (val > MAX_UNICODE) {
|
||||||
throw LocatedParseError("Value in \\x{...} sequence is too large");
|
throw LocatedParseError("Value in \\x{...} sequence is too large");
|
||||||
@ -1529,8 +1528,8 @@ unichar readUtf8CodePoint4c(const u8 *ts) {
|
|||||||
# A bunch of unsupported (for now) escapes
|
# A bunch of unsupported (for now) escapes
|
||||||
escapedUnsupported => {
|
escapedUnsupported => {
|
||||||
ostringstream str;
|
ostringstream str;
|
||||||
str << "'\\" << (char)*(ts + 1) << "' at index "
|
str << "'\\" << *(ts + 1) << "' at index " << ts - ptr
|
||||||
<< ts - ptr << " not supported.";
|
<< " not supported.";
|
||||||
throw ParseError(str.str());
|
throw ParseError(str.str());
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -1831,24 +1830,22 @@ unichar readUtf8CodePoint4c(const u8 *ts) {
|
|||||||
%% write data nofinal;
|
%% write data nofinal;
|
||||||
|
|
||||||
/** \brief Main parser call, returns root Component or nullptr. */
|
/** \brief Main parser call, returns root Component or nullptr. */
|
||||||
unique_ptr<Component> parse(const char *c_ptr, ParseMode &globalMode) {
|
unique_ptr<Component> parse(const char *ptr, ParseMode &globalMode) {
|
||||||
assert(c_ptr);
|
assert(ptr);
|
||||||
|
|
||||||
const u8 *ptr = (const u8 *const)c_ptr;
|
const char *p = ptr;
|
||||||
const u8 *p = ptr;
|
const char *pe = ptr + strlen(ptr);
|
||||||
const u8 *pe = ptr + strlen(c_ptr);
|
|
||||||
|
|
||||||
// First, read the control verbs, set any global mode flags and move the
|
// First, read the control verbs, set any global mode flags and move the
|
||||||
// ptr forward.
|
// ptr forward.
|
||||||
p = (const u8 *)read_control_verbs((const char *)p, (const char *)pe,
|
p = read_control_verbs(p, pe, globalMode);
|
||||||
globalMode);
|
|
||||||
|
|
||||||
const u8 *eof = pe;
|
const char *eof = pe;
|
||||||
int cs;
|
int cs;
|
||||||
UNUSED int act;
|
UNUSED int act;
|
||||||
int top;
|
int top;
|
||||||
vector<int> stack;
|
vector<int> stack;
|
||||||
const u8 *ts, *te;
|
const char *ts, *te;
|
||||||
unichar accumulator = 0;
|
unichar accumulator = 0;
|
||||||
unichar octAccumulator = 0; /* required as we are also accumulating for
|
unichar octAccumulator = 0; /* required as we are also accumulating for
|
||||||
* back ref when looking for octals */
|
* back ref when looking for octals */
|
||||||
@ -1894,7 +1891,7 @@ unique_ptr<Component> parse(const char *c_ptr, ParseMode &globalMode) {
|
|||||||
bool inCharClassEarly = false;
|
bool inCharClassEarly = false;
|
||||||
|
|
||||||
// Location at which the current character class began.
|
// Location at which the current character class began.
|
||||||
const u8 *currentClsBegin = p;
|
const char *currentClsBegin = p;
|
||||||
|
|
||||||
// We throw exceptions on various parsing failures beyond this point: we
|
// We throw exceptions on various parsing failures beyond this point: we
|
||||||
// use a try/catch block here to clean up our allocated memory before we
|
// use a try/catch block here to clean up our allocated memory before we
|
||||||
|
Loading…
x
Reference in New Issue
Block a user