mirror of
https://github.com/VectorCamp/vectorscan.git
synced 2025-06-28 16:41:01 +03:00
392 lines
10 KiB
C++
392 lines
10 KiB
C++
/*
 * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
|
|
|
|
/** \file
 * \brief Tools for string manipulation, ue2_literal definition.
 */
|
|
|
|
#include "ue2string.h"
|
|
|
|
#include "charreach.h"
|
|
#include "compare.h"
|
|
|
|
#include <algorithm>
|
|
#include <cstring>
|
|
#include <iomanip>
|
|
#include <sstream>
|
|
#include <string>
|
|
|
|
using namespace std;
|
|
|
|
namespace ue2 {
|
|
|
|
#if defined(DUMP_SUPPORT) || defined(DEBUG)
|
|
|
|
/**
 * \brief Escape a string so that it is screen-printable.
 *
 * Printable ASCII bytes (0x20 to 0x7e) other than the backslash are copied
 * through unchanged. Newline, carriage return and tab are written as a
 * backslash followed by 'n', 'r' or 't' respectively. Every other byte,
 * including the backslash itself, is written as a backslash, an 'x' and two
 * lowercase hex digits.
 */
string escapeString(const string &s) {
    ostringstream out;
    for (const char ch : s) {
        // Whitespace characters with a conventional short escape.
        switch (ch) {
        case '\n':
            out << "\\n";
            continue;
        case '\r':
            out << "\\r";
            continue;
        case '\t':
            out << "\\t";
            continue;
        default:
            break;
        }

        const bool printable = ch >= 0x20 && ch <= 0x7e && ch != '\\';
        if (printable) {
            out << ch;
        } else {
            // Mask to a byte so that negative chars print as two hex digits.
            out << "\\x" << hex << setw(2) << setfill('0')
                << (unsigned)(ch & 0xff) << dec;
        }
    }
    return out.str();
}
|
|
|
|
string escapeString(const ue2_literal &lit) {
|
|
ostringstream os;
|
|
for (ue2_literal::const_iterator it = lit.begin(); it != lit.end(); ++it) {
|
|
char c = it->c;
|
|
if (0x20 <= c && c <= 0x7e && c != '\\') {
|
|
os << c;
|
|
} else if (c == '\n') {
|
|
os << "\\n";
|
|
} else {
|
|
os << "\\x" << hex << setw(2) << setfill('0')
|
|
<< (unsigned)(c & 0xff) << dec;
|
|
}
|
|
}
|
|
return os.str();
|
|
}
|
|
|
|
/**
 * \brief Escape metacharacters in a literal string.
 *
 * Each of the characters # $ ( ) * + . / ? [ ] ^ | is prefixed with a
 * backslash; all other characters are copied through unchanged.
 */
string escapeStringMeta(const string &s) {
    // The set of characters that receive a backslash prefix.
    static const string metachars("#$()*+./?[]^|");

    ostringstream out;
    for (const char ch : s) {
        if (metachars.find(ch) != string::npos) {
            out << "\\";
        }
        out << ch;
    }
    return out.str();
}
|
|
|
|
string dotEscapeString(const string &s) {
|
|
string ss = escapeString(s);
|
|
string out;
|
|
out.reserve(ss.size());
|
|
for (size_t i = 0; i != ss.size(); i++) {
|
|
char c = ss[i];
|
|
switch (c) {
|
|
case '\"':
|
|
case '\\':
|
|
out.push_back('\\');
|
|
// fall through
|
|
default:
|
|
out.push_back(c);
|
|
break;
|
|
}
|
|
}
|
|
return out;
|
|
}
|
|
|
|
string dumpString(const ue2_literal &lit) {
|
|
string s = escapeString(lit.get_string());
|
|
if (lit.any_nocase()) {
|
|
s += " (nocase)";
|
|
}
|
|
|
|
return s;
|
|
}
|
|
#endif
|
|
|
|
void upperString(string &s) {
|
|
for (auto &c : s) {
|
|
c = mytoupper(c);
|
|
}
|
|
}
|
|
|
|
/**
 * \brief Compute the overlap between \a a and \a b.
 *
 * Candidate values are tried from b.length() downwards:
 *  - while the candidate i exceeds a.length(), i is returned if the whole of
 *    \a a matches the a.length() characters of \a b that end at offset i;
 *  - otherwise, i is returned if the last i characters of \a a match the
 *    first i characters of \a b.
 *
 * Returns 0 if no candidate matches. Comparison is done with cmp(), using
 * the supplied \a nocase flag.
 */
size_t maxStringOverlap(const string &a, const string &b, bool nocase) {
    const size_t a_len = a.length();
    const char *const a_data = a.c_str();
    const char *const b_data = b.c_str();

    size_t overlap = b.length();

    // Candidates longer than a: a must appear whole inside b.
    while (overlap > a_len) {
        if (cmp(a_data, b_data + (overlap - a_len), a_len, nocase) == 0) {
            return overlap;
        }
        --overlap;
    }

    // Candidates no longer than a: suffix of a against prefix of b.
    while (overlap != 0
           && cmp(a_data + (a_len - overlap), b_data, overlap, nocase) != 0) {
        --overlap;
    }

    return overlap;
}
|
|
|
|
size_t maxStringOverlap(const ue2_literal &a, const ue2_literal &b) {
|
|
/* todo: handle nocase better */
|
|
return maxStringOverlap(a.get_string(), b.get_string(),
|
|
a.any_nocase() || b.any_nocase());
|
|
}
|
|
|
|
/**
 * \brief Length of the longest proper suffix of \a a that is also a prefix
 * of \a a.
 *
 * Candidate lengths are tried from a.length() - 1 downwards and compared
 * with cmp(), using the supplied \a nocase flag. Returns 0 if there is no
 * such suffix, or if \a a is empty.
 */
size_t maxStringSelfOverlap(const string &a, bool nocase) {
    const size_t lena = a.length();
    if (lena == 0) {
        // Guard: "lena - 1" would wrap around for an empty string and send
        // an enormous length and an out-of-range pointer to cmp().
        return 0;
    }

    const char *astart = a.c_str();
    const char *aend = astart + lena;

    size_t i = lena - 1;
    while (i != 0 && cmp(aend - i, astart, i, nocase) != 0) {
        --i;
    }

    return i;
}
|
|
|
|
/**
 * \brief Compare \a len bytes at \a a and \a b.
 *
 * Returns zero if the two ranges match and a non-zero value otherwise. In
 * case-sensitive mode the result is that of memcmp() converted to u32; in
 * caseless mode, bytes are compared after mytoupper() and the result is 0
 * or 1.
 */
u32 cmp(const char *a, const char *b, size_t len, bool nocase) {
    if (nocase) {
        for (size_t i = 0; i < len; i++) {
            if (mytoupper(a[i]) != mytoupper(b[i])) {
                return 1;
            }
        }
        return 0;
    }

    return memcmp(a, b, len);
}
|
|
|
|
/**
 * \brief Construct a case iterator over the given literal.
 *
 * Both the working string and the saved original are initialised from the
 * literal's string, and the nocase flag of each character is recorded.
 */
case_iter::case_iter(const ue2_literal &ss)
    : s(ss.get_string()), s_orig(ss.get_string()) {
    for (const auto &e : ss) {
        nocase.push_back(e.nocase);
    }
}
|
|
|
|
/** \brief Return a case_iter constructed from the literal \a s. */
case_iter caseIterateBegin(const ue2_literal &s) {
    case_iter first(s);
    return first;
}
|
|
|
|
/**
 * \brief Return a case_iter constructed from an empty literal.
 *
 * NOTE(review): case_iter::operator++ clears its working string once all
 * variants are exhausted, so this presumably serves as the end sentinel;
 * confirm against the comparison operators in the header.
 */
case_iter caseIterateEnd() {
    ue2_literal empty_literal;
    return case_iter(empty_literal);
}
|
|
|
|
/**
 * \brief Advance to the next case variant.
 *
 * Scanning from the end of the working string, the last caseless position
 * whose character differs from its mytolower() form is lowered, and every
 * position after it is restored from the saved original string. If no such
 * position exists, the working string is cleared.
 */
case_iter &case_iter::operator++ () {
    size_t i = s.length();
    while (i != 0) {
        const size_t idx = i - 1;
        const char lowered = mytolower(s[idx]);
        if (nocase[idx] && s[idx] != lowered) {
            s[idx] = lowered;
            // Reset everything after the flipped position to its original.
            std::copy(s_orig.begin() + i, s_orig.end(), s.begin() + i);
            return *this;
        }
        --i;
    }

    // No position left to flip: all variants have been produced.
    s.clear();
    return *this;
}
|
|
|
|
static
|
|
string toUpperString(string s) {
|
|
upperString(s);
|
|
return s;
|
|
}
|
|
|
|
/**
 * \brief Convert a literal element to the set of characters it matches.
 *
 * A case-sensitive element yields a CharReach built from its character
 * alone; a caseless element yields a CharReach with both the mytoupper() and
 * mytolower() forms of its character set.
 */
ue2_literal::elem::operator CharReach () const {
    if (nocase) {
        CharReach both_cases;
        both_cases.set(mytoupper(c));
        both_cases.set(mytolower(c));
        return both_cases;
    }

    return CharReach(c);
}
|
|
|
|
/**
 * \brief Construct a literal from a string.
 *
 * If \a nc_in is set, the string is stored in upper case and the nocase bit
 * is switched on for every alpha character; otherwise the string is stored
 * as given and all nocase bits are left clear.
 */
ue2_literal::ue2_literal(const std::string &s_in, bool nc_in)
    : s(nc_in ? toUpperString(s_in) : s_in), nocase(s_in.size()) {
    if (!nc_in) {
        return;
    }

    // Switch on nocase bit for all alpha characters.
    const size_t len = s.length();
    for (size_t idx = 0; idx != len; ++idx) {
        if (ourisalpha(s[idx])) {
            nocase.set(idx);
        }
    }
}
|
|
|
|
/**
 * \brief Construct a one-character literal.
 *
 * If \a nc is set the character is stored via mytoupper(); its nocase bit is
 * set only when \a nc is set and the character is alpha.
 */
ue2_literal::ue2_literal(char c, bool nc)
    : s(1, nc ? mytoupper(c) : c), nocase(1, nc && ourisalpha(c)) {}
|
|
|
|
/**
 * \brief Return the sub-literal starting at \a pos and spanning at most \a n
 * characters, carrying over the nocase bit of each character.
 *
 * Follows std::string::substr semantics: \a n is clamped to the characters
 * available after \a pos (so npos means "to the end"), and the underlying
 * std::string::substr call throws std::out_of_range if \a pos is past the
 * end of the literal.
 */
ue2_literal ue2_literal::substr(size_type pos, size_type n) const {
    ue2_literal rv;
    // Throws if pos > s.size(), so pos is in range beyond this point.
    rv.s = s.substr(pos, n);
    assert(s.size() == nocase.size());

    // Clamp the count rather than testing "n + pos", which can wrap around
    // for a very large n that is not npos.
    const size_type len = std::min<size_type>(n, nocase.size() - pos);

    rv.nocase.resize(len, false);
    for (size_type i = 0; i < len; i++) {
        rv.nocase.set(i, nocase.test(pos + i));
    }

    // The result must uphold the same string/bitset length invariant.
    assert(rv.s.size() == rv.nocase.size());
    return rv;
}
|
|
|
|
/**
 * \brief Remove up to \a n characters starting at \a pos, together with
 * their nocase bits, and return a reference to this literal.
 *
 * Follows std::string::erase semantics: \a n is clamped to the characters
 * available after \a pos (so npos means "to the end"), and the underlying
 * std::string::erase call throws std::out_of_range if \a pos is past the end
 * of the literal.
 */
ue2_literal &ue2_literal::erase(size_type pos, size_type n) {
    // Throws if pos > s.size(), so pos is in range beyond this point.
    s.erase(pos, n);

    // nocase still has its pre-erase length here. Clamp the removed count
    // rather than using "pos + n", which can wrap around for a very large n
    // that is not npos. This assumes nocase was as long as s before the
    // erase.
    const size_type old_size = nocase.size();
    const size_type count = std::min<size_type>(n, old_size - pos);

    // Shift the bits that follow the erased range down over it.
    for (size_type i = pos + count; i < old_size; i++) {
        nocase.set(i - count, nocase.test(i));
    }

    nocase.resize(s.size());
    return *this;
}
|
|
|
|
void ue2_literal::push_back(char c, bool nc) {
|
|
assert(!nc || ourisalpha(c));
|
|
if (nc) {
|
|
c = mytoupper(c);
|
|
}
|
|
nocase.push_back(nc);
|
|
s.push_back(c);
|
|
}
|
|
|
|
// Return a copy of this literal in reverse order.
|
|
ue2_literal reverse_literal(const ue2_literal &in) {
|
|
ue2_literal rv;
|
|
if (in.empty()) {
|
|
return rv;
|
|
}
|
|
|
|
for (ue2_literal::const_iterator it = in.end(); it != in.begin();) {
|
|
--it;
|
|
rv.push_back(it->c, it->nocase);
|
|
}
|
|
return rv;
|
|
}
|
|
|
|
/**
 * \brief Ordering on literals: compare the underlying strings first and, if
 * they are equal, compare the nocase bitsets.
 */
bool ue2_literal::operator<(const ue2_literal &b) const {
    if (s != b.s) {
        return s < b.s;
    }
    return nocase < b.nocase;
}
|
|
|
|
/**
 * \brief Append the literal \a b, characters and nocase bits alike, to this
 * literal.
 *
 * Safe when \a b is this literal itself.
 */
void ue2_literal::operator+=(const ue2_literal &b) {
    // Capture both lengths before modifying anything: when b aliases *this,
    // b.nocase.size() changes once nocase is resized, which would make the
    // copy loop run past the end of the bitset.
    const size_t prefix = nocase.size();
    const size_t added = b.nocase.size();

    s += b.s;

    nocase.resize(prefix + added);
    for (size_t i = 0; i < added; i++) {
        nocase.set(prefix + i, b.nocase[i]);
    }
}
|
|
|
|
bool ue2_literal::any_nocase() const {
|
|
return nocase.any();
|
|
}
|
|
|
|
/**
 * \brief Rewrite the literal in place so that every alpha character is
 * caseless and every other character is case-sensitive.
 *
 * The replacement is built with push_back() and then swapped into \a lit.
 */
void make_nocase(ue2_literal *lit) {
    ue2_literal folded;

    for (auto it = lit->begin(), ite = lit->end(); it != ite; ++it) {
        const char ch = it->c;
        folded.push_back(ch, ourisalpha(ch));
    }

    lit->swap(folded);
}
|
|
|
|
static
|
|
bool testchar(char c, const CharReach &cr, bool nocase) {
|
|
if (nocase) {
|
|
return cr.test((unsigned char)mytolower(c))
|
|
|| cr.test((unsigned char)mytoupper(c));
|
|
} else {
|
|
return cr.test((unsigned char)c);
|
|
}
|
|
}
|
|
|
|
// Returns true if the given literal contains a char in the given CharReach
|
|
bool contains(const ue2_literal &s, const CharReach &cr) {
|
|
for (ue2_literal::const_iterator it = s.begin(), ite = s.end();
|
|
it != ite; ++it) {
|
|
if (testchar(it->c, cr, it->nocase)) {
|
|
return true;
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
|
|
size_t maxStringSelfOverlap(const ue2_literal &a) {
|
|
/* overly conservative if only part of the string is nocase, TODO: fix */
|
|
return maxStringSelfOverlap(a.get_string(), a.any_nocase());
|
|
}
|
|
|
|
size_t minStringPeriod(const ue2_literal &a) {
|
|
return a.length() - maxStringSelfOverlap(a);
|
|
}
|
|
|
|
// Returns true if `a' is a suffix of (or equal to) `b'.
|
|
bool isSuffix(const ue2_literal &a, const ue2_literal &b) {
|
|
size_t alen = a.length(), blen = b.length();
|
|
if (alen > blen) {
|
|
return false;
|
|
}
|
|
return equal(a.begin(), a.end(), b.begin() + (blen - alen));
|
|
}
|
|
|
|
bool is_flood(const ue2_literal &s) {
|
|
assert(!s.empty());
|
|
|
|
ue2_literal::const_iterator it = s.begin(), ite = s.end();
|
|
ue2_literal::elem f = *it;
|
|
for (++it; it != ite; ++it) {
|
|
if (*it != f) {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
} // namespace ue2
|