vectorscan/src/util/ue2string.cpp
Justin Viiret 1aad3b0ed1 ue2_literal: make nocase member a dynamic_bitset
We were previously using vector<bool>, but dynamic_bitset provides a
faster any() impl
2017-09-18 13:26:18 +10:00

392 lines
10 KiB
C++

/*
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Tools for string manipulation, ue2_literal definition.
*/
#include "ue2string.h"
#include "charreach.h"
#include "compare.h"
#include <algorithm>
#include <cstring>
#include <iomanip>
#include <sstream>
#include <string>
using namespace std;
namespace ue2 {
#if defined(DUMP_SUPPORT) || defined(DEBUG)
/**
 * \brief Render \p s using only printable ASCII characters.
 *
 * Bytes in the range 0x20..0x7e, other than the backslash, are copied
 * unchanged. Newline, carriage return and tab are written as a backslash
 * followed by 'n', 'r' or 't' respectively. Every other byte, the backslash
 * included, is written as a backslash, 'x' and two lower-case hex digits.
 */
string escapeString(const string &s) {
    ostringstream out;
    for (const char ch : s) {
        switch (ch) {
        case '\n':
            out << "\\n";
            break;
        case '\r':
            out << "\\r";
            break;
        case '\t':
            out << "\\t";
            break;
        default:
            if (ch >= 0x20 && ch <= 0x7e && ch != '\\') {
                out << ch;
            } else {
                // Mask to one byte so that a negative char is not printed as
                // a sign-extended value.
                out << "\\x" << hex << setw(2) << setfill('0')
                    << (unsigned)(ch & 0xff) << dec;
            }
            break;
        }
    }
    return out.str();
}
string escapeString(const ue2_literal &lit) {
ostringstream os;
for (ue2_literal::const_iterator it = lit.begin(); it != lit.end(); ++it) {
char c = it->c;
if (0x20 <= c && c <= 0x7e && c != '\\') {
os << c;
} else if (c == '\n') {
os << "\\n";
} else {
os << "\\x" << hex << setw(2) << setfill('0')
<< (unsigned)(c & 0xff) << dec;
}
}
return os.str();
}
/**
 * \brief Put a backslash in front of selected metacharacters in \p s.
 *
 * The characters that receive a leading backslash are exactly
 * # $ ( ) * + . / ? [ ] ^ and |. All other characters, including the
 * backslash itself and curly braces, are copied through unchanged.
 */
string escapeStringMeta(const string &s) {
    const string metachars("#$()*+./?[]^|");
    string out;
    for (const char ch : s) {
        if (metachars.find(ch) != string::npos) {
            out.push_back('\\');
        }
        out.push_back(ch);
    }
    return out;
}
/**
 * \brief Escape \p s with escapeString(), then additionally backslash-escape
 * every double quote and backslash in that result.
 *
 * NOTE(review): going by the name, this is presumably meant for text embedded
 * in Graphviz DOT output -- confirm against callers.
 */
string dotEscapeString(const string &s) {
    const string printable = escapeString(s);
    string result;
    result.reserve(printable.size());
    for (const char ch : printable) {
        if (ch == '\"' || ch == '\\') {
            result.push_back('\\');
        }
        result.push_back(ch);
    }
    return result;
}
/**
 * \brief Printable form of \p lit for dumps: the escaped underlying string,
 * followed by " (nocase)" when the literal has any caseless character.
 */
string dumpString(const ue2_literal &lit) {
    string text = escapeString(lit.get_string());
    const bool caseless = lit.any_nocase();
    if (caseless) {
        text.append(" (nocase)");
    }
    return text;
}
#endif
/** \brief Replace every character of \p s, in place, with mytoupper() of it. */
void upperString(string &s) {
    std::transform(s.begin(), s.end(), s.begin(),
                   [](char ch) -> char { return mytoupper(ch); });
}
/**
 * \brief Largest overlap between the end of \p a and the start of \p b.
 *
 * Two phases, tried from the largest candidate downwards:
 *  - for candidates i longer than \p a (only possible when \p b is longer),
 *    i is returned if the whole of \p a equals the lena characters of \p b
 *    that end at offset i;
 *  - for the remaining candidates, i is returned if the last i characters of
 *    \p a equal the first i characters of \p b.
 * Returns 0 if no candidate matches.
 *
 * \param nocase  passed through to cmp() to select caseless comparison.
 */
size_t maxStringOverlap(const string &a, const string &b, bool nocase) {
    const size_t a_len = a.length();
    const size_t b_len = b.length();
    const char *a_data = a.c_str();
    const char *b_data = b.c_str();

    size_t overlap = b_len;

    // Candidates that are longer than a: all of a must sit inside b.
    while (overlap > a_len) {
        if (cmp(a_data, b_data + overlap - a_len, a_len, nocase) == 0) {
            return overlap;
        }
        --overlap;
    }

    // Candidates no longer than a: suffix of a against prefix of b.
    const char *a_end = a_data + a_len;
    while (overlap != 0 && cmp(a_end - overlap, b_data, overlap, nocase) != 0) {
        --overlap;
    }
    return overlap;
}
/**
 * \brief Literal overload of maxStringOverlap().
 *
 * Forwards the underlying strings of \p a and \p b to the string overload.
 * The comparison is caseless for the whole of both strings as soon as either
 * literal contains any caseless character; per-character case sensitivity is
 * not taken into account (hence the todo below).
 */
size_t maxStringOverlap(const ue2_literal &a, const ue2_literal &b) {
/* todo: handle nocase better */
return maxStringOverlap(a.get_string(), b.get_string(),
a.any_nocase() || b.any_nocase());
}
/**
 * \brief Length of the longest proper suffix of \p a that is also a prefix of
 * \p a.
 *
 * Candidates are tried from length - 1 downwards; the first length i for
 * which the last i characters equal the first i characters is returned, or 0
 * if there is none.
 *
 * \param nocase  passed through to cmp() to select caseless comparison.
 * \return the overlap length; 0 for an empty or single-character string.
 */
size_t maxStringSelfOverlap(const string &a, bool nocase) {
    const size_t len = a.length();
    if (len == 0) {
        // Without this guard, len - 1 would wrap around to the largest
        // size_t and cmp() would be handed an out-of-range pointer and length.
        return 0;
    }

    const char *start = a.c_str();
    const char *end = start + len;

    size_t i = len - 1;
    while (i != 0 && cmp(end - i, start, i, nocase) != 0) {
        --i;
    }
    return i;
}
/**
 * \brief Compare the first \p len bytes of \p a and \p b.
 *
 * \return 0 if the ranges are equal and a non-zero value otherwise. In the
 * case-sensitive path the value is the memcmp() result converted to u32; in
 * the caseless path it is 1 on the first position where mytoupper() of the
 * two bytes differs.
 */
u32 cmp(const char *a, const char *b, size_t len, bool nocase) {
    if (nocase) {
        for (size_t i = 0; i < len; ++i) {
            if (mytoupper(a[i]) != mytoupper(b[i])) {
                return 1;
            }
        }
        return 0;
    }
    return memcmp(a, b, len);
}
/**
 * \brief Start a case iteration over \p ss.
 *
 * Both the working string and the saved original are initialised from the
 * literal's string, and the nocase flag of every element is appended to the
 * iterator's own flag list.
 */
case_iter::case_iter(const ue2_literal &ss)
    : s(ss.get_string()), s_orig(ss.get_string()) {
    for (const auto &e : ss) {
        nocase.push_back(e.nocase);
    }
}
/** \brief Returns a case_iter constructed from the literal \p s. */
case_iter caseIterateBegin(const ue2_literal &s) {
return case_iter(s);
}
/**
 * \brief Returns a case_iter constructed from a default-constructed literal.
 *
 * NOTE(review): presumably the past-the-end sentinel paired with
 * caseIterateBegin(), since case_iter::operator++ clears its string once
 * exhausted; the comparison operator is not visible here -- confirm.
 */
case_iter caseIterateEnd() {
return case_iter(ue2_literal());
}
/**
 * \brief Advance to the next case variant of the string.
 *
 * Scanning from the last character towards the first, finds the first
 * position that is flagged nocase and whose current character differs from
 * its mytolower() form. That character is replaced by the lower-case form and
 * every character after it is restored from the saved original string.
 * If no such position exists the working string is cleared.
 */
case_iter &case_iter::operator++ () {
    size_t pos = s.length();
    while (pos != 0) {
        --pos;
        const char folded = mytolower(s[pos]);
        if (!nocase[pos] || folded == s[pos]) {
            continue; // nothing left to flip at this position
        }
        s[pos] = folded;
        // Reset the tail to its starting state so its variants are revisited.
        copy(s_orig.begin() + (pos + 1), s_orig.end(), s.begin() + (pos + 1));
        return *this;
    }

    s.clear();
    return *this;
}
/**
 * \brief Returns an upper-cased copy of \p s.
 *
 * The argument is taken by value, converted in place with upperString() and
 * returned, so the caller's string is left untouched.
 */
static
string toUpperString(string s) {
upperString(s);
return s;
}
/**
 * \brief Convert this element to the set of characters it stands for.
 *
 * A case-sensitive element yields a CharReach built from its character alone.
 * A caseless element yields a CharReach with both the mytoupper() and the
 * mytolower() form of its character set.
 */
ue2_literal::elem::operator CharReach () const {
    if (nocase) {
        CharReach both_cases;
        both_cases.set(mytoupper(c));
        both_cases.set(mytolower(c));
        return both_cases;
    }
    return CharReach(c);
}
/**
 * \brief Build a literal from a string.
 *
 * \param s_in   source characters.
 * \param nc_in  when true, the stored string is the upper-cased form of
 *               \p s_in and the nocase bit is set for every position where
 *               ourisalpha() holds; when false, the string is stored as
 *               given and all nocase bits stay clear.
 */
ue2_literal::ue2_literal(const std::string &s_in, bool nc_in)
    : s(nc_in ? toUpperString(s_in) : s_in), nocase(s_in.size()) {
    if (!nc_in) {
        return;
    }

    // Only alphabetic characters are marked caseless.
    size_t idx = 0;
    for (const char ch : s) {
        if (ourisalpha(ch)) {
            nocase.set(idx);
        }
        ++idx;
    }
}
/**
 * \brief Build a one-character literal.
 *
 * With \p nc set, the stored character is mytoupper() of \p c; otherwise it
 * is \p c itself. The single nocase bit takes the value of \p nc only when
 * ourisalpha(c) holds, and is clear otherwise.
 */
ue2_literal::ue2_literal(char c, bool nc)
: s(1, nc ? mytoupper(c) : c), nocase(1, ourisalpha(c) ? nc : false) {}
/**
 * \brief Copy out a sub-literal, keeping the nocase flag of each character.
 *
 * \param pos  index of the first character to copy.
 * \param n    maximum number of characters to copy; the range is clamped to
 *             the end of the literal, as with std::string::substr().
 * \return a literal whose string is s.substr(pos, n) and whose nocase bits
 *         are those of the same positions in this literal.
 *
 * The character count is taken from the extracted string rather than being
 * recomputed as n + pos, so a very large \p n other than npos can no longer
 * wrap around and produce a bogus range.
 */
ue2_literal ue2_literal::substr(size_type pos, size_type n) const {
    // The flag copy below relies on string and bitset having equal length.
    assert(s.size() == nocase.size());

    ue2_literal rv;
    // std::string::substr() rejects pos > size() by throwing, so from here on
    // pos + count is known to be within bounds.
    rv.s = s.substr(pos, n);

    const size_type count = rv.s.size();
    rv.nocase.resize(count, false);
    for (size_type i = 0; i < count; i++) {
        rv.nocase.set(i, nocase.test(pos + i));
    }

    assert(rv.s.size() == rv.nocase.size());
    return rv;
}
/**
 * \brief Remove up to \p n characters starting at \p pos, together with
 * their nocase flags.
 *
 * \param pos  index of the first character to remove.
 * \param n    maximum number of characters to remove; npos, or any count
 *             reaching the end of the literal, removes everything from
 *             \p pos onwards.
 * \return a reference to this literal.
 *
 * The "does a tail survive?" test is written as n < size - pos instead of
 * computing pos + n up front, so a very large \p n other than npos cannot
 * wrap around and send the shift loop to the wrong indices.
 */
ue2_literal &ue2_literal::erase(size_type pos, size_type n) {
    // The flag shift below relies on string and bitset having equal length.
    assert(s.size() == nocase.size());

    // std::string::erase() rejects pos > size() by throwing, so from here on
    // nocase.size() - pos cannot underflow.
    s.erase(pos, n);

    // nocase still has the pre-erase length at this point.
    if (n < nocase.size() - pos) {
        // Characters after the erased range survive: slide their flags down.
        for (size_type i = pos + n; i < nocase.size(); i++) {
            nocase.set(i - n, nocase.test(i));
        }
    }
    nocase.resize(s.size());
    return *this;
}
/**
 * \brief Append one character and its nocase flag.
 *
 * \param c   character to append; stored as mytoupper(c) when \p nc is set.
 * \param nc  nocase flag for the new character. Asserted to be set only for
 *            characters for which ourisalpha() holds.
 */
void ue2_literal::push_back(char c, bool nc) {
    assert(!nc || ourisalpha(c));
    const char stored = nc ? mytoupper(c) : c;
    nocase.push_back(nc);
    s.push_back(stored);
}
/**
 * \brief Return a copy of \p in with its elements in reverse order.
 *
 * Each element is re-appended with its own character and nocase flag; an
 * empty input gives an empty result.
 */
ue2_literal reverse_literal(const ue2_literal &in) {
    ue2_literal reversed;
    const auto first = in.begin();
    auto cursor = in.end();
    while (cursor != first) {
        --cursor;
        reversed.push_back(cursor->c, cursor->nocase);
    }
    return reversed;
}
/**
 * \brief Strict ordering: by underlying string first, then, for equal
 * strings, by the nocase bitset.
 */
bool ue2_literal::operator<(const ue2_literal &b) const {
    const int order = s.compare(b.s);
    if (order != 0) {
        return order < 0;
    }
    return nocase < b.nocase;
}
/**
 * \brief Append literal \p b (characters and nocase flags) to this literal.
 *
 * Safe when \p b is this literal itself: the number of flags to copy is
 * read before the bitset is grown. Otherwise the loop bound would be the
 * already-doubled size and the copy would write past the end.
 */
void ue2_literal::operator+=(const ue2_literal &b) {
    const size_t prefix = nocase.size();
    const size_t added = b.nocase.size(); // capture first: b may alias *this

    s += b.s;
    nocase.resize(prefix + added);
    for (size_t i = 0; i < added; i++) {
        nocase.set(prefix + i, b.nocase[i]);
    }
}
/** \brief True if at least one bit of the nocase bitset is set. */
bool ue2_literal::any_nocase() const {
return nocase.any();
}
/**
 * \brief Rebuild \p lit so that every alphabetic character is caseless.
 *
 * A fresh literal is built by re-appending each character with its nocase
 * flag set to ourisalpha() of that character, then swapped into \p lit.
 */
void make_nocase(ue2_literal *lit) {
    ue2_literal caseless;
    for (auto it = lit->begin(), last = lit->end(); it != last; ++it) {
        const char ch = it->c;
        caseless.push_back(ch, ourisalpha(ch));
    }
    lit->swap(caseless);
}
/**
 * \brief Test whether character \p c is a member of \p cr.
 *
 * With \p nocase set, the result is true if either the mytolower() or the
 * mytoupper() form of \p c is in \p cr; otherwise \p c itself is tested.
 */
static
bool testchar(char c, const CharReach &cr, bool nocase) {
    if (!nocase) {
        return cr.test((unsigned char)c);
    }
    return cr.test((unsigned char)mytolower(c))
        || cr.test((unsigned char)mytoupper(c));
}
// Returns true if the given literal contains a char in the given CharReach
bool contains(const ue2_literal &s, const CharReach &cr) {
for (ue2_literal::const_iterator it = s.begin(), ite = s.end();
it != ite; ++it) {
if (testchar(it->c, cr, it->nocase)) {
return true;
}
}
return false;
}
/**
 * \brief Literal overload of maxStringSelfOverlap().
 *
 * Forwards the underlying string to the string overload, comparing
 * caselessly over the whole string if the literal has any caseless character.
 */
size_t maxStringSelfOverlap(const ue2_literal &a) {
/* overly conservative if only part of the string is nocase, TODO: fix */
return maxStringSelfOverlap(a.get_string(), a.any_nocase());
}
/**
 * \brief Returns the length of \p a minus its maximum self-overlap, as
 * computed by maxStringSelfOverlap().
 */
size_t minStringPeriod(const ue2_literal &a) {
return a.length() - maxStringSelfOverlap(a);
}
// Returns true if `a' is a suffix of (or equal to) `b'.
bool isSuffix(const ue2_literal &a, const ue2_literal &b) {
size_t alen = a.length(), blen = b.length();
if (alen > blen) {
return false;
}
return equal(a.begin(), a.end(), b.begin() + (blen - alen));
}
bool is_flood(const ue2_literal &s) {
assert(!s.empty());
ue2_literal::const_iterator it = s.begin(), ite = s.end();
ue2_literal::elem f = *it;
for (++it; it != ite; ++it) {
if (*it != f) {
return false;
}
}
return true;
}
} // namespace ue2