vectorscan/src/util/ue2string.h
2023-09-05 13:45:33 +03:00

337 lines
9.6 KiB
C++

/*
* Copyright (c) 2015-2019, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Tools for string manipulation, ue2_literal definition.
*/
#ifndef UE2STRING_H
#define UE2STRING_H
#include "ue2common.h"
#include "util/charreach.h"
#include "util/compare.h"
#include "util/hash.h"
#include "util/operators.h"
#include <iterator>
#include <string>
#include <vector>
#include <boost/dynamic_bitset.hpp>
#include <boost/iterator/iterator_facade.hpp>
namespace ue2 {
/// Force the given string to upper-case.
void upperString(std::string &s);
size_t maxStringOverlap(const std::string &a, const std::string &b,
bool nocase);
size_t maxStringSelfOverlap(const std::string &a, bool nocase);
/// Compares two strings, returns non-zero if they're different.
u32 cmp(const char *a, const char *b, size_t len, bool nocase);
/**
* \brief String type that also records whether the whole string is caseful or
* caseless.
*
* You should use \ref ue2_literal if you need to represent a mixed-case
* literal.
*/
struct ue2_case_string {
ue2_case_string(std::string s_in, bool nocase_in)
: s(std::move(s_in)), nocase(nocase_in) {
if (nocase) {
upperString(s);
}
}
bool operator==(const ue2_case_string &other) const {
return s == other.s && nocase == other.nocase;
}
std::string s;
bool nocase;
};
struct ue2_literal : totally_ordered<ue2_literal> {
public:
/// Single element proxy, pointed to by our const_iterator.
struct elem {
elem() : c(0), nocase(false) {}
elem(char c_in, bool nc_in) : c(c_in), nocase(nc_in) {}
bool operator==(const elem &o) const {
return c == o.c && nocase == o.nocase;
}
bool operator!=(const elem &o) const {
return c != o.c || nocase != o.nocase;
}
operator CharReach() const;
char c;
bool nocase;
};
/// Boost iterator_facade lets us synthesize an iterator simply.
class const_iterator : public boost::iterator_facade<
const_iterator,
elem const,
boost::random_access_traversal_tag,
elem const> {
public:
const_iterator() {}
private:
friend class boost::iterator_core_access;
void increment() {
++idx;
}
void decrement() {
--idx;
}
void advance(size_t n) {
idx += n;
}
difference_type distance_to(const const_iterator &other) const {
return other.idx - idx;
}
bool equal(const const_iterator &other) const {
return idx == other.idx && lit == other.lit;
}
const elem dereference() const {
return elem(lit->s[idx], lit->nocase[idx]);
}
friend struct ue2_literal;
const_iterator(const ue2_literal &lit_in, size_t idx_in)
: lit(&lit_in), idx(idx_in) {}
const ue2_literal *lit = nullptr;
size_t idx = 0;
};
using const_reverse_iterator = std::reverse_iterator<const_iterator>;
using size_type = std::string::size_type;
static const size_type npos;
ue2_literal() = default;
ue2_literal(const std::string &s_in, bool nc_in);
ue2_literal(char c, bool nc_in);
ue2_literal(const ue2_literal &) = default;
ue2_literal(ue2_literal &&) = default;
ue2_literal &operator=(const ue2_literal &) = default;
ue2_literal &operator=(ue2_literal &&) = default;
template<typename InputIt>
ue2_literal(InputIt b, InputIt e) {
for (; b != e; ++b) {
push_back(*b);
}
}
size_type length() const { return s.length(); }
bool empty() const { return s.empty(); }
ue2_literal substr(size_type pos, size_type n = npos) const;
const char *c_str() const { return s.c_str(); }
bool any_nocase() const;
const_iterator begin() const {
return const_iterator(*this, 0);
}
const_iterator end() const {
return const_iterator(*this, s.size());
}
const_reverse_iterator rbegin() const {
return const_reverse_iterator(end());
}
const_reverse_iterator rend() const {
return const_reverse_iterator(begin());
}
ue2_literal &erase(size_type pos = 0, size_type n = npos);
void push_back(const elem &e) {
push_back(e.c, e.nocase);
}
void push_back(char c, bool nc);
const elem back() const { return *rbegin(); }
friend ue2_literal operator+(ue2_literal a, const ue2_literal &b) {
a += b;
return a;
}
/// Reverse this literal in-place.
void reverse();
void operator+=(const ue2_literal &b);
bool operator==(const ue2_literal &b) const {
return s == b.s && nocase == b.nocase;
}
bool operator<(const ue2_literal &b) const;
void clear(void) { s.clear(); nocase.clear(); }
const std::string &get_string() const { return s; }
void swap(ue2_literal &other) {
s.swap(other.s);
nocase.swap(other.nocase);
}
size_t hash() const;
private:
friend const_iterator;
std::string s;
boost::dynamic_bitset<> nocase;
};
/// Return a reversed copy of this literal.
ue2_literal reverse_literal(const ue2_literal &in);
// Escape any meta characters in a string
std::string escapeStringMeta(const std::string &s);
/** Note: may be overly conservative if only partially nocase */
size_t maxStringSelfOverlap(const ue2_literal &a);
size_t minStringPeriod(const ue2_literal &a);
size_t maxStringOverlap(const ue2_literal &a, const ue2_literal &b);
/**
* \brief True iff the range of a literal given cannot be considered entirely
* case-sensitive nor entirely case-insensitive.
*/
template<class Iter>
bool mixed_sensitivity_in(Iter begin, Iter end) {
bool cs = false;
bool nc = false;
for (auto it = begin; it != end; ++it) {
if (!ourisalpha(it->c)) {
continue;
}
if (it->nocase) {
nc = true;
} else {
cs = true;
}
}
return cs && nc;
}
/**
* \brief True iff the literal cannot be considered entirely case-sensitive
* nor entirely case-insensitive.
*/
inline
bool mixed_sensitivity(const ue2_literal &s) {
return mixed_sensitivity_in(s.begin(), s.end());
}
void make_nocase(ue2_literal *lit);
struct case_iter {
explicit case_iter(const ue2_literal &ss);
const std::string &operator*() const { return s; } /* limited lifetime */
case_iter &operator++ ();
bool operator!=(const case_iter &b) const { return s != b.s; }
private:
std::string s;
std::string s_orig;
std::vector<bool> nocase;
};
case_iter caseIterateBegin(const ue2_literal &lit);
case_iter caseIterateEnd();
/** \brief True if there is any overlap between the characters in \a s and the
* set characters in \a cr.
*
* Note: this means that if \a s is nocase, then \a cr only needs to have
* either the lower-case or upper-case version of a letter set. */
bool contains(const ue2_literal &s, const CharReach &cr);
/// Returns true if \a a is a suffix of (or equal to) \a b.
bool isSuffix(const ue2_literal &a, const ue2_literal &b);
static inline
std::vector<CharReach> as_cr_seq(const ue2_literal &s) {
std::vector<CharReach> rv;
rv.reserve(s.length());
rv.insert(rv.end(), s.begin(), s.end());
return rv;
}
/** \brief True if the given literal consists entirely of a flood of the same
* character. */
bool is_flood(const ue2_literal &s);
#if defined(DUMP_SUPPORT) || defined(DEBUG)
/* Utility functions for debugging/dumping */
/// Escape a string so it's dot-printable.
std::string dotEscapeString(const std::string &s);
std::string dumpString(const ue2_literal &lit);
/// Escape a string so that it's screen-printable.
std::string escapeString(const std::string &s);
/// Escape a ue2_literal so that it's screen-printable.
std::string escapeString(const ue2_literal &lit);
#endif
} // namespace ue2
namespace std {
template<>
struct hash<ue2::ue2_literal::elem> {
size_t operator()(const ue2::ue2_literal::elem &elem) const {
return ue2::hash_all(elem.c, elem.nocase);
}
};
template<>
struct hash<ue2::ue2_literal> {
size_t operator()(const ue2::ue2_literal &lit) const {
return lit.hash();
}
};
} // namespace std
#endif