vectorscan/src/util/dump_charclass.cpp

/*
 * Copyright (c) 2015-2018, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/** \file
 * \brief Dump code for character classes (expressed as CharReach objects).
 */

#include "config.h"

// Everything in this file is dump code
#if defined(DUMP_SUPPORT)

#include "charreach.h"
#include "dump_charclass.h"

#include <cctype>
#include <iomanip>
#include <ostream>
#include <sstream>

using std::string;
using std::ostream;

namespace ue2 {

static
void describeChar(ostream &os, char c, enum cc_output_t out_type) {
    // these characters must always be escaped
    static const string escaped("^-[].");
    static const string dot_single_escaped("\"\'");

    const string backslash((out_type == CC_OUT_DOT ? 2 : 1), '\\');

    if (isgraph(c) && c != '\\') {
        if (escaped.find(c) != string::npos) {
            os << backslash << c;
        } else if (out_type == CC_OUT_DOT
                   && dot_single_escaped.find(c) != string::npos) {
            os << '\\' << c;
        } else {
            os << c;
        }
    } else if (c == 0x09) {
        os << backslash << 't';
    } else if (c == 0x0a) {
        os << backslash << 'n';
    } else if (c == 0x0d) {
        os << backslash << 'r';
    } else {
        auto fmt(os.flags());
        os << backslash << 'x' << std::hex << std::setw(2)
           << std::setfill('0') << (unsigned)(c & 0xff);
        os.flags(fmt);
    }
}

static
void describeRange(ostream &os, unsigned char c1, unsigned char c2, enum cc_output_t out_type) {
    assert(c1 <= c2);
    if (c1 == c2) {
        describeChar(os, (char)c1, out_type);
    }
    else if (c2 - c1 < 4) {
        // render as individual chars
        do {
            describeChar(os, (char)c1, out_type);
        } while (c1++ != c2);
    } else {
        // range
        describeChar(os, (char)c1, out_type);
        os << '-';
        describeChar(os, (char)c2, out_type);
    }
}

static
bool extractMnemonic(ostream &os, CharReach &cr, enum cc_output_t out_type) {
    const string backslash((out_type == CC_OUT_DOT ? 2 : 1), '\\');

    // \w (word characters: any letter, digit, or underscore)
    static const CharReach words(string("_0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWYXZ"));
    if (words == (cr & words)) {
        cr &= ~words;
        os << backslash << 'w';
        return true;
    }

    // \d (digits)
    static const CharReach digits(string("0123456789"));
    if (digits == (cr & digits)) {
        cr &= ~digits;
        os << backslash << 'd';
        return true;
    }

    // \s (whitespace)
    static const CharReach whitespace(string("\x09\x0a\x0b\x0c\x0d\x20", 6));
    if (whitespace == (cr & whitespace)) {
        cr &= ~whitespace;
        os << backslash << 's';
        return true;
    }

    return false;
}

static
bool isContiguous(const CharReach &cr, size_t &first, size_t &last) {
    first = cr.find_first();
    size_t c = first;
    while (c != CharReach::npos) {
        last = c;
        c = cr.find_next(c);
        if (c == CharReach::npos) {
            break;
        }
        if (c != last + 1) {
            return false;
        }
    }
    return true;
}

static
size_t describeClassInt(ostream &os, const CharReach &incr, size_t maxLength,
        enum cc_output_t out_type) {

    // Approx size of output
    size_t i = 0;
    // one we can break
    CharReach cr(incr);

    // If we can be rendered as a single range, do it
    size_t first = 0, last = 0;
    if (isContiguous(cr, first, last)) {
        describeRange(os, first, last, out_type);
        i = 2;
        return i;
    }

    // Extract any mnemonics
    while (extractMnemonic(os, cr, out_type)) {
        if (++i == maxLength) {
            os << "...]";
            return maxLength;
        }
    }

    if (cr.none()) {
        // all mnemonics, all the time
        return i;
    }

    // Render charclass as a series of ranges
    size_t c_start = cr.find_first();
    size_t c = c_start;
    while (c != CharReach::npos) {
        size_t c_last = c;
        c = cr.find_next(c);
        if (c != c_last + 1 || c_last == 0xff) {
            describeRange(os, c_start, c_last, out_type);
            c_start = c;
            if (++i == maxLength && c != CharReach::npos) {
                os << "...]";
                return maxLength;
            }
        }
    }
    return i;
}

////
//// Functions exported outside this unit.
////

// C++ iostreams interface
void describeClass(ostream &os, const CharReach &incr, size_t maxLength,
                   enum cc_output_t out_type) {
    if (incr.all()) {
        os << "<any>";
        return;
    }

    if (incr.none()) {
        os << "<empty>";
        return;
    }

    if (incr.count() == 1) {
        describeChar(os, (char)incr.find_first(), out_type);
        return;
    }
    if ((~incr).count() == 1) {
        os << "[^";
        describeChar(os, (char)(~incr).find_first(), out_type);
        os << ']';
        return;
    }

    // build up a normal string and a negated one, and see which is shorter
    std::ostringstream out;
    int out_count = describeClassInt(out, incr, maxLength, out_type);

    std::ostringstream neg;
    UNUSED int neg_count = describeClassInt(neg, ~incr, maxLength, out_type);

    if (out.tellp() <= neg.tellp()) {
        if (out_count > 1) {
            os << '[' << out.str() << ']';
        } else {
            os << out.str();
        }
    } else {
        // TODO: negated single mnemonics
        os << "[^" << neg.str() << ']';
    }
}

// Version that returns a string, for convenience.
string describeClass(const CharReach &cr, size_t maxLength,
                     enum cc_output_t out_type) {
    std::ostringstream oss;
    describeClass(oss, cr, maxLength, out_type);
    return oss.str();
}

// C stdio wrapper
void describeClass(FILE *f, const CharReach &cr, size_t maxLength,
                   enum cc_output_t out_type) {
    fprintf(f, "%s", describeClass(cr, maxLength, out_type).c_str());
}

} // namespace ue2

#endif // DUMP_SUPPORT