diff --git a/src/fdr/fdr_compile.cpp b/src/fdr/fdr_compile.cpp
index 937513a8..f7451492 100644
--- a/src/fdr/fdr_compile.cpp
+++ b/src/fdr/fdr_compile.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2016, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -53,13 +53,16 @@
 #include
 #include
 #include
+#include <cmath>
 #include
 #include
+#include <limits>
 #include
 #include
 #include
 #include
+#include <boost/multi_array.hpp>
 
 using namespace std;
@@ -71,12 +74,11 @@ class FDRCompiler : boost::noncopyable {
 private:
     const FDREngineDescription &eng;
     vector<u8> tab;
-    const vector<hwlmLiteral> &lits;
+    vector<hwlmLiteral> lits;
     map<BucketIndex, vector<LiteralIndex> > bucketToLits;
     bool make_small;
 
     u8 *tabIndexToMask(u32 indexInTable);
-    void assignStringToBucket(LiteralIndex l, BucketIndex b);
     void assignStringsToBuckets();
 #ifdef DEBUG
     void dumpMasks(const u8 *defaultMask);
@@ -86,9 +88,9 @@ private:
     void createInitialState(FDR *fdr);
 
 public:
-    FDRCompiler(const vector<hwlmLiteral> &lits_in,
-                const FDREngineDescription &eng_in, bool make_small_in)
-        : eng(eng_in), tab(eng_in.getTabSizeBytes()), lits(lits_in),
+    FDRCompiler(vector<hwlmLiteral> lits_in, const FDREngineDescription &eng_in,
+                bool make_small_in)
+        : eng(eng_in), tab(eng_in.getTabSizeBytes()), lits(move(lits_in)),
           make_small(make_small_in) {}
 
     aligned_unique_ptr<FDR> build(pair<aligned_unique_ptr<u8>, size_t> &link);
@@ -197,66 +199,121 @@ FDRCompiler::setupFDR(pair<aligned_unique_ptr<u8>, size_t> &link) {
     return fdr;
 }
 
-void FDRCompiler::assignStringToBucket(LiteralIndex l, BucketIndex b) {
-    bucketToLits[b].push_back(l);
+//#define DEBUG_ASSIGNMENT
+
+static
+double getScoreUtil(u32 len, u32 count) {
+    return len == 0 ? numeric_limits<double>::max()
+                    : pow(count, 1.05) * pow(len, -3.0);
 }
 
-struct LitOrder {
-    explicit LitOrder(const vector<hwlmLiteral> &vl_) : vl(vl_) {}
-    bool operator()(const u32 &i1, const u32 &i2) const {
-        const string &i1s = vl[i1].s;
-        const string &i2s = vl[i2].s;
+/**
+ * Returns true if the two given literals should be placed in the same chunk as
+ * they are identical except for a difference in caselessness.
+ */
+static
+bool isEquivLit(const hwlmLiteral &a, const hwlmLiteral &b,
+                const hwlmLiteral *last_nocase_lit) {
+    const size_t a_len = a.s.size();
+    const size_t b_len = b.s.size();
 
-        size_t len1 = i1s.size(), len2 = i2s.size();
+    if (a_len != b_len) {
+        return false;
+    }
 
-        if (len1 != len2) {
-            return len1 < len2;
-        } else {
-            auto p = std::mismatch(i1s.rbegin(), i1s.rend(), i2s.rbegin());
-            if (p.first == i1s.rend()) {
-                return false;
+    bool nocase = last_nocase_lit && a_len == last_nocase_lit->s.size() &&
+                  !cmp(a.s.c_str(), last_nocase_lit->s.c_str(), a_len, true);
+    return !cmp(a.s.c_str(), b.s.c_str(), a.s.size(), nocase);
+}
+
+struct Chunk {
+    Chunk(u32 first_id_in, u32 count_in, u32 length_in)
+        : first_id(first_id_in), count(count_in), length(length_in) {}
+    u32 first_id; //!< first id in this chunk
+    u32 count; //!< how many are in this chunk
+    u32 length; //!< how long things in the chunk are
+};
+
+static
+vector<Chunk> assignChunks(const vector<hwlmLiteral> &lits,
+                           const map<u32, u32> &lenCounts) {
+    const u32 CHUNK_MAX = 512;
+    const u32 MAX_CONSIDERED_LENGTH = 16;
+
+    // TODO: detailed early stage literal analysis for v. small cases (actually
+    // look at lits) yes - after we factor this out and merge in the Teddy
+    // style of building we can look at this, although the teddy merge
+    // modelling is quite different. It's still probably adaptable to some
+    // extent for this class of problem.
+
+    vector<Chunk> chunks;
+    chunks.reserve(CHUNK_MAX);
+
+    const u32 maxPerChunk = lits.size() /
+        (CHUNK_MAX - MIN(MAX_CONSIDERED_LENGTH, lenCounts.size())) + 1;
+
+    u32 currentSize = 0;
+    u32 chunkStartID = 0;
+    const hwlmLiteral *last_nocase_lit = nullptr;
+
+    for (u32 i = 0; i < lits.size() && chunks.size() < CHUNK_MAX - 1; i++) {
+        const auto &lit = lits[i];
+
+        DEBUG_PRINTF("i=%u, lit=%s%s\n", i, escapeString(lit.s).c_str(),
+                     lit.nocase ? " (nocase)" : "");
+
+        // If this literal is identical to the last one (aside from differences
+        // in caselessness), keep going even if we will "overfill" a chunk; we
+        // don't want to split identical literals into different buckets.
+        if (i != 0 && isEquivLit(lit, lits[i - 1], last_nocase_lit)) {
+            DEBUG_PRINTF("identical lit\n");
+            goto next_literal;
+        }
+
+        if ((currentSize < MAX_CONSIDERED_LENGTH &&
+             (lit.s.size() != currentSize)) ||
+            (currentSize != 1 && ((i - chunkStartID) >= maxPerChunk))) {
+            currentSize = lit.s.size();
+            if (!chunks.empty()) {
+                chunks.back().count = i - chunkStartID;
             }
-            return *p.first < *p.second;
+            chunkStartID = i;
+            chunks.emplace_back(i, 0, currentSize);
+        }
+next_literal:
+        if (lit.nocase) {
+            last_nocase_lit = &lit;
         }
     }
 
-private:
-    const vector<hwlmLiteral> &vl;
-};
+    assert(!chunks.empty());
+    chunks.back().count = lits.size() - chunkStartID;
+    // close off chunks with an empty row
+    chunks.emplace_back(lits.size(), 0, 0);
 
-static u64a getScoreUtil(u32 len, u32 count) {
-    if (len == 0) {
-        return (u64a)-1;
+#ifdef DEBUG_ASSIGNMENT
+    for (size_t j = 0; j < chunks.size(); j++) {
+        const auto &chunk = chunks[j];
+        printf("chunk %zu first_id=%u count=%u length=%u\n", j, chunk.first_id,
+               chunk.count, chunk.length);
     }
-    const u32 LEN_THRESH = 128;
-    const u32 elen = (len > LEN_THRESH) ? LEN_THRESH : len;
-    const u64a lenScore =
-        (LEN_THRESH * LEN_THRESH * LEN_THRESH) / (elen * elen * elen);
-    return count * lenScore; // deemphasize count - possibly more than needed
-                             // this might be overkill in the other direction
+#endif
+
+    DEBUG_PRINTF("built %zu chunks (%zu lits)\n", chunks.size(), lits.size());
+    assert(chunks.size() <= CHUNK_MAX);
+    return chunks;
 }
 
-//#define DEBUG_ASSIGNMENT
 void FDRCompiler::assignStringsToBuckets() {
-    typedef u64a SCORE; // 'Score' type
-    const SCORE MAX_SCORE = (SCORE)-1;
-    const u32 CHUNK_MAX = 512;
-    const u32 BUCKET_MAX = 16;
-    typedef pair<SCORE, u32> SCORE_INDEX_PAIR;
+    const double MAX_SCORE = numeric_limits<double>::max();
 
-    u32 ls = verify_u32(lits.size());
-    assert(ls); // Shouldn't be called with no literals.
+    assert(!lits.empty()); // Shouldn't be called with no literals.
 
-    // make a vector that contains our literals as pointers or u32 LiteralIndex values
-    vector<LiteralIndex> vli;
-    vli.resize(ls);
+    // Count the number of literals for each length.
     map<u32, u32> lenCounts;
-    for (LiteralIndex l = 0; l < ls; l++) {
-        vli[l] = l;
-        lenCounts[lits[l].s.size()]++;
+    for (const auto &lit : lits) {
+        lenCounts[lit.s.size()]++;
     }
 
-    // sort vector by literal length + if tied on length, 'magic' criteria of some kind (tbd)
-    stable_sort(vli.begin(), vli.end(), LitOrder(lits));
 
 #ifdef DEBUG_ASSIGNMENT
     for (const auto &m : lenCounts) {
@@ -265,103 +322,94 @@ void FDRCompiler::assignStringsToBuckets() {
     printf("\n");
 #endif
 
-    // TODO: detailed early stage literal analysis for v. small cases (actually look at lits)
-    // yes - after we factor this out and merge in the Teddy style of building we can look
-    // at this, although the teddy merge modelling is quite different. It's still probably
-    // adaptable to some extent for this class of problem
+    // Sort literals by literal length. If tied on length, use lexicographic
+    // ordering (of the reversed literals).
+    stable_sort(lits.begin(), lits.end(),
+                [](const hwlmLiteral &a, const hwlmLiteral &b) {
+                    if (a.s.size() != b.s.size()) {
+                        return a.s.size() < b.s.size();
+                    }
+                    auto p = mismatch(a.s.rbegin(), a.s.rend(), b.s.rbegin());
+                    if (p.first != a.s.rend()) {
+                        return *p.first < *p.second;
+                    }
+                    // Sort caseless variants first.
+                    return a.nocase > b.nocase;
+                });
 
-    u32 firstIds[CHUNK_MAX]; // how many are in this chunk (CHUNK_MAX - 1 contains 'last' bound)
-    u32 count[CHUNK_MAX]; // how many are in this chunk
-    u32 length[CHUNK_MAX]; // how long things in the chunk are
+    vector<Chunk> chunks = assignChunks(lits, lenCounts);
 
-    const u32 MAX_CONSIDERED_LENGTH = 16;
-    u32 currentChunk = 0;
-    u32 currentSize = 0;
-    u32 chunkStartID = 0;
-    u32 maxPerChunk = ls/(CHUNK_MAX - MIN(MAX_CONSIDERED_LENGTH, lenCounts.size())) + 1;
+    const u32 numChunks = chunks.size();
+    const u32 numBuckets = eng.getNumBuckets();
 
-    for (u32 i = 0; i < ls && currentChunk < CHUNK_MAX - 1; i++) {
-        LiteralIndex l = vli[i];
-        if ((currentSize < MAX_CONSIDERED_LENGTH && (lits[l].s.size() != currentSize)) ||
-            (currentSize != 1 && ((i - chunkStartID) >= maxPerChunk))) {
-            currentSize = lits[l].s.size();
-            if (currentChunk) {
-                count[currentChunk - 1 ] = i - chunkStartID;
-            }
-            chunkStartID = firstIds[currentChunk] = i;
-            length[currentChunk] = currentSize;
-            currentChunk++;
-        }
-    }
+    // 2D array of (score, chunk index) pairs, indexed by
+    // [chunk_index][bucket_index].
+    boost::multi_array<pair<double, u32>, 2> t(
+        boost::extents[numChunks][numBuckets]);
 
-    assert(currentChunk > 0);
-    count[currentChunk - 1] = ls - chunkStartID;
-    // close off chunks with an empty row
-    firstIds[currentChunk] = ls;
-    length[currentChunk] = 0;
-    count[currentChunk] = 0;
-    u32 nChunks = currentChunk + 1;
-
-#ifdef DEBUG_ASSIGNMENT
-    for (u32 j = 0; j < nChunks; j++) {
-        printf("%d %d %d %d\n", j, firstIds[j], count[j], length[j]);
-    }
-#endif
-
-    SCORE_INDEX_PAIR t[CHUNK_MAX][BUCKET_MAX]; // pair of score, index
-    u32 nb = eng.getNumBuckets();
-
-    for (u32 j = 0; j < nChunks; j++) {
+    for (u32 j = 0; j < numChunks; j++) {
         u32 cnt = 0;
-        for (u32 k = j; k < nChunks; ++k) {
-            cnt += count[k];
+        for (u32 k = j; k < numChunks; ++k) {
+            cnt += chunks[k].count;
         }
-        t[j][0] = {getScoreUtil(length[j], cnt), 0};
+        t[j][0] = {getScoreUtil(chunks[j].length, cnt), 0};
     }
 
-    for (u32 i = 1; i < nb; i++) {
-        for (u32 j = 0; j < nChunks - 1; j++) { // don't process last, empty row
-            SCORE_INDEX_PAIR best = {MAX_SCORE, 0};
-            u32 cnt = count[j];
-            for (u32 k = j + 1; k < nChunks - 1; k++, cnt += count[k]) {
-                SCORE score = getScoreUtil(length[j], cnt);
+    for (u32 i = 1; i < numBuckets; i++) {
+        for (u32 j = 0; j < numChunks - 1; j++) { // don't do last, empty row
+            pair<double, u32> best = {MAX_SCORE, 0};
+            u32 cnt = chunks[j].count;
+            for (u32 k = j + 1; k < numChunks - 1; k++) {
+                auto score = getScoreUtil(chunks[j].length, cnt);
                 if (score > best.first) {
-                    break; // if we're now worse locally than our best score, give up
+                    break; // now worse locally than our best score, give up
                 }
                 score += t[k][i-1].first;
                 if (score < best.first) {
                    best = {score, k};
                 }
+                cnt += chunks[k].count;
             }
             t[j][i] = best;
         }
-        t[nChunks - 1][i] = {0,0}; // fill in empty final row for next iteration
+        t[numChunks - 1][i] = {0,0}; // fill in empty final row for next iter
     }
 
 #ifdef DEBUG_ASSIGNMENT
-    for (u32 j = 0; j < nChunks; j++) {
-        for (u32 i = 0; i < nb; i++) {
-            SCORE_INDEX_PAIR v = t[j][i];
-            printf("<%7lld,%3d>", v.first, v.second);
+    for (u32 j = 0; j < numChunks; j++) {
+        printf("%03u: ", j);
+        for (u32 i = 0; i < numBuckets; i++) {
+            const auto &v = t[j][i];
+            printf("<%0.3f,%3d> ", v.first, v.second);
         }
         printf("\n");
     }
 #endif
 
-    // our best score is in best[0][N_BUCKETS-1] and we can follow the links
+    // our best score is in t[0][N_BUCKETS-1] and we can follow the links
     // to find where our buckets should start and what goes into them
-    for (u32 i = 0, n = nb; n && (i != nChunks - 1); n--) {
+    for (u32 i = 0, n = numBuckets; n && (i != numChunks - 1); n--) {
         u32 j = t[i][n - 1].second;
         if (j == 0) {
-            j = nChunks - 1;
+            j = numChunks - 1;
         }
-        // put chunks between i - j into bucket (NBUCKETS-1) - n
-#ifdef DEBUG_ASSIGNMENT
-        printf("placing from %d to %d in bucket %d\n", firstIds[i], firstIds[j],
-               nb - n);
-#endif
-        for (u32 k = firstIds[i]; k < firstIds[j]; k++) {
-            assignStringToBucket((LiteralIndex)vli[k], nb - n);
+
+        // put chunks between i - j into bucket (numBuckets - n).
+        u32 first_id = chunks[i].first_id;
+        u32 last_id = chunks[j].first_id;
+        assert(first_id < last_id);
+        u32 bucket = numBuckets - n;
+        UNUSED const auto &first_lit = lits[first_id];
+        UNUSED const auto &last_lit = lits[last_id - 1];
+        DEBUG_PRINTF("placing [%u-%u) in bucket %u (%u lits, len %zu-%zu, "
+                     "score %0.4f)\n",
+                     first_id, last_id, bucket, last_id - first_id,
+                     first_lit.s.length(), last_lit.s.length(),
+                     getScoreUtil(first_lit.s.length(), last_id - first_id));
+
+        auto &bucket_lits = bucketToLits[bucket];
+        for (u32 k = first_id; k < last_id; k++) {
+            bucket_lits.push_back(k);
         }
         i = j;
     }
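
Two self-contained sketches follow for readers who want to experiment with the new assignment scheme outside the Hyperscan tree. The first models the sort-then-chunk stage (the stable_sort call plus assignChunks() above): literals are sorted length-major, then split into runs of equal length, closed off by an empty sentinel chunk. This is a simplified sketch, not the patch itself: plain std::string stands in for hwlmLiteral, the Chunk struct drops its constructor, chunkByLength() is a name invented here, and the maxPerChunk overfill splitting, the MAX_CONSIDERED_LENGTH cap and the isEquivLit() caseless handling are all omitted.

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>

// Illustrative stand-in for the patch's Chunk struct.
struct Chunk {
    uint32_t first_id; // index of the first literal in this chunk
    uint32_t count;    // how many literals are in this chunk
    uint32_t length;   // how long the literals in this chunk are
};

// Sort literals and split them into runs of equal length, closed off by an
// empty sentinel chunk, in the manner of assignChunks().
static std::vector<Chunk> chunkByLength(std::vector<std::string> &lits,
                                        uint32_t chunkMax) {
    assert(!lits.empty());

    // Length-major sort; ties broken lexicographically on the reversed
    // string, mirroring the patch's comparator (minus the nocase tiebreak).
    std::stable_sort(lits.begin(), lits.end(),
                     [](const std::string &a, const std::string &b) {
                         if (a.size() != b.size()) {
                             return a.size() < b.size();
                         }
                         return std::lexicographical_compare(
                             a.rbegin(), a.rend(), b.rbegin(), b.rend());
                     });

    std::vector<Chunk> chunks;
    uint32_t currentSize = 0;
    uint32_t chunkStartID = 0;
    for (uint32_t i = 0; i < lits.size() && chunks.size() < chunkMax - 1;
         i++) {
        // Open a new chunk whenever the literal length changes.
        if (lits[i].size() != currentSize) {
            currentSize = lits[i].size();
            if (!chunks.empty()) {
                chunks.back().count = i - chunkStartID;
            }
            chunkStartID = i;
            chunks.push_back({i, 0, currentSize});
        }
    }
    chunks.back().count = lits.size() - chunkStartID;
    // Close off the chunk list with an empty row.
    chunks.push_back({(uint32_t)lits.size(), 0, 0});
    return chunks;
}

int main() {
    std::vector<std::string> lits = {"ab", "cd", "foo", "bar", "quux"};
    for (const Chunk &c : chunkByLength(lits, 512)) {
        std::printf("first_id=%u count=%u length=%u\n", c.first_id, c.count,
                    c.length);
    }
    return 0;
}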
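The second sketch models the dynamic program that fills t above: t[j][i] holds the best score achievable covering chunks j onward with i + 1 buckets, where a candidate bucket starting at chunk j is scored on its first (hence shortest) literal length and its total literal count. Again a sketch under assumptions rather than the real code: score() plays the role of getScoreUtil(), nested std::vector replaces boost::multi_array, and assignBuckets() and the toy chunk list in main() are invented for illustration.

#include <cmath>
#include <cstdint>
#include <cstdio>
#include <limits>
#include <utility>
#include <vector>

struct Chunk {
    uint32_t count;  // number of literals in this chunk
    uint32_t length; // length of the literals in this chunk
};

// Mirrors getScoreUtil(): a crowded bucket of short literals is expensive,
// a sparse bucket of long literals is cheap.
static double score(uint32_t len, uint32_t count) {
    return len == 0 ? std::numeric_limits<double>::max()
                    : std::pow(count, 1.05) * std::pow(len, -3.0);
}

// Split the chunk sequence (ending in an empty sentinel) into at most
// numBuckets contiguous groups, minimising the summed score. Returns the
// first chunk index of each group.
static std::vector<uint32_t> assignBuckets(const std::vector<Chunk> &chunks,
                                           uint32_t numBuckets) {
    const uint32_t n = chunks.size();
    const double MAX_SCORE = std::numeric_limits<double>::max();

    // t[j][i].first: best score covering chunks [j, n) with i + 1 buckets;
    // t[j][i].second: where the next group starts in that solution.
    std::vector<std::vector<std::pair<double, uint32_t>>> t(
        n, std::vector<std::pair<double, uint32_t>>(numBuckets));

    // Base case: one bucket takes everything from chunk j onwards, scored
    // by the shortest (first) literal length in the group.
    for (uint32_t j = 0; j < n; j++) {
        uint32_t cnt = 0;
        for (uint32_t k = j; k < n; k++) {
            cnt += chunks[k].count;
        }
        t[j][0] = {score(chunks[j].length, cnt), 0};
    }

    for (uint32_t i = 1; i < numBuckets; i++) {
        for (uint32_t j = 0; j + 1 < n; j++) {
            std::pair<double, uint32_t> best = {MAX_SCORE, 0};
            uint32_t cnt = chunks[j].count;
            for (uint32_t k = j + 1; k + 1 < n; k++) {
                double s = score(chunks[j].length, cnt);
                if (s > best.first) {
                    break; // group alone already exceeds our best total
                }
                s += t[k][i - 1].first;
                if (s < best.first) {
                    best = {s, k};
                }
                cnt += chunks[k].count;
            }
            t[j][i] = best;
        }
        t[n - 1][i] = {0, 0}; // empty sentinel row
    }

    // Walk the links from t[0][numBuckets - 1] to recover the group starts.
    std::vector<uint32_t> starts;
    for (uint32_t i = 0, b = numBuckets; b && i != n - 1; b--) {
        starts.push_back(i);
        uint32_t j = t[i][b - 1].second;
        i = j ? j : n - 1; // second == 0 means "rest goes in this bucket"
    }
    return starts;
}

int main() {
    // Toy chunk list: many short literals, a few long ones, plus the empty
    // sentinel that the patch appends in assignChunks().
    std::vector<Chunk> chunks = {{100, 2}, {50, 3}, {20, 4}, {5, 8}, {0, 0}};
    for (uint32_t start : assignBuckets(chunks, 3)) {
        std::printf("bucket starts at chunk %u\n", start);
    }
    return 0;
}

The early break in the inner loop is sound because a group's own score only grows as it absorbs more chunks and the tail cost t[k][i - 1].first is non-negative, so once the group alone exceeds the best total, no longer extension can win. In the patch, each recovered group [i, j) becomes bucket numBuckets - n, and every literal id in [chunks[i].first_id, chunks[j].first_id) is appended to bucketToLits for that bucket.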