blob: fbc1dbfeb41f441d29988ffa972f4fabfc8f3fa0 [file] [log] [blame]
#include "./sieve.h"
typedef struct Slot {
uint32_t next;
uint32_t offset;
uint16_t presence;
uint16_t mark;
} Slot;
static size_t dryRun(size_t sliceLen, Slot* map, uint32_t* shortcut, size_t end,
size_t middle, uint16_t minPresence, uint16_t iteration) {
int from = -2;
int to = -1;
size_t result = 0;
uint16_t targetPresence = minPresence;
for (uint32_t i = 0; i < end; ++i) {
if (i == middle) {
targetPresence++;
}
Slot& item = map[shortcut[i]];
if (item.mark != iteration) {
item.mark = iteration;
if (item.presence >= targetPresence) {
if (to < i) {
if (from > 0) {
result += to - from;
}
from = i;
}
to = i + sliceLen;
}
}
}
if (from > 0) {
result += to - from;
}
return result;
}
static std::string createDictionary(const uint8_t* data, size_t sliceLen,
Slot* map, uint32_t* shortcut, size_t end, size_t middle,
uint16_t minPresence, uint16_t iteration) {
std::string output;
int from = -2;
int to = -1;
uint16_t targetPresence = minPresence;
for (uint32_t i = 0; i < end; ++i) {
if (i == middle) {
targetPresence++;
}
Slot& item = map[shortcut[i]];
if (item.mark != iteration) {
item.mark = iteration;
if (item.presence >= targetPresence) {
if (to < i) {
if (from > 0) {
output.insert(output.end(), &data[from], &data[to]);
}
from = i;
}
to = i + sliceLen;
}
}
}
if (from > 0) {
output.insert(output.end(), &data[from], &data[to]);
}
return output;
}
std::string sieve_generate(size_t dictionary_size_limit, size_t slice_len,
const std::vector<size_t>& sample_sizes, const uint8_t* sample_data) {
/* Parameters aliasing */
size_t targetSize = dictionary_size_limit;
size_t sliceLen = slice_len;
const uint8_t* data = sample_data;
size_t total = 0;
std::vector<size_t> offsets;
for (size_t i = 0; i < sample_sizes.size(); ++i) {
total += sample_sizes[i];
offsets.push_back(total);
}
/*****************************************************************************
* Build coverage map.
****************************************************************************/
std::vector<Slot> map;
std::vector<uint32_t> shortcut;
map.push_back({0, 0, 0, 0});
size_t end = total - sliceLen;
int hashLen = 8;
while ((1 << hashLen) < end) {
hashLen += 3;
}
hashLen -= 3;
uint32_t hashMask = (1u << hashLen) - 1u;
std::vector<uint32_t> hashHead(1 << hashLen);
uint32_t hashSlot = 1;
uint16_t piece = 0;
uint32_t hash = 0;
int lShift = 3;
int rShift = hashLen - lShift;
for (int i = 0; i < sliceLen - 1; ++i) {
uint32_t v = data[i];
hash = (((hash << lShift) | (hash >> rShift)) & hashMask) ^ v;
}
int lShiftX = (lShift * (sliceLen - 1)) % hashLen;
int rShiftX = hashLen - lShiftX;
for (uint32_t i = 0; i < end; ++i) {
uint32_t v = data[i + sliceLen - 1];
hash = (((hash << lShift) | (hash >> rShift)) & hashMask) ^ v;
if (offsets[piece] == i) {
piece++;
}
uint32_t slot = hashHead[hash];
while (slot != 0) {
Slot& item = map[slot];
int start = item.offset;
bool miss = false;
for (size_t j = 0; j < sliceLen; ++j) {
if (data[i + j] != data[start + j]) {
miss = true;
break;
}
}
if (!miss) {
if (item.mark != piece) {
item.mark = piece;
item.presence++;
}
shortcut.push_back(slot);
break;
}
slot = item.next;
}
if (slot == 0) {
map.push_back({hashHead[hash], i, 1, piece});
hashHead[hash] = hashSlot;
shortcut.push_back(hashSlot);
hashSlot++;
}
v = data[i];
hash ^= ((v << lShiftX) | (v >> rShiftX)) & hashMask;
}
/*****************************************************************************
* Build dictionary of specified size.
****************************************************************************/
size_t a = 1;
size_t size = dryRun(
sliceLen, map.data(), shortcut.data(), end, end, a, ++piece);
/* Maximal output is smaller than target. */
if (size <= targetSize) {
return createDictionary(
data, sliceLen, map.data(), shortcut.data(), end, end, a, ++piece);
}
size_t b = offsets.size();
size = dryRun(sliceLen, map.data(), shortcut.data(), end, end, b, ++piece);
if (size == targetSize) {
return createDictionary(
data, sliceLen, map.data(), shortcut.data(), end, end, b, ++piece);
}
/* Run binary search. */
if (size < targetSize) {
/* size(a) > targetSize > size(b) && a < m < b */
while (a + 1 < b) {
size_t m = (a + b) / 2;
size = dryRun(
sliceLen, map.data(), shortcut.data(), end, end, m, ++piece);
if (size < targetSize) {
b = m;
} else if (size > targetSize) {
a = m;
} else {
return createDictionary(
data, sliceLen, map.data(), shortcut.data(), end, end, b, ++piece);
}
}
} else {
a = b;
}
/* size(minPresence) > targetSize > size(minPresence + 1) */
size_t minPresence = a;
a = 0;
b = end;
/* size(a) < targetSize < size(b) && a < m < b */
while (a + 1 < b) {
size_t m = (a + b) / 2;
size = dryRun(
sliceLen, map.data(), shortcut.data(), end, m, minPresence, ++piece);
if (size < targetSize) {
a = m;
} else if (size > targetSize) {
b = m;
} else {
return createDictionary(data, sliceLen, map.data(), shortcut.data(), end,
m, minPresence, ++piece);
}
}
bool unrestricted = false;
if (minPresence <= 2 && !unrestricted) {
minPresence = 2;
a = end;
}
return createDictionary(data, sliceLen, map.data(), shortcut.data(), end, a,
minPresence, ++piece);
}