#include "./sieve.h"

/* Pointer to position in (combined corpus) text. */
typedef uint32_t TextIdx;

/* Index of sample / generation. */
typedef uint16_t SampleIdx;

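/* A chained-hash-table node describing one distinct slice of the corpus:
   "next" links nodes in the same bucket (0 terminates the chain), "offset"
   is the slice position in the combined corpus, "presence" counts how many
   distinct samples contain the slice, and "mark" remembers the last
   sample / dry-run generation that touched the node. */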
typedef struct Slot {
  TextIdx next;
  TextIdx offset;
  SampleIdx presence;
  SampleIdx mark;
} Slot;

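/* Sentinel for "no position assigned yet". */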
static const TextIdx kNowhere = static_cast<TextIdx>(-1);

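/* Calculates the size of the dictionary that createDictionary would emit for
   the given parameters, without materializing it: merges the [i, i + sliceLen)
   ranges of the first occurrence of every slice whose presence reaches the
   threshold and sums their lengths. A fresh "iteration" value makes the
   per-slot marks left by previous runs stale. */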
static TextIdx dryRun(TextIdx sliceLen, Slot* map, TextIdx* shortcut,
    TextIdx end, TextIdx middle, SampleIdx minPresence, SampleIdx iteration) {
  TextIdx from = kNowhere;
  TextIdx to = kNowhere;
  TextIdx result = 0;
  SampleIdx targetPresence = minPresence;
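  /* Positions before "middle" are accepted with minPresence occurrences;
     positions at or after "middle" require one more. */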
  for (TextIdx i = 0; i < end; ++i) {
    if (i == middle) {
      targetPresence++;
    }
    Slot& item = map[shortcut[i]];
    if (item.mark != iteration) {
      item.mark = iteration;
      if (item.presence >= targetPresence) {
        if ((to == kNowhere) || (to < i)) {
          if (from != kNowhere) {
            result += to - from;
          }
          from = i;
        }
        to = i + sliceLen;
      }
    }
  }
  if (from != kNowhere) {
    result += to - from;
  }
  return result;
}

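/* Same traversal as dryRun, but copies the merged corpus ranges into the
   output dictionary instead of only measuring them. */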
static std::string createDictionary(const uint8_t* data, TextIdx sliceLen,
    Slot* map, TextIdx* shortcut, TextIdx end, TextIdx middle,
    SampleIdx minPresence, SampleIdx iteration) {
  std::string output;
  TextIdx from = kNowhere;
  TextIdx to = kNowhere;
  SampleIdx targetPresence = minPresence;
  for (TextIdx i = 0; i < end; ++i) {
    if (i == middle) {
      targetPresence++;
    }
    Slot& item = map[shortcut[i]];
    if (item.mark != iteration) {
      item.mark = iteration;
      if (item.presence >= targetPresence) {
        if ((to == kNowhere) || (to < i)) {
          if (from != kNowhere) {
            output.insert(output.end(), &data[from], &data[to]);
          }
          from = i;
        }
        to = i + sliceLen;
      }
    }
  }
  if (from != kNowhere) {
    output.insert(output.end(), &data[from], &data[to]);
  }
  return output;
}

std::string sieve_generate(size_t dictionary_size_limit, size_t slice_len,
    const std::vector<size_t>& sample_sizes, const uint8_t* sample_data) {
  /* Parameters aliasing */
  TextIdx targetSize = static_cast<TextIdx>(dictionary_size_limit);
  if (targetSize != dictionary_size_limit) {
    fprintf(stderr, "dictionary_size_limit is too large\n");
    return "";
  }
  TextIdx sliceLen = static_cast<TextIdx>(slice_len);
  if (sliceLen != slice_len) {
    fprintf(stderr, "slice_len is too large\n");
    return "";
  }
  if (sliceLen < 1) {
    fprintf(stderr, "slice_len is too small\n");
    return "";
  }
  SampleIdx numSamples = static_cast<SampleIdx>(sample_sizes.size());
  /* Keep headroom in SampleIdx for the generation counters used below. */
  if ((numSamples != sample_sizes.size()) ||
      (numSamples > static_cast<SampleIdx>(-1) / 2)) {
    fprintf(stderr, "too many samples\n");
    return "";
  }
  const uint8_t* data = sample_data;

  TextIdx total = 0;
  std::vector<TextIdx> offsets;
  for (SampleIdx i = 0; i < numSamples; ++i) {
    TextIdx delta = static_cast<TextIdx>(sample_sizes[i]);
    if (delta != sample_sizes[i]) {
      fprintf(stderr, "sample is too large\n");
      return "";
    }
    if (delta == 0) {
      fprintf(stderr, "empty samples are prohibited\n");
      return "";
    }
    if (total + delta <= total) {
      fprintf(stderr, "corpus is too large\n");
      return "";
    }
    total += delta;
    offsets.push_back(total);
  }

  if (total * 2 < total) {
    fprintf(stderr, "corpus is too large\n");
    return "";
  }

  if (total < sliceLen) {
    fprintf(stderr, "slice_len is larger than corpus size\n");
    return "";
  }

  /*****************************************************************************
   * Build coverage map.
   ****************************************************************************/
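  /* "map" is a chained hash table with one Slot per distinct slice (slot 0 is
     a dummy chain terminator); "shortcut[i]" records which slot covers corpus
     position i, so the dry runs below can replay coverage without hashing. */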
  std::vector<Slot> map;
  std::vector<TextIdx> shortcut;
  map.push_back({0, 0, 0, 0});
  TextIdx end = total - sliceLen;
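  /* Pick the hash width in 3-bit steps: grow until there is one bucket per
     slice position (or 29 bits are reached), then back off one step, ending
     up with 8..26 bits. */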
  TextIdx hashLen = 11;
  while (hashLen < 29 && ((1u << hashLen) < end)) {
    hashLen += 3;
  }
  hashLen -= 3;
  TextIdx hashMask = (1u << hashLen) - 1u;
  std::vector<TextIdx> hashHead(1 << hashLen);
  TextIdx hashSlot = 1;
  SampleIdx piece = 0;
  TextIdx hash = 0;
  TextIdx lShift = 3;
  TextIdx rShift = hashLen - lShift;
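  /* Rolling hash: rotate the accumulated value left by lShift bits (within
     hashLen bits) and XOR in the next byte. Prime it with the first
     sliceLen - 1 bytes of the corpus. */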
  for (TextIdx i = 0; i < sliceLen - 1; ++i) {
    TextIdx v = data[i];
    hash = (((hash << lShift) | (hash >> rShift)) & hashMask) ^ v;
  }
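  /* After sliceLen - 1 further rotations a byte's contribution is rotated by
     lShiftX bits; rShiftX is the complementary amount, used at the end of the
     loop below to strip the byte that leaves the window. */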
  TextIdx lShiftX = (lShift * (sliceLen - 1)) % hashLen;
  TextIdx rShiftX = hashLen - lShiftX;
  for (TextIdx i = 0; i < end; ++i) {
    TextIdx v = data[i + sliceLen - 1];
    hash = (((hash << lShift) | (hash >> rShift)) & hashMask) ^ v;

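    /* Advance the sample index when the current position crosses a sample
       boundary ("offsets" holds cumulative sample end positions). */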
    if (offsets[piece] == i) {
      piece++;
    }
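    /* Walk the hash chain looking for an exact occurrence of this slice;
       count each slice at most once per sample. */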
    TextIdx slot = hashHead[hash];
    while (slot != 0) {
      Slot& item = map[slot];
      TextIdx start = item.offset;
      bool miss = false;
      for (TextIdx j = 0; j < sliceLen; ++j) {
        if (data[i + j] != data[start + j]) {
          miss = true;
          break;
        }
      }
      if (!miss) {
        if (item.mark != piece) {
          item.mark = piece;
          item.presence++;
        }
        shortcut.push_back(slot);
        break;
      }
      slot = item.next;
    }
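    /* No chain entry matched: register a new slice, first seen in the current
       sample. */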
    if (slot == 0) {
      map.push_back({hashHead[hash], i, 1, piece});
      hashHead[hash] = hashSlot;
      shortcut.push_back(hashSlot);
      hashSlot++;
    }
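    /* Remove the contribution of data[i], the byte leaving the window, from
       the rolling hash. */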
    v = data[i];
    hash ^= ((v << lShiftX) | (v >> rShiftX)) & hashMask;
  }

  /*****************************************************************************
   * Build dictionary of specified size.
   ****************************************************************************/
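  /* Strategy: binary-search the largest presence threshold whose dictionary
     still exceeds the target size, then binary-search the corpus position up
     to which that threshold (instead of threshold + 1) applies, homing in on
     the requested size. Every probe passes a fresh ++piece so slot marks from
     earlier probes are ignored. */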
  SampleIdx a = 1;
  TextIdx size = dryRun(
      sliceLen, map.data(), shortcut.data(), end, end, a, ++piece);
  /* Maximal output is smaller than target. */
  if (size <= targetSize) {
    return createDictionary(
        data, sliceLen, map.data(), shortcut.data(), end, end, a, ++piece);
  }

  SampleIdx b = numSamples;
  size = dryRun(sliceLen, map.data(), shortcut.data(), end, end, b, ++piece);
  if (size == targetSize) {
    return createDictionary(
        data, sliceLen, map.data(), shortcut.data(), end, end, b, ++piece);
  }
  /* Run binary search. */
  if (size < targetSize) {
    /* size(a) > targetSize > size(b) && a < m < b */
    while (a + 1 < b) {
      SampleIdx m = static_cast<SampleIdx>((a + b) / 2);
      size = dryRun(
          sliceLen, map.data(), shortcut.data(), end, end, m, ++piece);
      if (size < targetSize) {
        b = m;
      } else if (size > targetSize) {
        a = m;
      } else {
        return createDictionary(
            data, sliceLen, map.data(), shortcut.data(), end, end, m, ++piece);
      }
    }
  } else {
    a = b;
  }
  /* size(minPresence) > targetSize > size(minPresence + 1) */
  SampleIdx minPresence = a;
  TextIdx c = 0;
  TextIdx d = end;
  /* size(c) < targetSize < size(d) && c < m < d */
  while (c + 1 < d) {
    TextIdx m = (c + d) / 2;
    size = dryRun(
        sliceLen, map.data(), shortcut.data(), end, m, minPresence, ++piece);
    if (size < targetSize) {
      c = m;
    } else if (size > targetSize) {
      d = m;
    } else {
      return createDictionary(data, sliceLen, map.data(), shortcut.data(), end,
          m, minPresence, ++piece);
    }
  }

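  /* Unless unrestricted output is allowed, avoid relying on slices that occur
     in fewer than two samples: require a presence of at least 2 across the
     whole corpus, even if the resulting size misses the target. */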
  bool unrestricted = false;
  if (minPresence <= 2 && !unrestricted) {
    minPresence = 2;
    c = end;
  }
  return createDictionary(data, sliceLen, map.data(), shortcut.data(), end, c,
      minPresence, ++piece);
}