Blame - third_party/llvm-project/libcxx/include/__format/unicode.h - cobalt

blob: 12aed507990e873bbb04b5b09d2617edb3a5554c [file] [log] [blame]

Kaido Kert	788710a	2023-06-05 07:50:22 -0700	[diff] [blame]	1	// -- C++ --
				2	//===----------------------------------------------------------------------===//
				3	//
				4	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
				5	// See https://llvm.org/LICENSE.txt for license information.
				6	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
				7	//
				8	//===----------------------------------------------------------------------===//
				9
				10	#ifndef _LIBCPP___FORMAT_UNICODE_H
				11	#define _LIBCPP___FORMAT_UNICODE_H
				12
				13	#include <__assert>
Kaido Kert	56d7c4e	2024-04-13 12:59:27 -0700	[diff] [blame]	14	#include <__bit/countl.h>
				15	#include <__concepts/same_as.h>
Kaido Kert	788710a	2023-06-05 07:50:22 -0700	[diff] [blame]	16	#include <__config>
				17	#include <__format/extended_grapheme_cluster_table.h>
Kaido Kert	56d7c4e	2024-04-13 12:59:27 -0700	[diff] [blame]	18	#include <__iterator/concepts.h>
				19	#include <__iterator/readable_traits.h> // iter_value_t
Kaido Kert	788710a	2023-06-05 07:50:22 -0700	[diff] [blame]	20	#include <__type_traits/make_unsigned.h>
				21	#include <__utility/unreachable.h>
Kaido Kert	56d7c4e	2024-04-13 12:59:27 -0700	[diff] [blame]	22	#include <string_view>
Kaido Kert	788710a	2023-06-05 07:50:22 -0700	[diff] [blame]	23
				24	#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
				25	# pragma GCC system_header
				26	#endif
				27
				28	_LIBCPP_BEGIN_NAMESPACE_STD
				29
Kaido Kert	56d7c4e	2024-04-13 12:59:27 -0700	[diff] [blame]	30	#if _LIBCPP_STD_VER >= 20
Kaido Kert	788710a	2023-06-05 07:50:22 -0700	[diff] [blame]	31
				32	namespace __unicode {
				33
Kaido Kert	56d7c4e	2024-04-13 12:59:27 -0700	[diff] [blame]	34	// Helper struct for the result of a consume operation.
				35	//
				36	// The status value for a correct code point is 0. This allows a valid value to
				37	// be used without masking.
// When the decoding fails it knows the number of code units affected. For the
				39	// current use-cases that value is not needed, therefore it is not stored.
				40	// The escape routine needs the number of code units for both a valid and
				41	// invalid character and keeps track of it itself. Doing it in this result
				42	// unconditionally would give some overhead when the value is unneeded.
				43	struct __consume_result {
				44	// When __status == __ok it contains the decoded code point.
				45	// Else it contains the replacement character U+FFFD
				46	char32_t __code_point : 31;
Kaido Kert	788710a	2023-06-05 07:50:22 -0700	[diff] [blame]	47
Kaido Kert	56d7c4e	2024-04-13 12:59:27 -0700	[diff] [blame]	48	enum : char32_t {
				49	// Consumed a well-formed code point.
				50	__ok = 0,
				51	// Encountered invalid UTF-8
				52	__error = 1
				53	} __status : 1 {__ok};
Kaido Kert	788710a	2023-06-05 07:50:22 -0700	[diff] [blame]	54	};
Kaido Kert	56d7c4e	2024-04-13 12:59:27 -0700	[diff] [blame]	55	static_assert(sizeof(__consume_result) == sizeof(char32_t));
Kaido Kert	788710a	2023-06-05 07:50:22 -0700	[diff] [blame]	56
				57	# ifndef _LIBCPP_HAS_NO_UNICODE
				58
				59	/// Implements the grapheme cluster boundary rules
				60	///
				61	/// These rules are used to implement format's width estimation as stated in
				62	/// [format.string.std]/11
				63	///
				64	/// The Standard refers to UAX \#29 for Unicode 12.0.0
				65	/// https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
				66	///
				67	/// The data tables used are
				68	/// https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt
				69	/// https://www.unicode.org/Public/UCD/latest/ucd/emoji/emoji-data.txt
				70	/// https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.txt (for testing only)
				71
				72	inline constexpr char32_t __replacement_character = U'\ufffd';
				73
Kaido Kert	56d7c4e	2024-04-13 12:59:27 -0700	[diff] [blame]	74	// The error of a consume operation.
				75	//
				76	// This sets the code point to the replacement character. This code point does
				77	// not participate in the grapheme clustering, so grapheme clustering code can
				78	// ignore the error status and always use the code point.
				79	inline constexpr __consume_result __consume_result_error{__replacement_character, __consume_result::__error};
				80
				81	[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr bool __is_high_surrogate(char32_t __value) {
				82	return __value >= 0xd800 && __value <= 0xdbff;
				83	}
				84
				85	[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr bool __is_low_surrogate(char32_t __value) {
				86	return __value >= 0xdc00 && __value <= 0xdfff;
				87	}
				88
				89	// https://www.unicode.org/glossary/#surrogate_code_point
				90	[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline constexpr bool __is_surrogate(char32_t __value) {
				91	return __value >= 0xd800 && __value <= 0xdfff;
				92	}
				93
				94	// https://www.unicode.org/glossary/#code_point
				95	[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline constexpr bool __is_code_point(char32_t __value) {
				96	return __value <= 0x10ffff;
				97	}
				98
				99	// https://www.unicode.org/glossary/#unicode_scalar_value
				100	[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline constexpr bool __is_scalar_value(char32_t __value) {
				101	return __unicode::__is_code_point(__value) && !__unicode::__is_surrogate(__value);
				102	}
				103
				104	template <contiguous_iterator _Iterator>
				105	requires same_as<iter_value_t<_Iterator>, char>
				106	_LIBCPP_HIDE_FROM_ABI constexpr bool __is_continuation(_Iterator __char, int __count) {
Kaido Kert	788710a	2023-06-05 07:50:22 -0700	[diff] [blame]	107	do {
				108	if ((*__char & 0b1000'0000) != 0b1000'0000)
				109	return false;
				110	--__count;
				111	++__char;
				112	} while (__count);
				113	return true;
				114	}
				115
/// Helper class to extract code points from a Unicode character range.
///
/// The stored range is a view. There are multiple specializations for
/// different character types.
				120	template <class _CharT>
				121	class __code_point_view;
				122
				123	/// UTF-8 specialization.
				124	template <>
				125	class __code_point_view<char> {
Kaido Kert	56d7c4e	2024-04-13 12:59:27 -0700	[diff] [blame]	126	using _Iterator = basic_string_view<char>::const_iterator;
				127
Kaido Kert	788710a	2023-06-05 07:50:22 -0700	[diff] [blame]	128	public:
Kaido Kert	56d7c4e	2024-04-13 12:59:27 -0700	[diff] [blame]	129	_LIBCPP_HIDE_FROM_ABI constexpr explicit __code_point_view(_Iterator __first, _Iterator __last)
Kaido Kert	788710a	2023-06-05 07:50:22 -0700	[diff] [blame]	130	: __first_(__first), __last_(__last) {}
				131
				132	_LIBCPP_HIDE_FROM_ABI constexpr bool __at_end() const noexcept { return __first_ == __last_; }
Kaido Kert	56d7c4e	2024-04-13 12:59:27 -0700	[diff] [blame]	133	_LIBCPP_HIDE_FROM_ABI constexpr _Iterator __position() const noexcept { return __first_; }
Kaido Kert	788710a	2023-06-05 07:50:22 -0700	[diff] [blame]	134
Kaido Kert	56d7c4e	2024-04-13 12:59:27 -0700	[diff] [blame]	135	// https://www.unicode.org/versions/latest/ch03.pdf#G7404
				136	// Based on Table 3-7, Well-Formed UTF-8 Byte Sequences
				137	//
				138	// Code Points First Byte Second Byte Third Byte Fourth Byte Remarks
				139	// U+0000..U+007F 00..7F U+0000..U+007F 1 code unit range
				140	// C0..C1 80..BF invalid overlong encoding
				141	// U+0080..U+07FF C2..DF 80..BF U+0080..U+07FF 2 code unit range
				142	// E0 80..9F 80..BF invalid overlong encoding
				143	// U+0800..U+0FFF E0 A0..BF 80..BF U+0800..U+FFFF 3 code unit range
				144	// U+1000..U+CFFF E1..EC 80..BF 80..BF
				145	// U+D000..U+D7FF ED 80..9F 80..BF
				146	// U+D800..U+DFFF ED A0..BF 80..BF invalid encoding of surrogate code point
				147	// U+E000..U+FFFF EE..EF 80..BF 80..BF
				148	// F0 80..8F 80..BF 80..BF invalid overlong encoding
				149	// U+10000..U+3FFFF F0 90..BF 80..BF 80..BF U+10000..U+10FFFF 4 code unit range
				150	// U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
				151	// U+100000..U+10FFFF F4 80..8F 80..BF 80..BF
				152	// F4 90..BF 80..BF 80..BF U+110000.. invalid code point range
				153	//
				154	// Unlike other parsers, these invalid entries are tested after decoding.
				155	// - The parser always needs to consume these code units
				156	// - The code is optimized for well-formed UTF-8
				157	[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr __consume_result __consume() noexcept {
Kaido Kert	788710a	2023-06-05 07:50:22 -0700	[diff] [blame]	158	_LIBCPP_ASSERT(__first_ != __last_, "can't move beyond the end of input");
				159
				160	// Based on the number of leading 1 bits the number of code units in the
				161	// code point can be determined. See
				162	// https://en.wikipedia.org/wiki/UTF-8#Encoding
				163	switch (std::countl_one(static_cast<unsigned char>(*__first_))) {
				164	case 0:
Kaido Kert	56d7c4e	2024-04-13 12:59:27 -0700	[diff] [blame]	165	return {static_cast<unsigned char>(*__first_++)};
Kaido Kert	788710a	2023-06-05 07:50:22 -0700	[diff] [blame]	166
Kaido Kert	56d7c4e	2024-04-13 12:59:27 -0700	[diff] [blame]	167	case 2: {
				168	if (__last_ - __first_ < 2 \|\| !__unicode::__is_continuation(__first_ + 1, 1)) [[unlikely]]
Kaido Kert	788710a	2023-06-05 07:50:22 -0700	[diff] [blame]	169	break;
				170
Kaido Kert	56d7c4e	2024-04-13 12:59:27 -0700	[diff] [blame]	171	char32_t __value = static_cast<unsigned char>(*__first_++) & 0x1f;
				172	__value <<= 6;
				173	__value \|= static_cast<unsigned char>(*__first_++) & 0x3f;
Kaido Kert	788710a	2023-06-05 07:50:22 -0700	[diff] [blame]	174
Kaido Kert	56d7c4e	2024-04-13 12:59:27 -0700	[diff] [blame]	175	// These values should be encoded in 1 UTF-8 code unit.
				176	if (__value < 0x0080) [[unlikely]]
				177	return __consume_result_error;
				178
				179	return {__value};
				180	}
				181
				182	case 3: {
				183	if (__last_ - __first_ < 3 \|\| !__unicode::__is_continuation(__first_ + 1, 2)) [[unlikely]]
Kaido Kert	788710a	2023-06-05 07:50:22 -0700	[diff] [blame]	184	break;
				185
Kaido Kert	56d7c4e	2024-04-13 12:59:27 -0700	[diff] [blame]	186	char32_t __value = static_cast<unsigned char>(*__first_++) & 0x0f;
				187	__value <<= 6;
				188	__value \|= static_cast<unsigned char>(*__first_++) & 0x3f;
				189	__value <<= 6;
				190	__value \|= static_cast<unsigned char>(*__first_++) & 0x3f;
Kaido Kert	788710a	2023-06-05 07:50:22 -0700	[diff] [blame]	191
Kaido Kert	56d7c4e	2024-04-13 12:59:27 -0700	[diff] [blame]	192	// These values should be encoded in 1 or 2 UTF-8 code units.
				193	if (__value < 0x0800) [[unlikely]]
				194	return __consume_result_error;
				195
				196	// A surrogate value is always encoded in 3 UTF-8 code units.
				197	if (__unicode::__is_surrogate(__value)) [[unlikely]]
				198	return __consume_result_error;
				199
				200	return {__value};
				201	}
				202
				203	case 4: {
				204	if (__last_ - __first_ < 4 \|\| !__unicode::__is_continuation(__first_ + 1, 3)) [[unlikely]]
Kaido Kert	788710a	2023-06-05 07:50:22 -0700	[diff] [blame]	205	break;
				206
Kaido Kert	56d7c4e	2024-04-13 12:59:27 -0700	[diff] [blame]	207	char32_t __value = static_cast<unsigned char>(*__first_++) & 0x07;
				208	__value <<= 6;
				209	__value \|= static_cast<unsigned char>(*__first_++) & 0x3f;
				210	__value <<= 6;
				211	__value \|= static_cast<unsigned char>(*__first_++) & 0x3f;
				212	__value <<= 6;
				213	__value \|= static_cast<unsigned char>(*__first_++) & 0x3f;
Kaido Kert	788710a	2023-06-05 07:50:22 -0700	[diff] [blame]	214
Kaido Kert	56d7c4e	2024-04-13 12:59:27 -0700	[diff] [blame]	215	// These values should be encoded in 1, 2, or 3 UTF-8 code units.
				216	if (__value < 0x10000) [[unlikely]]
				217	return __consume_result_error;
Kaido Kert	788710a	2023-06-05 07:50:22 -0700	[diff] [blame]	218
Kaido Kert	56d7c4e	2024-04-13 12:59:27 -0700	[diff] [blame]	219	// A value too large is always encoded in 4 UTF-8 code units.
				220	if (!__unicode::__is_code_point(__value)) [[unlikely]]
				221	return __consume_result_error;
				222
				223	return {__value};
				224	}
Kaido Kert	788710a	2023-06-05 07:50:22 -0700	[diff] [blame]	225	}
				226	// An invalid number of leading ones can be garbage or a code unit in the
				227	// middle of a code point. By consuming one code unit the parser may get
				228	// "in sync" after a few code units.
Kaido Kert	56d7c4e	2024-04-13 12:59:27 -0700	[diff] [blame]	229	++__first_;
				230	return __consume_result_error;
Kaido Kert	788710a	2023-06-05 07:50:22 -0700	[diff] [blame]	231	}
Kaido Kert	788710a	2023-06-05 07:50:22 -0700	[diff] [blame]	232
				233	private:
Kaido Kert	56d7c4e	2024-04-13 12:59:27 -0700	[diff] [blame]	234	_Iterator __first_;
				235	_Iterator __last_;
Kaido Kert	788710a	2023-06-05 07:50:22 -0700	[diff] [blame]	236	};
				237
				238	# ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS
				239	_LIBCPP_HIDE_FROM_ABI constexpr bool __is_surrogate_pair_high(wchar_t __value) {
				240	return __value >= 0xd800 && __value <= 0xdbff;
				241	}
				242
				243	_LIBCPP_HIDE_FROM_ABI constexpr bool __is_surrogate_pair_low(wchar_t __value) {
				244	return __value >= 0xdc00 && __value <= 0xdfff;
				245	}
				246
				247	/// This specialization depends on the size of wchar_t
				248	/// - 2 UTF-16 (for example Windows and AIX)
				249	/// - 4 UTF-32 (for example Linux)
				250	template <>
				251	class __code_point_view<wchar_t> {
Kaido Kert	56d7c4e	2024-04-13 12:59:27 -0700	[diff] [blame]	252	using _Iterator = typename basic_string_view<wchar_t>::const_iterator;
				253
Kaido Kert	788710a	2023-06-05 07:50:22 -0700	[diff] [blame]	254	public:
				255	static_assert(sizeof(wchar_t) == 2 \|\| sizeof(wchar_t) == 4, "sizeof(wchar_t) has a not implemented value");
				256
Kaido Kert	56d7c4e	2024-04-13 12:59:27 -0700	[diff] [blame]	257	_LIBCPP_HIDE_FROM_ABI constexpr explicit __code_point_view(_Iterator __first, _Iterator __last)
Kaido Kert	788710a	2023-06-05 07:50:22 -0700	[diff] [blame]	258	: __first_(__first), __last_(__last) {}
				259
Kaido Kert	56d7c4e	2024-04-13 12:59:27 -0700	[diff] [blame]	260	_LIBCPP_HIDE_FROM_ABI constexpr _Iterator __position() const noexcept { return __first_; }
Kaido Kert	788710a	2023-06-05 07:50:22 -0700	[diff] [blame]	261	_LIBCPP_HIDE_FROM_ABI constexpr bool __at_end() const noexcept { return __first_ == __last_; }
				262
Kaido Kert	56d7c4e	2024-04-13 12:59:27 -0700	[diff] [blame]	263	[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr __consume_result __consume() noexcept {
Kaido Kert	788710a	2023-06-05 07:50:22 -0700	[diff] [blame]	264	_LIBCPP_ASSERT(__first_ != __last_, "can't move beyond the end of input");
				265
Kaido Kert	56d7c4e	2024-04-13 12:59:27 -0700	[diff] [blame]	266	char32_t __value = static_cast<char32_t>(*__first_++);
Kaido Kert	788710a	2023-06-05 07:50:22 -0700	[diff] [blame]	267	if constexpr (sizeof(wchar_t) == 2) {
Kaido Kert	56d7c4e	2024-04-13 12:59:27 -0700	[diff] [blame]	268	if (__unicode::__is_low_surrogate(__value)) [[unlikely]]
				269	return __consume_result_error;
Kaido Kert	788710a	2023-06-05 07:50:22 -0700	[diff] [blame]	270
Kaido Kert	56d7c4e	2024-04-13 12:59:27 -0700	[diff] [blame]	271	if (__unicode::__is_high_surrogate(__value)) {
				272	if (__first_ == __last_ \|\| !__unicode::__is_low_surrogate(static_cast<char32_t>(*__first_))) [[unlikely]]
				273	return __consume_result_error;
				274
				275	__value -= 0xd800;
				276	__value <<= 10;
				277	__value += static_cast<char32_t>(*__first_++) - 0xdc00;
				278	__value += 0x10000;
				279
				280	if (!__unicode::__is_code_point(__value)) [[unlikely]]
				281	return __consume_result_error;
Kaido Kert	788710a	2023-06-05 07:50:22 -0700	[diff] [blame]	282	}
Kaido Kert	788710a	2023-06-05 07:50:22 -0700	[diff] [blame]	283	} else {
Kaido Kert	56d7c4e	2024-04-13 12:59:27 -0700	[diff] [blame]	284	if (!__unicode::__is_scalar_value(__value)) [[unlikely]]
				285	return __consume_result_error;
Kaido Kert	788710a	2023-06-05 07:50:22 -0700	[diff] [blame]	286	}
				287
Kaido Kert	56d7c4e	2024-04-13 12:59:27 -0700	[diff] [blame]	288	return {__value};
Kaido Kert	788710a	2023-06-05 07:50:22 -0700	[diff] [blame]	289	}
Kaido Kert	788710a	2023-06-05 07:50:22 -0700	[diff] [blame]	290
				291	private:
Kaido Kert	56d7c4e	2024-04-13 12:59:27 -0700	[diff] [blame]	292	_Iterator __first_;
				293	_Iterator __last_;
Kaido Kert	788710a	2023-06-05 07:50:22 -0700	[diff] [blame]	294	};
				295	# endif // _LIBCPP_HAS_NO_WIDE_CHARACTERS
				296
				297	_LIBCPP_HIDE_FROM_ABI constexpr bool __at_extended_grapheme_cluster_break(
				298	bool& __ri_break_allowed,
				299	bool __has_extened_pictographic,
				300	__extended_grapheme_custer_property_boundary::__property __prev,
				301	__extended_grapheme_custer_property_boundary::__property __next) {
				302	using __extended_grapheme_custer_property_boundary::__property;
				303
				304	__has_extened_pictographic \|= __prev == __property::__Extended_Pictographic;
				305
				306	// https://www.unicode.org/reports/tr29/tr29-39.html#Grapheme_Cluster_Boundary_Rules
				307
				308	// * Break at the start and end of text, unless the text is empty. *
				309
				310	_LIBCPP_ASSERT(__prev != __property::__sot, "should be handled in the constructor"); // GB1
				311	_LIBCPP_ASSERT(__prev != __property::__eot, "should be handled by our caller"); // GB2
				312
				313	// * Do not break between a CR and LF. Otherwise, break before and after controls. *
				314	if (__prev == __property::__CR && __next == __property::__LF) // GB3
				315	return false;
				316
				317	if (__prev == __property::__Control \|\| __prev == __property::__CR \|\| __prev == __property::__LF) // GB4
				318	return true;
				319
				320	if (__next == __property::__Control \|\| __next == __property::__CR \|\| __next == __property::__LF) // GB5
				321	return true;
				322
				323	// * Do not break Hangul syllable sequences. *
				324	if (__prev == __property::__L &&
				325	(__next == __property::__L \|\| __next == __property::__V \|\| __next == __property::__LV \|\|
				326	__next == __property::__LVT)) // GB6
				327	return false;
				328
				329	if ((__prev == __property::__LV \|\| __prev == __property::__V) &&
				330	(__next == __property::__V \|\| __next == __property::__T)) // GB7
				331	return false;
				332
				333	if ((__prev == __property::__LVT \|\| __prev == __property::__T) && __next == __property::__T) // GB8
				334	return false;
				335
				336	// * Do not break before extending characters or ZWJ. *
				337	if (__next == __property::__Extend \|\| __next == __property::__ZWJ)
				338	return false; // GB9
				339
				340	// * Do not break before SpacingMarks, or after Prepend characters. *
				341	if (__next == __property::__SpacingMark) // GB9a
				342	return false;
				343
				344	if (__prev == __property::__Prepend) // GB9b
				345	return false;
				346
				347	// * Do not break within emoji modifier sequences or emoji zwj sequences. *
				348
				349	// GB11 \p{Extended_Pictographic} Extend* ZWJ x \p{Extended_Pictographic}
				350	//
				351	// Note that several parts of this rule are matched by GB9: Any x (Extend \| ZWJ)
				352	// - \p{Extended_Pictographic} x Extend
				353	// - Extend x Extend
				354	// - \p{Extended_Pictographic} x ZWJ
				355	// - Extend x ZWJ
				356	//
				357	// So the only case left to test is
				358	// - \p{Extended_Pictographic}' x ZWJ x \p{Extended_Pictographic}
				359	// where \p{Extended_Pictographic}' is stored in __has_extened_pictographic
				360	if (__has_extened_pictographic && __prev == __property::__ZWJ && __next == __property::__Extended_Pictographic)
				361	return false;
				362
				363	// * Do not break within emoji flag sequences *
				364
				365	// That is, do not break between regional indicator (RI) symbols if there
				366	// is an odd number of RI characters before the break point.
				367
				368	if (__prev == __property::__Regional_Indicator && __next == __property::__Regional_Indicator) { // GB12 + GB13
				369	__ri_break_allowed = !__ri_break_allowed;
				370	return __ri_break_allowed;
				371	}
				372
				373	// * Otherwise, break everywhere. *
				374	return true; // GB999
				375	}
				376
				377	/// Helper class to extract an extended grapheme cluster from a Unicode character range.
				378	///
/// This class is used to determine the column width of an extended grapheme
				380	/// cluster. In order to do that only the first code point is evaluated.
				381	/// Therefore only this code point is extracted.
				382	template <class _CharT>
				383	class __extended_grapheme_cluster_view {
Kaido Kert	56d7c4e	2024-04-13 12:59:27 -0700	[diff] [blame]	384	using _Iterator = typename basic_string_view<_CharT>::const_iterator;
				385
Kaido Kert	788710a	2023-06-05 07:50:22 -0700	[diff] [blame]	386	public:
Kaido Kert	56d7c4e	2024-04-13 12:59:27 -0700	[diff] [blame]	387	_LIBCPP_HIDE_FROM_ABI constexpr explicit __extended_grapheme_cluster_view(_Iterator __first, _Iterator __last)
Kaido Kert	788710a	2023-06-05 07:50:22 -0700	[diff] [blame]	388	: __code_point_view_(__first, __last),
Kaido Kert	56d7c4e	2024-04-13 12:59:27 -0700	[diff] [blame]	389	__next_code_point_(__code_point_view_.__consume().__code_point),
Kaido Kert	788710a	2023-06-05 07:50:22 -0700	[diff] [blame]	390	__next_prop_(__extended_grapheme_custer_property_boundary::__get_property(__next_code_point_)) {}
				391
				392	struct __cluster {
				393	/// The first code point of the extended grapheme cluster.
				394	///
				395	/// The first code point is used to estimate the width of the extended
				396	/// grapheme cluster.
				397	char32_t __code_point_;
				398
				399	/// Points one beyond the last code unit in the extended grapheme cluster.
				400	///
				401	/// It's expected the caller has the start position and thus can determine
				402	/// the code unit range of the extended grapheme cluster.
Kaido Kert	56d7c4e	2024-04-13 12:59:27 -0700	[diff] [blame]	403	_Iterator __last_;
Kaido Kert	788710a	2023-06-05 07:50:22 -0700	[diff] [blame]	404	};
				405
				406	_LIBCPP_HIDE_FROM_ABI constexpr __cluster __consume() {
				407	_LIBCPP_ASSERT(
				408	__next_prop_ != __extended_grapheme_custer_property_boundary::__property::__eot,
				409	"can't move beyond the end of input");
Kaido Kert	56d7c4e	2024-04-13 12:59:27 -0700	[diff] [blame]	410
Kaido Kert	788710a	2023-06-05 07:50:22 -0700	[diff] [blame]	411	char32_t __code_point = __next_code_point_;
				412	if (!__code_point_view_.__at_end())
				413	return {__code_point, __get_break()};
				414
				415	__next_prop_ = __extended_grapheme_custer_property_boundary::__property::__eot;
				416	return {__code_point, __code_point_view_.__position()};
				417	}
				418
				419	private:
				420	__code_point_view<_CharT> __code_point_view_;
				421
				422	char32_t __next_code_point_;
				423	__extended_grapheme_custer_property_boundary::__property __next_prop_;
				424
Kaido Kert	56d7c4e	2024-04-13 12:59:27 -0700	[diff] [blame]	425	_LIBCPP_HIDE_FROM_ABI constexpr _Iterator __get_break() {
Kaido Kert	788710a	2023-06-05 07:50:22 -0700	[diff] [blame]	426	bool __ri_break_allowed = true;
				427	bool __has_extened_pictographic = false;
				428	while (true) {
Kaido Kert	56d7c4e	2024-04-13 12:59:27 -0700	[diff] [blame]	429	_Iterator __result = __code_point_view_.__position();
Kaido Kert	788710a	2023-06-05 07:50:22 -0700	[diff] [blame]	430	__extended_grapheme_custer_property_boundary::__property __prev = __next_prop_;
				431	if (__code_point_view_.__at_end()) {
				432	__next_prop_ = __extended_grapheme_custer_property_boundary::__property::__eot;
				433	return __result;
				434	}
Kaido Kert	56d7c4e	2024-04-13 12:59:27 -0700	[diff] [blame]	435	__next_code_point_ = __code_point_view_.__consume().__code_point;
Kaido Kert	788710a	2023-06-05 07:50:22 -0700	[diff] [blame]	436	__next_prop_ = __extended_grapheme_custer_property_boundary::__get_property(__next_code_point_);
				437
				438	__has_extened_pictographic \|=
				439	__prev == __extended_grapheme_custer_property_boundary::__property::__Extended_Pictographic;
				440
				441	if (__at_extended_grapheme_cluster_break(__ri_break_allowed, __has_extened_pictographic, __prev, __next_prop_))
				442	return __result;
				443	}
				444	}
				445	};
				446
Kaido Kert	56d7c4e	2024-04-13 12:59:27 -0700	[diff] [blame]	447	template <contiguous_iterator _Iterator>
				448	__extended_grapheme_cluster_view(_Iterator, _Iterator) -> __extended_grapheme_cluster_view<iter_value_t<_Iterator>>;
Kaido Kert	788710a	2023-06-05 07:50:22 -0700	[diff] [blame]	449
				450	# else // _LIBCPP_HAS_NO_UNICODE
				451
				452	// For ASCII every character is a "code point".
				453	// This makes it easier to write code agnostic of the _LIBCPP_HAS_NO_UNICODE define.
				454	template <class _CharT>
				455	class __code_point_view {
Kaido Kert	56d7c4e	2024-04-13 12:59:27 -0700	[diff] [blame]	456	using _Iterator = typename basic_string_view<_CharT>::const_iterator;
				457
Kaido Kert	788710a	2023-06-05 07:50:22 -0700	[diff] [blame]	458	public:
Kaido Kert	56d7c4e	2024-04-13 12:59:27 -0700	[diff] [blame]	459	_LIBCPP_HIDE_FROM_ABI constexpr explicit __code_point_view(_Iterator __first, _Iterator __last)
Kaido Kert	788710a	2023-06-05 07:50:22 -0700	[diff] [blame]	460	: __first_(__first), __last_(__last) {}
				461
				462	_LIBCPP_HIDE_FROM_ABI constexpr bool __at_end() const noexcept { return __first_ == __last_; }
Kaido Kert	56d7c4e	2024-04-13 12:59:27 -0700	[diff] [blame]	463	_LIBCPP_HIDE_FROM_ABI constexpr _Iterator __position() const noexcept { return __first_; }
Kaido Kert	788710a	2023-06-05 07:50:22 -0700	[diff] [blame]	464
Kaido Kert	56d7c4e	2024-04-13 12:59:27 -0700	[diff] [blame]	465	[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr __consume_result __consume() noexcept {
Kaido Kert	788710a	2023-06-05 07:50:22 -0700	[diff] [blame]	466	_LIBCPP_ASSERT(__first_ != __last_, "can't move beyond the end of input");
Kaido Kert	56d7c4e	2024-04-13 12:59:27 -0700	[diff] [blame]	467	return {static_cast<char32_t>(*__first_++)};
Kaido Kert	788710a	2023-06-05 07:50:22 -0700	[diff] [blame]	468	}
				469
Kaido Kert	788710a	2023-06-05 07:50:22 -0700	[diff] [blame]	470	private:
Kaido Kert	56d7c4e	2024-04-13 12:59:27 -0700	[diff] [blame]	471	_Iterator __first_;
				472	_Iterator __last_;
Kaido Kert	788710a	2023-06-05 07:50:22 -0700	[diff] [blame]	473	};
				474
				475	# endif // _LIBCPP_HAS_NO_UNICODE
				476
				477	} // namespace __unicode
				478
Kaido Kert	56d7c4e	2024-04-13 12:59:27 -0700	[diff] [blame]	479	#endif //_LIBCPP_STD_VER >= 20
Kaido Kert	788710a	2023-06-05 07:50:22 -0700	[diff] [blame]	480
				481	_LIBCPP_END_NAMESPACE_STD
				482
				483	#endif // _LIBCPP___FORMAT_UNICODE_H