third_party/llvm-project/libcxx/utils/generate_extended_grapheme_cluster_test.py - cobalt - Git at Google

 #!/usr/bin/env python
 # ===----------------------------------------------------------------------===##
 #
 # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 # See https://llvm.org/LICENSE.txt for license information.
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 #
 # ===----------------------------------------------------------------------===##

 # The code is based on
 # https://github.com/microsoft/STL/blob/main/tools/unicode_properties_parse/grapheme_break_test_data_gen.py
 #
 # Copyright (c) Microsoft Corporation.
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

 from pathlib import Path
 from dataclasses import dataclass, field
 from typing import Optional, TextIO
 from array import array
 import sys


 @dataclass
 class BreakTestItem:
     code_points: list[int] = field(default_factory=list)
     encoded: str = ""
     breaks_utf8: list[int] = field(default_factory=list)
     breaks_utf16: list[int] = field(default_factory=list)
     breaks_utf32: list[int] = field(default_factory=list)


 class CommentLine:
     pass


 class EOF:
     pass


 def parseBreakTestLine(input: TextIO) -> Optional[BreakTestItem]:
     result = BreakTestItem()
     code_point = -1
     utf8 = 0
     utf16 = 0
     utf32 = 0

     while True:
         c = input.read(1)
         if c == "\N{DIVISION SIGN}":
             # The line starts with a division sign, don't add it to the output.
             if code_point != -1:
                 result.code_points.append(code_point)
                 code_point = -1
                 result.breaks_utf8.append(utf8)
                 result.breaks_utf16.append(utf16)
                 result.breaks_utf32.append(utf32)

             assert input.read(1).isspace()
             continue
         if c == "\N{MULTIPLICATION SIGN}":
             assert input.read(1).isspace()
             continue
         if c.isalnum():
             while next := input.read(1):
                 if next.isalnum():
                     c += next
                 else:
                     assert next.isspace()
                     break
             i = int(c, base=16)
             if code_point == -1:
                 code_point = i

             result.encoded += f"\\U{i:08x}"
             c = chr(i)
             utf8 += c.encode().__len__()
             # Since we only care about the number of code units the byte order
             # doesn't matter. The byte order is specified to avoid the BOM
             utf16 += int(c.encode("utf-16-le").__len__() / 2)
             utf32 += int(c.encode("utf-32-le").__len__() / 4)
             continue
         if c == "#":
             input.readline()
             return result
         if c == "\n":
             return result
         if c == "":
             return None
         assert False


 cpp_template = """// -*- C++ -*-
 //===----------------------------------------------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//

 // WARNING, this entire header is generated by
 // utils/generate_extended_grapheme_cluster_test.py
 // DO NOT MODIFY!

 // UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
 //
 // See Terms of Use <https://www.unicode.org/copyright.html>
 // for definitions of Unicode Inc.'s Data Files and Software.
 //
 // NOTICE TO USER: Carefully read the following legal agreement.
 // BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
 // DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
 // YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
 // TERMS AND CONDITIONS OF THIS AGREEMENT.
 // IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
 // THE DATA FILES OR SOFTWARE.
 //
 // COPYRIGHT AND PERMISSION NOTICE
 //
 // Copyright (c) 1991-2022 Unicode, Inc. All rights reserved.
 // Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
 //
 // Permission is hereby granted, free of charge, to any person obtaining
 // a copy of the Unicode data files and any associated documentation
 // (the "Data Files") or Unicode software and any associated documentation
 // (the "Software") to deal in the Data Files or Software
 // without restriction, including without limitation the rights to use,
 // copy, modify, merge, publish, distribute, and/or sell copies of
 // the Data Files or Software, and to permit persons to whom the Data Files
 // or Software are furnished to do so, provided that either
 // (a) this copyright and permission notice appear with all copies
 // of the Data Files or Software, or
 // (b) this copyright and permission notice appear in associated
 // Documentation.
 //
 // THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
 // ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
 // WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 // NONINFRINGEMENT OF THIRD PARTY RIGHTS.
 // IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
 // NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
 // DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
 // DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
 // TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
 // PERFORMANCE OF THE DATA FILES OR SOFTWARE.
 //
 // Except as contained in this notice, the name of a copyright holder
 // shall not be used in advertising or otherwise to promote the sale,
 // use or other dealings in these Data Files or Software without prior
 // written authorization of the copyright holder.

 #ifndef LIBCXX_TEST_STD_UTILITIES_FORMAT_FORMAT_STRING_FORMAT_STRING_STD_EXTENDED_GRAPHEME_CLUSTER_H
 #define LIBCXX_TEST_STD_UTILITIES_FORMAT_FORMAT_STRING_FORMAT_STRING_STD_EXTENDED_GRAPHEME_CLUSTER_H

 #include <array>
 #include <string_view>
 #include <vector>

 #include "test_macros.h"

 template <class CharT>
 struct data {{
   /// The input to parse.
   std::basic_string_view<CharT> input;

   /// The first code point all extended grapheme clusters in the input.
   std::vector<char32_t> code_points;

   /// The offset of the last code units of the extended grapheme clusters in the input.
   ///
   /// The vector has the same number of entries as \\ref code_points.
   std::vector<std::size_t> breaks;
 }};

 /// The data for UTF-8.
 std::array<data<char>, {0}> data_utf8 = {{{{
 {1}}}}};

 /// The data for UTF-16.
 ///
 /// Note that most of the data for the UTF-16 and UTF-32 are identical. However
 /// since the size of the code units differ the breaks can contain different
 /// values.
 #ifndef TEST_HAS_NO_WIDE_CHARACTERS
 std::array<data<wchar_t>, {0}> data_utf16 = {{{{
 {2}}}}};

 /// The data for UTF-8.
 ///
 /// Note that most of the data for the UTF-16 and UTF-32 are identical. However
 /// since the size of the code units differ the breaks can contain different
 /// values.
 std::array<data<wchar_t>, {0}> data_utf32 = {{{{
 {3}}}}};
 #endif // TEST_HAS_NO_WIDE_CHARACTERS

 #endif // LIBCXX_TEST_STD_UTILITIES_FORMAT_FORMAT_STRING_FORMAT_STRING_STD_EXTENDED_GRAPHEME_CLUSTER_H"""

 cpp_test_data_line_template = "     {{{}, {{{}}}, {{{}}}}}"


 def lineToCppDataLineUtf8(line: BreakTestItem) -> str:
     return cpp_test_data_line_template.format(
         f'"{line.encoded}"',
         ", ".join([str(x) for x in line.code_points]),
         ", ".join([str(x) for x in line.breaks_utf8]),
     )


 def lineToCppDataLineUtf16(line: BreakTestItem) -> str:
     return cpp_test_data_line_template.format(
         f'L"{line.encoded}"',
         ", ".join([str(x) for x in line.code_points]),
         ", ".join([str(x) for x in line.breaks_utf16]),
     )


 def lineToCppDataLineUtf32(line: BreakTestItem) -> str:
     return cpp_test_data_line_template.format(
         f'L"{line.encoded}"',
         ", ".join([str(x) for x in line.code_points]),
         ", ".join([str(x) for x in line.breaks_utf32]),
     )


 """
 Generate test data from "GraphemeBreakText.txt"
 This file can be downloaded from:
 https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.txt
 This script looks for GraphemeBreakTest.txt in same directory as this script
 """


 def generate_all() -> str:
     test_data_path = Path(__file__)
     test_data_path = test_data_path.absolute()
     test_data_path = (
         test_data_path.parent / "data" / "unicode" / "GraphemeBreakTest.txt"
     )
     lines = list()
     with open(test_data_path, mode="rt", encoding="utf-8") as file:
         while line := parseBreakTestLine(file):
             if len(line.encoded) > 0:
                 lines.append(line)
     return cpp_template.format(
         len(lines),
         ",\n".join(map(lineToCppDataLineUtf8, lines)),
         ",\n".join(map(lineToCppDataLineUtf16, lines)),
         ",\n".join(map(lineToCppDataLineUtf32, lines)),
     )


 if __name__ == "__main__":
     if len(sys.argv) == 2:
         sys.stdout = open(sys.argv[1], "w")
     print(generate_all())
	#!/usr/bin/env python
	# ===----------------------------------------------------------------------===##
	#
	# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	# See https://llvm.org/LICENSE.txt for license information.
	# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	#
	# ===----------------------------------------------------------------------===##

	# The code is based on
	# https://github.com/microsoft/STL/blob/main/tools/unicode_properties_parse/grapheme_break_test_data_gen.py
	#
	# Copyright (c) Microsoft Corporation.
	# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

	from pathlib import Path
	from dataclasses import dataclass, field
	from typing import Optional, TextIO
	from array import array
	import sys


	@dataclass
	class BreakTestItem:
	code_points: list[int] = field(default_factory=list)
	encoded: str = ""
	breaks_utf8: list[int] = field(default_factory=list)
	breaks_utf16: list[int] = field(default_factory=list)
	breaks_utf32: list[int] = field(default_factory=list)


	class CommentLine:
	pass


	class EOF:
	pass


	def parseBreakTestLine(input: TextIO) -> Optional[BreakTestItem]:
	result = BreakTestItem()
	code_point = -1
	utf8 = 0
	utf16 = 0
	utf32 = 0

	while True:
	c = input.read(1)
	if c == "\N{DIVISION SIGN}":
	# The line starts with a division sign, don't add it to the output.
	if code_point != -1:
	result.code_points.append(code_point)
	code_point = -1
	result.breaks_utf8.append(utf8)
	result.breaks_utf16.append(utf16)
	result.breaks_utf32.append(utf32)

	assert input.read(1).isspace()
	continue
	if c == "\N{MULTIPLICATION SIGN}":
	assert input.read(1).isspace()
	continue
	if c.isalnum():
	while next := input.read(1):
	if next.isalnum():
	c += next
	else:
	assert next.isspace()
	break
	i = int(c, base=16)
	if code_point == -1:
	code_point = i

	result.encoded += f"\\U{i:08x}"
	c = chr(i)
	utf8 += c.encode().__len__()
	# Since we only care about the number of code units the byte order
	# doesn't matter. The byte order is specified to avoid the BOM
	utf16 += int(c.encode("utf-16-le").__len__() / 2)
	utf32 += int(c.encode("utf-32-le").__len__() / 4)
	continue
	if c == "#":
	input.readline()
	return result
	if c == "\n":
	return result
	if c == "":
	return None
	assert False


	cpp_template = """// -- C++ --
	//===----------------------------------------------------------------------===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//

	// WARNING, this entire header is generated by
	// utils/generate_extended_grapheme_cluster_test.py
	// DO NOT MODIFY!

	// UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
	//
	// See Terms of Use <https://www.unicode.org/copyright.html>
	// for definitions of Unicode Inc.'s Data Files and Software.
	//
	// NOTICE TO USER: Carefully read the following legal agreement.
	// BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
	// DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
	// YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
	// TERMS AND CONDITIONS OF THIS AGREEMENT.
	// IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
	// THE DATA FILES OR SOFTWARE.
	//
	// COPYRIGHT AND PERMISSION NOTICE
	//
	// Copyright (c) 1991-2022 Unicode, Inc. All rights reserved.
	// Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
	//
	// Permission is hereby granted, free of charge, to any person obtaining
	// a copy of the Unicode data files and any associated documentation
	// (the "Data Files") or Unicode software and any associated documentation
	// (the "Software") to deal in the Data Files or Software
	// without restriction, including without limitation the rights to use,
	// copy, modify, merge, publish, distribute, and/or sell copies of
	// the Data Files or Software, and to permit persons to whom the Data Files
	// or Software are furnished to do so, provided that either
	// (a) this copyright and permission notice appear with all copies
	// of the Data Files or Software, or
	// (b) this copyright and permission notice appear in associated
	// Documentation.
	//
	// THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
	// ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
	// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
	// NONINFRINGEMENT OF THIRD PARTY RIGHTS.
	// IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
	// NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
	// DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
	// DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
	// TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
	// PERFORMANCE OF THE DATA FILES OR SOFTWARE.
	//
	// Except as contained in this notice, the name of a copyright holder
	// shall not be used in advertising or otherwise to promote the sale,
	// use or other dealings in these Data Files or Software without prior
	// written authorization of the copyright holder.

	#ifndef LIBCXX_TEST_STD_UTILITIES_FORMAT_FORMAT_STRING_FORMAT_STRING_STD_EXTENDED_GRAPHEME_CLUSTER_H
	#define LIBCXX_TEST_STD_UTILITIES_FORMAT_FORMAT_STRING_FORMAT_STRING_STD_EXTENDED_GRAPHEME_CLUSTER_H

	#include <array>
	#include <string_view>
	#include <vector>

	#include "test_macros.h"

	template <class CharT>
	struct data {{
	/// The input to parse.
	std::basic_string_view<CharT> input;

	/// The first code point all extended grapheme clusters in the input.
	std::vector<char32_t> code_points;

	/// The offset of the last code units of the extended grapheme clusters in the input.
	///
	/// The vector has the same number of entries as \\ref code_points.
	std::vector<std::size_t> breaks;
	}};

	/// The data for UTF-8.
	std::array<data<char>, {0}> data_utf8 = {{{{
	{1}}}}};

	/// The data for UTF-16.
	///
	/// Note that most of the data for the UTF-16 and UTF-32 are identical. However
	/// since the size of the code units differ the breaks can contain different
	/// values.
	#ifndef TEST_HAS_NO_WIDE_CHARACTERS
	std::array<data<wchar_t>, {0}> data_utf16 = {{{{
	{2}}}}};

	/// The data for UTF-8.
	///
	/// Note that most of the data for the UTF-16 and UTF-32 are identical. However
	/// since the size of the code units differ the breaks can contain different
	/// values.
	std::array<data<wchar_t>, {0}> data_utf32 = {{{{
	{3}}}}};
	#endif // TEST_HAS_NO_WIDE_CHARACTERS

	#endif // LIBCXX_TEST_STD_UTILITIES_FORMAT_FORMAT_STRING_FORMAT_STRING_STD_EXTENDED_GRAPHEME_CLUSTER_H"""

	cpp_test_data_line_template = " {{{}, {{{}}}, {{{}}}}}"


	def lineToCppDataLineUtf8(line: BreakTestItem) -> str:
	return cpp_test_data_line_template.format(
	f'"{line.encoded}"',
	", ".join([str(x) for x in line.code_points]),
	", ".join([str(x) for x in line.breaks_utf8]),
	)


	def lineToCppDataLineUtf16(line: BreakTestItem) -> str:
	return cpp_test_data_line_template.format(
	f'L"{line.encoded}"',
	", ".join([str(x) for x in line.code_points]),
	", ".join([str(x) for x in line.breaks_utf16]),
	)


	def lineToCppDataLineUtf32(line: BreakTestItem) -> str:
	return cpp_test_data_line_template.format(
	f'L"{line.encoded}"',
	", ".join([str(x) for x in line.code_points]),
	", ".join([str(x) for x in line.breaks_utf32]),
	)


	"""
	Generate test data from "GraphemeBreakText.txt"
	This file can be downloaded from:
	https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.txt
	This script looks for GraphemeBreakTest.txt in same directory as this script
	"""


	def generate_all() -> str:
	test_data_path = Path(__file__)
	test_data_path = test_data_path.absolute()
	test_data_path = (
	test_data_path.parent / "data" / "unicode" / "GraphemeBreakTest.txt"
	)
	lines = list()
	with open(test_data_path, mode="rt", encoding="utf-8") as file:
	while line := parseBreakTestLine(file):
	if len(line.encoded) > 0:
	lines.append(line)
	return cpp_template.format(
	len(lines),
	",\n".join(map(lineToCppDataLineUtf8, lines)),
	",\n".join(map(lineToCppDataLineUtf16, lines)),
	",\n".join(map(lineToCppDataLineUtf32, lines)),
	)


	if __name__ == "__main__":
	if len(sys.argv) == 2:
	sys.stdout = open(sys.argv[1], "w")
	print(generate_all())