Skip to content

Commit

Permalink
Move _Width_estimate_intervals_v2 to __msvc_format_ucd_tables.hpp (#4446)
Browse files: browse the repository at this point in the history

Co-authored-by: Stephan T. Lavavej <stl@nuwen.net>
  • Loading branch information
cpplearner and StephanTLavavej authored Mar 8, 2024
1 parent ddc5a62 commit 4378648
Show file tree
Hide file tree
Showing 4 changed files with 67 additions and 231 deletions.
19 changes: 19 additions & 0 deletions stl/inc/__msvc_format_ucd_tables.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -521,6 +521,25 @@ inline constexpr _Unicode_property_data<_Grapheme_Extend_property_values, 363, t
0x1, 0x4, 0x1, 0x2, 0x2e, 0x17, 0x1, 0x3, 0x5, 0x8, 0x7, 0x4, 0x3, 0x37, 0x32, 0x1, 0x1, 0x5, 0xf, 0x7, 0x11,
0x7, 0x2, 0x5, 0x1, 0x7, 0x1, 0x4, 0x4, 0x7, 0x7, 0x60, 0xf0}};

// EastAsianWidth-15.0.0.txt
// Date: 2022-05-24, 17:40:20 GMT [KW, LI]
// Code point intervals used for width estimation, stored as a flat list of alternating bounds: each element at an
// even index is the first code point of a width-2 interval, and the element after it is one past that interval's
// last code point.
// NOTE(review): this definition appears to be emitted by tools/unicode_properties_parse/unicode_properties_data_gen.py;
// if so, any comment added here by hand must also go into that script's template or it will be lost on regeneration.
inline constexpr char32_t _Width_estimate_intervals_v2[] = {0x1100, 0x1160, 0x231a, 0x231c, 0x2329, 0x232b, 0x23e9,
0x23ed, 0x23f0, 0x23f1, 0x23f3, 0x23f4, 0x25fd, 0x25ff, 0x2614, 0x2616, 0x2648, 0x2654, 0x267f, 0x2680, 0x2693,
0x2694, 0x26a1, 0x26a2, 0x26aa, 0x26ac, 0x26bd, 0x26bf, 0x26c4, 0x26c6, 0x26ce, 0x26cf, 0x26d4, 0x26d5, 0x26ea,
0x26eb, 0x26f2, 0x26f4, 0x26f5, 0x26f6, 0x26fa, 0x26fb, 0x26fd, 0x26fe, 0x2705, 0x2706, 0x270a, 0x270c, 0x2728,
0x2729, 0x274c, 0x274d, 0x274e, 0x274f, 0x2753, 0x2756, 0x2757, 0x2758, 0x2795, 0x2798, 0x27b0, 0x27b1, 0x27bf,
0x27c0, 0x2b1b, 0x2b1d, 0x2b50, 0x2b51, 0x2b55, 0x2b56, 0x2e80, 0x2e9a, 0x2e9b, 0x2ef4, 0x2f00, 0x2fd6, 0x2ff0,
0x2ffc, 0x3000, 0x303f, 0x3041, 0x3097, 0x3099, 0x3100, 0x3105, 0x3130, 0x3131, 0x318f, 0x3190, 0x31e4, 0x31f0,
0x321f, 0x3220, 0x3248, 0x3250, 0xa48d, 0xa490, 0xa4c7, 0xa960, 0xa97d, 0xac00, 0xd7a4, 0xf900, 0xfb00, 0xfe10,
0xfe1a, 0xfe30, 0xfe53, 0xfe54, 0xfe67, 0xfe68, 0xfe6c, 0xff01, 0xff61, 0xffe0, 0xffe7, 0x16fe0, 0x16fe5, 0x16ff0,
0x16ff2, 0x17000, 0x187f8, 0x18800, 0x18cd6, 0x18d00, 0x18d09, 0x1aff0, 0x1aff4, 0x1aff5, 0x1affc, 0x1affd, 0x1afff,
0x1b000, 0x1b123, 0x1b132, 0x1b133, 0x1b150, 0x1b153, 0x1b155, 0x1b156, 0x1b164, 0x1b168, 0x1b170, 0x1b2fc, 0x1f004,
0x1f005, 0x1f0cf, 0x1f0d0, 0x1f18e, 0x1f18f, 0x1f191, 0x1f19b, 0x1f200, 0x1f203, 0x1f210, 0x1f23c, 0x1f240, 0x1f249,
0x1f250, 0x1f252, 0x1f260, 0x1f266, 0x1f300, 0x1f650, 0x1f680, 0x1f6c6, 0x1f6cc, 0x1f6cd, 0x1f6d0, 0x1f6d3, 0x1f6d5,
0x1f6d8, 0x1f6dc, 0x1f6e0, 0x1f6eb, 0x1f6ed, 0x1f6f4, 0x1f6fd, 0x1f7e0, 0x1f7ec, 0x1f7f0, 0x1f7f1, 0x1f900, 0x1fa00,
0x1fa70, 0x1fa7d, 0x1fa80, 0x1fa89, 0x1fa90, 0x1fabe, 0x1fabf, 0x1fac6, 0x1face, 0x1fadc, 0x1fae0, 0x1fae9, 0x1faf0,
0x1faf9, 0x20000, 0x2fffe, 0x30000, 0x3fffe};

_STD_END

#pragma pop_macro("new")
Expand Down
24 changes: 0 additions & 24 deletions stl/inc/format
Original file line number Diff line number Diff line change
Expand Up @@ -1018,30 +1018,6 @@ _NODISCARD constexpr bool _Is_execution_charset_self_synchronizing() {
#endif // ^^^ EDG workaround ^^^
}

// Generated per N4950 [format.string.std]/13, by tools/unicode_properties_parse/format_width_estimate_intervals.py
// in the https://github.com/microsoft/stl repository.

// EastAsianWidth-15.0.0.txt
// Date: 2022-05-24, 17:40:20 GMT [KW, LI]
// Flat list of alternating interval bounds.
// NOTE(review): presumably each even-indexed element is the first code point of a width-2 interval and the next
// element is one past its last code point; the generating script is not visible here, so confirm before relying on it.
inline constexpr char32_t _Width_estimate_intervals_v2[] = { //
0x1100u, 0x1160u, 0x231Au, 0x231Cu, 0x2329u, 0x232Bu, 0x23E9u, 0x23EDu, 0x23F0u, 0x23F1u, 0x23F3u, 0x23F4u, 0x25FDu,
0x25FFu, 0x2614u, 0x2616u, 0x2648u, 0x2654u, 0x267Fu, 0x2680u, 0x2693u, 0x2694u, 0x26A1u, 0x26A2u, 0x26AAu, 0x26ACu,
0x26BDu, 0x26BFu, 0x26C4u, 0x26C6u, 0x26CEu, 0x26CFu, 0x26D4u, 0x26D5u, 0x26EAu, 0x26EBu, 0x26F2u, 0x26F4u, 0x26F5u,
0x26F6u, 0x26FAu, 0x26FBu, 0x26FDu, 0x26FEu, 0x2705u, 0x2706u, 0x270Au, 0x270Cu, 0x2728u, 0x2729u, 0x274Cu, 0x274Du,
0x274Eu, 0x274Fu, 0x2753u, 0x2756u, 0x2757u, 0x2758u, 0x2795u, 0x2798u, 0x27B0u, 0x27B1u, 0x27BFu, 0x27C0u, 0x2B1Bu,
0x2B1Du, 0x2B50u, 0x2B51u, 0x2B55u, 0x2B56u, 0x2E80u, 0x2E9Au, 0x2E9Bu, 0x2EF4u, 0x2F00u, 0x2FD6u, 0x2FF0u, 0x2FFCu,
0x3000u, 0x303Fu, 0x3041u, 0x3097u, 0x3099u, 0x3100u, 0x3105u, 0x3130u, 0x3131u, 0x318Fu, 0x3190u, 0x31E4u, 0x31F0u,
0x321Fu, 0x3220u, 0x3248u, 0x3250u, 0xA48Du, 0xA490u, 0xA4C7u, 0xA960u, 0xA97Du, 0xAC00u, 0xD7A4u, 0xF900u, 0xFB00u,
0xFE10u, 0xFE1Au, 0xFE30u, 0xFE53u, 0xFE54u, 0xFE67u, 0xFE68u, 0xFE6Cu, 0xFF01u, 0xFF61u, 0xFFE0u, 0xFFE7u,
0x16FE0u, 0x16FE5u, 0x16FF0u, 0x16FF2u, 0x17000u, 0x187F8u, 0x18800u, 0x18CD6u, 0x18D00u, 0x18D09u, 0x1AFF0u,
0x1AFF4u, 0x1AFF5u, 0x1AFFCu, 0x1AFFDu, 0x1AFFFu, 0x1B000u, 0x1B123u, 0x1B132u, 0x1B133u, 0x1B150u, 0x1B153u,
0x1B155u, 0x1B156u, 0x1B164u, 0x1B168u, 0x1B170u, 0x1B2FCu, 0x1F004u, 0x1F005u, 0x1F0CFu, 0x1F0D0u, 0x1F18Eu,
0x1F18Fu, 0x1F191u, 0x1F19Bu, 0x1F200u, 0x1F203u, 0x1F210u, 0x1F23Cu, 0x1F240u, 0x1F249u, 0x1F250u, 0x1F252u,
0x1F260u, 0x1F266u, 0x1F300u, 0x1F650u, 0x1F680u, 0x1F6C6u, 0x1F6CCu, 0x1F6CDu, 0x1F6D0u, 0x1F6D3u, 0x1F6D5u,
0x1F6D8u, 0x1F6DCu, 0x1F6E0u, 0x1F6EBu, 0x1F6EDu, 0x1F6F4u, 0x1F6FDu, 0x1F7E0u, 0x1F7ECu, 0x1F7F0u, 0x1F7F1u,
0x1F900u, 0x1FA00u, 0x1FA70u, 0x1FA7Du, 0x1FA80u, 0x1FA89u, 0x1FA90u, 0x1FABEu, 0x1FABFu, 0x1FAC6u, 0x1FACEu,
0x1FADCu, 0x1FAE0u, 0x1FAE9u, 0x1FAF0u, 0x1FAF9u, 0x20000u, 0x2FFFEu, 0x30000u, 0x3FFFEu};

_NODISCARD constexpr int _Unicode_width_estimate(const char32_t _Ch) noexcept {
// Computes the width estimation for Unicode characters from N4950 [format.string.std]/13
// The two branches are functionally equivalent; `12` is chosen for performance here.
Expand Down
201 changes: 0 additions & 201 deletions tools/unicode_properties_parse/format_width_estimate_intervals.py

This file was deleted.

54 changes: 48 additions & 6 deletions tools/unicode_properties_parse/unicode_properties_data_gen.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,10 +77,10 @@ def compact_property_ranges(input: list[PropertyRange]) -> list[PropertyRange]:
}};
"""

INTERVALS_TEMPLATE = """
WIDTH_ESTIMATE_INTERVALS_TEMPLATE = """
{filename}
{timestamp}
inline constexpr char32_t _{prop_name}_ranges[{size}] = {{
inline constexpr char32_t _Width_estimate_intervals_v2[] = {{
{data}
}};
"""
Expand Down Expand Up @@ -148,7 +148,6 @@ def compact_property_ranges(input: list[PropertyRange]) -> list[PropertyRange]:
#if _STL_COMPILER_PREPROCESSOR
#include <cstdint>
#include <limits>
#include <xutility>
#pragma pack(push, _CRT_PACKING)
Expand All @@ -166,7 +165,7 @@ def compact_property_ranges(input: list[PropertyRange]) -> list[PropertyRange]:
uint16_t _Props_and_size[_NumRanges];
_NODISCARD constexpr _ValueEnum _Get_property_for_codepoint(const uint32_t _Code_point) const noexcept {{
ptrdiff_t _Upper_idx = _STD upper_bound(_Lower_bounds, _STD end(_Lower_bounds), _Code_point) - _Lower_bounds;
constexpr auto _No_value_constant = static_cast<_ValueEnum>((numeric_limits<uint8_t>::max)());
constexpr auto _No_value_constant = static_cast<_ValueEnum>(UINT8_MAX);
if (_Upper_idx == 0) {{
return _No_value_constant;
}}
Expand Down Expand Up @@ -274,10 +273,22 @@ def read_file(filename: str) -> list[PropertyRange]:
return filename, timestamp, ranges


def generate_width_estimate_intervals(filename: str, timestamp: str, width_2_ranges: list[PropertyRange]):
    """
    Render the C++ definition of the width-estimate interval table.

    Every range in width_2_ranges contributes two consecutive entries: its lower bound,
    followed by its upper bound plus one. The entries are written as lowercase hexadecimal
    literals joined by commas and substituted, together with the source file name and
    timestamp, into WIDTH_ESTIMATE_INTERVALS_TEMPLATE (with its leading whitespace stripped).
    """
    boundaries = [
        bound
        for interval in width_2_ranges
        for bound in (interval.lower, interval.upper + 1)
    ]
    hex_literals = ",".join(f"0x{bound:x}" for bound in boundaries)
    template = WIDTH_ESTIMATE_INTERVALS_TEMPLATE.lstrip()

    return template.format(filename=filename, timestamp=timestamp, data=hex_literals)


def generate_data_tables() -> str:
"""
Generate Unicode data for inclusion into <format> from
GraphemeBreakProperty.txt, emoji-data.txt, DerivedGeneralCategory.txt, and DerivedCoreProperties.txt
GraphemeBreakProperty.txt, emoji-data.txt, DerivedGeneralCategory.txt, DerivedCoreProperties.txt,
and EastAsianWidth.txt.
GraphemeBreakProperty.txt can be found at
https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt
Expand All @@ -291,28 +302,59 @@ def generate_data_tables() -> str:
DerivedCoreProperties.txt can be found at
https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
EastAsianWidth.txt can be found at
https://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt
All files are expected to be in the same directory as this script.
"""
gbp_filename, gbp_timestamp, gbp_ranges = read_file("GraphemeBreakProperty.txt")
emoji_filename, emoji_timestamp, emoji_ranges = read_file("emoji-data.txt")
cat_filename, cat_timestamp, cat_ranges = read_file("DerivedGeneralCategory.txt")
derived_filename, derived_timestamp, derived_ranges = read_file("DerivedCoreProperties.txt")
eaw_filename, eaw_timestamp, eaw_ranges = read_file("EastAsianWidth.txt")

printable_ranges = compact_property_ranges(sorted([
PropertyRange(x.lower, x.upper, "Yes")
for x in cat_ranges
if x.prop not in ('Cc', 'Cf', 'Cs', 'Co', 'Cn', 'Zl', 'Zp', 'Zs') or chr(x.lower) == ' '
], key=lambda x: x.lower))

# N4971 [format.string.std]/13
# The standard lists these blocks with inclusive end points, while Python ranges exclude
# their stop value, so each stop is written as "last code point + 1".
std_wide_ranges = [
    range(0x4DC0, 0x4DFF + 1),
    range(0x1F300, 0x1F5FF + 1),
    range(0x1F900, 0x1F9FF + 1),
]

def has_width_2(prop_range):
    """
    Return True when prop_range should be given an estimated width of 2.

    A range qualifies when its property is "F" or "W", or when it begins inside one of
    the blocks in std_wide_ranges. The assertions check that a range never straddles the
    end of such a block: a range that begins inside a block must end inside it, and a
    range that begins outside a block must not end inside it.
    """
    if prop_range.prop in ("F", "W"):
        return True

    for std_wide_range in std_wide_ranges:
        if prop_range.lower in std_wide_range:
            # stop is one past the block's last code point, hence the strict comparison.
            assert prop_range.upper < std_wide_range.stop

            return True
        else:
            assert prop_range.upper not in std_wide_range

    return False

width_2_ranges = compact_property_ranges(sorted([
PropertyRange(x.lower, x.upper, "Yes") for x in eaw_ranges if has_width_2(x)
], key=lambda x: x.lower))

gpb_cpp_data = generate_cpp_data(gbp_filename, gbp_timestamp, "Grapheme_Break", gbp_ranges)
emoji_cpp_data = generate_cpp_data(emoji_filename, emoji_timestamp, "Extended_Pictographic", [
x for x in emoji_ranges if x.prop == "Extended_Pictographic"])
# _printable follows a different naming scheme, to indicate that it is a fake Unicode property.
printable_cpp_data = generate_cpp_data(cat_filename, cat_timestamp, "_printable", printable_ranges)
grapheme_extend_cpp_data = generate_cpp_data(derived_filename, derived_timestamp, "Grapheme_Extend", [
x for x in derived_ranges if x.prop == "Grapheme_Extend"])
width_estimate_intervals = generate_width_estimate_intervals(eaw_filename, eaw_timestamp, width_2_ranges)

return "\n".join([gpb_cpp_data, emoji_cpp_data, printable_cpp_data, grapheme_extend_cpp_data])
return "\n".join(
[gpb_cpp_data, emoji_cpp_data, printable_cpp_data, grapheme_extend_cpp_data, width_estimate_intervals])


if __name__ == "__main__":
Expand Down

0 comments on commit 4378648

Please sign in to comment.