// This file contains a draft interface for querying Unicode character // properties. // It does not compile yet. #include #include namespace boost { #ifndef __UNICODEINTEGRALTYPES typedef uint32_t codepoint; typedef uint16_t char16_t; #endif //__UNICODEINTEGRALTYPES namespace unicode { typedef ::boost::char32_t codepoint; /********************************************************************************* ** This is automatically generated from blocks.txt, taking each entry and ** replacing '-' with '_' and removing all ' ' ** Note: Private is added to this list *********************************************************************************/ struct range { enum type { latin_1_supplement, latin_extended_a, latin_extended_b, ipa_extensions, spacing_modifier_letters, combining_diacritical_marks, greek_and_coptic, cyrillic, cyrillic_supplement, armenian, hebrew, arabic, syriac, arabic_supplement, thaana, devanagari, bengali, gurmukhi, gujarati, oriya, tamil, telugu, kannada, malayalam, sinhala, thai, lao, tibetan, myanmar, georgian, hangulJamo, ethiopic, ethiopic_supplement, cherokee, unified_canadian_aboriginal_syllabics, ogham, runic, tagalog, hanunoo, buhid, tagbanwa, khmer, mongolian, limbu, taiLe, new_tai_lue, khmerSymbols, buginese, phonetic_extensions, phonetic_extensions_supplement, combining_diacritical_marks_supplement, latin_extended_additional, greek_extended, general_punctuation, superscripts_and_subscripts, currency_symbols, combining_diacritical_marks_for_symbols, letterlike_symbols, number_forms, arrows, mathematical_operators, miscellaneous_technical, control_pictures, optical_character_recognition, enclosed_alphanumerics, box_drawing, block_elements, geometric_shapes, miscellaneous_symbols, dingbats, miscellaneous_mathematical_symbols_a, supplemental_arrows_a, braille_patterns, supplemental_arrows_b, miscellaneous_mathematical_symbols_b, supplemental_mathematical_operators, miscellaneous_symbols_and_arrows, glagolitic, coptic, georgian_supplement, tifinagh, ethiopicextended, supplemental_punctuation, cjk_radicals_supplement, kangxi_radicals, ideographic_description_characters, cjk_symbols_and_punctuation, hiragana, katakana, bopomofo, hangul_compatibility_jamo, kanbun, bopomofo_extended, cjk_strokes, katakana_phonetic_extensions, enclosed_cjk_letters_and_months, cjk_compatibility, cjk_unified_ideographs_extension_a, yijing_hexagram_symbols, cjk_unified_ideographs, yi_syllables, yi_radicals, modifier_tone_letters, syloti_nagri, hangul_syllables, high_surrogates, high_private_use_surrogates, low_surrogates, private_use_area, cjk_compatibility_ideographs, alphabetic_presentation_forms, arabic_presentation_forms_a, variation_selectors, vertical_forms, combining_half_marks, cjk_compatibility_forms, small_form_variants, arabic_presentation_forms_b, halfwidth_and_fullwidth_forms, specials, linear_b_syllabary, linear_b_ideograms, aegean_numbers, ancient_greek_numbers, old_italic, gothic, ugaritic, old_persian, deseret, shavian, osmanya, cypriot_syllabary, kharoshthi, byzantine_musical_symbols, musical_symbols, ancient_greek_musical_notation, tai_xuan_jing_symbols, mathemat, private_ }; }; /********************************************************************************* ** Unicode types *********************************************************************************/ struct category { enum type { letter, mark, number, separator, other, punctuation, symbol }; }; struct category_casing { enum type { uppercase, lowercase, titlecase, modifier, other }; }; struct category_mark { enum type { non_spacing, space_combining, mark_enclosing }; }; struct category_number { enum type { decimal_digit, letter, other }; }; struct category_separator { enum type { space, line, paragraph }; }; struct category_other { enum type { control, format, surrogate, private_use, not_assigned }; }; struct category_punctuation { enum type { connector, dash, open, close, initial_quote, final_quote, other }; }; struct category_symbol { enum type { math, currency, modifier, other }; }; struct join_type { enum type { none, right, left, dual, causing, transparent }; }; struct bidi_char_type { enum type { strong_left_to_right = 0x100, strong_left_to_right_embedding, strong_left_to_right_override, strong_right_to_left = 0x200, strong_right_to_left_arabic, strong_right_to_left_embedding, strong_right_to_left_override, weak_pop_direction_format = 0x1000, weak_european_digits, weak_european_number_separator, weak_european_number_terminator, weak_arabic_number, weak_common_number_separator, weak_non_spacing_mark, weak_boundary_neutral, neutral_paragraph_separator = 0x2000, neutral_segment_separator, neutral_whitespace, neutral_other, }; }; struct break_class { enum type { mandatory, carriage_return, line_feed, combining_marks, next_line, surrogates, word_joiner, zero_width_space, non_breaking, contingent_break_opport, space, break_opport_before_after, break_opport_after, break_opport_before, hyphen, closing_punct, exclamation_interrog, inseparable, non_starter, opening_punct, ambiguous_quote, infix_separator, numeric, postfix_numeric, prefix_numeric, symbols_allowing_breaks, ambiguous, ordinary_alpabetic_and_symbol_chars, hangul_lv_syllable, hangul_lvt_syllable, ideograph, hangul_l_jamo, hangul_v_jamo, hangul_t_jamo, complex_context, unknown }; }; struct break_action { enum type { direct, indirect, combiningIndirect, combiningProhibited, prohibited, explicit_ }; }; /********************************************************************************* ** Functions *********************************************************************************/ /** \todo The behaviour of these functions when called with any respective out-of-range values should be specified. **/ category::type get_category(codepoint ch); size_t get_combining_class(codepoint ch); category_casing::type get_category_casing(codepoint ch); category_mark::type get_category_mark(codepoint ch); category_number::type get_category_number(codepoint ch); category_separator::type get_category_separator(codepoint ch); category_other::type get_category_other(codepoint ch); category_punctuation::type get_category_punctuation(codepoint ch); category_symbol::type get_category_symbol(codepoint ch); join_type::type get_join_type(codepoint ch); bidi_char_type::type get_bidi_char_type(codepoint ch); break_class::type get_break_class(codepoint ch); // forms information codepoint get_nominal_form(codepoint ch); codepoint get_left_form(codepoint ch); codepoint get_right_form(codepoint ch); codepoint get_medial_form(codepoint ch); // blocks range::type get_range(codepoint ch); // separators /// \pre: current != begin template BidirectionalIterator previous_grapheme( BidirectionalIterator begin, BidirectionalIterator current); /// \pre: current != end template ForwardIterator next_grapheme( ForwardIterator current, ForwardIterator end); /// \pre: current != begin template BidirectionalIterator previous_word( BidirectionalIterator begin, BidirectionalIterator current, BidirectionalIterator end); /// \pre: current != end template BidirectionalIterator next_word( BidirectionalIterator begin, BidirectionalIterator current, BidirectionalIterator end); /// \pre: current != begin template BidirectionalIterator previous_sentence( BidirectionalIterator begin, BidirectionalIterator current, BidirectionalIterator end); /// \pre: current != end template BidirectionalIterator next_sentence( BidirectionalIterator begin, BidirectionalIterator current, BidirectionalIterator end); /// \pre: current != begin template std::pair previous_line_break( BidirectionalIterator begin, BidirectionalIterator current, BidirectionalIterator end); /// \pre: current != end template std::pair next_line_break( BidirectionalIterator begin, BidirectionalIterator current, BidirectionalIterator end); // Exactly what objects this class will compare will become clear // after the unicode string class has been defined. // It should be possible to use this class for STL containers // (e.g. std::map) and algorithms (e.g. std::lower_bound). class collation { public: template <...> bool operator () ( const string<...> & s1, const string<...> & s2) const; template <...> bool operator () ( const string_with_sort_data<...> & s1, const string_with_sort_data<...> & s2) const; }; class locale { public: template void lowercase(InputIterator begin, InputIterator end, OutputIterator out) const; template void uppercase(InputIterator begin, InputIterator end, OutputIterator out) const; // A sketch of what collation might look like. collation collate_base_characters() const; collation collate_accents() const; collation collate_case() const; collation collate_punctuation() const; collation collate_tie_breaker() const; }; locale default_locale(); // ---------------------------------------------------------------------------- // inline helpers // TODO add lots based on core functions above // bool is_space(codepoint ch) { // space for any kind of separator. return get_category_separator (ch) == category_separator::space; } bool is_strong_ltor(codepoint ch) { return get_break_class (ch) & 0xF00 == 0x100; } bool is_strong_rtol(codepoint ch) { return get_break_class (ch) & 0xF00 == 0x100; } } // namespace unicode } // namespace boost