// This file contains a draft interface for querying Unicode character
// properties.
// It is still a draft: places where the original sketch was incomplete are
// marked NOTE(review) / TODO and use provisional types.

#include <stddef.h>
#include <stdint.h>

#include <utility>
#include <vector>

namespace boost {

#ifndef __UNICODEINTEGRALTYPES
    typedef uint32_t codepoint;
// char16_t is a keyword from C++11 on, so it can only be typedef'ed on older
// compilers. The _MSC_VER test covers compilers that provide the keyword while
// still reporting an old __cplusplus value.
#if __cplusplus < 201103L && !(defined(_MSC_VER) && _MSC_VER >= 1900)
    typedef uint16_t char16_t;
#endif
#endif //__UNICODEINTEGRALTYPES

namespace unicode {

    // NOTE(review): the draft aliased ::boost::char32_t here, a name this file
    // never declares (and one that cannot be declared where char32_t is a
    // keyword). The 32-bit type that IS declared above is used instead --
    // TODO confirm.
    typedef ::boost::codepoint codepoint;

    /*********************************************************************************
    ** This is automatically generated from blocks.txt, taking each entry and
    ** replacing '-' with '_' and removing all ' '
    ** Note: Private is added to this list
    **
    ** NOTE(review): several names do not follow one convention (hangulJamo,
    ** taiLe, khmerSymbols, ethiopicextended) and "mathemat" looks truncated.
    ** They are left untouched here; regenerate the list to fix them.
    *********************************************************************************/
    struct range
    {
        enum type
        {
            latin_1_supplement,
            latin_extended_a,
            latin_extended_b,
            ipa_extensions,
            spacing_modifier_letters,
            combining_diacritical_marks,
            greek_and_coptic,
            cyrillic,
            cyrillic_supplement,
            armenian,
            hebrew,
            arabic,
            syriac,
            arabic_supplement,
            thaana,
            devanagari,
            bengali,
            gurmukhi,
            gujarati,
            oriya,
            tamil,
            telugu,
            kannada,
            malayalam,
            sinhala,
            thai,
            lao,
            tibetan,
            myanmar,
            georgian,
            hangulJamo,
            ethiopic,
            ethiopic_supplement,
            cherokee,
            unified_canadian_aboriginal_syllabics,
            ogham,
            runic,
            tagalog,
            hanunoo,
            buhid,
            tagbanwa,
            khmer,
            mongolian,
            limbu,
            taiLe,
            new_tai_lue,
            khmerSymbols,
            buginese,
            phonetic_extensions,
            phonetic_extensions_supplement,
            combining_diacritical_marks_supplement,
            latin_extended_additional,
            greek_extended,
            general_punctuation,
            superscripts_and_subscripts,
            currency_symbols,
            combining_diacritical_marks_for_symbols,
            letterlike_symbols,
            number_forms,
            arrows,
            mathematical_operators,
            miscellaneous_technical,
            control_pictures,
            optical_character_recognition,
            enclosed_alphanumerics,
            box_drawing,
            block_elements,
            geometric_shapes,
            miscellaneous_symbols,
            dingbats,
            miscellaneous_mathematical_symbols_a,
            supplemental_arrows_a,
            braille_patterns,
            supplemental_arrows_b,
            miscellaneous_mathematical_symbols_b,
            supplemental_mathematical_operators,
            miscellaneous_symbols_and_arrows,
            glagolitic,
            coptic,
            georgian_supplement,
            tifinagh,
            ethiopicextended,
            supplemental_punctuation,
            cjk_radicals_supplement,
            kangxi_radicals,
            ideographic_description_characters,
            cjk_symbols_and_punctuation,
            hiragana,
            katakana,
            bopomofo,
            hangul_compatibility_jamo,
            kanbun,
            bopomofo_extended,
            cjk_strokes,
            katakana_phonetic_extensions,
            enclosed_cjk_letters_and_months,
            cjk_compatibility,
            cjk_unified_ideographs_extension_a,
            yijing_hexagram_symbols,
            cjk_unified_ideographs,
            yi_syllables,
            yi_radicals,
            modifier_tone_letters,
            syloti_nagri,
            hangul_syllables,
            high_surrogates,
            high_private_use_surrogates,
            low_surrogates,
            private_use_area,
            cjk_compatibility_ideographs,
            alphabetic_presentation_forms,
            arabic_presentation_forms_a,
            variation_selectors,
            vertical_forms,
            combining_half_marks,
            cjk_compatibility_forms,
            small_form_variants,
            arabic_presentation_forms_b,
            halfwidth_and_fullwidth_forms,
            specials,
            linear_b_syllabary,
            linear_b_ideograms,
            aegean_numbers,
            ancient_greek_numbers,
            old_italic,
            gothic,
            ugaritic,
            old_persian,
            deseret,
            shavian,
            osmanya,
            cypriot_syllabary,
            kharoshthi,
            byzantine_musical_symbols,
            musical_symbols,
            ancient_greek_musical_notation,
            tai_xuan_jing_symbols,
            mathemat,
            private_
        };
    };

    /*********************************************************************************
    ** Unicode types
    *********************************************************************************/

    // General category of a character. Enumerators are grouped by prefix
    // (letter_, mark_, number_, type_, other_, punctuation_, symbol_).
    struct category
    {
        enum type
        {
            letter_uppercase,
            letter_lowercase,
            letter_titlecase,
            letter_modifier,
            letter_other,

            mark_non_spacing,
            mark_space_combining,
            mark_enclosing,

            number_decimal_digit,
            number_letter,
            number_other,

            type_space,
            type_line,
            type_paragraph,

            other_control,
            other_format,
            other_surrogate,
            other_private_use,
            other_not_assigned,

            punctuation_connector,
            punctuation_dash,
            punctuation_open,
            punctuation_close,
            punctuation_initial_quote,
            punctuation_final_quote,
            punctuation_other,

            symbol_math,
            symbol_currency,
            symbol_modifier,
            symbol_other,

            unknown // default value for unknown characters
        };
    };

    struct join_type
    {
        enum type
        {
            none, // default value for unknown characters
            right,
            left,
            dual,
            causing,
            transparent
        };
    };

    // Bidirectional character type. The explicit values mark where each group
    // starts: strong left-to-right (1), strong right-to-left (4), weak (8),
    // neutral (16).
    struct bidi_char_type
    {
        enum type
        {
            strong_left_to_right = 1, // default value for unknown characters
            strong_left_to_right_embedding,
            strong_left_to_right_override,

            strong_right_to_left = 4,
            strong_right_to_left_arabic,
            strong_right_to_left_embedding,
            strong_right_to_left_override,

            weak_pop_direction_format = 8,
            weak_european_digits,
            weak_european_number_separator,
            weak_european_number_terminator,
            weak_arabic_number,
            weak_common_number_separator,
            weak_non_spacing_mark,
            weak_boundary_neutral,

            neutral_paragraph_separator = 16,
            neutral_segment_separator,
            neutral_whitespace,
            neutral_other
        };
    };

    // NOTE(review): "ordinary_alpabetic_and_symbol_chars" looks like a misspelling
    // of "alphabetic"; the name is kept as drafted.
    struct break_class
    {
        enum type
        {
            mandatory,
            carriage_return,
            line_feed,
            combining_marks,
            next_line,
            surrogates,
            word_joiner,
            zero_width_space,
            non_breaking,
            contingent_break_opport,
            space,
            break_opport_before_after,
            break_opport_after,
            break_opport_before,
            hyphen,
            closing_punct,
            exclamation_interrog,
            inseparable,
            non_starter,
            opening_punct,
            ambiguous_quote,
            infix_separator,
            numeric,
            postfix_numeric,
            prefix_numeric,
            symbols_allowing_breaks,
            ambiguous,
            ordinary_alpabetic_and_symbol_chars,
            hangul_lv_syllable,
            hangul_lvt_syllable,
            ideograph,
            hangul_l_jamo,
            hangul_v_jamo,
            hangul_t_jamo,
            complex_context,
            unknown // default value for unknown characters
        };
    };

    struct break_action
    {
        enum type
        {
            direct,
            indirect, // default value for unknown characters
            combiningIndirect,
            combiningProhibited,
            prohibited,
            always
        };
    };

    struct compare_level
    {
        enum type
        {
            Insensitive = 1,        // No Accent,  No case,  No punctuation  - role < roles < rule
            Accents,                // Yes Accent, No case,  No punctuation  - role < rôle < roles
            CaseAccents,            // Yes Accent, Yes case, No punctuation  - role < Role < rôle
            CaseAccentsPunctuation  // Yes Accent, Yes case, Yes punctuation - role < “role” < Role
        };
    };

    // NOTE(review): the draft wrote these two types as bare "vector" and "tuple"
    // with no template arguments. The typedefs below are provisional stand-ins
    // chosen only so that the interface is well-formed -- TODO confirm the
    // intended element and result types.
    typedef std::vector<uint16_t> sort_data;
    typedef std::pair<const codepoint*, size_t> case_mapping;

    /*********************************************************************************
    ** Functions
    *********************************************************************************/

// Declares the full set of property-query member functions. `Pure` is pasted
// after every declaration: pass `=0;` to declare the abstract interface and `;`
// to re-declare the functions in an implementing class.
#define DECLARE_UnicodeCharacterProperties(Pure)                                                        \
    virtual category::type get_category(codepoint ch) Pure                                              \
    virtual size_t get_combining_class(codepoint ch) Pure                                               \
    virtual join_type::type get_join_type(codepoint ch) Pure                                            \
    virtual bidi_char_type::type get_bidi_char_type(codepoint ch) Pure                                  \
    virtual break_class::type get_break_class(codepoint ch) Pure                                        \
                                                                                                        \
    /* forms information */                                                                             \
    virtual codepoint get_nominal_form(codepoint ch) Pure                                               \
    virtual codepoint get_left_form(codepoint ch) Pure                                                  \
    virtual codepoint get_right_form(codepoint ch) Pure                                                 \
    virtual codepoint get_medial_form(codepoint ch) Pure                                                \
                                                                                                        \
    /* blocks */                                                                                        \
    virtual range::type get_range(codepoint ch) Pure                                                    \
                                                                                                        \
    /* separators */                                                                                    \
    virtual bool is_start_of_grapheme(codepoint chPrev, codepoint ch, codepoint chNext) Pure            \
    virtual bool is_start_of_word(codepoint chPrev, codepoint ch, codepoint chNext) Pure                \
    virtual bool is_start_of_identifier(codepoint chPrev, codepoint ch, codepoint chNext) Pure          \
    virtual bool is_start_of_sentence(codepoint chPrev, codepoint ch, codepoint chNext) Pure            \
    virtual break_action::type start_of_line(codepoint chPrev, codepoint ch, codepoint chNext) Pure     \
                                                                                                        \
    /* sort */                                                                                          \
    /* note that a locale of 0 will be handled as 'default' */                                          \
    virtual void append_sort_data(uint16_t locale, codepoint ch, sort_data& dest) Pure                  \
    /* returns +1 for +ve, 0 for equal and -1 for -ve */                                                \
    virtual int compare_sort_data(const sort_data& data1,                                               \
                                  const sort_data& data2, compare_level::type nLevel) Pure              \
                                                                                                        \
    /* upper and lower case - note that a locale of 0 will be handled as 'default' */                   \
    virtual case_mapping get_uppercase(uint16_t locale, codepoint* pCh, size_t n) Pure                  \
    virtual const codepoint * get_complex_lowercase(uint16_t locale, codepoint ch) Pure                 \
    virtual const codepoint get_common_or_simple_lowercase(uint16_t locale, codepoint ch) Pure

    // Any class that will implement unicode character properties should:
    // 1. Derive from IUnicodeCharacterProperties
    // 2. Include in its header DECLARE_UnicodeCharacterProperties(;)
    class IUnicodeCharacterProperties
    {
    public:
        // Virtual so that implementations can be destroyed through this interface.
        virtual ~IUnicodeCharacterProperties() {}

        DECLARE_UnicodeCharacterProperties(=0;)
    };

    // Exactly what objects this class will compare will become clear
    // after the unicode string class has been defined.
    // It should be possible to use this class for STL containers
    // (e.g. std::map) and algorithms (e.g. std::lower_bound).
    class collation
    {
    public:
        // TODO: enable once the unicode string classes exist. The draft sketched
        // the call operators with placeholder template arguments:
        //
        //   template <...> bool operator () (
        //       const string<...> & s1, const string<...> & s2) const;
        //
        //   template <...> bool operator () (
        //       const string_with_sort_data<...> & s1,
        //       const string_with_sort_data<...> & s2) const;
    };

    // ----------------------------------------------------------------------------
    // inline helpers
    // TODO add lots based on core functions above
    //
    // Each helper comes in two forms: a classifier on an already looked-up
    // property value, and a convenience overload that performs the lookup
    // through a property provider. The lookups are member functions of
    // IUnicodeCharacterProperties, so a provider instance is required.

    // True when `cat` is one of the separator categories (space, line, paragraph):
    // "space" here stands for any kind of separator.
    inline bool is_space(category::type cat)
    {
        return cat == category::type_space
            || cat == category::type_line
            || cat == category::type_paragraph;
    }

    // True when the category `props` reports for `ch` is a separator category.
    inline bool is_space(IUnicodeCharacterProperties& props, codepoint ch)
    {
        return is_space(props.get_category(ch));
    }

    // True when `bidi` belongs to the strong left-to-right group.
    inline bool is_strong_ltor(bidi_char_type::type bidi)
    {
        return bidi == bidi_char_type::strong_left_to_right
            || bidi == bidi_char_type::strong_left_to_right_embedding
            || bidi == bidi_char_type::strong_left_to_right_override;
    }

    // True when the bidi type `props` reports for `ch` is strong left-to-right.
    inline bool is_strong_ltor(IUnicodeCharacterProperties& props, codepoint ch)
    {
        return is_strong_ltor(props.get_bidi_char_type(ch));
    }

    // True when `bidi` belongs to the strong right-to-left group.
    inline bool is_strong_rtol(bidi_char_type::type bidi)
    {
        return bidi == bidi_char_type::strong_right_to_left
            || bidi == bidi_char_type::strong_right_to_left_arabic
            || bidi == bidi_char_type::strong_right_to_left_embedding
            || bidi == bidi_char_type::strong_right_to_left_override;
    }

    // True when the bidi type `props` reports for `ch` is strong right-to-left.
    inline bool is_strong_rtol(IUnicodeCharacterProperties& props, codepoint ch)
    {
        return is_strong_rtol(props.get_bidi_char_type(ch));
    }

} // namespace unicode
} // namespace boost