// ---------------------------------------------------------------------------- // automatically generated header // /********************************************************************************* ** AUTOMATICALLY GENERATED - DO NOT ALTER ** This is automatically generated from blocks.txt, taking each entry and ** replacing '-' with '_' and removing all ' ' ** Note: ucr_Private is added to this list *********************************************************************************/ enum eUniRange { ucr_Latin1Supplement, ucr_LatinExtendedA, ucr_LatinExtendedB, ucr_IPAExtensions, ucr_SpacingModifierLetters, ucr_CombiningDiacriticalMarks, ucr_GreekandCoptic, ucr_Cyrillic, ucr_CyrillicSupplement, ucr_Armenian, ucr_Hebrew, ucr_Arabic, ucr_Syriac, ucr_ArabicSupplement, ucr_Thaana, ucr_Devanagari, ucr_Bengali, ucr_Gurmukhi, ucr_Gujarati, ucr_Oriya, ucr_Tamil, ucr_Telugu, ucr_Kannada, ucr_Malayalam, ucr_Sinhala, ucr_Thai, ucr_Lao, ucr_Tibetan, ucr_Myanmar, ucr_Georgian, ucr_HangulJamo, ucr_Ethiopic, ucr_EthiopicSupplement, ucr_Cherokee, ucr_UnifiedCanadianAboriginalSyllabics, ucr_Ogham, ucr_Runic, ucr_Tagalog, ucr_Hanunoo, ucr_Buhid, ucr_Tagbanwa, ucr_Khmer, ucr_Mongolian, ucr_Limbu, ucr_TaiLe, ucr_NewTaiLue, ucr_KhmerSymbols, ucr_Buginese, ucr_PhoneticExtensions, ucr_PhoneticExtensionsSupplement, ucr_CombiningDiacriticalMarksSupplement, ucr_LatinExtendedAdditional, ucr_GreekExtended, ucr_GeneralPunctuation, ucr_SuperscriptsandSubscripts, ucr_CurrencySymbols, ucr_CombiningDiacriticalMarksforSymbols, ucr_LetterlikeSymbols, ucr_NumberForms, ucr_Arrows, ucr_MathematicalOperators, ucr_MiscellaneousTechnical, ucr_ControlPictures, ucr_OpticalCharacterRecognition, ucr_EnclosedAlphanumerics, ucr_BoxDrawing, ucr_BlockElements, ucr_GeometricShapes, ucr_MiscellaneousSymbols, ucr_Dingbats, ucr_MiscellaneousMathematicalSymbols_A, ucr_SupplementalArrows_A, ucr_BraillePatterns, ucr_SupplementalArrowsB, ucr_MiscellaneousMathematicalSymbolsB, ucr_SupplementalMathematicalOperators, ucr_MiscellaneousSymbolsandArrows, ucr_Glagolitic, ucr_Coptic, ucr_GeorgianSupplement, ucr_Tifinagh, ucr_EthiopicExtended, ucr_SupplementalPunctuation, ucr_CJKRadicalsSupplement, ucr_KangxiRadicals, ucr_IdeographicDescriptionCharacters, ucr_CJKSymbolsandPunctuation, ucr_Hiragana, ucr_Katakana, ucr_Bopomofo, ucr_HangulCompatibilityJamo, ucr_Kanbun, ucr_BopomofoExtended, ucr_CJKStrokes, ucr_KatakanaPhoneticExtensions, ucr_EnclosedCJKLettersandMonths, ucr_CJKCompatibility, ucr_CJKUnifiedIdeographsExtensionA, ucr_YijingHexagramSymbols, ucr_CJKUnifiedIdeographs, ucr_YiSyllables, ucr_YiRadicals, ucr_ModifierToneLetters, ucr_SylotiNagri, ucr_HangulSyllables, ucr_HighSurrogates, ucr_HighPrivateUseSurrogates, ucr_LowSurrogates, ucr_PrivateUseArea, ucr_CJKCompatibilityIdeographs, ucr_AlphabeticPresentationForms, ucr_ArabicPresentationForms_A, ucr_VariationSelectors, ucr_VerticalForms, ucr_CombiningHalfMarks, ucr_CJKCompatibilityForms, ucr_SmallFormVariants, ucr_ArabicPresentationForms_B, ucr_HalfwidthandFullwidthForms, ucr_Specials, ucr_LinearBSyllabary, ucr_LinearBIdeograms, ucr_AegeanNumbers, ucr_AncientGreekNumbers, ucr_OldItalic, ucr_Gothic, ucr_Ugaritic, ucr_OldPersian, ucr_Deseret, ucr_Shavian, ucr_Osmanya, ucr_CypriotSyllabary, ucr_Kharoshthi, ucr_ByzantineMusicalSymbols, ucr_MusicalSymbols, ucr_AncientGreekMusicalNotation, ucr_TaiXuanJingSymbols, ucr_Mathemat, ucr_Private, ucr__Count }; // ---------------------------------------------------------------------------- // fixed definitions // enum eUniCategoryLetter { uccle_Uppercase = 0, uccle_Lowercase, uccle_Titlecase, uccle_Modifier, uccle_Other, uccle__Count }; enum eUniCategoryMark { uccma_NonSpacing = 0, uccma_SpaceCombining, uccma_MarkEnclosing, uccma__Count }; enum eUniCategoryNumber { uccnu_DecimalDigit = 0, uccnu_Letter, uccnu_Other, uccnu__Count }; enum eUniCategorySeperator { uccse_Space = 0, uccse_Line, uccse_Paragraph, uccse__Count }; enum eUniCategoryOther { ucco_Control = 0, ucco_Format, ucco_Surrogate, ucco_PrivateUse, ucco_NotAssigned, ucco__Count }; enum eUniCategoryPunctuation { uccpu_Connector = 0, uccpu_Dash, uccpu_Open, uccpu_Close, uccpu_InitialQuote, uccpu_FinalQuote, uccpu_Other, uccpu__Count }; enum eUniCategorySymbol { uccsy_Math = 0, uccsy_Currency, uccsy_Modifier, uccsy_Other, uccsy__Count }; enum eUniJoinType { ucjty_None = 0, ucjty_Right, ucjty_Left, ucjty_Dual, ucjty_Causing, ucjty_Transparent, ucjty__Count }; enum eUniBreakClass { ucbcl_Mandatory = 0, ucbcl_CarriageReturn, ucbcl_LineFeed, ucbcl_CombiningMarks, ucbcl_NextLine, ucbcl_Surrogates, ucbcl_WordJoiner, ucbcl_ZeroWidthSpace, ucbcl_NonBreaking, ucbcl_ContingentBreakOpport, ucbcl_Space, ucbcl_BreakOpportBeforeAfter, ucbcl_BreakOpportAfter, ucbcl_BreakOpportBefore, ucbcl_Hyphen, ucbcl_ClosingPunct, ucbcl_ExclamationInterrog, ucbcl_Hyphen, ucbcl_Inseperable, ucbcl_NonStarter, ucbcl_OpeningPuct, ucbcl_AmbiguousQuote, ucbcl_InfixSeperator, ucbcl_Numeric, ucbcl_PostfixNumeric, ucbcl_PrefixNumeric, ucbcl_SymbolsAllowingBreaks, ucbcl_Ambiguous, ucbcl_OrdinaryAlpabeticAndSymbolChars, ucbcl_HangulLVSyllable, ucbcl_HangulLVTSyllable, ucbcl_Ideograph, ucbcl_HangulLJamo, ucbcl_HangulVJamo, ucbcl_HangulTJamo, ucbcl_ComplexContext, ucbcl_Unknown, ucbcl__Count }; enum eUniBreakAction { ucbac_Direct = 0, ucbac_Indirect, ucbac_CombiningIndirect, ucbac_CombiningProhibited, ucbac_Prohibited, ucbac_Explicit, ucbac__Count }; enum eUniBiDiCharType { ucbct_StrongLeftToRight = 0x100, ucbct_StrongLeftToRightEmbedding, ucbct_StrongLeftToRightOverride, ucbct_StrongRightToLeft = 0x200, ucbct_StrongRightToLeftArabic, ucbct_StrongRightToLeftEmbedding, ucbct_StrongRightToLeftOverride, ucbct_WeakPopDirectionFormat = 0x1000, ucbct_WeakEuropeanDigits, ucbct_WeakEuropeanNumberSeperator, ucbct_WeakEuropeanNumberTerminator, ucbct_WeakArabicNumber, ucbct_WeakCommonNumberSeperator, ucbct_WeakNonSpacingMark, ucbct_WeakBoundaryNeutral, ucbct_NeutralParagraphSeperator = 0x2000, ucbct_NeutralSegmentSeperator, ucbct_NeutralWhiteppace, ucbct_NeutralsOther, }; enum uniCompareLevel { ucle_One = 1, // Use for case and accent insensitive comparison ucle_Two, // See unicode spec ucle_Three, // See unicode spec ucle_Four, // Identical for equivalence. This level MUST ALWAYS be used when ordering }; // ---------------------------------------------------------------------------- // core function definitions // size_t uniCombiningClass(char32_t ch); eUniCase uniCase(char32_t ch); eUniCategoryLetter uniCategoryLetter(char32_t ch); eUniCategoryMark uniCategoryMark(char32_t ch); eUniCategoryNumber uniCategoryNumber(char32_t ch); eUniCategorySeperator uniCategorySeperator(char32_t ch); eUniCategoryOther uniCategoryOther(char32_t ch); eUniCategoryPunctuation uniCategoryPunctuation(char32_t ch); eUniCategorySymbol uniCategorySymbol(char32_t ch); eUniJoinType uniJoinType(char32_t ch); eUniBiDiCharType uniBiDiType(char32_t ch); // blocks BOOL uniPage0(char32_t ch); eUniRange uniRange(char32_t ch); // break information // Parameters: // pass 0 in where a character is not available, e.g. at the start or end of a string BOOL uniStartOfGrapheme(char32_t chPrev, char32_t ch, char32_t chNext); BOOL uniStartOfWord(char32_t chPrev, char32_t ch, char32_t chNext); BOOL uniStartOfSentence(char32_t chPrev, char32_t ch, char32_t chNext); eUniBreakAction uniStartOfLine(char32_t chPrev, char32_t ch, char32_t chNext); // Parameters: // pch is a pointer to a string // nChars are the number of characters in the string that could be processed - must be >= 1 // Returns: // char32_t is a the upper case character // int is the number of lowercase characters that were used to find the uppercase character tuple uniGetUppercase(char32_t * pch, size_t nChars); // Parameters: // ch is the character to be processed // Returns: // const char32_t * is a pointer to a zero terminated set of char32_t that are the lower case version // THIS CAN RETURN NULL if no complex version is available const char32_t * uniGetComplexLowercase(char32_t ch); // Parameters: // ch is the character to be processed // Returns: // const char32_t * is a pointer to a zero terminated set of char32_t that are the lower case version // THIS CAN RETURN NULL if no complex version is available - if NULL is returned use uniGetCommonOrSimpleLowercase const char32_t * uniGetComplexLowercase(char32_t ch); // Parameters: // ch is the character to be processed // Returns: // char32_t is a the lower case character - note: uniGetComplexLowercase should be tried first char32_t uniGetCommonOrSimpleLowercase(char32_t ch); // Parameters: // pDest is a vector that will have it's contents replaced by the sort data void uniGetSortData(const char32_t * pch, int nChars, vector& pDest); void uniGetSortData(const char16_t * pch, int nDataLength, vector& pDest); int uniCompareSortData(const vector& pData1, const vector& pData2, uniCompareLevel nLevel); // forms char32_t uniGetNominalForm(char32_t ch); char32_t uniGetLeftForm(char32_t ch); char32_t uniGetRightForm(char32_t ch); char32_t uniGetMedialForm(char32_t ch); // ---------------------------------------------------------------------------- // inline helpers // TODO add lots based on core functions above // inline BOOL uniSpace(char16_t ch) { // space for any kind of separator. return (uniCategorySeperator(ch) == uccs_Space); } inline BOOL uniStrongLTOR(char16_t ch) { // space for any kind of separator. return ((uniBiDiType(ch) & 0xF00) == 0x100); } inline BOOL uniStrongRTOL(char16_t ch) { // space for any kind of separator. return ((uniBiDiType(ch) & 0xF00) == 0x200); }