#ifndef UNICODE_HPP_INCLUDED
#define UNICODE_HPP_INCLUDED

/*********************************************************************************
** Unicode character-property interface.
**
** This file is the concatenation of four headers (UnicodeDerivedTypes.hpp,
** UnicodeTypes.hpp, UnicodeCharImpl.hpp and Unicode.hpp).  It declares the
** property enumerations, the `unicode::functions` dispatch table that the
** implementation fills in through `unicode::Initialise`, and thin inline
** wrappers that call through that table.
**
** NOTE(review): the template argument lists of `vector` and `tuple` were
** missing from the text this file was restored from.  They have been
** reconstructed as std::vector / std::tuple; the places where the element
** type is an assumption are marked below - confirm against the implementation.
*********************************************************************************/

#include <cstddef>
#include <tuple>
#include <vector>

// char16_t / char32_t are built-in types from C++11 onwards (and library
// typedefs in MSVC 2010+), where re-declaring them is an error.  The legacy
// typedefs are therefore only provided for older compilers.
// NOTE(review): the legacy typedefs are not 16/32 bits wide on every platform.
#ifndef __UNICODEINTEGRALTYPES
#if !(defined(__cplusplus) && __cplusplus >= 201103L) && !(defined(_MSC_VER) && _MSC_VER >= 1600)
typedef unsigned long char32_t;
typedef unsigned int char16_t;
#endif
#endif //__UNICODEINTEGRALTYPES

/*********************************************************************************
** UnicodeDerivedTypes.hpp         AUTOMATICALLY GENERATED FILE - DO NOT ALTER
*********************************************************************************/
namespace unicode
{
    /*********************************************************************************
    ** This is automatically generated from blocks.txt, taking each entry and
    ** replacing '-' with '_' and removing all ' '
    ** Note: Private is added to this list
    **
    ** NOTE(review): `Mathemat` looks like a truncated block name and the
    ** `_A` / `B` suffixes are inconsistent; the names are kept as generated
    ** because callers may refer to them.
    *********************************************************************************/
    struct range
    {
        enum type
        {
            Latin1Supplement = 0,
            LatinExtendedA,
            LatinExtendedB,
            IPAExtensions,
            SpacingModifierLetters,
            CombiningDiacriticalMarks,
            GreekandCoptic,
            Cyrillic,
            CyrillicSupplement,
            Armenian,
            Hebrew,
            Arabic,
            Syriac,
            ArabicSupplement,
            Thaana,
            Devanagari,
            Bengali,
            Gurmukhi,
            Gujarati,
            Oriya,
            Tamil,
            Telugu,
            Kannada,
            Malayalam,
            Sinhala,
            Thai,
            Lao,
            Tibetan,
            Myanmar,
            Georgian,
            HangulJamo,
            Ethiopic,
            EthiopicSupplement,
            Cherokee,
            UnifiedCanadianAboriginalSyllabics,
            Ogham,
            Runic,
            Tagalog,
            Hanunoo,
            Buhid,
            Tagbanwa,
            Khmer,
            Mongolian,
            Limbu,
            TaiLe,
            NewTaiLue,
            KhmerSymbols,
            Buginese,
            PhoneticExtensions,
            PhoneticExtensionsSupplement,
            CombiningDiacriticalMarksSupplement,
            LatinExtendedAdditional,
            GreekExtended,
            GeneralPunctuation,
            SuperscriptsandSubscripts,
            CurrencySymbols,
            CombiningDiacriticalMarksforSymbols,
            LetterlikeSymbols,
            NumberForms,
            Arrows,
            MathematicalOperators,
            MiscellaneousTechnical,
            ControlPictures,
            OpticalCharacterRecognition,
            EnclosedAlphanumerics,
            BoxDrawing,
            BlockElements,
            GeometricShapes,
            MiscellaneousSymbols,
            Dingbats,
            MiscellaneousMathematicalSymbols_A,
            SupplementalArrows_A,
            BraillePatterns,
            SupplementalArrowsB,
            MiscellaneousMathematicalSymbolsB,
            SupplementalMathematicalOperators,
            MiscellaneousSymbolsandArrows,
            Glagolitic,
            Coptic,
            GeorgianSupplement,
            Tifinagh,
            EthiopicExtended,
            SupplementalPunctuation,
            CJKRadicalsSupplement,
            KangxiRadicals,
            IdeographicDescriptionCharacters,
            CJKSymbolsandPunctuation,
            Hiragana,
            Katakana,
            Bopomofo,
            HangulCompatibilityJamo,
            Kanbun,
            BopomofoExtended,
            CJKStrokes,
            KatakanaPhoneticExtensions,
            EnclosedCJKLettersandMonths,
            CJKCompatibility,
            CJKUnifiedIdeographsExtensionA,
            YijingHexagramSymbols,
            CJKUnifiedIdeographs,
            YiSyllables,
            YiRadicals,
            ModifierToneLetters,
            SylotiNagri,
            HangulSyllables,
            HighSurrogates,
            HighPrivateUseSurrogates,
            LowSurrogates,
            PrivateUseArea,
            CJKCompatibilityIdeographs,
            AlphabeticPresentationForms,
            ArabicPresentationForms_A,
            VariationSelectors,
            VerticalForms,
            CombiningHalfMarks,
            CJKCompatibilityForms,
            SmallFormVariants,
            ArabicPresentationForms_B,
            HalfwidthandFullwidthForms,
            Specials,
            LinearBSyllabary,
            LinearBIdeograms,
            AegeanNumbers,
            AncientGreekNumbers,
            OldItalic,
            Gothic,
            Ugaritic,
            OldPersian,
            Deseret,
            Shavian,
            Osmanya,
            CypriotSyllabary,
            Kharoshthi,
            ByzantineMusicalSymbols,
            MusicalSymbols,
            AncientGreekMusicalNotation,
            TaiXuanJingSymbols,
            Mathemat,
            Private,
            __Count
        };
    };
}

/*********************************************************************************
** UnicodeTypes.hpp
*********************************************************************************/
namespace unicode
{
    // Result of Initialise().
    struct initialise
    {
        enum type
        {
            Success = 0,
            VersionNotSupported = 1,
            __Count
        };
    };

    struct categoryCasing
    {
        enum type { Uppercase = 0, Lowercase, Titlecase, Modifier, Other, __Count };
    };

    struct categoryMark
    {
        enum type { NonSpacing = 0, SpaceCombining, MarkEnclosing, __Count };
    };

    struct categoryNumber
    {
        enum type { DecimalDigit = 0, Letter, Other, __Count };
    };

    struct categorySeperator
    {
        enum type { Space = 0, Line, Paragraph, __Count };
    };

    struct categoryOther
    {
        enum type { Control = 0, Format, Surrogate, PrivateUse, NotAssigned, __Count };
    };

    struct categoryPunctuation
    {
        enum type { Connector = 0, Dash, Open, Close, InitialQuote, FinalQuote, Other, __Count };
    };

    struct categorySymbol
    {
        enum type { Math = 0, Currency, Modifier, Other, __Count };
    };

    struct joinType
    {
        enum type { None = 0, Right, Left, Dual, Causing, Transparent, __Count };
    };

    // Bidirectional character types.  The value encodes the group:
    // 0x1xx = strong left-to-right, 0x2xx = strong right-to-left,
    // 0x10xx = weak, 0x20xx = neutral.  uniStrongLTOR / uniStrongRTOL rely
    // on this layout.
    struct biDiCharType
    {
        enum type
        {
            StrongLeftToRight = 0x100,
            StrongLeftToRightEmbedding,
            StrongLeftToRightOverride,
            StrongRightToLeft = 0x200,
            StrongRightToLeftArabic,
            StrongRightToLeftEmbedding,
            StrongRightToLeftOverride,
            WeakPopDirectionFormat = 0x1000,
            WeakEuropeanDigits,
            WeakEuropeanNumberSeperator,
            WeakEuropeanNumberTerminator,
            WeakArabicNumber,
            WeakCommonNumberSeperator,
            WeakNonSpacingMark,
            WeakBoundaryNeutral,
            NeutralParagraphSeperator = 0x2000,
            NeutralSegmentSeperator,
            NeutralWhiteppace,
            NeutralsOther
        };
    };

    struct breakClass
    {
        enum type
        {
            Mandatory = 0,
            CarriageReturn,
            LineFeed,
            CombiningMarks,
            NextLine,
            Surrogates,
            WordJoiner,
            ZeroWidthSpace,
            NonBreaking,
            ContingentBreakOpport,
            Space,
            BreakOpportBeforeAfter,
            BreakOpportAfter,
            BreakOpportBefore,
            Hyphen,
            ClosingPunct,
            ExclamationInterrog,
            Inseperable,
            NonStarter,
            OpeningPuct,
            AmbiguousQuote,
            InfixSeperator,
            Numeric,
            PostfixNumeric,
            PrefixNumeric,
            SymbolsAllowingBreaks,
            Ambiguous,
            OrdinaryAlpabeticAndSymbolChars,
            HangulLVSyllable,
            HangulLVTSyllable,
            Ideograph,
            HangulLJamo,
            HangulVJamo,
            HangulTJamo,
            ComplexContext,
            Unknown,
            __Count
        };
    };

    struct breakAction
    {
        enum type { Direct = 0, Indirect, CombiningIndirect, CombiningProhibited, Prohibited, Explicit, __Count };
    };

    struct compareLevel
    {
        enum type
        {
            Insensitive = 1,        // No Accent,  No case,  No punctuation,  No tie-breaker - role < roles < rule
            Accents,                // Yes Accent, No case,  No punctuation,  No tie-breaker - role < rôle < roles
            CaseAccents,            // Yes Accent, Yes case, No punctuation,  No tie-breaker - role < Role < rôle
            CaseAccentsPunctuation, // Yes Accent, Yes case, Yes punctuation, No tie-breaker - role < “role” < Role
            Full                    // Full comparison used for sorting
        };
    };

    // Container holding the sort data produced by AppendSortData / GetSortData.
    // NOTE(review): the element type was lost from the restored text; char32_t
    // is an assumption - confirm against the implementation before relying on it.
    typedef std::vector<char32_t> sortData;

    // Dispatch table filled in by Initialise().  Every wrapper further down
    // calls through one of these pointers.
    struct functions
    {
        int m_nStructureVersion;

        int (*pFnVersion)();
        std::size_t (*pFnCombiningClass)(char32_t);

        categoryCasing::type (*pFnCategoryCasing)(char32_t);
        categoryMark::type (*pFnCategoryMark)(char32_t);
        categoryNumber::type (*pFnCategoryNumber)(char32_t);
        categorySeperator::type (*pFnCategorySeperator)(char32_t);
        categoryOther::type (*pFnCategoryOther)(char32_t);
        categoryPunctuation::type (*pFnCategoryPunctuation)(char32_t);
        categorySymbol::type (*pFnCategorySymbol)(char32_t);

        joinType::type (*pFnJoinType)(char32_t);
        biDiCharType::type (*pFnBiDiCharType)(char32_t);
        breakClass::type (*pFnBreakClass)(char32_t);

        bool (*pFnPage0)(char32_t);
        range::type (*pFnRange)(char32_t);

        // Boundary tests; the arguments are (previous, current, next).
        bool (*pFnStartOfGrapheme)(char32_t, char32_t, char32_t);
        bool (*pFnStartOfWord)(char32_t, char32_t, char32_t);
        bool (*pFnStartOfIdentifier)(char32_t, char32_t, char32_t);
        bool (*pFnStartOfSentence)(char32_t, char32_t, char32_t);
        breakAction::type (*pFnStartOfLine)(char32_t, char32_t, char32_t);

        void (*pFnAppendSortData)(char32_t, sortData&);
        // Returns +1 for +ve, 0 for equal and -1 for -ve.
        // NOTE(review): this was declared as returning void although the
        // CompareSortData wrapper is documented to return the comparison
        // result; declared as int here - confirm the implementation matches.
        int (*pFnCompareSortData)(const sortData&, const sortData&, compareLevel::type);

        // Parameters:
        //      char32_t* is a pointer to a string
        //      size_t is the number of characters in the string that could be processed - must be >= 1
        // Returns:
        //      char32_t is the upper case character
        //      size_t is the number of lowercase characters that were used to find the uppercase character
        std::tuple<char32_t, std::size_t> (*pFnGetUppercase)(char32_t*, std::size_t);

        // Parameters:
        //      char32_t is the character to be processed
        // Returns:
        //      const char32_t * is a pointer to a zero terminated set of char32_t that are the lower case version
        //      THIS CAN RETURN NULL if no complex version is available - if NULL is returned use
        //      pFnGetCommonOrSimpleLowercase
        const char32_t* (*pFnGetComplexLowercase)(char32_t);

        // Parameters:
        //      char32_t is the character to be processed
        // Returns:
        //      char32_t is the lower case character - note: pFnGetComplexLowercase should be tried first
        char32_t (*pFnGetCommonOrSimpleLowercase)(char32_t);

        // forms
        char32_t (*pFnGetNominalForm)(char32_t);
        char32_t (*pFnGetLeftForm)(char32_t);
        char32_t (*pFnGetRightForm)(char32_t);
        char32_t (*pFnGetMedialForm)(char32_t);
    };
}

/*********************************************************************************
** UnicodeCharImpl.hpp
*********************************************************************************/
namespace unicode
{
// __declspec is only understood by Windows toolchains; elsewhere the
// decoration expands to nothing.
#if defined(_WIN32) || defined(__CYGWIN__)
#ifdef _IMPORT_UNICHAR
// import definition
#define UNICHAR_EXTERN __declspec(dllimport)
#else
// export definition
#define UNICHAR_EXTERN __declspec(dllexport)
#endif //!_IMPORT_UNICHAR
#else
#define UNICHAR_EXTERN
#endif

    // Fills in `fns`; the result reports whether the requested structure
    // version is supported.
    initialise::type UNICHAR_EXTERN Initialise(functions& fns);
}

/*********************************************************************************
** Unicode.hpp
**
** Inline wrappers around the dispatch table.  None of them checks pFns or the
** individual function pointers for NULL.
*********************************************************************************/
namespace unicode
{
    inline int Version(functions* pFns)
        { return (*pFns->pFnVersion)(); }

    inline std::size_t CombiningClass(functions* pFns, char32_t ch)
        { return (*pFns->pFnCombiningClass)(ch); }

    inline categoryCasing::type CategoryCasing(functions* pFns, char32_t ch)
        { return (*pFns->pFnCategoryCasing)(ch); }

    inline categoryMark::type CategoryMark(functions* pFns, char32_t ch)
        { return (*pFns->pFnCategoryMark)(ch); }

    inline categoryNumber::type CategoryNumber(functions* pFns, char32_t ch)
        { return (*pFns->pFnCategoryNumber)(ch); }

    inline categorySeperator::type CategorySeperator(functions* pFns, char32_t ch)
        { return (*pFns->pFnCategorySeperator)(ch); }

    inline categoryOther::type CategoryOther(functions* pFns, char32_t ch)
        { return (*pFns->pFnCategoryOther)(ch); }

    inline categoryPunctuation::type CategoryPunctuation(functions* pFns, char32_t ch)
        { return (*pFns->pFnCategoryPunctuation)(ch); }

    inline categorySymbol::type CategorySymbol(functions* pFns, char32_t ch)
        { return (*pFns->pFnCategorySymbol)(ch); }

    inline joinType::type JoinType(functions* pFns, char32_t ch)
        { return (*pFns->pFnJoinType)(ch); }

    inline biDiCharType::type BiDiCharType(functions* pFns, char32_t ch)
        { return (*pFns->pFnBiDiCharType)(ch); }

    inline breakClass::type BreakClass(functions* pFns, char32_t ch)
        { return (*pFns->pFnBreakClass)(ch); }

    // blocks
    inline bool Page0(functions* pFns, char32_t ch)
        { return (*pFns->pFnPage0)(ch); }

    inline range::type Range(functions* pFns, char32_t ch)
        { return (*pFns->pFnRange)(ch); }

    // ----------------------------------------------------------------------------
    // separators
    //
    // Each boundary test comes in three flavours:
    //   * (chPrev, ch, chNext) as characters;
    //   * a single iterator `ch`: the characters at ch - 1, ch and ch + 1 are
    //     used, so `ch` must be a bidirectional iterator with a valid
    //     neighbour on both sides;
    //   * three iterators that are each dereferenced once.
    // ----------------------------------------------------------------------------
    inline bool StartOfGrapheme(functions* pFns, char32_t chPrev, char32_t ch, char32_t chNext)
        { return (*pFns->pFnStartOfGrapheme)(chPrev, ch, chNext); }

    template <class inputIterator>
    bool StartOfGrapheme(functions* pFns, inputIterator ch)
    {
        // Work on copies: modifying `ch` several times inside one argument
        // list has no defined evaluation order.
        inputIterator chPrev = ch;
        --chPrev;
        inputIterator chNext = ch;
        ++chNext;
        return (*pFns->pFnStartOfGrapheme)(*chPrev, *ch, *chNext);
    }

    template <class inputIterator>
    bool StartOfGrapheme(functions* pFns, inputIterator chPrev, inputIterator ch, inputIterator chNext)
        { return (*pFns->pFnStartOfGrapheme)(*chPrev, *ch, *chNext); }

    inline bool StartOfWord(functions* pFns, char32_t chPrev, char32_t ch, char32_t chNext)
        { return (*pFns->pFnStartOfWord)(chPrev, ch, chNext); }

    template <class inputIterator>
    bool StartOfWord(functions* pFns, inputIterator ch)
    {
        inputIterator chPrev = ch;
        --chPrev;
        inputIterator chNext = ch;
        ++chNext;
        return (*pFns->pFnStartOfWord)(*chPrev, *ch, *chNext);
    }

    template <class inputIterator>
    bool StartOfWord(functions* pFns, inputIterator chPrev, inputIterator ch, inputIterator chNext)
        { return (*pFns->pFnStartOfWord)(*chPrev, *ch, *chNext); }

    inline bool StartOfIdentifier(functions* pFns, char32_t chPrev, char32_t ch, char32_t chNext)
        { return (*pFns->pFnStartOfIdentifier)(chPrev, ch, chNext); }

    template <class inputIterator>
    bool StartOfIdentifier(functions* pFns, inputIterator ch)
    {
        inputIterator chPrev = ch;
        --chPrev;
        inputIterator chNext = ch;
        ++chNext;
        return (*pFns->pFnStartOfIdentifier)(*chPrev, *ch, *chNext);
    }

    template <class inputIterator>
    bool StartOfIdentifier(functions* pFns, inputIterator chPrev, inputIterator ch, inputIterator chNext)
        { return (*pFns->pFnStartOfIdentifier)(*chPrev, *ch, *chNext); }

    inline bool StartOfSentence(functions* pFns, char32_t chPrev, char32_t ch, char32_t chNext)
        { return (*pFns->pFnStartOfSentence)(chPrev, ch, chNext); }

    template <class inputIterator>
    bool StartOfSentence(functions* pFns, inputIterator ch)
    {
        inputIterator chPrev = ch;
        --chPrev;
        inputIterator chNext = ch;
        ++chNext;
        return (*pFns->pFnStartOfSentence)(*chPrev, *ch, *chNext);
    }

    template <class inputIterator>
    bool StartOfSentence(functions* pFns, inputIterator chPrev, inputIterator ch, inputIterator chNext)
        { return (*pFns->pFnStartOfSentence)(*chPrev, *ch, *chNext); }

    inline breakAction::type StartOfLine(functions* pFns, char32_t chPrev, char32_t ch, char32_t chNext)
        { return (*pFns->pFnStartOfLine)(chPrev, ch, chNext); }

    template <class inputIterator>
    breakAction::type StartOfLine(functions* pFns, inputIterator ch)
    {
        inputIterator chPrev = ch;
        --chPrev;
        inputIterator chNext = ch;
        ++chNext;
        return (*pFns->pFnStartOfLine)(*chPrev, *ch, *chNext);
    }

    template <class inputIterator>
    breakAction::type StartOfLine(functions* pFns, inputIterator chPrev, inputIterator ch, inputIterator chNext)
        { return (*pFns->pFnStartOfLine)(*chPrev, *ch, *chNext); }

    // ----------------------------------------------------------------------------
    // sort data
    // ----------------------------------------------------------------------------

    // Appends the sort data of a single character to `dest`.
    inline void AppendSortData(functions* pFns, const char32_t ch, sortData& dest)
        { (*pFns->pFnAppendSortData)(ch, dest); }

    // Replaces the contents of `dest` with the sort data of [first, last).
    // nEstimatedChars is only a capacity hint (four entries are reserved per
    // estimated character); values <= 0 are ignored.
    template <class inputIterator>
    void GetSortData(functions* pFns, int nEstimatedChars, inputIterator first, inputIterator last, sortData& dest)
    {
        dest.clear();
        if (nEstimatedChars > 0)
            dest.reserve(static_cast<sortData::size_type>(nEstimatedChars) * 4);
        for (; first != last; ++first)
            (*pFns->pFnAppendSortData)(*first, dest);
    }

    // returns +1 for +ve, 0 for equal and -1 for -ve
    inline int CompareSortData(functions* pFns, const sortData& data1, const sortData& data2,
                               compareLevel::type nLevel)
        { return (*pFns->pFnCompareSortData)(data1, data2, nLevel); }

    // ----------------------------------------------------------------------------
    // forms information
    // ----------------------------------------------------------------------------
    inline char32_t GetNominalForm(functions* pFns, char32_t ch)
        { return (*pFns->pFnGetNominalForm)(ch); }

    inline char32_t GetLeftForm(functions* pFns, char32_t ch)
        { return (*pFns->pFnGetLeftForm)(ch); }

    inline char32_t GetRightForm(functions* pFns, char32_t ch)
        { return (*pFns->pFnGetRightForm)(ch); }

    inline char32_t GetMedialForm(functions* pFns, char32_t ch)
        { return (*pFns->pFnGetMedialForm)(ch); }

    // ----------------------------------------------------------------------------
    // upper and lower case
    // ----------------------------------------------------------------------------

    // Uppercases the character(s) starting at pCh; `n` is the number of
    // characters available (must be >= 1).  Returns the uppercase character
    // and the number of source characters it was built from.
    inline std::tuple<char32_t, std::size_t> GetUppercase(functions* pFns, char32_t* pCh, std::size_t n)
        { return (*pFns->pFnGetUppercase)(pCh, n); }

    // Replaces the contents of `dest` with the uppercase form of [first, last).
    // nEstimatedChars is only a capacity hint; values <= 0 are ignored.
    template <class inputIterator>
    void GetUppercase(functions* pFns, int nEstimatedChars, inputIterator first, inputIterator last,
                      std::vector<char32_t>& dest)
    {
        // pFnGetUppercase needs contiguous storage, so buffer the input first.
        std::vector<char32_t> tblChars;
        if (nEstimatedChars > 0)
            tblChars.reserve(static_cast<std::vector<char32_t>::size_type>(nEstimatedChars));
        for (; first != last; ++first)
            tblChars.push_back(*first);

        dest.clear();
        // this is a poor estimate of the number of uppercase characters, but it is the best we can do
        dest.reserve(tblChars.size());

        std::size_t n = 0;
        while (n < tblChars.size())
        {
            const std::tuple<char32_t, std::size_t> result =
                (*pFns->pFnGetUppercase)(&tblChars[n], tblChars.size() - n);
            dest.push_back(std::get<0>(result));
            // Always make progress, even if the implementation reports that
            // it consumed nothing; otherwise this loop would never end.
            const std::size_t nConsumed = std::get<1>(result);
            n += (nConsumed != 0) ? nConsumed : 1;
        }
    }

    // May return NULL when no complex lowercase mapping is available; use
    // GetCommonOrSimpleLowercase in that case.
    inline const char32_t* GetComplexLowercase(functions* pFns, char32_t ch)
        { return (*pFns->pFnGetComplexLowercase)(ch); }

    inline const char32_t GetCommonOrSimpleLowercase(functions* pFns, char32_t ch)
        { return (*pFns->pFnGetCommonOrSimpleLowercase)(ch); }

    // Writes the lowercase form of the single character *src to `dest`: the
    // zero terminated complex mapping when there is one (the terminator is
    // not written), otherwise the common/simple mapping.
    // `src` and `dest` are taken by value, so the caller's iterators are not
    // advanced.
    template <class inputIterator, class outputIterator>
    void GetLowercase(functions* pFns, inputIterator src, outputIterator dest)
    {
        const char32_t* pData = (*pFns->pFnGetComplexLowercase)(*src);
        if (pData != NULL)
        {
            for (; *pData != 0; ++pData)
                *dest++ = *pData;
            return;
        }
        *dest++ = (*pFns->pFnGetCommonOrSimpleLowercase)(*src);
    }

    // ----------------------------------------------------------------------------
    // inline helpers
    // TODO add lots based on core functions above
    // ----------------------------------------------------------------------------

    // True when the separator category of `ch` is Space.
    inline bool uniSpace(functions* pFns, char16_t ch)
    {
        return ((*pFns->pFnCategorySeperator)(ch) == categorySeperator::Space);
    }

    // True when `ch` has one of the strong left-to-right bidi types (0x1xx).
    inline bool uniStrongLTOR(functions* pFns, char16_t ch)
    {
        return (((*pFns->pFnBiDiCharType)(ch) & 0xF00) == 0x100);
    }

    // True when `ch` has one of the strong right-to-left bidi types (0x2xx).
    inline bool uniStrongRTOL(functions* pFns, char16_t ch)
    {
        return (((*pFns->pFnBiDiCharType)(ch) & 0xF00) == 0x200);
    }
}

#endif // UNICODE_HPP_INCLUDED