Boost logo

Boost-Commit :

Subject: [Boost-commit] svn:boost r55813 - in sandbox/SOC/2009/unicode: boost/iterator boost/unicode libs/unicode/doc libs/unicode/example
From: loufoque_at_[hidden]
Date: 2009-08-27 19:24:23


Author: mgaunard
Date: 2009-08-27 19:24:22 EDT (Thu, 27 Aug 2009)
New Revision: 55813
URL: http://svn.boost.org/trac/boost/changeset/55813

Log:
More docs (motivation, concept explanation, terms) + try at making joined_n do perfect forwarding
Text files modified:
   sandbox/SOC/2009/unicode/boost/iterator/join_iterator.hpp | 60 ++++++++++
   sandbox/SOC/2009/unicode/boost/iterator/pipe_iterator_fwd.hpp | 2
   sandbox/SOC/2009/unicode/boost/unicode/cat.hpp | 4
   sandbox/SOC/2009/unicode/libs/unicode/doc/users_manual.qbk | 232 ++++++++++++++++++++++++++++++++++++++-
   sandbox/SOC/2009/unicode/libs/unicode/example/compose.cpp | 119 ++++++++++----------
   sandbox/SOC/2009/unicode/libs/unicode/example/search.cpp | 45 ++++---
   6 files changed, 369 insertions(+), 93 deletions(-)

Modified: sandbox/SOC/2009/unicode/boost/iterator/join_iterator.hpp
==============================================================================
--- sandbox/SOC/2009/unicode/boost/iterator/join_iterator.hpp (original)
+++ sandbox/SOC/2009/unicode/boost/iterator/join_iterator.hpp 2009-08-27 19:24:22 EDT (Thu, 27 Aug 2009)
@@ -14,9 +14,15 @@
 #include <boost/fusion/include/mpl.hpp>
 #include <boost/fusion/tuple.hpp>
 #include <boost/fusion/adapted.hpp>
-#include <boost/fusion/algorithm/transformation/transform.hpp>
+#include <boost/fusion/container/vector/convert.hpp>
 #include <boost/fusion/include/as_vector.hpp>
 
+#include <boost/fusion/algorithm/transformation/transform.hpp>
+#include <boost/fusion/include/transform.hpp>
+#include <boost/fusion/functional/adapter/unfused.hpp>
+
+#include <boost/functional/forward_adapter.hpp>
+
 #include <boost/variant.hpp>
 
 #include <boost/utility/enable_if.hpp>
@@ -332,6 +338,57 @@
     );
 }
 
+struct range_transformer
+{
+ template<typename>
+ struct result {};
+
+ template<typename F, typename R>
+ struct result<F(R&)>
+ {
+ typedef iterator_range<
+ typename range_iterator<R>::type
+ > type;
+ };
+
+ template<typename Range>
+ typename result<range_transformer(Range&)>::type
+ operator()(Range& r) const
+ {
+ return make_iterator_range(r);
+ }
+};
+
+struct fused_make_range_tuple
+{
+ template<typename>
+ struct result
+ {
+ };
+
+ template<typename F, typename Seq>
+ struct result<F(Seq&)>
+ {
+ typedef iterator_range< join_iterator< typename fusion::result_of::as_vector<
+ typename fusion::result_of::transform<
+ Seq const,
+ range_transformer
+ >::type
+ >::type > > type;
+ };
+
+ template<class Seq>
+ typename result<fused_make_range_tuple(Seq&)>::type operator()(Seq const & s) const
+ {
+ return joined(fusion::as_vector(fusion::transform(s, range_transformer())));
+ }
+};
+
+/*forward_adapter<
+ fusion::unfused<fused_make_range_tuple>
+> joined_n;*/
+
+#if 1
 #ifdef BOOST_UNICODE_DOXYGEN_INVOKED
 template<typename... T>
 iterator_range<
@@ -357,6 +414,7 @@
     /**/
 BOOST_PP_REPEAT_FROM_TO(1, 5, BOOST_ITERATOR_JOIN_DEF, ~)
 #endif
+#endif
 
 } // namespace boost
 

Modified: sandbox/SOC/2009/unicode/boost/iterator/pipe_iterator_fwd.hpp
==============================================================================
--- sandbox/SOC/2009/unicode/boost/iterator/pipe_iterator_fwd.hpp (original)
+++ sandbox/SOC/2009/unicode/boost/iterator/pipe_iterator_fwd.hpp 2009-08-27 19:24:22 EDT (Thu, 27 Aug 2009)
@@ -123,8 +123,8 @@
                 return pos;
         }
 
+private:
         typedef typename Pipe::output_type T;
-
         friend class boost::iterator_core_access;
 
         T dereference() const

Modified: sandbox/SOC/2009/unicode/boost/unicode/cat.hpp
==============================================================================
--- sandbox/SOC/2009/unicode/boost/unicode/cat.hpp (original)
+++ sandbox/SOC/2009/unicode/boost/unicode/cat.hpp 2009-08-27 19:24:22 EDT (Thu, 27 Aug 2009)
@@ -100,7 +100,7 @@
    \pre \c Range1 and \c Range2 are in Normalized Form nf, have the same value type and are non-empty.
    \post \c out is in Normalized Form nf and is stream-safe. */ \
 template<typename Range1, typename Range2, typename OutputIterator, typename... T> \
-OutputIterator name##_concat(const Range1& range1, const Range2& range2, OutputIterator out, const T&...);
+OutputIterator name##_concat(const Range1& range1, const Range2& range2, OutputIterator out, const T&... args);
 #else
 #define BOOST_UNICODE_COMPOSE_CONCAT_DEF(name, nf, pipe, n) \
 BOOST_PP_REPEAT(BOOST_PP_INC(n), BOOST_UNICODE_COMPOSE_CONCAT_DEF_A, (name)(pipe))
@@ -133,7 +133,7 @@
    \pre \c Range1 and \c Range2 are in Normalized Form nf, have the same value type and are non-empty.
    \return Lazy stream-safe range in Normalized Form nf. */ \
 template<typename Range1, typename Range2, typename... T> \
-detail::unspecified<void> name##_concated(const Range1& range1, const Range2& range2, const T&...);
+detail::unspecified<void> name##_concated(const Range1& range1, const Range2& range2, const T&... args);
 #else
 #define BOOST_UNICODE_COMPOSE_CONCATED_DEF(name, nf, pipe, n) \
 BOOST_PP_REPEAT(BOOST_PP_INC(n), BOOST_UNICODE_COMPOSE_CONCATED_DEF_A, (name)(pipe))

Modified: sandbox/SOC/2009/unicode/libs/unicode/doc/users_manual.qbk
==============================================================================
--- sandbox/SOC/2009/unicode/libs/unicode/doc/users_manual.qbk (original)
+++ sandbox/SOC/2009/unicode/libs/unicode/doc/users_manual.qbk 2009-08-27 19:24:22 EDT (Thu, 27 Aug 2009)
@@ -58,8 +58,66 @@
 
 [endsect]
 
+[section Motivation]
+
+Most software applications need, at one moment or another, to deal with
+text in a natural language, to analyze it or perform certain operations
+on it.
+
+Solutions to represent and deal with text in languages based on a latin
+alphabet appeared very early in the history of software, but not only
+did they not fulfill all linguistic needs, they only worked for a subset
+of the languages they were supposed to deal with.
+Solutions to deal with text in other languages then emerged after that
+over the years, but were still restricted to a specific language.
+
+Since the late 80's, effort has been made to create a universal solution to
+represent and deal with text in any language, in particular due to
+internationalization considerations.
+
+The Unicode standard was thus born, which not only provides means to
+encode text from any natural language ever created into a digital form,
+but also categorizes characters, allows identifying graphemes, words,
+sentences and lines, and allows performing case conversion and sorting
+text, either in a language-agnostic or language-tailored manner.
+
+This library aims at providing the mechanisms to deal with text in a
+natural language, in a language- and platform-agnostic way, using the
+foundations of the Unicode standard.
+
+In particular, the ability to have a proper abstraction of what a character
+is is deemed important.
+
+[endsect] [/Motivation]
+
 [section Introduction to Unicode]
 
+[section Notion of character]
+
+[variablelist Language terms:
+ [[Character] [a unit of information that roughly corresponds to a grapheme, grapheme-like unit, or symbol, such as in an alphabet or syllabary in the written form of a natural language.]]
+ [[Grapheme] [the fundamental unit in written language.]]
+ [[Glyph] [an element of writing in a natural language, visual variants of the abstract unit known as grapheme.]]
+]
+
+As can be seen, what a character is is loosely defined. There are various
+levels at which to approximate it within Unicode, which are further
+explained below.
+
+[variablelist Unicode terms:
+ [[Code unit] [the unit in terms of which the string of text is encoded, in one of the Unicode transformation formats.]]
+ [[Code point] [the unit, made of potentially several code units, in which Unicode defines most of its non-encoding-specific operations.]]
+ [[Combining character sequence] [a sequence of code points that is the unit for the Unicode composition and decomposition processes.]]
+ [[Grapheme cluster] [a cluster of code points that form a grapheme.]]
+]
+
+The general idea is that "character" in Unicode usually refers to code points,
+and higher levels of abstractions are usually referred to as "abstract
+characters", or directly with their actual name.
+
+[endsect] [/Notion of character]
+
+
 [section Character set]
 The Unicode character set is a mapping that associates *code points*, which are integers, to characters for any writing system or language.
 
@@ -102,26 +160,26 @@
 
 [endsect]
 
-[section Combining character characters]
+[section Combining character sequences]
 
 A *non-combining code point* may be followed by an arbitrary number of *combining code points* to form a single *combining character sequence*, which is really a composite character.
 
 Certain characters are only available as a combination of multiple code points, while some, the ones that are expected to be the most used,
 are also available as a single precomposed code point. The order of the combined code points may also vary, but all code points combinations
-leading to the same character are still canonically equivalent.
+leading to the same abstract character are still canonically equivalent.
 
 While a combining character sequence can be arbitrarily big, the Unicode standard also introduces the concept of a *stream-safe string*, where
-a it is at most 31 code points long, which is largely sufficient for any real use.
+a combining character sequence is at most 31 code points long, which is largely above what is sufficient for any linguistic use.
 
 [endsect]
 
 [section Grapheme clusters]
 
-Another yet higher-level abstraction of character is that of a *grapheme cluster*, which really corresponds to what a human would call a character.
+Another yet higher-level abstraction of character is that of a *grapheme cluster*, i.e. a cluster of *code points* that constitutes a *grapheme*.
 All *combining character sequences* are graphemes, but there are other sequences of *code points* that are as well; for example =\r\n= is one.
 
-For certain classes of applications, such as word processors, it is important to operate at the *grapheme* level rather than at the *code point* or
-*combining character sequence* one.
+For certain classes of applications, such as word processors, it can be important to operate at the *grapheme* level rather than at the *code point* or
+*combining character sequence* one, as this is what the document is composed in terms of.
 
 [endsect]
 
@@ -198,6 +256,123 @@
 
 [endsect]
 
+[section The concepts behind this library]
+
+Two concepts are of utmost importance within this library, the [conceptref Consumer]
+concept, which is used for segmentation of text, and, more importantly,
+the [conceptref Pipe] concept, which is used for conversion, including
+transcoding and normalization.
+
+[heading Consumer]
+A model of the [conceptref Consumer] concept is a class that takes an
+input range, specified as two iterators, and consumes it left-to-right
+or right-to-left, that is to say it returns an iterator with the new
+begin in the case of left-to-right consuming, and the new end in the case
+of right-to-left consuming.
+
+Semantically, a right-to-left consuming done after a left-to-right consuming
+should restore the original position. Indeed, both primitives need to
+be provided in a symmetric way in order to implement bidirectional
+iteration.
+
+Here is an example of a consumer that consumes one element in a range
+of integers:
+``struct element_consumer
+{
+ typedef int input_type;
+
+ template<typename In>
+ In ltr(In begin, In end)
+ {
+ return ++begin;
+ }
+
+ template<typename In>
+ In rtl(In begin, In end)
+ {
+ return --end;
+ }
+};``
+
+A model of the [conceptref Consumer] concept may then be used to segment
+a range. The [classref boost::consumer_iterator], eventually invoked
+by [funcref boost::consumed], can be used to exploit that concept to
+turn a range into a range of subranges.
+
+With the above example, the range would be converted into a range of
+subranges, each subrange being exactly one element.
+
+[heading Pipe]
+A model of the [conceptref Pipe] concept is a class that takes an input
+range, specified as two iterators, consumes it left-to-right
+or right-to-left, writes some elements to an output iterator, and returns
+the new begin in the case of left-to-right consuming or the new end
+in the case of right-to-left consuming, as well as the new output
+iterator.
+
+In terms of semantics, not only does the consuming need to be symmetric,
+but the output shall also be the same for a given consumed subrange,
+whatever the consuming direction.
+Furthermore, the output shall always be ordered left-to-right, even when
+applying the pipe right-to-left.
+
+Here is an example of a pipe that converts two adjacent elements into the
+two numbers reversed, in a range of integers that must have an
+even number of elements; indeed, for the two operations to be symmetric
+here, there is not really another way.
+``struct reverse2_pipe
+{
+ typedef int input_type;
+ typedef int output_type;
+ typedef mpl::int_<2> max_output;
+
+ template<typename In, typename Out>
+ std::pair<In, Out> ltr(In begin, In end, Out out)
+ {
+ int i = *begin++;
+ if(begin == end)
+ throw std::out_of_range();
+
+ *out++ = *begin++;
+ *out++ = i;
+ return std::make_pair(begin, out);
+ }
+
+ template<typename In, typename Out>
+ std::pair<In, Out> rtl(In begin, In end, Out out)
+ {
+ *out++ = *--end;
+ if(end == begin)
+ throw std::out_of_range();
+
+ *out++ = *--end;
+ return std::make_pair(end, out);
+ }
+};``
+
+A model of the [conceptref Pipe] concept may then be used to perform
+a many-to-many conversion on a whole range, be it eagerly (by calling
+the pipe repeatedly) or lazily (by evaluating it step by step as an
+iterator adapter is advanced).
+
+The [funcref boost::pipe] function provides the former, while the
+[funcref boost::piped] function which returns a range in terms of
+[classref boost::pipe_iterator] provides the latter.
+
+With the above example, a piped range [1, 2, 3, 4] would be converted
+to [2, 1, 4, 3].
+
+[heading OneManyPipe]
+
+Additionally, there is a refinement of the [conceptref Pipe] concept named
+[conceptref OneManyPipe], where one element is converted to many.
+
+This allows avoiding the consuming altogether so that the pipe can be
+defined as a single function that takes a value, an output iterator,
+and returns it.
+
+[endsect]
+
 [section Overview]
 
 [section Range operations]
@@ -234,7 +409,7 @@
 
 [heading Segmentation]
 Segmentations are expressed in terms of the [conceptref Consumer] concept, which is inherently
-very similar to the [conceptref Pipe] concept excepts it doesn't perform any kind of transformation,
+very similar to the [conceptref Pipe] concept except it doesn't perform any kind of transformation,
 it just reads part of the input.
 As a matter of fact, a =Pipe= can be converted to =Consumer= using [classref boost::pipe_consumer].
 
@@ -357,12 +532,13 @@
 Different approaches to do that are possible:
 
 * [conceptref Pipe]- or [conceptref Consumer]-based, you may simply run classic search algorithms, such as
-the ones from Boost.StringAlgo, with ranges of the appropriate elements, that elements can be ranges
-themselves (subranges returned by [classref boost::consumer_iterator] are =EqualityComparable=).
+the ones from Boost.StringAlgo, with ranges of the appropriate elements -- those elements being able
+to be ranges themselves (subranges returned by [classref boost::consumer_iterator] are =EqualityComparable=).
 * [conceptref BoundaryChecker]-based, the classic algorithms are run, then false positives
 that don't lie on the right boundaries are discarded. This has the advantage of reducing conversion and
 iteration overhead in certain situations.
-The most practical way to achieve this is to adapt a =Finder= in Boost.StringAlgo with [classref boost::algorithm::boundary_finder].
+The most practical way to achieve this is to adapt a =Finder= in Boost.StringAlgo with [classref boost::algorithm::boundary_finder],
+and the boundary you are interested in testing, for example [classref boost::unicode::utf_grapheme_boundary].
 
 [important You will have to normalize input before the search if you want canonically equivalent things
 to compare equal.]
@@ -372,6 +548,23 @@
 [endsect] [/Overview]
 
 [section User's Guide]
+[section Organization]
+
+[caution The organization of headers may change in the future in order to
+improve compile times.]
+
+[variablelist Main headers
+ [[[headerref boost/cuchar.hpp]] [Primitive types for UTF code units.]]
+ [[[headerref boost/unicode/utf.hpp]] [Conversion between UTF encodings.]]
+ [[[headerref boost/unicode/static_utf.hpp]] [Compile-time conversion between UTF encodings.]]
+ [[[headerref boost/unicode/graphemes.hpp]] [Functions to iterate and identify graphemes.]]
+ [[[headerref boost/unicode/compose.hpp]] [Functions to compose, decompose and normalize unicode strings.]]
+ [[[headerref boost/unicode/cat.hpp]] [Functions to concatenate normalized strings while maintaining a normalized form.]]
+ [[[headerref boost/unicode/search.hpp]] [Utility to adapt Boost.StringAlgo finders to discard matches that do not lie on certain boundaries.]]
+ [[[headerref boost/unicode/ucd/properties.hpp]] [Access to the properties attached to a code point in the Unicode Character Database.]]
+]
+
+[endsect] [/Organization]
 [endsect] [/User's Guide]
 
 [section Examples]
@@ -513,6 +706,16 @@
 
 [section Appendix B: Rationale]
 
+[heading Iterators rather than streams]
+The library chooses to base itself upon iterator adapters rather than
+upon streams, even though the latter were designed for conversion
+facilities with buffering and can be configured with locales.
+
+That choice was made because it is believed that the iterator and
+range abstractions are more flexible and easier to deal with, and that
+they are also considerably more efficient.
+
+
 [heading Pipe concept]
 Centralizing conversion into a single [conceptref Pipe] model allows
 eager and lazy variants of evaluation to be possible for any conversion
@@ -530,6 +733,11 @@
 
 [section Appendix C: Future Work]
 
+[heading Type deduction]
+A type deduction, similar to that of Boost.Fusion =result_of= namespace,
+would be useful as a lot of the functions do not have a specified return
+type.
+
 [heading Non-checked UTF conversion]
 The library only provides UTF conversion pipes that do extensive checking
 that the input is correct and that the end is not unexpectedly met.
@@ -543,6 +751,10 @@
 is in a normalized form, which could be used to avoid expensive decomposition
 and recomposition.
 
+[heading Forwarding]
+Certain functions that take ranges and return them adapted do not perform
+correct forwarding, and return an adapted const range instead.
+
 [heading Unicode String type]
 Future versions of the library could provide a string type that maintains
 the following invariants: valid UTF, stream-safe and in Normalization Form C.

Modified: sandbox/SOC/2009/unicode/libs/unicode/example/compose.cpp
==============================================================================
--- sandbox/SOC/2009/unicode/libs/unicode/example/compose.cpp (original)
+++ sandbox/SOC/2009/unicode/libs/unicode/example/compose.cpp 2009-08-27 19:24:22 EDT (Thu, 27 Aug 2009)
@@ -1,98 +1,99 @@
 //[ compose
 /*`
-Example in development.
+This example shows how to decompose, recompose, normalize strings and
+how to maintain a normalized form while concatenating two strings.
 */
 
 #include <boost/unicode/compose.hpp>
-#include <boost/foreach.hpp>
-#include <boost/range/as_array.hpp>
-#include <iostream>
-#include <iterator>
-
 #include <boost/unicode/cat.hpp>
 
-#include <boost/range/adaptor/reversed.hpp>
 #include <boost/range/algorithm/copy.hpp>
 #include <boost/assign/list_of.hpp>
 
+#include <iostream>
+#include <iterator>
+
 namespace unicode = boost::unicode;
 namespace ucd = unicode::ucd;
 using boost::assign::list_of;
 using boost::char32;
 
+/*`
+We're going to do a lot of copying to std::cout with spaces to delimit
+the elements in this example, so we instantiate the iterator once:
+*/
+static std::ostream_iterator<char32> output(std::cout, " ");
+
 int main()
 {
- boost::char32 cp = 0x1E17;
+ char32 cp = 0x1E17;
+
+ // We want all results in hexadecimal
+ std::cout << std::hex;
     
     std::cout << "Decomposition of U+01E17 within the UCD: ";
- BOOST_FOREACH(boost::char32 c, ucd::get_decomposition(cp))
- std::cout << "0x" << std::hex << c << ' ';
+ boost::copy(ucd::get_decomposition(cp), output);
     std::cout << std::endl;
     std::cout << "Decomposition type: " << as_string(ucd::get_decomposition_type(cp)) << std::endl;
     
- boost::char32 baz[] = { cp, 0x330 };
     std::cout << "Canonical decomposition: ";
- std::cout << unicode::composed(unicode::decomposed(baz)) << std::endl;
- std::cout << "reversed: " << boost::make_reversed_range(unicode::composed(unicode::decomposed(baz))) << std::endl;
- std::cout << unicode::normalized(baz) << std::endl;
- std::cout << "reversed: " << boost::make_reversed_range(unicode::normalized(baz));
- std::cout << std::endl << std::endl;
+ unicode::decompose(list_of(cp), output);
+ std::cout << std::endl;
     
     std::cout << "Canonical decomposition of U+00A8: ";
- unicode::decompose(list_of(0xA8), std::ostream_iterator<boost::char32>(std::cout, " "));
+ unicode::decompose(list_of(0xA8), output);
     std::cout << std::endl;
     std::cout << "Compatibility decomposition of U+00A8: ";
- unicode::decompose(list_of(0xA8), std::ostream_iterator<boost::char32>(std::cout, " "), UINT_MAX);
+ unicode::decompose(list_of(0xA8), output, UINT_MAX);
     std::cout << std::endl;
     std::cout << std::endl;
     
- boost::char32 foo[] = { 0x113, 0x301 };
- std::cout << "Canonical composition of { " << boost::as_array(foo) << " }: ";
- unicode::compose(foo, std::ostream_iterator<boost::char32>(std::cout, " "));
+ char32 foo[] = { 0x113, 0x301 };
+ std::cout << "Canonical composition of { ";
+ boost::copy(foo, output);
+ std::cout << "}: ";
+ unicode::compose(foo, output);
     std::cout << std::endl;
     
- boost::char32 foo2[] = { 0x65, 0x304, 0x301 };
- std::cout << "Canonical composition of { " << boost::as_array(foo2) << " }: ";
- unicode::compose(foo2, std::ostream_iterator<boost::char32>(std::cout, " "));
+ char32 foo2[] = { 0x65, 0x304, 0x301 };
+ std::cout << "Canonical composition of { ";
+ boost::copy(foo2, output);
+ std::cout << "}: ";
+ unicode::compose(foo2, output);
     std::cout << std::endl;
     
- boost::char32 bar[] = { 0x20, 0x308 };
- std::cout << "Canonical composition of { " << boost::as_array(bar) << " }: ";
- unicode::compose(bar, std::ostream_iterator<boost::char32>(std::cout, " "));
+ char32 bar[] = { 0x20, 0x308 };
+ std::cout << "Canonical composition of { ";
+ boost::copy(bar, output);
+ std::cout << "}: ";
+ unicode::compose(bar, output);
     std::cout << std::endl << std::endl;
     
- //unicode::decomposed_concat(list_of<char32>(0x48)(0x65)(0x304)(0x301), list_of<char32>(0x330)(0x49), std::ostream_iterator<boost::char32>(std::cout, " "));
- std::cout << std::endl;
-
- boost::char32 tmp[] = {0x48, 0x1e17};
- boost::iterator_range<
- boost::pipe_iterator<
- boost::char32*,
- unicode::decomposer
- >
- > r = unicode::decomposed(tmp);
-
- std::cout << "Distance: " << std::distance(
- begin(r),
- prior(end(r))
- ) << std::endl;
-
- boost::char32 tmp2[] = {0x330, 0x49};
- unicode::composed_concat(list_of<char32>(0x48)(0x1e17), list_of<char32>(0x330)(0x49), std::ostream_iterator<boost::char32>(std::cout, " "));
- std::cout << std::endl;
-
- std::cout << unicode::composed_concated(tmp, tmp2) << std::endl;
-
- std::cout << "0xAC00: ";
- unicode::decompose(list_of<char32>(0xAC00), std::ostream_iterator<boost::char32>(std::cout, " "));
- std::cout << std::endl;
-
- std::cout << "0xAC01: ";
- unicode::decompose(list_of<char32>(0xAC01), std::ostream_iterator<boost::char32>(std::cout, " "));
- std::cout << std::endl;
-
- std::cout << "Composing 0x1100 0x1161 0x11a8: ";
- unicode::compose(list_of<char32>(0x1100)(0x1161)(0x11a8), std::ostream_iterator<char32>(std::cout, " "));
+ char32 baz[] = { cp, 0x330 };
+ std::cout << "Normalization C of { ";
+ boost::copy(baz, output);
+ std::cout << "}: ";
+ unicode::normalize(baz, output);
+ std::cout << std::endl;
+
+ char32 cat_dec1[] = { 0x48, 0x65, 0x304, 0x301 };
+ char32 cat_dec2[] = { 0x330, 0x49 };
+ std::cout << "Concatenation of the two decomposed strings { ";
+ boost::copy(cat_dec1, output);
+ std::cout << "} and { ";
+ boost::copy(cat_dec2, output);
+ std::cout << "}: ";
+ unicode::decomposed_concat(cat_dec1, cat_dec2, output);
+ std::cout << std::endl;
+
+ char32 cat_comp1[] = { 0x48, 0x1e17 };
+ char32 cat_comp2[] = { 0x330, 0x49 };
+ std::cout << "Concatenation of the two composed strings { ";
+ boost::copy(cat_comp1, output);
+ std::cout << "} and { ";
+ boost::copy(cat_comp2, output);
+ std::cout << "}: ";
+ unicode::composed_concat(cat_comp1, cat_comp2, output);
     std::cout << std::endl;
 }
 //]

Modified: sandbox/SOC/2009/unicode/libs/unicode/example/search.cpp
==============================================================================
--- sandbox/SOC/2009/unicode/libs/unicode/example/search.cpp (original)
+++ sandbox/SOC/2009/unicode/libs/unicode/example/search.cpp 2009-08-27 19:24:22 EDT (Thu, 27 Aug 2009)
@@ -1,13 +1,18 @@
 //[ search
 /*`
-Example in development.
+This example shows how to search a substring within a string,
+at the grapheme level, using two methods.
+
+In this example we're going to use =BOOST_AUTO= as the
+return type of some of the functions is unspecified, but ideally
+you should try to avoid that dependency by not naming the variables at all,
+or rely on a type deduction system the library doesn't provide yet.
 */
 
 #include <boost/algorithm/string.hpp>
 #include <boost/unicode/search.hpp>
 
-#include <boost/foreach.hpp>
-#include <boost/foreach_auto.hpp>
+#include <boost/typeof/typeof.hpp>
 #include <iostream>
 
 #include <boost/unicode/graphemes.hpp>
@@ -17,34 +22,34 @@
 
 int main()
 {
+/*`
+We define the string we're going to search into, "foo<combining circumflex accent>foo"
+as well as its version in terms of graphemes
+*/
     char foo[] = "foo\xcc\x82" "foo";
- //BOOST_AUTO(foo, unicode::utf_grapheme_bounded(foo_));
+ BOOST_AUTO(foo_bounded, unicode::utf_grapheme_bounded(boost::as_literal(foo)));
     
+//` We do the same thing for the string we're going to look for, "foo"
     char search[] = "foo";
+ BOOST_AUTO(search_bounded, unicode::utf_grapheme_bounded(boost::as_literal(search)));
     
- //BOOST_AUTO(s, unicode::utf_grapheme_bounded(boost::as_literal(search)));
- BOOST_AUTO(s, boost::as_literal(search));
+//` We perform the search using the ranges of graphemes, i.e. the [conceptref Consumer]-based approach:
+ BOOST_AUTO(range_consumer, boost::algorithm::find_first(foo_bounded, search_bounded));
     
- // Boost.StringAlgo
+//` We perform the search using the original range, but using an adapted Boost.StringAlgo Finder with the relevant [conceptref BoundaryChecker]:
     BOOST_AUTO(finder,
         boost::algorithm::make_boundary_finder(
- boost::algorithm::first_finder(s),
+ boost::algorithm::first_finder(search),
             unicode::utf_grapheme_boundary()
         )
     );
+ boost::iterator_range<char*> range_boundary = boost::algorithm::find(foo, finder);
     
- BOOST_AUTO(f, unicode::make_boundary_finder(
- unicode::make_simple_finder(s),
- unicode::utf_grapheme_boundary()
- ));
-
- //BOOST_AUTO(range, f.ltr(boost::begin(foo), boost::end(foo)));
- BOOST_AUTO(range, boost::algorithm::find(foo, finder));
+//` We now display the resulting matches, which should both be pointing to the second occurrence, with their positions within the original UTF-8 string:
+ std::cout << "[" << std::distance(boost::begin(foo), range_consumer.begin().base()) << ", " << std::distance(boost::begin(foo), range_consumer.end().base()) << "] ";
+ std::cout << range_consumer << std::endl;
     
- std::cout << "[" << std::distance(boost::begin(foo), range.begin()) << ", " << std::distance(boost::begin(foo), range.end()) << "] ";
-
- BOOST_FOREACH_AUTO(r, range)
- std::cout << r;
- std::cout << std::endl;
+ std::cout << "[" << std::distance(boost::begin(foo), range_boundary.begin()) << ", " << std::distance(boost::begin(foo), range_boundary.end()) << "] ";
+ std::cout << range_boundary << std::endl;
 }
 //]


Boost-Commit list run by bdawes at acm.org, david.abrahams at rcn.com, gregod at cs.rpi.edu, cpdaniel at pacbell.net, john at johnmaddock.co.uk