Boost logo

Boost-Commit :

Subject: [Boost-commit] svn:boost r55578 - in sandbox/SOC/2009/unicode: boost/iterator boost/unicode libs/unicode/example
From: loufoque_at_[hidden]
Date: 2009-08-13 22:25:09


Author: mgaunard
Date: 2009-08-13 22:25:08 EDT (Thu, 13 Aug 2009)
New Revision: 55578
URL: http://svn.boost.org/trac/boost/changeset/55578

Log:
Normalization support
Added:
   sandbox/SOC/2009/unicode/boost/unicode/cat.hpp (contents, props changed)
Text files modified:
   sandbox/SOC/2009/unicode/boost/iterator/pipe_iterator.hpp | 77 ++++++++++++++++++++++++
   sandbox/SOC/2009/unicode/boost/unicode/compose.hpp | 1
   sandbox/SOC/2009/unicode/boost/unicode/compose_fwd.hpp | 127 ++++++++++++++++++++++++++++++++++-----
   sandbox/SOC/2009/unicode/boost/unicode/hangul.hpp | 17 +++++
   sandbox/SOC/2009/unicode/libs/unicode/example/compose.cpp | 8 ++
   5 files changed, 210 insertions(+), 20 deletions(-)

Modified: sandbox/SOC/2009/unicode/boost/iterator/pipe_iterator.hpp
==============================================================================
--- sandbox/SOC/2009/unicode/boost/iterator/pipe_iterator.hpp (original)
+++ sandbox/SOC/2009/unicode/boost/iterator/pipe_iterator.hpp 2009-08-13 22:25:08 EDT (Thu, 13 Aug 2009)
@@ -6,6 +6,8 @@
 
 #include <boost/range.hpp>
 #include <boost/mpl/int.hpp>
+#include <boost/mpl/times.hpp>
+#include <boost/tuple/tuple.hpp>
 
 #include <boost/concept/requires.hpp>
 #include <boost/range/concepts.hpp>
@@ -58,6 +60,81 @@
         return one_many_pipe<OneManyPipe>(p);
 }
 
+/* TODO: make it work for pipes that don't expose max_output */
+/** Model of \c \xmlonly<conceptname>Pipe</conceptname>\endxmlonly
+ * constructed from two models of \c \xmlonly<conceptname>Pipe</conceptname>\endxmlonly
+ * and that applies one after the other (the output of \c P1 is fed to \c P2). */
+template<typename P1, typename P2>
+struct multi_pipe
+{
+ BOOST_CONCEPT_ASSERT((PipeConcept<P1>));
+ BOOST_CONCEPT_ASSERT((PipeConcept<P2>));
+
+ BOOST_CONCEPT_ASSERT((Convertible<typename P1::output_type, typename P2::input_type>)); // P1's output must be usable as P2's input
+
+ typedef typename P1::input_type input_type; // the composite consumes what P1 consumes
+ typedef typename P2::output_type output_type; // ...and produces what P2 produces
+
+ typedef typename mpl::times< // composite bound = P1's bound times P2's bound
+ typename P1::max_output,
+ typename P2::max_output
+ >::type max_output;
+
+ multi_pipe() {} // both stages default-constructed
+ multi_pipe(P1 p1_, P2 p2_ = P2()) : p1(p1_), p2(p2_) {} // stores copies of both stages
+
+ template<typename In, typename Out>
+ std::pair<In, Out> ltr(In begin, In end, Out out) // one p1.ltr step, then its output is piped through p2; returns (input position after p1's step, advanced out)
+ {
+ typename P1::output_type buf[max_output::value]; // scratch buffer for p1's output (sized with the composite bound)
+ typename P1::output_type* b = buf;
+
+ std::pair<In, typename P1::output_type*> pair = p1.ltr(begin, end, buf);
+ typename P1::output_type* e = pair.second; // one past the last element p1 wrote
+
+ do // NOTE(review): p2 runs at least once, even when b == e - confirm p1 always writes something
+ {
+ tie(b, out) = p2.ltr(b, e, out); // p2 returns (new read position, new out)
+ }
+ while(b != e); // repeat until p2 has consumed everything p1 wrote
+
+ return std::make_pair(pair.first, out);
+ }
+
+ template<typename In, typename Out>
+ std::pair<In, Out> rtl(In begin, In end, Out out) // same as ltr, except that the first stage is driven through p1.rtl
+ {
+ typename P1::output_type buf[max_output::value]; // scratch buffer for p1's output (sized with the composite bound)
+ typename P1::output_type* b = buf;
+
+ std::pair<In, typename P1::output_type*> pair = p1.rtl(begin, end, buf);
+ typename P1::output_type* e = pair.second; // one past the last element p1 wrote
+
+ do // NOTE(review): p2 runs at least once, even when b == e - confirm p1 always writes something
+ {
+ tie(b, out) = p2.ltr(b, e, out); // NOTE(review): the second stage uses ltr here, not rtl - confirm this is intended
+ }
+ while(b != e);
+
+ return std::make_pair(pair.first, out);
+ }
+
+private:
+ P1 p1; // first stage
+ P2 p2; // second stage
+};
+
+template<typename P1, typename P2> // factory: deduces P1 and P2 from the arguments
+BOOST_CONCEPT_REQUIRES(
+ ((PipeConcept<P1>))
+ ((PipeConcept<P2>))
+ ((Convertible<typename P1::output_type, typename P2::input_type>)),
+ (multi_pipe<P1, P2>)
+) make_multi_pipe(P1 p1, P2 p2) // same concept requirements as multi_pipe itself asserts
+{
+ return multi_pipe<P1, P2>(p1, p2); // both pipes are copied into the returned composite
+}
+
 /** Model of \c \xmlonly<conceptname>OneManyPipe</conceptname>\endxmlonly
  * that casts its input to its template parameter and writes it to its output. */
 template<typename T>

Added: sandbox/SOC/2009/unicode/boost/unicode/cat.hpp
==============================================================================
--- (empty file)
+++ sandbox/SOC/2009/unicode/boost/unicode/cat.hpp 2009-08-13 22:25:08 EDT (Thu, 13 Aug 2009)
@@ -0,0 +1,6 @@
+#ifndef BOOST_UNICODE_CAT_HPP
+#define BOOST_UNICODE_CAT_HPP
+
+
+
+#endif

Modified: sandbox/SOC/2009/unicode/boost/unicode/compose.hpp
==============================================================================
--- sandbox/SOC/2009/unicode/boost/unicode/compose.hpp (original)
+++ sandbox/SOC/2009/unicode/boost/unicode/compose.hpp 2009-08-13 22:25:08 EDT (Thu, 13 Aug 2009)
@@ -11,6 +11,7 @@
 
 BOOST_UNICODE_PIPE_DEF(compose, 0)
 BOOST_UNICODE_PIPE_DEF(decompose, 1)
+BOOST_UNICODE_PIPE_DEF(normalize, 1)
 
 } // namespace unicode
 } // namespace boost

Modified: sandbox/SOC/2009/unicode/boost/unicode/compose_fwd.hpp
==============================================================================
--- sandbox/SOC/2009/unicode/boost/unicode/compose_fwd.hpp (original)
+++ sandbox/SOC/2009/unicode/boost/unicode/compose_fwd.hpp 2009-08-13 22:25:08 EDT (Thu, 13 Aug 2009)
@@ -7,9 +7,18 @@
 #include <boost/integer/static_pow.hpp>
 #include <climits>
 
-#include <boost/iterator/pipe_iterator.hpp>
 #include <vector>
 
+#include <boost/throw_exception.hpp>
+#include <stdexcept>
+#ifndef BOOST_NO_STD_LOCALE
+#include <sstream>
+#include <ios>
+#endif
+
+#include <boost/detail/unspecified.hpp>
+#include <boost/iterator/pipe_iterator.hpp>
+
 namespace boost
 {
 namespace unicode
@@ -21,6 +30,30 @@
 #undef BOOST_UNICODE_OPTION
 #endif
 
+namespace detail
+{
+ struct combining_pred // binary predicate: orders code points by their combining class
+ {
+ bool operator()(char32 lft, char32 rgt) const
+ {
+ return ucd::get_combining_class(lft) < ucd::get_combining_class(rgt); // strict "less than" on the combining class only
+ }
+ };
+
+ template<typename Size, typename Iterator, typename Comp> // Size::value is the capacity of the temporary buffer
+ void stable_sort_bounded(Iterator begin, Iterator end, Comp comp = std::less<typename std::iterator_traits<Iterator>::value_type>()) // stable sort of [begin, end) with comp
+ {
+#if defined(__GLIBCPP__) || defined(__GLIBCXX__) || defined(__SGI_STL_PORT) || defined(_STLPORT_VERSION)
+ typename std::iterator_traits<Iterator>::value_type buf[Size::value]; // local array handed to the sort as its working buffer
+ return std::__stable_sort_adaptive(begin, end, buf, Size::value, comp); // library-internal helper (reserved name), only used on the implementations tested above
+#else
+ return std::stable_sort(begin, end, comp); // portable fallback
+#endif
+ }
+
+}
+
+/* TODO: add a special case for when Out is a RandomAccessIterator */
 /** Model of \c \xmlonly<conceptname>Pipe</conceptname>\endxmlonly
  * that decomposes a combining character sequence, i.e. it transforms a combining
  * character sequence into its canonically ordered decomposed equivalent.
@@ -31,53 +64,105 @@
     typedef char32 input_type;
     typedef char32 output_type;
     
+ typedef mpl::int_<31> max_output;
+
     decomposer(unsigned mask_ = BOOST_UNICODE_OPTION(ucd::decomposition_type::canonical)) : mask(mask_)
     {
     }
     
- /** \post \c out is in Normalization Form D. */
+ /** Throws \c std::out_of_range if [<tt>begin</tt>, <tt>end</tt>[ is not stream-safe.
+ * \post \c out is in Normalization Form D. */
     template<typename In, typename Out>
- std::pair<In, Out> ltr(In begin, In end, Out out)
+ std::pair<In, Out> ltr(In begin, In end, Out out, bool inverse = false)
     {
+ In pos = begin;
+
+ char32 buf[max_output::value];
+ char32* out_pos = buf;
+
+ bool to_sort = false;
         do
         {
- char32 ch = *begin;
+ char32 ch = *pos;
             if(ucd::get_combining_class(ch) != 0)
- {
- // canonical reorder, not handled yet
- }
+ to_sort = true;
         
             iterator_range<const char32*> dec = ucd::get_decomposition(ch);
             if(!empty(dec) && ((1 << ucd::get_decomposition_type(ch)) & mask))
             {
- out = pipe(dec, *this, out); // we decompose recursively
+ for(const char32* p = boost::begin(dec); p != boost::end(dec); ++p)
+ out_pos = decompose_rec(*p, out_pos);
             }
             else if(BOOST_UNICODE_OPTION(ucd::decomposition_type::canonical) & mask)
             {
- out = hangul_decomposer()(ch, out);
+ if((out_pos + hangul_decomposer::len(ch) - 1) != (buf + max_output::value))
+ out_pos = hangul_decomposer()(ch, out_pos);
+ else
+ not_stream_safe(begin, end);
+ }
+ else if(out_pos != (buf + max_output::value))
+ {
+ *out_pos++ = ch;
             }
             else
             {
- *out++ = ch;
+ not_stream_safe(begin, end);
             }
             
- ++begin;
+ ++pos;
         }
- while(begin != end && ucd::get_combining_class(*begin) != 0);
+ while(pos != end && ((!inverse && ucd::get_combining_class(*pos) != 0) || (inverse && ucd::get_combining_class(*pos) == 0)));
         
- return std::make_pair(begin, out);
+ if(to_sort)
+ detail::stable_sort_bounded<max_output>(buf, out_pos, detail::combining_pred());
+
+ out = std::copy(buf, out_pos, out);
+ return std::make_pair(pos, out);
     }
     
- /** \post \c out is in Normalization Form D. */
+ /** Throws \c std::out_of_range if [<tt>begin</tt>, <tt>end</tt>[ is not stream-safe.
+ * \post \c out is in Normalization Form D. */
     template<typename In, typename Out>
     std::pair<In, Out> rtl(In begin, In end, Out out)
     {
- // NOT IMPLEMENTED
- *out++ = *--end;
- return std::make_pair(end, out);
+ std::pair< // result of the forward routine run over the reversed range
+ reverse_iterator<In>,
+ Out
+ > p = ltr(make_reverse_iterator(end), make_reverse_iterator(begin), out, true); // reuse ltr back-to-front, with inverse = true
+ return std::make_pair(p.first.base(), p.second); // base() turns the reverse iterator back into an In
     }
     
 private:
+ template<typename Iterator>
+ static void not_stream_safe(Iterator begin, Iterator end) // always throws std::out_of_range via boost::throw_exception
+ {
+#ifndef BOOST_NO_STD_LOCALE
+ std::stringstream ss; // builds a message that lists every element of [begin, end)
+ ss << "Invalid Unicode stream-safe combining character sequence " << std::showbase << std::hex;
+ for(Iterator it = begin; it != end; ++it)
+ ss << *it << " "; // streamed with showbase and hex in effect
+ ss << "encountered while trying to decompose UTF-32 sequence";
+ std::out_of_range e(ss.str());
+#else
+ std::out_of_range e("Invalid Unicode stream-safe combining character sequence encountered while trying to decompose UTF-32 sequence");
+#endif
+ boost::throw_exception(e); // e comes from whichever preprocessor branch was compiled
+ }
+
+ template<typename OutputIterator>
+ OutputIterator decompose_rec(char32 ch, OutputIterator out) // writes the recursive decomposition of ch to out; returns the advanced iterator
+ {
+ iterator_range<const char32*> dec = ucd::get_decomposition(ch);
+ if(!empty(dec) && ((1 << ucd::get_decomposition_type(ch)) & mask)) // only decomposition types selected by mask are expanded
+ {
+ for(const char32* p = begin(dec); p != end(dec); ++p)
+ out = decompose_rec(*p, out); // each element may itself decompose further
+ return out;
+ }
+ *out++ = ch; // no applicable decomposition: emit ch unchanged. NOTE(review): no bound on out is checked here
+ return out;
+ }
+
     unsigned mask;
 };
 
@@ -128,7 +213,8 @@
     typedef char32 output_type;
     typedef mpl::int_<1> max_output;
     
- /** \pre [<tt>begin</tt>, <tt>end</tt>[ is in Normalization Form D. */
+ /** \pre [<tt>begin</tt>, <tt>end</tt>[ is in Normalization Form D.
+ * \post \c out is in Normalization Form C. */
     template<typename In, typename Out>
     std::pair<In, Out> ltr(In begin, In end, Out out)
     {
@@ -172,7 +258,8 @@
     }
     
     /* This could by made faster using a sorted table of reversed strings */
- /** \pre [<tt>begin</tt>, <tt>end</tt>[ is in Normalization Form D. */
+ /** \pre [<tt>begin</tt>, <tt>end</tt>[ is in Normalization Form D.
+ * \post \c out is in Normalization Form C. */
     template<typename In, typename Out>
     std::pair<In, Out> rtl(In begin, In end, Out out)
     {
@@ -224,6 +311,8 @@
     }
 };
 
+typedef boost::detail::unspecified< multi_pipe<decomposer, composer> >::type normalizer;
+
 } // namespace unicode
 } // namespace boost
 

Modified: sandbox/SOC/2009/unicode/boost/unicode/hangul.hpp
==============================================================================
--- sandbox/SOC/2009/unicode/boost/unicode/hangul.hpp (original)
+++ sandbox/SOC/2009/unicode/boost/unicode/hangul.hpp 2009-08-13 22:25:08 EDT (Thu, 13 Aug 2009)
@@ -32,6 +32,7 @@
  * Other code points are left unchanged. */
 struct hangul_decomposer
 {
+ typedef char32 input_type;
     typedef char32 output_type;
     typedef mpl::int_<3> max_output;
     
@@ -59,8 +60,23 @@
         return out;
     }
     
+ static int len(char32 ch) // returns 1, 2 or 3; presumably the number of code points operator() writes for ch - confirm
+ {
+ using namespace detail; // SBase, TCount and SCount are taken from here
+
+ char32 SIndex = ch - SBase; // offset of ch from SBase
+ char32 TIndex = SIndex % TCount; // remainder of that offset modulo TCount
+
+ if(SIndex < 0 || SIndex >= SCount) // outside the SCount-sized range; NOTE(review): if char32 is unsigned the "< 0" test can never be true
+ return 1;
+ if(TIndex) // non-zero remainder
+ return 3;
+ return 2; // in range with a zero remainder
+ }
+
 };
 
+/* TODO: implement hangul_composer */
 /** \c \xmlonly<conceptname>Pipe</conceptname>\endxmlonly that
  * transforms <L, V>, <L, V, T> and <LV, T> Hangul code points sequences into the
  * LV and LVT Hangul syllables, since those compositions are not part
@@ -68,6 +84,7 @@
  * Other code points are left unchanged. */
 struct hangul_composer
 {
+ typedef char32 input_type;
     typedef char32 output_type;
     typedef mpl::int_<1> max_output;
     

Modified: sandbox/SOC/2009/unicode/libs/unicode/example/compose.cpp
==============================================================================
--- sandbox/SOC/2009/unicode/libs/unicode/example/compose.cpp (original)
+++ sandbox/SOC/2009/unicode/libs/unicode/example/compose.cpp 2009-08-13 22:25:08 EDT (Thu, 13 Aug 2009)
@@ -9,6 +9,8 @@
 #include <iostream>
 #include <iterator>
 
+#include <boost/range/adaptor/reversed.hpp>
+
 namespace unicode = boost::unicode;
 namespace ucd = unicode::ucd;
 
@@ -39,8 +41,12 @@
     std::cout << std::endl;
     std::cout << "Decomposition type: " << as_string(ucd::get_decomposition_type(cp)) << std::endl;
     
+ boost::char32 baz[] = { cp, 0x330 };
     std::cout << "Canonical decomposition: ";
- unicode::decompose(boost::list_of(cp), std::ostream_iterator<boost::char32>(std::cout, " "));
+ std::cout << unicode::composed(unicode::decomposed(baz)) << std::endl;
+ std::cout << "reversed: " << boost::make_reversed_range(unicode::composed(unicode::decomposed(baz))) << std::endl;
+ std::cout << unicode::normalized(baz) << std::endl;
+ std::cout << "reversed: " << boost::make_reversed_range(unicode::normalized(baz));
     std::cout << std::endl << std::endl;
     
     std::cout << "Canonical decomposition of U+00A8: ";


Boost-Commit list run by bdawes at acm.org, david.abrahams at rcn.com, gregod at cs.rpi.edu, cpdaniel at pacbell.net, john at johnmaddock.co.uk