|
Boost-Commit : |
Subject: [Boost-commit] svn:boost r64117 - in sandbox/SOC/2009/unicode: boost/iterator boost/unicode libs/unicode/test libs/unicode/test/iterator libs/unicode/test/unicode
From: loufoque_at_[hidden]
Date: 2010-07-17 22:11:16
Author: mgaunard
Date: 2010-07-17 22:11:15 EDT (Sat, 17 Jul 2010)
New Revision: 64117
URL: http://svn.boost.org/trac/boost/changeset/64117
Log:
experimental codecvt support for unicode converters
Added:
sandbox/SOC/2009/unicode/boost/iterator/converter_codecvt_facet.hpp (contents, props changed)
sandbox/SOC/2009/unicode/libs/unicode/test/iterator/test_codecvt.cpp (contents, props changed)
Text files modified:
sandbox/SOC/2009/unicode/boost/iterator/convert_iterator.hpp | 2 +-
sandbox/SOC/2009/unicode/boost/unicode/compose_fwd.hpp | 6 +++---
sandbox/SOC/2009/unicode/boost/unicode/utf_codecs.hpp | 14 +++++++++++++-
sandbox/SOC/2009/unicode/libs/unicode/test/Jamfile.v2 | 1 +
sandbox/SOC/2009/unicode/libs/unicode/test/unicode/test_utf.cpp | 2 +-
5 files changed, 19 insertions(+), 6 deletions(-)
Modified: sandbox/SOC/2009/unicode/boost/iterator/convert_iterator.hpp
==============================================================================
--- sandbox/SOC/2009/unicode/boost/iterator/convert_iterator.hpp (original)
+++ sandbox/SOC/2009/unicode/boost/iterator/convert_iterator.hpp 2010-07-17 22:11:15 EDT (Sat, 17 Jul 2010)
@@ -408,7 +408,7 @@
adapter that wraps the range \c range and converts it
step-by-step as the range is advanced. */ \
template<typename Range, typename... T> \
- boost::segmented_range< \
+ boost::converted_range< \
Range, \
converter_name \
> \
Added: sandbox/SOC/2009/unicode/boost/iterator/converter_codecvt_facet.hpp
==============================================================================
--- (empty file)
+++ sandbox/SOC/2009/unicode/boost/iterator/converter_codecvt_facet.hpp 2010-07-17 22:11:15 EDT (Sat, 17 Jul 2010)
@@ -0,0 +1,222 @@
+#ifndef BOOST_ITERATOR_CONVERTER_CODECVT_FACET_HPP
+#define BOOST_ITERATOR_CONVERTER_CODECVT_FACET_HPP
+
+#include <locale>
+#include <cstddef>
+
+#include <boost/iterator/convert_iterator.hpp>
+#include <boost/iterator/dummy_output_iterator.hpp>
+
+#include <algorithm>
+
+#include <map>
+#include <boost/range/algorithm.hpp>
+#include <boost/range/join.hpp>
+
+namespace boost
+{
+/** Experimental std::codecvt facet driven by two converter objects.
+ *
+ *  P1 is applied in do_out() and do_unshift() (intern_type to extern_type)
+ *  and P2 in do_in() and do_length() (extern_type to intern_type). Both are
+ *  invoked through their ltr(begin, end, out) member, whose result is used
+ *  here as a pair of (input position reached, output position reached).
+ *
+ *  Leftover data is kept per caller state in the `states` map, keyed by the
+ *  address of the state_type object passed to each call; the contents of
+ *  that state_type object are never read or written by this class.
+ *
+ *  NOTE(review): none of the conversion functions consult to_end, so the
+ *  size of the output buffer is not enforced here. Confirm that callers
+ *  always provide enough room.
+ */
+template<typename InternT, typename P1, typename P2>
+struct converter_codecvt_facet : std::codecvt<InternT, typename P1::output_type, std::mbstate_t>
+{
+ typedef InternT intern_type;
+ typedef typename P1::output_type extern_type; // external type is whatever P1 emits
+ typedef std::mbstate_t state_type;
+
+ BOOST_CONCEPT_ASSERT((ConverterConcept<P1>)); // both parameters must model ConverterConcept
+ BOOST_CONCEPT_ASSERT((ConverterConcept<P2>));
+
+ BOOST_CONCEPT_ASSERT((Convertible<InternT, typename P1::input_type>)); // InternT must be usable as P1 input
+ BOOST_CONCEPT_ASSERT((Convertible<typename P2::output_type, InternT>)); // P2 output must be usable as InternT
+/** Stores copies of the two converters and forwards \c refs to the
+ *  std::codecvt base class constructor. */
+ explicit converter_codecvt_facet(const P1& p1_ = P1(), const P2& p2_ = P2(), std::size_t refs = 0)
+ : std::codecvt<intern_type, extern_type, state_type>(refs), p1(p1_), p2(p2_)
+ {
+ }
+
+private:
+ struct state_t // leftover data associated with one caller state object
+ {
+ intern_type pending_data[P2::max_output::value]; // fixed-size buffer of intern_type units awaiting processing
+ size_t pending_size; // number of valid units at the front of pending_data
+ };
+ mutable std::map<state_type*, state_t> states; // keyed by state address; no entry is erased in this class
+
+ mutable P1 p1; // mutable: the do_* overrides are const but call ltr() on these
+ mutable P2 p2;
+
+protected:
+/** Converts external input to internal characters, writing at most one
+ *  intern_type per call.
+ *
+ *  If units are pending for this state, the first pending unit is written
+ *  to \c to and no input is consumed. Otherwise p2.ltr() converts from the
+ *  input into the pending buffer; the first produced unit is written to
+ *  \c to and the remaining ones stay pending for later calls.
+ *
+ *  \return ok on success; partial if p2.ltr() throws anything.
+ *
+ *  NOTE(review): to_end is never checked, and if ltr() produced no output
+ *  the pending_size decrement would wrap (assuming size_t is the usual
+ *  unsigned type). Confirm neither case can occur.
+ */
+ virtual std::codecvt_base::result do_in(
+ state_type& state,
+ const extern_type* from,
+ const extern_type* from_end,
+ const extern_type*& from_next,
+ intern_type* to,
+ intern_type* to_end,
+ intern_type*& to_next
+ ) const
+ {
+ state_t& st = states[&state]; // operator[] creates the entry on first use of this state
+
+ from_next = from;
+ to_next = to;
+
+ if(st.pending_size) // drain previously converted units first, one per call
+ {
+ *to_next++ = st.pending_data[0];
+ std::copy(st.pending_data + 1, st.pending_data + st.pending_size, st.pending_data); // shift the rest to the front
+ st.pending_size--;
+ return std::codecvt_base::ok;
+ }
+
+ try
+ {
+ std::pair<const extern_type*, intern_type*> p = p2.ltr(from_next, from_end, st.pending_data); // output goes to the pending buffer, not to 'to'
+ from_next = p.first;
+ *to_next++ = st.pending_data[0];
+ st.pending_size = p.second - st.pending_data; // number of units ltr() produced
+ std::copy(st.pending_data + 1, st.pending_data + st.pending_size, st.pending_data); // drop the unit just emitted
+ st.pending_size--;
+ }
+ catch(...)
+ {
+ return std::codecvt_base::partial; // any converter exception is reported as partial
+ }
+ return std::codecvt_base::ok;
+ }
+/** Converts internal characters to external output using p1.
+ *
+ *  The input seen by p1 is the pending data for this state followed by
+ *  [from, from_end). p1.ltr() is called repeatedly until that joined input
+ *  is exhausted. If p1.ltr() throws, the part of [from, from_end) that has
+ *  not been accounted for is appended to the pending buffer and from_next
+ *  is set to from_end.
+ *
+ *  \return ok on every path.
+ *
+ *  NOTE(review): to_end is never checked. The copy into pending_data is
+ *  not bounds-checked against its P2::max_output::value capacity. When
+ *  fewer than pending_size elements were consumed, the pending buffer is
+ *  left unchanged. Confirm these are acceptable.
+ */
+ virtual std::codecvt_base::result do_out(
+ state_type& state,
+ const intern_type* from,
+ const intern_type* from_end,
+ const intern_type*& from_next,
+ extern_type* to,
+ extern_type* to_end,
+ extern_type*& to_next
+ ) const
+ {
+ typedef const boost::iterator_range<const intern_type*> range_base;
+ typedef boost::range_detail::join_iterator<const intern_type*, const intern_type*> iterator;
+
+ state_t& st = states[&state];
+
+ from_next = from;
+ to_next = to;
+
+ boost::joined_range<range_base, range_base> input = boost::join( // pending data first, then the new input
+ range_base(st.pending_data, st.pending_data + st.pending_size),
+ range_base(from, from_end)
+ );
+
+ iterator from2 = input.begin();
+ iterator from_next2 = from2; // current position within the joined input
+ iterator from_end2 = input.end();
+
+ while(from_next2 != from_end2)
+ {
+ try
+ {
+ std::pair<iterator, extern_type*> p = p1.ltr(from_next2, from_end2, to_next);
+ from_next2 = p.first;
+ to_next = p.second;
+ }
+ catch(...)
+ {
+ size_t written = from_next2 - from2; // joined-input elements consumed so far (not output units)
+ if(written >= st.pending_size) // the old pending data has been fully consumed
+ {
+ from_next += (from_next2 - from2) - st.pending_size; // skip the part of the new input already consumed
+ st.pending_size = 0;
+ }
+
+ boost::copy(range_base(from_next, from_end), st.pending_data + st.pending_size); // stash the remainder for the next call
+ st.pending_size += (from_end - from_next);
+ from_next = from_end; // report all of the caller's input as taken
+ return std::codecvt_base::ok;
+ }
+ }
+
+ size_t written = from_next2 - from2; // same bookkeeping as in the catch block above
+ if(written >= st.pending_size)
+ {
+ from_next += (from_next2 - from2) - st.pending_size;
+ st.pending_size = 0;
+ }
+ return std::codecvt_base::ok;
+ }
+/** Always reports that conversion is required. */
+ virtual bool do_always_noconv() const throw()
+ {
+ return false;
+ }
+/** Pushes whatever is pending for this state through p1 into \c to.
+ *
+ *  \return error if p1.ltr() throws (pending_size is then left as it
+ *  was); otherwise ok, with pending_size reset to zero.
+ *
+ *  NOTE(review): to_end is never checked here either.
+ */
+ virtual std::codecvt_base::result do_unshift(
+ state_type& state,
+ extern_type* to,
+ extern_type* to_end,
+ extern_type*& to_next
+ ) const
+ {
+ state_t& st = states[&state];
+
+ to_next = to;
+ const intern_type* from = st.pending_data;
+ const intern_type* from_next = from;
+ const intern_type* from_end = st.pending_data + st.pending_size;
+
+ while(from_next != from_end)
+ {
+ try
+ {
+ std::pair<const intern_type*, extern_type*> p = p1.ltr(from_next, from_end, to_next);
+ from_next = p.first;
+ to_next = p.second;
+ }
+ catch(...)
+ {
+ return std::codecvt_base::error;
+ }
+ }
+
+ st.pending_size = 0;
+ return std::codecvt_base::ok;
+ }
+/** Returns 0. NOTE(review): in the std::codecvt contract this is
+ *  understood to mean a variable-width external encoding; confirm. */
+ virtual int do_encoding() const throw()
+ {
+ return 0;
+ }
+/** Measures how many external units p2 would consume from
+ *  [from, from_end), discarding the output.
+ *
+ *  The state argument is ignored. p2.ltr() is called until the input is
+ *  exhausted, max_limit calls have been made, or it throws.
+ *
+ *  \return from_next - from, i.e. the number of extern_type units consumed.
+ *
+ *  NOTE(review): max_limit is decremented once per ltr() call rather than
+ *  per internal character produced; confirm this matches the intended
+ *  meaning of the limit.
+ */
+ virtual int do_length(
+ state_type&,
+ const extern_type* from,
+ const extern_type* from_end,
+ std::size_t max_limit
+ ) const
+ {
+ const extern_type* from_next = from;
+ while(from_next != from_end && max_limit--)
+ {
+ try
+ {
+ std::pair<const extern_type*, dummy_output_iterator> p = p2.ltr(from_next, from_end, dummy_output_iterator()); // output is thrown away
+ from_next = p.first;
+ }
+ catch(...)
+ {
+ break; // stop measuring at the first sequence the converter rejects
+ }
+ }
+ return from_next - from;
+ }
+/** Returns P1::max_output::value. */
+ virtual int do_max_length() const throw ()
+ {
+ return P1::max_output::value;
+ }
+};
+
+} // namespace boost
+
+#endif
Modified: sandbox/SOC/2009/unicode/boost/unicode/compose_fwd.hpp
==============================================================================
--- sandbox/SOC/2009/unicode/boost/unicode/compose_fwd.hpp (original)
+++ sandbox/SOC/2009/unicode/boost/unicode/compose_fwd.hpp 2010-07-17 22:11:15 EDT (Sat, 17 Jul 2010)
@@ -145,7 +145,7 @@
>::type
decompose_impl(In begin, In end, Out out)
{
- char32* out_pos = out;
+ Out out_pos = out;
bool to_sort = false;
@@ -224,12 +224,12 @@
bool operator()(const ucd::unichar_compose_data_entry& lft, In rgt) const
{
- return lft.decomp[0] > offset && lft.decomp[1+offset] < *rgt;
+ return lft.decomp[0] > offset && lft.decomp[1+offset] < boost::char32(*rgt);
}
bool operator()(In lft, const ucd::unichar_compose_data_entry& rgt) const
{
- return rgt.decomp[0] > offset && *lft < rgt.decomp[1+offset];
+ return rgt.decomp[0] > offset && boost::char32(*lft) < rgt.decomp[1+offset];
}
private:
Modified: sandbox/SOC/2009/unicode/boost/unicode/utf_codecs.hpp
==============================================================================
--- sandbox/SOC/2009/unicode/boost/unicode/utf_codecs.hpp (original)
+++ sandbox/SOC/2009/unicode/boost/unicode/utf_codecs.hpp 2010-07-17 22:11:15 EDT (Sat, 17 Jul 2010)
@@ -139,7 +139,7 @@
if(unicode::is_high_surrogate(value))
{
- // precondition; next value must have be a low-surrogate:
+ // precondition; next value must be a low-surrogate:
if(++it == end)
detail::invalid_utf_sequence(begin, end);
@@ -466,6 +466,7 @@
template<typename ValueType>
struct utf_encoder : detail::select_encoder<ValueType>::type
{
+ typedef ValueType output_type; // read as P1::output_type by converter_codecvt_facet; presumably the unit type the encoder emits
};
/** Model of \c \xmlonly<conceptname>Converter</conceptname>\endxmlonly,
@@ -587,6 +588,17 @@
#endif
};
+/** Model of \c \xmlonly<conceptname>Converter</conceptname>\endxmlonly
+ * that converts from UTF-X to UTF-Y, X being detected from the value type
+ * of the input range, Y being specified by the ValueType parameter.
+ * Defined by deriving from boost::converted_converter instantiated with
+ * boost::unicode::utf_decoder and boost::unicode::utf_encoder<ValueType>. */
+template<typename ValueType>
+struct utf_transcoder : boost::converted_converter<
+ boost::unicode::utf_decoder,
+ boost::unicode::utf_encoder<ValueType>
+>
+{
+};
+
/** Model of \c \xmlonly<conceptname>OneManyConverter</conceptname>\endxmlonly
* that converts from UTF-32 to ISO-8859-1 alias latin-1. */
typedef boost::detail::unspecified< cast_converter<char> >::type latin1_encoder;
Modified: sandbox/SOC/2009/unicode/libs/unicode/test/Jamfile.v2
==============================================================================
--- sandbox/SOC/2009/unicode/libs/unicode/test/Jamfile.v2 (original)
+++ sandbox/SOC/2009/unicode/libs/unicode/test/Jamfile.v2 2010-07-17 22:11:15 EDT (Sat, 17 Jul 2010)
@@ -22,6 +22,7 @@
test-suite iterator :
[ run iterator/test_convert.cpp ]
[ run iterator/test_segment.cpp ]
+ [ run iterator/test_codecvt.cpp ]
;
test-suite unicode :
Added: sandbox/SOC/2009/unicode/libs/unicode/test/iterator/test_codecvt.cpp
==============================================================================
--- (empty file)
+++ sandbox/SOC/2009/unicode/libs/unicode/test/iterator/test_codecvt.cpp 2010-07-17 22:11:15 EDT (Sat, 17 Jul 2010)
@@ -0,0 +1,64 @@
+#define BOOST_TEST_MODULE Codecvt
+#include <boost/test/included/unit_test.hpp>
+
+#include <boost/iterator/converter_codecvt_facet.hpp>
+#include <boost/unicode/utf.hpp>
+#include <boost/unicode/compose.hpp>
+
+#include <fstream>
+#include <boost/range/algorithm.hpp>
+#include <boost/range/as_literal.hpp>
+
+typedef boost::converter_codecvt_facet< // facet whose input-side converter includes boost::unicode::normalizer
+ wchar_t, // InternT
+ boost::unicode::utf_transcoder<char>, // P1: used by the facet for do_out/do_unshift
+ boost::multi_converter< // P2: used by the facet for do_in/do_length
+ boost::converted_converter<boost::unicode::utf_decoder, boost::unicode::normalizer>,
+ boost::unicode::utf_encoder<wchar_t>
+ >
+> utf_u8_normalize_codecvt; // not referenced by the test case in this file
+// Facet with plain transcoders in both directions; this is the one the test case installs.
+typedef boost::converter_codecvt_facet<
+ wchar_t, // InternT
+ boost::unicode::utf_transcoder<char>, // P1: used by the facet for do_out/do_unshift
+ boost::unicode::utf_transcoder<wchar_t> // P2: used by the facet for do_in/do_length
+> utf_u8_codecvt;
+
+/** Round-trip test: writes a wide string to "data.ucd" through a stream
+ *  imbued with utf_u8_codecvt, reads it back through a stream imbued with
+ *  the same locale, and checks every character and the total count. */
+BOOST_AUTO_TEST_CASE( codecvt )
+{
+ // e\u0301 is \u00E9
+ // \U0002FA1D is \U0002A600
+ const wchar_t data_[] = L"hello e\u0301 \U0002FA1D world";
+ boost::iterator_range<const wchar_t*> data = boost::as_literal(data_);
+
+ const wchar_t data_normalized_[] = L"hello \u00E9 \U0002A600 world"; // currently unused, see below
+ boost::iterator_range<const wchar_t*> data_normalized = data;//boost::as_literal(data_normalized_);
+/* NOTE: data_normalized aliases data because the as_literal(data_normalized_)
+ * initializer above is commented out, so the checks below compare against
+ * the original, un-normalized text. */
+ std::locale old_locale;
+ std::locale utf8_locale(old_locale, new utf_u8_codecvt()); // copy of the default-constructed locale with the facet added
+
+ // Set a new global locale (currently disabled)
+ //std::locale::global(utf8_locale);
+
+ // Send the UTF-X data out, converting to UTF-8
+ {
+ std::wofstream ofs("data.ucd");
+ ofs.imbue(utf8_locale);
+ boost::copy(data, std::ostream_iterator<wchar_t, wchar_t>(ofs));
+ }
+
+ // Read the UTF-8 data back in, converting to UTF-X. The imbued facet is
+ // utf_u8_codecvt, not utf_u8_normalize_codecvt.
+ {
+ std::wifstream ifs("data.ucd");
+ ifs.imbue(utf8_locale);
+ wchar_t item = 0;
+ size_t i = 0;
+ while (ifs >> std::noskipws >> item) // noskipws: the spaces in the text must be read back too
+ {
+ BOOST_CHECK_EQUAL(data_normalized[i], item);
+ i++;
+ }
+ BOOST_CHECK_EQUAL(i, (size_t)boost::size(data_normalized)); // every character must have been read back
+ }
+}
Modified: sandbox/SOC/2009/unicode/libs/unicode/test/unicode/test_utf.cpp
==============================================================================
--- sandbox/SOC/2009/unicode/libs/unicode/test/unicode/test_utf.cpp (original)
+++ sandbox/SOC/2009/unicode/libs/unicode/test/unicode/test_utf.cpp 2010-07-17 22:11:15 EDT (Sat, 17 Jul 2010)
@@ -24,7 +24,7 @@
CHECK_UTF(8,
input,
- list_of<char>('$')(0xC2)(0xA2)(0xE2)(0x82)(0xAC)(0xF0)(0xA4)(0xAD)(0xA2)
+ list_of<char>('$')((char)0xC2)((char)0xA2)((char)0xE2)((char)0x82)((char)0xAC)((char)0xF0)((char)0xA4)((char)0xAD)((char)0xA2)
);
CHECK_UTF(16,
Boost-Commit list run by bdawes at acm.org, david.abrahams at rcn.com, gregod at cs.rpi.edu, cpdaniel at pacbell.net, john at johnmaddock.co.uk