Re: [Boost-bugs] [Boost C++ Libraries] #8883: property_tree JSON reader does not parse unicode characters properly

Date view	Thread view	Subject view	Author view

Subject: Re: [Boost-bugs] [Boost C++ Libraries] #8883: property_tree JSON reader does not parse unicode characters properly
From: Boost C++ Libraries (noreply_at_[hidden])
Date: 2013-09-09 21:06:22

Next message: Boost C++ Libraries: "Re: [Boost-bugs] [Boost C++ Libraries] #8766: is_iterator_category<Traversal> error"
Previous message: Boost C++ Libraries: "Re: [Boost-bugs] [Boost C++ Libraries] #8883: property_tree JSON reader does not parse unicode characters properly"
In reply to: Boost C++ Libraries: "[Boost-bugs] [Boost C++ Libraries] #8883: property_tree JSON reader does not parse unicode characters properly"
Next in thread: Boost C++ Libraries: "Re: [Boost-bugs] [Boost C++ Libraries] #8883: property_tree JSON reader does not parse unicode characters properly"

#8883: property_tree JSON reader does not parse unicode characters properly
----------------------------------+----------------------------------------
  Reporter: Ronny Krueger | Owner: cornedbee
  <rk@…> | Status: new
      Type: Bugs | Component: property_tree
Milestone: To Be Determined | Severity: Problem
   Version: Boost 1.54.0 | Keywords: property_tree JSON unicode
Resolution: |
----------------------------------+----------------------------------------

Comment (by Ben McCart <bmccart@…>):

I accidentally pasted the reader diff twice rather than pasting the reader
and the writer. In addition, a bug in the UTF16 reader has been fixed.
(How embarrassing)

{{{

Index: json_parser_write.hpp

===================================================================

--- json_parser_write.hpp (revision 85628)

+++ json_parser_write.hpp (working copy)

@@ -10,12 +10,15 @@

  #ifndef BOOST_PROPERTY_TREE_DETAIL_JSON_PARSER_WRITE_HPP_INCLUDED
  #define BOOST_PROPERTY_TREE_DETAIL_JSON_PARSER_WRITE_HPP_INCLUDED

+#include <boost/cstdint.hpp>
  #include <boost/property_tree/ptree.hpp>
  #include <boost/next_prior.hpp>
  #include <boost/type_traits/make_unsigned.hpp>
  #include <string>
  #include <ostream>
+#include <sstream>
  #include <iomanip>
+#include <codecvt>

  namespace boost { namespace property_tree { namespace json_parser
  {
@@ -33,7 +36,7 @@

              // We escape everything outside ASCII, because this code
can't
              // handle high unicode characters.
              if (*b == 0x20 || *b == 0x21 || (*b >= 0x23 && *b <= 0x2E) ||
- (*b >= 0x30 && *b <= 0x5B) || (*b >= 0x5D && *b <= 0xFF))
+ (*b >= 0x30 && *b <= 0x5B) || (*b >= 0x5D && *b <= 0x80))
                  result += *b;
              else if (*b == Ch('\b')) result += Ch('\\'), result +=
Ch('b');
              else if (*b == Ch('\f')) result += Ch('\\'), result +=
Ch('f');
@@ -44,18 +47,59 @@

              else if (*b == Ch('\\')) result += Ch('\\'), result +=
Ch('\\');
              else
              {
- const char *hexdigits = "0123456789ABCDEF";
+ std::ostringstream oss;
                  typedef typename make_unsigned<Ch>::type UCh;
- unsigned long u = (std::min)(static_cast<unsigned long>(
- static_cast<UCh>(*b)),
- 0xFFFFul);
- int d1 = u / 4096; u -= d1 * 4096;
- int d2 = u / 256; u -= d2 * 256;
- int d3 = u / 16; u -= d3 * 16;
- int d4 = u;
- result += Ch('\\'); result += Ch('u');
- result += Ch(hexdigits[d1]); result += Ch(hexdigits[d2]);
- result += Ch(hexdigits[d3]); result += Ch(hexdigits[d4]);
+ if (long(std::numeric_limits<UCh>::max()) >= 0xFFFF)
+ {
+ // Assume UTF16.
+ oss << "\\u" << std::setw(4) << std::setfill('0') <<
std::hex << std::uppercase << uint16_t(*b);
+ if ((0xD7FF < uint16_t(*b)) && (uint16_t(*b) <
0xE000))
+ {
+ // Add second 16 bit value for surrogate pair
+ if (e-b == 1)
+
BOOST_PROPERTY_TREE_THROW(json_parser_error("write error", "", 0));
+
+ ++b;
+ oss << "\\u" << std::setw(4) << std::setfill('0')
<< std::hex << std::uppercase << uint16_t(*b);
+ }
+
+ result += oss.str();
+ }
+ else
+ {
+ // Assume UTF8
+ std::mbstate_t state = 0;
+ std::codecvt_utf8_utf16<wchar_t> utf16_utf8;
+
+ std::size_t in_max = std::min(e-b, 4);;
+ // Determine how many 'C' chars will be consumed to
construct a single UTF8 codepoint - this is
+ // required as codecvt.in will fail if it begins
another code point that can't be completed
+ // because the 'end' of input sequence is reached...
+ if ((*b & uint8_t(0xE0)) == uint8_t(0xC0))
+ in_max = std::min(e-b, 2);
+ else if ((*b & uint8_t(0xF0)) == uint8_t(0xE0))
+ in_max = std::min(e-b, 3);
+ else if ((*b & uint8_t(0xF8)) == uint8_t(0xF0))
+ in_max = std::min(e-b, 4);
+ else
+
BOOST_PROPERTY_TREE_THROW(json_parser_error("write error", "", 0));
+
+ Ch const *from_next = &*b;
+ wchar_t utf16str[2] = { wchar_t(0), wchar_t(0) };
+ wchar_t *to_next = utf16str;
+ if (utf16_utf8.in(state, &*b, (&*b) + in_max,
from_next, utf16str, utf16str + 2, to_next) != 0)
+
BOOST_PROPERTY_TREE_THROW(json_parser_error("write error", "", 0));
+
+ oss << "\\u" << std::setw(4) << std::setfill('0') <<
std::hex << std::uppercase << uint16_t(utf16str[0]);
+ if ( to_next - utf16str == 2)
+ {
+ // Add second 16 bit value for surrogate pair
+ oss << "\\u" << std::setw(4) << std::setfill('0')
<< std::hex << std::uppercase << uint16_t(utf16str[1]);
+ }
+ result += oss.str();
+
+ b += ((from_next - &*b) - std::size_t(1)); //
Additional incrementing for additional UTF8 chars consumed.
+ }
              }
              ++b;
          }
}}}

{{{
Index: json_parser_read.hpp

===================================================================

--- json_parser_read.hpp (revision 85628)

+++ json_parser_read.hpp (working copy)

@@ -22,6 +22,7 @@

  #include <istream>
  #include <vector>
  #include <algorithm>
+#include <codecvt>

  namespace boost { namespace property_tree { namespace json_parser
  {
@@ -41,7 +42,10 @@

          Str name;
          Ptree root;
          std::vector<Ptree *> stack;
+ unsigned long u_surrogate;

+ context() : u_surrogate(0) {}
+
          struct a_object_s
          {
              context &c;
@@ -146,8 +150,53 @@

              a_unicode(context &c): c(c) { }
              void operator()(unsigned long u) const
              {
- u = (std::min)(u, static_cast<unsigned
long>((std::numeric_limits<Ch>::max)()));
- c.string += Ch(u);
+ typedef typename make_unsigned<Ch>::type UCh;
+ if (long(std::numeric_limits<UCh>::max()) >= 0xFFFF)
+ {
+ if ((c.u_surrogate == 0) && (0xD7FF < u && u <
0xE000))
+ {
+ c.u_surrogate = u;
+ }
+ else if (c.u_surrogate)
+ {
+ c.string += Ch((std::min)(c.u_surrogate,
static_cast<unsigned long>((std::numeric_limits<Ch>::max)())));
+ c.u_surrogate = 0;
+ }
+
+ u = (std::min)(u, static_cast<unsigned
long>((std::numeric_limits<Ch>::max)()));
+ c.string += Ch(u);
+ }
+ else // Ch is one byte - encode the given Unicode code
point as UTF-8
+ {
+ if ((c.u_surrogate == 0) && (0xD7FF < u && u <
0xE000))
+ {
+ c.u_surrogate = u;
+ }
+ else
+ {
+ wchar_t utf16str[3] = { wchar_t(0), wchar_t(0),
wchar_t(0) };
+ wchar_t const *from_next = utf16str;
+
+ Ch utf8str[5] = { Ch(0), Ch(0), Ch(0), Ch(0),
Ch(0) };
+ Ch * to_next = utf8str;
+
+ std::size_t size = 0;
+ if (c.u_surrogate)
+ {
+ utf16str[size++] = wchar_t(c.u_surrogate);
+ c.u_surrogate = 0;
+ }
+ utf16str[size++] = wchar_t(u);
+
+ std::mbstate_t state = 0;
+ std::codecvt_utf8_utf16<wchar_t> utf16_utf8;
+
+ if (utf16_utf8.out(state, &utf16str[0],
&utf16str[0] + size, from_next, &utf8str[0], &utf8str[4], to_next) != 0)
+
BOOST_PROPERTY_TREE_THROW(json_parser_error("write error", "", 0));
+
+ c.string += utf8str;
+ }
+ }
              }
          };

}}}

-- 
Ticket URL: <https://svn.boost.org/trac/boost/ticket/8883#comment:5>
Boost C++ Libraries <http://www.boost.org/>
Boost provides free peer-reviewed portable C++ source libraries.

Date view	Thread view	Subject view	Author view

This archive was generated by hypermail 2.1.7 : 2017-02-16 18:50:14 UTC