Re: [Boost-bugs] [Boost C++ Libraries] #8883: property_tree JSON reader does not parse unicode characters properly

Subject: Re: [Boost-bugs] [Boost C++ Libraries] #8883: property_tree JSON reader does not parse unicode characters properly
From: Boost C++ Libraries (noreply_at_[hidden])
Date: 2013-09-09 20:23:35


#8883: property_tree JSON reader does not parse unicode characters properly
----------------------------------+----------------------------------------
  Reporter: Ronny Krueger | Owner: cornedbee
  <rk@…> | Status: new
      Type: Bugs | Component: property_tree
 Milestone: To Be Determined | Severity: Problem
   Version: Boost 1.54.0 | Keywords: property_tree JSON unicode
Resolution: |
----------------------------------+----------------------------------------

Comment (by Ben McCart <bmccart@…>):

 After inspecting /detail/json_parser_write.hpp and
 /detail/json_parser_read.hpp it has become apparent to me that the current
 implementation is completely deficient concerning Unicode (both UTF8 and
 UTF16). The writer creates a Unicode escape sequence block (\uXXXX) for
 each incoming 'C' char in the UTF8 code sequence, which is incorrect.
 Each escaped Unicode character is to be represented as 16-bit hex values
 of UTF16 single or surrogate pairs (see RFC4627 section 2.5
 [http://tools.ietf.org/html/rfc4627]). In addition, the existing
 implementation will only encode UCS2 correctly for 16 bit wchar_t strings
 as it isn't doing anything to correctly handle surrogate pairs either in
 the writer or the reader.

 I have a working implementation which I'm sure does not meet Boost coding
 guidelines, but may be adaptable with use of Boost Locale (I am using
 codecvt). I couldn't figure out how to add an attachment, so I'm including
 the diffs inline below (the UTF8 implementation has been tested some; the
 UTF16 portion hasn't. It is ugly, but it works):

 {{{


 Index: json_parser_read.hpp

 ===================================================================

 --- json_parser_read.hpp (revision 85628)

 +++ json_parser_read.hpp (working copy)

 @@ -22,6 +22,7 @@

  #include <istream>
  #include <vector>
  #include <algorithm>
 +#include <codecvt>

  namespace boost { namespace property_tree { namespace json_parser
  {
 @@ -41,7 +42,10 @@

          Str name;
          Ptree root;
          std::vector<Ptree *> stack;
 + unsigned long u_surrogate;

 + context() : u_surrogate(0) {}
 +
          struct a_object_s
          {
              context &c;
 @@ -146,8 +150,46 @@

              a_unicode(context &c): c(c) { }
              void operator()(unsigned long u) const
              {
 - u = (std::min)(u, static_cast<unsigned
 long>((std::numeric_limits<Ch>::max)()));
 - c.string += Ch(u);
 + typedef typename make_unsigned<Ch>::type UCh;
 + if (long(std::numeric_limits<UCh>::max()) >= 0xFFFF)
 + {
 + if (c.u_surrogate)
 + c.string += Ch((std::min)(c.u_surrogate,
 static_cast<unsigned long>((std::numeric_limits<Ch>::max)())));
 +
 + u = (std::min)(u, static_cast<unsigned
 long>((std::numeric_limits<Ch>::max)()));
 + c.string += Ch(u);
 + }
 + else // Ch is one byte - encode the given Unicode code
 point as UTF-8
 + {
 + if ((c.u_surrogate == 0) && (0xD7FF < u && u <
 0xE000))
 + {
 + c.u_surrogate = u;
 + }
 + else
 + {
 + wchar_t utf16str[3] = { wchar_t(0), wchar_t(0),
 wchar_t(0) };
 + wchar_t const *from_next = utf16str;
 +
 + Ch utf8str[5] = { Ch(0), Ch(0), Ch(0), Ch(0),
 Ch(0) };
 + Ch * to_next = utf8str;
 +
 + std::size_t size = 0;
 + if (c.u_surrogate)
 + {
 + utf16str[size++] = wchar_t(c.u_surrogate);
 + c.u_surrogate = 0;
 + }
 + utf16str[size++] = wchar_t(u);
 +
 + std::mbstate_t state = 0;
 + std::codecvt_utf8_utf16<wchar_t> utf16_utf8;
 +
 + if (utf16_utf8.out(state, &utf16str[0],
 &utf16str[0] + size, from_next, &utf8str[0], &utf8str[4], to_next) != 0)
 +
 BOOST_PROPERTY_TREE_THROW(json_parser_error("write error", "", 0));
 +
 + c.string += utf8str;
 + }
 + }
              }
          };
 }}}



 {{{
 Index: json_parser_read.hpp

 ===================================================================

 --- json_parser_read.hpp (revision 85628)

 +++ json_parser_read.hpp (working copy)

 @@ -22,6 +22,7 @@

  #include <istream>
  #include <vector>
  #include <algorithm>
 +#include <codecvt>

  namespace boost { namespace property_tree { namespace json_parser
  {
 @@ -41,7 +42,10 @@

          Str name;
          Ptree root;
          std::vector<Ptree *> stack;
 + unsigned long u_surrogate;

 + context() : u_surrogate(0) {}
 +
          struct a_object_s
          {
              context &c;
 @@ -146,8 +150,46 @@

              a_unicode(context &c): c(c) { }
              void operator()(unsigned long u) const
              {
 - u = (std::min)(u, static_cast<unsigned
 long>((std::numeric_limits<Ch>::max)()));
 - c.string += Ch(u);
 + typedef typename make_unsigned<Ch>::type UCh;
 + if (long(std::numeric_limits<UCh>::max()) >= 0xFFFF)
 + {
 + if (c.u_surrogate)
 + c.string += Ch((std::min)(c.u_surrogate,
 static_cast<unsigned long>((std::numeric_limits<Ch>::max)())));
 +
 + u = (std::min)(u, static_cast<unsigned
 long>((std::numeric_limits<Ch>::max)()));
 + c.string += Ch(u);
 + }
 + else // Ch is one byte - encode the given Unicode code
 point as UTF-8
 + {
 + if ((c.u_surrogate == 0) && (0xD7FF < u && u <
 0xE000))
 + {
 + c.u_surrogate = u;
 + }
 + else
 + {
 + wchar_t utf16str[3] = { wchar_t(0), wchar_t(0),
 wchar_t(0) };
 + wchar_t const *from_next = utf16str;
 +
 + Ch utf8str[5] = { Ch(0), Ch(0), Ch(0), Ch(0),
 Ch(0) };
 + Ch * to_next = utf8str;
 +
 + std::size_t size = 0;
 + if (c.u_surrogate)
 + {
 + utf16str[size++] = wchar_t(c.u_surrogate);
 + c.u_surrogate = 0;
 + }
 + utf16str[size++] = wchar_t(u);
 +
 + std::mbstate_t state = 0;
 + std::codecvt_utf8_utf16<wchar_t> utf16_utf8;
 +
 + if (utf16_utf8.out(state, &utf16str[0],
 &utf16str[0] + size, from_next, &utf8str[0], &utf8str[4], to_next) != 0)
 +
 BOOST_PROPERTY_TREE_THROW(json_parser_error("write error", "", 0));
 +
 + c.string += utf8str;
 + }
 + }
              }
          };

 }}}

-- 
Ticket URL: <https://svn.boost.org/trac/boost/ticket/8883#comment:4>
Boost C++ Libraries <http://www.boost.org/>
Boost provides free peer-reviewed portable C++ source libraries.

This archive was generated by hypermail 2.1.7 : 2017-02-16 18:50:14 UTC