Re: [Boost-bugs] [Boost C++ Libraries] #8883: property_tree JSON reader does not parse unicode characters properly

Subject: Re: [Boost-bugs] [Boost C++ Libraries] #8883: property_tree JSON reader does not parse unicode characters properly
From: Boost C++ Libraries (noreply_at_[hidden])
Date: 2013-09-09 21:06:22


#8883: property_tree JSON reader does not parse unicode characters properly
----------------------------------+----------------------------------------
  Reporter: Ronny Krueger | Owner: cornedbee
  <rk@…> | Status: new
      Type: Bugs | Component: property_tree
 Milestone: To Be Determined | Severity: Problem
   Version: Boost 1.54.0 | Keywords: property_tree JSON unicode
Resolution: |
----------------------------------+----------------------------------------

Comment (by Ben McCart <bmccart@…>):

 I accidentally pasted the reader diff twice instead of pasting both the
 reader and the writer diffs. In addition, a bug in the UTF-16 reader has
 been fixed. (How embarrassing.)

 {{{

 Index: json_parser_write.hpp

 ===================================================================

 --- json_parser_write.hpp (revision 85628)

 +++ json_parser_write.hpp (working copy)

 @@ -10,12 +10,15 @@

  #ifndef BOOST_PROPERTY_TREE_DETAIL_JSON_PARSER_WRITE_HPP_INCLUDED
  #define BOOST_PROPERTY_TREE_DETAIL_JSON_PARSER_WRITE_HPP_INCLUDED

 +#include <boost/cstdint.hpp>
  #include <boost/property_tree/ptree.hpp>
  #include <boost/next_prior.hpp>
  #include <boost/type_traits/make_unsigned.hpp>
  #include <string>
  #include <ostream>
 +#include <sstream>
  #include <iomanip>
 +#include <codecvt>

  namespace boost { namespace property_tree { namespace json_parser
  {
 @@ -33,7 +36,7 @@

              // We escape everything outside ASCII, because this code
 can't
              // handle high unicode characters.
              if (*b == 0x20 || *b == 0x21 || (*b >= 0x23 && *b <= 0x2E) ||
 - (*b >= 0x30 && *b <= 0x5B) || (*b >= 0x5D && *b <= 0xFF))
 + (*b >= 0x30 && *b <= 0x5B) || (*b >= 0x5D && *b <= 0x80))
                  result += *b;
              else if (*b == Ch('\b')) result += Ch('\\'), result +=
 Ch('b');
              else if (*b == Ch('\f')) result += Ch('\\'), result +=
 Ch('f');
 @@ -44,18 +47,59 @@

              else if (*b == Ch('\\')) result += Ch('\\'), result +=
 Ch('\\');
              else
              {
 - const char *hexdigits = "0123456789ABCDEF";
 + std::ostringstream oss;
                  typedef typename make_unsigned<Ch>::type UCh;
 - unsigned long u = (std::min)(static_cast<unsigned long>(
 - static_cast<UCh>(*b)),
 - 0xFFFFul);
 - int d1 = u / 4096; u -= d1 * 4096;
 - int d2 = u / 256; u -= d2 * 256;
 - int d3 = u / 16; u -= d3 * 16;
 - int d4 = u;
 - result += Ch('\\'); result += Ch('u');
 - result += Ch(hexdigits[d1]); result += Ch(hexdigits[d2]);
 - result += Ch(hexdigits[d3]); result += Ch(hexdigits[d4]);
 + if (long(std::numeric_limits<UCh>::max()) >= 0xFFFF)
 + {
 + // Assume UTF16.
 + oss << "\\u" << std::setw(4) << std::setfill('0') <<
 std::hex << std::uppercase << uint16_t(*b);
 + if ((0xD7FF < uint16_t(*b)) && (uint16_t(*b) <
 0xE000))
 + {
 + // Add second 16 bit value for surrogate pair
 + if (e-b == 1)
 +
 BOOST_PROPERTY_TREE_THROW(json_parser_error("write error", "", 0));
 +
 + ++b;
 + oss << "\\u" << std::setw(4) << std::setfill('0')
 << std::hex << std::uppercase << uint16_t(*b);
 + }
 +
 + result += oss.str();
 + }
 + else
 + {
 + // Assume UTF8
 + std::mbstate_t state = 0;
 + std::codecvt_utf8_utf16<wchar_t> utf16_utf8;
 +
 + std::size_t in_max = std::min(e-b, 4);;
 + // Determine how many 'C' chars will be consumed to
 construct a single UTF8 codepoint - this is
 + // required as codecvt.in will fail if it begins
 another code point that can't be completed
 + // because the 'end' of input sequence is reached...
 + if ((*b & uint8_t(0xE0)) == uint8_t(0xC0))
 + in_max = std::min(e-b, 2);
 + else if ((*b & uint8_t(0xF0)) == uint8_t(0xE0))
 + in_max = std::min(e-b, 3);
 + else if ((*b & uint8_t(0xF8)) == uint8_t(0xF0))
 + in_max = std::min(e-b, 4);
 + else
 +
 BOOST_PROPERTY_TREE_THROW(json_parser_error("write error", "", 0));
 +
 + Ch const *from_next = &*b;
 + wchar_t utf16str[2] = { wchar_t(0), wchar_t(0) };
 + wchar_t *to_next = utf16str;
 + if (utf16_utf8.in(state, &*b, (&*b) + in_max,
 from_next, utf16str, utf16str + 2, to_next) != 0)
 +
 BOOST_PROPERTY_TREE_THROW(json_parser_error("write error", "", 0));
 +
 + oss << "\\u" << std::setw(4) << std::setfill('0') <<
 std::hex << std::uppercase << uint16_t(utf16str[0]);
 + if ( to_next - utf16str == 2)
 + {
 + // Add second 16 bit value for surrogate pair
 + oss << "\\u" << std::setw(4) << std::setfill('0')
 << std::hex << std::uppercase << uint16_t(utf16str[1]);
 + }
 + result += oss.str();
 +
 + b += ((from_next - &*b) - std::size_t(1)); //
 Additional incrementing for additional UTF8 chars consumed.
 + }
              }
              ++b;
          }
 }}}



 {{{
 Index: json_parser_read.hpp

 ===================================================================

 --- json_parser_read.hpp (revision 85628)

 +++ json_parser_read.hpp (working copy)

 @@ -22,6 +22,7 @@

  #include <istream>
  #include <vector>
  #include <algorithm>
 +#include <codecvt>

  namespace boost { namespace property_tree { namespace json_parser
  {
 @@ -41,7 +42,10 @@

          Str name;
          Ptree root;
          std::vector<Ptree *> stack;
 + unsigned long u_surrogate;

 + context() : u_surrogate(0) {}
 +
          struct a_object_s
          {
              context &c;
 @@ -146,8 +150,53 @@

              a_unicode(context &c): c(c) { }
              void operator()(unsigned long u) const
              {
 - u = (std::min)(u, static_cast<unsigned
 long>((std::numeric_limits<Ch>::max)()));
 - c.string += Ch(u);
 + typedef typename make_unsigned<Ch>::type UCh;
 + if (long(std::numeric_limits<UCh>::max()) >= 0xFFFF)
 + {
 + if ((c.u_surrogate == 0) && (0xD7FF < u && u <
 0xE000))
 + {
 + c.u_surrogate = u;
 + }
 + else if (c.u_surrogate)
 + {
 + c.string += Ch((std::min)(c.u_surrogate,
 static_cast<unsigned long>((std::numeric_limits<Ch>::max)())));
 + c.u_surrogate = 0;
 + }
 +
 + u = (std::min)(u, static_cast<unsigned
 long>((std::numeric_limits<Ch>::max)()));
 + c.string += Ch(u);
 + }
 + else // Ch is one byte - encode the given Unicode code
 point as UTF-8
 + {
 + if ((c.u_surrogate == 0) && (0xD7FF < u && u <
 0xE000))
 + {
 + c.u_surrogate = u;
 + }
 + else
 + {
 + wchar_t utf16str[3] = { wchar_t(0), wchar_t(0),
 wchar_t(0) };
 + wchar_t const *from_next = utf16str;
 +
 + Ch utf8str[5] = { Ch(0), Ch(0), Ch(0), Ch(0),
 Ch(0) };
 + Ch * to_next = utf8str;
 +
 + std::size_t size = 0;
 + if (c.u_surrogate)
 + {
 + utf16str[size++] = wchar_t(c.u_surrogate);
 + c.u_surrogate = 0;
 + }
 + utf16str[size++] = wchar_t(u);
 +
 + std::mbstate_t state = 0;
 + std::codecvt_utf8_utf16<wchar_t> utf16_utf8;
 +
 + if (utf16_utf8.out(state, &utf16str[0],
 &utf16str[0] + size, from_next, &utf8str[0], &utf8str[4], to_next) != 0)
 +
 BOOST_PROPERTY_TREE_THROW(json_parser_error("write error", "", 0));
 +
 + c.string += utf8str;
 + }
 + }
              }
          };

 }}}

-- 
Ticket URL: <https://svn.boost.org/trac/boost/ticket/8883#comment:5>
Boost C++ Libraries <http://www.boost.org/>
Boost provides free peer-reviewed portable C++ source libraries.

This archive was generated by hypermail 2.1.7 : 2017-02-16 18:50:14 UTC