Subject: Re: [Boost-bugs] [Boost C++ Libraries] #8883: property_tree JSON reader does not parse unicode characters properly
From: Boost C++ Libraries (noreply_at_[hidden])
Date: 2013-09-09 21:06:22
#8883: property_tree JSON reader does not parse unicode characters properly
----------------------------------+----------------------------------------
Reporter: Ronny Krueger | Owner: cornedbee
<rk@…> | Status: new
Type: Bugs | Component: property_tree
Milestone: To Be Determined | Severity: Problem
Version: Boost 1.54.0 | Keywords: property_tree JSON unicode
Resolution: |
----------------------------------+----------------------------------------
Comment (by Ben McCart <bmccart@…>):
I accidentally pasted the reader diff twice rather than pasting the reader
and the writer. In addition, a bug in the UTF16 reader has been fixed.
(How embarrassing)
{{{
Index: json_parser_write.hpp
===================================================================
--- json_parser_write.hpp (revision 85628)
+++ json_parser_write.hpp (working copy)
@@ -10,12 +10,15 @@
#ifndef BOOST_PROPERTY_TREE_DETAIL_JSON_PARSER_WRITE_HPP_INCLUDED
#define BOOST_PROPERTY_TREE_DETAIL_JSON_PARSER_WRITE_HPP_INCLUDED
+#include <boost/cstdint.hpp>
#include <boost/property_tree/ptree.hpp>
#include <boost/next_prior.hpp>
#include <boost/type_traits/make_unsigned.hpp>
#include <string>
#include <ostream>
+#include <sstream>
#include <iomanip>
+#include <codecvt>
namespace boost { namespace property_tree { namespace json_parser
{
@@ -33,7 +36,7 @@
// We escape everything outside ASCII, because this code
can't
// handle high unicode characters.
if (*b == 0x20 || *b == 0x21 || (*b >= 0x23 && *b <= 0x2E) ||
- (*b >= 0x30 && *b <= 0x5B) || (*b >= 0x5D && *b <= 0xFF))
+ (*b >= 0x30 && *b <= 0x5B) || (*b >= 0x5D && *b <= 0x80))
result += *b;
else if (*b == Ch('\b')) result += Ch('\\'), result +=
Ch('b');
else if (*b == Ch('\f')) result += Ch('\\'), result +=
Ch('f');
@@ -44,18 +47,59 @@
else if (*b == Ch('\\')) result += Ch('\\'), result +=
Ch('\\');
else
{
- const char *hexdigits = "0123456789ABCDEF";
+ std::ostringstream oss;
typedef typename make_unsigned<Ch>::type UCh;
- unsigned long u = (std::min)(static_cast<unsigned long>(
- static_cast<UCh>(*b)),
- 0xFFFFul);
- int d1 = u / 4096; u -= d1 * 4096;
- int d2 = u / 256; u -= d2 * 256;
- int d3 = u / 16; u -= d3 * 16;
- int d4 = u;
- result += Ch('\\'); result += Ch('u');
- result += Ch(hexdigits[d1]); result += Ch(hexdigits[d2]);
- result += Ch(hexdigits[d3]); result += Ch(hexdigits[d4]);
+ if (long(std::numeric_limits<UCh>::max()) >= 0xFFFF)
+ {
+ // Assume UTF16.
+ oss << "\\u" << std::setw(4) << std::setfill('0') <<
std::hex << std::uppercase << uint16_t(*b);
+ if ((0xD7FF < uint16_t(*b)) && (uint16_t(*b) <
0xE000))
+ {
+ // Add second 16 bit value for surrogate pair
+ if (e-b == 1)
+
BOOST_PROPERTY_TREE_THROW(json_parser_error("write error", "", 0));
+
+ ++b;
+ oss << "\\u" << std::setw(4) << std::setfill('0')
<< std::hex << std::uppercase << uint16_t(*b);
+ }
+
+ result += oss.str();
+ }
+ else
+ {
+ // Assume UTF8
+ std::mbstate_t state = 0;
+ std::codecvt_utf8_utf16<wchar_t> utf16_utf8;
+
+ std::size_t in_max = std::min(e-b, 4);;
+ // Determine how many 'C' chars will be consumed to
construct a single UTF8 codepoint - this is
+ // required as codecvt.in will fail if it begins
another code point that can't be completed
+ // because the 'end' of input sequence is reached...
+ if ((*b & uint8_t(0xE0)) == uint8_t(0xC0))
+ in_max = std::min(e-b, 2);
+ else if ((*b & uint8_t(0xF0)) == uint8_t(0xE0))
+ in_max = std::min(e-b, 3);
+ else if ((*b & uint8_t(0xF8)) == uint8_t(0xF0))
+ in_max = std::min(e-b, 4);
+ else
+
BOOST_PROPERTY_TREE_THROW(json_parser_error("write error", "", 0));
+
+ Ch const *from_next = &*b;
+ wchar_t utf16str[2] = { wchar_t(0), wchar_t(0) };
+ wchar_t *to_next = utf16str;
+ if (utf16_utf8.in(state, &*b, (&*b) + in_max,
from_next, utf16str, utf16str + 2, to_next) != 0)
+
BOOST_PROPERTY_TREE_THROW(json_parser_error("write error", "", 0));
+
+ oss << "\\u" << std::setw(4) << std::setfill('0') <<
std::hex << std::uppercase << uint16_t(utf16str[0]);
+ if ( to_next - utf16str == 2)
+ {
+ // Add second 16 bit value for surrogate pair
+ oss << "\\u" << std::setw(4) << std::setfill('0')
<< std::hex << std::uppercase << uint16_t(utf16str[1]);
+ }
+ result += oss.str();
+
+ b += ((from_next - &*b) - std::size_t(1)); //
Additional incrementing for additional UTF8 chars consumed.
+ }
}
++b;
}
}}}
{{{
Index: json_parser_read.hpp
===================================================================
--- json_parser_read.hpp (revision 85628)
+++ json_parser_read.hpp (working copy)
@@ -22,6 +22,7 @@
#include <istream>
#include <vector>
#include <algorithm>
+#include <codecvt>
namespace boost { namespace property_tree { namespace json_parser
{
@@ -41,7 +42,10 @@
Str name;
Ptree root;
std::vector<Ptree *> stack;
+ unsigned long u_surrogate;
+ context() : u_surrogate(0) {}
+
struct a_object_s
{
context &c;
@@ -146,8 +150,53 @@
a_unicode(context &c): c(c) { }
void operator()(unsigned long u) const
{
- u = (std::min)(u, static_cast<unsigned
long>((std::numeric_limits<Ch>::max)()));
- c.string += Ch(u);
+ typedef typename make_unsigned<Ch>::type UCh;
+ if (long(std::numeric_limits<UCh>::max()) >= 0xFFFF)
+ {
+ if ((c.u_surrogate == 0) && (0xD7FF < u && u <
0xE000))
+ {
+ c.u_surrogate = u;
+ }
+ else if (c.u_surrogate)
+ {
+ c.string += Ch((std::min)(c.u_surrogate,
static_cast<unsigned long>((std::numeric_limits<Ch>::max)())));
+ c.u_surrogate = 0;
+ }
+
+ u = (std::min)(u, static_cast<unsigned
long>((std::numeric_limits<Ch>::max)()));
+ c.string += Ch(u);
+ }
+ else // Ch is one byte - encode the given Unicode code
point as UTF-8
+ {
+ if ((c.u_surrogate == 0) && (0xD7FF < u && u <
0xE000))
+ {
+ c.u_surrogate = u;
+ }
+ else
+ {
+ wchar_t utf16str[3] = { wchar_t(0), wchar_t(0),
wchar_t(0) };
+ wchar_t const *from_next = utf16str;
+
+ Ch utf8str[5] = { Ch(0), Ch(0), Ch(0), Ch(0),
Ch(0) };
+ Ch * to_next = utf8str;
+
+ std::size_t size = 0;
+ if (c.u_surrogate)
+ {
+ utf16str[size++] = wchar_t(c.u_surrogate);
+ c.u_surrogate = 0;
+ }
+ utf16str[size++] = wchar_t(u);
+
+ std::mbstate_t state = 0;
+ std::codecvt_utf8_utf16<wchar_t> utf16_utf8;
+
+ if (utf16_utf8.out(state, &utf16str[0],
&utf16str[0] + size, from_next, &utf8str[0], &utf8str[4], to_next) != 0)
+
BOOST_PROPERTY_TREE_THROW(json_parser_error("write error", "", 0));
+
+ c.string += utf8str;
+ }
+ }
}
};
}}}
-- Ticket URL: <https://svn.boost.org/trac/boost/ticket/8883#comment:5> Boost C++ Libraries <http://www.boost.org/> Boost provides free peer-reviewed portable C++ source libraries.
This archive was generated by hypermail 2.1.7 : 2017-02-16 18:50:14 UTC