Boost logo

Boost-Commit:

Subject: [Boost-commit] svn:boost r81614 - in trunk: boost/regex/pending libs/regex/test libs/regex/test/unicode
From: john_at_[hidden]
Date: 2012-11-28 12:57:29


Author: johnmaddock
Date: 2012-11-28 12:57:26 EST (Wed, 28 Nov 2012)
New Revision: 81614
URL: http://svn.boost.org/trac/boost/changeset/81614

Log:
Add further error checking to UTF-8 decoding.
Fixes #7744.
Text files modified:
   trunk/boost/regex/pending/unicode_iterator.hpp | 24 ++++++++++++++++++++++--
   trunk/libs/regex/test/Jamfile.v2 | 4 ++--
   trunk/libs/regex/test/unicode/unicode_iterator_test.cpp | 5 +++++
   3 files changed, 29 insertions(+), 4 deletions(-)

Modified: trunk/boost/regex/pending/unicode_iterator.hpp
==============================================================================
--- trunk/boost/regex/pending/unicode_iterator.hpp (original)
+++ trunk/boost/regex/pending/unicode_iterator.hpp 2012-11-28 12:57:26 EST (Wed, 28 Nov 2012)
@@ -520,9 +520,26 @@
    }
    void increment()
    {
+ // We must not start with a continuation character:
+ if((static_cast<boost::uint8_t>(*m_position) & 0xC0) == 0x80)
+ invalid_sequence();
       // skip high surrogate first if there is one:
       unsigned c = detail::utf8_byte_count(*m_position);
- std::advance(m_position, c);
+ if(m_value == pending_read)
+ {
+ // Since we haven't read in a value, we need to validate the code points:
+ for(unsigned i = 0; i < c; ++i)
+ {
+ ++m_position;
+ // We must have a continuation byte:
+ if((i != c - 1) && ((static_cast<boost::uint8_t>(*m_position) & 0xC0) != 0x80))
+ invalid_sequence();
+ }
+ }
+ else
+ {
+ std::advance(m_position, c);
+ }
       m_value = pending_read;
    }
    void decrement()
@@ -589,7 +606,7 @@
       // we must not have a continuation character:
       if((m_value & 0xC0u) == 0x80u)
          invalid_sequence();
- // see how many extra byts we have:
+ // see how many extra bytes we have:
       unsigned extra = detail::utf8_trailing_byte_count(*m_position);
       // extract the extra bits, 6 from each extra byte:
       BaseIterator next(m_position);
@@ -597,6 +614,9 @@
       {
          ++next;
          m_value <<= 6;
+ // We must have a continuation byte:
+ if((static_cast<boost::uint8_t>(*next) & 0xC0) != 0x80)
+ invalid_sequence();
          m_value += static_cast<boost::uint8_t>(*next) & 0x3Fu;
       }
       // we now need to remove a few of the leftmost bits, but how many depends

Modified: trunk/libs/regex/test/Jamfile.v2
==============================================================================
--- trunk/libs/regex/test/Jamfile.v2 (original)
+++ trunk/libs/regex/test/Jamfile.v2 2012-11-28 12:57:26 EST (Wed, 28 Nov 2012)
@@ -124,8 +124,8 @@
             ../build//boost_regex
       ]
       
- [ run unicode/unicode_iterator_test.cpp ../build//boost_regex : : : <define>TEST_UTF8 : unicode_iterator_test_utf8 ]
- [ run unicode/unicode_iterator_test.cpp ../build//boost_regex : : : <define>TEST_UTF16 : unicode_iterator_test_utf16 ]
+ [ run unicode/unicode_iterator_test.cpp ../build//boost_regex : : : release <define>TEST_UTF8 : unicode_iterator_test_utf8 ]
+ [ run unicode/unicode_iterator_test.cpp ../build//boost_regex : : : release <define>TEST_UTF16 : unicode_iterator_test_utf16 ]
       [ run static_mutex/static_mutex_test.cpp
             ../../thread/build//boost_thread ../build//boost_regex
       ]

Modified: trunk/libs/regex/test/unicode/unicode_iterator_test.cpp
==============================================================================
--- trunk/libs/regex/test/unicode/unicode_iterator_test.cpp (original)
+++ trunk/libs/regex/test/unicode/unicode_iterator_test.cpp 2012-11-28 12:57:26 EST (Wed, 28 Nov 2012)
@@ -103,6 +103,11 @@
    BOOST_CHECK_THROW(boost::u16_to_u32_iterator<const boost::uint16_t*>(bad_seq2, bad_seq2, bad_seq2 + 5), std::out_of_range);
    BOOST_CHECK_THROW(boost::u16_to_u32_iterator<const boost::uint16_t*>(bad_seq2 + 1, bad_seq2 + 1, bad_seq2 + 6), std::out_of_range);
    BOOST_CHECK_THROW(boost::u16_to_u32_iterator<const boost::uint16_t*>(bad_seq2 + 1, bad_seq2, bad_seq2 + 6), std::out_of_range);
+
+ boost::uint8_t bad_seq3[5] = { '.', '*', 0xe4, '.', '*' };
+ BOOST_CHECK_THROW(iterate_over(boost::u8_to_u32_iterator<const boost::uint8_t*>(bad_seq3, bad_seq3, bad_seq3 + 5), boost::u8_to_u32_iterator<const boost::uint8_t*>(bad_seq3 + 5, bad_seq3, bad_seq3 + 5)), std::out_of_range);
+ boost::uint8_t bad_seq4[5] = { '.', '*', 0xf6, '.', '*' };
+ BOOST_CHECK_THROW(iterate_over(boost::u8_to_u32_iterator<const boost::uint8_t*>(bad_seq4, bad_seq4, bad_seq4 + 5), boost::u8_to_u32_iterator<const boost::uint8_t*>(bad_seq4 + 5, bad_seq4, bad_seq4 + 5)), std::out_of_range);
 }
 
 void test(const std::vector< ::boost::uint32_t>& v)


Boost-Commit list run by bdawes at acm.org, david.abrahams at rcn.com, gregod at cs.rpi.edu, cpdaniel at pacbell.net, john at johnmaddock.co.uk