Boost logo

Boost-Commit :

Subject: [Boost-commit] svn:boost r65943 - in trunk: boost/regex/v4 libs/regex/src libs/regex/test/regress
From: john_at_[hidden]
Date: 2010-10-13 12:53:15


Author: johnmaddock
Date: 2010-10-13 12:53:13 EDT (Wed, 13 Oct 2010)
New Revision: 65943
URL: http://svn.boost.org/trac/boost/changeset/65943

Log:
Fix code to handle multiple named-subexpressions with the same name.
Updated test cases to match.
Text files modified:
   trunk/boost/regex/v4/basic_regex.hpp | 102 ++++++++++-----------------------------
   trunk/boost/regex/v4/basic_regex_creator.hpp | 6 ++
   trunk/boost/regex/v4/basic_regex_parser.hpp | 32 +++++++++--
   trunk/boost/regex/v4/match_results.hpp | 26 +++++++--
   trunk/boost/regex/v4/perl_matcher_common.hpp | 61 ++++++++++++++++++++---
   trunk/libs/regex/src/regex_traits_defaults.cpp | 2
   trunk/libs/regex/test/regress/test_perl_ex.cpp | 11 ++++
   trunk/libs/regex/test/regress/test_replace.cpp | 6 ++
   trunk/libs/regex/test/regress/test_tricky_cases.cpp | 2
   9 files changed, 149 insertions(+), 99 deletions(-)

Modified: trunk/boost/regex/v4/basic_regex.hpp
==============================================================================
--- trunk/boost/regex/v4/basic_regex.hpp (original)
+++ trunk/boost/regex/v4/basic_regex.hpp 2010-10-13 12:53:13 EDT (Wed, 13 Oct 2010)
@@ -53,7 +53,7 @@
    if(first != last)
    {
       I next = last - 1;
- while((next != first) && !(*(next-1) < *next))
+ while((next != first) && (*next < *(next-1)))
       {
          (next-1)->swap(*next);
          --next;
@@ -61,22 +61,6 @@
    }
 }
 
-//
-// Class named_subexpressions
-// Contains information about named subexpressions within the regex.
-//
-template <class charT>
-class named_subexpressions_base
-{
-public:
- virtual int get_id(const charT* i, const charT* j)const = 0;
- virtual int get_id(std::size_t h)const = 0;
-#ifdef __GNUC__
- // warning supression:
- virtual ~named_subexpressions_base(){}
-#endif
-};
-
 template <class Iterator>
 inline std::size_t hash_value_from_capture_name(Iterator i, Iterator j)
 {
@@ -86,13 +70,14 @@
    return r;
 }
 
-template <class charT>
-class named_subexpressions : public named_subexpressions_base<charT>
+class named_subexpressions
 {
+public:
    struct name
    {
+ template <class charT>
       name(const charT* i, const charT* j, int idx)
- : /*n(i, j), */ index(idx)
+ : index(idx)
       {
          hash = hash_value_from_capture_name(i, j);
       }
@@ -100,31 +85,35 @@
          : index(idx), hash(h)
       {
       }
- //std::vector<charT> n;
       int index;
       std::size_t hash;
       bool operator < (const name& other)const
       {
- return hash < other.hash; //std::lexicographical_compare(n.begin(), n.end(), other.n.begin(), other.n.end());
+ return hash < other.hash;
       }
       bool operator == (const name& other)const
       {
- return hash == other.hash; //n == other.n;
+ return hash == other.hash;
       }
       void swap(name& other)
       {
- //n.swap(other.n);
          std::swap(index, other.index);
          std::swap(hash, other.hash);
       }
    };
-public:
+
+ typedef std::vector<name>::const_iterator const_iterator;
+ typedef std::pair<const_iterator, const_iterator> range_type;
+
    named_subexpressions(){}
+
+ template <class charT>
    void set_name(const charT* i, const charT* j, int index)
    {
       m_sub_names.push_back(name(i, j, index));
       bubble_down_one(m_sub_names.begin(), m_sub_names.end());
    }
+ template <class charT>
    int get_id(const charT* i, const charT* j)const
    {
       name t(i, j, 0);
@@ -135,72 +124,37 @@
       }
       return -1;
    }
+ template <class charT>
+ range_type equal_range(const charT* i, const charT* j)const
+ {
+ name t(i, j, 0);
+ return std::equal_range(m_sub_names.begin(), m_sub_names.end(), t);
+ }
    int get_id(std::size_t h)const
    {
       name t(h, 0);
- typename std::vector<name>::const_iterator pos = std::lower_bound(m_sub_names.begin(), m_sub_names.end(), t);
+ std::vector<name>::const_iterator pos = std::lower_bound(m_sub_names.begin(), m_sub_names.end(), t);
       if((pos != m_sub_names.end()) && (*pos == t))
       {
          return pos->index;
       }
       return -1;
    }
-private:
- std::vector<name> m_sub_names;
-};
-
-template <class charT, class Other>
-class named_subexpressions_converter : public named_subexpressions_base<charT>
-{
- boost::shared_ptr<named_subexpressions<Other> > m_converter;
-public:
- named_subexpressions_converter(boost::shared_ptr<named_subexpressions<Other> > s)
- : m_converter(s) {}
- int get_id(const charT* i, const charT* j)const
- {
- if(i == j)
- return -1;
- std::vector<Other> v;
- while(i != j)
- {
- v.push_back(*i);
- ++i;
- }
- return m_converter->get_id(&v[0], &v[0] + v.size());
- }
- int get_id(std::size_t h)const
+ range_type equal_range(std::size_t h)const
    {
- return m_converter->get_id(h);
+ name t(h, 0);
+ return std::equal_range(m_sub_names.begin(), m_sub_names.end(), t);
    }
+private:
+ std::vector<name> m_sub_names;
 };
 
-template <class To>
-inline boost::shared_ptr<named_subexpressions_base<To> > convert_to_named_subs_imp(
- boost::shared_ptr<named_subexpressions<To> > s,
- boost::integral_constant<bool,true> const&)
-{
- return s;
-}
-template <class To, class From>
-inline boost::shared_ptr<named_subexpressions_base<To> > convert_to_named_subs_imp(
- boost::shared_ptr<named_subexpressions<From> > s,
- boost::integral_constant<bool,false> const&)
-{
- return boost::shared_ptr<named_subexpressions_converter<To, From> >(new named_subexpressions_converter<To, From>(s));
-}
-template <class To, class From>
-inline boost::shared_ptr<named_subexpressions_base<To> > convert_to_named_subs(
- boost::shared_ptr<named_subexpressions<From> > s)
-{
- typedef typename boost::is_same<To, From>::type tag_type;
- return convert_to_named_subs_imp<To>(s, tag_type());
-}
 //
 // class regex_data:
 // represents the data we wish to expose to the matching algorithms.
 //
 template <class charT, class traits>
-struct regex_data : public named_subexpressions<charT>
+struct regex_data : public named_subexpressions
 {
    typedef regex_constants::syntax_option_type flag_type;
    typedef std::size_t size_type;
@@ -672,7 +626,7 @@
       BOOST_ASSERT(0 != m_pimpl.get());
       return m_pimpl->get_data();
    }
- boost::shared_ptr<re_detail::named_subexpressions<charT> > get_named_subs()const
+ boost::shared_ptr<re_detail::named_subexpressions > get_named_subs()const
    {
       return m_pimpl;
    }

Modified: trunk/boost/regex/v4/basic_regex_creator.hpp
==============================================================================
--- trunk/boost/regex/v4/basic_regex_creator.hpp (original)
+++ trunk/boost/regex/v4/basic_regex_creator.hpp 2010-10-13 12:53:13 EDT (Wed, 13 Oct 2010)
@@ -806,7 +806,13 @@
             re_syntax_base* p = base;
             std::ptrdiff_t idx = static_cast<re_jump*>(state)->alt.i;
             if(idx > 10000)
+ {
+ //
+ // There may be more than one capture group with this hash, just do what Perl
+ // does and recurse to the leftmost:
+ //
                idx = m_pdata->get_id(idx);
+ }
             while(p)
             {
                if((p->type == syntax_element_startmark) && (static_cast<re_brace*>(p)->index == idx))

Modified: trunk/boost/regex/v4/basic_regex_parser.hpp
==============================================================================
--- trunk/boost/regex/v4/basic_regex_parser.hpp (original)
+++ trunk/boost/regex/v4/basic_regex_parser.hpp 2010-10-13 12:53:13 EDT (Wed, 13 Oct 2010)
@@ -820,7 +820,11 @@
             return false;
          }
          // maybe have \g{ddd}
- if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_open_brace)
+ regex_constants::syntax_type syn = this->m_traits.syntax_type(*m_position);
+ regex_constants::syntax_type syn_end = 0;
+ if((syn == regex_constants::syntax_open_brace)
+ || (syn == regex_constants::escape_type_left_word)
+ || (syn == regex_constants::escape_type_end_buffer))
          {
             if(++m_position == m_end)
             {
@@ -828,6 +832,18 @@
                return false;
             }
             have_brace = true;
+ switch(syn)
+ {
+ case regex_constants::syntax_open_brace:
+ syn_end = regex_constants::syntax_close_brace;
+ break;
+ case regex_constants::escape_type_left_word:
+ syn_end = regex_constants::escape_type_right_word;
+ break;
+ default:
+ syn_end = regex_constants::escape_type_end_buffer;
+ break;
+ }
          }
          negative = (*m_position == static_cast<charT>('-'));
          if((negative) && (++m_position == m_end))
@@ -837,18 +853,20 @@
          }
          const charT* pc = m_position;
          int i = this->m_traits.toi(pc, m_end, 10);
- if(i < 0)
+ if((i < 0) && syn_end)
          {
- // Check for a named capture:
+ // Check for a named capture, get the leftmost one if there is more than one:
             const charT* base = m_position;
- while((m_position != m_end) && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_brace))
+ while((m_position != m_end) && (this->m_traits.syntax_type(*m_position) != syn_end))
+ {
                ++m_position;
- i = this->m_pdata->get_id(base, m_position);
+ }
+ i = hash_value_from_capture_name(base, m_position);
             pc = m_position;
          }
          if(negative)
             i = 1 + m_mark_count - i;
- if((i > 0) && (this->m_backrefs & (1u << (i-1))))
+ if(((i > 0) && (this->m_backrefs & (1u << (i-1)))) || ((i > 10000) && (this->m_pdata->get_id(i) > 0) && (this->m_backrefs & (1u << (this->m_pdata->get_id(i)-1)))))
          {
             m_position = pc;
             re_brace* pb = static_cast<re_brace*>(this->append_state(syntax_element_backref, sizeof(re_brace)));
@@ -863,7 +881,7 @@
          m_position = pc;
          if(have_brace)
          {
- if((m_position == m_end) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_brace))
+ if((m_position == m_end) || (this->m_traits.syntax_type(*m_position) != syn_end))
             {
                fail(regex_constants::error_escape, m_position - m_base, incomplete_message);
                return false;

Modified: trunk/boost/regex/v4/match_results.hpp
==============================================================================
--- trunk/boost/regex/v4/match_results.hpp (original)
+++ trunk/boost/regex/v4/match_results.hpp 2010-10-13 12:53:13 EDT (Wed, 13 Oct 2010)
@@ -38,7 +38,6 @@
 
 namespace re_detail{
 
-template <class charT>
 class named_subexpressions;
 
 }
@@ -69,7 +68,7 @@
    typedef typename re_detail::regex_iterator_traits<
                                     BidiIterator>::value_type char_type;
    typedef std::basic_string<char_type> string_type;
- typedef re_detail::named_subexpressions_base<char_type> named_sub_type;
+ typedef re_detail::named_subexpressions named_sub_type;
 
    // construct/copy/destroy:
    explicit match_results(const Allocator& a = Allocator())
@@ -225,10 +224,15 @@
    //
    const_reference named_subexpression(const char_type* i, const char_type* j) const
    {
+ //
+ // Scan for the leftmost *matched* subexpression with the specified named:
+ //
       if(m_is_singular)
          raise_logic_error();
- int index = m_named_subs->get_id(i, j);
- return index > 0 ? (*this)[index] : m_null;
+ re_detail::named_subexpressions::range_type r = m_named_subs->equal_range(i, j);
+ while((r.first != r.second) && ((*this)[r.first->index].matched == false))
+ ++r.first;
+ return r.first != r.second ? (*this)[r.first->index] : m_null;
    }
    template <class charT>
    const_reference named_subexpression(const charT* i, const charT* j) const
@@ -243,10 +247,20 @@
    }
    int named_subexpression_index(const char_type* i, const char_type* j) const
    {
+ //
+ // Scan for the leftmost *matched* subexpression with the specified named.
+ // If none found then return the leftmost expression with that name,
+ // otherwise an invalid index:
+ //
       if(m_is_singular)
          raise_logic_error();
- int index = m_named_subs->get_id(i, j);
- return index > 0 ? index : -20;
+ re_detail::named_subexpressions::range_type s, r;
+ s = r = m_named_subs->equal_range(i, j);
+ while((r.first != r.second) && ((*this)[r.first->index].matched == false))
+ ++r.first;
+ if(r.first == r.second)
+ r = s;
+ return r.first != r.second ? r.first->index : -20;
    }
    template <class charT>
    int named_subexpression_index(const charT* i, const charT* j) const

Modified: trunk/boost/regex/v4/perl_matcher_common.hpp
==============================================================================
--- trunk/boost/regex/v4/perl_matcher_common.hpp (original)
+++ trunk/boost/regex/v4/perl_matcher_common.hpp 2010-10-13 12:53:13 EDT (Wed, 13 Oct 2010)
@@ -200,7 +200,7 @@
    m_match_flags |= regex_constants::match_all;
    m_presult->set_size((m_match_flags & match_nosubs) ? 1 : re.mark_count(), search_base, last);
    m_presult->set_base(base);
- m_presult->set_named_subs(re_detail::convert_to_named_subs<typename match_results<BidiIterator>::char_type>(this->re.get_named_subs()));
+ m_presult->set_named_subs(this->re.get_named_subs());
    if(m_match_flags & match_posix)
       m_result = *m_presult;
    verify_options(re.flags(), m_match_flags);
@@ -262,7 +262,7 @@
       pstate = re.get_first_state();
       m_presult->set_size((m_match_flags & match_nosubs) ? 1 : re.mark_count(), base, last);
       m_presult->set_base(base);
- m_presult->set_named_subs(re_detail::convert_to_named_subs<typename match_results<BidiIterator>::char_type>(this->re.get_named_subs()));
+ m_presult->set_named_subs(this->re.get_named_subs());
       m_match_flags |= regex_constants::match_init;
    }
    else
@@ -588,8 +588,23 @@
    // in the match, this is in line with ECMAScript, but not Perl
    // or PCRE.
    //
- BidiIterator i = (*m_presult)[static_cast<const re_brace*>(pstate)->index].first;
- BidiIterator j = (*m_presult)[static_cast<const re_brace*>(pstate)->index].second;
+ int index = static_cast<const re_brace*>(pstate)->index;
+ if(index >= 10000)
+ {
+ named_subexpressions::range_type r = re.get_data().equal_range(index);
+ BOOST_ASSERT(r.first != r.second);
+ do
+ {
+ index = r.first->index;
+ ++r.first;
+ }while((r.first != r.second) && ((*m_presult)[index].matched != true));
+ }
+
+ if((m_match_flags & match_perl) && !(*m_presult)[index].matched)
+ return false;
+
+ BidiIterator i = (*m_presult)[index].first;
+ BidiIterator j = (*m_presult)[index].second;
    while(i != j)
    {
       if((position == last) || (traits_inst.translate(*position, icase) != traits_inst.translate(*i, icase)))
@@ -713,7 +728,7 @@
 {
    // return true if marked sub-expression N has been matched:
    int index = static_cast<const re_brace*>(pstate)->index;
- bool result;
+ bool result = false;
    if(index == 9999)
    {
       // Magic value for a (DEFINE) block:
@@ -721,11 +736,25 @@
    }
    else if(index > 0)
    {
+ // Have we matched subexpression "index"?
       // Check if index is a hash value:
       if(index >= 10000)
- index = re.get_data().get_id(index);
- // Have we matched subexpression "index"?
- result = (*m_presult)[index].matched;
+ {
+ named_subexpressions::range_type r = re.get_data().equal_range(index);
+ while(r.first != r.second)
+ {
+ if((*m_presult)[r.first->index].matched)
+ {
+ result = true;
+ break;
+ }
+ ++r.first;
+ }
+ }
+ else
+ {
+ result = (*m_presult)[index].matched;
+ }
       pstate = pstate->next.p;
    }
    else
@@ -734,8 +763,20 @@
       // If index == 0 then check for any recursion at all, otherwise for recursion to -index-1.
       int idx = -index-1;
       if(idx >= 10000)
- idx = re.get_data().get_id(idx);
- result = !recursion_stack.empty() && ((recursion_stack.back().idx == idx) || (index == 0));
+ {
+ named_subexpressions::range_type r = re.get_data().equal_range(idx);
+ int stack_index = recursion_stack.empty() ? -1 : recursion_stack.back().idx;
+ while(r.first != r.second)
+ {
+ result |= (stack_index == r.first->index);
+ if(result)break;
+ ++r.first;
+ }
+ }
+ else
+ {
+ result = !recursion_stack.empty() && ((recursion_stack.back().idx == idx) || (index == 0));
+ }
       pstate = pstate->next.p;
    }
    return result;

Modified: trunk/libs/regex/src/regex_traits_defaults.cpp
==============================================================================
--- trunk/libs/regex/src/regex_traits_defaults.cpp (original)
+++ trunk/libs/regex/src/regex_traits_defaults.cpp 2010-10-13 12:53:13 EDT (Wed, 13 Oct 2010)
@@ -100,7 +100,7 @@
          "p",
          "P",
          "N",
- "g",
+ "gk",
          "K",
          "R",
    };

Modified: trunk/libs/regex/test/regress/test_perl_ex.cpp
==============================================================================
--- trunk/libs/regex/test/regress/test_perl_ex.cpp (original)
+++ trunk/libs/regex/test/regress/test_perl_ex.cpp 2010-10-13 12:53:13 EDT (Wed, 13 Oct 2010)
@@ -907,5 +907,16 @@
    // Bugs:
    TEST_REGEX_SEARCH("namespace\\s+(\\w+)\\s+(\\{(?:[^{}]*(?:(?2)[^{}]*)*)?\\})", perl, "namespace one { namespace two { int foo(); } }", match_default, make_array(0, 46, 10, 13, 14, 46, -2, -2));
    TEST_REGEX_SEARCH("namespace\\s+(\\w+)\\s+(\\{(?:[^{}]*(?:(?2)[^{}]*)*)?\\})", perl, "namespace one { namespace two { int foo(){} } { {{{ } } } } {}}", match_default, make_array(0, 64, 10, 13, 14, 64, -2, -2));
+
+ // Recursion to a named sub with a name that is used multiple times:
+ TEST_REGEX_SEARCH("(?:(?<A>a+)|(?<A>b+))\\.(?&A)", perl, "aaaa.aa", match_default, make_array(0, 7, 0, 4, -1, -1, -2, -2));
+ TEST_REGEX_SEARCH("(?:(?<A>a+)|(?<A>b+))\\.(?&A)", perl, "bbbb.aa", match_default, make_array(0, 7, -1, -1, 0, 4, -2, -2));
+ TEST_REGEX_SEARCH("(?:(?<A>a+)|(?<A>b+))\\.(?&A)", perl, "bbbb.bb", match_default, make_array(-2, -2));
+ // Back reference to a named sub with a name that is used multiple times:
+ TEST_REGEX_SEARCH("(?:(?<A>a+)|(?<A>b+))\\.\\k<A>", perl, "aaaa.aaaa", match_default, make_array(0, 9, 0, 4, -1, -1, -2, -2));
+ TEST_REGEX_SEARCH("(?:(?<A>a+)|(?<A>b+))\\.\\k<A>", perl, "bbbb.aaaa", match_default, make_array(-2, -2));
+ TEST_REGEX_SEARCH("(?:(?<A>a+)|(?<A>b+))\\.\\k<A>", perl, "aaaa.bbbb", match_default, make_array(-2, -2));
+ TEST_REGEX_SEARCH("(?:(?<A>a+)|(?<A>b+))\\.\\k<A>", perl, "bbbb.bbbb", match_default, make_array(0, 9, -1, -1, 0, 4, -2, -2));
+ TEST_REGEX_SEARCH("(?:(?<A>a+)|(?<A>b+)|c+)\\.\\k<A>", perl, "cccc.cccc", match_default, make_array(-2, -2));
 }
 

Modified: trunk/libs/regex/test/regress/test_replace.cpp
==============================================================================
--- trunk/libs/regex/test/regress/test_replace.cpp (original)
+++ trunk/libs/regex/test/regress/test_replace.cpp 2010-10-13 12:53:13 EDT (Wed, 13 Oct 2010)
@@ -184,5 +184,11 @@
    TEST_REGEX_REPLACE("(?<one>a+)(?<two>b+)", perl, " ...aabb,,", match_default|format_no_copy, "$2", "bb");
    TEST_REGEX_REPLACE("(?<one>a+)(?<two>b+)", perl, " ...aabb,,", match_default|format_no_copy, "d$+{one}c", "daac");
    TEST_REGEX_REPLACE("(?<one>a+)(?<two>b+)", perl, " ...aabb,,", match_default|format_no_copy, "c$+{two}d", "cbbd");
+
+ TEST_REGEX_REPLACE("(?<one>a)(?<one>b)(c)", perl, " ...abc,,", match_default, "$1.$2.$3.$+{one}", " ...a.b.c.a,,");
+ TEST_REGEX_REPLACE("(?:(?<one>a)|(?<one>b))", perl, " ...a,,", match_default, "$1.$2.$+{one}", " ...a..a,,");
+ TEST_REGEX_REPLACE("(?:(?<one>a)|(?<one>b))", perl, " ...b,,", match_default, "$1.$2.$+{one}", " ....b.b,,");
+ TEST_REGEX_REPLACE("(?:(?<one>a)(?<one>b))", perl, " ...ab,,", match_default, "$1.$2.$+{one}", " ...a.b.a,,");
+
 }
 

Modified: trunk/libs/regex/test/regress/test_tricky_cases.cpp
==============================================================================
--- trunk/libs/regex/test/regress/test_tricky_cases.cpp (original)
+++ trunk/libs/regex/test/regress/test_tricky_cases.cpp 2010-10-13 12:53:13 EDT (Wed, 13 Oct 2010)
@@ -49,7 +49,7 @@
    TEST_REGEX_SEARCH("(a)d|(b)c", perl, "abc", match_default, make_array(1, 3, -1, -1, 1, 2, -2, -2));
    TEST_REGEX_SEARCH("_+((www)|(ftp)|(mailto)):_*", perl, "_wwwnocolon _mailto:", match_default, make_array(12, 20, 13, 19, -1, -1, -1, -1, 13, 19, -2, -2));
    // subtleties of matching
- TEST_REGEX_SEARCH("a(b)?c\\1d", perl, "acd", match_default, make_array(0, 3, -1, -1, -2, -2));
+ TEST_REGEX_SEARCH("a(b)?c\\1d", perl, "acd", match_default, make_array(-2, -2));
    TEST_REGEX_SEARCH("a(b?c)+d", perl, "accd", match_default, make_array(0, 4, 2, 3, -2, -2));
    TEST_REGEX_SEARCH("(wee|week)(knights|night)", perl, "weeknights", match_default, make_array(0, 10, 0, 3, 3, 10, -2, -2));
    TEST_REGEX_SEARCH(".*", perl, "abc", match_default, make_array(0, 3, -2, 3, 3, -2, -2));


Boost-Commit list run by bdawes at acm.org, david.abrahams at rcn.com, gregod at cs.rpi.edu, cpdaniel at pacbell.net, john at johnmaddock.co.uk