|
Boost-Commit : |
Subject: [Boost-commit] svn:boost r65943 - in trunk: boost/regex/v4 libs/regex/src libs/regex/test/regress
From: john_at_[hidden]
Date: 2010-10-13 12:53:15
Author: johnmaddock
Date: 2010-10-13 12:53:13 EDT (Wed, 13 Oct 2010)
New Revision: 65943
URL: http://svn.boost.org/trac/boost/changeset/65943
Log:
Fix code to handle multiple named-subexpressions with the same name.
Updated test cases to match.
Text files modified:
trunk/boost/regex/v4/basic_regex.hpp | 102 ++++++++++-----------------------------
trunk/boost/regex/v4/basic_regex_creator.hpp | 6 ++
trunk/boost/regex/v4/basic_regex_parser.hpp | 32 +++++++++--
trunk/boost/regex/v4/match_results.hpp | 26 +++++++--
trunk/boost/regex/v4/perl_matcher_common.hpp | 61 ++++++++++++++++++++---
trunk/libs/regex/src/regex_traits_defaults.cpp | 2
trunk/libs/regex/test/regress/test_perl_ex.cpp | 11 ++++
trunk/libs/regex/test/regress/test_replace.cpp | 6 ++
trunk/libs/regex/test/regress/test_tricky_cases.cpp | 2
9 files changed, 149 insertions(+), 99 deletions(-)
Modified: trunk/boost/regex/v4/basic_regex.hpp
==============================================================================
--- trunk/boost/regex/v4/basic_regex.hpp (original)
+++ trunk/boost/regex/v4/basic_regex.hpp 2010-10-13 12:53:13 EDT (Wed, 13 Oct 2010)
@@ -53,7 +53,7 @@
if(first != last)
{
I next = last - 1;
- while((next != first) && !(*(next-1) < *next))
+ while((next != first) && (*next < *(next-1)))
{
(next-1)->swap(*next);
--next;
@@ -61,22 +61,6 @@
}
}
-//
-// Class named_subexpressions
-// Contains information about named subexpressions within the regex.
-//
-template <class charT>
-class named_subexpressions_base
-{
-public:
- virtual int get_id(const charT* i, const charT* j)const = 0;
- virtual int get_id(std::size_t h)const = 0;
-#ifdef __GNUC__
- // warning supression:
- virtual ~named_subexpressions_base(){}
-#endif
-};
-
template <class Iterator>
inline std::size_t hash_value_from_capture_name(Iterator i, Iterator j)
{
@@ -86,13 +70,14 @@
return r;
}
-template <class charT>
-class named_subexpressions : public named_subexpressions_base<charT>
+class named_subexpressions
{
+public:
struct name
{
+ template <class charT>
name(const charT* i, const charT* j, int idx)
- : /*n(i, j), */ index(idx)
+ : index(idx)
{
hash = hash_value_from_capture_name(i, j);
}
@@ -100,31 +85,35 @@
: index(idx), hash(h)
{
}
- //std::vector<charT> n;
int index;
std::size_t hash;
bool operator < (const name& other)const
{
- return hash < other.hash; //std::lexicographical_compare(n.begin(), n.end(), other.n.begin(), other.n.end());
+ return hash < other.hash;
}
bool operator == (const name& other)const
{
- return hash == other.hash; //n == other.n;
+ return hash == other.hash;
}
void swap(name& other)
{
- //n.swap(other.n);
std::swap(index, other.index);
std::swap(hash, other.hash);
}
};
-public:
+
+ typedef std::vector<name>::const_iterator const_iterator;
+ typedef std::pair<const_iterator, const_iterator> range_type;
+
named_subexpressions(){}
+
+ template <class charT>
void set_name(const charT* i, const charT* j, int index)
{
m_sub_names.push_back(name(i, j, index));
bubble_down_one(m_sub_names.begin(), m_sub_names.end());
}
+ template <class charT>
int get_id(const charT* i, const charT* j)const
{
name t(i, j, 0);
@@ -135,72 +124,37 @@
}
return -1;
}
+ template <class charT>
+ range_type equal_range(const charT* i, const charT* j)const
+ {
+ name t(i, j, 0);
+ return std::equal_range(m_sub_names.begin(), m_sub_names.end(), t);
+ }
int get_id(std::size_t h)const
{
name t(h, 0);
- typename std::vector<name>::const_iterator pos = std::lower_bound(m_sub_names.begin(), m_sub_names.end(), t);
+ std::vector<name>::const_iterator pos = std::lower_bound(m_sub_names.begin(), m_sub_names.end(), t);
if((pos != m_sub_names.end()) && (*pos == t))
{
return pos->index;
}
return -1;
}
-private:
- std::vector<name> m_sub_names;
-};
-
-template <class charT, class Other>
-class named_subexpressions_converter : public named_subexpressions_base<charT>
-{
- boost::shared_ptr<named_subexpressions<Other> > m_converter;
-public:
- named_subexpressions_converter(boost::shared_ptr<named_subexpressions<Other> > s)
- : m_converter(s) {}
- int get_id(const charT* i, const charT* j)const
- {
- if(i == j)
- return -1;
- std::vector<Other> v;
- while(i != j)
- {
- v.push_back(*i);
- ++i;
- }
- return m_converter->get_id(&v[0], &v[0] + v.size());
- }
- int get_id(std::size_t h)const
+ range_type equal_range(std::size_t h)const
{
- return m_converter->get_id(h);
+ name t(h, 0);
+ return std::equal_range(m_sub_names.begin(), m_sub_names.end(), t);
}
+private:
+ std::vector<name> m_sub_names;
};
-template <class To>
-inline boost::shared_ptr<named_subexpressions_base<To> > convert_to_named_subs_imp(
- boost::shared_ptr<named_subexpressions<To> > s,
- boost::integral_constant<bool,true> const&)
-{
- return s;
-}
-template <class To, class From>
-inline boost::shared_ptr<named_subexpressions_base<To> > convert_to_named_subs_imp(
- boost::shared_ptr<named_subexpressions<From> > s,
- boost::integral_constant<bool,false> const&)
-{
- return boost::shared_ptr<named_subexpressions_converter<To, From> >(new named_subexpressions_converter<To, From>(s));
-}
-template <class To, class From>
-inline boost::shared_ptr<named_subexpressions_base<To> > convert_to_named_subs(
- boost::shared_ptr<named_subexpressions<From> > s)
-{
- typedef typename boost::is_same<To, From>::type tag_type;
- return convert_to_named_subs_imp<To>(s, tag_type());
-}
//
// class regex_data:
// represents the data we wish to expose to the matching algorithms.
//
template <class charT, class traits>
-struct regex_data : public named_subexpressions<charT>
+struct regex_data : public named_subexpressions
{
typedef regex_constants::syntax_option_type flag_type;
typedef std::size_t size_type;
@@ -672,7 +626,7 @@
BOOST_ASSERT(0 != m_pimpl.get());
return m_pimpl->get_data();
}
- boost::shared_ptr<re_detail::named_subexpressions<charT> > get_named_subs()const
+ boost::shared_ptr<re_detail::named_subexpressions > get_named_subs()const
{
return m_pimpl;
}
Modified: trunk/boost/regex/v4/basic_regex_creator.hpp
==============================================================================
--- trunk/boost/regex/v4/basic_regex_creator.hpp (original)
+++ trunk/boost/regex/v4/basic_regex_creator.hpp 2010-10-13 12:53:13 EDT (Wed, 13 Oct 2010)
@@ -806,7 +806,13 @@
re_syntax_base* p = base;
std::ptrdiff_t idx = static_cast<re_jump*>(state)->alt.i;
if(idx > 10000)
+ {
+ //
+ // There may be more than one capture group with this hash, just do what Perl
+ // does and recurse to the leftmost:
+ //
idx = m_pdata->get_id(idx);
+ }
while(p)
{
if((p->type == syntax_element_startmark) && (static_cast<re_brace*>(p)->index == idx))
Modified: trunk/boost/regex/v4/basic_regex_parser.hpp
==============================================================================
--- trunk/boost/regex/v4/basic_regex_parser.hpp (original)
+++ trunk/boost/regex/v4/basic_regex_parser.hpp 2010-10-13 12:53:13 EDT (Wed, 13 Oct 2010)
@@ -820,7 +820,11 @@
return false;
}
// maybe have \g{ddd}
- if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_open_brace)
+ regex_constants::syntax_type syn = this->m_traits.syntax_type(*m_position);
+ regex_constants::syntax_type syn_end = 0;
+ if((syn == regex_constants::syntax_open_brace)
+ || (syn == regex_constants::escape_type_left_word)
+ || (syn == regex_constants::escape_type_end_buffer))
{
if(++m_position == m_end)
{
@@ -828,6 +832,18 @@
return false;
}
have_brace = true;
+ switch(syn)
+ {
+ case regex_constants::syntax_open_brace:
+ syn_end = regex_constants::syntax_close_brace;
+ break;
+ case regex_constants::escape_type_left_word:
+ syn_end = regex_constants::escape_type_right_word;
+ break;
+ default:
+ syn_end = regex_constants::escape_type_end_buffer;
+ break;
+ }
}
negative = (*m_position == static_cast<charT>('-'));
if((negative) && (++m_position == m_end))
@@ -837,18 +853,20 @@
}
const charT* pc = m_position;
int i = this->m_traits.toi(pc, m_end, 10);
- if(i < 0)
+ if((i < 0) && syn_end)
{
- // Check for a named capture:
+ // Check for a named capture, get the leftmost one if there is more than one:
const charT* base = m_position;
- while((m_position != m_end) && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_brace))
+ while((m_position != m_end) && (this->m_traits.syntax_type(*m_position) != syn_end))
+ {
++m_position;
- i = this->m_pdata->get_id(base, m_position);
+ }
+ i = hash_value_from_capture_name(base, m_position);
pc = m_position;
}
if(negative)
i = 1 + m_mark_count - i;
- if((i > 0) && (this->m_backrefs & (1u << (i-1))))
+ if(((i > 0) && (this->m_backrefs & (1u << (i-1)))) || ((i > 10000) && (this->m_pdata->get_id(i) > 0) && (this->m_backrefs & (1u << (this->m_pdata->get_id(i)-1)))))
{
m_position = pc;
re_brace* pb = static_cast<re_brace*>(this->append_state(syntax_element_backref, sizeof(re_brace)));
@@ -863,7 +881,7 @@
m_position = pc;
if(have_brace)
{
- if((m_position == m_end) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_brace))
+ if((m_position == m_end) || (this->m_traits.syntax_type(*m_position) != syn_end))
{
fail(regex_constants::error_escape, m_position - m_base, incomplete_message);
return false;
Modified: trunk/boost/regex/v4/match_results.hpp
==============================================================================
--- trunk/boost/regex/v4/match_results.hpp (original)
+++ trunk/boost/regex/v4/match_results.hpp 2010-10-13 12:53:13 EDT (Wed, 13 Oct 2010)
@@ -38,7 +38,6 @@
namespace re_detail{
-template <class charT>
class named_subexpressions;
}
@@ -69,7 +68,7 @@
typedef typename re_detail::regex_iterator_traits<
BidiIterator>::value_type char_type;
typedef std::basic_string<char_type> string_type;
- typedef re_detail::named_subexpressions_base<char_type> named_sub_type;
+ typedef re_detail::named_subexpressions named_sub_type;
// construct/copy/destroy:
explicit match_results(const Allocator& a = Allocator())
@@ -225,10 +224,15 @@
//
const_reference named_subexpression(const char_type* i, const char_type* j) const
{
+ //
+ // Scan for the leftmost *matched* subexpression with the specified named:
+ //
if(m_is_singular)
raise_logic_error();
- int index = m_named_subs->get_id(i, j);
- return index > 0 ? (*this)[index] : m_null;
+ re_detail::named_subexpressions::range_type r = m_named_subs->equal_range(i, j);
+ while((r.first != r.second) && ((*this)[r.first->index].matched == false))
+ ++r.first;
+ return r.first != r.second ? (*this)[r.first->index] : m_null;
}
template <class charT>
const_reference named_subexpression(const charT* i, const charT* j) const
@@ -243,10 +247,20 @@
}
int named_subexpression_index(const char_type* i, const char_type* j) const
{
+ //
+ // Scan for the leftmost *matched* subexpression with the specified named.
+ // If none found then return the leftmost expression with that name,
+ // otherwise an invalid index:
+ //
if(m_is_singular)
raise_logic_error();
- int index = m_named_subs->get_id(i, j);
- return index > 0 ? index : -20;
+ re_detail::named_subexpressions::range_type s, r;
+ s = r = m_named_subs->equal_range(i, j);
+ while((r.first != r.second) && ((*this)[r.first->index].matched == false))
+ ++r.first;
+ if(r.first == r.second)
+ r = s;
+ return r.first != r.second ? r.first->index : -20;
}
template <class charT>
int named_subexpression_index(const charT* i, const charT* j) const
Modified: trunk/boost/regex/v4/perl_matcher_common.hpp
==============================================================================
--- trunk/boost/regex/v4/perl_matcher_common.hpp (original)
+++ trunk/boost/regex/v4/perl_matcher_common.hpp 2010-10-13 12:53:13 EDT (Wed, 13 Oct 2010)
@@ -200,7 +200,7 @@
m_match_flags |= regex_constants::match_all;
m_presult->set_size((m_match_flags & match_nosubs) ? 1 : re.mark_count(), search_base, last);
m_presult->set_base(base);
- m_presult->set_named_subs(re_detail::convert_to_named_subs<typename match_results<BidiIterator>::char_type>(this->re.get_named_subs()));
+ m_presult->set_named_subs(this->re.get_named_subs());
if(m_match_flags & match_posix)
m_result = *m_presult;
verify_options(re.flags(), m_match_flags);
@@ -262,7 +262,7 @@
pstate = re.get_first_state();
m_presult->set_size((m_match_flags & match_nosubs) ? 1 : re.mark_count(), base, last);
m_presult->set_base(base);
- m_presult->set_named_subs(re_detail::convert_to_named_subs<typename match_results<BidiIterator>::char_type>(this->re.get_named_subs()));
+ m_presult->set_named_subs(this->re.get_named_subs());
m_match_flags |= regex_constants::match_init;
}
else
@@ -588,8 +588,23 @@
// in the match, this is in line with ECMAScript, but not Perl
// or PCRE.
//
- BidiIterator i = (*m_presult)[static_cast<const re_brace*>(pstate)->index].first;
- BidiIterator j = (*m_presult)[static_cast<const re_brace*>(pstate)->index].second;
+ int index = static_cast<const re_brace*>(pstate)->index;
+ if(index >= 10000)
+ {
+ named_subexpressions::range_type r = re.get_data().equal_range(index);
+ BOOST_ASSERT(r.first != r.second);
+ do
+ {
+ index = r.first->index;
+ ++r.first;
+ }while((r.first != r.second) && ((*m_presult)[index].matched != true));
+ }
+
+ if((m_match_flags & match_perl) && !(*m_presult)[index].matched)
+ return false;
+
+ BidiIterator i = (*m_presult)[index].first;
+ BidiIterator j = (*m_presult)[index].second;
while(i != j)
{
if((position == last) || (traits_inst.translate(*position, icase) != traits_inst.translate(*i, icase)))
@@ -713,7 +728,7 @@
{
// return true if marked sub-expression N has been matched:
int index = static_cast<const re_brace*>(pstate)->index;
- bool result;
+ bool result = false;
if(index == 9999)
{
// Magic value for a (DEFINE) block:
@@ -721,11 +736,25 @@
}
else if(index > 0)
{
+ // Have we matched subexpression "index"?
// Check if index is a hash value:
if(index >= 10000)
- index = re.get_data().get_id(index);
- // Have we matched subexpression "index"?
- result = (*m_presult)[index].matched;
+ {
+ named_subexpressions::range_type r = re.get_data().equal_range(index);
+ while(r.first != r.second)
+ {
+ if((*m_presult)[r.first->index].matched)
+ {
+ result = true;
+ break;
+ }
+ ++r.first;
+ }
+ }
+ else
+ {
+ result = (*m_presult)[index].matched;
+ }
pstate = pstate->next.p;
}
else
@@ -734,8 +763,20 @@
// If index == 0 then check for any recursion at all, otherwise for recursion to -index-1.
int idx = -index-1;
if(idx >= 10000)
- idx = re.get_data().get_id(idx);
- result = !recursion_stack.empty() && ((recursion_stack.back().idx == idx) || (index == 0));
+ {
+ named_subexpressions::range_type r = re.get_data().equal_range(idx);
+ int stack_index = recursion_stack.empty() ? -1 : recursion_stack.back().idx;
+ while(r.first != r.second)
+ {
+ result |= (stack_index == r.first->index);
+ if(result)break;
+ ++r.first;
+ }
+ }
+ else
+ {
+ result = !recursion_stack.empty() && ((recursion_stack.back().idx == idx) || (index == 0));
+ }
pstate = pstate->next.p;
}
return result;
Modified: trunk/libs/regex/src/regex_traits_defaults.cpp
==============================================================================
--- trunk/libs/regex/src/regex_traits_defaults.cpp (original)
+++ trunk/libs/regex/src/regex_traits_defaults.cpp 2010-10-13 12:53:13 EDT (Wed, 13 Oct 2010)
@@ -100,7 +100,7 @@
"p",
"P",
"N",
- "g",
+ "gk",
"K",
"R",
};
Modified: trunk/libs/regex/test/regress/test_perl_ex.cpp
==============================================================================
--- trunk/libs/regex/test/regress/test_perl_ex.cpp (original)
+++ trunk/libs/regex/test/regress/test_perl_ex.cpp 2010-10-13 12:53:13 EDT (Wed, 13 Oct 2010)
@@ -907,5 +907,16 @@
// Bugs:
TEST_REGEX_SEARCH("namespace\\s+(\\w+)\\s+(\\{(?:[^{}]*(?:(?2)[^{}]*)*)?\\})", perl, "namespace one { namespace two { int foo(); } }", match_default, make_array(0, 46, 10, 13, 14, 46, -2, -2));
TEST_REGEX_SEARCH("namespace\\s+(\\w+)\\s+(\\{(?:[^{}]*(?:(?2)[^{}]*)*)?\\})", perl, "namespace one { namespace two { int foo(){} } { {{{ } } } } {}}", match_default, make_array(0, 64, 10, 13, 14, 64, -2, -2));
+
+ // Recursion to a named sub with a name that is used multiple times:
+ TEST_REGEX_SEARCH("(?:(?<A>a+)|(?<A>b+))\\.(?&A)", perl, "aaaa.aa", match_default, make_array(0, 7, 0, 4, -1, -1, -2, -2));
+ TEST_REGEX_SEARCH("(?:(?<A>a+)|(?<A>b+))\\.(?&A)", perl, "bbbb.aa", match_default, make_array(0, 7, -1, -1, 0, 4, -2, -2));
+ TEST_REGEX_SEARCH("(?:(?<A>a+)|(?<A>b+))\\.(?&A)", perl, "bbbb.bb", match_default, make_array(-2, -2));
+ // Back reference to a named sub with a name that is used multiple times:
+ TEST_REGEX_SEARCH("(?:(?<A>a+)|(?<A>b+))\\.\\k<A>", perl, "aaaa.aaaa", match_default, make_array(0, 9, 0, 4, -1, -1, -2, -2));
+ TEST_REGEX_SEARCH("(?:(?<A>a+)|(?<A>b+))\\.\\k<A>", perl, "bbbb.aaaa", match_default, make_array(-2, -2));
+ TEST_REGEX_SEARCH("(?:(?<A>a+)|(?<A>b+))\\.\\k<A>", perl, "aaaa.bbbb", match_default, make_array(-2, -2));
+ TEST_REGEX_SEARCH("(?:(?<A>a+)|(?<A>b+))\\.\\k<A>", perl, "bbbb.bbbb", match_default, make_array(0, 9, -1, -1, 0, 4, -2, -2));
+ TEST_REGEX_SEARCH("(?:(?<A>a+)|(?<A>b+)|c+)\\.\\k<A>", perl, "cccc.cccc", match_default, make_array(-2, -2));
}
Modified: trunk/libs/regex/test/regress/test_replace.cpp
==============================================================================
--- trunk/libs/regex/test/regress/test_replace.cpp (original)
+++ trunk/libs/regex/test/regress/test_replace.cpp 2010-10-13 12:53:13 EDT (Wed, 13 Oct 2010)
@@ -184,5 +184,11 @@
TEST_REGEX_REPLACE("(?<one>a+)(?<two>b+)", perl, " ...aabb,,", match_default|format_no_copy, "$2", "bb");
TEST_REGEX_REPLACE("(?<one>a+)(?<two>b+)", perl, " ...aabb,,", match_default|format_no_copy, "d$+{one}c", "daac");
TEST_REGEX_REPLACE("(?<one>a+)(?<two>b+)", perl, " ...aabb,,", match_default|format_no_copy, "c$+{two}d", "cbbd");
+
+ TEST_REGEX_REPLACE("(?<one>a)(?<one>b)(c)", perl, " ...abc,,", match_default, "$1.$2.$3.$+{one}", " ...a.b.c.a,,");
+ TEST_REGEX_REPLACE("(?:(?<one>a)|(?<one>b))", perl, " ...a,,", match_default, "$1.$2.$+{one}", " ...a..a,,");
+ TEST_REGEX_REPLACE("(?:(?<one>a)|(?<one>b))", perl, " ...b,,", match_default, "$1.$2.$+{one}", " ....b.b,,");
+ TEST_REGEX_REPLACE("(?:(?<one>a)(?<one>b))", perl, " ...ab,,", match_default, "$1.$2.$+{one}", " ...a.b.a,,");
+
}
Modified: trunk/libs/regex/test/regress/test_tricky_cases.cpp
==============================================================================
--- trunk/libs/regex/test/regress/test_tricky_cases.cpp (original)
+++ trunk/libs/regex/test/regress/test_tricky_cases.cpp 2010-10-13 12:53:13 EDT (Wed, 13 Oct 2010)
@@ -49,7 +49,7 @@
TEST_REGEX_SEARCH("(a)d|(b)c", perl, "abc", match_default, make_array(1, 3, -1, -1, 1, 2, -2, -2));
TEST_REGEX_SEARCH("_+((www)|(ftp)|(mailto)):_*", perl, "_wwwnocolon _mailto:", match_default, make_array(12, 20, 13, 19, -1, -1, -1, -1, 13, 19, -2, -2));
// subtleties of matching
- TEST_REGEX_SEARCH("a(b)?c\\1d", perl, "acd", match_default, make_array(0, 3, -1, -1, -2, -2));
+ TEST_REGEX_SEARCH("a(b)?c\\1d", perl, "acd", match_default, make_array(-2, -2));
TEST_REGEX_SEARCH("a(b?c)+d", perl, "accd", match_default, make_array(0, 4, 2, 3, -2, -2));
TEST_REGEX_SEARCH("(wee|week)(knights|night)", perl, "weeknights", match_default, make_array(0, 10, 0, 3, 3, 10, -2, -2));
TEST_REGEX_SEARCH(".*", perl, "abc", match_default, make_array(0, 3, -2, 3, 3, -2, -2));
Boost-Commit list run by bdawes at acm.org, david.abrahams at rcn.com, gregod at cs.rpi.edu, cpdaniel at pacbell.net, john at johnmaddock.co.uk