Boost logo

Boost-Commit :

From: hughwimberly_at_[hidden]
Date: 2007-07-15 03:19:10


Author: hughwimberly
Date: 2007-07-15 03:19:10 EDT (Sun, 15 Jul 2007)
New Revision: 7433
URL: http://svn.boost.org/trac/boost/changeset/7433

Log:
Support for named captures, including retrieval and backreferences. Will not work for ICU, and untested on wide characters.

Added:
   sandbox/SOC/2007/regex/named-captures/libs/regex/test/regress/test_named_captures.hpp
Text files modified:
   sandbox/SOC/2007/regex/named-captures/boost/regex/v4/basic_regex_parser.hpp | 80 ++++++++++++++++++++++++++++++++++++++-
   sandbox/SOC/2007/regex/named-captures/boost/regex/v4/perl_matcher_common.hpp | 1
   2 files changed, 78 insertions(+), 3 deletions(-)

Modified: sandbox/SOC/2007/regex/named-captures/boost/regex/v4/basic_regex_parser.hpp
==============================================================================
--- sandbox/SOC/2007/regex/named-captures/boost/regex/v4/basic_regex_parser.hpp (original)
+++ sandbox/SOC/2007/regex/named-captures/boost/regex/v4/basic_regex_parser.hpp 2007-07-15 03:19:10 EDT (Sun, 15 Jul 2007)
@@ -1611,9 +1611,8 @@
       return false;
    }
    //
- // treat comments as a special case, as these
- // are the only ones that don't start with a leading
- // startmark state:
+ // treat comments and named back references as special cases, as these
+ // are the only ones that don't start with a leading startmark state:
    //
    if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_hash)
    {
@@ -1622,6 +1621,37 @@
       {}
       return true;
    }
+ if(this->m_traits.syntax_type(*m_position) == regex_constants::escape_type_not_property &&
+ (m_position + 1) != m_end && this->m_traits.syntax_type(*(m_position+1)) == regex_constants::syntax_equal)
+ {
+ ++m_position; // now on the syntax_equal character; the guard above ensures this is not m_end
+ // parse named backreference (?P=name): the name is everything up to the closing mark
+ BOOST_ASSERT(m_position != m_end);
+ const charT* name_start = ++m_position;
+ while (m_position != m_end && this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark)
+ ++m_position;
+ if(m_position == m_end)
+ {
+ this->fail(regex_constants::error_paren, m_position - m_base); // ran out of pattern before the closing mark
+ return false;
+ }
+ std::basic_string<charT> group_name(name_start, m_position);
+
+ typename std::map<std::basic_string<charT>, int>::iterator name_ref = this->m_pdata->p_name_map->find(group_name);
+ if (name_ref != this->m_pdata->p_name_map->end() && (this->m_backrefs & (1u << (name_ref->second-1))))
+ {
+ re_brace* pb = static_cast<re_brace*>(this->append_state(syntax_element_backref, sizeof(re_brace))); // backref state for the mapped mark
+ pb->index = name_ref->second;
+ }
+ else // name not in the map, or its bit is not set in m_backrefs
+ {
+ fail(regex_constants::error_backref, m_position - m_base); // same offset convention as the other fail() calls here
+ return false;
+ }
+ ++m_position; // skip the closing mark
+ return true;
+ }
+
    //
    // backup some state, and prepare the way:
    //
@@ -1768,6 +1798,50 @@
    case regex_constants::syntax_close_mark:
       fail(regex_constants::error_badrepeat, m_position - m_base);
       return false;
+ case regex_constants::escape_type_not_property: // NOTE(review): presumably the traits classify 'P' under this value - confirm
+ // check for named capture syntax; anything other than a literal 'P' falls through to default
+ if (*m_position == charT('P'))
+ {
+ ++m_position;
+ if (m_position != m_end && this->m_traits.syntax_type(*m_position) == regex_constants::escape_type_left_word)
+ {
+ // parse (?P<name>pattern)
+ // find the end bracket ">"; every character before it is part of the name
+ const charT* name_start = ++m_position;
+ while (m_position != m_end &&
+ this->m_traits.syntax_type(*m_position) != regex_constants::escape_type_right_word)
+ ++m_position;
+ if(m_position == m_end)
+ {
+ this->fail(regex_constants::error_paren, m_position - m_base);
+ return false;
+ }
+ std::basic_string<charT> group_name(name_start, m_position);
+
+ if (this->m_pdata->p_name_map->count(group_name) > 0)
+ {
+ // the capture name has already been used, so reject the pattern
+ fail(regex_constants::error_bad_pattern, m_position - m_base);
+ return false;
+ }
+ pb->index = markid = ++m_mark_count;
+ // record the name -> markid association in the name map
+ this->m_pdata->p_name_map->insert(std::pair<std::basic_string<charT>, int>(group_name, markid));
+
+ // allow backrefs to this mark (only marks that fit in the m_backrefs bit mask):
+ if((markid > 0) && (markid < sizeof(unsigned) * CHAR_BIT))
+ this->m_backrefs |= 1u << (markid - 1);
+
+ ++m_position; // step past the closing ">"
+ }
+ else // pattern ended after "P", or "P" is not followed by the opening "<"
+ {
+ fail(regex_constants::error_bad_pattern, m_position - m_base);
+ return false;
+ }
+ break;
+ }
+ //otherwise, fall through
    default:
       //
       // lets assume that we have a (?imsx) group and try and parse it:

Modified: sandbox/SOC/2007/regex/named-captures/boost/regex/v4/perl_matcher_common.hpp
==============================================================================
--- sandbox/SOC/2007/regex/named-captures/boost/regex/v4/perl_matcher_common.hpp (original)
+++ sandbox/SOC/2007/regex/named-captures/boost/regex/v4/perl_matcher_common.hpp 2007-07-15 03:19:10 EDT (Sun, 15 Jul 2007)
@@ -64,6 +64,7 @@
    }
    else
       m_presult = &m_result;
+ m_presult->set_name_map(e.get_name_map());
 #ifdef BOOST_REGEX_NON_RECURSIVE
    m_stack_base = 0;
    m_backup_state = 0;

Added: sandbox/SOC/2007/regex/named-captures/libs/regex/test/regress/test_named_captures.hpp
==============================================================================
--- (empty file)
+++ sandbox/SOC/2007/regex/named-captures/libs/regex/test/regress/test_named_captures.hpp 2007-07-15 03:19:10 EDT (Sun, 15 Jul 2007)
@@ -0,0 +1,83 @@
+
+#include <iostream>
+
+using namespace std;
+using namespace boost;
+
+void test_named_captures(const string& pattern, const string& text) // builds a regex from pattern and prints whether regex_match accepts text
+{
+ regex ex(pattern);
+ smatch matches;
+ if (regex_match(text, matches, ex, match_extra))
+ cout << pattern << " matches " << text << endl;
+ else
+ cout << pattern << " does not match " << text << endl;
+}
+
+void test_named_groups() // runs test_named_captures over patterns that declare (?P<name>...) groups
+{
+ test_named_captures("(?P<alpha>a)", "a");
+ test_named_captures("(?P<alpha>abc)+", "abcabc");
+ test_named_captures("(?P<alpha>a+)(b+)", "aaabbb");
+ test_named_captures("(a+)(?P<beta>b+)", "aaabbb");
+ test_named_captures("(?P<outer>(a+)b+)", "aaabbb");
+ test_named_captures("(?P<outer>a+(b+))", "aaabbb");
+ test_named_captures("(a+(?P<inner>b+))", "aaabbb");
+ test_named_captures("(?P<outer>a+(?P<inner>b+))", "aaabbb");
+ test_named_captures("a+(?P<inSiDe>b+)b+", "aaadzabbb"); // should fail (no match expected)
+ test_named_captures("(a)(?P<alt>b|$)", "ab");
+ test_named_captures("(a)(?P<alt>b|$)", "a");
+}
+
+void test_named_retrieval() // checks that sub-matches can be read back by group name via matches["name"]
+{
+ smatch matches;
+ regex ad("got (?P<drink>\\w+)\\?$");
+ string text = "got milk?";
+ if (regex_match(text, matches, ad, match_extra) && matches["drink"] == "milk")
+ cout << "passed ad\n";
+ else
+ cout << "failed ad\n";
+
+ regex magic("(?P<first>abra)ca(?P<second>dabra)!");
+ text = "abracadabra!";
+ if (regex_match(text, matches, magic, match_extra) && matches["first"] == "abra" && matches["second"] == "dabra")
+ cout << "passed magic\n";
+ else
+ cout << "failed magic\n";
+
+ regex poem("(?P<first>.*), (\\w+) (?P<second>.*) D.*");
+ text = "Twas brillig, and the slithy toves Did gyre and gimble in the wabe";
+ if (regex_match(text, matches, poem, match_extra) &&
+ matches["first"] == "Twas brillig" && matches["second"] == "the slithy toves" &&
+ matches["second"] == matches[3]) // "second" is the third group in the pattern, so name and index lookup must agree
+ cout << "passed poem\n";
+ else
+ cout << "failed poem\n";
+}
+
+void test_named_backrefs() // runs test_named_captures over patterns that use (?P=name) backreferences
+{
+ test_named_captures("(?P<alpha>a)bc(?P=alpha)", "abca");
+ test_named_captures("(?P<alpha>a+)bc(?P=alpha)", "aaabcaaa");
+ test_named_captures("(?P<alpha>a+)bc(?P=alpha)", "aaabca"); // should fail (no match expected)
+ test_named_captures("a(?P<alpha>b|d)(?P=alpha)", "abb");
+ test_named_captures("a(?P<alpha>b|d)(?P=alpha)", "add");
+ test_named_captures("a(?P<alpha>b|d)(?P=alpha)", "abd"); // should fail (no match expected)
+ test_named_captures("(?P<name>\\w+) (.*) (?P=name)", "Jameson R. Jameson");
+ test_named_captures("(a+(?P<inner>b+))(?P=inner)", "aabbbb");
+ test_named_captures("(a+(?P<inner>b+))c(?P=inner)", "aabcb");
+ test_named_captures("(a+(?P<inner>b+))(?P=inner)", "aabbb"); // should fail (no match expected)
+ test_named_captures("(?P<recursive>a+(?P=recursive))(b+)", "aaabbb"); // should fail (no match expected)
+ test_named_captures("(?P<recursive>a+(?P=recursive))(b+)", "aaabbb"); // should fail; NOTE(review): identical to the previous call - confirm intent
+ // the two below should (and do) raise runtime errors while the pattern is being compiled, so they stay disabled
+ //test_named_captures("(?P=alpha)", "abca"); //should fail
+ //test_named_captures("(?P=<alpha>.*)(?=beta)", "abab"); //should fail
+}
+
+void run_experimental() // runs the three named-capture test groups in order
+{
+ test_named_groups();
+ test_named_retrieval();
+ test_named_backrefs();
+}
\ No newline at end of file


Boost-Commit list run by bdawes at acm.org, david.abrahams at rcn.com, gregod at cs.rpi.edu, cpdaniel at pacbell.net, john at johnmaddock.co.uk