Boost logo

Boost :

From: David Bien (davidbien_at_[hidden])
Date: 2000-04-19 18:28:35


I have written an optimizing lexical analyzer generator template
library.

The idea is to create a set of regular expressions as local variables
using a regular expression operator namespace. This gets compiled
with the lexical analyzer template library, and some commands to
generate a set of N lexical analyzers(DFAs). The resulting executable
is run to actually generate the DFA's as a single .h and .cpp file.

stuff:
- templatized by character type - char and wchar_t have been tested
   any character-like type should fine - with a small amount of work
- Regular expression specification given via use of namespace
   overriding global operators.
- no virtuals necessary in generated lexical analyzer ( through use
   of templatized member functions )
- produces optimized DFA using algorthm by Aho/Sethi/Ullman"Compilers"
- support for '-' (exclude pattern) operator
- support for "completed by" operator
- lookahead support
- rule/action disambiguation support
- support for "trigger" actions - actions embedded in a regular
   expression that fire during the recognition of a pattern
   ( different concept than tokenization ).
- support for "lex" style pattern completion actions though not given
   in a "lex" manner, but via declaration of a unique action template
- NO pattern rejection support - though I think it could be added -
   comments in code about where it might need modification to
   implement
- written in ANSI C++, compiles on intel4.0 and gcc2.95.2.

The operators ( chosen cuz their relative precedence is the same as
that of the corresponding regular expression operator ):
| : "or"
* : concatenation
~ : kleene closure
-- : (predec) zero or one
++ : (preinc) one or more
- : "excludes" ( pattern exclusion )
+ : "completed by"
/ : lookahead operator

Here is a small usage example, with the generated DFA afterwards:

USE_REGEXP_OP_NAMESPACE
typedef wchar_t _TyCharTokens;
typedef _regexp_final< _TyCharTokens,
                       _TyAllocator > _TyRegExp;
#define l(x) literal< _TyCharTokens >(x)
#define ls(x) litstr< _TyCharTokens >(x)
#define lr(x,y) litrange< _TyCharTokens >(x,y)
#define t(a) _TyTrigger(a)

_TyRegExp _CharNoMinus = l(0x09) | l(0x0a) | l(0x0d)
  | lr(0x020,0x02c) | lr(0x02e,0xd7ff) | lr(0xe000,0xfffd);
_TyRegExp Comment = ls(L"<!--") * ~(_CharNoMinus | (l(L'-')
  * _CharNoMinus)) * ls(L"-->");

Comment.SetAction( _l_action_token< _TyCharTokens, 98 >() );

typedef _dfa< _TyCharTokens, _TyAllocator > _TyDfa;
typedef _TyDfa::_TyDfaCtxt _TyDfaCtxt;

_TyDfa dfaComment;
_TyDfaCtxt dctxtComment( dfaComment );
// regexp->NFA->DFA->optimized DFA:
gen_dfa( Comment, dfaComment, dctxtComment );
_l_generator< _TyDfa, char >
  gen( dfaComment, dctxtComment, "_testdfa.h",
       "testdfa.cpp", "TESTDFA", true, "ns_testdfa",
       "_l_token_analyzer< _TyAnalyzer >", "state",
       "startComment", "action", "wchar_t", "L'", "'" );
gen.generate();

Here is the generated DFA ( header excluded ):

typedef _l_state< wchar_t, 1, false, false, 0, 0 > _Tystate44;
_Tystate44 startComment = { 0, 1, 0, 0, 0, 0,
  {
    { L'<', L'<', (_TyStateProto*)( & state_45 ) }
  } };
typedef _l_state< wchar_t, 1, false, false, 0, 0 > _Tystate45;
_Tystate45 state_45 = { 0, 1, 0, 0, 0, 0,
  {
    { L'!', L'!', (_TyStateProto*)( & state_46 ) }
  } };
typedef _l_state< wchar_t, 1, false, false, 0, 0 > _Tystate46;
_Tystate46 state_46 = { 0, 1, 0, 0, 0, 0,
  {
    { L'-', L'-', (_TyStateProto*)( & state_47 ) }
  } };
typedef _l_state< wchar_t, 1, false, false, 0, 0 > _Tystate47;
_Tystate47 state_47 = { 0, 1, 0, 0, 0, 0,
  {
    { L'-', L'-', (_TyStateProto*)( & state_48 ) }
  } };
typedef _l_state< wchar_t, 6, false, false, 0, 0 > _Tystate48;
_Tystate48 state_48 = { 0, 6, 0, 0, 0, 0,
  {
    { 0x09, 0x0a, (_TyStateProto*)( & state_48 ) },
    { 0x0d, 0x0d, (_TyStateProto*)( & state_48 ) },
    { L' ', L',', (_TyStateProto*)( & state_48 ) },
    { L'-', L'-', (_TyStateProto*)( & state_49 ) },
    { L'.', 0x0d7ff, (_TyStateProto*)( & state_48 ) },
    { 0x0e000, 0x0fffd, (_TyStateProto*)( & state_48 ) }
  } };
typedef _l_state< wchar_t, 6, false, false, 0, 0 > _Tystate49;
_Tystate49 state_49 = { 0, 6, 0, 0, 0, 0,
  {
    { 0x09, 0x0a, (_TyStateProto*)( & state_48 ) },
    { 0x0d, 0x0d, (_TyStateProto*)( & state_48 ) },
    { L' ', L',', (_TyStateProto*)( & state_48 ) },
    { L'-', L'-', (_TyStateProto*)( & state_50 ) },
    { L'.', 0x0d7ff, (_TyStateProto*)( & state_48 ) },
    { 0x0e000, 0x0fffd, (_TyStateProto*)( & state_48 ) }
  } };
typedef _l_state< wchar_t, 1, false, false, 0, 0 > _Tystate50;
_Tystate50 state_50 = { 0, 1, 0, 0, 0, 0,
  {
    { L'>', L'>', (_TyStateProto*)( & state_51 ) }
  } };
typedef _l_state< wchar_t, 0, true, false, 0, 0 > _Tystate51;
_Tystate51 state_51 = { kucAccept, 0, 0, 0,
  offsetof( _Tystate51, m_pmfnAccept ), 0,
static_cast<_TyAnalyzerBase::_TyPMFnAccept>(&_TyAnalyzer::Action41)};

_l_action_token< wchar_t, 98, false > action41;
bool _TyAnalyzer::Action41()
{
  return action41.action< _TyMDAnalyzer >(
    static_cast< _TyMDAnalyzer & >( *this ) );
}


Boost list run by bdawes at acm.org, gregod at cs.rpi.edu, cpdaniel at pacbell.net, john at johnmaddock.co.uk