Boost logo

Boost :

From: Daniel Berlin (dan_at_[hidden])
Date: 2001-06-08 23:01:22


John Max Skaller <skaller_at_[hidden]> writes:

> joel de guzman wrote:
>
>> 1. Why is compilation times bounded by lexical analysis?
>
> Because finding a lexeme requires scanning characters.
> There are lots of them. Even a very fast lexer has some
> per character overhead.
>
>> 2. What are the performance bottle-necks typically encountered?
>
> Old fashioned lexers use buffers. It is very expensive
> to test for end of buffer. On modern equipment, you can just
> load the whole file into memory and add a 'null' at the end,
> this speeds up advancing to the next character a huge amount.
>
> Still, you have to do a lookup into a rather large
> transition table. That typically guarrantees a physical
> memory access (clobbers the cache). So a 100Mhz processor
> is suddenly running at 10Mhz.
>
> For example, the Felix lexer:
>
> ocamllex.opt src/flx_lex.mll
> 208 states, 3174 transitions, table size 13944 bytes
>
> Obviously, 13K is bigger than the primary processor cache.

Try a scanner generator that generates directly executable
code, like re2c.
It's often *much* smaller, and *much* faster, than flex.
And it's the same amount of actual effort.

>
>> 3. What can be done about it?
>
> Often, a hand written lexer will be faster than
> anything else.
Same answer.
Just because flex is slow, doesn't mean everything else is.

You can make scanner generators that generate directly executable
scanners (IE output the real C code, rather than tables and an
interpreter), and you aren't going to make a faster hand coded one,
unless you know something you can't tell the scanner generator that
helps you.

This is rare.

From the following scanner specification:

(It's a severely hacked up re2c example, to avoid being too large in
lines of code to post (it compiles to something a lot smaller, obviously), so
excuse the style. I din't code it, i just shortened it)

#include <stdlib.h>
#include <stdio.h>
#include <string.h>

#define COLONCOLON 318
#define COLON 317
#define IDENT 316
#define NUM 315
#define EOI 319

typedef unsigned int uint;
typedef unsigned char uchar;

#define BSIZE 8192

#define YYCTYPE uchar
#define YYCURSOR cursor
#define YYLIMIT s->lim
#define YYMARKER s->ptr
#define YYFILL(n) {cursor = fill(s, cursor);}

#define RET(i) {s->cur = cursor; return i;}

typedef struct Scanner {
    int fd;
    uchar *bot, *tok, *ptr, *cur, *pos, *lim, *top, *eof;
    uint line;
} Scanner;

uchar *fill(Scanner *s, uchar *cursor){
    if(!s->eof){
        uint cnt = s->tok - s->bot;
        if(cnt){
            memcpy(s->bot, s->tok, s->lim - s->tok);
            s->tok = s->bot;
            s->ptr -= cnt;
            cursor -= cnt;
            s->pos -= cnt;
            s->lim -= cnt;
        }
        if((s->top - s->lim) < BSIZE){
            uchar *buf = (uchar*) malloc(((s->lim - s->bot) + BSIZE)*sizeof(uchar));
            memcpy(buf, s->tok, s->lim - s->tok);
            s->tok = buf;
            s->ptr = &buf[s->ptr - s->bot];
            cursor = &buf[cursor - s->bot];
            s->pos = &buf[s->pos - s->bot];
            s->lim = &buf[s->lim - s->bot];
            s->top = &s->lim[BSIZE];
            free(s->bot);
            s->bot = buf;
        }
        if((cnt = read(s->fd, (char*) s->lim, BSIZE)) != BSIZE){
            s->eof = &s->lim[cnt]; *(s->eof)++ = '\n';
        }
        s->lim += cnt;
    }
    return cursor;
}

int scan(Scanner *s){
        uchar *cursor = s->cur;
std:
        s->tok = cursor;
/*!re2c
any = [\000-\377];
O = [0-7];
D = [0-9];
L = [a-zA-Z_];
H = [a-fA-F0-9];
E = [Ee] [+-]? D+;
FS = [fFlL];
IS = [uUlL]*;
ESC = [\\] ([abfnrtv?'"\\] | "x" H+ | O+);
*/

/*!re2c
        
        L (L|D)* { RET(ID); }
        
        D+ { RET(NUM); }
        [ \t\v\f]+ { goto std; }

        "\n"
            {
                if(cursor == s->eof) RET(EOI);
                s->pos = cursor; s->line++;
                goto std;
            }

        any
            {
                printf("unexpected character: %c\n", *s->tok);
                goto std;
            }
*/

}

main(){
    Scanner in;
    int t;
    memset((char*) &in, 0, sizeof(in));
    in.fd = 0;
    while((t = scan(&in)) != EOI){
/*
        printf("%d\t%.*s\n", t, in.cur - in.tok, in.tok);
        printf("%d\n", t);
*/
    }
    close(in.fd);
}

From that, you get, with the -s option, to use binary searches instead
of direct switch statements most of the time, you get:

/* Generated by re2c 0.9.1 on Fri Jun 8 23:53:33 2001 */
#line 1 "test.re"
#include <stdlib.h>
#include <stdio.h>
#include <string.h>

#define COLONCOLON 318
#define COLON 317
#define IDENT 316
#define NUM 315
#define EOI 319

typedef unsigned int uint;
typedef unsigned char uchar;

#define BSIZE 8192

#define YYCTYPE uchar
#define YYCURSOR cursor
#define YYLIMIT s->lim
#define YYMARKER s->ptr
#define YYFILL(n) {cursor = fill(s, cursor);}

#define RET(i) {s->cur = cursor; return i;}

typedef struct Scanner {
    int fd;
    uchar *bot, *tok, *ptr, *cur, *pos, *lim, *top, *eof;
    uint line;
} Scanner;

uchar *fill(Scanner *s, uchar *cursor){
    if(!s->eof){
        uint cnt = s->tok - s->bot;
        if(cnt){
            memcpy(s->bot, s->tok, s->lim - s->tok);
            s->tok = s->bot;
            s->ptr -= cnt;
            cursor -= cnt;
            s->pos -= cnt;
            s->lim -= cnt;
        }
        if((s->top - s->lim) < BSIZE){
            uchar *buf = (uchar*) malloc(((s->lim - s->bot) + BSIZE)*sizeof(uchar));
            memcpy(buf, s->tok, s->lim - s->tok);
            s->tok = buf;
            s->ptr = &buf[s->ptr - s->bot];
            cursor = &buf[cursor - s->bot];
            s->pos = &buf[s->pos - s->bot];
            s->lim = &buf[s->lim - s->bot];
            s->top = &s->lim[BSIZE];
            free(s->bot);
            s->bot = buf;
        }
        if((cnt = read(s->fd, (char*) s->lim, BSIZE)) != BSIZE){
            s->eof = &s->lim[cnt]; *(s->eof)++ = '\n';
        }
        s->lim += cnt;
    }
    return cursor;
}

int scan(Scanner *s){
        uchar *cursor = s->cur;
std:
        s->tok = cursor;
#line 75

{
        YYCTYPE yych;
        unsigned int yyaccept;
        goto yy0;
yy1: ++YYCURSOR;
yy0:
        if((YYLIMIT - YYCURSOR) < 2) YYFILL(2);
        yych = *YYCURSOR;
        if(yych <= '/'){
                if(yych <= '\n'){
                        if(yych <= '\b') goto yy10;
                        if(yych <= '\t') goto yy6;
                        goto yy8;
                } else {
                        if(yych <= '\f') goto yy6;
                        if(yych == ' ') goto yy6;
                        goto yy10;
                }
        } else {
                if(yych <= '^'){
                        if(yych <= '9') goto yy4;
                        if(yych <= '@') goto yy10;
                        if(yych >= '[') goto yy10;
                } else {
                        if(yych == '`') goto yy10;
                        if(yych >= '{') goto yy10;
                }
        }
yy2: yych = *++YYCURSOR;
        goto yy17;
yy3:
#line 79
        { RET(ID); }
yy4: yych = *++YYCURSOR;
        goto yy15;
yy5:
#line 81
        { RET(NUM); }
yy6: yych = *++YYCURSOR;
        goto yy13;
yy7:
#line 82
        { goto std; }
yy8: yych = *++YYCURSOR;
yy9:
#line 85
        {
                if(cursor == s->eof) RET(EOI);
                s->pos = cursor; s->line++;
                goto std;
            }
yy10: yych = *++YYCURSOR;
yy11:
#line 92
        {
                printf("unexpected character: %c\n", *s->tok);
                goto std;
            }
yy12: ++YYCURSOR;
        if(YYLIMIT == YYCURSOR) YYFILL(1);
        yych = *YYCURSOR;
yy13: if(yych <= '\n'){
                if(yych == '\t') goto yy12;
                goto yy7;
        } else {
                if(yych <= '\f') goto yy12;
                if(yych == ' ') goto yy12;
                goto yy7;
        }
yy14: ++YYCURSOR;
        if(YYLIMIT == YYCURSOR) YYFILL(1);
        yych = *YYCURSOR;
yy15: if(yych <= '/') goto yy5;
        if(yych <= '9') goto yy14;
        goto yy5;
yy16: ++YYCURSOR;
        if(YYLIMIT == YYCURSOR) YYFILL(1);
        yych = *YYCURSOR;
yy17: if(yych <= 'Z'){
                if(yych <= '/') goto yy3;
                if(yych <= '9') goto yy16;
                if(yych <= '@') goto yy3;
                goto yy16;
        } else {
                if(yych <= '_'){
                        if(yych <= '^') goto yy3;
                        goto yy16;
                } else {
                        if(yych <= '`') goto yy3;
                        if(yych <= 'z') goto yy16;
                        goto yy3;
                }
        }
}
#line 96

}

main(){
    Scanner in;
    int t;
    memset((char*) &in, 0, sizeof(in));
    in.fd = 0;
    while((t = scan(&in)) != EOI){
/*
        printf("%d\t%.*s\n", t, in.cur - in.tok, in.tok);
        printf("%d\n", t);
*/
    }
    close(in.fd);
}

Without the -s option, you get:

/* Generated by re2c 0.5 on Fri Jun 8 23:53:39 2001 */
#line 1 "test.re"
#include <stdlib.h>
#include <stdio.h>
#include <string.h>

#define COLONCOLON 318
#define COLON 317
#define IDENT 316
#define NUM 315
#define EOI 319

typedef unsigned int uint;
typedef unsigned char uchar;

#define BSIZE 8192

#define YYCTYPE uchar
#define YYCURSOR cursor
#define YYLIMIT s->lim
#define YYMARKER s->ptr
#define YYFILL(n) {cursor = fill(s, cursor);}

#define RET(i) {s->cur = cursor; return i;}

typedef struct Scanner {
    int fd;
    uchar *bot, *tok, *ptr, *cur, *pos, *lim, *top, *eof;
    uint line;
} Scanner;

uchar *fill(Scanner *s, uchar *cursor){
    if(!s->eof){
        uint cnt = s->tok - s->bot;
        if(cnt){
            memcpy(s->bot, s->tok, s->lim - s->tok);
            s->tok = s->bot;
            s->ptr -= cnt;
            cursor -= cnt;
            s->pos -= cnt;
            s->lim -= cnt;
        }
        if((s->top - s->lim) < BSIZE){
            uchar *buf = (uchar*) malloc(((s->lim - s->bot) + BSIZE)*sizeof(uchar));
            memcpy(buf, s->tok, s->lim - s->tok);
            s->tok = buf;
            s->ptr = &buf[s->ptr - s->bot];
            cursor = &buf[cursor - s->bot];
            s->pos = &buf[s->pos - s->bot];
            s->lim = &buf[s->lim - s->bot];
            s->top = &s->lim[BSIZE];
            free(s->bot);
            s->bot = buf;
        }
        if((cnt = read(s->fd, (char*) s->lim, BSIZE)) != BSIZE){
            s->eof = &s->lim[cnt]; *(s->eof)++ = '\n';
        }
        s->lim += cnt;
    }
    return cursor;
}

int scan(Scanner *s){
        uchar *cursor = s->cur;
std:
        s->tok = cursor;
#line 75

{
        YYCTYPE yych;
        unsigned int yyaccept;
        goto yy0;
yy1: ++YYCURSOR;
yy0:
        if((YYLIMIT - YYCURSOR) < 2) YYFILL(2);
        yych = *YYCURSOR;
        switch(yych){
        case '\t': case '\v':
        case '\f': case ' ': goto yy6;
        case '\n': goto yy8;
        case '0':
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
        case '8':
        case '9': goto yy4;
        case 'A':
        case 'B':
        case 'C':
        case 'D':
        case 'E':
        case 'F':
        case 'G':
        case 'H':
        case 'I':
        case 'J':
        case 'K':
        case 'L':
        case 'M':
        case 'N':
        case 'O':
        case 'P':
        case 'Q':
        case 'R':
        case 'S':
        case 'T':
        case 'U':
        case 'V':
        case 'W':
        case 'X':
        case 'Y':
        case 'Z': case '_': case 'a':
        case 'b':
        case 'c':
        case 'd':
        case 'e':
        case 'f':
        case 'g':
        case 'h':
        case 'i':
        case 'j':
        case 'k':
        case 'l':
        case 'm':
        case 'n':
        case 'o':
        case 'p':
        case 'q':
        case 'r':
        case 's':
        case 't':
        case 'u':
        case 'v':
        case 'w':
        case 'x':
        case 'y':
        case 'z': goto yy2;
        default: goto yy10;
        }
yy2: yych = *++YYCURSOR;
        goto yy17;
yy3:
#line 79
        { RET(ID); }
yy4: yych = *++YYCURSOR;
        goto yy15;
yy5:
#line 81
        { RET(NUM); }
yy6: yych = *++YYCURSOR;
        goto yy13;
yy7:
#line 82
        { goto std; }
yy8: yych = *++YYCURSOR;
yy9:
#line 85
        {
                if(cursor == s->eof) RET(EOI);
                s->pos = cursor; s->line++;
                goto std;
            }
yy10: yych = *++YYCURSOR;
yy11:
#line 92
        {
                printf("unexpected character: %c\n", *s->tok);
                goto std;
            }
yy12: ++YYCURSOR;
        if(YYLIMIT == YYCURSOR) YYFILL(1);
        yych = *YYCURSOR;
yy13: switch(yych){
        case '\t': case '\v':
        case '\f': case ' ': goto yy12;
        default: goto yy7;
        }
yy14: ++YYCURSOR;
        if(YYLIMIT == YYCURSOR) YYFILL(1);
        yych = *YYCURSOR;
yy15: switch(yych){
        case '0':
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
        case '8':
        case '9': goto yy14;
        default: goto yy5;
        }
yy16: ++YYCURSOR;
        if(YYLIMIT == YYCURSOR) YYFILL(1);
        yych = *YYCURSOR;
yy17: switch(yych){
        case '0':
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
        case '8':
        case '9': case 'A':
        case 'B':
        case 'C':
        case 'D':
        case 'E':
        case 'F':
        case 'G':
        case 'H':
        case 'I':
        case 'J':
        case 'K':
        case 'L':
        case 'M':
        case 'N':
        case 'O':
        case 'P':
        case 'Q':
        case 'R':
        case 'S':
        case 'T':
        case 'U':
        case 'V':
        case 'W':
        case 'X':
        case 'Y':
        case 'Z': case '_': case 'a':
        case 'b':
        case 'c':
        case 'd':
        case 'e':
        case 'f':
        case 'g':
        case 'h':
        case 'i':
        case 'j':
        case 'k':
        case 'l':
        case 'm':
        case 'n':
        case 'o':
        case 'p':
        case 'q':
        case 'r':
        case 's':
        case 't':
        case 'u':
        case 'v':
        case 'w':
        case 'x':
        case 'y':
        case 'z': goto yy16;
        default: goto yy3;
        }
}
#line 96

}

main(){
    Scanner in;
    int t;
    memset((char*) &in, 0, sizeof(in));
    in.fd = 0;
    while((t = scan(&in)) != EOI){
/*
        printf("%d\t%.*s\n", t, in.cur - in.tok, in.tok);
        printf("%d\n", t);
*/
    }
    close(in.fd);
}

etc.

There is another option to generate small bitmaps for larger classes,
and use those in addition to the other two above (IE you can combine
-sb, or just use -b).

Which will generate the fastest code of course is dependent on your
compiler. However, I doubt you could code a scanner significantly
faster than the above, unless your compiler is horrible about gotos or
something.

--Dan

-- 
"I went to a restaurant that serves "breakfast at any time."  So
I ordered French Toast during the Renaissance.
"-Steven Wright

Boost list run by bdawes at acm.org, gregod at cs.rpi.edu, cpdaniel at pacbell.net, john at johnmaddock.co.uk