|
Boost : |
From: Daniel Berlin (dan_at_[hidden])
Date: 2001-06-08 23:01:22
John Max Skaller <skaller_at_[hidden]> writes:
> joel de guzman wrote:
>
>> 1. Why is compilation times bounded by lexical analysis?
>
> Because finding a lexeme requires scanning characters.
> There are lots of them. Even a very fast lexer has some
> per character overhead.
>
>> 2. What are the performance bottle-necks typically encountered?
>
> Old fashioned lexers use buffers. It is very expensive
> to test for end of buffer. On modern equipment, you can just
> load the whole file into memory and add a 'null' at the end,
> this speeds up advancing to the next character a huge amount.
>
> Still, you have to do a lookup into a rather large
> transition table. That typically guarrantees a physical
> memory access (clobbers the cache). So a 100Mhz processor
> is suddenly running at 10Mhz.
>
> For example, the Felix lexer:
>
> ocamllex.opt src/flx_lex.mll
> 208 states, 3174 transitions, table size 13944 bytes
>
> Obviously, 13K is bigger than the primary processor cache.
Try a scanner generator that generates directly executable
code, like re2c.
It's often *much* smaller, and *much* faster, than flex.
And it's the same amount of actual effort.
>
>> 3. What can be done about it?
>
> Often, a hand written lexer will be faster than
> anything else.
Same answer.
Just because flex is slow, doesn't mean everything else is.
You can make scanner generators that generate directly executable
scanners (IE output the real C code, rather than tables and an
interpreter), and you aren't going to make a faster hand coded one,
unless you know something you can't tell the scanner generator that
helps you.
This is rare.
From the following scanner specification:
(It's a severely hacked up re2c example, to avoid being too large in
lines of code to post (it compiles to something a lot smaller, obviously), so
excuse the style. I din't code it, i just shortened it)
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#define COLONCOLON 318
#define COLON 317
#define IDENT 316
#define NUM 315
#define EOI 319
typedef unsigned int uint;
typedef unsigned char uchar;
#define BSIZE 8192
#define YYCTYPE uchar
#define YYCURSOR cursor
#define YYLIMIT s->lim
#define YYMARKER s->ptr
#define YYFILL(n) {cursor = fill(s, cursor);}
#define RET(i) {s->cur = cursor; return i;}
typedef struct Scanner {
int fd;
uchar *bot, *tok, *ptr, *cur, *pos, *lim, *top, *eof;
uint line;
} Scanner;
uchar *fill(Scanner *s, uchar *cursor){
if(!s->eof){
uint cnt = s->tok - s->bot;
if(cnt){
memcpy(s->bot, s->tok, s->lim - s->tok);
s->tok = s->bot;
s->ptr -= cnt;
cursor -= cnt;
s->pos -= cnt;
s->lim -= cnt;
}
if((s->top - s->lim) < BSIZE){
uchar *buf = (uchar*) malloc(((s->lim - s->bot) + BSIZE)*sizeof(uchar));
memcpy(buf, s->tok, s->lim - s->tok);
s->tok = buf;
s->ptr = &buf[s->ptr - s->bot];
cursor = &buf[cursor - s->bot];
s->pos = &buf[s->pos - s->bot];
s->lim = &buf[s->lim - s->bot];
s->top = &s->lim[BSIZE];
free(s->bot);
s->bot = buf;
}
if((cnt = read(s->fd, (char*) s->lim, BSIZE)) != BSIZE){
s->eof = &s->lim[cnt]; *(s->eof)++ = '\n';
}
s->lim += cnt;
}
return cursor;
}
int scan(Scanner *s){
uchar *cursor = s->cur;
std:
s->tok = cursor;
/*!re2c
any = [\000-\377];
O = [0-7];
D = [0-9];
L = [a-zA-Z_];
H = [a-fA-F0-9];
E = [Ee] [+-]? D+;
FS = [fFlL];
IS = [uUlL]*;
ESC = [\\] ([abfnrtv?'"\\] | "x" H+ | O+);
*/
/*!re2c
L (L|D)* { RET(ID); }
D+ { RET(NUM); }
[ \t\v\f]+ { goto std; }
"\n"
{
if(cursor == s->eof) RET(EOI);
s->pos = cursor; s->line++;
goto std;
}
any
{
printf("unexpected character: %c\n", *s->tok);
goto std;
}
*/
}
main(){
Scanner in;
int t;
memset((char*) &in, 0, sizeof(in));
in.fd = 0;
while((t = scan(&in)) != EOI){
/*
printf("%d\t%.*s\n", t, in.cur - in.tok, in.tok);
printf("%d\n", t);
*/
}
close(in.fd);
}
From that, you get, with the -s option, to use binary searches instead
of direct switch statements most of the time, you get:
/* Generated by re2c 0.9.1 on Fri Jun 8 23:53:33 2001 */
#line 1 "test.re"
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#define COLONCOLON 318
#define COLON 317
#define IDENT 316
#define NUM 315
#define EOI 319
typedef unsigned int uint;
typedef unsigned char uchar;
#define BSIZE 8192
#define YYCTYPE uchar
#define YYCURSOR cursor
#define YYLIMIT s->lim
#define YYMARKER s->ptr
#define YYFILL(n) {cursor = fill(s, cursor);}
#define RET(i) {s->cur = cursor; return i;}
typedef struct Scanner {
int fd;
uchar *bot, *tok, *ptr, *cur, *pos, *lim, *top, *eof;
uint line;
} Scanner;
uchar *fill(Scanner *s, uchar *cursor){
if(!s->eof){
uint cnt = s->tok - s->bot;
if(cnt){
memcpy(s->bot, s->tok, s->lim - s->tok);
s->tok = s->bot;
s->ptr -= cnt;
cursor -= cnt;
s->pos -= cnt;
s->lim -= cnt;
}
if((s->top - s->lim) < BSIZE){
uchar *buf = (uchar*) malloc(((s->lim - s->bot) + BSIZE)*sizeof(uchar));
memcpy(buf, s->tok, s->lim - s->tok);
s->tok = buf;
s->ptr = &buf[s->ptr - s->bot];
cursor = &buf[cursor - s->bot];
s->pos = &buf[s->pos - s->bot];
s->lim = &buf[s->lim - s->bot];
s->top = &s->lim[BSIZE];
free(s->bot);
s->bot = buf;
}
if((cnt = read(s->fd, (char*) s->lim, BSIZE)) != BSIZE){
s->eof = &s->lim[cnt]; *(s->eof)++ = '\n';
}
s->lim += cnt;
}
return cursor;
}
int scan(Scanner *s){
uchar *cursor = s->cur;
std:
s->tok = cursor;
#line 75
{
YYCTYPE yych;
unsigned int yyaccept;
goto yy0;
yy1: ++YYCURSOR;
yy0:
if((YYLIMIT - YYCURSOR) < 2) YYFILL(2);
yych = *YYCURSOR;
if(yych <= '/'){
if(yych <= '\n'){
if(yych <= '\b') goto yy10;
if(yych <= '\t') goto yy6;
goto yy8;
} else {
if(yych <= '\f') goto yy6;
if(yych == ' ') goto yy6;
goto yy10;
}
} else {
if(yych <= '^'){
if(yych <= '9') goto yy4;
if(yych <= '@') goto yy10;
if(yych >= '[') goto yy10;
} else {
if(yych == '`') goto yy10;
if(yych >= '{') goto yy10;
}
}
yy2: yych = *++YYCURSOR;
goto yy17;
yy3:
#line 79
{ RET(ID); }
yy4: yych = *++YYCURSOR;
goto yy15;
yy5:
#line 81
{ RET(NUM); }
yy6: yych = *++YYCURSOR;
goto yy13;
yy7:
#line 82
{ goto std; }
yy8: yych = *++YYCURSOR;
yy9:
#line 85
{
if(cursor == s->eof) RET(EOI);
s->pos = cursor; s->line++;
goto std;
}
yy10: yych = *++YYCURSOR;
yy11:
#line 92
{
printf("unexpected character: %c\n", *s->tok);
goto std;
}
yy12: ++YYCURSOR;
if(YYLIMIT == YYCURSOR) YYFILL(1);
yych = *YYCURSOR;
yy13: if(yych <= '\n'){
if(yych == '\t') goto yy12;
goto yy7;
} else {
if(yych <= '\f') goto yy12;
if(yych == ' ') goto yy12;
goto yy7;
}
yy14: ++YYCURSOR;
if(YYLIMIT == YYCURSOR) YYFILL(1);
yych = *YYCURSOR;
yy15: if(yych <= '/') goto yy5;
if(yych <= '9') goto yy14;
goto yy5;
yy16: ++YYCURSOR;
if(YYLIMIT == YYCURSOR) YYFILL(1);
yych = *YYCURSOR;
yy17: if(yych <= 'Z'){
if(yych <= '/') goto yy3;
if(yych <= '9') goto yy16;
if(yych <= '@') goto yy3;
goto yy16;
} else {
if(yych <= '_'){
if(yych <= '^') goto yy3;
goto yy16;
} else {
if(yych <= '`') goto yy3;
if(yych <= 'z') goto yy16;
goto yy3;
}
}
}
#line 96
}
main(){
Scanner in;
int t;
memset((char*) &in, 0, sizeof(in));
in.fd = 0;
while((t = scan(&in)) != EOI){
/*
printf("%d\t%.*s\n", t, in.cur - in.tok, in.tok);
printf("%d\n", t);
*/
}
close(in.fd);
}
Without the -s option, you get:
/* Generated by re2c 0.5 on Fri Jun 8 23:53:39 2001 */
#line 1 "test.re"
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#define COLONCOLON 318
#define COLON 317
#define IDENT 316
#define NUM 315
#define EOI 319
typedef unsigned int uint;
typedef unsigned char uchar;
#define BSIZE 8192
#define YYCTYPE uchar
#define YYCURSOR cursor
#define YYLIMIT s->lim
#define YYMARKER s->ptr
#define YYFILL(n) {cursor = fill(s, cursor);}
#define RET(i) {s->cur = cursor; return i;}
typedef struct Scanner {
int fd;
uchar *bot, *tok, *ptr, *cur, *pos, *lim, *top, *eof;
uint line;
} Scanner;
uchar *fill(Scanner *s, uchar *cursor){
if(!s->eof){
uint cnt = s->tok - s->bot;
if(cnt){
memcpy(s->bot, s->tok, s->lim - s->tok);
s->tok = s->bot;
s->ptr -= cnt;
cursor -= cnt;
s->pos -= cnt;
s->lim -= cnt;
}
if((s->top - s->lim) < BSIZE){
uchar *buf = (uchar*) malloc(((s->lim - s->bot) + BSIZE)*sizeof(uchar));
memcpy(buf, s->tok, s->lim - s->tok);
s->tok = buf;
s->ptr = &buf[s->ptr - s->bot];
cursor = &buf[cursor - s->bot];
s->pos = &buf[s->pos - s->bot];
s->lim = &buf[s->lim - s->bot];
s->top = &s->lim[BSIZE];
free(s->bot);
s->bot = buf;
}
if((cnt = read(s->fd, (char*) s->lim, BSIZE)) != BSIZE){
s->eof = &s->lim[cnt]; *(s->eof)++ = '\n';
}
s->lim += cnt;
}
return cursor;
}
int scan(Scanner *s){
uchar *cursor = s->cur;
std:
s->tok = cursor;
#line 75
{
YYCTYPE yych;
unsigned int yyaccept;
goto yy0;
yy1: ++YYCURSOR;
yy0:
if((YYLIMIT - YYCURSOR) < 2) YYFILL(2);
yych = *YYCURSOR;
switch(yych){
case '\t': case '\v':
case '\f': case ' ': goto yy6;
case '\n': goto yy8;
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9': goto yy4;
case 'A':
case 'B':
case 'C':
case 'D':
case 'E':
case 'F':
case 'G':
case 'H':
case 'I':
case 'J':
case 'K':
case 'L':
case 'M':
case 'N':
case 'O':
case 'P':
case 'Q':
case 'R':
case 'S':
case 'T':
case 'U':
case 'V':
case 'W':
case 'X':
case 'Y':
case 'Z': case '_': case 'a':
case 'b':
case 'c':
case 'd':
case 'e':
case 'f':
case 'g':
case 'h':
case 'i':
case 'j':
case 'k':
case 'l':
case 'm':
case 'n':
case 'o':
case 'p':
case 'q':
case 'r':
case 's':
case 't':
case 'u':
case 'v':
case 'w':
case 'x':
case 'y':
case 'z': goto yy2;
default: goto yy10;
}
yy2: yych = *++YYCURSOR;
goto yy17;
yy3:
#line 79
{ RET(ID); }
yy4: yych = *++YYCURSOR;
goto yy15;
yy5:
#line 81
{ RET(NUM); }
yy6: yych = *++YYCURSOR;
goto yy13;
yy7:
#line 82
{ goto std; }
yy8: yych = *++YYCURSOR;
yy9:
#line 85
{
if(cursor == s->eof) RET(EOI);
s->pos = cursor; s->line++;
goto std;
}
yy10: yych = *++YYCURSOR;
yy11:
#line 92
{
printf("unexpected character: %c\n", *s->tok);
goto std;
}
yy12: ++YYCURSOR;
if(YYLIMIT == YYCURSOR) YYFILL(1);
yych = *YYCURSOR;
yy13: switch(yych){
case '\t': case '\v':
case '\f': case ' ': goto yy12;
default: goto yy7;
}
yy14: ++YYCURSOR;
if(YYLIMIT == YYCURSOR) YYFILL(1);
yych = *YYCURSOR;
yy15: switch(yych){
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9': goto yy14;
default: goto yy5;
}
yy16: ++YYCURSOR;
if(YYLIMIT == YYCURSOR) YYFILL(1);
yych = *YYCURSOR;
yy17: switch(yych){
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9': case 'A':
case 'B':
case 'C':
case 'D':
case 'E':
case 'F':
case 'G':
case 'H':
case 'I':
case 'J':
case 'K':
case 'L':
case 'M':
case 'N':
case 'O':
case 'P':
case 'Q':
case 'R':
case 'S':
case 'T':
case 'U':
case 'V':
case 'W':
case 'X':
case 'Y':
case 'Z': case '_': case 'a':
case 'b':
case 'c':
case 'd':
case 'e':
case 'f':
case 'g':
case 'h':
case 'i':
case 'j':
case 'k':
case 'l':
case 'm':
case 'n':
case 'o':
case 'p':
case 'q':
case 'r':
case 's':
case 't':
case 'u':
case 'v':
case 'w':
case 'x':
case 'y':
case 'z': goto yy16;
default: goto yy3;
}
}
#line 96
}
main(){
Scanner in;
int t;
memset((char*) &in, 0, sizeof(in));
in.fd = 0;
while((t = scan(&in)) != EOI){
/*
printf("%d\t%.*s\n", t, in.cur - in.tok, in.tok);
printf("%d\n", t);
*/
}
close(in.fd);
}
etc.
There is another option to generate small bitmaps for larger classes,
and use those in addition to the other two above (IE you can combine
-sb, or just use -b).
Which will generate the fastest code of course is dependent on your
compiler. However, I doubt you could code a scanner significantly
faster than the above, unless your compiler is horrible about gotos or
something.
--Dan
-- "I went to a restaurant that serves "breakfast at any time." So I ordered French Toast during the Renaissance. "-Steven Wright
Boost list run by bdawes at acm.org, gregod at cs.rpi.edu, cpdaniel at pacbell.net, john at johnmaddock.co.uk