2 * LEX.C - Rune Language Lexer
4 * (c)Copyright 1993-2014, Matthew Dillon, All Rights Reserved. See the
5 * COPYRIGHT file at the base of the distribution.
10 static const u_int8_t LexChar[256] = {
11 /* 00 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
12 /* 08 */ 0x00, T_WS, T_WS, 0x00, T_WS, T_WS, 0x00, 0x00,
13 /* 10 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
14 /* 18 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
15 /* 20 */ T_WS, T_MP, T_QU, T_CM, T_SP, T_MP, T_MP, T_QU,
16 /* 28 */ T_SP, T_SP, T_MP, T_MP, T_SP, T_MP, T_SP, T_XP,
17 /* 30 */ T_NU, T_NU, T_NU, T_NU, T_NU, T_NU, T_NU, T_NU,
18 /* 38 */ T_NU, T_NU, T_SP, T_SP, T_MP, T_MP, T_MP, T_MP,
19 /* 40 */ T_SP, T_AL, T_AL, T_AL, T_AL, T_AL, T_AL, T_AL,
20 /* 48 */ T_AL, T_AL, T_AL, T_AL, T_AL, T_AL, T_AL, T_AL,
21 /* 50 */ T_AL, T_AL, T_AL, T_AL, T_AL, T_AL, T_AL, T_AL,
22 /* 58 */ T_AL, T_AL, T_AL, T_SP, 0x00, T_SP, T_MP, T_AL,
23 /* 60 */ T_QU, T_AL, T_AL, T_AL, T_AL, T_AL, T_AL, T_AL,
24 /* 68 */ T_AL, T_AL, T_AL, T_AL, T_AL, T_AL, T_AL, T_AL,
25 /* 70 */ T_AL, T_AL, T_AL, T_AL, T_AL, T_AL, T_AL, T_AL,
26 /* 78 */ T_AL, T_AL, T_AL, T_SP, T_MP, T_SP, T_MP, 0x00
29 static const char * const LexErrAry[] = { LEX_ERR_STRINGS };
31 static int isHex(char c);
32 static const char *lexNumeric(const char *ptr, const char *pend, int *type);
33 static const char *lexQuoted(const char *ptr, const char *pend, int *type);
34 static int lexSymTokenType(const char *ptr, size_t len);
36 static lexlist_t LexList = RUNE_HEAD_INITIALIZER(LexList);
39 LexOpen(const char *path, token_t *tok)
41 Lex *lex = zalloc(sizeof(Lex));
43 bzero(tok, sizeof(token_t));
44 lex->l_Base = MAP_FAILED;
46 RUNE_INSERT_TAIL(&LexList, lex, l_Node);
48 if ((lex->l_Fd = open(path, O_RDONLY)) >= 0) {
50 if (fstat(lex->l_Fd, &st) == 0) {
51 lex->l_Base = mmap(NULL, st.st_size, PROT_READ,
52 MAP_SHARED, lex->l_Fd, 0);
53 if (lex->l_Base != MAP_FAILED) {
54 lex->l_Bytes = st.st_size;
57 lex->l_Path = safe_strdup(path);
59 tok->t_Ptr = lex->l_Base;
60 tok->t_End = lex->l_Base + lex->l_Bytes;
70 * Cleanup after we are done lexing. This can be called
71 * multiple times without harm.
78 if ((lex = *plex) != NULL) {
81 dassert(lex->l_OpenCount > 0);
82 if (--lex->l_OpenCount == 0) {
83 if (lex->l_Base != MAP_FAILED) {
84 munmap(lex->l_Base, lex->l_Bytes);
85 lex->l_Base = MAP_FAILED;
91 if (lex->l_RefCount == 0) {
92 safe_free(&lex->l_Path);
93 RUNE_REMOVE(&LexList, lex, l_Node);
94 zfree_wipe(lex, sizeof(Lex));
101 * Regenerate the first N bytes of the token. This is used to
102 * chop down the size of a token when the lexer gets it wrong.
103 * For example, the '**' operator in certain contexts is really
104 * two pointer-indirection tokens ('*' '*').
107 LexRedactToken(token_t *tok, int bytes)
112 dassert(bytes <= tok->t_Len);
114 tok->t_End = tok->t_Ptr + bytes;
122 LexToken(token_t *tok)
126 tok->t_Type = TOK_EOF;
127 tok->t_Ptr += tok->t_Len;
129 while ((ptr = tok->t_Ptr) < tok->t_End) {
130 switch(LexChar[(uint8_t)*ptr]) {
135 while (ptr < tok->t_End &&
136 (LexChar[(uint8_t)*ptr] & T_ALNU)) {
139 tok->t_Type = SymKeywordFind(tok->t_Ptr,
140 ptr - tok->t_Ptr, 0);
141 if (tok->t_Type == 0) {
142 tok->t_Type = lexSymTokenType(tok->t_Ptr,
147 ptr = lexNumeric(ptr, tok->t_End, &tok->t_Type);
150 ptr = lexQuoted(ptr, tok->t_End, &tok->t_Type);
154 * Single character special, special case .n
155 * for floating point constant.
159 if (tok->t_Type == '.' && ptr < tok->t_End &&
160 (LexChar[(uint8_t)*ptr] & T_NU)) {
161 ptr = lexNumeric(ptr - 1, tok->t_End,
169 while (ptr < tok->t_End && *ptr != '\n')
171 if (ptr < tok->t_End)
177 * '/' character. Look for / / or / *
180 if (ptr < tok->t_End && *ptr == '/') {
181 while (ptr < tok->t_End && *ptr != '\n')
183 if (ptr < tok->t_End)
188 if (ptr < tok->t_End && *ptr == '*') {
192 while (ptr < tok->t_End) {
194 ptr + 1 < tok->t_End &&
207 tok->t_Type = TOK_ERR_UNTERMINATED_COMMENT;
213 * multi-character operators:
214 * "! % & * + - / < = > ? ^ _ | ~"
215 * We have to pick out assignments '=' and parsed
216 * conditional flow '&&' and '||'.
221 while (ptr < tok->t_End &&
222 (LexChar[(uint8_t)*ptr] & T_MP)) {
225 len = ptr - tok->t_Ptr;
226 if (len == 1 && ptr[-1] == '=') {
227 tok->t_Type = TOK_ASS;
228 } else if (len == 2 &&
231 tok->t_Type = TOK_ANDAND;
232 } else if (len == 2 &&
235 tok->t_Type = TOK_OROR;
236 } else if (len == 2 &&
239 tok->t_Type = TOK_STRIND;
241 tok->t_Type = TOK_OPER;
246 tok->t_Type = TOK_ERR_UNEXPECTED_CHAR;
252 tok->t_Len = ptr - tok->t_Ptr;
259 LexSkipToken(token_t *tok, int type)
261 if (tok->t_Type == type) {
262 type = LexToken(tok);
266 type = TOK_ERR_MISSING_OBRACE;
269 type = TOK_ERR_MISSING_CBRACE;
272 type = TOK_ERR_UNEXPECTED_TOKEN;
275 type = LexError(tok, type);
281 LexPeekToken(token_t *tok)
284 return(LexToken(&tmp));
288 LexStripQuotes(token_t *tok, token_t *tmp)
290 switch(tok->t_Type) {
301 bzero(tmp, sizeof(token_t));
306 LexError(token_t *tok, int err)
308 if ((tok->t_Type & TOKF_ERROR) == 0) {
309 tok->t_Type = err | TOKF_ERROR;
315 LexPrintError(token_t *tok)
317 Lex *lex = tok->t_Lex;
320 int boff = tok->t_Ptr - lex->l_Base;
321 int eoff = boff + tok->t_Len;
323 lexUpdateCache(lex, boff);
325 if ((tok->t_Type & TOKF_ERROR) == 0)
327 else if ((eno = tok->t_Type & TOKF_ERR_MASK) >= arysize(LexErrAry))
330 fprintf(stderr, "%s:%d: %s\n",
335 fwrite(lex->l_Base + lex->l_CacheOff,
336 1, boff - lex->l_CacheOff, stderr);
337 fprintf(stderr, "\033[7m");
338 fwrite(lex->l_Base + boff, 1, eoff - boff, stderr);
339 fprintf(stderr, "\033[m");
340 for (len = eoff; len < lex->l_Bytes && lex->l_Base[len] != '\n'; ++len)
342 fwrite(lex->l_Base + eoff, 1, len - eoff, stderr);
343 fprintf(stderr, "\n");
347 LexPrintRef(LexRef *lr, int type)
349 Lex *lex = lr->lr_Lex;
353 dassert(lex->l_Path != NULL);
356 if ((lex->l_Fd = open(lex->l_Path, O_RDONLY)) < 0) {
358 "Error at offset %d in file %s. "
359 "Unable to open file to access line\n",
360 lr->lr_Offset, lex->l_Path);
363 lex->l_Base = mmap(NULL, lex->l_Bytes,
364 PROT_READ, MAP_SHARED,
366 if (lex->l_Base == MAP_FAILED) {
368 "Error at offset %d in file %s. "
369 "Unable to mmap file to access line\n",
370 lr->lr_Offset, lex->l_Path);
376 bzero(&t, sizeof(t));
377 t.t_Ptr = lex->l_Base + lr->lr_Offset;
378 t.t_End = lex->l_Base + lex->l_Bytes;
379 t.t_Len = lr->lr_Len;
386 munmap(lex->l_Base, lex->l_Bytes);
387 lex->l_Base = MAP_FAILED;
392 LexInitRef(LexRef *lr, token_t *tok)
394 Lex *lex = tok->t_Lex;
397 lr->lr_Offset = tok->t_Ptr - lex->l_Base;
398 lr->lr_Len = tok->t_Len;
403 LexDupRef(LexRef *s, LexRef *d)
407 ++s->lr_Lex->l_RefCount;
411 LexDoneRef(LexRef *lr)
415 if ((lex = lr->lr_Lex) != NULL) {
416 dassert(lex->l_RefCount > 0);
417 if (--lex->l_RefCount == 0 && lex->l_OpenCount == 0) {
419 LexClose(&lr->lr_Lex);
425 LexErrStr(token_t *tok)
427 size_t eno = tok->t_Type & TOKF_ERR_MASK;
428 if (eno >= arysize(LexErrAry))
430 return(LexErrAry[eno]);
434 * lexUpdateCache() - optimized line number finder for error reporting, so
435 * we do not have to keep track of it while lexing.
437 * This routine sets l_CacheOff to the base of the line containing
438 * boff and sets l_CacheLine to the line number of that line. The first
439 * line of a file is always line 1.
442 lexUpdateCache(Lex *lex, int boff)
444 int off = lex->l_CacheOff;
445 const char *base = lex->l_Base;
448 * Locate the lexical offset going forwards, keeping track of line
451 while (off < boff && off < lex->l_Bytes) {
452 if (base[off] == '\n')
458 * Locate the lexical offset going backwards, keeping track of
461 while (off > boff && off > 0) {
463 if (base[off] == '\n')
468 * Locate the base of the line containing boff (the current line)
470 while (off > 0 && base[off-1] != '\n')
472 lex->l_CacheOff = off;
478 if (c >= '0' && c <= '9')
480 if (c >= 'a' && c <= 'f')
482 if (c >= 'A' && c <= 'F')
488 * Scan a numerical value
490 * dddd decimal or octal
491 * [dddd].[dddd][e[+,-]dddd] floating
492 * 0xHEX hex (base 16)
494 * Suffixes: F Float32 (if floating)
495 * D Float64 (if floating)
496 * X Float128 (if floating)
497 * B 8-bit integer (if integral) SB or UB only
498 * W 16-bit integer (if integral)
499 * I 32-bit integer (if integral) (default)
500 * L 64-bit integer (if integral)
501 * Q 128-bit integer (if integral)
503 * Also: U make integer unsigned (if integral)
504 * S make integer signed (if integral) (default)
509 lexNumeric(const char *ptr, const char *pend, int *type)
511 int base; /* -1 for floating point */
520 if (ptr + 2 <= pend && ptr[0] == '0' &&
521 (ptr[1] == 'x' || ptr[1] == 'X')) {
524 } else if (ptr[0] == '0') {
527 } else if (ptr[0] == '.') {
534 if (*ptr >= '0' && *ptr <= '9') {
541 if (base == -1 || base == 16) {
542 *type = TOK_ERR_BAD_NUMERIC_CONST;
551 ((*ptr >= 'a' && *ptr <= 'f') ||
552 (*ptr >= 'A' && *ptr <= 'F'))) {
556 if ((*ptr == 'e' || *ptr == 'E') && didExp == 0 &&
557 (base == -1 || base == 8 || base == 10)) {
561 if (ptr < pend && (*ptr == '+' || *ptr == '-'))
563 /* continue parsing digits */
568 while (ptr < pend && (LexChar[(uint8_t)*ptr] & T_AL)) {
570 * Any number of extra flags
572 if (*ptr == 'u' || *ptr == 'U') {
578 if (*ptr == 's' || *ptr == 'S') {
586 * Only one size extension flag
589 *type = TOK_ERR_BAD_NUMERIC_CONST;
600 *type = TOK_ERR_BAD_NUMERIC_CONST;
606 * SB or UB only.  A 'B' extension alone can be confused
607 * with hex so we do not allow it, even for octal or
612 "Integer constant 8-bit extension "
613 "must be SB or UB, not just B\n");
614 *type = TOK_ERR_BAD_NUMERIC_CONST;
627 *type = TOK_ERR_BAD_NUMERIC_CONST;
633 * Means 128-bits for both floating and integer
642 if (base == 9 && bad8) {
643 *type = TOK_ERR_BAD_NUMERIC_CONST;
644 } else if (base == -1) {
653 * Scan a quoted string. The entire string including the quotes is returned.
654 * We also support string concatenation via multiple quoted strings,
658 lexQuoted(const char *ptr, const char *pend, int *type)
665 * Check terminator, loop on concat case for double-quoted
668 if (*ptr == (char)*type) {
674 (LexChar[(uint8_t)*ptr] & T_WS)) {
677 if (ptr < pend && *ptr == (char)*type) {
687 * Embedded CR or LF is not allowed (use string concatenation
690 if (*ptr == '\n' || *ptr == '\r') {
691 *type = TOK_ERR_EMBEDDED_CRLF;
693 } else if ((uint8_t)*ptr < 0x20) {
694 *type = TOK_ERR_ILLEGAL_ESCAPE;
705 if (*ptr == 'n' || *ptr == 't' ||
706 *ptr == 'r' || *ptr == '\\') {
711 if (ptr + 2 >= pend ||
714 *type = TOK_ERR_ILLEGAL_ESCAPE;
720 if (*ptr >= '0' && *ptr <= '7') {
723 *ptr >= '0' && *ptr <= '7') {
726 *ptr >= '0' && *ptr <= '7')
731 *type = TOK_ERR_ILLEGAL_ESCAPE;
736 *type = TOK_ERR_UNTERMINATED_STR;
741 * Calculate type of symbolic identifier. It can be one of
742 * TOK_ID or TOK_CLASSID. The rules are:
744 * - <anything>_t or <anything>_p or <anything>_m will be a TOK_CLASSID
745 * - a name whose alpha characters are all caps will be TOK_CONSTID (use TOK_ID for now)
746 * - first character is an upper-case alpha will be TOK_CLASSID
747 * - otherwise TOK_ID.
751 lexSymTokenType(const char *ptr, size_t len)
756 * Single lowercase alpha character suffix, must be a class
759 if (len >= 2 && ptr[len-2] == '_') {
769 if (*ptr >= 'A' && *ptr <= 'Z') {
771 while (++ptr < pen) {
772 if (*ptr >= 'a' && *ptr <= 'z')