2 * LEX.C - Rune Language Lexer
4 * (c)Copyright 1993-2014, Matthew Dillon, All Rights Reserved. See the
5 * COPYRIGHT file at the base of the distribution.
/*
 * LexChar[] - 256-entry per-byte character classification table that
 * drives the lexer's dispatch (see the switch in LexToken()).
 * Flags observed in use in this file: T_WS (whitespace), T_AL
 * (alpha/identifier), T_NU (digit), T_SP (single-character special),
 * T_MP (multi-character operator punctuation), T_QU (quote lead-in),
 * T_CM (comment lead-in, '#'), T_XP (special, '/').
 * NOTE(review): flag semantics inferred from usage below -- confirm
 * against the T_* definitions, which are outside this excerpt.
 */
10 static const u_int8_t LexChar[256] = {
11 /* 00 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
12 /* 08 */ 0x00, T_WS, T_WS, 0x00, T_WS, T_WS, 0x00, 0x00,
13 /* 10 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
14 /* 18 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
15 /* 20 */ T_WS, T_MP, T_QU, T_CM, T_SP, T_MP, T_MP, T_QU,
16 /* 28 */ T_SP, T_SP, T_MP, T_MP, T_SP, T_MP, T_SP, T_XP,
17 /* 30 */ T_NU, T_NU, T_NU, T_NU, T_NU, T_NU, T_NU, T_NU,
18 /* 38 */ T_NU, T_NU, T_SP, T_SP, T_MP, T_MP, T_MP, T_MP,
19 /* 40 */ T_SP, T_AL, T_AL, T_AL, T_AL, T_AL, T_AL, T_AL,
20 /* 48 */ T_AL, T_AL, T_AL, T_AL, T_AL, T_AL, T_AL, T_AL,
21 /* 50 */ T_AL, T_AL, T_AL, T_AL, T_AL, T_AL, T_AL, T_AL,
22 /* 58 */ T_AL, T_AL, T_AL, T_SP, 0x00, T_SP, T_MP, T_AL,
23 /* 60 */ T_QU, T_AL, T_AL, T_AL, T_AL, T_AL, T_AL, T_AL,
24 /* 68 */ T_AL, T_AL, T_AL, T_AL, T_AL, T_AL, T_AL, T_AL,
25 /* 70 */ T_AL, T_AL, T_AL, T_AL, T_AL, T_AL, T_AL, T_AL,
26 /* 78 */ T_AL, T_AL, T_AL, T_SP, T_MP, T_SP, T_MP, 0x00
/* NOTE(review): the closing "};" of this table (original line ~27) is
 * not present in this sampled listing. */
/* Error-message strings, indexed by the TOKF_ERR_MASK error number
 * (consumed by LexErrStr() and LexPrintError() below). */
29 static const char * const LexErrAry[] = { LEX_ERR_STRINGS };
/* Forward declarations for the static lexing helpers defined below. */
31 static int isHex(char c);
32 static const char *lexNumeric(const char *ptr, const char *pend, int *type);
33 static const char *lexQuoted(const char *ptr, const char *pend, int *type,
/* NOTE(review): the continuation line of the lexQuoted() prototype
 * (original line 34) is missing from this sampled listing. */
35 static int lexSymTokenType(const char *ptr, size_t len);
/* List of all known Lex contexts; entries are added by LexOpen() and
 * removed when the last reference/open goes away. */
37 static lexlist_t LexList = RUNE_HEAD_INITIALIZER(LexList);
/*
 * LexOpen() - open and mmap() the source file at <path>, register the
 * new Lex context on LexList, and initialize *tok so the first
 * LexToken() call scans from the start of the mapped buffer
 * (t_Ptr = base, t_End = base + size).  l_Base is seeded with
 * MAP_FAILED as the "no mapping" sentinel used throughout this file.
 * NOTE(review): this listing is sampled -- the return type, struct stat
 * declaration, error paths, and return statement are not visible here.
 */
40 LexOpen(const char *path, token_t *tok)
42 Lex *lex = zalloc(sizeof(Lex));
44 bzero(tok, sizeof(token_t));
45 lex->l_Base = MAP_FAILED;
47 RUNE_INSERT_TAIL(&LexList, lex, l_Node);
49 if ((lex->l_Fd = open(path, O_RDONLY)) >= 0) {
51 if (fstat(lex->l_Fd, &st) == 0) {
52 lex->l_Base = mmap(NULL, st.st_size, PROT_READ,
53 MAP_SHARED, lex->l_Fd, 0);
54 if (lex->l_Base != MAP_FAILED) {
55 lex->l_Bytes = st.st_size;
58 lex->l_Path = safe_strdup(path);
60 tok->t_Ptr = lex->l_Base;
61 tok->t_End = lex->l_Base + lex->l_Bytes;
63 tok->t_LinesInToken = 0;
/*
 * Cleanup function (name inferred as LexClose() from the call in
 * LexDoneRef(); the signature line is missing from this excerpt).
 * Drops an open reference on *plex: when the last open goes away the
 * mmap is released, and when no LexRefs remain either, the Lex record
 * is unlinked from LexList and freed.
 */
74 * Cleanup after we are done lexing. This can be called
75 * multiple times without harm.
82 if ((lex = *plex) != NULL) {
85 dassert(lex->l_OpenCount > 0);
86 if (--lex->l_OpenCount == 0) {
87 if (lex->l_Base != MAP_FAILED) {
88 munmap(lex->l_Base, lex->l_Bytes);
/* reset to the sentinel so a later LexPrintRef() can re-mmap */
89 lex->l_Base = MAP_FAILED;
95 if (lex->l_RefCount == 0) {
96 safe_free(&lex->l_Path);
97 RUNE_REMOVE(&LexList, lex, l_Node);
98 zfree_wipe(lex, sizeof(Lex));
105 * Regenerate the first N bytes of the token. This is used to
106 * chop-down the size of a token when the lexer gets it wrong.
107 * For example, the '**' operator in certain contexts is really
108 * two pointer-indirection tokens ('*' '*').
/* Truncate the current token to <bytes> bytes.  Asserts the request
 * does not grow the token.  NOTE(review): intermediate lines are
 * missing from this excerpt; presumably t_Len (and t_Type) are also
 * updated so the next LexToken() resumes correctly -- confirm against
 * the full source. */
111 LexRedactToken(token_t *tok, int bytes)
116 dassert(bytes <= tok->t_Len);
118 tok->t_End = tok->t_Ptr + bytes;
/*
 * LexToken() - scan and return the next token from the mapped buffer.
 * First consumes the previous token (t_Ptr += t_Len) and folds its
 * newline count into t_Line, then classifies the next character via
 * LexChar[] and dispatches to the identifier/keyword, numeric,
 * quoted-string, comment, and operator scanners.  On completion
 * t_Type/t_Ptr/t_Len describe the new token; at end of input t_Type
 * remains TOK_EOF (set up-front).
 * NOTE(review): this listing is sampled -- several case labels, loop
 * exits, and the return statement are not visible here.
 */
127 LexToken(token_t *tok)
130 int otype = tok->t_Type;
/* default to EOF; overwritten when a token is recognized */
132 tok->t_Type = TOK_EOF;
133 tok->t_Ptr += tok->t_Len;
134 tok->t_Line += tok->t_LinesInToken;
135 tok->t_LinesInToken = 0;
137 while ((ptr = tok->t_Ptr) < tok->t_End) {
138 switch(LexChar[(uint8_t)*ptr]) {
141 ++tok->t_LinesInToken;
/* identifier / keyword: absorb the alphanumeric run, then try the
 * keyword table before falling back to symbol classification */
145 while (ptr < tok->t_End &&
146 (LexChar[(uint8_t)*ptr] & T_ALNU)) {
149 tok->t_Type = SymKeywordFind(tok->t_Ptr,
150 ptr - tok->t_Ptr, 0);
151 if (tok->t_Type == 0) {
152 tok->t_Type = lexSymTokenType(tok->t_Ptr,
157 ptr = lexNumeric(ptr, tok->t_End, &tok->t_Type);
160 ptr = lexQuoted(ptr, tok->t_End, &tok->t_Type,
161 &tok->t_LinesInToken);
165 * Single character special, special case .n
166 * for floating point constant.
170 if (tok->t_Type == '.' && ptr < tok->t_End &&
171 (LexChar[(uint8_t)*ptr] & T_NU)) {
/* back up one so lexNumeric() sees the leading '.' */
172 ptr = lexNumeric(ptr - 1, tok->t_End,
/* '#' comment: consume through end-of-line */
180 while (ptr < tok->t_End && *ptr != '\n')
182 if (ptr < tok->t_End) {
184 ++tok->t_LinesInToken;
190 * '/' character. Look for / / or / *
193 if (ptr < tok->t_End && *ptr == '/') {
194 while (ptr < tok->t_End && *ptr != '\n')
196 if (ptr < tok->t_End) {
198 ++tok->t_LinesInToken;
203 if (ptr < tok->t_End && *ptr == '*') {
207 while (ptr < tok->t_End) {
209 ++tok->t_LinesInToken;
211 ptr + 1 < tok->t_End &&
224 tok->t_Type = TOK_ERR_UNTERMINATED_COMMENT;
230 * Multi-character operators:
232 * "! % & * + - / < = > ? ^ _ | ~"
234 * We have to pick out assignments '=' and parsed
235 * conditional flow '&&' and '||'.
237 * Prefixed '*'s are handled by parser2.c via
238 * LexRedactToken(). However, something like
239 * ++* or --* is broken up here. Rune does not
240 * allow any '+*' or '-*' sequence to be an operator.
241 * They are always treated as separate operators,
/* special case: after a TOK_IMPORT, '<path>' lexes as a quoted string */
247 if (*ptr == '<' && otype == TOK_IMPORT) {
248 ptr = lexQuoted(ptr, tok->t_End,
250 &tok->t_LinesInToken);
253 while (ptr < tok->t_End &&
254 (LexChar[(uint8_t)*ptr] & T_MP)) {
257 (ptr[-1] == '+' || ptr[-1] == '-')){
262 len = ptr - tok->t_Ptr;
263 if (len == 1 && ptr[-1] == '=') {
264 tok->t_Type = TOK_ASS;
265 } else if (len == 2 &&
268 tok->t_Type = TOK_ANDAND;
269 } else if (len == 2 &&
272 tok->t_Type = TOK_OROR;
273 } else if (len == 2 &&
276 tok->t_Type = TOK_STRIND;
278 tok->t_Type = TOK_OPER;
283 tok->t_Type = TOK_ERR_UNEXPECTED_CHAR;
289 tok->t_Len = ptr - tok->t_Ptr;
/*
 * LexSkipToken() - require the current token to be <type> and advance
 * past it; on mismatch record a specific missing-brace or
 * unexpected-token error on the token via LexError().
 * NOTE(review): the conditions selecting OBRACE/CBRACE and the return
 * statement fall outside the sampled lines.
 */
296 LexSkipToken(token_t *tok, int type)
298 if (tok->t_Type == type) {
299 type = LexToken(tok);
303 type = TOK_ERR_MISSING_OBRACE;
306 type = TOK_ERR_MISSING_CBRACE;
309 type = TOK_ERR_UNEXPECTED_TOKEN;
312 type = LexError(tok, type);
/*
 * LexPeekToken() - return the type of the next token without consuming
 * it, by lexing a scratch copy so the caller's token position is
 * untouched.  NOTE(review): the line copying *tok into tmp is missing
 * from this excerpt.
 */
318 LexPeekToken(token_t *tok)
321 return(LexToken(&tmp));
/*
 * LexStripQuotes() - build into *tmp a version of *tok with the
 * surrounding quotes handled according to the quoted-string token type.
 * NOTE(review): all switch cases are missing from this sampled listing;
 * the exact per-type adjustment cannot be confirmed here.
 */
325 LexStripQuotes(token_t *tok, token_t *tmp)
327 switch(tok->t_Type) {
339 bzero(tmp, sizeof(token_t));
/*
 * LexError() - record error <err> on the token by setting TOKF_ERROR in
 * t_Type.  Only the first error sticks; a token already flagged as an
 * error is left unchanged.
 */
344 LexError(token_t *tok, int err)
346 if ((tok->t_Type & TOKF_ERROR) == 0) {
347 tok->t_Type = err | TOKF_ERROR;
/*
 * LexPrintError() - report the error recorded on *tok to stderr as
 * "path:line: message", then echo the offending source line with the
 * token itself highlighted in reverse video (ANSI \033[7m ... \033[m).
 * Uses lexUpdateCache() to locate the start of the line containing
 * the token without tracking line starts during lexing.
 */
353 LexPrintError(token_t *tok)
355 Lex *lex = tok->t_Lex;
358 int boff = tok->t_Ptr - lex->l_Base;
359 int eoff = boff + tok->t_Len;
361 lexUpdateCache(lex, boff);
363 if ((tok->t_Type & TOKF_ERROR) == 0)
/* out-of-range error numbers fall back rather than indexing LexErrAry */
365 else if ((eno = tok->t_Type & TOKF_ERR_MASK) >= arysize(LexErrAry))
368 fprintf(stderr, "%s:%d: %s\n",
/* print line prefix, highlighted token, then the remainder of the line */
373 fwrite(lex->l_Base + lex->l_CacheOff,
374 1, boff - lex->l_CacheOff, stderr);
375 fprintf(stderr, "\033[7m");
376 fwrite(lex->l_Base + boff, 1, eoff - boff, stderr);
377 fprintf(stderr, "\033[m");
378 for (len = eoff; len < lex->l_Bytes && lex->l_Base[len] != '\n'; ++len)
380 fwrite(lex->l_Base + eoff, 1, len - eoff, stderr);
381 fprintf(stderr, "\n");
/*
 * LexPrintRef() - print the source line referenced by *lr.  The backing
 * file may have been closed/unmapped since the ref was taken, so this
 * re-open()s and re-mmap()s it on demand (reporting to stderr if either
 * step fails), builds a temporary token over the referenced span, and
 * unmaps again afterwards.
 * NOTE(review): sampled listing -- the fprintf(stderr, ...) call heads,
 * the mmap fd argument line, and the actual print call are not visible.
 */
385 LexPrintRef(LexRef *lr, int type)
387 Lex *lex = lr->lr_Lex;
391 dassert(lex->l_Path != NULL);
394 if ((lex->l_Fd = open(lex->l_Path, O_RDONLY)) < 0) {
396 "Error at offset %d in file %s. "
397 "Unable to open file to access line\n",
398 lr->lr_Offset, lex->l_Path);
401 lex->l_Base = mmap(NULL, lex->l_Bytes,
402 PROT_READ, MAP_SHARED,
404 if (lex->l_Base == MAP_FAILED) {
406 "Error at offset %d in file %s. "
407 "Unable to mmap file to access line\n",
408 lr->lr_Offset, lex->l_Path);
/* synthesize a token covering the referenced span for printing */
414 bzero(&t, sizeof(t));
415 t.t_Ptr = lex->l_Base + lr->lr_Offset;
416 t.t_End = lex->l_Base + lex->l_Bytes;
417 t.t_Len = lr->lr_Len;
424 munmap(lex->l_Base, lex->l_Bytes);
425 lex->l_Base = MAP_FAILED;
/*
 * LexInitRef() - initialize *lr as a persistent reference to the span
 * covered by *tok (byte offset, length, and line number relative to the
 * owning Lex).  NOTE(review): the lr_Lex assignment / refcount bump is
 * on a line missing from this excerpt -- confirm against full source.
 */
430 LexInitRef(LexRef *lr, token_t *tok)
432 Lex *lex = tok->t_Lex;
435 lr->lr_Offset = tok->t_Ptr - lex->l_Base;
436 lr->lr_Len = tok->t_Len;
437 lr->lr_Line = tok->t_Line;
/*
 * LexDupRef() - duplicate reference *s into *d, bumping the owning
 * Lex's reference count so the source mapping stays reclaimable only
 * after both refs are released via LexDoneRef().
 */
442 LexDupRef(LexRef *s, LexRef *d)
446 ++s->lr_Lex->l_RefCount;
/*
 * LexDoneRef() - release the reference held by *lr.  When this was the
 * last reference and the Lex has no remaining opens, tear the Lex down
 * via LexClose().  Safe to call on an already-cleared ref (NULL check).
 */
450 LexDoneRef(LexRef *lr)
454 if ((lex = lr->lr_Lex) != NULL) {
455 dassert(lex->l_RefCount > 0);
456 if (--lex->l_RefCount == 0 && lex->l_OpenCount == 0) {
458 LexClose(&lr->lr_Lex);
/*
 * LexErrStr() - map the error number embedded in tok->t_Type to its
 * human-readable string from LexErrAry[], bounds-checking the index
 * first.  NOTE(review): the out-of-range return value is on a line
 * missing from this excerpt.
 */
464 LexErrStr(token_t *tok)
466 size_t eno = tok->t_Type & TOKF_ERR_MASK;
467 if (eno >= arysize(LexErrAry))
469 return(LexErrAry[eno]);
473 * lexUpdateCache() - optimized line number finder for error reporting, so
474 * we do not have to keep track of it while lexing.
476 * This routine sets l_CacheOff to the base of the line containing
477 * boff and sets l_CacheLine to the line number of that line. The first
478 * line of a file is always line 1.
481 lexUpdateCache(Lex *lex, int boff)
483 int off = lex->l_CacheOff;
484 const char *base = lex->l_Base;
487 * Locate the lexical offset going forwards, keeping track of line
490 while (off < boff && off < lex->l_Bytes) {
491 if (base[off] == '\n')
497 * Locate the lexical offset going backwards, keeping track of
500 while (off > boff && off > 0) {
502 if (base[off] == '\n')
507 * Locate the base of the line containing boff (the current line)
509 while (off > 0 && base[off-1] != '\n')
511 lex->l_CacheOff = off;
/* Body fragment of isHex() (prototype: static int isHex(char c); the
 * signature and return lines are missing from this sampled listing).
 * Handles the three hex-digit ranges 0-9, a-f, A-F; presumably returns
 * nonzero (or the digit value) for a hex digit -- confirm against the
 * full source. */
517 if (c >= '0' && c <= '9')
519 if (c >= 'a' && c <= 'f')
521 if (c >= 'A' && c <= 'F')
527 * Scan a numerical value
529 * dddd decimal or octal
530 * [dddd].[dddd][e[+,-]dddd] floating
531 * 0xHEX hex (base 16)
533 * Suffixes: F Float32 (if floating)
534 * D Float64 (if floating)
535 * X Float128 (if floating)
536 * B 8-bit integer (if integral) SB or UB only
537 * W 16-bit integer (if integral)
538 * I 32-bit integer (if integral) (default)
539 * L 64-bit integer (if integral)
540 * Q 128-bit integer (if integral)
542 * Also: U make integer unsigned (if integral)
543 * S make integer signed (if integral) (default)
/* Returns a pointer just past the numeric token; *type receives the
 * token type or a TOK_ERR_BAD_NUMERIC_CONST error. */
548 lexNumeric(const char *ptr, const char *pend, int *type)
550 int base; /* -1 for floating point */
/* determine the radix from the leading characters: 0x/0X hex,
 * leading 0 octal, leading '.' floating */
559 if (ptr + 2 <= pend && ptr[0] == '0' &&
560 (ptr[1] == 'x' || ptr[1] == 'X')) {
563 } else if (ptr[0] == '0') {
566 } else if (ptr[0] == '.') {
573 if (*ptr >= '0' && *ptr <= '9') {
/* a second '.' is only legal once; reject for float/hex */
580 if (base == -1 || base == 16) {
581 *type = TOK_ERR_BAD_NUMERIC_CONST;
590 ((*ptr >= 'a' && *ptr <= 'f') ||
591 (*ptr >= 'A' && *ptr <= 'F'))) {
/* exponent is accepted once, for non-hex bases only */
595 if ((*ptr == 'e' || *ptr == 'E') && didExp == 0 &&
596 (base == -1 || base == 8 || base == 10)) {
600 if (ptr < pend && (*ptr == '+' || *ptr == '-'))
602 /* continue parsing digits */
/* trailing alpha suffix: signedness flags plus at most one size flag */
607 while (ptr < pend && (LexChar[(uint8_t)*ptr] & T_AL)) {
609 * Any number of extra flags
611 if (*ptr == 'u' || *ptr == 'U') {
617 if (*ptr == 's' || *ptr == 'S') {
625 * Only one size extension flag
628 *type = TOK_ERR_BAD_NUMERIC_CONST;
639 *type = TOK_ERR_BAD_NUMERIC_CONST;
645 * SB or UB only 'B' extension alone can be confused
646 * with hex so we do not allow it, even for octal or
651 "Integer constant 8-bit extension "
652 "must be SB or UB, not just B\n");
653 *type = TOK_ERR_BAD_NUMERIC_CONST;
668 *type = TOK_ERR_BAD_NUMERIC_CONST;
674 * Means 128-bits for both floating and integer
/* NOTE(review): "base == 9" appears to flag an 8/9 digit seen in an
 * octal constant -- the assignment is on a line missing from this
 * excerpt; confirm against the full source. */
683 if (base == 9 && bad8) {
684 *type = TOK_ERR_BAD_NUMERIC_CONST;
685 } else if (base == -1) {
694 * Scan a quoted string. The entire string including the quotes is returned.
695 * We also support string concatenation via multiple quoted strings,
696 * aka "abc" "def". Mixing double and back-ticked strings is allowed.
/* Returns a pointer just past the quoted token; *typep receives the
 * token type (seeded from the opening quote character) or an error,
 * and *linep accumulates newlines skipped between concatenated parts. */
699 lexQuoted(const char *ptr, const char *pend, int *typep, int *linep)
704 *typep = btype = etype = *ptr;
711 * Check terminator, loop on concat case for double-quoted
712 * and back-ticked strings.
718 if (etype == TOK_DSTRING || etype == TOK_BSTRING) {
/* skip whitespace between adjacent string literals */
720 (LexChar[(uint8_t)*ptr] & T_WS)) {
725 if (ptr < pend && *ptr == btype) {
735 * Embedded CR or LF is not allowed (use string concatenation
738 if (*ptr == '\n' || *ptr == '\r') {
739 *typep = TOK_ERR_EMBEDDED_CRLF;
/* other raw control characters are rejected too */
741 } else if ((uint8_t)*ptr < 0x20) {
742 *typep = TOK_ERR_ILLEGAL_ESCAPE;
/* backslash escapes apply to all but back-ticked (raw) strings */
749 if (etype != TOK_BSTRING && *ptr == '\\') {
753 if (*ptr == 'n' || *ptr == 't' ||
754 *ptr == 'r' || *ptr == '\\') {
759 if (ptr + 2 >= pend ||
762 *typep = TOK_ERR_ILLEGAL_ESCAPE;
/* octal escape: up to three octal digits */
768 if (*ptr >= '0' && *ptr <= '7') {
771 *ptr >= '0' && *ptr <= '7') {
774 *ptr >= '0' && *ptr <= '7')
779 *typep = TOK_ERR_ILLEGAL_ESCAPE;
784 *typep = TOK_ERR_UNTERMINATED_STR;
790 * Calculate type of symbolic identifier. It can be one of
791 * TOK_ID or TOK_CLASSID. The rules are:
793 * - <anything>_t or <anything>_p or <anything>_m will be a TOK_CLASSID
794 * - all alpha characters are caps will be TOK_CONSTID (use TOK_ID for now)
795 * - first character is an upper-case alpha will be TOK_CLASSID
796 * - otherwise TOK_ID.
/* NOTE(review): this function runs past the end of the sampled
 * listing; the returns for each rule are not visible here. */
800 lexSymTokenType(const char *ptr, size_t len)
805 * Single lowercase alpha character suffix, must be a class
/* "_x" suffix check: second-to-last character is an underscore */
808 if (len >= 2 && ptr[len-2] == '_') {
/* leading capital: candidate TOK_CLASSID unless a lowercase follows */
818 if (*ptr >= 'A' && *ptr <= 'Z') {
820 while (++ptr < pen) {
821 if (*ptr >= 'a' && *ptr <= 'z')