Rune - Generation work
[rune.git] / librune / lex.c
1 /*
2  * LEX.C        - Rune Language Lexer
3  *
4  * (c)Copyright 1993-2014, Matthew Dillon, All Rights Reserved.  See the  
5  *    COPYRIGHT file at the base of the distribution.
6  */
7
8 #include "defs.h"
9
/*
 * LexChar[] - character classification table, indexed by the input byte
 * (callers cast the character to uint8_t before indexing).
 *
 * Each entry holds one of the T_* class values that LexToken() switches
 * on (T_WS, T_AL, T_NU, T_QU, T_SP, T_CM, T_XP, T_MP).  An entry of 0x00
 * matches none of those cases and therefore selects LexToken()'s default
 * case, which reports TOK_ERR_UNEXPECTED_CHAR.
 *
 * Only indices 0x00-0x7F are listed explicitly; the remaining elements
 * (0x80-0xFF) are implicitly zero-initialized.
 *
 * The comment at the head of each row is the hex index of its first entry.
 */
static const u_int8_t LexChar[256] = {
        /* 00 */ 0x00,  0x00,   0x00,   0x00,   0x00,   0x00,   0x00,   0x00,
        /* 08 */ 0x00,  T_WS,   T_WS,   0x00,   T_WS,   T_WS,   0x00,   0x00,
        /* 10 */ 0x00,  0x00,   0x00,   0x00,   0x00,   0x00,   0x00,   0x00,
        /* 18 */ 0x00,  0x00,   0x00,   0x00,   0x00,   0x00,   0x00,   0x00,
        /* 20 */ T_WS,  T_MP,   T_QU,   T_CM,   T_SP,   T_MP,   T_MP,   T_QU,
        /* 28 */ T_SP,  T_SP,   T_MP,   T_MP,   T_SP,   T_MP,   T_SP,   T_XP,
        /* 30 */ T_NU,  T_NU,   T_NU,   T_NU,   T_NU,   T_NU,   T_NU,   T_NU,
        /* 38 */ T_NU,  T_NU,   T_SP,   T_SP,   T_MP,   T_MP,   T_MP,   T_MP,
        /* 40 */ T_SP,  T_AL,   T_AL,   T_AL,   T_AL,   T_AL,   T_AL,   T_AL,
        /* 48 */ T_AL,  T_AL,   T_AL,   T_AL,   T_AL,   T_AL,   T_AL,   T_AL,
        /* 50 */ T_AL,  T_AL,   T_AL,   T_AL,   T_AL,   T_AL,   T_AL,   T_AL,
        /* 58 */ T_AL,  T_AL,   T_AL,   T_SP,   0x00,   T_SP,   T_MP,   T_AL,
        /* 60 */ T_QU,  T_AL,   T_AL,   T_AL,   T_AL,   T_AL,   T_AL,   T_AL,
        /* 68 */ T_AL,  T_AL,   T_AL,   T_AL,   T_AL,   T_AL,   T_AL,   T_AL,
        /* 70 */ T_AL,  T_AL,   T_AL,   T_AL,   T_AL,   T_AL,   T_AL,   T_AL,
        /* 78 */ T_AL,  T_AL,   T_AL,   T_SP,   T_MP,   T_SP,   T_MP,   0x00
};
28
/*
 * Error message strings.  LexPrintError() and LexErrStr() index this array
 * with the error number extracted from a token type via TOKF_ERR_MASK, and
 * fall back to index 1 when that number is out of range.
 */
static const char * const LexErrAry[] = { LEX_ERR_STRINGS };

/*
 * Forward declarations for the file-local scanning helpers defined at the
 * bottom of this file.
 */
static int isHex(char c);
static const char *lexNumeric(const char *ptr, const char *pend, int *type);
static const char *lexQuoted(const char *ptr, const char *pend, int *type);
static int lexSymTokenType(const char *ptr, size_t len);

/*
 * List of Lex structures.  LexOpen() appends each new Lex to the tail and
 * LexClose() removes it when the structure is finally freed.
 */
static lexlist_t LexList = RUNE_HEAD_INITIALIZER(LexList);
37
38 Lex *
39 LexOpen(const char *path, token_t *tok)
40 {
41         Lex *lex = zalloc(sizeof(Lex));
42
43         bzero(tok, sizeof(token_t));
44         lex->l_Base = MAP_FAILED;
45         lex->l_OpenCount = 1;
46         RUNE_INSERT_TAIL(&LexList, lex, l_Node);
47
48         if ((lex->l_Fd = open(path, O_RDONLY)) >= 0) {
49                 struct stat st;
50                 if (fstat(lex->l_Fd, &st) == 0) {
51                         lex->l_Base = mmap(NULL, st.st_size, PROT_READ,
52                                            MAP_SHARED, lex->l_Fd, 0);
53                         if (lex->l_Base != MAP_FAILED) {
54                                 lex->l_Bytes = st.st_size;
55                                 lex->l_CacheLine = 1;
56                                 lex->l_CacheOff = 0;
57                                 lex->l_Path = safe_strdup(path);
58                                 tok->t_Lex = lex;
59                                 tok->t_Ptr = lex->l_Base;
60                                 tok->t_End = lex->l_Base + lex->l_Bytes;
61                                 return(lex);
62                         }
63                 }
64         }
65         LexClose(&lex);
66         return(NULL);
67 }
68
69 /*
70  * Cleanup after we are done lexing.  This can be called
71  * multiple times without harm.
72  */
73 void
74 LexClose(Lex **plex)
75 {
76         Lex *lex;
77
78         if ((lex = *plex) != NULL) {
79                 *plex = NULL;
80
81                 dassert(lex->l_OpenCount > 0);
82                 if (--lex->l_OpenCount == 0) {
83                         if (lex->l_Base != MAP_FAILED) {
84                                 munmap(lex->l_Base, lex->l_Bytes);
85                                 lex->l_Base = MAP_FAILED;
86                         }
87                         if (lex->l_Fd >= 0) {
88                                 close(lex->l_Fd);
89                                 lex->l_Fd = -1;
90                         }
91                         if (lex->l_RefCount == 0) {
92                                 safe_free(&lex->l_Path);
93                                 RUNE_REMOVE(&LexList, lex, l_Node);
94                                 zfree_wipe(lex, sizeof(Lex));
95                         }
96                 }
97         }
98 }
99
100 /*
101  * Regenerate the first N bytes of the token.  This is used to
102  * chop-down the size of a token when the lexer gets it wrong.
103  * For example, the '**' operator in certain contexts is really
104  * two pointer-indirection tokens ('*' '*').
105  */
106 int
107 LexRedactToken(token_t *tok, int bytes)
108 {
109         const char *save;
110         int t;
111
112         dassert(bytes <= tok->t_Len);
113         save = tok->t_End;
114         tok->t_End = tok->t_Ptr + bytes;
115         tok->t_Len = 0;
116         t = LexToken(tok);
117         tok->t_End = save;
118         return(t);
119 }
120
/*
 * LexToken() - scan the next token.
 *
 * Steps past the previous token (t_Ptr += t_Len), skips whitespace and
 * comments, then classifies the next token using the LexChar[] table.
 * On return t_Ptr points at the start of the token, t_Len is its length
 * and t_Type is its type, which is also the return value.
 *
 * If the end of input is reached without finding a token, t_Type is
 * TOK_EOF and t_Len is 0.
 *
 * The outer while() serves as a restart loop: cases that only consume
 * input (whitespace, comments) update t_Ptr and 'continue', all other
 * cases set t_Type, leave 'ptr' one past the token, and exit through the
 * 'break' that follows the switch.
 */
int
LexToken(token_t *tok)
{
        const char *ptr;

        tok->t_Type = TOK_EOF;
        tok->t_Ptr += tok->t_Len;

        while ((ptr = tok->t_Ptr) < tok->t_End) {
                switch(LexChar[(uint8_t)*ptr]) {
                case T_WS:
                        /*
                         * Whitespace, skip one character and rescan.
                         */
                        ++tok->t_Ptr;
                        continue;
                case T_AL:
                        /*
                         * Identifier or keyword.  Consume every character
                         * whose class intersects T_ALNU, try the keyword
                         * table first, and classify as an identifier if
                         * SymKeywordFind() returns 0.
                         */
                        while (ptr < tok->t_End &&
                               (LexChar[(uint8_t)*ptr] & T_ALNU)) {
                                ++ptr;
                        }
                        tok->t_Type = SymKeywordFind(tok->t_Ptr,
                                                     ptr - tok->t_Ptr, 0);
                        if (tok->t_Type == 0) {
                                tok->t_Type = lexSymTokenType(tok->t_Ptr,
                                                        ptr - tok->t_Ptr);
                        }
                        break;
                case T_NU:
                        /*
                         * Numeric constant starting with a digit.
                         */
                        ptr = lexNumeric(ptr, tok->t_End, &tok->t_Type);
                        break;
                case T_QU:
                        /*
                         * Quoted string (", ' or `).
                         */
                        ptr = lexQuoted(ptr, tok->t_End, &tok->t_Type);
                        break;
                case T_SP:
                        /*
                         * Single character special, the token type is the
                         * character itself.  Special case .n for a floating
                         * point constant: back up to the '.' and hand the
                         * whole thing to lexNumeric().
                         */
                        tok->t_Type = *ptr;
                        ++ptr;
                        if (tok->t_Type == '.' && ptr < tok->t_End &&
                            (LexChar[(uint8_t)*ptr] & T_NU)) {
                                ptr = lexNumeric(ptr - 1, tok->t_End,
                                                 &tok->t_Type);
                        }
                        break;
                case T_CM:
                        /*
                         * '#' line comment.  Skip through the terminating
                         * newline (or to the end of input) and rescan.
                         */
                        while (ptr < tok->t_End && *ptr != '\n')
                                ++ptr;
                        if (ptr < tok->t_End)
                                ++ptr;
                        tok->t_Ptr = ptr;
                        continue;
                case T_XP:
                        /*
                         * '/' character.  Look for / / or / *
                         */
                        ++ptr;
                        if (ptr < tok->t_End && *ptr == '/') {
                                /*
                                 * Line comment, skip through the newline
                                 * (or to the end of input) and rescan.
                                 */
                                while (ptr < tok->t_End && *ptr != '\n')
                                        ++ptr;
                                if (ptr < tok->t_End)
                                        ++ptr;
                                tok->t_Ptr = ptr;
                                continue;
                        } 
                        if (ptr < tok->t_End && *ptr == '*') {
                                /*
                                 * Block comment.  'good' is set only when
                                 * the closing star-slash is found before
                                 * the end of input.
                                 */
                                int good = 0;

                                ++ptr;
                                while (ptr < tok->t_End) {
                                        if (ptr[0] == '*' &&
                                                ptr + 1 < tok->t_End &&
                                                ptr[1] == '/'
                                        ) {
                                                good = 1;
                                                ptr += 2;
                                                break;
                                        }
                                        ++ptr;
                                }
                                if (good) {
                                        tok->t_Ptr = ptr;
                                        continue;
                                }
                                /*
                                 * Ran off the end of input.  The error
                                 * token spans from the comment opener to
                                 * the end of input.
                                 */
                                tok->t_Type = TOK_ERR_UNTERMINATED_COMMENT;
                                break;
                        }
                        /*
                         * Not a comment.  The '/' has already been consumed
                         * and is treated as the first character of an
                         * operator.
                         */
                        /* fall through */
                case T_MP:
                        /*
                         * multi-character operators:
                         *      "! % & * + - / < = > ? ^ _ | ~"
                         * We have to pick out assignments '=' and parsed
                         * conditional flow '&&' and '||'.  '->' is also
                         * given its own token type (TOK_STRIND).  Any other
                         * operator sequence is returned as TOK_OPER.
                         */
                        {
                                int len;

                                while (ptr < tok->t_End &&
                                       (LexChar[(uint8_t)*ptr] & T_MP)) {
                                        ++ptr;
                                }
                                len = ptr - tok->t_Ptr;
                                if (len == 1 && ptr[-1] == '=') {
                                        tok->t_Type = TOK_ASS;
                                } else if (len == 2 &&
                                           ptr[-2] == '&' &&
                                           ptr[-1] == '&') {
                                        tok->t_Type = TOK_ANDAND;
                                } else if (len == 2 &&
                                           ptr[-2] == '|' &&
                                           ptr[-1] == '|') {
                                        tok->t_Type = TOK_OROR;
                                } else if (len == 2 &&
                                           ptr[-2] == '-' &&
                                           ptr[-1] == '>') {
                                        tok->t_Type = TOK_STRIND;
                                } else {
                                        tok->t_Type = TOK_OPER;
                                }
                        }
                        break;
                default:
                        /*
                         * Character class 0x00 in LexChar[], return a
                         * one-character error token.
                         */
                        tok->t_Type = TOK_ERR_UNEXPECTED_CHAR;
                        ++ptr;
                        break;
                }
                /*
                 * A token was produced, leave the restart loop.
                 */
                break;
        }
        /*
         * ptr is one past the token, or equal to t_Ptr if we hit the end
         * of input (giving a zero-length TOK_EOF).
         */
        tok->t_Len = ptr - tok->t_Ptr;
        if (DebugOpt)
                LexDebugLine(tok);
        return(tok->t_Type);
}
257
258 int
259 LexSkipToken(token_t *tok, int type)
260 {
261         if (tok->t_Type == type) {
262                 type = LexToken(tok);
263         } else {
264                 switch(type) {
265                 case TOK_OBRACE:
266                         type = TOK_ERR_MISSING_OBRACE;
267                         break;
268                 case TOK_CBRACE:
269                         type = TOK_ERR_MISSING_CBRACE;
270                         break;
271                 default:
272                         type = TOK_ERR_UNEXPECTED_TOKEN;
273                         break;
274                 }
275                 type = LexError(tok, type);
276         }
277         return(type);
278 }
279
280 int
281 LexPeekToken(token_t *tok)
282 {
283         token_t tmp = *tok;
284         return(LexToken(&tmp));
285 }
286
287 int
288 LexStripQuotes(token_t *tok, token_t *tmp)
289 {
290         switch(tok->t_Type) {
291         case TOK_DSTRING:
292         case TOK_SSTRING:
293         case TOK_BSTRING:
294                 *tmp = *tok;
295                 ++tmp->t_Ptr;
296                 tmp->t_Len -= 2;
297                 return(0);
298         default:
299                 break;
300         }
301         bzero(tmp, sizeof(token_t));
302         return(-1);
303 }
304
305 int
306 LexError(token_t *tok, int err)
307 {
308         if ((tok->t_Type & TOKF_ERROR) == 0) {
309                 tok->t_Type = err | TOKF_ERROR;
310         }
311         return(tok->t_Type);
312 }
313
314 void
315 LexPrintError(token_t *tok)
316 {
317         Lex *lex = tok->t_Lex;
318         int eno;
319         int len;
320         int boff = tok->t_Ptr - lex->l_Base;
321         int eoff = boff + tok->t_Len;
322
323         lexUpdateCache(lex, boff);
324
325         if ((tok->t_Type & TOKF_ERROR) == 0)
326                 eno = 0;
327         else if ((eno = tok->t_Type & TOKF_ERR_MASK) >= arysize(LexErrAry))
328                 eno = 1;
329
330         fprintf(stderr, "%s:%d: %s\n",
331                 lex->l_Path,
332                 lex->l_CacheLine,
333                 LexErrAry[eno]
334         );
335         fwrite(lex->l_Base + lex->l_CacheOff,
336                1, boff - lex->l_CacheOff, stderr);
337         fprintf(stderr, "\033[7m");
338         fwrite(lex->l_Base + boff, 1, eoff - boff, stderr);
339         fprintf(stderr, "\033[m");
340         for (len = eoff; len < lex->l_Bytes && lex->l_Base[len] != '\n'; ++len)
341                 ;
342         fwrite(lex->l_Base + eoff, 1, len - eoff, stderr);
343         fprintf(stderr, "\n");
344 }
345
346 void
347 LexPrintRef(LexRef *lr, int type)
348 {
349         Lex *lex = lr->lr_Lex;
350         int reopen = 0;
351         token_t t;
352
353         dassert(lex->l_Path != NULL);
354         if (lex->l_Fd < 0) {
355                 reopen = 1;
356                 if ((lex->l_Fd = open(lex->l_Path, O_RDONLY)) < 0) {
357                         fprintf(stderr,
358                                 "Error at offset %d in file %s.  "
359                                 "Unable to open file to access line\n",
360                                 lr->lr_Offset, lex->l_Path);
361                         return;
362                 }
363                 lex->l_Base = mmap(NULL, lex->l_Bytes,
364                                    PROT_READ, MAP_SHARED,
365                                    lex->l_Fd, 0);
366                 if (lex->l_Base == MAP_FAILED) {
367                         fprintf(stderr,
368                                 "Error at offset %d in file %s.  "
369                                 "Unable to mmap file to access line\n",
370                                 lr->lr_Offset, lex->l_Path);
371                         close(lex->l_Fd);
372                         lex->l_Fd = -1;
373                         return;
374                 }
375         }
376         bzero(&t, sizeof(t));
377         t.t_Ptr = lex->l_Base + lr->lr_Offset;
378         t.t_End = lex->l_Base + lex->l_Bytes;
379         t.t_Len = lr->lr_Len;
380         t.t_Type = type;
381         t.t_Lex = lex;
382         LexPrintError(&t);
383         if (reopen) {
384                 close(lex->l_Fd);
385                 lex->l_Fd = -1;
386                 munmap(lex->l_Base, lex->l_Bytes);
387                 lex->l_Base = MAP_FAILED;
388         }
389 }
390
391 void
392 LexInitRef(LexRef *lr, token_t *tok)
393 {
394         Lex *lex = tok->t_Lex;
395
396         lr->lr_Lex = lex;
397         lr->lr_Offset = tok->t_Ptr - lex->l_Base;
398         lr->lr_Len = tok->t_Len;
399         ++lex->l_RefCount;
400 }
401
402 void
403 LexDupRef(LexRef *s, LexRef *d)
404 {
405         *d = *s;
406         if (s->lr_Lex)
407                 ++s->lr_Lex->l_RefCount;
408 }
409
410 void
411 LexDoneRef(LexRef *lr)
412 {
413         Lex *lex;
414
415         if ((lex = lr->lr_Lex) != NULL) {
416                 dassert(lex->l_RefCount > 0);
417                 if (--lex->l_RefCount == 0 && lex->l_OpenCount == 0) {
418                         ++lex->l_OpenCount;
419                         LexClose(&lr->lr_Lex);
420                 }
421         }
422 }
423
424 const char *
425 LexErrStr(token_t *tok)
426 {
427         size_t eno = tok->t_Type & TOKF_ERR_MASK;
428         if (eno >= arysize(LexErrAry))
429                 eno = 1;
430         return(LexErrAry[eno]);
431 }
432
433 /*
434  * lexUpdateCache() - optimized line number finder for error reporting, so
435  *                    we do not have to keep track of it while lexing.
436  *
437  *      This routines sets l_CacheOff to the base of the line containing
438  *      boff and sets l_CacheLine to the line number of that line.  The first
439  *      line of a file is always line 1.
440  */
441 void
442 lexUpdateCache(Lex *lex, int boff)
443 {
444         int off = lex->l_CacheOff;
445         const char *base = lex->l_Base;
446
447         /*
448          * Locate the lexical offset going forwards, keeping track of line
449          * boundries.
450          */
451         while (off < boff && off < lex->l_Bytes) {
452                 if (base[off] == '\n')
453                         ++lex->l_CacheLine;
454                 ++off;
455         }
456
457         /*
458          * Locate the lexical offset going backwards, keeping track of
459          * line boundries.
460          */
461         while (off > boff && off > 0) {
462                 --off;
463                 if (base[off] == '\n')
464                         --lex->l_CacheLine;
465         }
466
467         /*
468          * Locate the base of the line containing boff (the current line)
469          */
470         while (off > 0 && base[off-1] != '\n')
471                 --off;
472         lex->l_CacheOff = off;
473 }
474
/*
 * isHex() - return 1 if 'c' is an ASCII hexadecimal digit
 * (0-9, a-f, A-F), 0 otherwise.
 */
static int
isHex(char c)
{
        return((c >= '0' && c <= '9') ||
               (c >= 'a' && c <= 'f') ||
               (c >= 'A' && c <= 'F'));
}
486
487 /*
488  * Scan a numerical value
489  *
490  * dddd                         decimal or octal
491  * [dddd].[dddd][e[+,-]dddd]    floating
492  * 0xHEX                        hex (base 16)
493  *
494  * Suffixes:    F       Float32                 (if floating)
495  *              D       Float64                 (if floating)
496  *              X       Float128                (if floating)
497  *              B       8-bit integer           (if integral) SB or UB only
498  *              W       16-bit integer          (if integral)
499  *              I       32-bit integer          (if integral) (default)
500  *              L       64-bit integer          (if integral)
501  *              Q       128-bit integer         (if integral)
502  *
503  * Also:        U       make integer unsigned   (if integral)
504  *              S       make integer signed     (if integral) (default)
505  *
506  *
507  */
508 static const char *
509 lexNumeric(const char *ptr, const char *pend, int *type)
510 {
511         int base;       /* -1 for floating point */
512         int didExp;
513         int didExt;
514         int bad8;
515
516         bad8 = 0;
517         didExp = 0;
518         didExt = 0;
519
520         if (ptr + 2 <= pend && ptr[0] == '0' &&
521             (ptr[1] == 'x' || ptr[1] == 'X')) {
522                 base = 16;
523                 ptr += 2;
524         } else if (ptr[0] == '0') {
525                 base = 8;
526                 ++ptr;
527         } else if (ptr[0] == '.') {
528                 base = -1;
529                 ++ptr;
530         } else {
531                 base = 10;
532         }
533         while (ptr < pend) {
534                 if (*ptr >= '0' && *ptr <= '9') {
535                         if (*ptr >= '8')
536                                 bad8 = 1;
537                         ++ptr;
538                         continue;
539                 }
540                 if (*ptr == '.') {
541                         if (base == -1 || base == 16) {
542                                 *type = TOK_ERR_BAD_NUMERIC_CONST;
543                                 ++ptr;
544                                 return ptr;
545                         }
546                         base = -1;
547                         ++ptr;
548                         continue;
549                 }
550                 if (base == 16 &&
551                     ((*ptr >= 'a' && *ptr <= 'f') ||
552                      (*ptr >= 'A' && *ptr <= 'F'))) {
553                         ++ptr;
554                         continue;
555                 }
556                 if ((*ptr == 'e' || *ptr == 'E') && didExp == 0 &&
557                     (base == -1 || base == 8 || base == 10)) {
558                         base = -1;
559                         didExp = 1;
560                         ++ptr;
561                         if (ptr < pend && (*ptr == '+' || *ptr == '-'))
562                                 ++ptr;
563                         /* continue parsing digits */
564                         continue;
565                 }
566                 break;
567         }
568         while (ptr < pend && (LexChar[(uint8_t)*ptr] & T_AL)) {
569                 /* 
570                  * Any number of extra flags
571                  */
572                 if (*ptr == 'u' || *ptr == 'U') {
573                         ++ptr;
574                         if (didExt == 0)
575                                 didExt = -1;
576                         continue;
577                 }
578                 if (*ptr == 's' || *ptr == 'S') {
579                         ++ptr;
580                         if (didExt == 0)
581                                 didExt = -1;
582                         continue;
583                 }
584
585                 /*
586                  * Only one size extension flag
587                  */
588                 if (didExt > 0) {
589                         *type = TOK_ERR_BAD_NUMERIC_CONST;
590                         ++ptr;
591                         return ptr;
592                 }
593                 switch(*ptr) {
594                 case 'f':
595                 case 'F':
596                 case 'd':
597                 case 'D':
598                         if (base == -1)
599                                 break;
600                         *type = TOK_ERR_BAD_NUMERIC_CONST;
601                         ++ptr;
602                         return ptr;
603                 case 'b':
604                 case 'B':
605                         /*
606                          * SB or UB only 'B' extension alone can be confused
607                          * with hex so we do not allow it, even for octal or
608                          * decimal.
609                          */
610                         if (didExt == 0) {
611                                 fprintf(stderr,
612                                         "Integer constant 8-bit extension "
613                                         "must be SB or UB, not just B\n");
614                                 *type = TOK_ERR_BAD_NUMERIC_CONST;
615                                 ++ptr;
616                                 return ptr;
617                         }
618                         /* fall through */
619                 case 'w':
620                 case 'W':
621                 case 'i':
622                 case 'I':
623                 case 'l':
624                 case 'L':
625                         if (base != -1)
626                                 break;
627                         *type = TOK_ERR_BAD_NUMERIC_CONST;
628                         ++ptr;
629                         return ptr;
630                 case 'x':
631                 case 'X':
632                         /*
633                          * Means 128-bits for both floating and integer
634                          * constants.
635                          */
636                         break;
637                 }
638                 didExt = 1;
639                 ++ptr;
640         }
641
642         if (base == 9 && bad8) {
643                 *type = TOK_ERR_BAD_NUMERIC_CONST;
644         } else if (base == -1) {
645                 *type = TOK_FLOAT;
646         } else {
647                 *type = TOK_INTEGER;
648         }
649         return ptr;
650 }
651
652 /*
653  * Scan a quoted string.  The entire string including the quotes is returned.
654  * We also support string concatenation via multiple quoted strings,
655  * aka "abc" "def".
656  */
657 static const char *
658 lexQuoted(const char *ptr, const char *pend, int *type)
659 {
660         *type = *ptr;
661         ++ptr;
662
663         while (ptr < pend) {
664                 /*
665                  * Check terminator, loop on concat case for double-quoted
666                  * strings.
667                  */
668                 if (*ptr == (char)*type) {
669                         const char *save;
670
671                         save = ++ptr;
672                         if (*type == '"') {
673                                 while (ptr < pend &&
674                                        (LexChar[(uint8_t)*ptr] & T_WS)) {
675                                         ++ptr;
676                                 }
677                                 if (ptr < pend && *ptr == (char)*type) {
678                                         ++ptr;
679                                         continue;
680                                 }
681                                 ptr = save;
682                         }
683                         return(ptr);
684                 }
685
686                 /*
687                  * Embedded CR or LF is not allowed (use string concatenation
688                  * instead).
689                  */
690                 if (*ptr == '\n' || *ptr == '\r') {
691                         *type = TOK_ERR_EMBEDDED_CRLF;
692                         return(ptr);
693                 } else if ((uint8_t)*ptr < 0x20) {
694                         *type = TOK_ERR_ILLEGAL_ESCAPE;
695                         return(ptr);
696                 }
697
698                 /*
699                  * Escape handling
700                  */
701                 if (*ptr == '\\') {
702                         if (ptr + 1 >= pend)
703                                 break;
704                         ++ptr;
705                         if (*ptr == 'n' || *ptr == 't' ||
706                             *ptr == 'r' || *ptr == '\\') {
707                                 ++ptr;
708                                 continue;
709                         }
710                         if (*ptr == 'x') {
711                                 if (ptr + 2 >= pend ||
712                                     !isHex(ptr[0]) ||
713                                     !isHex(ptr[1])) {
714                                         *type = TOK_ERR_ILLEGAL_ESCAPE;
715                                         return(ptr);
716                                 }
717                                 ptr += 3;
718                                 continue;
719                         }
720                         if (*ptr >= '0' && *ptr <= '7') {
721                                 ++ptr;
722                                 if (ptr < pend &&
723                                     *ptr >= '0' && *ptr <= '7') {
724                                         ++ptr;
725                                         if (ptr < pend &&
726                                             *ptr >= '0' && *ptr <= '7')
727                                                 ++ptr;
728                                 }
729                                 continue;
730                         }
731                         *type = TOK_ERR_ILLEGAL_ESCAPE;
732                         return(ptr);
733                 }
734                 ++ptr;
735         }
736         *type = TOK_ERR_UNTERMINATED_STR;
737         return(ptr);
738 }
739
740 /*
741  * Calculate type of symbolic identifier.  It can be one of
742  * TOK_ID or TOK_CLASSID.  The rules are:
743  *
744  * - <anything>_t or <anything>_p or <anything>_m will be a TOK_CLASSID
745  * - all alpha characters are caps will be TOK_CONSTID (use TOK_ID for now)
746  * - first character is an upper-case alpha will be TOK_CLASSID
747  * - otherwise TOK_ID.
748  */
749 static
750 int
751 lexSymTokenType(const char *ptr, size_t len)
752 {
753         const char *pen;
754
755         /*
756          * Single lowercase alpha character suffix, must be a class
757          * identifier.
758          */
759         if (len >= 2 && ptr[len-2] == '_') {
760                 switch(ptr[len-1]) {
761                 case 't':
762                 case 'u':
763                 case 'p':
764                 case 'm':
765                         return(TOK_CLASSID);
766                 }
767         }
768
769         if (*ptr >= 'A' && *ptr <= 'Z') {
770                 pen = ptr + len;
771                 while (++ptr < pen) {
772                         if (*ptr >= 'a' && *ptr <= 'z')
773                                 return(TOK_CLASSID);
774                 }
775                 return (TOK_ID);
776         }
777         return (TOK_ID);
778 }