Rune - Change regsave burden for syscalls
[rune.git] / librune / lex.c
1 /*
2  * LEX.C        - Rune Language Lexer
3  *
4  * (c)Copyright 1993-2014, Matthew Dillon, All Rights Reserved.  See the  
5  *    COPYRIGHT file at the base of the distribution.
6  */
7
8 #include "defs.h"
9
10 static const u_int8_t LexChar[256] = {
11         /* 00 */ 0x00,  0x00,   0x00,   0x00,   0x00,   0x00,   0x00,   0x00,
12         /* 08 */ 0x00,  T_WS,   T_WS,   0x00,   T_WS,   T_WS,   0x00,   0x00,
13         /* 10 */ 0x00,  0x00,   0x00,   0x00,   0x00,   0x00,   0x00,   0x00,
14         /* 18 */ 0x00,  0x00,   0x00,   0x00,   0x00,   0x00,   0x00,   0x00,
15         /* 20 */ T_WS,  T_MP,   T_QU,   T_CM,   T_SP,   T_MP,   T_MP,   T_QU,
16         /* 28 */ T_SP,  T_SP,   T_MP,   T_MP,   T_SP,   T_MP,   T_SP,   T_XP,
17         /* 30 */ T_NU,  T_NU,   T_NU,   T_NU,   T_NU,   T_NU,   T_NU,   T_NU,
18         /* 38 */ T_NU,  T_NU,   T_SP,   T_SP,   T_MP,   T_MP,   T_MP,   T_MP,
19         /* 40 */ T_SP,  T_AL,   T_AL,   T_AL,   T_AL,   T_AL,   T_AL,   T_AL,
20         /* 48 */ T_AL,  T_AL,   T_AL,   T_AL,   T_AL,   T_AL,   T_AL,   T_AL,
21         /* 50 */ T_AL,  T_AL,   T_AL,   T_AL,   T_AL,   T_AL,   T_AL,   T_AL,
22         /* 58 */ T_AL,  T_AL,   T_AL,   T_SP,   0x00,   T_SP,   T_MP,   T_AL,
23         /* 60 */ T_QU,  T_AL,   T_AL,   T_AL,   T_AL,   T_AL,   T_AL,   T_AL,
24         /* 68 */ T_AL,  T_AL,   T_AL,   T_AL,   T_AL,   T_AL,   T_AL,   T_AL,
25         /* 70 */ T_AL,  T_AL,   T_AL,   T_AL,   T_AL,   T_AL,   T_AL,   T_AL,
26         /* 78 */ T_AL,  T_AL,   T_AL,   T_SP,   T_MP,   T_SP,   T_MP,   0x00
27 };
28
29 static const char * const LexErrAry[] = { LEX_ERR_STRINGS };
30
31 static int isHex(char c);
32 static const char *lexNumeric(const char *ptr, const char *pend, int *type);
33 static const char *lexQuoted(const char *ptr, const char *pend, int *type,
34                                 int *linep);
35 static int lexSymTokenType(const char *ptr, size_t len);
36
37 static lexlist_t LexList = RUNE_HEAD_INITIALIZER(LexList);
38
39 Lex *
40 LexOpen(const char *path, token_t *tok)
41 {
42         Lex *lex = zalloc(sizeof(Lex));
43
44         bzero(tok, sizeof(token_t));
45         lex->l_Base = MAP_FAILED;
46         lex->l_OpenCount = 1;
47         RUNE_INSERT_TAIL(&LexList, lex, l_Node);
48
49         if ((lex->l_Fd = open(path, O_RDONLY)) >= 0) {
50                 struct stat st;
51                 if (fstat(lex->l_Fd, &st) == 0) {
52                         lex->l_Base = mmap(NULL, st.st_size, PROT_READ,
53                                            MAP_SHARED, lex->l_Fd, 0);
54                         if (lex->l_Base != MAP_FAILED) {
55                                 lex->l_Bytes = st.st_size;
56                                 lex->l_CacheLine = 1;
57                                 lex->l_CacheOff = 0;
58                                 lex->l_Path = safe_strdup(path);
59                                 tok->t_Lex = lex;
60                                 tok->t_Ptr = lex->l_Base;
61                                 tok->t_End = lex->l_Base + lex->l_Bytes;
62                                 tok->t_Line = 1;
63                                 tok->t_LinesInToken = 0;
64
65                                 return(lex);
66                         }
67                 }
68         }
69         LexClose(&lex);
70         return(NULL);
71 }
72
73 /*
74  * Cleanup after we are done lexing.  This can be called
75  * multiple times without harm.
76  */
77 void
78 LexClose(Lex **plex)
79 {
80         Lex *lex;
81
82         if ((lex = *plex) != NULL) {
83                 *plex = NULL;
84
85                 dassert(lex->l_OpenCount > 0);
86                 if (--lex->l_OpenCount == 0) {
87                         if (lex->l_Base != MAP_FAILED) {
88                                 munmap(lex->l_Base, lex->l_Bytes);
89                                 lex->l_Base = MAP_FAILED;
90                         }
91                         if (lex->l_Fd >= 0) {
92                                 close(lex->l_Fd);
93                                 lex->l_Fd = -1;
94                         }
95                         if (lex->l_RefCount == 0) {
96                                 safe_free(&lex->l_Path);
97                                 RUNE_REMOVE(&LexList, lex, l_Node);
98                                 zfree_wipe(lex, sizeof(Lex));
99                         }
100                 }
101         }
102 }
103
104 /*
105  * Regenerate the first N bytes of the token.  This is used to
106  * chop-down the size of a token when the lexer gets it wrong.
107  * For example, the '**' operator in certain contexts is really
108  * two pointer-indirection tokens ('*' '*').
109  */
110 int
111 LexRedactToken(token_t *tok, int bytes)
112 {
113         const char *save;
114         int t;
115
116         dassert(bytes <= tok->t_Len);
117         save = tok->t_End;
118         tok->t_End = tok->t_Ptr + bytes;
119         tok->t_Len = 0;
120         t = LexToken(tok);
121         tok->t_End = save;
122
123         return(t);
124 }
125
126 int
127 LexToken(token_t *tok)
128 {
129         const char *ptr;
130         int otype = tok->t_Type;
131
132         tok->t_Type = TOK_EOF;
133         tok->t_Ptr += tok->t_Len;
134         tok->t_Line += tok->t_LinesInToken;
135         tok->t_LinesInToken = 0;
136
137         while ((ptr = tok->t_Ptr) < tok->t_End) {
138                 switch(LexChar[(uint8_t)*ptr]) {
139                 case T_WS:
140                         if (*ptr == '\n')
141                                 ++tok->t_LinesInToken;
142                         ++tok->t_Ptr;
143                         continue;
144                 case T_AL:
145                         while (ptr < tok->t_End &&
146                                (LexChar[(uint8_t)*ptr] & T_ALNU)) {
147                                 ++ptr;
148                         }
149                         tok->t_Type = SymKeywordFind(tok->t_Ptr,
150                                                      ptr - tok->t_Ptr, 0);
151                         if (tok->t_Type == 0) {
152                                 tok->t_Type = lexSymTokenType(tok->t_Ptr,
153                                                         ptr - tok->t_Ptr);
154                         }
155                         break;
156                 case T_NU:
157                         ptr = lexNumeric(ptr, tok->t_End, &tok->t_Type);
158                         break;
159                 case T_QU:
160                         ptr = lexQuoted(ptr, tok->t_End, &tok->t_Type,
161                                         &tok->t_LinesInToken);
162                         break;
163                 case T_SP:
164                         /*
165                          * Single character special, special case .n
166                          * for floating point constant.
167                          */
168                         tok->t_Type = *ptr;
169                         ++ptr;
170                         if (tok->t_Type == '.' && ptr < tok->t_End &&
171                             (LexChar[(uint8_t)*ptr] & T_NU)) {
172                                 ptr = lexNumeric(ptr - 1, tok->t_End,
173                                                  &tok->t_Type);
174                         }
175                         break;
176                 case T_CM:
177                         /*
178                          * #line comment
179                          */
180                         while (ptr < tok->t_End && *ptr != '\n')
181                                 ++ptr;
182                         if (ptr < tok->t_End) {
183                                 ++ptr;
184                                 ++tok->t_LinesInToken;
185                         }
186                         tok->t_Ptr = ptr;
187                         continue;
188                 case T_XP:
189                         /*
190                          * '/' character.  Look for / / or / *
191                          */
192                         ++ptr;
193                         if (ptr < tok->t_End && *ptr == '/') {
194                                 while (ptr < tok->t_End && *ptr != '\n')
195                                         ++ptr;
196                                 if (ptr < tok->t_End) {
197                                         ++ptr;
198                                         ++tok->t_LinesInToken;
199                                 }
200                                 tok->t_Ptr = ptr;
201                                 continue;
202                         } 
203                         if (ptr < tok->t_End && *ptr == '*') {
204                                 int good = 0;
205
206                                 ++ptr;
207                                 while (ptr < tok->t_End) {
208                                         if (*ptr == '\n')
209                                                 ++tok->t_LinesInToken;
210                                         if (ptr[0] == '*' &&
211                                                 ptr + 1 < tok->t_End &&
212                                                 ptr[1] == '/'
213                                         ) {
214                                                 good = 1;
215                                                 ptr += 2;
216                                                 break;
217                                         }
218                                         ++ptr;
219                                 }
220                                 if (good) {
221                                         tok->t_Ptr = ptr;
222                                         continue;
223                                 }
224                                 tok->t_Type = TOK_ERR_UNTERMINATED_COMMENT;
225                                 break;
226                         }
227                         /* fall through */
228                 case T_MP:
229                         /*
230                          * Multi-character operators:
231                          *
232                          *      "! % & * + - / < = > ? ^ _ | ~"
233                          *
234                          * We have to pick out assignments '=' and parsed 
235                          * conditional flow '&&' and '||'.
236                          *
237                          * Prefixed '*'s are handled by parser2.c via
238                          * LexRedactToken().  However, something like
239                          * ++* or --* is broken up here.  Rune does not
240                          * allow any '+*' or '-*' sequence to be an operator.
241                          * They are always treated as separate operators,
242                          * i.e. 'blah *'.
243                          */
244                         {
245                                 int len;
246
247                                 if (*ptr == '<' && otype == TOK_IMPORT) {
248                                         ptr = lexQuoted(ptr, tok->t_End,
249                                                         &tok->t_Type,
250                                                         &tok->t_LinesInToken);
251                                         break;
252                                 }
253                                 while (ptr < tok->t_End &&
254                                        (LexChar[(uint8_t)*ptr] & T_MP)) {
255                                         if (ptr[0] == '*' &&
256                                             ptr != tok->t_Ptr &&
257                                             (ptr[-1] == '+' || ptr[-1] == '-')){
258                                                 break;
259                                         }
260                                         ++ptr;
261                                 }
262                                 len = ptr - tok->t_Ptr;
263                                 if (len == 1 && ptr[-1] == '=') {
264                                         tok->t_Type = TOK_ASS;
265                                 } else if (len == 2 &&
266                                            ptr[-2] == '&' &&
267                                            ptr[-1] == '&') {
268                                         tok->t_Type = TOK_ANDAND;
269                                 } else if (len == 2 &&
270                                            ptr[-2] == '|' &&
271                                            ptr[-1] == '|') {
272                                         tok->t_Type = TOK_OROR;
273                                 } else if (len == 2 &&
274                                            ptr[-2] == '-' &&
275                                            ptr[-1] == '>') {
276                                         tok->t_Type = TOK_STRIND;
277                                 } else {
278                                         tok->t_Type = TOK_OPER;
279                                 }
280                         }
281                         break;
282                 default:
283                         tok->t_Type = TOK_ERR_UNEXPECTED_CHAR;
284                         ++ptr;
285                         break;
286                 }
287                 break;
288         }
289         tok->t_Len = ptr - tok->t_Ptr;
290         if (DebugOpt >= 9)
291                 LexDebugLine(tok);
292         return(tok->t_Type);
293 }
294
295 int
296 LexSkipToken(token_t *tok, int type)
297 {
298         if (tok->t_Type == type) {
299                 type = LexToken(tok);
300         } else {
301                 switch(type) {
302                 case TOK_OBRACE:
303                         type = TOK_ERR_MISSING_OBRACE;
304                         break;
305                 case TOK_CBRACE:
306                         type = TOK_ERR_MISSING_CBRACE;
307                         break;
308                 default:
309                         type = TOK_ERR_UNEXPECTED_TOKEN;
310                         break;
311                 }
312                 type = LexError(tok, type);
313         }
314         return(type);
315 }
316
317 int
318 LexPeekToken(token_t *tok)
319 {
320         token_t tmp = *tok;
321         return(LexToken(&tmp));
322 }
323
324 int
325 LexStripQuotes(token_t *tok, token_t *tmp)
326 {
327         switch(tok->t_Type) {
328         case TOK_ANGLESTR:
329         case TOK_DSTRING:
330         case TOK_SSTRING:
331         case TOK_BSTRING:
332                 *tmp = *tok;
333                 ++tmp->t_Ptr;
334                 tmp->t_Len -= 2;
335                 return(0);
336         default:
337                 break;
338         }
339         bzero(tmp, sizeof(token_t));
340         return(-1);
341 }
342
343 int
344 LexError(token_t *tok, int err)
345 {
346         if ((tok->t_Type & TOKF_ERROR) == 0) {
347                 tok->t_Type = err | TOKF_ERROR;
348         }
349         return(tok->t_Type);
350 }
351
352 void
353 LexPrintError(token_t *tok)
354 {
355         Lex *lex = tok->t_Lex;
356         int eno;
357         int len;
358         int boff = tok->t_Ptr - lex->l_Base;
359         int eoff = boff + tok->t_Len;
360
361         lexUpdateCache(lex, boff);
362
363         if ((tok->t_Type & TOKF_ERROR) == 0)
364                 eno = 0;
365         else if ((eno = tok->t_Type & TOKF_ERR_MASK) >= arysize(LexErrAry))
366                 eno = 1;
367
368         fprintf(stderr, "%s:%d: %s\n",
369                 lex->l_Path,
370                 lex->l_CacheLine,
371                 LexErrAry[eno]
372         );
373         fwrite(lex->l_Base + lex->l_CacheOff,
374                1, boff - lex->l_CacheOff, stderr);
375         fprintf(stderr, "\033[7m");
376         fwrite(lex->l_Base + boff, 1, eoff - boff, stderr);
377         fprintf(stderr, "\033[m");
378         for (len = eoff; len < lex->l_Bytes && lex->l_Base[len] != '\n'; ++len)
379                 ;
380         fwrite(lex->l_Base + eoff, 1, len - eoff, stderr);
381         fprintf(stderr, "\n");
382 }
383
384 void
385 LexPrintRef(LexRef *lr, int type)
386 {
387         Lex *lex = lr->lr_Lex;
388         int reopen = 0;
389         token_t t;
390
391         dassert(lex->l_Path != NULL);
392         if (lex->l_Fd < 0) {
393                 reopen = 1;
394                 if ((lex->l_Fd = open(lex->l_Path, O_RDONLY)) < 0) {
395                         fprintf(stderr,
396                                 "Error at offset %d in file %s.  "
397                                 "Unable to open file to access line\n",
398                                 lr->lr_Offset, lex->l_Path);
399                         return;
400                 }
401                 lex->l_Base = mmap(NULL, lex->l_Bytes,
402                                    PROT_READ, MAP_SHARED,
403                                    lex->l_Fd, 0);
404                 if (lex->l_Base == MAP_FAILED) {
405                         fprintf(stderr,
406                                 "Error at offset %d in file %s.  "
407                                 "Unable to mmap file to access line\n",
408                                 lr->lr_Offset, lex->l_Path);
409                         close(lex->l_Fd);
410                         lex->l_Fd = -1;
411                         return;
412                 }
413         }
414         bzero(&t, sizeof(t));
415         t.t_Ptr = lex->l_Base + lr->lr_Offset;
416         t.t_End = lex->l_Base + lex->l_Bytes;
417         t.t_Len = lr->lr_Len;
418         t.t_Type = type;
419         t.t_Lex = lex;
420         LexPrintError(&t);
421         if (reopen) {
422                 close(lex->l_Fd);
423                 lex->l_Fd = -1;
424                 munmap(lex->l_Base, lex->l_Bytes);
425                 lex->l_Base = MAP_FAILED;
426         }
427 }
428
429 void
430 LexInitRef(LexRef *lr, token_t *tok)
431 {
432         Lex *lex = tok->t_Lex;
433
434         lr->lr_Lex = lex;
435         lr->lr_Offset = tok->t_Ptr - lex->l_Base;
436         lr->lr_Len = tok->t_Len;
437         lr->lr_Line = tok->t_Line;
438         ++lex->l_RefCount;
439 }
440
441 void
442 LexDupRef(LexRef *s, LexRef *d)
443 {
444         *d = *s;
445         if (s->lr_Lex)
446                 ++s->lr_Lex->l_RefCount;
447 }
448
449 void
450 LexDoneRef(LexRef *lr)
451 {
452         Lex *lex;
453
454         if ((lex = lr->lr_Lex) != NULL) {
455                 dassert(lex->l_RefCount > 0);
456                 if (--lex->l_RefCount == 0 && lex->l_OpenCount == 0) {
457                         ++lex->l_OpenCount;
458                         LexClose(&lr->lr_Lex);
459                 }
460         }
461 }
462
463 const char *
464 LexErrStr(token_t *tok)
465 {
466         size_t eno = tok->t_Type & TOKF_ERR_MASK;
467         if (eno >= arysize(LexErrAry))
468                 eno = 1;
469         return(LexErrAry[eno]);
470 }
471
472 /*
473  * lexUpdateCache() - optimized line number finder for error reporting, so
474  *                    we do not have to keep track of it while lexing.
475  *
476  *      This routines sets l_CacheOff to the base of the line containing
477  *      boff and sets l_CacheLine to the line number of that line.  The first
478  *      line of a file is always line 1.
479  */
480 void
481 lexUpdateCache(Lex *lex, int boff)
482 {
483         int off = lex->l_CacheOff;
484         const char *base = lex->l_Base;
485
486         /*
487          * Locate the lexical offset going forwards, keeping track of line
488          * boundries.
489          */
490         while (off < boff && off < lex->l_Bytes) {
491                 if (base[off] == '\n')
492                         ++lex->l_CacheLine;
493                 ++off;
494         }
495
496         /*
497          * Locate the lexical offset going backwards, keeping track of
498          * line boundries.
499          */
500         while (off > boff && off > 0) {
501                 --off;
502                 if (base[off] == '\n')
503                         --lex->l_CacheLine;
504         }
505
506         /*
507          * Locate the base of the line containing boff (the current line)
508          */
509         while (off > 0 && base[off-1] != '\n')
510                 --off;
511         lex->l_CacheOff = off;
512 }
513
/*
 * Return 1 if c is an ASCII hexadecimal digit (0-9, a-f, A-F), else 0.
 */
static int
isHex(char c)
{
        return((c >= '0' && c <= '9') ||
               (c >= 'a' && c <= 'f') ||
               (c >= 'A' && c <= 'F'));
}
525
526 /*
527  * Scan a numerical value
528  *
529  * dddd                         decimal or octal
530  * [dddd].[dddd][e[+,-]dddd]    floating
531  * 0xHEX                        hex (base 16)
532  *
533  * Suffixes:    F       Float32                 (if floating)
534  *              D       Float64                 (if floating)
535  *              X       Float128                (if floating)
536  *              B       8-bit integer           (if integral) SB or UB only
537  *              W       16-bit integer          (if integral)
538  *              I       32-bit integer          (if integral) (default)
539  *              L       64-bit integer          (if integral)
540  *              Q       128-bit integer         (if integral)
541  *
542  * Also:        U       make integer unsigned   (if integral)
543  *              S       make integer signed     (if integral) (default)
544  *
545  *
546  */
547 static const char *
548 lexNumeric(const char *ptr, const char *pend, int *type)
549 {
550         int base;       /* -1 for floating point */
551         int didExp;
552         int didExt;
553         int bad8;
554
555         bad8 = 0;
556         didExp = 0;
557         didExt = 0;
558
559         if (ptr + 2 <= pend && ptr[0] == '0' &&
560             (ptr[1] == 'x' || ptr[1] == 'X')) {
561                 base = 16;
562                 ptr += 2;
563         } else if (ptr[0] == '0') {
564                 base = 8;
565                 ++ptr;
566         } else if (ptr[0] == '.') {
567                 base = -1;
568                 ++ptr;
569         } else {
570                 base = 10;
571         }
572         while (ptr < pend) {
573                 if (*ptr >= '0' && *ptr <= '9') {
574                         if (*ptr >= '8')
575                                 bad8 = 1;
576                         ++ptr;
577                         continue;
578                 }
579                 if (*ptr == '.') {
580                         if (base == -1 || base == 16) {
581                                 *type = TOK_ERR_BAD_NUMERIC_CONST;
582                                 ++ptr;
583                                 return ptr;
584                         }
585                         base = -1;
586                         ++ptr;
587                         continue;
588                 }
589                 if (base == 16 &&
590                     ((*ptr >= 'a' && *ptr <= 'f') ||
591                      (*ptr >= 'A' && *ptr <= 'F'))) {
592                         ++ptr;
593                         continue;
594                 }
595                 if ((*ptr == 'e' || *ptr == 'E') && didExp == 0 &&
596                     (base == -1 || base == 8 || base == 10)) {
597                         base = -1;
598                         didExp = 1;
599                         ++ptr;
600                         if (ptr < pend && (*ptr == '+' || *ptr == '-'))
601                                 ++ptr;
602                         /* continue parsing digits */
603                         continue;
604                 }
605                 break;
606         }
607         while (ptr < pend && (LexChar[(uint8_t)*ptr] & T_AL)) {
608                 /* 
609                  * Any number of extra flags
610                  */
611                 if (*ptr == 'u' || *ptr == 'U') {
612                         ++ptr;
613                         if (didExt == 0)
614                                 didExt = -1;
615                         continue;
616                 }
617                 if (*ptr == 's' || *ptr == 'S') {
618                         ++ptr;
619                         if (didExt == 0)
620                                 didExt = -1;
621                         continue;
622                 }
623
624                 /*
625                  * Only one size extension flag
626                  */
627                 if (didExt > 0) {
628                         *type = TOK_ERR_BAD_NUMERIC_CONST;
629                         ++ptr;
630                         return ptr;
631                 }
632                 switch(*ptr) {
633                 case 'f':
634                 case 'F':
635                 case 'd':
636                 case 'D':
637                         if (base == -1)
638                                 break;
639                         *type = TOK_ERR_BAD_NUMERIC_CONST;
640                         ++ptr;
641                         return ptr;
642                 case 'b':
643                 case 'B':
644                         /*
645                          * SB or UB only 'B' extension alone can be confused
646                          * with hex so we do not allow it, even for octal or
647                          * decimal.
648                          */
649                         if (didExt == 0) {
650                                 fprintf(stderr,
651                                         "Integer constant 8-bit extension "
652                                         "must be SB or UB, not just B\n");
653                                 *type = TOK_ERR_BAD_NUMERIC_CONST;
654                                 ++ptr;
655                                 return ptr;
656                         }
657                         /* fall through */
658                 case 'w':
659                 case 'W':
660                 case 'i':
661                 case 'I':
662                 case 'l':
663                 case 'L':
664                 case 'q':
665                 case 'Q':
666                         if (base != -1)
667                                 break;
668                         *type = TOK_ERR_BAD_NUMERIC_CONST;
669                         ++ptr;
670                         return ptr;
671                 case 'x':
672                 case 'X':
673                         /*
674                          * Means 128-bits for both floating and integer
675                          * constants.
676                          */
677                         break;
678                 }
679                 didExt = 1;
680                 ++ptr;
681         }
682
683         if (base == 9 && bad8) {
684                 *type = TOK_ERR_BAD_NUMERIC_CONST;
685         } else if (base == -1) {
686                 *type = TOK_FLOAT;
687         } else {
688                 *type = TOK_INTEGER;
689         }
690         return ptr;
691 }
692
693 /*
694  * Scan a quoted string.  The entire string including the quotes is returned.
695  * We also support string concatenation via multiple quoted strings,
696  * aka "abc" "def".  Mixing double and back-ticked strings is allowed.
697  */
698 static const char *
699 lexQuoted(const char *ptr, const char *pend, int *typep, int *linep)
700 {
701         char btype;
702         char etype;
703
704         *typep = btype = etype = *ptr;
705         ++ptr;
706         if (btype == '<')
707                 etype = '>';
708
709         while (ptr < pend) {
710                 /*
711                  * Check terminator, loop on concat case for double-quoted
712                  * and back-ticked strings.
713                  */
714                 if (*ptr == etype) {
715                         const char *save;
716
717                         save = ++ptr;
718                         if (etype == TOK_DSTRING || etype == TOK_BSTRING) {
719                                 while (ptr < pend &&
720                                        (LexChar[(uint8_t)*ptr] & T_WS)) {
721                                         if (*ptr == '\n')
722                                                 ++*linep;
723                                         ++ptr;
724                                 }
725                                 if (ptr < pend && *ptr == btype) {
726                                         ++ptr;
727                                         continue;
728                                 }
729                                 ptr = save;
730                         }
731                         return(ptr);
732                 }
733
734                 /*
735                  * Embedded CR or LF is not allowed (use string concatenation
736                  * instead).
737                  */
738                 if (*ptr == '\n' || *ptr == '\r') {
739                         *typep = TOK_ERR_EMBEDDED_CRLF;
740                         return(ptr);
741                 } else if ((uint8_t)*ptr < 0x20) {
742                         *typep = TOK_ERR_ILLEGAL_ESCAPE;
743                         return(ptr);
744                 }
745
746                 /*
747                  * Escape handling
748                  */
749                 if (etype != TOK_BSTRING && *ptr == '\\') {
750                         if (ptr + 1 >= pend)
751                                 break;
752                         ++ptr;
753                         if (*ptr == 'n' || *ptr == 't' ||
754                             *ptr == 'r' || *ptr == '\\') {
755                                 ++ptr;
756                                 continue;
757                         }
758                         if (*ptr == 'x') {
759                                 if (ptr + 2 >= pend ||
760                                     !isHex(ptr[0]) ||
761                                     !isHex(ptr[1])) {
762                                         *typep = TOK_ERR_ILLEGAL_ESCAPE;
763                                         return(ptr);
764                                 }
765                                 ptr += 3;
766                                 continue;
767                         }
768                         if (*ptr >= '0' && *ptr <= '7') {
769                                 ++ptr;
770                                 if (ptr < pend &&
771                                     *ptr >= '0' && *ptr <= '7') {
772                                         ++ptr;
773                                         if (ptr < pend &&
774                                             *ptr >= '0' && *ptr <= '7')
775                                                 ++ptr;
776                                 }
777                                 continue;
778                         }
779                         *typep = TOK_ERR_ILLEGAL_ESCAPE;
780                         return(ptr);
781                 }
782                 ++ptr;
783         }
784         *typep = TOK_ERR_UNTERMINATED_STR;
785
786         return(ptr);
787 }
788
789 /*
790  * Calculate type of symbolic identifier.  It can be one of
791  * TOK_ID or TOK_CLASSID.  The rules are:
792  *
793  * - <anything>_t or <anything>_p or <anything>_m will be a TOK_CLASSID
794  * - all alpha characters are caps will be TOK_CONSTID (use TOK_ID for now)
795  * - first character is an upper-case alpha will be TOK_CLASSID
796  * - otherwise TOK_ID.
797  */
798 static
799 int
800 lexSymTokenType(const char *ptr, size_t len)
801 {
802         const char *pen;
803
804         /*
805          * Single lowercase alpha character suffix, must be a class
806          * identifier.
807          */
808         if (len >= 2 && ptr[len-2] == '_') {
809                 switch(ptr[len-1]) {
810                 case 't':
811                 case 'u':
812                 case 'p':
813                 case 'm':
814                         return(TOK_CLASSID);
815                 }
816         }
817
818         if (*ptr >= 'A' && *ptr <= 'Z') {
819                 pen = ptr + len;
820                 while (++ptr < pen) {
821                         if (*ptr >= 'a' && *ptr <= 'z')
822                                 return(TOK_CLASSID);
823                 }
824                 return (TOK_ID);
825         }
826         return (TOK_ID);
827 }