Import of awk 20040207
[dragonfly.git] / contrib / awk20040207 / lex.c
CommitLineData
d945cdf9
EN
1/****************************************************************
2Copyright (C) Lucent Technologies 1997
3All Rights Reserved
4
5Permission to use, copy, modify, and distribute this software and
6its documentation for any purpose and without fee is hereby
7granted, provided that the above copyright notice appear in all
8copies and that both that the copyright notice and this
9permission notice and warranty disclaimer appear in supporting
10documentation, and that the name Lucent Technologies or any of
11its entities not be used in advertising or publicity pertaining
12to distribution of the software without specific, written prior
13permission.
14
15LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22THIS SOFTWARE.
23****************************************************************/
24
25#include <stdio.h>
26#include <stdlib.h>
27#include <string.h>
28#include <ctype.h>
29#include "awk.h"
30#include "ytab.h"
31
32extern YYSTYPE yylval;
33extern int infunc;
34
/* Source line counter and the current nesting depth of { }, [ ] and ( ). */
int lineno = 1;
int bracecnt = 0, brackcnt = 0, parencnt = 0;
39
40typedef struct Keyword {
41 const char *word;
42 int sub;
43 int type;
44} Keyword;
45
46Keyword keywords[] ={ /* keep sorted: binary searched */
47 { "BEGIN", XBEGIN, XBEGIN },
48 { "END", XEND, XEND },
49 { "NF", VARNF, VARNF },
50 { "atan2", FATAN, BLTIN },
51 { "break", BREAK, BREAK },
52 { "close", CLOSE, CLOSE },
53 { "continue", CONTINUE, CONTINUE },
54 { "cos", FCOS, BLTIN },
55 { "delete", DELETE, DELETE },
56 { "do", DO, DO },
57 { "else", ELSE, ELSE },
58 { "exit", EXIT, EXIT },
59 { "exp", FEXP, BLTIN },
60 { "fflush", FFLUSH, BLTIN },
61 { "for", FOR, FOR },
62 { "func", FUNC, FUNC },
63 { "function", FUNC, FUNC },
64 { "getline", GETLINE, GETLINE },
65 { "gsub", GSUB, GSUB },
66 { "if", IF, IF },
67 { "in", IN, IN },
68 { "index", INDEX, INDEX },
69 { "int", FINT, BLTIN },
70 { "length", FLENGTH, BLTIN },
71 { "log", FLOG, BLTIN },
72 { "match", MATCHFCN, MATCHFCN },
73 { "next", NEXT, NEXT },
74 { "nextfile", NEXTFILE, NEXTFILE },
75 { "print", PRINT, PRINT },
76 { "printf", PRINTF, PRINTF },
77 { "rand", FRAND, BLTIN },
78 { "return", RETURN, RETURN },
79 { "sin", FSIN, BLTIN },
80 { "split", SPLIT, SPLIT },
81 { "sprintf", SPRINTF, SPRINTF },
82 { "sqrt", FSQRT, BLTIN },
83 { "srand", FSRAND, BLTIN },
84 { "sub", SUB, SUB },
85 { "substr", SUBSTR, SUBSTR },
86 { "system", FSYSTEM, BLTIN },
87 { "tolower", FTOLOWER, BLTIN },
88 { "toupper", FTOUPPER, BLTIN },
89 { "while", WHILE, WHILE },
90};
91
/*
 * RET(x): return token x from the enclosing function.  When DEBUG is
 * defined and dbg is nonzero, "lex <token name>" is printed first.
 * The debug form is wrapped in do { } while (0) so that `RET(x);' is a
 * single statement and stays correct as the body of an if/else.
 * Note that x is evaluated twice in the debug form.
 */
#define DEBUG
#ifdef DEBUG
#define RET(x)	do { if (dbg) printf("lex %s\n", tokname(x)); return (x); } while (0)
#else
#define RET(x)	return (x)
#endif
98
/* peek: return the next input character without consuming it. */
int peek(void)
{
	int next;

	next = input();
	unput(next);	/* push it straight back for the next input() */
	return next;
}
105
/*
 * gettok: read the next raw token into the buffer *pbuf.
 *
 * pbuf, psz: address of the token buffer and of its size; both are
 *	updated on return because the buffer may be grown through adjbuf().
 *
 * Returns
 *	0	at end of input;
 *	'a'	for a name ([A-Za-z_][A-Za-z0-9_]*), text in *pbuf;
 *	'0'	for a number, text in *pbuf;
 *	otherwise the single character read (also left in (*pbuf)[0]).
 */
int gettok(char **pbuf, int *psz)	/* get next input token */
{
	int c, retc;
	char *buf = *pbuf;
	int sz = *psz;
	char *bp = buf;

	c = input();
	if (c == 0)
		return 0;
	buf[0] = c;
	buf[1] = 0;
	/* casts: <ctype.h> functions are undefined for negative arguments */
	if (!isalnum((unsigned char)c) && c != '.' && c != '_')
		return c;

	*bp++ = c;
	if (isalpha((unsigned char)c) || c == '_') {	/* it's a varname */
		for ( ; (c = input()) != 0; ) {
			/* keep room for this character AND the NUL written after the loop */
			if (bp-buf+2 > sz)
				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, 0))
					FATAL( "out of space for name %.10s...", buf );
			if (isalnum((unsigned char)c) || c == '_')
				*bp++ = c;
			else {
				*bp = 0;
				unput(c);
				break;
			}
		}
		*bp = 0;
		retc = 'a';	/* alphanumeric */
	} else {	/* it's a number */
		char *rem;
		/* read input until can't be a number */
		for ( ; (c = input()) != 0; ) {
			/* same headroom rule as for names */
			if (bp-buf+2 > sz)
				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, 0))
					FATAL( "out of space for number %.10s...", buf );
			if (isdigit((unsigned char)c) || c == 'e' || c == 'E'
			  || c == '.' || c == '+' || c == '-')
				*bp++ = c;
			else {
				unput(c);
				break;
			}
		}
		*bp = 0;
		strtod(buf, &rem);	/* parse the number */
		if (rem == buf) {	/* it wasn't a valid number at all */
			/*
			 * Return the first character as its own token and push
			 * back only what follows it.  Pushing back the whole
			 * buffer would make the lexer see the same character
			 * again on every call.
			 */
			unputstr(rem+1);
			buf[1] = 0;
			retc = buf[0];	/* character is its own type */
		} else {	/* some prefix was a number */
			unputstr(rem);	/* put rest back for later */
			rem[0] = 0;	/* so truncate where failure started */
			retc = '0';	/* number */
		}
	}
	*pbuf = buf;
	*psz = sz;
	return retc;
}
167
/* Token scanners defined further down and used by yylex(). */
int word(char *w);
int string(void);
int regexpr(void);

/* One-shot flags tested and cleared at the top of yylex(). */
int sc = 0;	/* 1 => return a } right now */
int reg = 0;	/* 1 => return a REGEXPR now */
173
174int yylex(void)
175{
176 int c;
177 static char *buf = 0;
178 static int bufsize = 500;
179
180 if (buf == 0 && (buf = (char *) malloc(bufsize)) == NULL)
181 FATAL( "out of space in yylex" );
182 if (sc) {
183 sc = 0;
184 RET('}');
185 }
186 if (reg) {
187 reg = 0;
188 return regexpr();
189 }
190 for (;;) {
191 c = gettok(&buf, &bufsize);
192 if (c == 0)
193 return 0;
194 if (isalpha(c) || c == '_')
195 return word(buf);
196 if (isdigit(c)) {
197 yylval.cp = setsymtab(buf, tostring(buf), atof(buf), CON|NUM, symtab);
198 /* should this also have STR set? */
199 RET(NUMBER);
200 }
201
202 yylval.i = c;
203 switch (c) {
204 case '\n': /* {EOL} */
205 RET(NL);
206 case '\r': /* assume \n is coming */
207 case ' ': /* {WS}+ */
208 case '\t':
209 break;
210 case '#': /* #.* strip comments */
211 while ((c = input()) != '\n' && c != 0)
212 ;
213 unput(c);
214 break;
215 case ';':
216 RET(';');
217 case '\\':
218 if (peek() == '\n') {
219 input();
220 } else if (peek() == '\r') {
221 input(); input(); /* \n */
222 lineno++;
223 } else {
224 RET(c);
225 }
226 break;
227 case '&':
228 if (peek() == '&') {
229 input(); RET(AND);
230 } else
231 RET('&');
232 case '|':
233 if (peek() == '|') {
234 input(); RET(BOR);
235 } else
236 RET('|');
237 case '!':
238 if (peek() == '=') {
239 input(); yylval.i = NE; RET(NE);
240 } else if (peek() == '~') {
241 input(); yylval.i = NOTMATCH; RET(MATCHOP);
242 } else
243 RET(NOT);
244 case '~':
245 yylval.i = MATCH;
246 RET(MATCHOP);
247 case '<':
248 if (peek() == '=') {
249 input(); yylval.i = LE; RET(LE);
250 } else {
251 yylval.i = LT; RET(LT);
252 }
253 case '=':
254 if (peek() == '=') {
255 input(); yylval.i = EQ; RET(EQ);
256 } else {
257 yylval.i = ASSIGN; RET(ASGNOP);
258 }
259 case '>':
260 if (peek() == '=') {
261 input(); yylval.i = GE; RET(GE);
262 } else if (peek() == '>') {
263 input(); yylval.i = APPEND; RET(APPEND);
264 } else {
265 yylval.i = GT; RET(GT);
266 }
267 case '+':
268 if (peek() == '+') {
269 input(); yylval.i = INCR; RET(INCR);
270 } else if (peek() == '=') {
271 input(); yylval.i = ADDEQ; RET(ASGNOP);
272 } else
273 RET('+');
274 case '-':
275 if (peek() == '-') {
276 input(); yylval.i = DECR; RET(DECR);
277 } else if (peek() == '=') {
278 input(); yylval.i = SUBEQ; RET(ASGNOP);
279 } else
280 RET('-');
281 case '*':
282 if (peek() == '=') { /* *= */
283 input(); yylval.i = MULTEQ; RET(ASGNOP);
284 } else if (peek() == '*') { /* ** or **= */
285 input(); /* eat 2nd * */
286 if (peek() == '=') {
287 input(); yylval.i = POWEQ; RET(ASGNOP);
288 } else {
289 RET(POWER);
290 }
291 } else
292 RET('*');
293 case '/':
294 RET('/');
295 case '%':
296 if (peek() == '=') {
297 input(); yylval.i = MODEQ; RET(ASGNOP);
298 } else
299 RET('%');
300 case '^':
301 if (peek() == '=') {
302 input(); yylval.i = POWEQ; RET(ASGNOP);
303 } else
304 RET(POWER);
305
306 case '$':
307 /* BUG: awkward, if not wrong */
308 c = gettok(&buf, &bufsize);
309 if (isalpha(c)) {
310 if (strcmp(buf, "NF") == 0) { /* very special */
311 unputstr("(NF)");
312 RET(INDIRECT);
313 }
314 c = peek();
315 if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) {
316 unputstr(buf);
317 RET(INDIRECT);
318 }
319 yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab);
320 RET(IVAR);
321 } else if (c == 0) { /* */
322 SYNTAX( "unexpected end of input after $" );
323 RET(';');
324 } else {
325 unputstr(buf);
326 RET(INDIRECT);
327 }
328
329 case '}':
330 if (--bracecnt < 0)
331 SYNTAX( "extra }" );
332 sc = 1;
333 RET(';');
334 case ']':
335 if (--brackcnt < 0)
336 SYNTAX( "extra ]" );
337 RET(']');
338 case ')':
339 if (--parencnt < 0)
340 SYNTAX( "extra )" );
341 RET(')');
342 case '{':
343 bracecnt++;
344 RET('{');
345 case '[':
346 brackcnt++;
347 RET('[');
348 case '(':
349 parencnt++;
350 RET('(');
351
352 case '"':
353 return string(); /* BUG: should be like tran.c ? */
354
355 default:
356 RET(c);
357 }
358 }
359}
360
361int string(void)
362{
363 int c, n;
364 char *s, *bp;
365 static char *buf = 0;
366 static int bufsz = 500;
367
368 if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL)
369 FATAL("out of space for strings");
370 for (bp = buf; (c = input()) != '"'; ) {
371 if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, 0))
372 FATAL("out of space for string %.10s...", buf);
373 switch (c) {
374 case '\n':
375 case '\r':
376 case 0:
377 SYNTAX( "non-terminated string %.10s...", buf );
378 lineno++;
379 if (c == 0) /* hopeless */
380 FATAL( "giving up" );
381 break;
382 case '\\':
383 c = input();
384 switch (c) {
385 case '"': *bp++ = '"'; break;
386 case 'n': *bp++ = '\n'; break;
387 case 't': *bp++ = '\t'; break;
388 case 'f': *bp++ = '\f'; break;
389 case 'r': *bp++ = '\r'; break;
390 case 'b': *bp++ = '\b'; break;
391 case 'v': *bp++ = '\v'; break;
392 case 'a': *bp++ = '\007'; break;
393 case '\\': *bp++ = '\\'; break;
394
395 case '0': case '1': case '2': /* octal: \d \dd \ddd */
396 case '3': case '4': case '5': case '6': case '7':
397 n = c - '0';
398 if ((c = peek()) >= '0' && c < '8') {
399 n = 8 * n + input() - '0';
400 if ((c = peek()) >= '0' && c < '8')
401 n = 8 * n + input() - '0';
402 }
403 *bp++ = n;
404 break;
405
406 case 'x': /* hex \x0-9a-fA-F + */
407 { char xbuf[100], *px;
408 for (px = xbuf; (c = input()) != 0 && px-xbuf < 100-2; ) {
409 if (isdigit(c)
410 || (c >= 'a' && c <= 'f')
411 || (c >= 'A' && c <= 'F'))
412 *px++ = c;
413 else
414 break;
415 }
416 *px = 0;
417 unput(c);
418 sscanf(xbuf, "%x", &n);
419 *bp++ = n;
420 break;
421 }
422
423 default:
424 *bp++ = c;
425 break;
426 }
427 break;
428 default:
429 *bp++ = c;
430 break;
431 }
432 }
433 *bp = 0;
434 s = tostring(buf);
435 *bp++ = ' '; *bp++ = 0;
436 yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
437 RET(STRING);
438}
439
440
441int binsearch(char *w, Keyword *kp, int n)
442{
443 int cond, low, mid, high;
444
445 low = 0;
446 high = n - 1;
447 while (low <= high) {
448 mid = (low + high) / 2;
449 if ((cond = strcmp(w, kp[mid].word)) < 0)
450 high = mid - 1;
451 else if (cond > 0)
452 low = mid + 1;
453 else
454 return mid;
455 }
456 return -1;
457}
458
459int word(char *w)
460{
461 Keyword *kp;
462 int c, n;
463
464 n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0]));
465 kp = keywords + n;
466 if (n != -1) { /* found in table */
467 yylval.i = kp->sub;
468 switch (kp->type) { /* special handling */
469 case FSYSTEM:
470 if (safe)
471 SYNTAX( "system is unsafe" );
472 RET(kp->type);
473 case FUNC:
474 if (infunc)
475 SYNTAX( "illegal nested function" );
476 RET(kp->type);
477 case RETURN:
478 if (!infunc)
479 SYNTAX( "return not in function" );
480 RET(kp->type);
481 case VARNF:
482 yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
483 RET(VARNF);
484 default:
485 RET(kp->type);
486 }
487 }
488 c = peek(); /* look for '(' */
489 if (c != '(' && infunc && (n=isarg(w)) >= 0) {
490 yylval.i = n;
491 RET(ARG);
492 } else {
493 yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
494 if (c == '(') {
495 RET(CALL);
496 } else {
497 RET(VAR);
498 }
499 }
500}
501
502void startreg(void) /* next call to yylex will return a regular expression */
503{
504 reg = 1;
505}
506
507int regexpr(void)
508{
509 int c;
510 static char *buf = 0;
511 static int bufsz = 500;
512 char *bp;
513
514 if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL)
515 FATAL("out of space for rex expr");
516 bp = buf;
517 for ( ; (c = input()) != '/' && c != 0; ) {
518 if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, 0))
519 FATAL("out of space for reg expr %.10s...", buf);
520 if (c == '\n') {
521 SYNTAX( "newline in regular expression %.10s...", buf );
522 unput('\n');
523 break;
524 } else if (c == '\\') {
525 *bp++ = '\\';
526 *bp++ = input();
527 } else {
528 *bp++ = c;
529 }
530 }
531 *bp = 0;
532 if (c == 0)
533 SYNTAX("non-terminated regular expression %.10s...", buf);
534 yylval.s = tostring(buf);
535 unput('/');
536 RET(REGEXPR);
537}
538
539/* low-level lexical stuff, sort of inherited from lex */
540
char	ebuf[300];		/* circular record of characters handed out by input() */
char	*ep = ebuf;		/* next free slot in ebuf */
char	yysbuf[100];		/* pushback buffer */
char	*yysptr = yysbuf;	/* one past the last pushed-back character */
FILE	*yyin = NULL;
546
547int input(void) /* get next lexical input character */
548{
549 int c;
550 extern char *lexprog;
551
552 if (yysptr > yysbuf)
553 c = (uschar)*--yysptr;
554 else if (lexprog != NULL) { /* awk '...' */
555 if ((c = (uschar)*lexprog) != 0)
556 lexprog++;
557 } else /* awk -f ... */
558 c = pgetc();
559 if (c == '\n')
560 lineno++;
561 else if (c == EOF)
562 c = 0;
563 if (ep >= ebuf + sizeof ebuf)
564 ep = ebuf;
565 return *ep++ = c;
566}
567
568void unput(int c) /* put lexical character back on input */
569{
570 if (c == '\n')
571 lineno--;
572 if (yysptr >= yysbuf + sizeof(yysbuf))
573 FATAL("pushed back too much: %.20s...", yysbuf);
574 *yysptr++ = c;
575 if (--ep < ebuf)
576 ep = ebuf + sizeof(ebuf) - 1;
577}
578
/*
 * unputstr: push the whole string s back on the input.  Characters are
 * pushed last-to-first so that input() returns them in their original
 * order.
 */
void unputstr(const char *s)	/* put a string back on input */
{
	const char *p = s + strlen(s);

	while (p > s)
		unput(*--p);
}
585}