Merge branch 'vendor/GCC44'
[dragonfly.git] / contrib / gcc-4.4 / libcpp / lex.c
CommitLineData
c251ad9e
SS
1/* CPP Library - lexical analysis.
2 Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2007, 2008, 2009
3 Free Software Foundation, Inc.
4 Contributed by Per Bothner, 1994-95.
5 Based on CCCP program by Paul Rubin, June 1986
6 Adapted to ANSI C, Richard Stallman, Jan 1987
7 Broken out to separate file, Zack Weinberg, Mar 2000
8
9This program is free software; you can redistribute it and/or modify it
10under the terms of the GNU General Public License as published by the
11Free Software Foundation; either version 3, or (at your option) any
12later version.
13
14This program is distributed in the hope that it will be useful,
15but WITHOUT ANY WARRANTY; without even the implied warranty of
16MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17GNU General Public License for more details.
18
19You should have received a copy of the GNU General Public License
20along with this program; see the file COPYING3. If not see
21<http://www.gnu.org/licenses/>. */
22
23#include "config.h"
24#include "system.h"
25#include "cpplib.h"
26#include "internal.h"
27
/* Spelling category recorded for each token type in the
   token_spellings table below.  */
28enum spell_type
29{
30 SPELL_OPERATOR = 0,
31 SPELL_IDENT,
32 SPELL_LITERAL,
33 SPELL_NONE
34};
35
/* One entry of token_spellings: the spelling category of a token
   type together with a name string for it.  */
36struct token_spelling
37{
38 enum spell_type category;
39 const unsigned char *name;
40};
41
/* Spellings of the six digraphs.  */
42static const unsigned char *const digraph_spellings[] =
43{ UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };
44
/* Build token_spellings by expanding TTYPE_TABLE (defined outside this
   file section) with OP and TK temporarily defined: each OP (e, s)
   entry becomes { SPELL_OPERATOR, s } and each TK (e, s) entry becomes
   { SPELL_<s>, "<e>" }, i.e. the stringified enumerator name.  */
45#define OP(e, s) { SPELL_OPERATOR, UC s },
46#define TK(e, s) { SPELL_ ## s, UC #e },
47static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
48#undef OP
49#undef TK
50
/* Accessors for the token_spellings entry selected by TOKEN's type.  */
51#define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
52#define TOKEN_NAME(token) (token_spellings[(token)->type].name)
53
/* Forward declarations of file-local helpers, so that they can be
   called before the point at which they are defined.  */
54static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
55static int skip_line_comment (cpp_reader *);
56static void skip_whitespace (cpp_reader *, cppchar_t);
57static void lex_string (cpp_reader *, cpp_token *, const uchar *);
58static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
59static void store_comment (cpp_reader *, cpp_token *);
60static void create_literal (cpp_reader *, cpp_token *, const uchar *,
61 unsigned int, enum cpp_ttype);
62static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
63static int name_p (cpp_reader *, const cpp_string *);
64static tokenrun *next_tokenrun (tokenrun *);
65
/* NOTE(review): new_buff is not defined in the part of the file seen
   here; presumably it allocates a _cpp_buff of at least the given
   size -- confirm against its definition.  */
66static _cpp_buff *new_buff (size_t);
67
68
69/* Utility routine:
70
71 Compares, the token TOKEN to the NUL-terminated string STRING.
72 TOKEN must be a CPP_NAME. Returns 1 for equal, 0 for unequal. */
73int
74cpp_ideq (const cpp_token *token, const char *string)
75{
76 if (token->type != CPP_NAME)
77 return 0;
78
79 return !ustrcmp (NODE_NAME (token->val.node), (const uchar *) string);
80}
81
82/* Record a note TYPE at byte POS into the current cleaned logical
83 line. */
84static void
85add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
86{
87 if (buffer->notes_used == buffer->notes_cap)
88 {
89 buffer->notes_cap = buffer->notes_cap * 2 + 200;
90 buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
91 buffer->notes_cap);
92 }
93
94 buffer->notes[buffer->notes_used].pos = pos;
95 buffer->notes[buffer->notes_used].type = type;
96 buffer->notes_used++;
97}
98
99/* Returns with a logical line that contains no escaped newlines or
100 trigraphs. This is a time-critical inner loop.

   The line starting at buffer->next_line of PFILE's current buffer is
   cleaned in place.  On entry the buffer's note list is reset and
   buffer->cur / buffer->line_base are pointed at the start of the
   line.  While scanning, a line note is recorded for every escaped
   newline (type '\\', or ' ' when white space separates the backslash
   from the newline) and for every trigraph (type is the trigraph's
   third character).  On exit the cleaned line is terminated with
   '\n', a sentinel note of type '\n' has been added just past that
   terminator, and buffer->next_line points just past the consumed
   input.  Buffers flagged from_stage3 get no escaped-newline or
   trigraph processing; only the end of the line is located.  */
101void
102_cpp_clean_line (cpp_reader *pfile)
103{
104 cpp_buffer *buffer;
105 const uchar *s;
106 uchar c, *d, *p;
107
108 buffer = pfile->buffer;
109 buffer->cur_note = buffer->notes_used = 0;
110 buffer->cur = buffer->line_base = buffer->next_line;
111 buffer->need_line = false;
 /* Start one byte early: both scanning loops pre-increment S before
    reading through it.  */
112 s = buffer->next_line - 1;
113
114 if (!buffer->from_stage3)
115 {
 /* Most recent backslash seen on this line by the fast loop, if any.  */
116 const uchar *pbackslash = NULL;
117
118 /* Short circuit for the common case of an un-escaped line with
119 no trigraphs. The primary win here is by not writing any
120 data back to memory until we have to. */
121 for (;;)
122 {
123 c = *++s;
124 if (__builtin_expect (c == '\n', false)
125 || __builtin_expect (c == '\r', false))
126 {
 /* D is where the terminating '\n' gets stored at DONE.  */
127 d = (uchar *) s;
128
129 if (__builtin_expect (s == buffer->rlimit, false))
130 goto done;
131
132 /* DOS line ending? */
133 if (__builtin_expect (c == '\r', false)
134 && s[1] == '\n')
135 {
136 s++;
137 if (s == buffer->rlimit)
138 goto done;
139 }
140
 /* No backslash on this line, so the newline cannot be escaped.  */
141 if (__builtin_expect (pbackslash == NULL, true))
142 goto done;
143
144 /* Check for escaped newline. */
 /* Step P back over horizontal space preceding the newline; the
    newline is escaped only if the character before P is the last
    backslash seen.  */
145 p = d;
146 while (is_nvspace (p[-1]))
147 p--;
148 if (p - 1 != pbackslash)
149 goto done;
150
151 /* Have an escaped newline; process it and proceed to
152 the slow path. */
 /* The note sits at the backslash.  D is backed up to just before the
    backslash so that the slow loop's pre-incremented store overwrites
    it with the first character of the continuation.  */
153 add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
154 d = p - 2;
155 buffer->next_line = p - 1;
156 break;
157 }
158 if (__builtin_expect (c == '\\', false))
159 pbackslash = s;
160 else if (__builtin_expect (c == '?', false)
161 && __builtin_expect (s[1] == '?', false)
162 && _cpp_trigraph_map[s[2]])
163 {
164 /* Have a trigraph. We may or may not have to convert
165 it. Add a line note regardless, for -Wtrigraphs. */
166 add_line_note (buffer, s, s[2]);
167 if (CPP_OPTION (pfile, trigraphs))
168 {
169 /* We do, and that means we have to switch to the
170 slow path. */
 /* Replace the first '?' with the mapped character and skip the
    remaining two bytes of the trigraph.  */
171 d = (uchar *) s;
172 *d = _cpp_trigraph_map[s[2]];
173 s += 2;
174 break;
175 }
176 }
177 }
178
179
 /* Slow path: S reads the raw input while D writes the cleaned text
    back into the same buffer, at or behind S.  */
180 for (;;)
181 {
182 c = *++s;
183 *++d = c;
184
185 if (c == '\n' || c == '\r')
186 {
187 /* Handle DOS line endings. */
188 if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
189 s++;
190 if (s == buffer->rlimit)
191 break;
192
193 /* Escaped? */
 /* buffer->next_line bounds the backward scan: it is the start of the
    line, or the position of the previous escaped newline once one has
    been processed.  */
194 p = d;
195 while (p != buffer->next_line && is_nvspace (p[-1]))
196 p--;
197 if (p == buffer->next_line || p[-1] != '\\')
198 break;
199
200 add_line_note (buffer, p - 1, p != d ? ' ': '\\');
201 d = p - 2;
202 buffer->next_line = p - 1;
203 }
204 else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
205 {
206 /* Add a note regardless, for the benefit of -Wtrigraphs. */
207 add_line_note (buffer, d, s[2]);
208 if (CPP_OPTION (pfile, trigraphs))
209 {
210 *d = _cpp_trigraph_map[s[2]];
211 s += 2;
212 }
213 }
214 }
215 }
216 else
217 {
 /* from_stage3 buffer: only find the end of the line.  */
218 do
219 s++;
220 while (*s != '\n' && *s != '\r');
221 d = (uchar *) s;
222
223 /* Handle DOS line endings. */
224 if (*s == '\r' && s != buffer->rlimit && s[1] == '\n')
225 s++;
226 }
227
228 done:
 /* Terminate the cleaned line; this also turns a '\r' terminator into
    '\n'.  */
229 *d = '\n';
230 /* A sentinel note that should never be processed. */
231 add_line_note (buffer, d + 1, '\n');
232 buffer->next_line = s + 1;
233}
234
235/* Return true if the trigraph indicated by NOTE should be warned
236 about in a comment. */
237static bool
238warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
239{
240 const uchar *p;
241
242 /* Within comments we don't warn about trigraphs, unless the
243 trigraph forms an escaped newline, as that may change
244 behavior. */
245 if (note->type != '/')
246 return false;
247
248 /* If -trigraphs, then this was an escaped newline iff the next note
249 is coincident. */
250 if (CPP_OPTION (pfile, trigraphs))
251 return note[1].pos == note->pos;
252
253 /* Otherwise, see if this forms an escaped newline. */
254 p = note->pos + 3;
255 while (is_nvspace (*p))
256 p++;
257
258 /* There might have been escaped newlines between the trigraph and the
259 newline we found. Hence the position test. */
260 return (*p == '\n' && p < note[1].pos);
261}
262
263/* Process the notes created by add_line_note as far as the current
264 location. */
265void
266_cpp_process_line_notes (cpp_reader *pfile, int in_comment)
267{
268 cpp_buffer *buffer = pfile->buffer;
269
270 for (;;)
271 {
272 _cpp_line_note *note = &buffer->notes[buffer->cur_note];
273 unsigned int col;
274
275 if (note->pos > buffer->cur)
276 break;
277
278 buffer->cur_note++;
279 col = CPP_BUF_COLUMN (buffer, note->pos + 1);
280
281 if (note->type == '\\' || note->type == ' ')
282 {
283 if (note->type == ' ' && !in_comment)
284 cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
285 "backslash and newline separated by space");
286
287 if (buffer->next_line > buffer->rlimit)
288 {
289 cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
290 "backslash-newline at end of file");
291 /* Prevent "no newline at end of file" warning. */
292 buffer->next_line = buffer->rlimit;
293 }
294
295 buffer->line_base = note->pos;
296 CPP_INCREMENT_LINE (pfile, 0);
297 }
298 else if (_cpp_trigraph_map[note->type])
299 {
300 if (CPP_OPTION (pfile, warn_trigraphs)
301 && (!in_comment || warn_in_comment (pfile, note)))
302 {
303 if (CPP_OPTION (pfile, trigraphs))
304 cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
305 "trigraph ??%c converted to %c",
306 note->type,
307 (int) _cpp_trigraph_map[note->type]);
308 else
309 {
310 cpp_error_with_line
311 (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
312 "trigraph ??%c ignored, use -trigraphs to enable",
313 note->type);
314 }
315 }
316 }
317 else
318 abort ();
319 }
320}
321
322/* Skip a C-style block comment. We find the end of the comment by
323 seeing if an asterisk is before every '/' we encounter. Returns
324 nonzero if comment terminated by EOF, zero otherwise.
325
326 Buffer->cur points to the initial asterisk of the comment. */
327bool
328_cpp_skip_block_comment (cpp_reader *pfile)
329{
330 cpp_buffer *buffer = pfile->buffer;
331 const uchar *cur = buffer->cur;
332 uchar c;
333
334 cur++;
335 if (*cur == '/')
336 cur++;
337
338 for (;;)
339 {
340 /* People like decorating comments with '*', so check for '/'
341 instead for efficiency. */
342 c = *cur++;
343
344 if (c == '/')
345 {
346 if (cur[-2] == '*')
347 break;
348
349 /* Warn about potential nested comments, but not if the '/'
350 comes immediately before the true comment delimiter.
351 Don't bother to get it right across escaped newlines. */
352 if (CPP_OPTION (pfile, warn_comments)
353 && cur[0] == '*' && cur[1] != '/')
354 {
355 buffer->cur = cur;
356 cpp_error_with_line (pfile, CPP_DL_WARNING,
357 pfile->line_table->highest_line, CPP_BUF_COL (buffer),
358 "\"/*\" within comment");
359 }
360 }
361 else if (c == '\n')
362 {
363 unsigned int cols;
364 buffer->cur = cur - 1;
365 _cpp_process_line_notes (pfile, true);
366 if (buffer->next_line >= buffer->rlimit)
367 return true;
368 _cpp_clean_line (pfile);
369
370 cols = buffer->next_line - buffer->line_base;
371 CPP_INCREMENT_LINE (pfile, cols);
372
373 cur = buffer->cur;
374 }
375 }
376
377 buffer->cur = cur;
378 _cpp_process_line_notes (pfile, true);
379 return false;
380}
381
382/* Skip a C++ line comment, leaving buffer->cur pointing to the
383 terminating newline. Handles escaped newlines. Returns nonzero
384 if a multiline comment. */
385static int
386skip_line_comment (cpp_reader *pfile)
387{
388 cpp_buffer *buffer = pfile->buffer;
389 source_location orig_line = pfile->line_table->highest_line;
390
391 while (*buffer->cur != '\n')
392 buffer->cur++;
393
394 _cpp_process_line_notes (pfile, true);
395 return orig_line != pfile->line_table->highest_line;
396}
397
398/* Skips whitespace, saving the next non-whitespace character. */
399static void
400skip_whitespace (cpp_reader *pfile, cppchar_t c)
401{
402 cpp_buffer *buffer = pfile->buffer;
403 bool saw_NUL = false;
404
405 do
406 {
407 /* Horizontal space always OK. */
408 if (c == ' ' || c == '\t')
409 ;
410 /* Just \f \v or \0 left. */
411 else if (c == '\0')
412 saw_NUL = true;
413 else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
414 cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
415 CPP_BUF_COL (buffer),
416 "%s in preprocessing directive",
417 c == '\f' ? "form feed" : "vertical tab");
418
419 c = *buffer->cur++;
420 }
421 /* We only want non-vertical space, i.e. ' ' \t \f \v \0. */
422 while (is_nvspace (c));
423
424 if (saw_NUL)
425 cpp_error (pfile, CPP_DL_WARNING, "null character(s) ignored");
426
427 buffer->cur--;
428}
429
430/* See if the characters of a number token are valid in a name (no
431 '.', '+' or '-'). */
432static int
433name_p (cpp_reader *pfile, const cpp_string *string)
434{
435 unsigned int i;
436
437 for (i = 0; i < string->len; i++)
438 if (!is_idchar (string->text[i]))
439 return 0;
440
441 return 1;
442}
443
444/* After parsing an identifier or other sequence, produce a warning about
445 sequences not in NFC/NFKC. */
446static void
447warn_about_normalization (cpp_reader *pfile,
448 const cpp_token *token,
449 const struct normalize_state *s)
450{
451 if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
452 && !pfile->state.skipping)
453 {
454 /* Make sure that the token is printed using UCNs, even
455 if we'd otherwise happily print UTF-8. */
456 unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
457 size_t sz;
458
459 sz = cpp_spell_token (pfile, token, buf, false) - buf;
460 if (NORMALIZE_STATE_RESULT (s) == normalized_C)
461 cpp_error_with_line (pfile, CPP_DL_WARNING, token->src_loc, 0,
462 "`%.*s' is not in NFKC", (int) sz, buf);
463 else
464 cpp_error_with_line (pfile, CPP_DL_WARNING, token->src_loc, 0,
465 "`%.*s' is not in NFC", (int) sz, buf);
466 }
467}
468
469/* Returns TRUE if the sequence starting at buffer->cur is invalid in
470 an identifier. FIRST is TRUE if this starts an identifier. */
471static bool
472forms_identifier_p (cpp_reader *pfile, int first,
473 struct normalize_state *state)
474{
475 cpp_buffer *buffer = pfile->buffer;
476
477 if (*buffer->cur == '$')
478 {
479 if (!CPP_OPTION (pfile, dollars_in_ident))
480 return false;
481
482 buffer->cur++;
483 if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
484 {
485 CPP_OPTION (pfile, warn_dollars) = 0;
486 cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
487 }
488
489 return true;
490 }
491
492 /* Is this a syntactically valid UCN? */
493 if (CPP_OPTION (pfile, extended_identifiers)
494 && *buffer->cur == '\\'
495 && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
496 {
497 buffer->cur += 2;
498 if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
499 state))
500 return true;
501 buffer->cur -= 2;
502 }
503
504 return false;
505}
506
4b1e227d
SW
507/* Helper function to get the cpp_hashnode of the identifier BASE. */
508static cpp_hashnode *
509lex_identifier_intern (cpp_reader *pfile, const uchar *base)
510{
511 cpp_hashnode *result;
512 const uchar *cur;
513 unsigned int len;
514 unsigned int hash = HT_HASHSTEP (0, *base);
515
516 cur = base + 1;
517 while (ISIDNUM (*cur))
518 {
519 hash = HT_HASHSTEP (hash, *cur);
520 cur++;
521 }
522 len = cur - base;
523 hash = HT_HASHFINISH (hash, len);
524 result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
525 base, len, hash, HT_ALLOC));
526
527 /* Rarely, identifiers require diagnostics when lexed. */
528 if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
529 && !pfile->state.skipping, 0))
530 {
531 /* It is allowed to poison the same identifier twice. */
532 if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
533 cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
534 NODE_NAME (result));
535
536 /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
537 replacement list of a variadic macro. */
538 if (result == pfile->spec_nodes.n__VA_ARGS__
539 && !pfile->state.va_args_ok)
540 cpp_error (pfile, CPP_DL_PEDWARN,
541 "__VA_ARGS__ can only appear in the expansion"
542 " of a C99 variadic macro");
543 }
544
545 return result;
546}
547
548/* Get the cpp_hashnode of an identifier specified by NAME in
549 the current cpp_reader object. If none is found, NULL is returned. */
550cpp_hashnode *
551_cpp_lex_identifier (cpp_reader *pfile, const char *name)
552{
553 cpp_hashnode *result;
554 result = lex_identifier_intern (pfile, (uchar *) name);
555 return result;
556}
557
c251ad9e
SS
558/* Lex an identifier starting at BUFFER->CUR - 1. */
559static cpp_hashnode *
560lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
561 struct normalize_state *nst)
562{
563 cpp_hashnode *result;
564 const uchar *cur;
565 unsigned int len;
566 unsigned int hash = HT_HASHSTEP (0, *base);
567
568 cur = pfile->buffer->cur;
569 if (! starts_ucn)
570 while (ISIDNUM (*cur))
571 {
572 hash = HT_HASHSTEP (hash, *cur);
573 cur++;
574 }
575 pfile->buffer->cur = cur;
576 if (starts_ucn || forms_identifier_p (pfile, false, nst))
577 {
578 /* Slower version for identifiers containing UCNs (or $). */
579 do {
580 while (ISIDNUM (*pfile->buffer->cur))
581 {
582 pfile->buffer->cur++;
583 NORMALIZE_STATE_UPDATE_IDNUM (nst);
584 }
585 } while (forms_identifier_p (pfile, false, nst));
586 result = _cpp_interpret_identifier (pfile, base,
587 pfile->buffer->cur - base);
588 }
589 else
590 {
591 len = cur - base;
592 hash = HT_HASHFINISH (hash, len);
593
594 result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
595 base, len, hash, HT_ALLOC));
596 }
597
598 /* Rarely, identifiers require diagnostics when lexed. */
599 if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
600 && !pfile->state.skipping, 0))
601 {
602 /* It is allowed to poison the same identifier twice. */
603 if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
604 cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
605 NODE_NAME (result));
606
607 /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
608 replacement list of a variadic macro. */
609 if (result == pfile->spec_nodes.n__VA_ARGS__
610 && !pfile->state.va_args_ok)
611 cpp_error (pfile, CPP_DL_PEDWARN,
612 "__VA_ARGS__ can only appear in the expansion"
613 " of a C99 variadic macro");
614 }
615
616 return result;
617}
618
619/* Lex a number to NUMBER starting at BUFFER->CUR - 1. */
620static void
621lex_number (cpp_reader *pfile, cpp_string *number,
622 struct normalize_state *nst)
623{
624 const uchar *cur;
625 const uchar *base;
626 uchar *dest;
627
628 base = pfile->buffer->cur - 1;
629 do
630 {
631 cur = pfile->buffer->cur;
632
633 /* N.B. ISIDNUM does not include $. */
634 while (ISIDNUM (*cur) || *cur == '.' || VALID_SIGN (*cur, cur[-1]))
635 {
636 cur++;
637 NORMALIZE_STATE_UPDATE_IDNUM (nst);
638 }
639
640 pfile->buffer->cur = cur;
641 }
642 while (forms_identifier_p (pfile, false, nst));
643
644 number->len = cur - base;
645 dest = _cpp_unaligned_alloc (pfile, number->len + 1);
646 memcpy (dest, base, number->len);
647 dest[number->len] = '\0';
648 number->text = dest;
649}
650
651/* Create a token of type TYPE with a literal spelling. */
652static void
653create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
654 unsigned int len, enum cpp_ttype type)
655{
656 uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
657
658 memcpy (dest, base, len);
659 dest[len] = '\0';
660 token->type = type;
661 token->val.str.len = len;
662 token->val.str.text = dest;
663}
664
665/* Lexes a string, character constant, or angle-bracketed header file
666 name. The stored string contains the spelling, including opening
667 quote and leading any leading 'L', 'u' or 'U'. It returns the type
668 of the literal, or CPP_OTHER if it was not properly terminated, or
669 CPP_LESS for an unterminated header name which must be relexed as
670 normal tokens.
671
672 The spelling is NUL-terminated, but it is not guaranteed that this
673 is the first NUL since embedded NULs are preserved. */
674static void
675lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
676{
677 bool saw_NUL = false;
678 const uchar *cur;
679 cppchar_t terminator;
680 enum cpp_ttype type;
681
682 cur = base;
683 terminator = *cur++;
684 if (terminator == 'L' || terminator == 'u' || terminator == 'U')
685 terminator = *cur++;
686 if (terminator == '\"')
687 type = (*base == 'L' ? CPP_WSTRING :
688 *base == 'U' ? CPP_STRING32 :
689 *base == 'u' ? CPP_STRING16 : CPP_STRING);
690 else if (terminator == '\'')
691 type = (*base == 'L' ? CPP_WCHAR :
692 *base == 'U' ? CPP_CHAR32 :
693 *base == 'u' ? CPP_CHAR16 : CPP_CHAR);
694 else
695 terminator = '>', type = CPP_HEADER_NAME;
696
697 for (;;)
698 {
699 cppchar_t c = *cur++;
700
701 /* In #include-style directives, terminators are not escapable. */
702 if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
703 cur++;
704 else if (c == terminator)
705 break;
706 else if (c == '\n')
707 {
708 cur--;
709 /* Unmatched quotes always yield undefined behavior, but
710 greedy lexing means that what appears to be an unterminated
711 header name may actually be a legitimate sequence of tokens. */
712 if (terminator == '>')
713 {
714 token->type = CPP_LESS;
715 return;
716 }
717 type = CPP_OTHER;
718 break;
719 }
720 else if (c == '\0')
721 saw_NUL = true;
722 }
723
724 if (saw_NUL && !pfile->state.skipping)
725 cpp_error (pfile, CPP_DL_WARNING,
726 "null character(s) preserved in literal");
727
728 if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
729 cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
730 (int) terminator);
731
732 pfile->buffer->cur = cur;
733 create_literal (pfile, token, base, cur - base, type);
734}
735
736/* Return the comment table. The client may not make any assumption
737 about the ordering of the table. */
738cpp_comment_table *
739cpp_get_comments (cpp_reader *pfile)
740{
741 return &pfile->comments;
742}
743
744/* Append a comment to the end of the comment table. */
745static void
746store_comment (cpp_reader *pfile, cpp_token *token)
747{
748 int len;
749
750 if (pfile->comments.allocated == 0)
751 {
752 pfile->comments.allocated = 256;
753 pfile->comments.entries = (cpp_comment *) xmalloc
754 (pfile->comments.allocated * sizeof (cpp_comment));
755 }
756
757 if (pfile->comments.count == pfile->comments.allocated)
758 {
759 pfile->comments.allocated *= 2;
760 pfile->comments.entries = (cpp_comment *) xrealloc
761 (pfile->comments.entries,
762 pfile->comments.allocated * sizeof (cpp_comment));
763 }
764
765 len = token->val.str.len;
766
767 /* Copy comment. Note, token may not be NULL terminated. */
768 pfile->comments.entries[pfile->comments.count].comment =
769 (char *) xmalloc (sizeof (char) * (len + 1));
770 memcpy (pfile->comments.entries[pfile->comments.count].comment,
771 token->val.str.text, len);
772 pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
773
774 /* Set source location. */
775 pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
776
777 /* Increment the count of entries in the comment table. */
778 pfile->comments.count++;
779}
780
781/* The stored comment includes the comment start and any terminator. */
782static void
783save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
784 cppchar_t type)
785{
786 unsigned char *buffer;
787 unsigned int len, clen;
788
789 len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'. */
790
791 /* C++ comments probably (not definitely) have moved past a new
792 line, which we don't want to save in the comment. */
793 if (is_vspace (pfile->buffer->cur[-1]))
794 len--;
795
796 /* If we are currently in a directive, then we need to store all
797 C++ comments as C comments internally, and so we need to
798 allocate a little extra space in that case.
799
800 Note that the only time we encounter a directive here is
801 when we are saving comments in a "#define". */
802 clen = (pfile->state.in_directive && type == '/') ? len + 2 : len;
803
804 buffer = _cpp_unaligned_alloc (pfile, clen);
805
806 token->type = CPP_COMMENT;
807 token->val.str.len = clen;
808 token->val.str.text = buffer;
809
810 buffer[0] = '/';
811 memcpy (buffer + 1, from, len - 1);
812
813 /* Finish conversion to a C comment, if necessary. */
814 if (pfile->state.in_directive && type == '/')
815 {
816 buffer[1] = '*';
817 buffer[clen - 2] = '*';
818 buffer[clen - 1] = '/';
819 }
820
821 /* Finally store this comment for use by clients of libcpp. */
822 store_comment (pfile, token);
823}
824
825/* Allocate COUNT tokens for RUN. */
826void
827_cpp_init_tokenrun (tokenrun *run, unsigned int count)
828{
829 run->base = XNEWVEC (cpp_token, count);
830 run->limit = run->base + count;
831 run->next = NULL;
832}
833
834/* Returns the next tokenrun, or creates one if there is none. */
835static tokenrun *
836next_tokenrun (tokenrun *run)
837{
838 if (run->next == NULL)
839 {
840 run->next = XNEW (tokenrun);
841 run->next->prev = run;
842 _cpp_init_tokenrun (run->next, 250);
843 }
844
845 return run->next;
846}
847
848/* Look ahead in the input stream. */
849const cpp_token *
850cpp_peek_token (cpp_reader *pfile, int index)
851{
852 cpp_context *context = pfile->context;
853 const cpp_token *peektok;
854 int count;
855
856 /* First, scan through any pending cpp_context objects. */
857 while (context->prev)
858 {
859 ptrdiff_t sz = (context->direct_p
860 ? LAST (context).token - FIRST (context).token
861 : LAST (context).ptoken - FIRST (context).ptoken);
862
863 if (index < (int) sz)
864 return (context->direct_p
865 ? FIRST (context).token + index
866 : *(FIRST (context).ptoken + index));
867
868 index -= (int) sz;
869 context = context->prev;
870 }
871
872 /* We will have to read some new tokens after all (and do so
873 without invalidating preceding tokens). */
874 count = index;
875 pfile->keep_tokens++;
876
877 do
878 {
879 peektok = _cpp_lex_token (pfile);
880 if (peektok->type == CPP_EOF)
881 return peektok;
882 }
883 while (index--);
884
885 _cpp_backup_tokens_direct (pfile, count + 1);
886 pfile->keep_tokens--;
887
888 return peektok;
889}
890
891/* Allocate a single token that is invalidated at the same time as the
892 rest of the tokens on the line. Has its line and col set to the
893 same as the last lexed token, so that diagnostics appear in the
894 right place. */
895cpp_token *
896_cpp_temp_token (cpp_reader *pfile)
897{
898 cpp_token *old, *result;
899 ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
900 ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
901
902 old = pfile->cur_token - 1;
903 /* Any pre-existing lookaheads must not be clobbered. */
904 if (la)
905 {
906 if (sz <= la)
907 {
908 tokenrun *next = next_tokenrun (pfile->cur_run);
909
910 if (sz < la)
911 memmove (next->base + 1, next->base,
912 (la - sz) * sizeof (cpp_token));
913
914 next->base[0] = pfile->cur_run->limit[-1];
915 }
916
917 if (sz > 1)
918 memmove (pfile->cur_token + 1, pfile->cur_token,
919 MIN (la, sz - 1) * sizeof (cpp_token));
920 }
921
922 if (!sz && pfile->cur_token == pfile->cur_run->limit)
923 {
924 pfile->cur_run = next_tokenrun (pfile->cur_run);
925 pfile->cur_token = pfile->cur_run->base;
926 }
927
928 result = pfile->cur_token++;
929 result->src_loc = old->src_loc;
930 return result;
931}
932
933/* Lex a token into RESULT (external interface). Takes care of issues
934 like directive handling, token lookahead, multiple include
935 optimization and skipping. */
936const cpp_token *
937_cpp_lex_token (cpp_reader *pfile)
938{
939 cpp_token *result;
940
941 for (;;)
942 {
943 if (pfile->cur_token == pfile->cur_run->limit)
944 {
945 pfile->cur_run = next_tokenrun (pfile->cur_run);
946 pfile->cur_token = pfile->cur_run->base;
947 }
948 /* We assume that the current token is somewhere in the current
949 run. */
950 if (pfile->cur_token < pfile->cur_run->base
951 || pfile->cur_token >= pfile->cur_run->limit)
952 abort ();
953
954 if (pfile->lookaheads)
955 {
956 pfile->lookaheads--;
957 result = pfile->cur_token++;
958 }
959 else
960 result = _cpp_lex_direct (pfile);
961
962 if (result->flags & BOL)
963 {
964 /* Is this a directive. If _cpp_handle_directive returns
965 false, it is an assembler #. */
966 if (result->type == CPP_HASH
967 /* 6.10.3 p 11: Directives in a list of macro arguments
968 gives undefined behavior. This implementation
969 handles the directive as normal. */
970 && pfile->state.parsing_args != 1)
971 {
972 if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
973 {
974 if (pfile->directive_result.type == CPP_PADDING)
975 continue;
976 result = &pfile->directive_result;
977 }
978 }
979 else if (pfile->state.in_deferred_pragma)
980 result = &pfile->directive_result;
981
982 if (pfile->cb.line_change && !pfile->state.skipping)
983 pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
984 }
985
986 /* We don't skip tokens in directives. */
987 if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
988 break;
989
990 /* Outside a directive, invalidate controlling macros. At file
991 EOF, _cpp_lex_direct takes care of popping the buffer, so we never
992 get here and MI optimization works. */
993 pfile->mi_valid = false;
994
995 if (!pfile->state.skipping || result->type == CPP_EOF)
996 break;
997 }
998
999 return result;
1000}
1001
1002/* Returns true if a fresh line has been loaded. */
1003bool
1004_cpp_get_fresh_line (cpp_reader *pfile)
1005{
1006 int return_at_eof;
1007
1008 /* We can't get a new line until we leave the current directive. */
1009 if (pfile->state.in_directive)
1010 return false;
1011
1012 for (;;)
1013 {
1014 cpp_buffer *buffer = pfile->buffer;
1015
1016 if (!buffer->need_line)
1017 return true;
1018
1019 if (buffer->next_line < buffer->rlimit)
1020 {
1021 _cpp_clean_line (pfile);
1022 return true;
1023 }
1024
1025 /* First, get out of parsing arguments state. */
1026 if (pfile->state.parsing_args)
1027 return false;
1028
1029 /* End of buffer. Non-empty files should end in a newline. */
1030 if (buffer->buf != buffer->rlimit
1031 && buffer->next_line > buffer->rlimit
1032 && !buffer->from_stage3)
1033 {
1034 /* Clip to buffer size. */
1035 buffer->next_line = buffer->rlimit;
1036 }
1037
1038 return_at_eof = buffer->return_at_eof;
1039 _cpp_pop_buffer (pfile);
1040 if (pfile->buffer == NULL || return_at_eof)
1041 return false;
1042 }
1043}
1044
/* Set result->type to THEN_TYPE and consume one character if the next
   character in the buffer is CHAR; otherwise set it to ELSE_TYPE and
   consume nothing.  Expects `buffer' and `result' to be in scope at
   the point of use.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)  \
  do                                            \
    {                                           \
      if (*buffer->cur == CHAR)                 \
        {                                       \
          buffer->cur++;                        \
          result->type = THEN_TYPE;             \
        }                                       \
      else                                      \
        result->type = ELSE_TYPE;               \
    }                                           \
  while (0)
1053
/* Lex a token into pfile->cur_token, which is also incremented, to
   get diagnostics pointing to the correct location.

   Does not handle issues such as token lookahead, multiple-include
   optimization, directives, skipping etc.  This function is only
   suitable for use by _cpp_lex_token, and in special cases like
   lex_expansion_token which doesn't care for any of these issues.

   When meeting a newline, returns CPP_EOF if parsing a directive,
   otherwise returns to the start of the token buffer if permissible.
   Returns a pointer to the lexed token, whose type, flags and src_loc
   have been filled in.  */
cpp_token *
_cpp_lex_direct (cpp_reader *pfile)
{
  cppchar_t c;
  cpp_buffer *buffer;
  const unsigned char *comment_start;
  /* Claim the next token slot; RESULT may be redirected to the base
     of the token run further down when a fresh line is started.  */
  cpp_token *result = pfile->cur_token++;

 fresh_line:
  /* Entered at the start and again after every newline.  */
  result->flags = 0;
  buffer = pfile->buffer;
  if (buffer->need_line)
    {
      /* The end of a deferred pragma's line is reported as its own
         token, and the expansion-prevention count taken for the
         pragma is dropped again.  */
      if (pfile->state.in_deferred_pragma)
        {
          result->type = CPP_PRAGMA_EOL;
          pfile->state.in_deferred_pragma = false;
          if (!pfile->state.pragma_allow_expansion)
            pfile->state.prevent_expansion--;
          return result;
        }
      /* No further line could be obtained: hand back CPP_EOF.  */
      if (!_cpp_get_fresh_line (pfile))
        {
          result->type = CPP_EOF;
          if (!pfile->state.in_directive)
            {
              /* Tell the compiler the line number of the EOF token.  */
              result->src_loc = pfile->line_table->highest_line;
              result->flags = BOL;
            }
          return result;
        }
      /* Unless tokens are being kept, rewind to the first slot of the
         base token run so the storage is reused for the new line.  */
      if (!pfile->keep_tokens)
        {
          pfile->cur_run = &pfile->base_run;
          result = pfile->base_run.base;
          pfile->cur_token = result + 1;
        }
      /* First token on a line.  While parsing_args is 2 the token is
         additionally marked as preceded by whitespace.  */
      result->flags = BOL;
      if (pfile->state.parsing_args == 2)
        result->flags |= PREV_WHITE;
    }
  /* Re-read the buffer pointer after the fresh-line handling above
     (presumably _cpp_get_fresh_line can change it -- confirm).  */
  buffer = pfile->buffer;
 update_tokens_line:
  /* Also the re-entry point after a discarded comment.  */
  result->src_loc = pfile->line_table->highest_line;

 skipped_white:
  /* Process pending line notes once the cursor has reached the next
     note's position, unless an overlaid buffer is in use.  */
  if (buffer->cur >= buffer->notes[buffer->cur_note].pos
      && !pfile->overlaid_buffer)
    {
      _cpp_process_line_notes (pfile, false);
      result->src_loc = pfile->line_table->highest_line;
    }
  c = *buffer->cur++;

  /* Refine the token's location with the column of the cursor.  */
  LINEMAP_POSITION_FOR_COLUMN (result->src_loc, pfile->line_table,
                               CPP_BUF_COLUMN (buffer, buffer->cur));

  switch (c)
    {
    /* Horizontal whitespace; a NUL byte is routed the same way.  */
    case ' ': case '\t': case '\f': case '\v': case '\0':
      result->flags |= PREV_WHITE;
      skip_whitespace (pfile, c);
      goto skipped_white;

    case '\n':
      /* Only bump the line count when more of the buffer remains.  */
      if (buffer->cur < buffer->rlimit)
        CPP_INCREMENT_LINE (pfile, 0);
      buffer->need_line = true;
      goto fresh_line;

    case '0': case '1': case '2': case '3': case '4':
    case '5': case '6': case '7': case '8': case '9':
      {
        struct normalize_state nst = INITIAL_NORMALIZE_STATE;
        result->type = CPP_NUMBER;
        lex_number (pfile, &result->val.str, &nst);
        warn_about_normalization (pfile, result, &nst);
        break;
      }

    case 'L':
    case 'u':
    case 'U':
      /* 'L', 'u' or 'U' may introduce wide characters or strings.
         'u' and 'U' only do so when the uliterals option is set.  */
      if (c == 'L' || CPP_OPTION (pfile, uliterals))
        {
          if (*buffer->cur == '\'' || *buffer->cur == '"')
            {
              lex_string (pfile, result, buffer->cur - 1);
              break;
            }
        }
      /* Fall through.  */

    /* Identifier start characters.  'L', 'u' and 'U' are absent from
       the lists below because they have their own labels above.  */
    case '_':
    case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
    case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
    case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
    case 's': case 't':           case 'v': case 'w': case 'x':
    case 'y': case 'z':
    case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
    case 'G': case 'H': case 'I': case 'J': case 'K':
    case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
    case 'S': case 'T':           case 'V': case 'W': case 'X':
    case 'Y': case 'Z':
      result->type = CPP_NAME;
      {
        struct normalize_state nst = INITIAL_NORMALIZE_STATE;
        result->val.node = lex_identifier (pfile, buffer->cur - 1, false,
                                           &nst);
        warn_about_normalization (pfile, result, &nst);
      }

      /* Convert named operators to their proper types.  */
      if (result->val.node->flags & NODE_OPERATOR)
        {
          result->flags |= NAMED_OP;
          result->type = (enum cpp_ttype) result->val.node->directive_index;
        }
      break;

    case '\'':
    case '"':
      lex_string (pfile, result, buffer->cur - 1);
      break;

    case '/':
      /* A potential block or line comment.  */
      comment_start = buffer->cur;
      c = *buffer->cur;

      if (c == '*')
        {
          if (_cpp_skip_block_comment (pfile))
            cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
        }
      else if (c == '/' && (CPP_OPTION (pfile, cplusplus_comments)
                            || cpp_in_system_header (pfile)))
        {
          /* Warn about comments only if pedantically GNUC89, and not
             in system headers.  */
          if (CPP_OPTION (pfile, lang) == CLK_GNUC89 && CPP_PEDANTIC (pfile)
              && ! buffer->warned_cplusplus_comments)
            {
              cpp_error (pfile, CPP_DL_PEDWARN,
                         "C++ style comments are not allowed in ISO C90");
              cpp_error (pfile, CPP_DL_PEDWARN,
                         "(this will be reported only once per input file)");
              buffer->warned_cplusplus_comments = 1;
            }

          if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
            cpp_error (pfile, CPP_DL_WARNING, "multi-line comment");
        }
      else if (c == '=')
        {
          buffer->cur++;
          result->type = CPP_DIV_EQ;
          break;
        }
      else
        {
          result->type = CPP_DIV;
          break;
        }

      /* Only the two comment branches reach this point.  A comment
         that is not being saved counts as whitespace: flag the token
         and restart lexing with a refreshed source location.  */
      if (!pfile->state.save_comments)
        {
          result->flags |= PREV_WHITE;
          goto update_tokens_line;
        }

      /* Save the comment as a token in its own right.  */
      save_comment (pfile, result, comment_start, c);
      break;

    case '<':
      /* When angled headers are expected, try lexing a header name
         first.  If the token's type is not CPP_LESS afterwards that
         result is kept; otherwise continue with operator handling.  */
      if (pfile->state.angled_headers)
        {
          lex_string (pfile, result, buffer->cur - 1);
          if (result->type != CPP_LESS)
            break;
        }

      result->type = CPP_LESS;
      if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_LESS_EQ;
      else if (*buffer->cur == '<')
        {
          buffer->cur++;
          IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
        }
      else if (CPP_OPTION (pfile, digraphs))
        {
          /* Digraphs "<:" and "<%".  */
          if (*buffer->cur == ':')
            {
              buffer->cur++;
              result->flags |= DIGRAPH;
              result->type = CPP_OPEN_SQUARE;
            }
          else if (*buffer->cur == '%')
            {
              buffer->cur++;
              result->flags |= DIGRAPH;
              result->type = CPP_OPEN_BRACE;
            }
        }
      break;

    case '>':
      result->type = CPP_GREATER;
      if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_GREATER_EQ;
      else if (*buffer->cur == '>')
        {
          buffer->cur++;
          IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
        }
      break;

    case '%':
      result->type = CPP_MOD;
      if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_MOD_EQ;
      else if (CPP_OPTION (pfile, digraphs))
        {
          /* Digraphs "%:" (and the doubled "%:%:") and "%>".  */
          if (*buffer->cur == ':')
            {
              buffer->cur++;
              result->flags |= DIGRAPH;
              result->type = CPP_HASH;
              if (*buffer->cur == '%' && buffer->cur[1] == ':')
                buffer->cur += 2, result->type = CPP_PASTE;
            }
          else if (*buffer->cur == '>')
            {
              buffer->cur++;
              result->flags |= DIGRAPH;
              result->type = CPP_CLOSE_BRACE;
            }
        }
      break;

    case '.':
      result->type = CPP_DOT;
      /* A dot followed by a digit starts a number.  */
      if (ISDIGIT (*buffer->cur))
        {
          struct normalize_state nst = INITIAL_NORMALIZE_STATE;
          result->type = CPP_NUMBER;
          lex_number (pfile, &result->val.str, &nst);
          warn_about_normalization (pfile, result, &nst);
        }
      else if (*buffer->cur == '.' && buffer->cur[1] == '.')
        buffer->cur += 2, result->type = CPP_ELLIPSIS;
      else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
        buffer->cur++, result->type = CPP_DOT_STAR;
      break;

    case '+':
      result->type = CPP_PLUS;
      if (*buffer->cur == '+')
        buffer->cur++, result->type = CPP_PLUS_PLUS;
      else if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_PLUS_EQ;
      break;

    case '-':
      result->type = CPP_MINUS;
      if (*buffer->cur == '>')
        {
          buffer->cur++;
          result->type = CPP_DEREF;
          if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
            buffer->cur++, result->type = CPP_DEREF_STAR;
        }
      else if (*buffer->cur == '-')
        buffer->cur++, result->type = CPP_MINUS_MINUS;
      else if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_MINUS_EQ;
      break;

    case '&':
      result->type = CPP_AND;
      if (*buffer->cur == '&')
        buffer->cur++, result->type = CPP_AND_AND;
      else if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_AND_EQ;
      break;

    case '|':
      result->type = CPP_OR;
      if (*buffer->cur == '|')
        buffer->cur++, result->type = CPP_OR_OR;
      else if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_OR_EQ;
      break;

    case ':':
      result->type = CPP_COLON;
      if (*buffer->cur == ':' && CPP_OPTION (pfile, cplusplus))
        buffer->cur++, result->type = CPP_SCOPE;
      else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
        {
          /* Digraph ":>".  */
          buffer->cur++;
          result->flags |= DIGRAPH;
          result->type = CPP_CLOSE_SQUARE;
        }
      break;

    /* Operators with one optional following character.  */
    case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
    case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
    case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
    case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
    case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); break;

    /* Single-character punctuators.  */
    case '?': result->type = CPP_QUERY; break;
    case '~': result->type = CPP_COMPL; break;
    case ',': result->type = CPP_COMMA; break;
    case '(': result->type = CPP_OPEN_PAREN; break;
    case ')': result->type = CPP_CLOSE_PAREN; break;
    case '[': result->type = CPP_OPEN_SQUARE; break;
    case ']': result->type = CPP_CLOSE_SQUARE; break;
    case '{': result->type = CPP_OPEN_BRACE; break;
    case '}': result->type = CPP_CLOSE_BRACE; break;
    case ';': result->type = CPP_SEMICOLON; break;

      /* @ is a punctuator in Objective-C.  */
    case '@': result->type = CPP_ATSIGN; break;

    case '$':
    case '\\':
      {
        /* Step back onto the character so forms_identifier_p can
           examine it; if it starts an identifier, lex one from there.  */
        const uchar *base = --buffer->cur;
        struct normalize_state nst = INITIAL_NORMALIZE_STATE;

        if (forms_identifier_p (pfile, true, &nst))
          {
            result->type = CPP_NAME;
            result->val.node = lex_identifier (pfile, base, true, &nst);
            warn_about_normalization (pfile, result, &nst);
            break;
          }
        /* Not an identifier: step past the character again.  */
        buffer->cur++;
      }
      /* Fall through.  */

    default:
      /* Any other character becomes a one-byte CPP_OTHER token.  */
      create_literal (pfile, result, buffer->cur - 1, 1, CPP_OTHER);
      break;
    }

  return result;
}
1418
1419/* An upper bound on the number of bytes needed to spell TOKEN.
1420 Does not include preceding whitespace. */
1421unsigned int
1422cpp_token_len (const cpp_token *token)
1423{
1424 unsigned int len;
1425
1426 switch (TOKEN_SPELL (token))
1427 {
1428 default: len = 6; break;
1429 case SPELL_LITERAL: len = token->val.str.len; break;
1430 case SPELL_IDENT: len = NODE_LEN (token->val.node) * 10; break;
1431 }
1432
1433 return len;
1434}
1435
/* Decode the UTF-8 sequence that starts at NAME and store the matching
   "\UXXXXXXXX" escape (backslash, 'U', eight lowercase hex digits) in
   BUFFER.  Exactly 10 bytes are written and no terminating nul is
   added.  Returns the number of bytes of NAME that were consumed,
   which is the count of leading 1 bits in the first byte.  Aborts if
   a byte after the first is not of the form 10xxxxxx.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  unsigned long code_point;
  unsigned int lead;
  int seq_len = 0;
  int idx;
  int shift;

  /* The run of leading 1 bits in the first byte gives the length of
     the whole sequence.  */
  lead = *name;
  while (lead & 0x80)
    {
      seq_len++;
      lead <<= 1;
    }

  /* The payload bits of the first byte are those below the length
     marker.  */
  code_point = *name & (0x7F >> seq_len);

  /* Each following byte contributes its low six bits.  */
  for (idx = 1; idx < seq_len; idx++)
    {
      name++;
      code_point = (code_point << 6) | (*name & 0x3F);

      /* Ill-formed UTF-8: the byte lacks the 10xxxxxx marker.  */
      if ((*name & ~0x3F) != 0x80)
        abort ();
    }

  buffer[0] = '\\';
  buffer[1] = 'U';
  /* Most significant nibble first.  */
  for (idx = 2, shift = 28; shift >= 0; idx++, shift -= 4)
    buffer[idx] = hex_digits[(code_point >> shift) & 0xF];

  return seq_len;
}
1469
1470
1471/* Write the spelling of a token TOKEN to BUFFER. The buffer must
1472 already contain the enough space to hold the token's spelling.
1473 Returns a pointer to the character after the last character written.
1474 FORSTRING is true if this is to be the spelling after translation
1475 phase 1 (this is different for UCNs).
1476 FIXME: Would be nice if we didn't need the PFILE argument. */
1477unsigned char *
1478cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
1479 unsigned char *buffer, bool forstring)
1480{
1481 switch (TOKEN_SPELL (token))
1482 {
1483 case SPELL_OPERATOR:
1484 {
1485 const unsigned char *spelling;
1486 unsigned char c;
1487
1488 if (token->flags & DIGRAPH)
1489 spelling
1490 = digraph_spellings[(int) token->type - (int) CPP_FIRST_DIGRAPH];
1491 else if (token->flags & NAMED_OP)
1492 goto spell_ident;
1493 else
1494 spelling = TOKEN_NAME (token);
1495
1496 while ((c = *spelling++) != '\0')
1497 *buffer++ = c;
1498 }
1499 break;
1500
1501 spell_ident:
1502 case SPELL_IDENT:
1503 if (forstring)
1504 {
1505 memcpy (buffer, NODE_NAME (token->val.node),
1506 NODE_LEN (token->val.node));
1507 buffer += NODE_LEN (token->val.node);
1508 }
1509 else
1510 {
1511 size_t i;
1512 const unsigned char * name = NODE_NAME (token->val.node);
1513
1514 for (i = 0; i < NODE_LEN (token->val.node); i++)
1515 if (name[i] & ~0x7F)
1516 {
1517 i += utf8_to_ucn (buffer, name + i) - 1;
1518 buffer += 10;
1519 }
1520 else
1521 *buffer++ = NODE_NAME (token->val.node)[i];
1522 }
1523 break;
1524
1525 case SPELL_LITERAL:
1526 memcpy (buffer, token->val.str.text, token->val.str.len);
1527 buffer += token->val.str.len;
1528 break;
1529
1530 case SPELL_NONE:
1531 cpp_error (pfile, CPP_DL_ICE,
1532 "unspellable token %s", TOKEN_NAME (token));
1533 break;
1534 }
1535
1536 return buffer;
1537}
1538
1539/* Returns TOKEN spelt as a null-terminated string. The string is
1540 freed when the reader is destroyed. Useful for diagnostics. */
1541unsigned char *
1542cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
1543{
1544 unsigned int len = cpp_token_len (token) + 1;
1545 unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
1546
1547 end = cpp_spell_token (pfile, token, start, false);
1548 end[0] = '\0';
1549
1550 return start;
1551}
1552
1553/* Used by C front ends, which really should move to using
1554 cpp_token_as_text. */
1555const char *
1556cpp_type2name (enum cpp_ttype type)
1557{
1558 return (const char *) token_spellings[type].name;
1559}
1560
1561/* Writes the spelling of token to FP, without any preceding space.
1562 Separated from cpp_spell_token for efficiency - to avoid stdio
1563 double-buffering. */
1564void
1565cpp_output_token (const cpp_token *token, FILE *fp)
1566{
1567 switch (TOKEN_SPELL (token))
1568 {
1569 case SPELL_OPERATOR:
1570 {
1571 const unsigned char *spelling;
1572 int c;
1573
1574 if (token->flags & DIGRAPH)
1575 spelling
1576 = digraph_spellings[(int) token->type - (int) CPP_FIRST_DIGRAPH];
1577 else if (token->flags & NAMED_OP)
1578 goto spell_ident;
1579 else
1580 spelling = TOKEN_NAME (token);
1581
1582 c = *spelling;
1583 do
1584 putc (c, fp);
1585 while ((c = *++spelling) != '\0');
1586 }
1587 break;
1588
1589 spell_ident:
1590 case SPELL_IDENT:
1591 {
1592 size_t i;
1593 const unsigned char * name = NODE_NAME (token->val.node);
1594
1595 for (i = 0; i < NODE_LEN (token->val.node); i++)
1596 if (name[i] & ~0x7F)
1597 {
1598 unsigned char buffer[10];
1599 i += utf8_to_ucn (buffer, name + i) - 1;
1600 fwrite (buffer, 1, 10, fp);
1601 }
1602 else
1603 fputc (NODE_NAME (token->val.node)[i], fp);
1604 }
1605 break;
1606
1607 case SPELL_LITERAL:
1608 fwrite (token->val.str.text, 1, token->val.str.len, fp);
1609 break;
1610
1611 case SPELL_NONE:
1612 /* An error, most probably. */
1613 break;
1614 }
1615}
1616
1617/* Compare two tokens. */
1618int
1619_cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
1620{
1621 if (a->type == b->type && a->flags == b->flags)
1622 switch (TOKEN_SPELL (a))
1623 {
1624 default: /* Keep compiler happy. */
1625 case SPELL_OPERATOR:
1626 return 1;
1627 case SPELL_NONE:
1628 return (a->type != CPP_MACRO_ARG || a->val.arg_no == b->val.arg_no);
1629 case SPELL_IDENT:
1630 return a->val.node == b->val.node;
1631 case SPELL_LITERAL:
1632 return (a->val.str.len == b->val.str.len
1633 && !memcmp (a->val.str.text, b->val.str.text,
1634 a->val.str.len));
1635 }
1636
1637 return 0;
1638}
1639
1640/* Returns nonzero if a space should be inserted to avoid an
1641 accidental token paste for output. For simplicity, it is
1642 conservative, and occasionally advises a space where one is not
1643 needed, e.g. "." and ".2". */
1644int
1645cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
1646 const cpp_token *token2)
1647{
1648 enum cpp_ttype a = token1->type, b = token2->type;
1649 cppchar_t c;
1650
1651 if (token1->flags & NAMED_OP)
1652 a = CPP_NAME;
1653 if (token2->flags & NAMED_OP)
1654 b = CPP_NAME;
1655
1656 c = EOF;
1657 if (token2->flags & DIGRAPH)
1658 c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
1659 else if (token_spellings[b].category == SPELL_OPERATOR)
1660 c = token_spellings[b].name[0];
1661
1662 /* Quickly get everything that can paste with an '='. */
1663 if ((int) a <= (int) CPP_LAST_EQ && c == '=')
1664 return 1;
1665
1666 switch (a)
1667 {
1668 case CPP_GREATER: return c == '>';
1669 case CPP_LESS: return c == '<' || c == '%' || c == ':';
1670 case CPP_PLUS: return c == '+';
1671 case CPP_MINUS: return c == '-' || c == '>';
1672 case CPP_DIV: return c == '/' || c == '*'; /* Comments. */
1673 case CPP_MOD: return c == ':' || c == '>';
1674 case CPP_AND: return c == '&';
1675 case CPP_OR: return c == '|';
1676 case CPP_COLON: return c == ':' || c == '>';
1677 case CPP_DEREF: return c == '*';
1678 case CPP_DOT: return c == '.' || c == '%' || b == CPP_NUMBER;
1679 case CPP_HASH: return c == '#' || c == '%'; /* Digraph form. */
1680 case CPP_NAME: return ((b == CPP_NUMBER
1681 && name_p (pfile, &token2->val.str))
1682 || b == CPP_NAME
1683 || b == CPP_CHAR || b == CPP_STRING); /* L */
1684 case CPP_NUMBER: return (b == CPP_NUMBER || b == CPP_NAME
1685 || c == '.' || c == '+' || c == '-');
1686 /* UCNs */
1687 case CPP_OTHER: return ((token1->val.str.text[0] == '\\'
1688 && b == CPP_NAME)
1689 || (CPP_OPTION (pfile, objc)
1690 && token1->val.str.text[0] == '@'
1691 && (b == CPP_NAME || b == CPP_STRING)));
1692 default: break;
1693 }
1694
1695 return 0;
1696}
1697
1698/* Output all the remaining tokens on the current line, and a newline
1699 character, to FP. Leading whitespace is removed. If there are
1700 macros, special token padding is not performed. */
1701void
1702cpp_output_line (cpp_reader *pfile, FILE *fp)
1703{
1704 const cpp_token *token;
1705
1706 token = cpp_get_token (pfile);
1707 while (token->type != CPP_EOF)
1708 {
1709 cpp_output_token (token, fp);
1710 token = cpp_get_token (pfile);
1711 if (token->flags & PREV_WHITE)
1712 putc (' ', fp);
1713 }
1714
1715 putc ('\n', fp);
1716}
1717
1718/* Return a string representation of all the remaining tokens on the
1719 current line. The result is allocated using xmalloc and must be
1720 freed by the caller. */
1721unsigned char *
1722cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
1723{
1724 const cpp_token *token;
1725 unsigned int out = dir_name ? ustrlen (dir_name) : 0;
1726 unsigned int alloced = 120 + out;
1727 unsigned char *result = (unsigned char *) xmalloc (alloced);
1728
1729 /* If DIR_NAME is empty, there are no initial contents. */
1730 if (dir_name)
1731 {
1732 sprintf ((char *) result, "#%s ", dir_name);
1733 out += 2;
1734 }
1735
1736 token = cpp_get_token (pfile);
1737 while (token->type != CPP_EOF)
1738 {
1739 unsigned char *last;
1740 /* Include room for a possible space and the terminating nul. */
1741 unsigned int len = cpp_token_len (token) + 2;
1742
1743 if (out + len > alloced)
1744 {
1745 alloced *= 2;
1746 if (out + len > alloced)
1747 alloced = out + len;
1748 result = (unsigned char *) xrealloc (result, alloced);
1749 }
1750
1751 last = cpp_spell_token (pfile, token, &result[out], 0);
1752 out = last - result;
1753
1754 token = cpp_get_token (pfile);
1755 if (token->flags & PREV_WHITE)
1756 result[out++] = ' ';
1757 }
1758
1759 result[out] = '\0';
1760 return result;
1761}
1762
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest payload a new buffer is given.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest buffer that is still
   handed out for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when BUFF
   is extended: MIN_EXTRA plus twice the distance from BUFF's cur to
   its limit.  Both arguments are fully parenthesized so that any
   expression may be passed for them.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
#error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
1777
1778/* Create a new allocation buffer. Place the control block at the end
1779 of the buffer, so that buffer overflows will cause immediate chaos. */
1780static _cpp_buff *
1781new_buff (size_t len)
1782{
1783 _cpp_buff *result;
1784 unsigned char *base;
1785
1786 if (len < MIN_BUFF_SIZE)
1787 len = MIN_BUFF_SIZE;
1788 len = CPP_ALIGN (len);
1789
1790 base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
1791 result = (_cpp_buff *) (base + len);
1792 result->base = base;
1793 result->cur = base;
1794 result->limit = base + len;
1795 result->next = NULL;
1796 return result;
1797}
1798
1799/* Place a chain of unwanted allocation buffers on the free list. */
1800void
1801_cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
1802{
1803 _cpp_buff *end = buff;
1804
1805 while (end->next)
1806 end = end->next;
1807 end->next = pfile->free_buffs;
1808 pfile->free_buffs = buff;
1809}
1810
1811/* Return a free buffer of size at least MIN_SIZE. */
1812_cpp_buff *
1813_cpp_get_buff (cpp_reader *pfile, size_t min_size)
1814{
1815 _cpp_buff *result, **p;
1816
1817 for (p = &pfile->free_buffs;; p = &(*p)->next)
1818 {
1819 size_t size;
1820
1821 if (*p == NULL)
1822 return new_buff (min_size);
1823 result = *p;
1824 size = result->limit - result->base;
1825 /* Return a buffer that's big enough, but don't waste one that's
1826 way too big. */
1827 if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
1828 break;
1829 }
1830
1831 *p = result->next;
1832 result->next = NULL;
1833 result->cur = result->base;
1834 return result;
1835}
1836
1837/* Creates a new buffer with enough space to hold the uncommitted
1838 remaining bytes of BUFF, and at least MIN_EXTRA more bytes. Copies
1839 the excess bytes to the new buffer. Chains the new buffer after
1840 BUFF, and returns the new buffer. */
1841_cpp_buff *
1842_cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
1843{
1844 size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
1845 _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
1846
1847 buff->next = new_buff;
1848 memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
1849 return new_buff;
1850}
1851
1852/* Creates a new buffer with enough space to hold the uncommitted
1853 remaining bytes of the buffer pointed to by BUFF, and at least
1854 MIN_EXTRA more bytes. Copies the excess bytes to the new buffer.
1855 Chains the new buffer before the buffer pointed to by BUFF, and
1856 updates the pointer to point to the new buffer. */
1857void
1858_cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
1859{
1860 _cpp_buff *new_buff, *old_buff = *pbuff;
1861 size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
1862
1863 new_buff = _cpp_get_buff (pfile, size);
1864 memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
1865 new_buff->next = old_buff;
1866 *pbuff = new_buff;
1867}
1868
1869/* Free a chain of buffers starting at BUFF. */
1870void
1871_cpp_free_buff (_cpp_buff *buff)
1872{
1873 _cpp_buff *next;
1874
1875 for (; buff; buff = next)
1876 {
1877 next = buff->next;
1878 free (buff->base);
1879 }
1880}
1881
1882/* Allocate permanent, unaligned storage of length LEN. */
1883unsigned char *
1884_cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
1885{
1886 _cpp_buff *buff = pfile->u_buff;
1887 unsigned char *result = buff->cur;
1888
1889 if (len > (size_t) (buff->limit - result))
1890 {
1891 buff = _cpp_get_buff (pfile, len);
1892 buff->next = pfile->u_buff;
1893 pfile->u_buff = buff;
1894 result = buff->cur;
1895 }
1896
1897 buff->cur = result + len;
1898 return result;
1899}
1900
1901/* Allocate permanent, unaligned storage of length LEN from a_buff.
1902 That buffer is used for growing allocations when saving macro
1903 replacement lists in a #define, and when parsing an answer to an
1904 assertion in #assert, #unassert or #if (and therefore possibly
1905 whilst expanding macros). It therefore must not be used by any
1906 code that they might call: specifically the lexer and the guts of
1907 the macro expander.
1908
1909 All existing other uses clearly fit this restriction: storing
1910 registered pragmas during initialization. */
1911unsigned char *
1912_cpp_aligned_alloc (cpp_reader *pfile, size_t len)
1913{
1914 _cpp_buff *buff = pfile->a_buff;
1915 unsigned char *result = buff->cur;
1916
1917 if (len > (size_t) (buff->limit - result))
1918 {
1919 buff = _cpp_get_buff (pfile, len);
1920 buff->next = pfile->a_buff;
1921 pfile->a_buff = buff;
1922 result = buff->cur;
1923 }
1924
1925 buff->cur = result + len;
1926 return result;
1927}
1928
1929/* Say which field of TOK is in use. */
1930
1931enum cpp_token_fld_kind
1932cpp_token_val_index (cpp_token *tok)
1933{
1934 switch (TOKEN_SPELL (tok))
1935 {
1936 case SPELL_IDENT:
1937 return CPP_TOKEN_FLD_NODE;
1938 case SPELL_LITERAL:
1939 return CPP_TOKEN_FLD_STR;
1940 case SPELL_NONE:
1941 if (tok->type == CPP_MACRO_ARG)
1942 return CPP_TOKEN_FLD_ARG_NO;
1943 else if (tok->type == CPP_PADDING)
1944 return CPP_TOKEN_FLD_SOURCE;
1945 else if (tok->type == CPP_PRAGMA)
1946 return CPP_TOKEN_FLD_PRAGMA;
1947 /* else fall through */
1948 default:
1949 return CPP_TOKEN_FLD_NONE;
1950 }
1951}