1 /* __gmp_doscan -- formatted input internals.
3 THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY. THEY'RE ALMOST
4 CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
5 FUTURE GNU MP RELEASES.
7 Copyright 2001, 2002, 2003 Free Software Foundation, Inc.
9 This file is part of the GNU MP Library.
11 The GNU MP Library is free software; you can redistribute it and/or modify
12 it under the terms of the GNU Lesser General Public License as published by
13 the Free Software Foundation; either version 3 of the License, or (at your
14 option) any later version.
16 The GNU MP Library is distributed in the hope that it will be useful, but
17 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
18 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
19 License for more details.
21 You should have received a copy of the GNU Lesser General Public License
22 along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
24 #define _GNU_SOURCE /* for DECIMAL_POINT in langinfo.h */
35 #include <stddef.h> /* for ptrdiff_t */
37 #include <stdlib.h> /* for strtol */
41 #include <langinfo.h> /* for nl_langinfo */
45 #include <locale.h> /* for localeconv */
49 # include <inttypes.h> /* for intmax_t */
57 #include <sys/types.h> /* for quad_t */
64 /* Change this to "#define TRACE(x) x" for some traces. */
70 It's necessary to parse up the format string to recognise the GMP
71 extra types F, Q and Z. Other types and conversions are passed
72 across to the standard sscanf or fscanf via funs->scan, for ease of
73 implementation. This is essential in the case of something like glibc
74 %p where the pointer format isn't actually documented.
76 Because funs->scan doesn't get the whole input it can't put the right
77 values in for %n, so that's handled in __gmp_doscan. Neither sscanf
78 nor fscanf directly indicate how many characters were read, so an
79 extra %n is appended to each run for that. For fscanf this merely
80 supports our %n output, but for sscanf it lets funs->step move us
81 along the input string.
83 Whitespace and literal matches in the format string, including %%,
84 are handled directly within __gmp_doscan. This is reasonably
85 efficient, and avoids some suspicious behaviour observed in various
86 system libc's. GLIBC 2.2.4 for instance returns 0 on
90 sscanf(" ", " x%d",&n)
92 whereas we think they should return EOF, since end-of-string is
93 reached when a match of "x" is required.
95 For standard % conversions, funs->scan is called once for each
96 conversion. If we had vfscanf and vsscanf and could rely on their
97 fixed text matching behaviour then we could call them with multiple
98 consecutive standard conversions. But plain fscanf and sscanf work
99 fine, and parsing one field at a time shouldn't be too much of a slowdown.
104 gmpscan reads a gmp type. It's only used from one place, but is a
105 separate subroutine to avoid a big chunk of complicated code in the
106 middle of __gmp_doscan. Within gmpscan a couple of loopbacks make it
107 possible to share code for parsing integers, rationals and floats.
109 In gmpscan normally one char of lookahead is maintained, but when width
110 is reached that stops, on the principle that an fgetc/ungetc of a char
111 past where we're told to stop would be undesirable. "chars" is how many
112 characters have been read so far, including the current c. When
113 chars==width and another character is desired then a jump is done to the
114 "convert" stage. c is invalid and mustn't be unget'ed in this case;
115 chars is set to width+1 to indicate that.
117 gmpscan normally returns the number of characters read. -1 means an
118 invalid field, -2 means EOF reached before any matching characters were read.
121 For hex floats, the mantissa part is passed to mpf_set_str, then the
122 exponent is applied with mpf_mul_2exp or mpf_div_2exp. This is easier
123 than teaching mpf_set_str about an exponent factor (ie. 2) differing
124 from the mantissa radix point factor (ie. 16). mpf_mul_2exp and
125 mpf_div_2exp will preserve the application requested precision, so
126 nothing in that respect is lost by making this a two-step process.
130 C99 7.19.6.2 paras 9 and 10 say an input item is read as the longest
131 string which is a match for the appropriate type, or a prefix of a
132 match. With that done, if it's only a prefix then the result is a
133 matching failure, ie. invalid input.
135 This rule seems fairly clear, but doesn't seem to be universally
136 applied in system C libraries. Even GLIBC doesn't seem to get it
137 right, insofar as it seems to accept some apparently invalid forms.
138 Eg. glibc 2.3.1 accepts "0x" for a "%i", where a reading of the
139 standard would suggest a non-empty sequence of digits should be
140 required after an "0x".
142 A footnote to 7.19.6.2 para 17 notes how this input item reading can
143 mean inputs acceptable to strtol are not acceptable to fscanf. We
144 think this confirms our reading of "0x" as invalid.
146 Clearly gmp_sscanf could backtrack to a longest input which was a
147 valid match for a given item, but this is not done, since C99 says
148 sscanf is identical to fscanf, so we make gmp_sscanf identical to gmp_fscanf.
153 C99 says "ll" is for long long, and "L" is for long double floats.
154 Unfortunately in GMP 4.1.1 we documented the two as equivalent. This
155 doesn't affect us directly, since both are passed through to plain
156 scanf. It seems wisest not to try to enforce the C99 rule. This is
157 consistent with what we said before, though whether it actually
158 worked was always up to the C library.
162 Consideration was given to using separate code for gmp_fscanf and
163 gmp_sscanf. The sscanf case could zip across a string doing literal
164 matches or recognising digits in gmpscan, rather than making a
165 function call fun->get per character. The fscanf could use getc
166 rather than fgetc too, which might help those systems where getc is a
167 macro or otherwise inlined. But none of this scanning and converting
168 will be particularly fast, so the two are done together to keep it a
169 little simpler for now.
171 Various multibyte string issues are not addressed, for a start C99
172 scanf says the format string is multibyte. Since we pass %c, %s and
173 %[ to the system scanf, they might do multibyte reads already, but
174 it's another matter whether or not that can be used, since our digit
175 and whitespace parsing is only unibyte. The plan is to quietly
176 ignore multibyte locales for now. This is not as bad as it sounds,
177 since GMP is presumably used mostly on numbers, which can be
178 perfectly adequately treated in plain ASCII.
183 struct gmp_doscan_params_t {
193 ASSERT (chars <= width); \
197 (c) = (*funs->get) (data); \
200 /* store into "s", extending if necessary */
203 ASSERT (s_upto <= s_alloc); \
204 if (s_upto >= s_alloc) \
206 size_t s_alloc_new = s_alloc + S_ALLOC_STEP; \
207 s = __GMP_REALLOCATE_FUNC_TYPE (s, s_alloc, s_alloc_new, char); \
208 s_alloc = s_alloc_new; \
213 #define S_ALLOC_STEP 512
/* Read one GMP field (p->type 'F', 'Q' or 'Z') a character at a time through
   funs->get and convert it into dst, which is cast to mpf_ptr, mpq_ptr or
   mpz_ptr for the conversion.  The text to convert is collected in the
   buffer s, which starts at S_ALLOC_STEP bytes and is handed to
   __gmp_free_func at the end.  Per the notes at the top of this file the
   result is the number of characters read, -1 for an invalid field, or -2
   for EOF before any matching characters.  */
216 gmpscan (const struct gmp_doscan_funs_t *funs, void *data,
217 const struct gmp_doscan_params_t *p, void *dst)
219 int chars, c, base, first, width, seen_point, seen_digit, hexfloat;
220 size_t s_upto, s_alloc, hexexp;
224 TRACE (printf ("gmpscan\n"));
/* only the three GMP type codes are handled here */
226 ASSERT (p->type == 'F' || p->type == 'Q' || p->type == 'Z');
/* prime the one character of lookahead */
228 c = (*funs->get) (data);
/* a p->width of 0 is treated as "no limit"; INT_MAX-1 rather than INT_MAX
   keeps the width+1 marker used at the end of this function representable */
235 width = (p->width == 0 ? INT_MAX-1 : p->width);
/* initial allocation for s, the string later given to mp*_set_str */
237 s_alloc = S_ALLOC_STEP;
238 s = __GMP_ALLOCATE_FUNC_TYPE (s_alloc, char);
252 /* don't store '+', it's not accepted by mpz_set_str etc */
259 base = 10; /* decimal if no base indicator */
262 seen_digit = 1; /* 0 alone is a valid number */
264 base = 8; /* leading 0 is octal, for non-floats */
267 if (c == 'x' || c == 'X')
270 seen_digit = 0; /* must have digits after an 0x */
271 if (p->type == 'F') /* don't pass 'x' to mpf_set_str_point */
/* 8 and 9 are not octal digits */
292 if (base == 8 && (c == '8' || c == '9'))
/* only floats take a decimal point, and only one of them */
304 if (p->type == 'F' && ! seen_point)
306 /* For a multi-character decimal point, if the first character is
307 present then all of it must be, otherwise the input is
308 considered invalid. */
309 const char *point = GMP_DECIMAL_POINT;
/* go through unsigned char so pc compares equal to values from funs->get
   (NOTE(review): assumes funs->get returns unsigned-char values like fgetc
   -- confirm) */
310 int pc = (unsigned char) *point++;
317 pc = (unsigned char) *point++;
/* 'p'/'P' introduces the exponent of a hex float */
331 if (hexfloat && (c == 'p' || c == 'P'))
333 hexexp = s_upto; /* exponent location */
334 base = 10; /* exponent in decimal */
/* 'e'/'E' introduces the exponent of a non-hex float */
337 else if (! hexfloat && (c == 'e' || c == 'E'))
340 /* must have at least one digit in the mantissa, just an exponent
341 is not good enough */
/* '/' separates numerator and denominator of a rational */
354 if (p->type == 'Q' && c == '/')
356 /* must have at least one digit in the numerator */
360 /* now look for at least one digit in the denominator */
363 /* allow the base to be redetermined for "%i" */
380 TRACE (printf (" convert \"%s\"\n", s));
382 /* We ought to have parsed out a valid string above, so just test
383 mpz_set_str etc with an ASSERT. */
387 mpf_ptr f = (mpf_ptr) dst;
/* mantissa only: base 16 for a hex float, otherwise base 10 */
390 ASSERT_NOCARRY (mpf_set_str (f, s, hexfloat ? 16 : 10));
/* the decimal exponent text begins one past the offset saved in hexexp */
395 exp = strtol (s + hexexp + 1, &dummy, 10);
/* NOTE(review): the test choosing between the next two calls is not
   visible here; presumably it is the sign of exp -- confirm */
397 mpf_mul_2exp (f, f, (unsigned long) exp);
399 mpf_div_2exp (f, f, - (unsigned long) exp);
404 ASSERT_NOCARRY (mpq_set_str ((mpq_ptr) dst, s, p->base));
407 ASSERT_NOCARRY (mpz_set_str ((mpz_ptr) dst, s, p->base));
/* chars == width+1 marks that width was reached and c is not a valid
   lookahead character, so c is only pushed back in the other case */
417 ASSERT (chars <= width+1);
418 if (chars != width+1)
420 (*funs->unget) (c, data);
421 TRACE (printf (" ungetc %d, to give %d chars\n", c, chars-1));
/* release s, passing the size it currently has allocated */
425 (*__gmp_free_func) (s, s_alloc);
429 TRACE (printf (" invalid\n"));
433 TRACE (printf (" return %d chars (cf width %d)\n", chars, width));
438 /* Read and discard whitespace, if any. Return number of chars skipped.
439 Whitespace skipping never provokes the EOF return from __gmp_doscan, so
440 it's not necessary to watch for EOF from funs->get. */
442 skip_white (const struct gmp_doscan_funs_t *funs, void *data)
/* fetch the next input character through the caller-supplied reader */
449 c = (funs->get) (data);
/* hand c back to the input so the next read sees it again */
454 (funs->unget) (c, data);
/* NOTE(review): ret is presumably the skipped-character count that is
   returned -- the return statement is not visible here */
457 TRACE (printf (" skip white %d\n", ret));
/* Scanning engine working on the format string orig_fmt.  Whitespace,
   literal text, %n and the GMP conversions (type F, Q or Z, read by gmpscan)
   are handled here; every other conversion is copied into alloc_fmt with a
   "%n" appended and passed to funs->scan, so the number of characters it
   consumed is known.  funs supplies the get, unget, scan and step
   operations on the input identified by data.  orig_ap holds the
   destination pointers; it is copied with va_copy and not itself advanced.
   NOTE(review): the return statement is not visible in this listing --
   confirm the return convention against the callers.  */
463 __gmp_doscan (const struct gmp_doscan_funs_t *funs, void *data,
464 const char *orig_fmt, va_list orig_ap)
466 struct gmp_doscan_params_t param;
469 const char *fmt, *this_fmt, *end_fmt;
470 size_t orig_fmt_len, alloc_fmt_size, len;
471 int new_fields, new_chars;
476 TRACE (printf ("__gmp_doscan \"%s\"\n", orig_fmt);
477 if (funs->scan == (gmp_doscan_scan_t) sscanf)
478 printf (" s=\"%s\"\n", * (const char **) data));
480 /* Don't modify orig_ap, if va_list is actually an array and hence call by
481 reference. It could be argued that it'd be more efficient to leave
482 callers to make a copy if they care, but doing so here is going to be a
483 very small part of the total work, and we may as well keep applications
485 va_copy (ap, orig_ap);
487 /* Parts of the format string are going to be copied so that a " %n" can
488 be appended. alloc_fmt is some space for that. orig_fmt_len+4 will be
489 needed if fmt consists of a single "%" specifier, but otherwise is an
490 overestimate. We're not going to be very fast here, so use
491 __gmp_allocate_func rather than TMP_ALLOC. */
492 orig_fmt_len = strlen (orig_fmt);
493 alloc_fmt_size = orig_fmt_len + 4;
494 alloc_fmt = __GMP_ALLOCATE_FUNC_TYPE (alloc_fmt_size, char);
/* end_fmt points at the terminating '\0' of the format string */
497 end_fmt = orig_fmt + orig_fmt_len;
/* skipped whitespace is added to the running character count */
509 chars += skip_white (funs, data);
/* read one input character; it is pushed back below
   (NOTE(review): presumably when it fails to match the literal in the
   format -- the test itself is not visible here) */
517 c = (funs->get) (data);
520 (funs->unget) (c, data);
534 param.base = 0; /* for e,f,g,i */
539 TRACE (printf (" this_fmt \"%s\"\n", this_fmt));
543 ASSERT (fmt <= end_fmt);
548 case '\0': /* unterminated % sequence */
552 case '%': /* literal % */
555 case '[': /* character range */
559 /* ']' allowed as the first char (possibly after '^') */
564 ASSERT (fmt <= end_fmt);
567 /* unterminated % sequence */
576 case 'c': /* characters */
577 case 's': /* string of non-whitespace */
578 case 'p': /* pointer */
/* copy the conversion text from this_fmt up to fmt into alloc_fmt and
   append "%n", so funs->scan also reports how many characters it used */
580 len = fmt - this_fmt;
581 memcpy (alloc_fmt, this_fmt, len);
582 alloc_fmt[len++] = '%';
583 alloc_fmt[len++] = 'n';
584 alloc_fmt[len] = '\0';
586 TRACE (printf (" scan \"%s\"\n", alloc_fmt);
587 if (funs->scan == (gmp_doscan_scan_t) sscanf)
588 printf (" s=\"%s\"\n", * (const char **) data));
/* NOTE(review): this call appears to be the param.ignore (assignment
   suppressed) path: no destination is taken from ap and only new_chars
   receives a value */
593 new_fields = (*funs->scan) (data, alloc_fmt, &new_chars, NULL);
594 ASSERT (new_fields == 0 || new_fields == EOF);
/* normal path: the next pointer from ap is the conversion's destination */
598 void *arg = va_arg (ap, void *);
599 new_fields = (*funs->scan) (data, alloc_fmt, arg, &new_chars);
600 ASSERT (new_fields==0 || new_fields==1 || new_fields==EOF);
603 goto done; /* invalid input */
606 ASSERT (new_chars != -1);
608 TRACE (printf (" new_fields %d new_chars %d\n",
609 new_fields, new_chars));
/* NOTE(review): the ASSERTs above compare against EOF while this tests
   the literal -1; the two agree only where EOF == -1 */
611 if (new_fields == -1)
612 goto eof_no_match; /* EOF before anything matched */
614 /* Under param.ignore, when new_fields==0 we don't know if
615 it's a successful match or an invalid field. new_chars
616 won't have been assigned if it was an invalid field. */
618 goto done; /* invalid input */
/* advance the input by the number of characters funs->scan consumed */
621 (*funs->step) (data, new_chars);
628 case 'd': /* decimal */
629 case 'u': /* decimal */
633 case 'e': /* float */
634 case 'E': /* float */
635 case 'f': /* float */
636 case 'g': /* float */
637 case 'G': /* float */
638 case 'i': /* integer with base marker */
/* without an F, Q or Z type this is not a GMP conversion
   (NOTE(review): presumably it then joins the standard scanf path above
   -- the branch body is not visible here) */
640 if (param.type != 'F' && param.type != 'Q' && param.type != 'Z')
/* leading whitespace is skipped and counted before the GMP field */
643 chars += skip_white (funs, data);
/* no destination is taken from ap when the assignment is suppressed */
645 new_chars = gmpscan (funs, data, &param,
646 param.ignore ? NULL : va_arg (ap, void*));
652 ASSERT (new_chars >= 0);
654 goto increment_fields;
656 case 'a': /* glibc allocate string */
657 case '\'': /* glibc digit groupings */
660 case 'F': /* mpf_t */
661 case 'j': /* intmax_t */
662 case 'L': /* long long */
663 case 'q': /* quad_t */
664 case 'Q': /* mpq_t */
665 case 't': /* ptrdiff_t */
666 case 'z': /* size_t */
667 case 'Z': /* mpz_t */
672 case 'h': /* short or char */
673 if (param.type != 'h')
675 param.type = 'H'; /* internal code for "hh" */
680 case 'l': /* long, long long, double or long double */
681 if (param.type != 'l')
683 param.type = 'L'; /* "ll" means "L" */
/* %n: store the running character count chars through the next pointer
   from ap, in the representation selected by the type modifier; for F, Q
   and Z the destination is a GMP variable */
690 p = va_arg (ap, void *);
691 TRACE (printf (" store %%n to %p\n", p));
692 switch (param.type) {
693 case '\0': * (int *) p = chars; break;
694 case 'F': mpf_set_si ((mpf_ptr) p, (long) chars); break;
695 case 'H': * (char *) p = chars; break;
696 case 'h': * (short *) p = chars; break;
698 case 'j': * (intmax_t *) p = chars; break;
700 case 'j': ASSERT_FAIL (intmax_t not available); break;
702 case 'l': * (long *) p = chars; break;
703 #if HAVE_QUAD_T && HAVE_LONG_LONG
705 ASSERT_ALWAYS (sizeof (quad_t) == sizeof (long long));
708 case 'q': ASSERT_FAIL (quad_t not available); break;
711 case 'L': * (long long *) p = chars; break;
713 case 'L': ASSERT_FAIL (long long not available); break;
715 case 'Q': mpq_set_si ((mpq_ptr) p, (long) chars, 1L); break;
717 case 't': * (ptrdiff_t *) p = chars; break;
719 case 't': ASSERT_FAIL (ptrdiff_t not available); break;
721 case 'z': * (size_t *) p = chars; break;
722 case 'Z': mpz_set_si ((mpz_ptr) p, (long) chars); break;
723 default: ASSERT (0); break;
/* a run of digits in the conversion is accumulated as the decimal field
   width */
737 case '0': case '1': case '2': case '3': case '4':
738 case '5': case '6': case '7': case '8': case '9':
741 param.width = param.width * 10 + (fchar-'0');
743 } while (isdigit (fchar));
744 fmt--; /* unget the non-digit */
752 /* something invalid in a % sequence */
/* release the scratch format buffer, passing the size it was allocated
   with */
760 (*__gmp_free_func) (alloc_fmt, alloc_fmt_size);