2 * field.c - routines for dealing with fields and record parsing
6 * Copyright (C) 1986, 1988, 1989, 1991-2000 the Free Software Foundation, Inc.
8 * This file is part of GAWK, the GNU implementation of the
9 * AWK Programming Language.
11 * GAWK is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
16 * GAWK is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
25 * $FreeBSD: src/contrib/awk/field.c,v 1.4.2.1 2001/01/23 22:08:31 asmodai Exp $
26 * $DragonFly: src/contrib/awk/Attic/field.c,v 1.2 2003/06/17 04:23:58 dillon Exp $
31 typedef void (* Setfunc) P((long, char *, long, NODE *));
33 static long (*parse_field) P((long, char **, int, NODE *,
34 Regexp *, Setfunc, NODE *));
35 static void rebuild_record P((void));
36 static long re_parse_field P((long, char **, int, NODE *,
37 Regexp *, Setfunc, NODE *));
38 static long def_parse_field P((long, char **, int, NODE *,
39 Regexp *, Setfunc, NODE *));
40 static long posix_def_parse_field P((long, char **, int, NODE *,
41 Regexp *, Setfunc, NODE *));
42 static long null_parse_field P((long, char **, int, NODE *,
43 Regexp *, Setfunc, NODE *));
44 static long sc_parse_field P((long, char **, int, NODE *,
45 Regexp *, Setfunc, NODE *));
46 static long fw_parse_field P((long, char **, int, NODE *,
47 Regexp *, Setfunc, NODE *));
48 static void set_element P((long num, char * str, long len, NODE *arr));
49 static void grow_fields_arr P((long num));
50 static void set_field P((long num, char *str, long len, NODE *dummy));
53 static char *parse_extent; /* marks where to restart parse of record */
54 static long parse_high_water = 0; /* field number that we have parsed so far */
55 static long nf_high_water = 0; /* size of fields_arr */
57 static NODE *save_FS; /* save current value of FS when line is read,
58 * to be used in deferred parsing
60 static int *FIELDWIDTHS = NULL;
62 NODE **fields_arr; /* array of pointers to the field nodes */
63 int field0_valid; /* $(>0) has not been changed yet */
64 int default_FS; /* TRUE when FS == " " */
65 Regexp *FS_regexp = NULL;
66 static NODE *Null_field = NULL;
68 /* using_FIELDWIDTHS --- static function, macro to avoid overhead */
69 #define using_FIELDWIDTHS() (parse_field == fw_parse_field)
71 /* init_fields --- set up the fields array to start with */
78 emalloc(fields_arr, NODE **, sizeof(NODE *), "init_fields");
81 n->flags |= (SCALAR|FIELD);
84 parse_extent = fields_arr[0]->stptr;
85 save_FS = dupnode(FS_node->var_value);
87 *Null_field = *Nnull_string;
88 Null_field->flags |= (SCALAR|FIELD);
89 Null_field->flags &= ~(NUM|NUMBER|MAYBE_NUM|PERM);
93 /* grow_fields_arr --- acquire new fields as needed */
102 erealloc(fields_arr, NODE **, (num + 1) * sizeof(NODE *), "grow_fields_arr");
103 for (t = nf_high_water + 1; t <= num; t++) {
111 /* set_field --- set the value of a particular field */
115 set_field(num, str, len, dummy)
119 NODE *dummy; /* not used -- just to make interface same as set_element */
123 if (num > nf_high_water)
124 grow_fields_arr(num);
128 n->flags = (STR|STRING|MAYBE_NUM|SCALAR|FIELD);
131 /* rebuild_record --- Someone assigned a value to $(something).
132 Fix up $0 to be right */
138 * use explicit unsigned longs for lengths, in case
139 * a size_t isn't big enough.
141 register unsigned long tlen;
142 register unsigned long ofslen;
152 ofs = force_string(OFS_node->var_value);
154 for (i = NF; i > 0; i--) {
156 tmp = force_string(tmp);
159 tlen += (NF - 1) * ofslen;
162 emalloc(ops, char *, tlen + 2, "rebuild_record");
165 for (i = 1; i <= NF; i++) {
169 *cops++ = tmp->stptr[0];
170 else if (tmp->stlen != 0) {
171 memcpy(cops, tmp->stptr, tmp->stlen);
177 *cops++ = ofs->stptr[0];
178 else if (ofslen != 0) {
179 memcpy(cops, ofs->stptr, ofslen);
184 tmp = make_str_node(ops, tlen, ALREADY_MALLOCED);
187 * Since we are about to unref fields_arr[0], we want to find
188 * any fields that still point into it, and have them point
189 * into the new field zero.
191 for (cops = ops, i = 1; i <= NF; i++) {
192 if (fields_arr[i]->stlen > 0) {
196 if ((fields_arr[i]->flags & FIELD) == 0) {
198 n->stlen = fields_arr[i]->stlen;
199 if ((fields_arr[i]->flags & (NUM|NUMBER)) != 0) {
200 n->flags |= (fields_arr[i]->flags & (NUM|NUMBER));
201 n->numbr = fields_arr[i]->numbr;
204 *n = *(fields_arr[i]);
205 n->flags &= ~(MALLOC|TEMP|PERM|STRING);
209 unref(fields_arr[i]);
212 cops += fields_arr[i]->stlen + ofslen;
215 unref(fields_arr[0]);
223 * setup $0, but defer parsing rest of line until reference is made to $(>0)
224 * or to NF. At that point, parse only as much as necessary.
226 * Manage a private buffer for the contents of $0. Doing so keeps us safe
227 * if `getline var' decides to rearrange the contents of the IOBUF that
228 * $0 might have been pointing into. The cost is the copying of the buffer;
229 * but better correct than fast.
232 set_record(buf, cnt, freeold)
233 char *buf; /* ignored if ! freeold */
234 int cnt; /* ignored if ! freeold */
239 static char *databuf;
240 static unsigned long databuf_size;
241 #define INITIAL_SIZE 512
242 #define MAX_SIZE ((unsigned long) ~0) /* maximally portable ... */
245 for (i = 1; i <= parse_high_water; i++) {
246 unref(fields_arr[i]);
252 parse_high_water = 0;
254 * $0 = $0 should resplit using the current value of FS, thus,
255 * this is executed orthogonally to the value of freeold.
260 save_FS = dupnode(FS_node->var_value);
263 /* buffer management: */
264 if (databuf_size == 0) { /* first time */
265 emalloc(databuf, char *, INITIAL_SIZE, "set_record");
266 databuf_size = INITIAL_SIZE;
269 * Make sure there's enough room. Since we sometimes need
270 * to place a sentinel at the end, we make sure
271 * databuf_size is > cnt after allocation.
273 if (cnt >= databuf_size) {
274 while (cnt >= databuf_size && databuf_size <= MAX_SIZE)
276 erealloc(databuf, char *, databuf_size, "set_record");
279 memcpy(databuf, buf, cnt);
281 /* manage field 0: */
282 unref(fields_arr[0]);
289 n->flags = (STRING|STR|MAYBE_NUM|SCALAR|FIELD);
292 fields_arr[0]->flags |= MAYBE_NUM;
299 /* reset_record --- start over again with current $0 */
304 (void) force_string(fields_arr[0]);
305 set_record(fields_arr[0]->stptr, fields_arr[0]->stlen, FALSE);
308 /* set_NF --- handle what happens to $0 and fields when NF is changed */
318 NF = (long) force_number(NF_node->var_value);
319 if (NF > nf_high_water)
321 if (parse_high_water < NF) {
322 for (i = parse_high_water + 1; i <= NF; i++) {
323 unref(fields_arr[i]);
328 } else if (parse_high_water > 0) {
329 for (i = NF + 1; i <= parse_high_water; i++) {
330 unref(fields_arr[i]);
335 parse_high_water = NF;
337 field0_valid = FALSE;
341 * re_parse_field --- parse fields using a regexp.
343 * This is called both from get_field() and from do_split()
344 * via (*parse_field)(). This variation is for when FS is a regular
345 * expression -- either user-defined or because RS=="" and FS==" "
348 re_parse_field(up_to, buf, len, fs, rp, set, n)
349 long up_to; /* parse only up to this field number */
350 char **buf; /* on input: string to parse; on output: point to start next */
354 Setfunc set; /* routine to set the value of the parsed field */
357 register char *scan = *buf;
358 register long nf = parse_high_water;
359 register char *field;
360 register char *end = scan + len;
367 if (RS_is_null && default_FS)
368 while (scan < end && (*scan == ' ' || *scan == '\t' || *scan == '\n'))
372 && research(rp, scan, 0, (end - scan), TRUE) != -1
374 if (REEND(rp, scan) == RESTART(rp, scan)) { /* null match */
377 (*set)(++nf, field, (long)(scan - field), n);
384 (long)(scan + RESTART(rp, scan) - field), n);
385 scan += REEND(rp, scan);
387 if (scan == end) /* FS at end of record */
388 (*set)(++nf, field, 0L, n);
390 if (nf != up_to && scan < end) {
391 (*set)(++nf, scan, (long)(end - scan), n);
399 * def_parse_field --- default field parsing.
401 * This is called both from get_field() and from do_split()
402 * via (*parse_field)(). This variation is for when FS is a single space
407 def_parse_field(up_to, buf, len, fs, rp, set, n)
408 long up_to; /* parse only up to this field number */
409 char **buf; /* on input: string to parse; on output: point to start next */
413 Setfunc set; /* routine to set the value of the parsed field */
416 register char *scan = *buf;
417 register long nf = parse_high_water;
418 register char *field;
419 register char *end = scan + len;
428 * Nasty special case. If FS set to "", return whole record
429 * as first field. This is not worth a separate function.
431 if (fs->stlen == 0) {
432 (*set)(++nf, *buf, len, n);
437 /* before doing anything save the char at *end */
439 /* because it will be destroyed now: */
441 *end = ' '; /* sentinel character */
442 for (; nf < up_to; scan++) {
444 * special case: fs is single space, strip leading whitespace
446 while (scan < end && (*scan == ' ' || *scan == '\t' || *scan == '\n'))
451 while (*scan != ' ' && *scan != '\t' && *scan != '\n')
453 (*set)(++nf, field, (long)(scan - field), n);
458 /* everything done, restore original char at *end */
466 * posix_def_parse_field --- default field parsing.
468 * This is called both from get_field() and from do_split()
469 * via (*parse_field)(). This variation is for when FS is a single space
470 * character. The only difference between this and def_parse_field()
471 * is that this one does not allow newlines to separate fields.
475 posix_def_parse_field(up_to, buf, len, fs, rp, set, n)
476 long up_to; /* parse only up to this field number */
477 char **buf; /* on input: string to parse; on output: point to start next */
481 Setfunc set; /* routine to set the value of the parsed field */
484 register char *scan = *buf;
485 register long nf = parse_high_water;
486 register char *field;
487 register char *end = scan + len;
496 * Nasty special case. If FS set to "", return whole record
497 * as first field. This is not worth a separate function.
499 if (fs->stlen == 0) {
500 (*set)(++nf, *buf, len, n);
505 /* before doing anything save the char at *end */
507 /* because it will be destroyed now: */
509 *end = ' '; /* sentinel character */
510 for (; nf < up_to; scan++) {
512 * special case: fs is single space, strip leading whitespace
514 while (scan < end && (*scan == ' ' || *scan == '\t'))
519 while (*scan != ' ' && *scan != '\t')
521 (*set)(++nf, field, (long)(scan - field), n);
526 /* everything done, restore original char at *end */
534 * null_parse_field --- each character is a separate field
536 * This is called both from get_field() and from do_split()
537 * via (*parse_field)(). This variation is for when FS is the null string.
540 null_parse_field(up_to, buf, len, fs, rp, set, n)
541 long up_to; /* parse only up to this field number */
542 char **buf; /* on input: string to parse; on output: point to start next */
546 Setfunc set; /* routine to set the value of the parsed field */
549 register char *scan = *buf;
550 register long nf = parse_high_water;
551 register char *end = scan + len;
558 for (; nf < up_to && scan < end; scan++)
559 (*set)(++nf, scan, 1L, n);
566 * sc_parse_field --- single character field separator
568 * This is called both from get_field() and from do_split()
569 * via (*parse_field)(). This variation is for when FS is a single character
573 sc_parse_field(up_to, buf, len, fs, rp, set, n)
574 long up_to; /* parse only up to this field number */
575 char **buf; /* on input: string to parse; on output: point to start next */
579 Setfunc set; /* routine to set the value of the parsed field */
582 register char *scan = *buf;
583 register char fschar;
584 register long nf = parse_high_water;
585 register char *field;
586 register char *end = scan + len;
595 if (RS_is_null && fs->stlen == 0)
598 fschar = fs->stptr[0];
600 onecase = (IGNORECASE && isalpha(fschar));
602 fschar = casetable[(int) fschar];
604 /* before doing anything save the char at *end */
606 /* because it will be destroyed now: */
607 *end = fschar; /* sentinel character */
609 for (; nf < up_to;) {
612 while (casetable[(int) *scan] != fschar)
615 while (*scan != fschar)
618 (*set)(++nf, field, (long)(scan - field), n);
622 if (scan == end) { /* FS at end of record */
623 (*set)(++nf, field, 0L, n);
628 /* everything done, restore original char at *end */
636 * fw_parse_field --- field parsing using FIELDWIDTHS spec
638 * This is called both from get_field() and from do_split()
639 * via (*parse_field)(). This variation is for when fields have fixed widths.
642 fw_parse_field(up_to, buf, len, fs, rp, set, n)
643 long up_to; /* parse only up to this field number */
644 char **buf; /* on input: string to parse; on output: point to start next */
648 Setfunc set; /* routine to set the value of the parsed field */
651 register char *scan = *buf;
652 register long nf = parse_high_water;
653 register char *end = scan + len;
659 for (; nf < up_to && (len = FIELDWIDTHS[nf+1]) != -1; ) {
660 if (len > end - scan)
662 (*set)(++nf, scan, (long) len, n);
672 /* get_field --- return a particular $n */
675 get_field(requested, assign)
676 register long requested;
677 Func_ptr *assign; /* this field is on the LHS of an assign */
680 * if requesting whole line but some other field has been altered,
681 * then the whole line must be rebuilt
683 if (requested == 0) {
684 if (! field0_valid) {
685 /* first, parse remainder of input record */
687 NF = (*parse_field)(HUGE-1, &parse_extent,
688 fields_arr[0]->stlen -
689 (parse_extent - fields_arr[0]->stptr),
690 save_FS, FS_regexp, set_field,
692 parse_high_water = NF;
697 *assign = reset_record;
698 return &fields_arr[0];
701 /* assert(requested > 0); */
704 field0_valid = FALSE; /* $0 needs reconstruction */
706 if (requested <= parse_high_water) /* already parsed this field */
707 return &fields_arr[requested];
709 if (NF == -1) { /* have not yet parsed to end of record */
711 * parse up to requested fields, calling set_field() for each,
712 * saving in parse_extent the point where the parse left off
714 if (parse_high_water == 0) /* starting at the beginning */
715 parse_extent = fields_arr[0]->stptr;
716 parse_high_water = (*parse_field)(requested, &parse_extent,
717 fields_arr[0]->stlen - (parse_extent - fields_arr[0]->stptr),
718 save_FS, FS_regexp, set_field, (NODE *) NULL);
721 * if we reached the end of the record, set NF to the number of
722 * fields so far. Note that requested might actually refer to
723 * a field that is beyond the end of the record, but we won't
724 * set NF to that value at this point, since this is only a
725 * reference to the field and NF only gets set if the field
726 * is assigned to -- this case is handled below
728 if (parse_extent == fields_arr[0]->stptr + fields_arr[0]->stlen)
729 NF = parse_high_water;
730 if (requested == HUGE-1) /* HUGE-1 means set NF */
731 requested = parse_high_water;
733 if (parse_high_water < requested) { /* requested beyond end of record */
734 if (assign != NULL) { /* expand record */
735 if (requested > nf_high_water)
736 grow_fields_arr(requested);
739 parse_high_water = requested;
744 return &fields_arr[requested];
747 /* set_element --- set an array element, used by do_split() */
750 set_element(num, s, len, n)
758 it = make_string(s, len);
759 it->flags |= MAYBE_NUM;
760 *assoc_lookup(n, tmp_number((AWKNUM) (num))) = it;
763 /* do_split --- implement split(), semantics are same as for field splitting */
769 NODE *src, *arr, *sep, *tmp;
772 long (*parseit) P((long, char **, int, NODE *,
773 Regexp *, Setfunc, NODE *));
777 * do dupnode(), to avoid problems like
778 * x = split(a[1], a, "blah")
779 * since we assoc_clear the array. gack.
780 * this also gives us complete call by value semantics.
782 tmp = tree_eval(tree->lnode);
786 arr = tree->rnode->lnode;
787 if (tree->rnode->rnode != NULL)
788 sep = tree->rnode->rnode->lnode; /* 3rd arg */
792 (void) force_string(src);
794 if (arr->type == Node_param_list)
795 arr = stack_ptr[arr->param_cnt];
796 if (arr->type == Node_array_ref)
797 arr = arr->orig_array;
798 if (arr->type != Node_var && arr->type != Node_var_array)
799 fatal("second argument of split is not an array");
800 arr->type = Node_var_array;
803 if ((sep->re_flags & FS_DFLT) != 0 && ! using_FIELDWIDTHS()) {
804 parseit = parse_field;
805 fs = force_string(FS_node->var_value);
808 tmp = force_string(tree_eval(sep->re_exp));
810 parseit = null_parse_field;
811 else if (tmp->stlen == 1 && (sep->re_flags & CONST) == 0) {
812 if (tmp->stptr[0] == ' ') {
814 parseit = posix_def_parse_field;
816 parseit = def_parse_field;
818 parseit = sc_parse_field;
820 parseit = re_parse_field;
827 tmp = tmp_number((AWKNUM) (*parseit)(HUGE, &s, (int) src->stlen,
828 fs, rp, set_element, arr));
834 /* set_FIELDWIDTHS --- handle an assignment to FIELDWIDTHS */
842 static int fw_alloc = 1;
843 static int warned = FALSE;
844 extern double strtod();
846 if (do_lint && ! warned) {
848 warning("use of FIELDWIDTHS is a gawk extension");
850 if (do_traditional) /* quick and dirty, does the trick */
854 * If changing the way fields are split, obey least-surprise
855 * semantics, and force $0 to be split totally.
857 if (fields_arr != NULL)
858 (void) get_field(HUGE - 1, 0);
860 parse_field = fw_parse_field;
861 scan = force_string(FIELDWIDTHS_node->var_value)->stptr;
863 if (FIELDWIDTHS == NULL)
864 emalloc(FIELDWIDTHS, int *, fw_alloc * sizeof(int), "set_FIELDWIDTHS");
869 erealloc(FIELDWIDTHS, int *, fw_alloc * sizeof(int), "set_FIELDWIDTHS");
871 FIELDWIDTHS[i] = (int) strtod(scan, &end);
880 set_FS_if_not_FIELDWIDTHS()
882 if (parse_field != fw_parse_field)
886 /* set_FS --- handle things when FS is assigned to */
893 static NODE *save_fs = NULL;
894 static NODE *save_rs = NULL;
897 * If changing the way fields are split, obey least-surprise
898 * semantics, and force $0 to be split totally.
900 if (fields_arr != NULL)
901 (void) get_field(HUGE - 1, 0);
903 if (! (save_fs && cmp_nodes(FS_node->var_value, save_fs) == 0
904 && save_rs && cmp_nodes(RS_node->var_value, save_rs) == 0)) {
906 save_fs = dupnode(FS_node->var_value);
908 save_rs = dupnode(RS_node->var_value);
917 fs = force_string(FS_node->var_value);
918 if (! do_traditional && fs->stlen == 0)
919 parse_field = null_parse_field;
920 else if (fs->stlen > 1)
921 parse_field = re_parse_field;
922 else if (RS_is_null) {
923 parse_field = sc_parse_field;
924 if (fs->stlen == 1) {
925 if (fs->stptr[0] == ' ') {
927 strcpy(buf, "[ \t\n]+");
928 } else if (fs->stptr[0] != '\n')
929 sprintf(buf, "[%c\n]", fs->stptr[0]);
933 parse_field = posix_def_parse_field;
935 parse_field = def_parse_field;
936 if (fs->stptr[0] == ' ' && fs->stlen == 1)
938 else if (fs->stptr[0] != ' ' && fs->stlen == 1) {
939 if (! IGNORECASE || ! isalpha(fs->stptr[0]))
940 parse_field = sc_parse_field;
941 else if (fs->stptr[0] == '\\')
942 /* yet another special case */
943 strcpy(buf, "[\\\\]");
945 sprintf(buf, "[%c]", fs->stptr[0]);
948 if (buf[0] != '\0') {
949 FS_regexp = make_regexp(buf, strlen(buf), IGNORECASE, TRUE);
950 parse_field = re_parse_field;
951 } else if (parse_field == re_parse_field) {
952 FS_regexp = make_regexp(fs->stptr, fs->stlen, IGNORECASE, TRUE);
957 /* using_fieldwidths --- is FS or FIELDWIDTHS in use? */
962 return using_FIELDWIDTHS();