From fecfcc569a652a96437e9330d182f61f4e406a41 Mon Sep 17 00:00:00 2001 From: John Marino Date: Wed, 8 Aug 2012 23:22:08 +0200 Subject: [PATCH] cut: Sync with FreeBSD Addresses: http://bugs.dragonflybsd.org/issues/2385 reported-by: Tim Darby --- usr.bin/cut/cut.1 | 72 +++++++---- usr.bin/cut/cut.c | 316 +++++++++++++++++++++++++++++++++------------- 2 files changed, 278 insertions(+), 110 deletions(-) diff --git a/usr.bin/cut/cut.1 b/usr.bin/cut/cut.1 index cc33bd707e..42e10a3a19 100644 --- a/usr.bin/cut/cut.1 +++ b/usr.bin/cut/cut.1 @@ -12,10 +12,6 @@ .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. -.\" 3. All advertising materials mentioning features or use of this software -.\" must display the following acknowledgement: -.\" This product includes software developed by the University of -.\" California, Berkeley and its contributors. .\" 4. Neither the name of the University nor the names of its contributors .\" may be used to endorse or promote products derived from this software .\" without specific prior written permission. @@ -33,15 +29,14 @@ .\" SUCH DAMAGE. .\" .\" @(#)cut.1 8.1 (Berkeley) 6/6/93 -.\" $FreeBSD: src/usr.bin/cut/cut.1,v 1.9.2.5 2003/05/26 12:35:35 hmp Exp $ -.\" $DragonFly: src/usr.bin/cut/cut.1,v 1.4 2008/03/09 16:06:08 swildner Exp $ +.\" $FreeBSD$ .\" -.Dd June 6, 1993 +.Dd December 21, 2006 .Dt CUT 1 .Os .Sh NAME .Nm cut -.Nd select portions of each line of a file +.Nd cut out selected portions of each line of a file .Sh SYNOPSIS .Nm .Fl b Ar list @@ -52,13 +47,13 @@ .Op Ar .Nm .Fl f Ar list -.Op Fl w | Fl d Ar delim +.Op Fl d Ar delim .Op Fl s .Op Ar .Sh DESCRIPTION The .Nm -utility selects portions of each line (as specified by +utility cuts out selected portions of each line (as specified by .Ar list ) from each .Ar file @@ -73,7 +68,7 @@ The items specified by .Ar list can be in terms of column position or in terms of fields delimited by a special character. -Column numbering starts from 1. +Column and field numbering start from 1. .Pp The .Ar list @@ -83,14 +78,14 @@ number ranges. Number ranges consist of a number, a dash .Pq Sq \- , and a second number -and select the fields or columns from the first number to the second, +and select the columns or fields from the first number to the second, inclusive. Numbers or number ranges may be preceded by a dash, which selects all -fields or columns from 1 to the first number. +columns or fields from 1 to the last number. Numbers or number ranges may be followed by a dash, which selects all -fields or columns from the last number to the end of the line. +columns or fields from the last number to the end of the line. Numbers and number ranges may be repeated, overlapping, and in any order. -It is not an error to select fields or columns not present in the +It is not an error to select columns or fields not present in the input line. .Pp The options are as follows: @@ -104,31 +99,62 @@ The .Ar list specifies character positions. .It Fl d Ar delim -Use the first character of +Use .Ar delim as the field delimiter character instead of the tab character. .It Fl f Ar list The .Ar list -specifies fields, delimited in the input by a single tab character. -Output fields are separated by a single tab character. +specifies fields, separated in the input by the field delimiter character +(see the +.Fl d +option). +Output fields are separated by a single occurrence of the field delimiter +character. .It Fl n Do not split multi-byte characters. +Characters will only be output if at least one byte is selected, and, +after a prefix of zero or more unselected bytes, the rest of the bytes +that form the character are selected. .It Fl s Suppress lines with no field delimiter characters. Unless specified, lines with no delimiters are passed through unmodified. -.It Fl w -Use whitespace (spaces and tabs) as the delimiter. -Consecutive spaces and tabs count as one single field separator. .El -.Pp +.Sh ENVIRONMENT The +.Ev LANG , LC_ALL +and +.Ev LC_CTYPE +environment variables affect the execution of .Nm -utility exits with 0 on success or 1 if an error occurred. +as described in +.Xr environ 7 . +.Sh EXIT STATUS +.Ex -std +.Sh EXAMPLES +Extract users' login names and shells from the system +.Xr passwd 5 +file as +.Dq name:shell +pairs: +.Pp +.Dl "cut -d : -f 1,7 /etc/passwd" +.Pp +Show the names and login times of the currently logged in users: +.Pp +.Dl "who | cut -c 1-16,26-38" .Sh SEE ALSO +.Xr colrm 1 , .Xr paste 1 .Sh STANDARDS The .Nm utility conforms to .St -p1003.2-92 . +.Sh HISTORY +A +.Nm +command appeared in +.Tn AT&T +System III +.Ux . diff --git a/usr.bin/cut/cut.c b/usr.bin/cut/cut.c index 52edb75622..1da999d983 100644 --- a/usr.bin/cut/cut.c +++ b/usr.bin/cut/cut.c @@ -13,10 +13,6 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. @@ -36,69 +32,79 @@ * @(#) Copyright (c) 1989, 1993 The Regents of the University of California. All rights reserved. * @(#)cut.c 8.3 (Berkeley) 5/4/95 * $FreeBSD: src/usr.bin/cut/cut.c,v 1.9.2.3 2001/07/30 09:59:16 dd Exp $ - * $DragonFly: src/usr.bin/cut/cut.c,v 1.6 2008/03/09 16:05:39 swildner Exp $ */ #include #include +#include #include #include #include #include #include #include +#include -int cflag; -char dchar; -int dflag; -int fflag; -int sflag; -int wflag; +static int bflag; +static int cflag; +static wchar_t dchar; +static char dcharmb[MB_LEN_MAX + 1]; +static int dflag; +static int fflag; +static int nflag; +static int sflag; -static void c_cut(FILE *, const char *); -static void f_cut(FILE *, const char *); +static size_t autostart, autostop, maxval; +static char * positions; + +static int b_cut(FILE *, const char *); +static int b_n_cut(FILE *, const char *); +static int c_cut(FILE *, const char *); +static int f_cut(FILE *, const char *); static void get_list(char *); -static int is_delim(int); +static void needpos(size_t); static void usage(void); int -main(int argc, char **argv) +main(int argc, char *argv[]) { FILE *fp; - void (*fcn) (FILE *, const char *) = NULL; - int ch; + int (*fcn)(FILE *, const char *); + int ch, rval; + size_t n; - fcn = NULL; setlocale (LC_ALL, ""); + fcn = NULL; dchar = '\t'; /* default delimiter is \t */ + strcpy(dcharmb, "\t"); - /* Since we don't support multi-byte characters, the -c and -b - options are equivalent, and the -n option is meaningless. */ - while ((ch = getopt(argc, argv, "b:c:d:f:snw")) != -1) + while ((ch = getopt(argc, argv, "b:c:d:f:sn")) != -1) switch(ch) { case 'b': + get_list(optarg); + bflag = 1; + break; case 'c': - fcn = c_cut; get_list(optarg); cflag = 1; break; case 'd': - dchar = *optarg; + n = mbrtowc(&dchar, optarg, MB_LEN_MAX, NULL); + if (dchar == '\0' || n != strlen(optarg)) + errx(1, "bad delimiter"); + strcpy(dcharmb, optarg); dflag = 1; break; case 'f': get_list(optarg); - fcn = f_cut; fflag = 1; break; case 's': sflag = 1; break; case 'n': - break; - case 'w': - wflag = 1; + nflag = 1; break; case '?': default: @@ -108,27 +114,40 @@ main(int argc, char **argv) argv += optind; if (fflag) { - if (cflag || (wflag && dflag)) + if (bflag || cflag || nflag) + usage(); + } else if (!(bflag || cflag) || dflag || sflag) usage(); - } else if (!cflag || dflag || sflag || wflag) + else if (!bflag && nflag) usage(); + if (fflag) + fcn = f_cut; + else if (cflag) + fcn = MB_CUR_MAX > 1 ? c_cut : b_cut; + else if (bflag) + fcn = nflag && MB_CUR_MAX > 1 ? b_n_cut : b_cut; + + rval = 0; if (*argv) for (; *argv; ++argv) { - if (!(fp = fopen(*argv, "r"))) - err(1, "%s", *argv); + if (strcmp(*argv, "-") == 0) + rval |= fcn(stdin, "stdin"); + else { + if (!(fp = fopen(*argv, "r"))) { + warn("%s", *argv); + rval = 1; + continue; + } fcn(fp, *argv); (void)fclose(fp); } + } else - fcn(stdin, "stdin"); - exit(0); + rval = fcn(stdin, "stdin"); + exit(rval); } -size_t autostart, autostop, maxval; - -char positions[_POSIX2_LINE_MAX + 1]; - static void get_list(char *list) { @@ -139,10 +158,8 @@ get_list(char *list) /* * set a byte in the positions array to indicate if a field or * column is to be selected; use +1, it's 1-based, not 0-based. - * This parser is less restrictive than the Draft 9 POSIX spec. - * POSIX doesn't allow lists that aren't in increasing order or - * overlapping lists. We also handle "-3-5" although there's no - * real reason too. + * Numbers and number ranges may be overlapping, repeated, and in + * any order. We handle "-3-5" although there's no real reason to. */ for (; (p = strsep(&list, ", \t")) != NULL;) { setautostart = start = stop = 0; @@ -165,40 +182,58 @@ get_list(char *list) } } if (*p) - errx(1, "[-cf] list: illegal list value"); + errx(1, "[-bcf] list: illegal list value"); if (!stop || !start) - errx(1, "[-cf] list: values may not include zero"); - if (stop > _POSIX2_LINE_MAX) - errx(1, "[-cf] list: %ld too large (max %d)", - (long)stop, _POSIX2_LINE_MAX); - if (maxval < stop) + errx(1, "[-bcf] list: values may not include zero"); + if (maxval < stop) { maxval = stop; + needpos(maxval + 1); + } for (pos = positions + start; start++ <= stop; *pos++ = 1); } /* overlapping ranges */ - if (autostop && maxval > autostop) + if (autostop && maxval > autostop) { maxval = autostop; + needpos(maxval + 1); + } /* set autostart */ if (autostart) memset(positions + 1, '1', autostart); } -/* ARGSUSED */ static void -c_cut(FILE *fp, const char *fname) +needpos(size_t n) +{ + static size_t npos; + size_t oldnpos; + + /* Grow the positions array to at least the specified size. */ + if (n > npos) { + oldnpos = npos; + if (npos == 0) + npos = n; + while (n > npos) + npos *= 2; + if ((positions = realloc(positions, npos)) == NULL) + err(1, "realloc"); + memset((char *)positions + oldnpos, 0, npos - oldnpos); + } +} + +static int +b_cut(FILE *fp, const char *fname __unused) { int ch, col; char *pos; - fname = NULL; ch = 0; for (;;) { pos = positions + 1; for (col = maxval; col; --col) { if ((ch = getc(fp)) == EOF) - return; + return (0); if (ch == '\n') break; if (*pos++) @@ -213,32 +248,125 @@ c_cut(FILE *fp, const char *fname) } (void)putchar('\n'); } + return (0); } +/* + * Cut based on byte positions, taking care not to split multibyte characters. + * Although this function also handles the case where -n is not specified, + * b_cut() ought to be much faster. + */ static int -is_delim(int ch) +b_n_cut(FILE *fp, const char *fname) { - if (wflag) { - if (ch == ' ' || ch == '\t') - return 1; + size_t col, i, lbuflen; + char *lbuf; + int canwrite, clen, warned; + mbstate_t mbs; + + memset(&mbs, 0, sizeof(mbs)); + warned = 0; + while ((lbuf = fgetln(fp, &lbuflen)) != NULL) { + for (col = 0; lbuflen > 0; col += clen) { + if ((clen = mbrlen(lbuf, lbuflen, &mbs)) < 0) { + if (!warned) { + warn("%s", fname); + warned = 1; + } + memset(&mbs, 0, sizeof(mbs)); + clen = 1; + } + if (clen == 0 || *lbuf == '\n') + break; + if (col < maxval && !positions[1 + col]) { + /* + * Print the character if (1) after an initial + * segment of un-selected bytes, the rest of + * it is selected, and (2) the last byte is + * selected. + */ + i = col; + while (i < col + clen && i < maxval && + !positions[1 + i]) + i++; + canwrite = i < col + clen; + for (; i < col + clen && i < maxval; i++) + canwrite &= positions[1 + i]; + if (canwrite) + fwrite(lbuf, 1, clen, stdout); } else { - if (ch == dchar) - return 1; + /* + * Print the character if all of it has + * been selected. + */ + canwrite = 1; + for (i = col; i < col + clen; i++) + if ((i >= maxval && !autostop) || + (i < maxval && !positions[1 + i])) { + canwrite = 0; + break; + } + if (canwrite) + fwrite(lbuf, 1, clen, stdout); + } + lbuf += clen; + lbuflen -= clen; + } + if (lbuflen > 0) + putchar('\n'); } - return 0; + return (warned); } -static void -f_cut(FILE *fp, const char *fname __unused) +static int +c_cut(FILE *fp, const char *fname) +{ + wint_t ch; + int col; + char *pos; + + ch = 0; + for (;;) { + pos = positions + 1; + for (col = maxval; col; --col) { + if ((ch = getwc(fp)) == WEOF) + goto out; + if (ch == '\n') + break; + if (*pos++) + (void)putwchar(ch); + } + if (ch != '\n') { + if (autostop) + while ((ch = getwc(fp)) != WEOF && ch != '\n') + (void)putwchar(ch); + else + while ((ch = getwc(fp)) != WEOF && ch != '\n'); + } + (void)putwchar('\n'); + } +out: + if (ferror(fp)) { + warn("%s", fname); + return (1); + } + return (0); +} + +static int +f_cut(FILE *fp, const char *fname) { - int ch, field, isdelim; - char *pos, *p, sep; + wchar_t ch; + int field, i, isdelim; + char *pos, *p; + wchar_t sep; int output; - char *lbuf, *mlbuf = NULL; - size_t lbuflen; + char *lbuf, *mlbuf; + size_t clen, lbuflen, reallen; - sep = wflag ? ' ' : dchar; - while ((lbuf = fgetln(fp, &lbuflen)) != NULL) { + mlbuf = NULL; + for (sep = dchar; (lbuf = fgetln(fp, &lbuflen)) != NULL;) { + reallen = lbuflen; /* Assert EOL has a newline. */ if (*(lbuf + lbuflen - 1) != '\n') { /* Can't have > 1 line with no trailing newline. */ @@ -248,12 +376,20 @@ f_cut(FILE *fp, const char *fname __unused) memcpy(mlbuf, lbuf, lbuflen); *(mlbuf + lbuflen) = '\n'; lbuf = mlbuf; + reallen++; } output = 0; - for (isdelim = 0, p = lbuf;; ++p) { - ch = *p; + for (isdelim = 0, p = lbuf;; p += clen) { + clen = mbrtowc(&ch, p, lbuf + reallen - p, NULL); + if (clen == (size_t)-1 || clen == (size_t)-2) { + warnc(EILSEQ, "%s", fname); + free(mlbuf); + return (1); + } + if (clen == 0) + clen = 1; /* this should work if newline is delimiter */ - if (is_delim(ch)) + if (ch == sep) isdelim = 1; if (ch == '\n') { if (!isdelim && !sflag) @@ -266,20 +402,25 @@ f_cut(FILE *fp, const char *fname __unused) pos = positions + 1; for (field = maxval, p = lbuf; field; --field, ++pos) { - if (*pos) { - if (output++) - (void)putchar(sep); - while ((ch = *p++) != '\n' && !is_delim(ch)) - (void)putchar(ch); - /* compress whitespace */ - if (wflag && ch != '\n') - while (is_delim(*p)) p++; - } else { - while ((ch = *p++) != '\n' && !is_delim(ch)) - continue; - /* compress whitespace */ - if (wflag && ch != '\n') - while (is_delim(*p)) p++; + if (*pos && output++) + for (i = 0; dcharmb[i] != '\0'; i++) + putchar(dcharmb[i]); + for (;;) { + clen = mbrtowc(&ch, p, lbuf + reallen - p, + NULL); + if (clen == (size_t)-1 || clen == (size_t)-2) { + warnc(EILSEQ, "%s", fname); + free(mlbuf); + return (1); + } + if (clen == 0) + clen = 1; + p += clen; + if (ch == '\n' || ch == sep) + break; + if (*pos) + for (i = 0; i < (int)clen; i++) + putchar(p[i - clen]); } if (ch == '\n') break; @@ -287,7 +428,8 @@ f_cut(FILE *fp, const char *fname __unused) if (ch != '\n') { if (autostop) { if (output) - (void)putchar(sep); + for (i = 0; dcharmb[i] != '\0'; i++) + putchar(dcharmb[i]); for (; (ch = *p) != '\n'; ++p) (void)putchar(ch); } else @@ -295,8 +437,8 @@ f_cut(FILE *fp, const char *fname __unused) } (void)putchar('\n'); } - if (mlbuf != NULL) free(mlbuf); + return (0); } static void @@ -305,6 +447,6 @@ usage(void) (void)fprintf(stderr, "%s\n%s\n%s\n", "usage: cut -b list [-n] [file ...]", " cut -c list [file ...]", - " cut -f list [-s] [-w | -d delim] [file ...]"); + " cut -f list [-s] [-d delim] [file ...]"); exit(1); } -- 2.41.0