From: Peter Avalos Date: Sun, 3 Jul 2011 00:29:49 +0000 (-1000) Subject: sh: Add $'quoting' (C-style escape sequences). X-Git-Tag: v2.12.0~179 X-Git-Url: https://gitweb.dragonflybsd.org/~nant/dragonfly.git/commitdiff_plain/e1489450c38a053dad66768fc12706aa2968f646 sh: Add $'quoting' (C-style escape sequences). A string between $' and ' may contain backslash escape sequences similar to the ones in a C string constant (except that a single-quote must be escaped and a double-quote need not be). Details are in the sh(1) man page. This construct is useful to include unprintable characters, tabs and newlines in strings; while this can be done with a command substitution containing a printf command, that needs ugly workarounds if the result is to end with a newline as command substitution removes all trailing newlines. The construct may also be useful in future to describe unprintable characters without needing to write those characters themselves in 'set -x', 'export -p' and the like. The implementation attempts to comply with the proposal for the next issue of the POSIX specification. Because this construct is not in POSIX.1-2008, using it in scripts intended to be portable is unwise. Matching the minimal locale support in the rest of sh, the \u and \U sequences are currently not useful. Obtained-from: FreeBSD SVN rev 221513 --- diff --git a/bin/sh/mksyntax.c b/bin/sh/mksyntax.c index def16a3de4..59c3b07e7f 100644 --- a/bin/sh/mksyntax.c +++ b/bin/sh/mksyntax.c @@ -35,7 +35,7 @@ * * @(#) Copyright (c) 1991, 1993 The Regents of the University of California. All rights reserved. 
* @(#)mksyntax.c 8.2 (Berkeley) 5/4/95 - * $FreeBSD: src/bin/sh/mksyntax.c,v 1.34 2010/11/20 14:30:28 jilles Exp $ + * $FreeBSD: src/bin/sh/mksyntax.c,v 1.35 2011/05/05 20:55:55 jilles Exp $ */ /* @@ -58,6 +58,7 @@ struct synclass synclass[] = { { "CWORD", "character is nothing special" }, { "CNL", "newline character" }, { "CBACK", "a backslash character" }, + { "CSBACK", "a backslash character in single quotes" }, { "CSQUOTE", "single quote" }, { "CDQUOTE", "double quote" }, { "CENDQUOTE", "a terminating quote" }, @@ -218,6 +219,7 @@ main(int argc __unused, char **argv __unused) init(); fputs("\n/* syntax table used when in single quotes */\n", cfile); add("\n", "CNL"); + add("\\", "CSBACK"); add("'", "CENDQUOTE"); /* ':/' for tilde expansion, '-' for [a\-x] pattern ranges */ add("!*?[=~:/-", "CCTL"); diff --git a/bin/sh/parser.c b/bin/sh/parser.c index 7c7a2bfc77..40ffc24576 100644 --- a/bin/sh/parser.c +++ b/bin/sh/parser.c @@ -34,7 +34,7 @@ * SUCH DAMAGE. * * @(#)parser.c 8.7 (Berkeley) 5/16/95 - * $FreeBSD: src/bin/sh/parser.c,v 1.108 2011/05/04 21:49:34 jilles Exp $ + * $FreeBSD: src/bin/sh/parser.c,v 1.109 2011/05/05 20:55:55 jilles Exp $ */ #include @@ -1125,6 +1125,127 @@ done: } +/* + * Called to parse a backslash escape sequence inside $'...'. + * The backslash has already been read. 
+ */ +static char * +readcstyleesc(char *out) +{ + int c, v, i, n; + + c = pgetc(); + switch (c) { + case '\0': + synerror("Unterminated quoted string"); + case '\n': + plinno++; + if (doprompt) + setprompt(2); + else + setprompt(0); + return out; + case '\\': + case '\'': + case '"': + v = c; + break; + case 'a': v = '\a'; break; + case 'b': v = '\b'; break; + case 'e': v = '\033'; break; + case 'f': v = '\f'; break; + case 'n': v = '\n'; break; + case 'r': v = '\r'; break; + case 't': v = '\t'; break; + case 'v': v = '\v'; break; + case 'x': + v = 0; + for (;;) { + c = pgetc(); + if (c >= '0' && c <= '9') + v = (v << 4) + c - '0'; + else if (c >= 'A' && c <= 'F') + v = (v << 4) + c - 'A' + 10; + else if (c >= 'a' && c <= 'f') + v = (v << 4) + c - 'a' + 10; + else + break; + } + pungetc(); + break; + case '0': case '1': case '2': case '3': + case '4': case '5': case '6': case '7': + v = c - '0'; + c = pgetc(); + if (c >= '0' && c <= '7') { + v <<= 3; + v += c - '0'; + c = pgetc(); + if (c >= '0' && c <= '7') { + v <<= 3; + v += c - '0'; + } else + pungetc(); + } else + pungetc(); + break; + case 'c': + c = pgetc(); + if (c < 0x3f || c > 0x7a || c == 0x60) + synerror("Bad escape sequence"); + if (c == '\\' && pgetc() != '\\') + synerror("Bad escape sequence"); + if (c == '?') + v = 127; + else + v = c & 0x1f; + break; + case 'u': + case 'U': + n = c == 'U' ? 8 : 4; + v = 0; + for (i = 0; i < n; i++) { + c = pgetc(); + if (c >= '0' && c <= '9') + v = (v << 4) + c - '0'; + else if (c >= 'A' && c <= 'F') + v = (v << 4) + c - 'A' + 10; + else if (c >= 'a' && c <= 'f') + v = (v << 4) + c - 'a' + 10; + else + synerror("Bad escape sequence"); + } + if (v == 0 || (v >= 0xd800 && v <= 0xdfff)) + synerror("Bad escape sequence"); + /* We really need iconv here. */ + if (v > 127) + v = '?'; + break; + default: + synerror("Bad escape sequence"); + } + v = (char)v; + /* + * We can't handle NUL bytes. + * POSIX says we should skip till the closing quote. 
+ */ + if (v == '\0') { + while ((c = pgetc()) != '\'') { + if (c == '\\') + c = pgetc(); + if (c == PEOF) + synerror("Unterminated quoted string"); + } + pungetc(); + return out; + } + if (SQSYNTAX[v] == CCTL) + USTPUTC(CTLESC, out); + USTPUTC(v, out); + return out; +} + + /* * If eofmark is NULL, read a word or a redirection symbol. If eofmark * is not NULL, read a here document. In the latter case, eofmark is the @@ -1157,6 +1278,7 @@ readtoken1(int firstc, char const *initialsyntax, char *eofmark, int striptabs) struct tokenstate state_static[MAXNEST_STATIC]; int maxnest = MAXNEST_STATIC; struct tokenstate *state = state_static; + int sqiscstyle = 0; startlinno = plinno; quotef = 0; @@ -1187,6 +1309,12 @@ readtoken1(int firstc, char const *initialsyntax, char *eofmark, int striptabs) setprompt(0); c = pgetc(); goto loop; /* continue outer loop */ + case CSBACK: + if (sqiscstyle) { + out = readcstyleesc(out); + break; + } + /* FALLTHROUGH */ case CWORD: USTPUTC(c, out); break; @@ -1231,6 +1359,7 @@ readtoken1(int firstc, char const *initialsyntax, char *eofmark, int striptabs) case CSQUOTE: USTPUTC(CTLQUOTEMARK, out); state[level].syntax = SQSYNTAX; + sqiscstyle = 0; break; case CDQUOTE: USTPUTC(CTLQUOTEMARK, out); @@ -1449,11 +1578,7 @@ parsesub: { int c1; c = pgetc(); - if (c != '(' && c != '{' && (is_eof(c) || !is_name(c)) && - !is_special(c)) { - USTPUTC('$', out); - pungetc(); - } else if (c == '(') { /* $(command) or $((arith)) */ + if (c == '(') { /* $(command) or $((arith)) */ if (pgetc() == '(') { PARSEARITH(); } else { @@ -1464,7 +1589,7 @@ parsesub: { state[level].syntax == DQSYNTAX || state[level].syntax == ARISYNTAX); } - } else { + } else if (c == '{' || is_name(c) || is_special(c)) { USTPUTC(CTLVAR, out); typeloc = out - stackblock(); USTPUTC(VSNORMAL, out); @@ -1611,6 +1736,14 @@ varname: newvarnest++; } } + } else if (c == '\'' && state[level].syntax == BASESYNTAX) { + /* $'cstylequotes' */ + USTPUTC(CTLQUOTEMARK, out); + state[level].syntax = 
SQSYNTAX; + sqiscstyle = 1; + } else { + USTPUTC('$', out); + pungetc(); } goto parsesub_return; } diff --git a/bin/sh/sh.1 b/bin/sh/sh.1 index 7262ab9ee1..cfcd3c48d9 100644 --- a/bin/sh/sh.1 +++ b/bin/sh/sh.1 @@ -34,9 +34,9 @@ .\" SUCH DAMAGE. .\" .\" from: @(#)sh.1 8.6 (Berkeley) 5/4/95 -.\" $FreeBSD: src/bin/sh/sh.1,v 1.160 2011/03/20 23:52:45 jilles Exp $ +.\" $FreeBSD: src/bin/sh/sh.1,v 1.161 2011/05/05 20:55:55 jilles Exp $ .\" -.Dd March 20, 2011 +.Dd July 2, 2011 .Dt SH 1 .Os .Sh NAME @@ -407,13 +407,82 @@ Quoting is used to remove the special meaning of certain characters or words to the shell, such as operators, whitespace, keywords, or alias names. .Pp -There are three types of quoting: matched single quotes, +There are four types of quoting: matched single quotes, +dollar-single quotes, matched double quotes, and backslash. .Bl -tag -width indent .It Single Quotes Enclosing characters in single quotes preserves the literal meaning of all the characters (except single quotes, making it impossible to put single-quotes in a single-quoted string). +.It Dollar-Single Quotes +Enclosing characters between +.Li $' +and +.Li ' +preserves the literal meaning of all characters +except backslashes and single quotes. +A backslash introduces a C-style escape sequence: +.Bl -tag -width xUnnnnnnnn +.It \ea +Alert (ring the terminal bell) +.It \eb +Backspace +.It \ec Ns Ar c +The control character denoted by +.Li ^ Ns Ar c +in +.Xr stty 1 . +If +.Ar c +is a backslash, it must be doubled. 
+.It \ee +The ESC character +.Tn ( ASCII +0x1b) +.It \ef +Formfeed +.It \en +Newline +.It \er +Carriage return +.It \et +Horizontal tab +.It \ev +Vertical tab +.It \e\e +Literal backslash +.It \e\&' +Literal single-quote +.It \e\&" +Literal double-quote +.It \e Ns Ar nnn +The byte whose octal value is +.Ar nnn +(one to three digits) +.It \ex Ns Ar nn +The byte whose hexadecimal value is +.Ar nn +(one or more digits only the last two of which are used) +.It \eu Ns Ar nnnn +The Unicode code point +.Ar nnnn +(four hexadecimal digits) +.It \eU Ns Ar nnnnnnnn +The Unicode code point +.Ar nnnnnnnn +(eight hexadecimal digits) +.El +.Pp +The sequences for Unicode code points currently only provide useful results +for values below 128. +They reject code point 0 and UTF-16 surrogates. +.Pp +If an escape sequence would produce a byte with value 0, +that byte and the rest of the string until the matching single-quote +are ignored. +.Pp +Any other string starting with a backslash is an error. .It Double Quotes Enclosing characters within double quotes preserves the literal meaning of all characters except dollar sign diff --git a/tools/regression/bin/sh/parser/dollar-quote1.0 b/tools/regression/bin/sh/parser/dollar-quote1.0 new file mode 100644 index 0000000000..a1db85d9d7 --- /dev/null +++ b/tools/regression/bin/sh/parser/dollar-quote1.0 @@ -0,0 +1,12 @@ +# $FreeBSD: src/tools/regression/bin/sh/parser/dollar-quote1.0,v 1.2 2011/06/07 08:46:13 attilio Exp $ + +set -e + +[ $'hi' = hi ] +[ $'hi +there' = 'hi +there' ] +[ $'\"\'\\\a\b\f\t\v' = "\"'\\$(printf "\a\b\f\t\v")" ] +[ $'hi\nthere' = 'hi +there' ] +[ $'a\rb' = "$(printf "a\rb")" ] diff --git a/tools/regression/bin/sh/parser/dollar-quote2.0 b/tools/regression/bin/sh/parser/dollar-quote2.0 new file mode 100644 index 0000000000..080987f6e5 --- /dev/null +++ b/tools/regression/bin/sh/parser/dollar-quote2.0 @@ -0,0 +1,5 @@ +# $FreeBSD: src/tools/regression/bin/sh/parser/dollar-quote2.0,v 1.2 2011/06/07 08:46:13 attilio Exp $ + 
+# This depends on the ASCII character set. + +[ $'\e' = "$(printf "\033")" ] diff --git a/tools/regression/bin/sh/parser/dollar-quote3.0 b/tools/regression/bin/sh/parser/dollar-quote3.0 new file mode 100644 index 0000000000..7b501ab493 --- /dev/null +++ b/tools/regression/bin/sh/parser/dollar-quote3.0 @@ -0,0 +1,22 @@ +# $FreeBSD: src/tools/regression/bin/sh/parser/dollar-quote3.0,v 1.2 2011/06/07 08:46:13 attilio Exp $ + +unset LC_ALL +LC_CTYPE=en_US.ISO8859-1 +export LC_CTYPE + +e= +for i in 0 1 2 3; do + for j in 0 1 2 3 4 5 6 7; do + for k in 0 1 2 3 4 5 6 7; do + case $i$j$k in + 000) continue ;; + esac + e="$e\\$i$j$k" + done + done +done +ee=`printf "$e"` +[ "${#ee}" = 255 ] || echo length bad + +# Start a new shell so the locale change is picked up. +[ "$(${SH} -c "printf %s \$'$e'")" = "$ee" ] diff --git a/tools/regression/bin/sh/parser/dollar-quote4.0 b/tools/regression/bin/sh/parser/dollar-quote4.0 new file mode 100644 index 0000000000..9bde4a5836 --- /dev/null +++ b/tools/regression/bin/sh/parser/dollar-quote4.0 @@ -0,0 +1,19 @@ +# $FreeBSD: src/tools/regression/bin/sh/parser/dollar-quote4.0,v 1.2 2011/06/07 08:46:13 attilio Exp $ + +unset LC_ALL +LC_CTYPE=en_US.ISO8859-1 +export LC_CTYPE + +e= +for i in 0 1 2 3 4 5 6 7 8 9 a b c d e f; do + for j in 0 1 2 3 4 5 6 7 8 9 a b c d e f; do + case $i$j in + 00) continue ;; + esac + e="$e\x$i$j" + done +done + +# Start a new shell so the locale change is picked up. +ee="$(${SH} -c "printf %s \$'$e'")" +[ "${#ee}" = 255 ] || echo length bad diff --git a/tools/regression/bin/sh/parser/dollar-quote5.0 b/tools/regression/bin/sh/parser/dollar-quote5.0 new file mode 100644 index 0000000000..4aa11331e0 --- /dev/null +++ b/tools/regression/bin/sh/parser/dollar-quote5.0 @@ -0,0 +1,12 @@ +# $FreeBSD: src/tools/regression/bin/sh/parser/dollar-quote5.0,v 1.2 2011/06/07 08:46:13 attilio Exp $ + +# This depends on the ASCII character set. 
+ +set -e + +[ $'\ca\cb\cc\cd\ce\cf\cg\ch\ci\cj\ck\cl\cm\cn\co\cp\cq\cr\cs\ct\cu\cv\cw\cx\cy\cz' = $'\001\002\003\004\005\006\007\010\011\012\013\014\015\016\017\020\021\022\023\024\025\026\027\030\031\032' ] +[ $'\cA\cB\cC\cD\cE\cF\cG\cH\cI\cJ\cK\cL\cM\cN\cO\cP\cQ\cR\cS\cT\cU\cV\cW\cX\cY\cZ' = $'\001\002\003\004\005\006\007\010\011\012\013\014\015\016\017\020\021\022\023\024\025\026\027\030\031\032' ] +[ $'\c[' = $'\033' ] +[ $'\c]' = $'\035' ] +[ $'\c^' = $'\036' ] +[ $'\c_' = $'\037' ] diff --git a/tools/regression/bin/sh/parser/dollar-quote6.0 b/tools/regression/bin/sh/parser/dollar-quote6.0 new file mode 100644 index 0000000000..b8995fb562 --- /dev/null +++ b/tools/regression/bin/sh/parser/dollar-quote6.0 @@ -0,0 +1,5 @@ +# $FreeBSD: src/tools/regression/bin/sh/parser/dollar-quote6.0,v 1.2 2011/06/07 08:46:13 attilio Exp $ + +# This depends on the ASCII character set. + +[ $'\c\\' = $'\034' ] diff --git a/tools/regression/bin/sh/parser/dollar-quote7.0 b/tools/regression/bin/sh/parser/dollar-quote7.0 new file mode 100644 index 0000000000..31a3dcd818 --- /dev/null +++ b/tools/regression/bin/sh/parser/dollar-quote7.0 @@ -0,0 +1,6 @@ +# $FreeBSD: src/tools/regression/bin/sh/parser/dollar-quote7.0,v 1.2 2011/06/07 08:46:13 attilio Exp $ + +set -e + +[ $'\u0024\u0040\u0060' = '$@`' ] +[ $'\U00000024\U00000040\U00000060' = '$@`' ] diff --git a/tools/regression/bin/sh/parser/dollar-quote8.0 b/tools/regression/bin/sh/parser/dollar-quote8.0 new file mode 100644 index 0000000000..ea81fbf3db --- /dev/null +++ b/tools/regression/bin/sh/parser/dollar-quote8.0 @@ -0,0 +1,11 @@ +# $FreeBSD: src/tools/regression/bin/sh/parser/dollar-quote8.0,v 1.2 2011/06/07 08:46:13 attilio Exp $ + +[ $'hello\0' = hello ] +[ $'hello\0world' = hello ] +[ $'hello\0'$'world' = helloworld ] +[ $'hello\000' = hello ] +[ $'hello\000world' = hello ] +[ $'hello\000'$'world' = helloworld ] +[ $'hello\x00' = hello ] +[ $'hello\x00world' = hello ] +[ $'hello\x00'$'world' = helloworld ] diff --git 
a/tools/regression/bin/sh/parser/dollar-quote9.0 b/tools/regression/bin/sh/parser/dollar-quote9.0 new file mode 100644 index 0000000000..06bacd7dc5 --- /dev/null +++ b/tools/regression/bin/sh/parser/dollar-quote9.0 @@ -0,0 +1,8 @@ +# $FreeBSD: src/tools/regression/bin/sh/parser/dollar-quote9.0,v 1.2 2011/06/07 08:46:13 attilio Exp $ + +# POSIX and C99 say D800-DFFF are undefined in a universal character name. +# We reject this but many other shells expand to something that looks like +# CESU-8. + +v=$( (eval ": \$'\uD800'") 2>&1 >/dev/null) +[ $? -ne 0 ] && [ -n "$v" ]