sh: Add $'quoting' (C-style escape sequences).
author Peter Avalos <pavalos@dragonflybsd.org>
Sun, 3 Jul 2011 00:29:49 +0000 (14:29 -1000)
committer Peter Avalos <pavalos@dragonflybsd.org>
Sun, 21 Aug 2011 19:10:25 +0000 (12:10 -0700)
A string between $' and ' may contain backslash escape sequences similar
to the ones in a C string constant (except that a single-quote must be
escaped and a double-quote need not be). Details are in the sh(1) man
page.

This construct is useful to include unprintable characters, tabs and
newlines in strings; while this can be done with a command substitution
containing a printf command, that needs ugly workarounds if the result
is to end with a newline as command substitution removes all trailing
newlines.

The construct may also be useful in future to describe unprintable
characters without needing to write those characters themselves in 'set
-x', 'export -p' and the like.

The implementation attempts to comply with the proposal for the next issue
of the POSIX specification. Because this construct is not in
POSIX.1-2008, using it in scripts intended to be portable is unwise.

Matching the minimal locale support in the rest of sh, the \u and \U
sequences are currently not useful.

Obtained-from:   FreeBSD SVN rev 221513

12 files changed:
bin/sh/mksyntax.c
bin/sh/parser.c
bin/sh/sh.1
tools/regression/bin/sh/parser/dollar-quote1.0 [new file with mode: 0644]
tools/regression/bin/sh/parser/dollar-quote2.0 [new file with mode: 0644]
tools/regression/bin/sh/parser/dollar-quote3.0 [new file with mode: 0644]
tools/regression/bin/sh/parser/dollar-quote4.0 [new file with mode: 0644]
tools/regression/bin/sh/parser/dollar-quote5.0 [new file with mode: 0644]
tools/regression/bin/sh/parser/dollar-quote6.0 [new file with mode: 0644]
tools/regression/bin/sh/parser/dollar-quote7.0 [new file with mode: 0644]
tools/regression/bin/sh/parser/dollar-quote8.0 [new file with mode: 0644]
tools/regression/bin/sh/parser/dollar-quote9.0 [new file with mode: 0644]

index def16a3..59c3b07 100644 (file)
@@ -35,7 +35,7 @@
  *
  * @(#) Copyright (c) 1991, 1993 The Regents of the University of California.  All rights reserved.
  * @(#)mksyntax.c      8.2 (Berkeley) 5/4/95
- * $FreeBSD: src/bin/sh/mksyntax.c,v 1.34 2010/11/20 14:30:28 jilles Exp $
+ * $FreeBSD: src/bin/sh/mksyntax.c,v 1.35 2011/05/05 20:55:55 jilles Exp $
  */
 
 /*
@@ -58,6 +58,7 @@ struct synclass synclass[] = {
        { "CWORD",      "character is nothing special" },
        { "CNL",        "newline character" },
        { "CBACK",      "a backslash character" },
+       { "CSBACK",     "a backslash character in single quotes" },
        { "CSQUOTE",    "single quote" },
        { "CDQUOTE",    "double quote" },
        { "CENDQUOTE",  "a terminating quote" },
@@ -218,6 +219,7 @@ main(int argc __unused, char **argv __unused)
        init();
        fputs("\n/* syntax table used when in single quotes */\n", cfile);
        add("\n", "CNL");
+       add("\\", "CSBACK");
        add("'", "CENDQUOTE");
        /* ':/' for tilde expansion, '-' for [a\-x] pattern ranges */
        add("!*?[=~:/-", "CCTL");
index 7c7a2bf..40ffc24 100644 (file)
@@ -34,7 +34,7 @@
  * SUCH DAMAGE.
  *
  * @(#)parser.c        8.7 (Berkeley) 5/16/95
- * $FreeBSD: src/bin/sh/parser.c,v 1.108 2011/05/04 21:49:34 jilles Exp $
+ * $FreeBSD: src/bin/sh/parser.c,v 1.109 2011/05/05 20:55:55 jilles Exp $
  */
 
 #include <stdio.h>
@@ -1126,6 +1126,127 @@ done:
 
 
 /*
+ * Called to parse a backslash escape sequence inside $'...'.
+ * The backslash has already been read.
+ */
+static char *
+readcstyleesc(char *out)
+{
+       int c, v, i, n;
+
+       c = pgetc();
+       switch (c) {
+       case '\0':
+               synerror("Unterminated quoted string");
+       case '\n':
+               plinno++;
+               if (doprompt)
+                       setprompt(2);
+               else
+                       setprompt(0);
+               return out;
+       case '\\':
+       case '\'':
+       case '"':
+               v = c;
+               break;
+       case 'a': v = '\a'; break;
+       case 'b': v = '\b'; break;
+       case 'e': v = '\033'; break;
+       case 'f': v = '\f'; break;
+       case 'n': v = '\n'; break;
+       case 'r': v = '\r'; break;
+       case 't': v = '\t'; break;
+       case 'v': v = '\v'; break;
+       case 'x':
+                 v = 0;
+                 for (;;) {
+                         c = pgetc();
+                         if (c >= '0' && c <= '9')
+                                 v = (v << 4) + c - '0';
+                         else if (c >= 'A' && c <= 'F')
+                                 v = (v << 4) + c - 'A' + 10;
+                         else if (c >= 'a' && c <= 'f')
+                                 v = (v << 4) + c - 'a' + 10;
+                         else
+                                 break;
+                 }
+                 pungetc();
+                 break;
+       case '0': case '1': case '2': case '3':
+       case '4': case '5': case '6': case '7':
+                 v = c - '0';
+                 c = pgetc();
+                 if (c >= '0' && c <= '7') {
+                         v <<= 3;
+                         v += c - '0';
+                         c = pgetc();
+                         if (c >= '0' && c <= '7') {
+                                 v <<= 3;
+                                 v += c - '0';
+                         } else
+                                 pungetc();
+                 } else
+                         pungetc();
+                 break;
+       case 'c':
+                 c = pgetc();
+                 if (c < 0x3f || c > 0x7a || c == 0x60)
+                         synerror("Bad escape sequence");
+                 if (c == '\\' && pgetc() != '\\')
+                         synerror("Bad escape sequence");
+                 if (c == '?')
+                         v = 127;
+                 else
+                         v = c & 0x1f;
+                 break;
+       case 'u':
+       case 'U':
+                 n = c == 'U' ? 8 : 4;
+                 v = 0;
+                 for (i = 0; i < n; i++) {
+                         c = pgetc();
+                         if (c >= '0' && c <= '9')
+                                 v = (v << 4) + c - '0';
+                         else if (c >= 'A' && c <= 'F')
+                                 v = (v << 4) + c - 'A' + 10;
+                         else if (c >= 'a' && c <= 'f')
+                                 v = (v << 4) + c - 'a' + 10;
+                         else
+                                 synerror("Bad escape sequence");
+                 }
+                 if (v == 0 || (v >= 0xd800 && v <= 0xdfff))
+                         synerror("Bad escape sequence");
+                 /* We really need iconv here. */
+                 if (v > 127)
+                         v = '?';
+                 break;
+       default:
+                 synerror("Bad escape sequence");
+       }
+       v = (char)v;
+       /*
+        * We can't handle NUL bytes.
+        * POSIX says we should skip till the closing quote.
+        */
+       if (v == '\0') {
+               while ((c = pgetc()) != '\'') {
+                       if (c == '\\')
+                               c = pgetc();
+                       if (c == PEOF)
+                               synerror("Unterminated quoted string");
+               }
+               pungetc();
+               return out;
+       }
+       if (SQSYNTAX[v] == CCTL)
+               USTPUTC(CTLESC, out);
+       USTPUTC(v, out);
+       return out;
+}
+
+
+/*
  * If eofmark is NULL, read a word or a redirection symbol.  If eofmark
  * is not NULL, read a here document.  In the latter case, eofmark is the
  * word which marks the end of the document and striptabs is true if
@@ -1157,6 +1278,7 @@ readtoken1(int firstc, char const *initialsyntax, char *eofmark, int striptabs)
        struct tokenstate state_static[MAXNEST_STATIC];
        int maxnest = MAXNEST_STATIC;
        struct tokenstate *state = state_static;
+       int sqiscstyle = 0;
 
        startlinno = plinno;
        quotef = 0;
@@ -1187,6 +1309,12 @@ readtoken1(int firstc, char const *initialsyntax, char *eofmark, int striptabs)
                                        setprompt(0);
                                c = pgetc();
                                goto loop;              /* continue outer loop */
+                       case CSBACK:
+                               if (sqiscstyle) {
+                                       out = readcstyleesc(out);
+                                       break;
+                               }
+                               /* FALLTHROUGH */
                        case CWORD:
                                USTPUTC(c, out);
                                break;
@@ -1231,6 +1359,7 @@ readtoken1(int firstc, char const *initialsyntax, char *eofmark, int striptabs)
                        case CSQUOTE:
                                USTPUTC(CTLQUOTEMARK, out);
                                state[level].syntax = SQSYNTAX;
+                               sqiscstyle = 0;
                                break;
                        case CDQUOTE:
                                USTPUTC(CTLQUOTEMARK, out);
@@ -1449,11 +1578,7 @@ parsesub: {
        int c1;
 
        c = pgetc();
-       if (c != '(' && c != '{' && (is_eof(c) || !is_name(c)) &&
-           !is_special(c)) {
-               USTPUTC('$', out);
-               pungetc();
-       } else if (c == '(') {  /* $(command) or $((arith)) */
+       if (c == '(') { /* $(command) or $((arith)) */
                if (pgetc() == '(') {
                        PARSEARITH();
                } else {
@@ -1464,7 +1589,7 @@ parsesub: {
                            state[level].syntax == DQSYNTAX ||
                            state[level].syntax == ARISYNTAX);
                }
-       } else {
+       } else if (c == '{' || is_name(c) || is_special(c)) {
                USTPUTC(CTLVAR, out);
                typeloc = out - stackblock();
                USTPUTC(VSNORMAL, out);
@@ -1611,6 +1736,14 @@ varname:
                                newvarnest++;
                        }
                }
+       } else if (c == '\'' && state[level].syntax == BASESYNTAX) {
+               /* $'cstylequotes' */
+               USTPUTC(CTLQUOTEMARK, out);
+               state[level].syntax = SQSYNTAX;
+               sqiscstyle = 1;
+       } else {
+               USTPUTC('$', out);
+               pungetc();
        }
        goto parsesub_return;
 }
index 7262ab9..cfcd3c4 100644 (file)
@@ -34,9 +34,9 @@
 .\" SUCH DAMAGE.
 .\"
 .\"    from: @(#)sh.1  8.6 (Berkeley) 5/4/95
-.\" $FreeBSD: src/bin/sh/sh.1,v 1.160 2011/03/20 23:52:45 jilles Exp $
+.\" $FreeBSD: src/bin/sh/sh.1,v 1.161 2011/05/05 20:55:55 jilles Exp $
 .\"
-.Dd March 20, 2011
+.Dd July 2, 2011
 .Dt SH 1
 .Os
 .Sh NAME
@@ -407,13 +407,82 @@ Quoting is used to remove the special meaning of certain characters
 or words to the shell, such as operators, whitespace, keywords,
 or alias names.
 .Pp
-There are three types of quoting: matched single quotes,
+There are four types of quoting: matched single quotes,
+dollar-single quotes,
 matched double quotes, and backslash.
 .Bl -tag -width indent
 .It Single Quotes
 Enclosing characters in single quotes preserves the literal
 meaning of all the characters (except single quotes, making
 it impossible to put single-quotes in a single-quoted string).
+.It Dollar-Single Quotes
+Enclosing characters between
+.Li $'
+and
+.Li '
+preserves the literal meaning of all characters
+except backslashes and single quotes.
+A backslash introduces a C-style escape sequence:
+.Bl -tag -width xUnnnnnnnn
+.It \ea
+Alert (ring the terminal bell)
+.It \eb
+Backspace
+.It \ec Ns Ar c
+The control character denoted by
+.Li ^ Ns Ar c
+in
+.Xr stty 1 .
+If
+.Ar c
+is a backslash, it must be doubled.
+.It \ee
+The ESC character
+.Tn ( ASCII
+0x1b)
+.It \ef
+Formfeed
+.It \en
+Newline
+.It \er
+Carriage return
+.It \et
+Horizontal tab
+.It \ev
+Vertical tab
+.It \e\e
+Literal backslash
+.It \e\&'
+Literal single-quote
+.It \e\&"
+Literal double-quote
+.It \e Ns Ar nnn
+The byte whose octal value is
+.Ar nnn
+(one to three digits)
+.It \ex Ns Ar nn
+The byte whose hexadecimal value is
+.Ar nn
+(one or more digits, only the last two of which are used)
+.It \eu Ns Ar nnnn
+The Unicode code point
+.Ar nnnn
+(four hexadecimal digits)
+.It \eU Ns Ar nnnnnnnn
+The Unicode code point
+.Ar nnnnnnnn
+(eight hexadecimal digits)
+.El
+.Pp
+The sequences for Unicode code points currently only provide useful results
+for values below 128.
+They reject code point 0 and UTF-16 surrogates.
+.Pp
+If an escape sequence would produce a byte with value 0,
+that byte and the rest of the string until the matching single-quote
+are ignored.
+.Pp
+Any other string starting with a backslash is an error.
 .It Double Quotes
 Enclosing characters within double quotes preserves the literal
 meaning of all characters except dollar sign
diff --git a/tools/regression/bin/sh/parser/dollar-quote1.0 b/tools/regression/bin/sh/parser/dollar-quote1.0
new file mode 100644 (file)
index 0000000..a1db85d
--- /dev/null
@@ -0,0 +1,12 @@
+# $FreeBSD: src/tools/regression/bin/sh/parser/dollar-quote1.0,v 1.2 2011/06/07 08:46:13 attilio Exp $
+
+set -e
+
+[ $'hi' = hi ]
+[ $'hi
+there' = 'hi
+there' ]
+[ $'\"\'\\\a\b\f\t\v' = "\"'\\$(printf "\a\b\f\t\v")" ]
+[ $'hi\nthere' = 'hi
+there' ]
+[ $'a\rb' = "$(printf "a\rb")" ]
diff --git a/tools/regression/bin/sh/parser/dollar-quote2.0 b/tools/regression/bin/sh/parser/dollar-quote2.0
new file mode 100644 (file)
index 0000000..080987f
--- /dev/null
@@ -0,0 +1,5 @@
+# $FreeBSD: src/tools/regression/bin/sh/parser/dollar-quote2.0,v 1.2 2011/06/07 08:46:13 attilio Exp $
+
+# This depends on the ASCII character set.
+
+[ $'\e' = "$(printf "\033")" ]
diff --git a/tools/regression/bin/sh/parser/dollar-quote3.0 b/tools/regression/bin/sh/parser/dollar-quote3.0
new file mode 100644 (file)
index 0000000..7b501ab
--- /dev/null
@@ -0,0 +1,22 @@
+# $FreeBSD: src/tools/regression/bin/sh/parser/dollar-quote3.0,v 1.2 2011/06/07 08:46:13 attilio Exp $
+
+unset LC_ALL
+LC_CTYPE=en_US.ISO8859-1
+export LC_CTYPE
+
+e=
+for i in 0 1 2 3; do
+       for j in 0 1 2 3 4 5 6 7; do
+               for k in 0 1 2 3 4 5 6 7; do
+                       case $i$j$k in
+                       000) continue ;;
+                       esac
+                       e="$e\\$i$j$k"
+               done
+       done
+done
+ee=`printf "$e"`
+[ "${#ee}" = 255 ] || echo length bad
+
+# Start a new shell so the locale change is picked up.
+[ "$(${SH} -c "printf %s \$'$e'")" = "$ee" ]
diff --git a/tools/regression/bin/sh/parser/dollar-quote4.0 b/tools/regression/bin/sh/parser/dollar-quote4.0
new file mode 100644 (file)
index 0000000..9bde4a5
--- /dev/null
@@ -0,0 +1,19 @@
+# $FreeBSD: src/tools/regression/bin/sh/parser/dollar-quote4.0,v 1.2 2011/06/07 08:46:13 attilio Exp $
+
+unset LC_ALL
+LC_CTYPE=en_US.ISO8859-1
+export LC_CTYPE
+
+e=
+for i in 0 1 2 3 4 5 6 7 8 9 a b c d e f; do
+       for j in 0 1 2 3 4 5 6 7 8 9 a b c d e f; do
+               case $i$j in
+               00) continue ;;
+               esac
+               e="$e\x$i$j"
+       done
+done
+
+# Start a new shell so the locale change is picked up.
+ee="$(${SH} -c "printf %s \$'$e'")"
+[ "${#ee}" = 255 ] || echo length bad
diff --git a/tools/regression/bin/sh/parser/dollar-quote5.0 b/tools/regression/bin/sh/parser/dollar-quote5.0
new file mode 100644 (file)
index 0000000..4aa1133
--- /dev/null
@@ -0,0 +1,12 @@
+# $FreeBSD: src/tools/regression/bin/sh/parser/dollar-quote5.0,v 1.2 2011/06/07 08:46:13 attilio Exp $
+
+# This depends on the ASCII character set.
+
+set -e
+
+[ $'\ca\cb\cc\cd\ce\cf\cg\ch\ci\cj\ck\cl\cm\cn\co\cp\cq\cr\cs\ct\cu\cv\cw\cx\cy\cz' = $'\001\002\003\004\005\006\007\010\011\012\013\014\015\016\017\020\021\022\023\024\025\026\027\030\031\032' ]
+[ $'\cA\cB\cC\cD\cE\cF\cG\cH\cI\cJ\cK\cL\cM\cN\cO\cP\cQ\cR\cS\cT\cU\cV\cW\cX\cY\cZ' = $'\001\002\003\004\005\006\007\010\011\012\013\014\015\016\017\020\021\022\023\024\025\026\027\030\031\032' ]
+[ $'\c[' = $'\033' ]
+[ $'\c]' = $'\035' ]
+[ $'\c^' = $'\036' ]
+[ $'\c_' = $'\037' ]
diff --git a/tools/regression/bin/sh/parser/dollar-quote6.0 b/tools/regression/bin/sh/parser/dollar-quote6.0
new file mode 100644 (file)
index 0000000..b8995fb
--- /dev/null
@@ -0,0 +1,5 @@
+# $FreeBSD: src/tools/regression/bin/sh/parser/dollar-quote6.0,v 1.2 2011/06/07 08:46:13 attilio Exp $
+
+# This depends on the ASCII character set.
+
+[ $'\c\\' = $'\034' ]
diff --git a/tools/regression/bin/sh/parser/dollar-quote7.0 b/tools/regression/bin/sh/parser/dollar-quote7.0
new file mode 100644 (file)
index 0000000..31a3dcd
--- /dev/null
@@ -0,0 +1,6 @@
+# $FreeBSD: src/tools/regression/bin/sh/parser/dollar-quote7.0,v 1.2 2011/06/07 08:46:13 attilio Exp $
+
+set -e
+
+[ $'\u0024\u0040\u0060' = '$@`' ]
+[ $'\U00000024\U00000040\U00000060' = '$@`' ]
diff --git a/tools/regression/bin/sh/parser/dollar-quote8.0 b/tools/regression/bin/sh/parser/dollar-quote8.0
new file mode 100644 (file)
index 0000000..ea81fbf
--- /dev/null
@@ -0,0 +1,11 @@
+# $FreeBSD: src/tools/regression/bin/sh/parser/dollar-quote8.0,v 1.2 2011/06/07 08:46:13 attilio Exp $
+
+[ $'hello\0' = hello ]
+[ $'hello\0world' = hello ]
+[ $'hello\0'$'world' = helloworld ]
+[ $'hello\000' = hello ]
+[ $'hello\000world' = hello ]
+[ $'hello\000'$'world' = helloworld ]
+[ $'hello\x00' = hello ]
+[ $'hello\x00world' = hello ]
+[ $'hello\x00'$'world' = helloworld ]
diff --git a/tools/regression/bin/sh/parser/dollar-quote9.0 b/tools/regression/bin/sh/parser/dollar-quote9.0
new file mode 100644 (file)
index 0000000..06bacd7
--- /dev/null
@@ -0,0 +1,8 @@
+# $FreeBSD: src/tools/regression/bin/sh/parser/dollar-quote9.0,v 1.2 2011/06/07 08:46:13 attilio Exp $
+
+# POSIX and C99 say D800-DFFF are undefined in a universal character name.
+# We reject this but many other shells expand to something that looks like
+# CESU-8.
+
+v=$( (eval ": \$'\uD800'") 2>&1 >/dev/null)
+[ $? -ne 0 ] && [ -n "$v" ]