From: Peter Avalos Date: Sun, 3 Jul 2011 07:44:12 +0000 (-1000) Subject: sh: Add \u/\U support (in $'...') for UTF-8. X-Git-Tag: v2.12.0~171 X-Git-Url: https://gitweb.dragonflybsd.org/dragonfly.git/commitdiff_plain/145a4981faa6931742195464d71b1d2ba1b29522 sh: Add \u/\U support (in $'...') for UTF-8. Note that \u/\U are processed using the locale that was active when the shell started. This is necessary to avoid behaviour that depends on the parse/execute split (for example when placing braces around an entire script). Therefore, UTF-8 encoding is implemented manually. Obtained-from: FreeBSD SVN rev 221669 --- diff --git a/bin/sh/main.c b/bin/sh/main.c index 1466f57a16..2ed869a875 100644 --- a/bin/sh/main.c +++ b/bin/sh/main.c @@ -35,7 +35,7 @@ * * @(#) Copyright (c) 1991, 1993 The Regents of the University of California. All rights reserved. * @(#)main.c 8.6 (Berkeley) 5/28/95 - * $FreeBSD: src/bin/sh/main.c,v 1.46 2011/05/06 22:31:27 jilles Exp $ + * $FreeBSD: src/bin/sh/main.c,v 1.47 2011/05/08 17:40:10 jilles Exp $ */ #include @@ -70,7 +70,7 @@ int rootpid; int rootshell; struct jmploc main_handler; -int localeisutf8; +int localeisutf8, initial_localeisutf8; static void read_profile(const char *); static const char *find_dot_file(const char *); @@ -91,7 +91,7 @@ main(int argc, char *argv[]) char *shinit; setlocale(LC_ALL, ""); - updatecharset(); + initcharset(); state = 0; if (setjmp(main_handler.loc)) { switch (exception) { diff --git a/bin/sh/parser.c b/bin/sh/parser.c index 40ffc24576..6f37eb8aef 100644 --- a/bin/sh/parser.c +++ b/bin/sh/parser.c @@ -34,7 +34,7 @@ * SUCH DAMAGE. * * @(#)parser.c 8.7 (Berkeley) 5/16/95 - * $FreeBSD: src/bin/sh/parser.c,v 1.109 2011/05/05 20:55:55 jilles Exp $ + * $FreeBSD: src/bin/sh/parser.c,v 1.110 2011/05/08 17:40:10 jilles Exp $ */ #include @@ -1218,6 +1218,29 @@ readcstyleesc(char *out) if (v == 0 || (v >= 0xd800 && v <= 0xdfff)) synerror("Bad escape sequence"); /* We really need iconv here. */ + if (initial_localeisutf8 && v > 127) { + CHECKSTRSPACE(4, out); + /* + * We cannot use wctomb() as the locale may have + * changed. + */ + if (v <= 0x7ff) { + USTPUTC(0xc0 | v >> 6, out); + USTPUTC(0x80 | (v & 0x3f), out); + return out; + } else if (v <= 0xffff) { + USTPUTC(0xe0 | v >> 12, out); + USTPUTC(0x80 | ((v >> 6) & 0x3f), out); + USTPUTC(0x80 | (v & 0x3f), out); + return out; + } else if (v <= 0x10ffff) { + USTPUTC(0xf0 | v >> 18, out); + USTPUTC(0x80 | ((v >> 12) & 0x3f), out); + USTPUTC(0x80 | ((v >> 6) & 0x3f), out); + USTPUTC(0x80 | (v & 0x3f), out); + return out; + } + } if (v > 127) v = '?'; break; diff --git a/bin/sh/sh.1 b/bin/sh/sh.1 index 95268919e2..3cea1cc39e 100644 --- a/bin/sh/sh.1 +++ b/bin/sh/sh.1 @@ -34,7 +34,7 @@ .\" SUCH DAMAGE. .\" .\" from: @(#)sh.1 8.6 (Berkeley) 5/4/95 -.\" $FreeBSD: src/bin/sh/sh.1,v 1.162 2011/05/08 14:03:44 jilles Exp $ +.\" $FreeBSD: src/bin/sh/sh.1,v 1.163 2011/05/08 17:40:10 jilles Exp $ .\" .Dd July 2, 2011 .Dt SH 1 @@ -474,8 +474,8 @@ The Unicode code point (eight hexadecimal digits) .El .Pp -The sequences for Unicode code points currently only provide useful results -for values below 128. +The sequences for Unicode code points are currently only useful with +UTF-8 locales. They reject code point 0 and UTF-16 surrogates. .Pp If an escape sequence would produce a byte with value 0, diff --git a/bin/sh/var.c b/bin/sh/var.c index f41cf5fb4e..c9b93a3445 100644 --- a/bin/sh/var.c +++ b/bin/sh/var.c @@ -34,7 +34,7 @@ * SUCH DAMAGE. * * @(#)var.c 8.3 (Berkeley) 5/4/95 - * $FreeBSD: src/bin/sh/var.c,v 1.58 2011/05/08 16:15:50 jilles Exp $ + * $FreeBSD: src/bin/sh/var.c,v 1.59 2011/05/08 17:40:10 jilles Exp $ */ #include @@ -514,6 +514,13 @@ updatecharset(void) localeisutf8 = !strcmp(charset, "UTF-8"); } +void +initcharset(void) +{ + updatecharset(); + initial_localeisutf8 = localeisutf8; +} + /* * Generate a list of exported variables. This routine is used to construct * the third argument to execve when executing a program. diff --git a/bin/sh/var.h b/bin/sh/var.h index b9b4a849da..d23ae32c76 100644 --- a/bin/sh/var.h +++ b/bin/sh/var.h @@ -34,7 +34,7 @@ * SUCH DAMAGE. * * @(#)var.h 8.2 (Berkeley) 5/4/95 - * $FreeBSD: src/bin/sh/var.h,v 1.21 2011/05/08 16:15:50 jilles Exp $ + * $FreeBSD: src/bin/sh/var.h,v 1.22 2011/05/08 17:40:10 jilles Exp $ */ /* @@ -87,6 +87,8 @@ extern struct var vterm; #endif extern int localeisutf8; +/* The parser uses the locale that was in effect at startup. */ +extern int initial_localeisutf8; /* * The following macros access the values of the above variables. @@ -120,6 +122,7 @@ char *bltinlookup(const char *, int); void bltinsetlocale(void); void bltinunsetlocale(void); void updatecharset(void); +void initcharset(void); char **environment(void); int showvarscmd(int, char **); int exportcmd(int, char **); diff --git a/tools/regression/bin/sh/parser/dollar-quote10.0 b/tools/regression/bin/sh/parser/dollar-quote10.0 new file mode 100644 index 0000000000..0aa684d940 --- /dev/null +++ b/tools/regression/bin/sh/parser/dollar-quote10.0 @@ -0,0 +1,10 @@ +# $FreeBSD: src/tools/regression/bin/sh/parser/dollar-quote10.0,v 1.1 2011/05/08 17:40:10 jilles Exp $ + +# a umlaut +s=$(printf '\303\244') +# euro sign +s=$s$(printf '\342\202\254') + +# Start a new shell so the locale change is picked up. +ss="$(LC_ALL=en_US.UTF-8 ${SH} -c "printf %s \$'\u00e4\u20ac'")" +[ "$s" = "$ss" ] diff --git a/tools/regression/bin/sh/parser/dollar-quote11.0 b/tools/regression/bin/sh/parser/dollar-quote11.0 new file mode 100644 index 0000000000..00061699db --- /dev/null +++ b/tools/regression/bin/sh/parser/dollar-quote11.0 @@ -0,0 +1,8 @@ +# $FreeBSD: src/tools/regression/bin/sh/parser/dollar-quote11.0,v 1.1 2011/05/08 17:40:10 jilles Exp $ + +# some sort of 't' outside BMP +s=$s$(printf '\360\235\225\245') + +# Start a new shell so the locale change is picked up. +ss="$(LC_ALL=en_US.UTF-8 ${SH} -c "printf %s \$'\U0001d565'")" +[ "$s" = "$ss" ]