sh: Add UTF-8 support to pattern matching.
author Peter Avalos <pavalos@dragonflybsd.org>
Sun, 21 Aug 2011 20:48:41 +0000 (13:48 -0700)
committer Peter Avalos <pavalos@dragonflybsd.org>
Sun, 21 Aug 2011 20:48:41 +0000 (13:48 -0700)
The ? and [...] patterns match codepoints instead of bytes. They do not match
invalid sequences. [...] patterns must not contain invalid sequences;
otherwise they will not match anything. This is so that ${var#?} removes
the first codepoint, not the first byte, without putting UTF-8 knowledge
into the ${var#pattern} code. However, * continues to match any string
and an invalid sequence matches an identical invalid sequence. (This
differs from fnmatch(3).)

Obtained-from:   FreeBSD SVN rev 221646 & 223010

bin/sh/expand.c
tools/regression/bin/sh/builtins/case5.0 [new file with mode: 0644]
tools/regression/bin/sh/expansion/trim8.0 [new file with mode: 0644]

diff --git a/bin/sh/expand.c b/bin/sh/expand.c
index b113a49..6f355b8 100644 (file)
@@ -36,7 +36,7 @@
  * SUCH DAMAGE.
  *
  * @(#)expand.c        8.5 (Berkeley) 5/15/95
- * $FreeBSD: src/bin/sh/expand.c,v 1.84 2011/05/07 14:32:16 jilles Exp $
+ * $FreeBSD: src/bin/sh/expand.c,v 1.85 2011/05/08 11:32:20 jilles Exp $
  */
 
 #include <sys/types.h>
@@ -51,6 +51,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
+#include <wchar.h>
 
 /*
  * Routines to expand arguments to commands.  We have to deal with
@@ -110,16 +111,16 @@ static void addfname(char *);
 static struct strlist *expsort(struct strlist *);
 static struct strlist *msort(struct strlist *, int);
 static char *cvtnum(int, char *);
-static int collate_range_cmp(int, int);
+static int collate_range_cmp(wchar_t, wchar_t);
 
 static int
-collate_range_cmp(int c1, int c2)
+collate_range_cmp(wchar_t c1, wchar_t c2)
 {
-       static char s1[2], s2[2];
+       static wchar_t s1[2], s2[2];
 
        s1[0] = c1;
        s2[0] = c2;
-       return (strcoll(s1, s2));
+       return (wcscoll(s1, s2));
 }
 
 /*
@@ -1376,6 +1377,23 @@ msort(struct strlist *list, int len)
 
 
 
+static wchar_t
+get_wc(const char **p)
+{
+       wchar_t c;
+       int chrlen;
+
+       chrlen = mbtowc(&c, *p, 4);
+       if (chrlen == 0)
+               return 0;
+       else if (chrlen == -1)
+               c = 0;
+       else
+               *p += chrlen;
+       return c;
+}
+
+
 /*
  * Returns true if the pattern matches the string.
  */
@@ -1385,6 +1403,7 @@ patmatch(const char *pattern, const char *string, int squoted)
 {
        const char *p, *q;
        char c;
+       wchar_t wc, wc2;
 
        p = pattern;
        q = string;
@@ -1403,7 +1422,11 @@ patmatch(const char *pattern, const char *string, int squoted)
                case '?':
                        if (squoted && *q == CTLESC)
                                q++;
-                       if (*q++ == '\0')
+                       if (localeisutf8)
+                               wc = get_wc(&q);
+                       else
+                               wc = (unsigned char)*q++;
+                       if (wc == '\0')
                                return 0;
                        break;
                case '*':
@@ -1433,7 +1456,7 @@ patmatch(const char *pattern, const char *string, int squoted)
                case '[': {
                        const char *endp;
                        int invert, found;
-                       char chr;
+                       wchar_t chr;
 
                        endp = p;
                        if (*endp == '!' || *endp == '^')
@@ -1454,9 +1477,12 @@ patmatch(const char *pattern, const char *string, int squoted)
                                p++;
                        }
                        found = 0;
-                       chr = *q++;
-                       if (squoted && chr == CTLESC)
-                               chr = *q++;
+                       if (squoted && *q == CTLESC)
+                               q++;
+                       if (localeisutf8)
+                               chr = get_wc(&q);
+                       else
+                               chr = (unsigned char)*q++;
                        if (chr == '\0')
                                return 0;
                        c = *p++;
@@ -1465,19 +1491,31 @@ patmatch(const char *pattern, const char *string, int squoted)
                                        continue;
                                if (c == CTLESC)
                                        c = *p++;
+                               if (localeisutf8 && c & 0x80) {
+                                       p--;
+                                       wc = get_wc(&p);
+                                       if (wc == 0) /* bad utf-8 */
+                                               return 0;
+                               } else
+                                       wc = (unsigned char)c;
                                if (*p == '-' && p[1] != ']') {
                                        p++;
                                        while (*p == CTLQUOTEMARK)
                                                p++;
                                        if (*p == CTLESC)
                                                p++;
-                                       if (   collate_range_cmp(chr, c) >= 0
-                                           && collate_range_cmp(chr, *p) <= 0
+                                       if (localeisutf8) {
+                                               wc2 = get_wc(&p);
+                                               if (wc2 == 0) /* bad utf-8 */
+                                                       return 0;
+                                       } else
+                                               wc2 = (unsigned char)*p++;
+                                       if (   collate_range_cmp(chr, wc) >= 0
+                                           && collate_range_cmp(chr, wc2) <= 0
                                           )
                                                found = 1;
-                                       p++;
                                } else {
-                                       if (chr == c)
+                                       if (chr == wc)
                                                found = 1;
                                }
                        } while ((c = *p++) != ']');
diff --git a/tools/regression/bin/sh/builtins/case5.0 b/tools/regression/bin/sh/builtins/case5.0
new file mode 100644 (file)
index 0000000..6909fbe
--- /dev/null
@@ -0,0 +1,57 @@
+# $FreeBSD: src/tools/regression/bin/sh/builtins/case5.0,v 1.1 2011/05/08 11:32:20 jilles Exp $
+
+unset LC_ALL
+LC_CTYPE=en_US.UTF-8
+export LC_CTYPE
+
+c1=e
+# a umlaut
+c2=$(printf '\303\244')
+# euro sign
+c3=$(printf '\342\202\254')
+# some sort of 't' outside BMP
+c4=$(printf '\360\235\225\245')
+
+ok=0
+case $c1$c2$c3$c4 in
+*) ok=1 ;;
+esac
+if [ $ok = 0 ]; then
+       echo wrong at $LINENO
+       exit 3
+fi
+
+case $c1$c2$c3$c4 in
+$c1$c2$c3$c4) ;;
+*) echo wrong at $LINENO ;;
+esac
+
+case $c1$c2$c3$c4 in
+"$c1$c2$c3$c4") ;;
+*) echo wrong at $LINENO ;;
+esac
+
+case $c1$c2$c3$c4 in
+????) ;;
+*) echo wrong at $LINENO ;;
+esac
+
+case $c1.$c2.$c3.$c4 in
+?.?.?.?) ;;
+*) echo wrong at $LINENO ;;
+esac
+
+case $c1$c2$c3$c4 in
+[!a][!b][!c][!d]) ;;
+*) echo wrong at $LINENO ;;
+esac
+
+case $c1$c2$c3$c4 in
+[$c1][$c2][$c3][$c4]) ;;
+*) echo wrong at $LINENO ;;
+esac
+
+case $c1$c2$c3$c4 in
+["$c1"]["$c2"]["$c3"]["$c4"]) ;;
+*) echo wrong at $LINENO ;;
+esac
diff --git a/tools/regression/bin/sh/expansion/trim8.0 b/tools/regression/bin/sh/expansion/trim8.0
new file mode 100644 (file)
index 0000000..fbb72a1
--- /dev/null
@@ -0,0 +1,75 @@
+# $FreeBSD: src/tools/regression/bin/sh/expansion/trim8.0,v 1.1 2011/05/08 11:32:20 jilles Exp $
+
+unset LC_ALL
+LC_CTYPE=en_US.UTF-8
+export LC_CTYPE
+
+c1=e
+# a umlaut
+c2=$(printf '\303\244')
+# euro sign
+c3=$(printf '\342\202\254')
+# some sort of 't' outside BMP
+c4=$(printf '\360\235\225\245')
+
+s=$c1$c2$c3$c4
+
+testcase() {
+       code="$1"
+       expected="$2"
+       oIFS="$IFS"
+       eval "$code"
+       IFS='|'
+       result="$#|$*"
+       IFS="$oIFS"
+       if [ "x$result" = "x$expected" ]; then
+               ok=x$ok
+       else
+               failures=x$failures
+               echo "For $code, expected $expected actual $result"
+       fi
+}
+
+testcase 'set -- "$s"'                         "1|$s"
+testcase 'set -- "${s#$c2}"'                   "1|$s"
+testcase 'set -- "${s#*}"'                     "1|$s"
+testcase 'set -- "${s#$c1}"'                   "1|$c2$c3$c4"
+testcase 'set -- "${s#$c1$c2}"'                        "1|$c3$c4"
+testcase 'set -- "${s#$c1$c2$c3}"'             "1|$c4"
+testcase 'set -- "${s#$c1$c2$c3$c4}"'          "1|"
+testcase 'set -- "${s#?}"'                     "1|$c2$c3$c4"
+testcase 'set -- "${s#??}"'                    "1|$c3$c4"
+testcase 'set -- "${s#???}"'                   "1|$c4"
+testcase 'set -- "${s#????}"'                  "1|"
+testcase 'set -- "${s#*$c3}"'                  "1|$c4"
+testcase 'set -- "${s%$c4}"'                   "1|$c1$c2$c3"
+testcase 'set -- "${s%$c3$c4}"'                        "1|$c1$c2"
+testcase 'set -- "${s%$c2$c3$c4}"'             "1|$c1"
+testcase 'set -- "${s%$c1$c2$c3$c4}"'          "1|"
+testcase 'set -- "${s%?}"'                     "1|$c1$c2$c3"
+testcase 'set -- "${s%??}"'                    "1|$c1$c2"
+testcase 'set -- "${s%???}"'                   "1|$c1"
+testcase 'set -- "${s%????}"'                  "1|"
+testcase 'set -- "${s%$c2*}"'                  "1|$c1"
+testcase 'set -- "${s##$c2}"'                  "1|$s"
+testcase 'set -- "${s##*}"'                    "1|"
+testcase 'set -- "${s##$c1}"'                  "1|$c2$c3$c4"
+testcase 'set -- "${s##$c1$c2}"'               "1|$c3$c4"
+testcase 'set -- "${s##$c1$c2$c3}"'            "1|$c4"
+testcase 'set -- "${s##$c1$c2$c3$c4}"'         "1|"
+testcase 'set -- "${s##?}"'                    "1|$c2$c3$c4"
+testcase 'set -- "${s##??}"'                   "1|$c3$c4"
+testcase 'set -- "${s##???}"'                  "1|$c4"
+testcase 'set -- "${s##????}"'                 "1|"
+testcase 'set -- "${s##*$c3}"'                 "1|$c4"
+testcase 'set -- "${s%%$c4}"'                  "1|$c1$c2$c3"
+testcase 'set -- "${s%%$c3$c4}"'               "1|$c1$c2"
+testcase 'set -- "${s%%$c2$c3$c4}"'            "1|$c1"
+testcase 'set -- "${s%%$c1$c2$c3$c4}"'         "1|"
+testcase 'set -- "${s%%?}"'                    "1|$c1$c2$c3"
+testcase 'set -- "${s%%??}"'                   "1|$c1$c2"
+testcase 'set -- "${s%%???}"'                  "1|$c1"
+testcase 'set -- "${s%%????}"'                 "1|"
+testcase 'set -- "${s%%$c2*}"'                 "1|$c1"
+
+test "x$failures" = x