From ead7935bf6ca5dacd4abb1380eb3ac771c15e8c9 Mon Sep 17 00:00:00 2001 From: Peter Avalos Date: Sun, 21 Aug 2011 13:48:41 -0700 Subject: [PATCH] sh: Add UTF-8 support to pattern matching. ?, [...] patterns match codepoints instead of bytes. They do not match invalid sequences. [...] patterns must not contain invalid sequences otherwise they will not match anything. This is so that ${var#?} removes the first codepoint, not the first byte, without putting UTF-8 knowledge into the ${var#pattern} code. However, * continues to match any string and an invalid sequence matches an identical invalid sequence. (This differs from fnmatch(3).) Obtained-from: FreeBSD SVN rev 221646 & 223010 --- bin/sh/expand.c | 66 +++++++++++++++----- tools/regression/bin/sh/builtins/case5.0 | 57 +++++++++++++++++ tools/regression/bin/sh/expansion/trim8.0 | 75 +++++++++++++++++++++++ 3 files changed, 184 insertions(+), 14 deletions(-) create mode 100644 tools/regression/bin/sh/builtins/case5.0 create mode 100644 tools/regression/bin/sh/expansion/trim8.0 diff --git a/bin/sh/expand.c b/bin/sh/expand.c index b113a49c19..6f355b8a4f 100644 --- a/bin/sh/expand.c +++ b/bin/sh/expand.c @@ -36,7 +36,7 @@ * SUCH DAMAGE. * * @(#)expand.c 8.5 (Berkeley) 5/15/95 - * $FreeBSD: src/bin/sh/expand.c,v 1.84 2011/05/07 14:32:16 jilles Exp $ + * $FreeBSD: src/bin/sh/expand.c,v 1.85 2011/05/08 11:32:20 jilles Exp $ */ #include @@ -51,6 +51,7 @@ #include #include #include +#include /* * Routines to expand arguments to commands. We have to deal with @@ -110,16 +111,16 @@ static void addfname(char *); static struct strlist *expsort(struct strlist *); static struct strlist *msort(struct strlist *, int); static char *cvtnum(int, char *); -static int collate_range_cmp(int, int); +static int collate_range_cmp(wchar_t, wchar_t); static int -collate_range_cmp(int c1, int c2) +collate_range_cmp(wchar_t c1, wchar_t c2) { - static char s1[2], s2[2]; + static wchar_t s1[2], s2[2]; s1[0] = c1; s2[0] = c2; - return (strcoll(s1, s2)); + return (wcscoll(s1, s2)); } /* @@ -1376,6 +1377,23 @@ msort(struct strlist *list, int len) +static wchar_t +get_wc(const char **p) +{ + wchar_t c; + int chrlen; + + chrlen = mbtowc(&c, *p, 4); + if (chrlen == 0) + return 0; + else if (chrlen == -1) + c = 0; + else + *p += chrlen; + return c; +} + + /* * Returns true if the pattern matches the string. */ @@ -1385,6 +1403,7 @@ patmatch(const char *pattern, const char *string, int squoted) { const char *p, *q; char c; + wchar_t wc, wc2; p = pattern; q = string; @@ -1403,7 +1422,11 @@ patmatch(const char *pattern, const char *string, int squoted) case '?': if (squoted && *q == CTLESC) q++; - if (*q++ == '\0') + if (localeisutf8) + wc = get_wc(&q); + else + wc = (unsigned char)*q++; + if (wc == '\0') return 0; break; case '*': @@ -1433,7 +1456,7 @@ patmatch(const char *pattern, const char *string, int squoted) case '[': { const char *endp; int invert, found; - char chr; + wchar_t chr; endp = p; if (*endp == '!' || *endp == '^') @@ -1454,9 +1477,12 @@ patmatch(const char *pattern, const char *string, int squoted) p++; } found = 0; - chr = *q++; - if (squoted && chr == CTLESC) - chr = *q++; + if (squoted && *q == CTLESC) + q++; + if (localeisutf8) + chr = get_wc(&q); + else + chr = (unsigned char)*q++; if (chr == '\0') return 0; c = *p++; @@ -1465,19 +1491,31 @@ patmatch(const char *pattern, const char *string, int squoted) continue; if (c == CTLESC) c = *p++; + if (localeisutf8 && c & 0x80) { + p--; + wc = get_wc(&p); + if (wc == 0) /* bad utf-8 */ + return 0; + } else + wc = (unsigned char)c; if (*p == '-' && p[1] != ']') { p++; while (*p == CTLQUOTEMARK) p++; if (*p == CTLESC) p++; - if ( collate_range_cmp(chr, c) >= 0 - && collate_range_cmp(chr, *p) <= 0 + if (localeisutf8) { + wc2 = get_wc(&p); + if (wc2 == 0) /* bad utf-8 */ + return 0; + } else + wc2 = (unsigned char)*p++; + if ( collate_range_cmp(chr, wc) >= 0 + && collate_range_cmp(chr, wc2) <= 0 ) found = 1; - p++; } else { - if (chr == c) + if (chr == wc) found = 1; } } while ((c = *p++) != ']'); diff --git a/tools/regression/bin/sh/builtins/case5.0 b/tools/regression/bin/sh/builtins/case5.0 new file mode 100644 index 0000000000..6909fbe9d1 --- /dev/null +++ b/tools/regression/bin/sh/builtins/case5.0 @@ -0,0 +1,57 @@ +# $FreeBSD: src/tools/regression/bin/sh/builtins/case5.0,v 1.1 2011/05/08 11:32:20 jilles Exp $ + +unset LC_ALL +LC_CTYPE=en_US.UTF-8 +export LC_CTYPE + +c1=e +# a umlaut +c2=$(printf '\303\244') +# euro sign +c3=$(printf '\342\202\254') +# some sort of 't' outside BMP +c4=$(printf '\360\235\225\245') + +ok=0 +case $c1$c2$c3$c4 in +*) ok=1 ;; +esac +if [ $ok = 0 ]; then + echo wrong at $LINENO + exit 3 +fi + +case $c1$c2$c3$c4 in +$c1$c2$c3$c4) ;; +*) echo wrong at $LINENO ;; +esac + +case $c1$c2$c3$c4 in +"$c1$c2$c3$c4") ;; +*) echo wrong at $LINENO ;; +esac + +case $c1$c2$c3$c4 in +????) ;; +*) echo wrong at $LINENO ;; +esac + +case $c1.$c2.$c3.$c4 in +?.?.?.?) ;; +*) echo wrong at $LINENO ;; +esac + +case $c1$c2$c3$c4 in +[!a][!b][!c][!d]) ;; +*) echo wrong at $LINENO ;; +esac + +case $c1$c2$c3$c4 in +[$c1][$c2][$c3][$c4]) ;; +*) echo wrong at $LINENO ;; +esac + +case $c1$c2$c3$c4 in +["$c1"]["$c2"]["$c3"]["$c4"]) ;; +*) echo wrong at $LINENO ;; +esac diff --git a/tools/regression/bin/sh/expansion/trim8.0 b/tools/regression/bin/sh/expansion/trim8.0 new file mode 100644 index 0000000000..fbb72a1823 --- /dev/null +++ b/tools/regression/bin/sh/expansion/trim8.0 @@ -0,0 +1,75 @@ +# $FreeBSD: src/tools/regression/bin/sh/expansion/trim8.0,v 1.1 2011/05/08 11:32:20 jilles Exp $ + +unset LC_ALL +LC_CTYPE=en_US.UTF-8 +export LC_CTYPE + +c1=e +# a umlaut +c2=$(printf '\303\244') +# euro sign +c3=$(printf '\342\202\254') +# some sort of 't' outside BMP +c4=$(printf '\360\235\225\245') + +s=$c1$c2$c3$c4 + +testcase() { + code="$1" + expected="$2" + oIFS="$IFS" + eval "$code" + IFS='|' + result="$#|$*" + IFS="$oIFS" + if [ "x$result" = "x$expected" ]; then + ok=x$ok + else + failures=x$failures + echo "For $code, expected $expected actual $result" + fi +} + +testcase 'set -- "$s"' "1|$s" +testcase 'set -- "${s#$c2}"' "1|$s" +testcase 'set -- "${s#*}"' "1|$s" +testcase 'set -- "${s#$c1}"' "1|$c2$c3$c4" +testcase 'set -- "${s#$c1$c2}"' "1|$c3$c4" +testcase 'set -- "${s#$c1$c2$c3}"' "1|$c4" +testcase 'set -- "${s#$c1$c2$c3$c4}"' "1|" +testcase 'set -- "${s#?}"' "1|$c2$c3$c4" +testcase 'set -- "${s#??}"' "1|$c3$c4" +testcase 'set -- "${s#???}"' "1|$c4" +testcase 'set -- "${s#????}"' "1|" +testcase 'set -- "${s#*$c3}"' "1|$c4" +testcase 'set -- "${s%$c4}"' "1|$c1$c2$c3" +testcase 'set -- "${s%$c3$c4}"' "1|$c1$c2" +testcase 'set -- "${s%$c2$c3$c4}"' "1|$c1" +testcase 'set -- "${s%$c1$c2$c3$c4}"' "1|" +testcase 'set -- "${s%?}"' "1|$c1$c2$c3" +testcase 'set -- "${s%??}"' "1|$c1$c2" +testcase 'set -- "${s%???}"' "1|$c1" +testcase 'set -- "${s%????}"' "1|" +testcase 'set -- "${s%$c2*}"' "1|$c1" +testcase 'set -- "${s##$c2}"' "1|$s" +testcase 'set -- "${s##*}"' "1|" +testcase 'set -- "${s##$c1}"' "1|$c2$c3$c4" +testcase 'set -- "${s##$c1$c2}"' "1|$c3$c4" +testcase 'set -- "${s##$c1$c2$c3}"' "1|$c4" +testcase 'set -- "${s##$c1$c2$c3$c4}"' "1|" +testcase 'set -- "${s##?}"' "1|$c2$c3$c4" +testcase 'set -- "${s##??}"' "1|$c3$c4" +testcase 'set -- "${s##???}"' "1|$c4" +testcase 'set -- "${s##????}"' "1|" +testcase 'set -- "${s##*$c3}"' "1|$c4" +testcase 'set -- "${s%%$c4}"' "1|$c1$c2$c3" +testcase 'set -- "${s%%$c3$c4}"' "1|$c1$c2" +testcase 'set -- "${s%%$c2$c3$c4}"' "1|$c1" +testcase 'set -- "${s%%$c1$c2$c3$c4}"' "1|" +testcase 'set -- "${s%%?}"' "1|$c1$c2$c3" +testcase 'set -- "${s%%??}"' "1|$c1$c2" +testcase 'set -- "${s%%???}"' "1|$c1" +testcase 'set -- "${s%%????}"' "1|" +testcase 'set -- "${s%%$c2*}"' "1|$c1" + +test "x$failures" = x -- 2.41.0